Commit `cd58b40011`

Andrew Kelley <superjoe30@gmail.com>

2017-10-01 00:20:12

update C headers to clang 5.0.0

1 parent ba3d21c

@@ -2887,87 +2887,79 @@ static __inline__ vector double __ATTRS_o_ai vec_cpsgn(vector double __a,
 
 /* vec_ctf */
 
-static __inline__ vector float __ATTRS_o_ai vec_ctf(vector int __a, int __b) {
-  return __builtin_altivec_vcfsx(__a, __b);
-}
-
-static __inline__ vector float __ATTRS_o_ai vec_ctf(vector unsigned int __a,
-                                                    int __b) {
-  return __builtin_altivec_vcfux((vector int)__a, __b);
-}
-
 #ifdef __VSX__
-static __inline__ vector double __ATTRS_o_ai
-vec_ctf(vector unsigned long long __a, int __b) {
-  vector double __ret = __builtin_convertvector(__a, vector double);
-  __ret *= (vector double)(vector unsigned long long)((0x3ffULL - __b) << 52);
-  return __ret;
-}
-
-static __inline__ vector double __ATTRS_o_ai
-vec_ctf(vector signed long long __a, int __b) {
-  vector double __ret = __builtin_convertvector(__a, vector double);
-  __ret *= (vector double)(vector unsigned long long)((0x3ffULL - __b) << 52);
-  return __ret;
-}
+#define vec_ctf(__a, __b)                                                      \
+  _Generic((__a), vector int                                                   \
+           : (vector float)__builtin_altivec_vcfsx((__a), (__b)),              \
+             vector unsigned int                                               \
+           : (vector float)__builtin_altivec_vcfux((vector int)(__a), (__b)),  \
+             vector unsigned long long                                         \
+           : (__builtin_convertvector((vector unsigned long long)(__a),        \
+                                      vector double) *                         \
+              (vector double)(vector unsigned long long)((0x3ffULL - (__b))    \
+                                                         << 52)),              \
+             vector signed long long                                           \
+           : (__builtin_convertvector((vector signed long long)(__a),          \
+                                      vector double) *                         \
+              (vector double)(vector unsigned long long)((0x3ffULL - (__b))    \
+                                                         << 52)))
+#else
+#define vec_ctf(__a, __b)                                                      \
+  _Generic((__a), vector int                                                   \
+           : (vector float)__builtin_altivec_vcfsx((__a), (__b)),              \
+             vector unsigned int                                               \
+           : (vector float)__builtin_altivec_vcfux((vector int)(__a), (__b)))
 #endif
 
 /* vec_vcfsx */
 
-static __inline__ vector float __attribute__((__always_inline__))
-vec_vcfsx(vector int __a, int __b) {
-  return __builtin_altivec_vcfsx(__a, __b);
-}
+#define vec_vcfux __builtin_altivec_vcfux
 
 /* vec_vcfux */
 
-static __inline__ vector float __attribute__((__always_inline__))
-vec_vcfux(vector unsigned int __a, int __b) {
-  return __builtin_altivec_vcfux((vector int)__a, __b);
-}
+#define vec_vcfsx(__a, __b) __builtin_altivec_vcfsx((vector int)(__a), (__b))
 
 /* vec_cts */
 
-static __inline__ vector int __ATTRS_o_ai vec_cts(vector float __a, int __b) {
-  return __builtin_altivec_vctsxs(__a, __b);
-}
-
 #ifdef __VSX__
-static __inline__ vector signed long long __ATTRS_o_ai
-vec_cts(vector double __a, int __b) {
-  __a *= (vector double)(vector unsigned long long)((0x3ffULL + __b) << 52);
-  return __builtin_convertvector(__a, vector signed long long);
-}
+#define vec_cts(__a, __b)                                                      \
+  _Generic((__a), vector float                                                 \
+           : __builtin_altivec_vctsxs((__a), (__b)), vector double             \
+           : __extension__({                                                   \
+             vector double __ret =                                             \
+                 (__a) *                                                       \
+                 (vector double)(vector unsigned long long)((0x3ffULL + (__b)) \
+                                                            << 52);            \
+             __builtin_convertvector(__ret, vector signed long long);          \
+           }))
+#else
+#define vec_cts __builtin_altivec_vctsxs
 #endif
 
 /* vec_vctsxs */
 
-static __inline__ vector int __attribute__((__always_inline__))
-vec_vctsxs(vector float __a, int __b) {
-  return __builtin_altivec_vctsxs(__a, __b);
-}
+#define vec_vctsxs __builtin_altivec_vctsxs
 
 /* vec_ctu */
 
-static __inline__ vector unsigned int __ATTRS_o_ai vec_ctu(vector float __a,
-                                                           int __b) {
-  return __builtin_altivec_vctuxs(__a, __b);
-}
-
 #ifdef __VSX__
-static __inline__ vector unsigned long long __ATTRS_o_ai
-vec_ctu(vector double __a, int __b) {
-  __a *= (vector double)(vector unsigned long long)((0x3ffULL + __b) << 52);
-  return __builtin_convertvector(__a, vector unsigned long long);
-}
+#define vec_ctu(__a, __b)                                                      \
+  _Generic((__a), vector float                                                 \
+           : __builtin_altivec_vctuxs((__a), (__b)), vector double             \
+           : __extension__({                                                   \
+             vector double __ret =                                             \
+                 (__a) *                                                       \
+                 (vector double)(vector unsigned long long)((0x3ffULL + __b)   \
+                                                            << 52);            \
+             __builtin_convertvector(__ret, vector unsigned long long);        \
+           }))
+#else
+#define vec_ctu __builtin_altivec_vctuxs
 #endif
 
 /* vec_vctuxs */
 
-static __inline__ vector unsigned int __attribute__((__always_inline__))
-vec_vctuxs(vector float __a, int __b) {
-  return __builtin_altivec_vctuxs(__a, __b);
-}
+#define vec_vctuxs __builtin_altivec_vctuxs
 
 /* vec_signed */
 
@@ -8045,45 +8037,51 @@ static __inline__ vector float __ATTRS_o_ai vec_vsel(vector float __a,
 
 /* vec_sl */
 
-static __inline__ vector signed char __ATTRS_o_ai
-vec_sl(vector signed char __a, vector unsigned char __b) {
-  return __a << (vector signed char)__b;
-}
-
+// vec_sl does modulo arithmetic on __b first, so __b is allowed to be more
+// than the length of __a.
 static __inline__ vector unsigned char __ATTRS_o_ai
 vec_sl(vector unsigned char __a, vector unsigned char __b) {
-  return __a << __b;
+  return __a << (__b %
+                 (vector unsigned char)(sizeof(unsigned char) * __CHAR_BIT__));
 }
 
-static __inline__ vector short __ATTRS_o_ai vec_sl(vector short __a,
-                                                   vector unsigned short __b) {
-  return __a << (vector short)__b;
+static __inline__ vector signed char __ATTRS_o_ai
+vec_sl(vector signed char __a, vector unsigned char __b) {
+  return (vector signed char)vec_sl((vector unsigned char)__a, __b);
 }
 
 static __inline__ vector unsigned short __ATTRS_o_ai
 vec_sl(vector unsigned short __a, vector unsigned short __b) {
-  return __a << __b;
+  return __a << (__b % (vector unsigned short)(sizeof(unsigned short) *
+                                               __CHAR_BIT__));
 }
 
-static __inline__ vector int __ATTRS_o_ai vec_sl(vector int __a,
-                                                 vector unsigned int __b) {
-  return __a << (vector int)__b;
+static __inline__ vector short __ATTRS_o_ai vec_sl(vector short __a,
+                                                   vector unsigned short __b) {
+  return (vector short)vec_sl((vector unsigned short)__a, __b);
 }
 
 static __inline__ vector unsigned int __ATTRS_o_ai
 vec_sl(vector unsigned int __a, vector unsigned int __b) {
-  return __a << __b;
+  return __a << (__b %
+                 (vector unsigned int)(sizeof(unsigned int) * __CHAR_BIT__));
 }
 
-#ifdef __POWER8_VECTOR__
-static __inline__ vector signed long long __ATTRS_o_ai
-vec_sl(vector signed long long __a, vector unsigned long long __b) {
-  return __a << (vector long long)__b;
+static __inline__ vector int __ATTRS_o_ai vec_sl(vector int __a,
+                                                 vector unsigned int __b) {
+  return (vector int)vec_sl((vector unsigned int)__a, __b);
 }
 
+#ifdef __POWER8_VECTOR__
 static __inline__ vector unsigned long long __ATTRS_o_ai
 vec_sl(vector unsigned long long __a, vector unsigned long long __b) {
-  return __a << __b;
+  return __a << (__b % (vector unsigned long long)(sizeof(unsigned long long) *
+                                                   __CHAR_BIT__));
+}
+
+static __inline__ vector long long __ATTRS_o_ai
+vec_sl(vector long long __a, vector unsigned long long __b) {
+  return (vector long long)vec_sl((vector unsigned long long)__a, __b);
 }
 #endif
 
@@ -12150,6 +12148,11 @@ static __inline__ void __ATTRS_o_ai vec_vsx_st(vector unsigned char __a,
 
 #endif
 
+#ifdef __VSX__
+#define vec_xxpermdi __builtin_vsx_xxpermdi
+#define vec_xxsldwi __builtin_vsx_xxsldwi
+#endif
+
 /* vec_xor */
 
 #define __builtin_altivec_vxor vec_xor

@@ -224,6 +224,36 @@ __rbitl(unsigned long __t) {
 #endif
 }
 
+/*
+ * 9.3 16-bit multiplications
+ */
+#if __ARM_FEATURE_DSP
+static __inline__ int32_t __attribute__((__always_inline__,__nodebug__))
+__smulbb(int32_t __a, int32_t __b) {
+  return __builtin_arm_smulbb(__a, __b);
+}
+static __inline__ int32_t __attribute__((__always_inline__,__nodebug__))
+__smulbt(int32_t __a, int32_t __b) {
+  return __builtin_arm_smulbt(__a, __b);
+}
+static __inline__ int32_t __attribute__((__always_inline__,__nodebug__))
+__smultb(int32_t __a, int32_t __b) {
+  return __builtin_arm_smultb(__a, __b);
+}
+static __inline__ int32_t __attribute__((__always_inline__,__nodebug__))
+__smultt(int32_t __a, int32_t __b) {
+  return __builtin_arm_smultt(__a, __b);
+}
+static __inline__ int32_t __attribute__((__always_inline__,__nodebug__))
+__smulwb(int32_t __a, int32_t __b) {
+  return __builtin_arm_smulwb(__a, __b);
+}
+static __inline__ int32_t __attribute__((__always_inline__,__nodebug__))
+__smulwt(int32_t __a, int32_t __b) {
+  return __builtin_arm_smulwt(__a, __b);
+}
+#endif
+
 /*
  * 9.4 Saturating intrinsics
  *
@@ -231,13 +261,13 @@ __rbitl(unsigned long __t) {
  * intrinsics are implemented and the flag is enabled.
  */
 /* 9.4.1 Width-specified saturation intrinsics */
-#if __ARM_32BIT_STATE
+#if __ARM_FEATURE_SAT
 #define __ssat(x, y) __builtin_arm_ssat(x, y)
 #define __usat(x, y) __builtin_arm_usat(x, y)
 #endif
 
 /* 9.4.2 Saturating addition and subtraction intrinsics */
-#if __ARM_32BIT_STATE
+#if __ARM_FEATURE_DSP
 static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
 __qadd(int32_t __t, int32_t __v) {
   return __builtin_arm_qadd(__t, __v);
@@ -254,6 +284,290 @@ __qdbl(int32_t __t) {
 }
 #endif
 
+/* 9.4.3 Accumultating multiplications */
+#if __ARM_FEATURE_DSP
+static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
+__smlabb(int32_t __a, int32_t __b, int32_t __c) {
+  return __builtin_arm_smlabb(__a, __b, __c);
+}
+static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
+__smlabt(int32_t __a, int32_t __b, int32_t __c) {
+  return __builtin_arm_smlabt(__a, __b, __c);
+}
+static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
+__smlatb(int32_t __a, int32_t __b, int32_t __c) {
+  return __builtin_arm_smlatb(__a, __b, __c);
+}
+static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
+__smlatt(int32_t __a, int32_t __b, int32_t __c) {
+  return __builtin_arm_smlatt(__a, __b, __c);
+}
+static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
+__smlawb(int32_t __a, int32_t __b, int32_t __c) {
+  return __builtin_arm_smlawb(__a, __b, __c);
+}
+static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
+__smlawt(int32_t __a, int32_t __b, int32_t __c) {
+  return __builtin_arm_smlawt(__a, __b, __c);
+}
+#endif
+
+
+/* 9.5.4 Parallel 16-bit saturation */
+#if __ARM_FEATURE_SIMD32
+#define __ssat16(x, y) __builtin_arm_ssat16(x, y)
+#define __usat16(x, y) __builtin_arm_usat16(x, y)
+#endif
+
+/* 9.5.5 Packing and unpacking */
+#if __ARM_FEATURE_SIMD32
+typedef int32_t int8x4_t;
+typedef int32_t int16x2_t;
+typedef uint32_t uint8x4_t;
+typedef uint32_t uint16x2_t;
+
+static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
+__sxtab16(int16x2_t __a, int8x4_t __b) {
+  return __builtin_arm_sxtab16(__a, __b);
+}
+static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
+__sxtb16(int8x4_t __a) {
+  return __builtin_arm_sxtb16(__a);
+}
+static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
+__uxtab16(int16x2_t __a, int8x4_t __b) {
+  return __builtin_arm_uxtab16(__a, __b);
+}
+static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
+__uxtb16(int8x4_t __a) {
+  return __builtin_arm_uxtb16(__a);
+}
+#endif
+
+/* 9.5.6 Parallel selection */
+#if __ARM_FEATURE_SIMD32
+static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
+__sel(uint8x4_t __a, uint8x4_t __b) {
+  return __builtin_arm_sel(__a, __b);
+}
+#endif
+
+/* 9.5.7 Parallel 8-bit addition and subtraction */
+#if __ARM_FEATURE_SIMD32
+static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
+__qadd8(int8x4_t __a, int8x4_t __b) {
+  return __builtin_arm_qadd8(__a, __b);
+}
+static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
+__qsub8(int8x4_t __a, int8x4_t __b) {
+  return __builtin_arm_qsub8(__a, __b);
+}
+static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
+__sadd8(int8x4_t __a, int8x4_t __b) {
+  return __builtin_arm_sadd8(__a, __b);
+}
+static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
+__shadd8(int8x4_t __a, int8x4_t __b) {
+  return __builtin_arm_shadd8(__a, __b);
+}
+static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
+__shsub8(int8x4_t __a, int8x4_t __b) {
+  return __builtin_arm_shsub8(__a, __b);
+}
+static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
+__ssub8(int8x4_t __a, int8x4_t __b) {
+  return __builtin_arm_ssub8(__a, __b);
+}
+static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
+__uadd8(uint8x4_t __a, uint8x4_t __b) {
+  return __builtin_arm_uadd8(__a, __b);
+}
+static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
+__uhadd8(uint8x4_t __a, uint8x4_t __b) {
+  return __builtin_arm_uhadd8(__a, __b);
+}
+static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
+__uhsub8(uint8x4_t __a, uint8x4_t __b) {
+  return __builtin_arm_uhsub8(__a, __b);
+}
+static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
+__uqadd8(uint8x4_t __a, uint8x4_t __b) {
+  return __builtin_arm_uqadd8(__a, __b);
+}
+static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
+__uqsub8(uint8x4_t __a, uint8x4_t __b) {
+  return __builtin_arm_uqsub8(__a, __b);
+}
+static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
+__usub8(uint8x4_t __a, uint8x4_t __b) {
+  return __builtin_arm_usub8(__a, __b);
+}
+#endif
+
+/* 9.5.8 Sum of 8-bit absolute differences */
+#if __ARM_FEATURE_SIMD32
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+__usad8(uint8x4_t __a, uint8x4_t __b) {
+  return __builtin_arm_usad8(__a, __b);
+}
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+__usada8(uint8x4_t __a, uint8x4_t __b, uint32_t __c) {
+  return __builtin_arm_usada8(__a, __b, __c);
+}
+#endif
+
+/* 9.5.9 Parallel 16-bit addition and subtraction */
+#if __ARM_FEATURE_SIMD32
+static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
+__qadd16(int16x2_t __a, int16x2_t __b) {
+  return __builtin_arm_qadd16(__a, __b);
+}
+static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
+__qasx(int16x2_t __a, int16x2_t __b) {
+  return __builtin_arm_qasx(__a, __b);
+}
+static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
+__qsax(int16x2_t __a, int16x2_t __b) {
+  return __builtin_arm_qsax(__a, __b);
+}
+static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
+__qsub16(int16x2_t __a, int16x2_t __b) {
+  return __builtin_arm_qsub16(__a, __b);
+}
+static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
+__sadd16(int16x2_t __a, int16x2_t __b) {
+  return __builtin_arm_sadd16(__a, __b);
+}
+static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
+__sasx(int16x2_t __a, int16x2_t __b) {
+  return __builtin_arm_sasx(__a, __b);
+}
+static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
+__shadd16(int16x2_t __a, int16x2_t __b) {
+  return __builtin_arm_shadd16(__a, __b);
+}
+static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
+__shasx(int16x2_t __a, int16x2_t __b) {
+  return __builtin_arm_shasx(__a, __b);
+}
+static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
+__shsax(int16x2_t __a, int16x2_t __b) {
+  return __builtin_arm_shsax(__a, __b);
+}
+static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
+__shsub16(int16x2_t __a, int16x2_t __b) {
+  return __builtin_arm_shsub16(__a, __b);
+}
+static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
+__ssax(int16x2_t __a, int16x2_t __b) {
+  return __builtin_arm_ssax(__a, __b);
+}
+static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
+__ssub16(int16x2_t __a, int16x2_t __b) {
+  return __builtin_arm_ssub16(__a, __b);
+}
+static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
+__uadd16(uint16x2_t __a, uint16x2_t __b) {
+  return __builtin_arm_uadd16(__a, __b);
+}
+static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
+__uasx(uint16x2_t __a, uint16x2_t __b) {
+  return __builtin_arm_uasx(__a, __b);
+}
+static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
+__uhadd16(uint16x2_t __a, uint16x2_t __b) {
+  return __builtin_arm_uhadd16(__a, __b);
+}
+static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
+__uhasx(uint16x2_t __a, uint16x2_t __b) {
+  return __builtin_arm_uhasx(__a, __b);
+}
+static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
+__uhsax(uint16x2_t __a, uint16x2_t __b) {
+  return __builtin_arm_uhsax(__a, __b);
+}
+static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
+__uhsub16(uint16x2_t __a, uint16x2_t __b) {
+  return __builtin_arm_uhsub16(__a, __b);
+}
+static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
+__uqadd16(uint16x2_t __a, uint16x2_t __b) {
+  return __builtin_arm_uqadd16(__a, __b);
+}
+static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
+__uqasx(uint16x2_t __a, uint16x2_t __b) {
+  return __builtin_arm_uqasx(__a, __b);
+}
+static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
+__uqsax(uint16x2_t __a, uint16x2_t __b) {
+  return __builtin_arm_uqsax(__a, __b);
+}
+static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
+__uqsub16(uint16x2_t __a, uint16x2_t __b) {
+  return __builtin_arm_uqsub16(__a, __b);
+}
+static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
+__usax(uint16x2_t __a, uint16x2_t __b) {
+  return __builtin_arm_usax(__a, __b);
+}
+static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
+__usub16(uint16x2_t __a, uint16x2_t __b) {
+  return __builtin_arm_usub16(__a, __b);
+}
+#endif
+
+/* 9.5.10 Parallel 16-bit multiplications */
+#if __ARM_FEATURE_SIMD32
+static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
+__smlad(int16x2_t __a, int16x2_t __b, int32_t __c) {
+  return __builtin_arm_smlad(__a, __b, __c);
+}
+static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
+__smladx(int16x2_t __a, int16x2_t __b, int32_t __c) {
+  return __builtin_arm_smladx(__a, __b, __c);
+}
+static __inline__ int64_t __attribute__((__always_inline__, __nodebug__))
+__smlald(int16x2_t __a, int16x2_t __b, int64_t __c) {
+  return __builtin_arm_smlald(__a, __b, __c);
+}
+static __inline__ int64_t __attribute__((__always_inline__, __nodebug__))
+__smlaldx(int16x2_t __a, int16x2_t __b, int64_t __c) {
+  return __builtin_arm_smlaldx(__a, __b, __c);
+}
+static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
+__smlsd(int16x2_t __a, int16x2_t __b, int32_t __c) {
+  return __builtin_arm_smlsd(__a, __b, __c);
+}
+static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
+__smlsdx(int16x2_t __a, int16x2_t __b, int32_t __c) {
+  return __builtin_arm_smlsdx(__a, __b, __c);
+}
+static __inline__ int64_t __attribute__((__always_inline__, __nodebug__))
+__smlsld(int16x2_t __a, int16x2_t __b, int64_t __c) {
+  return __builtin_arm_smlsld(__a, __b, __c);
+}
+static __inline__ int64_t __attribute__((__always_inline__, __nodebug__))
+__smlsldx(int16x2_t __a, int16x2_t __b, int64_t __c) {
+  return __builtin_arm_smlsldx(__a, __b, __c);
+}
+static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
+__smuad(int16x2_t __a, int16x2_t __b) {
+  return __builtin_arm_smuad(__a, __b);
+}
+static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
+__smuadx(int16x2_t __a, int16x2_t __b) {
+  return __builtin_arm_smuadx(__a, __b);
+}
+static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
+__smusd(int16x2_t __a, int16x2_t __b) {
+  return __builtin_arm_smusd(__a, __b);
+}
+static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
+__smusdx(int16x2_t __a, int16x2_t __b) {
+  return __builtin_arm_smusdx(__a, __b);
+}
+#endif
+
 /* 9.7 CRC32 intrinsics */
 #if __ARM_FEATURE_CRC32
 static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))

@@ -832,7 +832,8 @@ _mm256_xor_si256(__m256i __a, __m256i __b)
 static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_stream_load_si256(__m256i const *__V)
 {
-  return (__m256i)__builtin_ia32_movntdqa256((const __v4di *)__V);
+  typedef __v4di __v4di_aligned __attribute__((aligned(32)));
+  return (__m256i)__builtin_nontemporal_load((const __v4di_aligned *)__V);
 }
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS

@@ -504,115 +504,91 @@ _mm512_maskz_abs_epi16 (__mmask32 __U, __m512i __A)
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_packs_epi32 (__m512i __A, __m512i __B)
+_mm512_packs_epi32(__m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_packssdw512_mask ((__v16si) __A,
-              (__v16si) __B,
-              (__v32hi) _mm512_setzero_hi(),
-              (__mmask32) -1);
+  return (__m512i)__builtin_ia32_packssdw512((__v16si)__A, (__v16si)__B);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_packs_epi32 (__mmask32 __M, __m512i __A, __m512i __B)
+_mm512_maskz_packs_epi32(__mmask32 __M, __m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_packssdw512_mask ((__v16si) __A,
-              (__v16si) __B,
-              (__v32hi) _mm512_setzero_hi(),
-              __M);
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M,
+                                       (__v32hi)_mm512_packs_epi32(__A, __B),
+                                       (__v32hi)_mm512_setzero_hi());
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_packs_epi32 (__m512i __W, __mmask32 __M, __m512i __A,
-       __m512i __B)
+_mm512_mask_packs_epi32(__m512i __W, __mmask32 __M, __m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_packssdw512_mask ((__v16si) __A,
-              (__v16si) __B,
-              (__v32hi) __W,
-              __M);
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M,
+                                       (__v32hi)_mm512_packs_epi32(__A, __B),
+                                       (__v32hi)__W);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_packs_epi16 (__m512i __A, __m512i __B)
+_mm512_packs_epi16(__m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_packsswb512_mask ((__v32hi) __A,
-              (__v32hi) __B,
-              (__v64qi) _mm512_setzero_qi(),
-              (__mmask64) -1);
+  return (__m512i)__builtin_ia32_packsswb512((__v32hi)__A, (__v32hi) __B);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_packs_epi16 (__m512i __W, __mmask64 __M, __m512i __A,
-       __m512i __B)
+_mm512_mask_packs_epi16(__m512i __W, __mmask64 __M, __m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_packsswb512_mask ((__v32hi) __A,
-              (__v32hi) __B,
-              (__v64qi) __W,
-              (__mmask64) __M);
+  return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
+                                        (__v64qi)_mm512_packs_epi16(__A, __B),
+                                        (__v64qi)__W);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_packs_epi16 (__mmask64 __M, __m512i __A, __m512i __B)
+_mm512_maskz_packs_epi16(__mmask64 __M, __m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_packsswb512_mask ((__v32hi) __A,
-              (__v32hi) __B,
-              (__v64qi) _mm512_setzero_qi(),
-              __M);
+  return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
+                                        (__v64qi)_mm512_packs_epi16(__A, __B),
+                                        (__v64qi)_mm512_setzero_qi());
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_packus_epi32 (__m512i __A, __m512i __B)
+_mm512_packus_epi32(__m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_packusdw512_mask ((__v16si) __A,
-              (__v16si) __B,
-              (__v32hi) _mm512_setzero_hi(),
-              (__mmask32) -1);
+  return (__m512i)__builtin_ia32_packusdw512((__v16si) __A, (__v16si) __B);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_packus_epi32 (__mmask32 __M, __m512i __A, __m512i __B)
+_mm512_maskz_packus_epi32(__mmask32 __M, __m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_packusdw512_mask ((__v16si) __A,
-              (__v16si) __B,
-              (__v32hi) _mm512_setzero_hi(),
-              __M);
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M,
+                                       (__v32hi)_mm512_packus_epi32(__A, __B),
+                                       (__v32hi)_mm512_setzero_hi());
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_packus_epi32 (__m512i __W, __mmask32 __M, __m512i __A,
-        __m512i __B)
+_mm512_mask_packus_epi32(__m512i __W, __mmask32 __M, __m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_packusdw512_mask ((__v16si) __A,
-              (__v16si) __B,
-              (__v32hi) __W,
-              __M);
+  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M,
+                                       (__v32hi)_mm512_packus_epi32(__A, __B),
+                                       (__v32hi)__W);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_packus_epi16 (__m512i __A, __m512i __B)
+_mm512_packus_epi16(__m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_packuswb512_mask ((__v32hi) __A,
-              (__v32hi) __B,
-              (__v64qi) _mm512_setzero_qi(),
-              (__mmask64) -1);
+  return (__m512i)__builtin_ia32_packuswb512((__v32hi) __A, (__v32hi) __B);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_packus_epi16 (__m512i __W, __mmask64 __M, __m512i __A,
-        __m512i __B)
+_mm512_mask_packus_epi16(__m512i __W, __mmask64 __M, __m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_packuswb512_mask ((__v32hi) __A,
-              (__v32hi) __B,
-              (__v64qi) __W,
-              (__mmask64) __M);
+  return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
+                                        (__v64qi)_mm512_packus_epi16(__A, __B),
+                                        (__v64qi)__W);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_packus_epi16 (__mmask64 __M, __m512i __A, __m512i __B)
+_mm512_maskz_packus_epi16(__mmask64 __M, __m512i __A, __m512i __B)
 {
-  return (__m512i) __builtin_ia32_packuswb512_mask ((__v32hi) __A,
-              (__v32hi) __B,
-              (__v64qi) _mm512_setzero_qi(),
-              (__mmask64) __M);
+  return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
+                                        (__v64qi)_mm512_packus_epi16(__A, __B),
+                                        (__v64qi)_mm512_setzero_qi());
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS

@@ -995,51 +995,50 @@ _mm512_maskz_broadcast_f32x2 (__mmask16 __M, __m128 __A)
 }
 
 static __inline__ __m512 __DEFAULT_FN_ATTRS
-_mm512_broadcast_f32x8 (__m256 __A)
+_mm512_broadcast_f32x8(__m256 __A)
 {
-  return (__m512) __builtin_ia32_broadcastf32x8_512_mask ((__v8sf) __A,
-                _mm512_undefined_ps(),
-                (__mmask16) -1);
+  return (__m512)__builtin_shufflevector((__v8sf)__A, (__v8sf)__A,
+                                         0, 1, 2, 3, 4, 5, 6, 7,
+                                         0, 1, 2, 3, 4, 5, 6, 7);
 }
 
 static __inline__ __m512 __DEFAULT_FN_ATTRS
-_mm512_mask_broadcast_f32x8 (__m512 __O, __mmask16 __M, __m256 __A)
+_mm512_mask_broadcast_f32x8(__m512 __O, __mmask16 __M, __m256 __A)
 {
-  return (__m512) __builtin_ia32_broadcastf32x8_512_mask ((__v8sf) __A,
-                (__v16sf)__O,
-                __M);
+  return (__m512)__builtin_ia32_selectps_512((__mmask8)__M,
+                                           (__v16sf)_mm512_broadcast_f32x8(__A),
+                                           (__v16sf)__O);
 }
 
 static __inline__ __m512 __DEFAULT_FN_ATTRS
-_mm512_maskz_broadcast_f32x8 (__mmask16 __M, __m256 __A)
+_mm512_maskz_broadcast_f32x8(__mmask16 __M, __m256 __A)
 {
-  return (__m512) __builtin_ia32_broadcastf32x8_512_mask ((__v8sf) __A,
-                (__v16sf)_mm512_setzero_ps (),
-                __M);
+  return (__m512)__builtin_ia32_selectps_512((__mmask8)__M,
+                                           (__v16sf)_mm512_broadcast_f32x8(__A),
+                                           (__v16sf)_mm512_setzero_ps());
 }
 
 static __inline__ __m512d __DEFAULT_FN_ATTRS
-_mm512_broadcast_f64x2 (__m128d __A)
+_mm512_broadcast_f64x2(__m128d __A)
 {
-  return (__m512d) __builtin_ia32_broadcastf64x2_512_mask ((__v2df) __A,
-                 (__v8df)_mm512_undefined_pd(),
-                 (__mmask8) -1);
+  return (__m512d)__builtin_shufflevector((__v2df)__A, (__v2df)__A,
+                                          0, 1, 0, 1, 0, 1, 0, 1);
 }
 
 static __inline__ __m512d __DEFAULT_FN_ATTRS
-_mm512_mask_broadcast_f64x2 (__m512d __O, __mmask8 __M, __m128d __A)
+_mm512_mask_broadcast_f64x2(__m512d __O, __mmask8 __M, __m128d __A)
 {
-  return (__m512d) __builtin_ia32_broadcastf64x2_512_mask ((__v2df) __A,
-                 (__v8df)
-                 __O, __M);
+  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__M,
+                                            (__v8df)_mm512_broadcast_f64x2(__A),
+                                            (__v8df)__O);
 }
 
 static __inline__ __m512d __DEFAULT_FN_ATTRS
-_mm512_maskz_broadcast_f64x2 (__mmask8 __M, __m128d __A)
+_mm512_maskz_broadcast_f64x2(__mmask8 __M, __m128d __A)
 {
-  return (__m512d) __builtin_ia32_broadcastf64x2_512_mask ((__v2df) __A,
-                 (__v8df)_mm512_setzero_ps (),
-                 __M);
+  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__M,
+                                            (__v8df)_mm512_broadcast_f64x2(__A),
+                                            (__v8df)_mm512_setzero_pd());
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
@@ -1067,52 +1066,50 @@ _mm512_maskz_broadcast_i32x2 (__mmask16 __M, __m128i __A)
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_broadcast_i32x8 (__m256i __A)
+_mm512_broadcast_i32x8(__m256i __A)
 {
-  return (__m512i) __builtin_ia32_broadcasti32x8_512_mask ((__v8si) __A,
-                 (__v16si)_mm512_setzero_si512(),
-                 (__mmask16) -1);
+  return (__m512i)__builtin_shufflevector((__v8si)__A, (__v8si)__A,
+                                          0, 1, 2, 3, 4, 5, 6, 7,
+                                          0, 1, 2, 3, 4, 5, 6, 7);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_broadcast_i32x8 (__m512i __O, __mmask16 __M, __m256i __A)
+_mm512_mask_broadcast_i32x8(__m512i __O, __mmask16 __M, __m256i __A)
 {
-  return (__m512i) __builtin_ia32_broadcasti32x8_512_mask ((__v8si) __A,
-                 (__v16si)__O,
-                 __M);
+  return (__m512i)__builtin_ia32_selectd_512((__mmask8)__M,
+                                           (__v16si)_mm512_broadcast_i32x8(__A),
+                                           (__v16si)__O);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_broadcast_i32x8 (__mmask16 __M, __m256i __A)
+_mm512_maskz_broadcast_i32x8(__mmask16 __M, __m256i __A)
 {
-  return (__m512i) __builtin_ia32_broadcasti32x8_512_mask ((__v8si) __A,
-                 (__v16si)
-                 _mm512_setzero_si512 (),
-                 __M);
+  return (__m512i)__builtin_ia32_selectd_512((__mmask8)__M,
+                                           (__v16si)_mm512_broadcast_i32x8(__A),
+                                           (__v16si)_mm512_setzero_si512());
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_broadcast_i64x2 (__m128i __A)
+_mm512_broadcast_i64x2(__m128i __A)
 {
-  return (__m512i) __builtin_ia32_broadcasti64x2_512_mask ((__v2di) __A,
-                 (__v8di)_mm512_setzero_si512(),
-                 (__mmask8) -1);
+  return (__m512i)__builtin_shufflevector((__v2di)__A, (__v2di)__A,
+                                          0, 1, 0, 1, 0, 1, 0, 1);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_broadcast_i64x2 (__m512i __O, __mmask8 __M, __m128i __A)
+_mm512_mask_broadcast_i64x2(__m512i __O, __mmask8 __M, __m128i __A)
 {
-  return (__m512i) __builtin_ia32_broadcasti64x2_512_mask ((__v2di) __A,
-                 (__v8di)
-                 __O, __M);
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
+                                            (__v8di)_mm512_broadcast_i64x2(__A),
+                                            (__v8di)__O);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_broadcast_i64x2 (__mmask8 __M, __m128i __A)
+_mm512_maskz_broadcast_i64x2(__mmask8 __M, __m128i __A)
 {
-  return (__m512i) __builtin_ia32_broadcasti64x2_512_mask ((__v2di) __A,
-                 (__v8di)_mm512_setzero_si512 (),
-                 __M);
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
+                                            (__v8di)_mm512_broadcast_i64x2(__A),
+                                            (__v8di)_mm512_setzero_si512());
 }
 
 #define _mm512_extractf32x8_ps(A, imm) __extension__ ({ \

@@ -528,6 +528,116 @@ _mm512_mask2int(__mmask16 __a)
   return (int)__a;
 }
 
+/// \brief Constructs a 512-bit floating-point vector of [8 x double] from a
+///    128-bit floating-point vector of [2 x double]. The lower 128 bits
+///    contain the value of the source vector. The upper 384 bits are set
+///    to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double].
+/// \returns A 512-bit floating-point vector of [8 x double]. The lower 128 bits
+///    contain the value of the parameter. The upper 384 bits are set to zero.
+static __inline __m512d __DEFAULT_FN_ATTRS
+_mm512_zextpd128_pd512(__m128d __a)
+{
+  return __builtin_shufflevector((__v2df)__a, (__v2df)_mm_setzero_pd(), 0, 1, 2, 3, 2, 3, 2, 3);
+}
+
+/// \brief Constructs a 512-bit floating-point vector of [8 x double] from a
+///    256-bit floating-point vector of [4 x double]. The lower 256 bits
+///    contain the value of the source vector. The upper 256 bits are set
+///    to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+///    A 256-bit vector of [4 x double].
+/// \returns A 512-bit floating-point vector of [8 x double]. The lower 256 bits
+///    contain the value of the parameter. The upper 256 bits are set to zero.
+static __inline __m512d __DEFAULT_FN_ATTRS
+_mm512_zextpd256_pd512(__m256d __a)
+{
+  return __builtin_shufflevector((__v4df)__a, (__v4df)_mm256_setzero_pd(), 0, 1, 2, 3, 4, 5, 6, 7);
+}
+
+/// \brief Constructs a 512-bit floating-point vector of [16 x float] from a
+///    128-bit floating-point vector of [4 x float]. The lower 128 bits contain
+///    the value of the source vector. The upper 384 bits are set to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float].
+/// \returns A 512-bit floating-point vector of [16 x float]. The lower 128 bits
+///    contain the value of the parameter. The upper 384 bits are set to zero.
+static __inline __m512 __DEFAULT_FN_ATTRS
+_mm512_zextps128_ps512(__m128 __a)
+{
+  return __builtin_shufflevector((__v4sf)__a, (__v4sf)_mm_setzero_ps(), 0, 1, 2, 3, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7);
+}
+
+/// \brief Constructs a 512-bit floating-point vector of [16 x float] from a
+///    256-bit floating-point vector of [8 x float]. The lower 256 bits contain
+///    the value of the source vector. The upper 256 bits are set to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+///    A 256-bit vector of [8 x float].
+/// \returns A 512-bit floating-point vector of [16 x float]. The lower 256 bits
+///    contain the value of the parameter. The upper 256 bits are set to zero.
+static __inline __m512 __DEFAULT_FN_ATTRS
+_mm512_zextps256_ps512(__m256 __a)
+{
+  return __builtin_shufflevector((__v8sf)__a, (__v8sf)_mm256_setzero_ps(), 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+}
+
+/// \brief Constructs a 512-bit integer vector from a 128-bit integer vector.
+///    The lower 128 bits contain the value of the source vector. The upper
+///    384 bits are set to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+///    A 128-bit integer vector.
+/// \returns A 512-bit integer vector. The lower 128 bits contain the value of
+///    the parameter. The upper 384 bits are set to zero.
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_zextsi128_si512(__m128i __a)
+{
+  return __builtin_shufflevector((__v2di)__a, (__v2di)_mm_setzero_si128(), 0, 1, 2, 3, 2, 3, 2, 3);
+}
+
+/// \brief Constructs a 512-bit integer vector from a 256-bit integer vector.
+///    The lower 256 bits contain the value of the source vector. The upper
+///    256 bits are set to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+///    A 256-bit integer vector.
+/// \returns A 512-bit integer vector. The lower 256 bits contain the value of
+///    the parameter. The upper 256 bits are set to zero.
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_zextsi256_si512(__m256i __a)
+{
+  return __builtin_shufflevector((__v4di)__a, (__v4di)_mm256_setzero_si256(), 0, 1, 2, 3, 4, 5, 6, 7);
+}
+
 /* Bitwise operators */
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_and_epi32(__m512i __a, __m512i __b)
@@ -4179,7 +4289,7 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_maskz_cvtps_epu32 ( __mmask16 __U, __m512 __A)
 {
   return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A,
-                  (__v16si) 
+                  (__v16si)
                   _mm512_setzero_si512 (),
                   (__mmask16) __U ,
                   _MM_FROUND_CUR_DIRECTION);
@@ -4229,6 +4339,18 @@ _mm512_maskz_cvtpd_epu32 (__mmask8 __U, __m512d __A)
                   _MM_FROUND_CUR_DIRECTION);
 }
 
+static __inline__ double __DEFAULT_FN_ATTRS
+_mm512_cvtsd_f64(__m512d __a)
+{
+  return __a[0];
+}
+
+static __inline__ float __DEFAULT_FN_ATTRS
+_mm512_cvtss_f32(__m512 __a)
+{
+  return __a[0];
+}
+
 /* Unpack and Interleave */
 
 static __inline __m512d __DEFAULT_FN_ATTRS
@@ -4540,7 +4662,7 @@ _mm512_maskz_loadu_pd(__mmask8 __U, void const *__P)
 }
 
 static __inline __m512d __DEFAULT_FN_ATTRS
-_mm512_loadu_pd(double const *__p)
+_mm512_loadu_pd(void const *__p)
 {
   struct __loadu_pd {
     __m512d __v;
@@ -4549,7 +4671,7 @@ _mm512_loadu_pd(double const *__p)
 }
 
 static __inline __m512 __DEFAULT_FN_ATTRS
-_mm512_loadu_ps(float const *__p)
+_mm512_loadu_ps(void const *__p)
 {
   struct __loadu_ps {
     __m512 __v;
@@ -4558,7 +4680,7 @@ _mm512_loadu_ps(float const *__p)
 }
 
 static __inline __m512 __DEFAULT_FN_ATTRS
-_mm512_load_ps(float const *__p)
+_mm512_load_ps(void const *__p)
 {
   return (__m512) __builtin_ia32_loadaps512_mask ((const __v16sf *)__p,
                                                   (__v16sf)
@@ -4584,7 +4706,7 @@ _mm512_maskz_load_ps(__mmask16 __U, void const *__P)
 }
 
 static __inline __m512d __DEFAULT_FN_ATTRS
-_mm512_load_pd(double const *__p)
+_mm512_load_pd(void const *__p)
 {
   return (__m512d) __builtin_ia32_loadapd512_mask ((const __v8df *)__p,
                                                    (__v8df)
@@ -7278,107 +7400,97 @@ _mm_maskz_sqrt_ss (__mmask8 __U, __m128 __A, __m128 __B)
                                            (__mmask8)(U), (int)(R)); })
 
 static __inline__ __m512 __DEFAULT_FN_ATTRS
-_mm512_broadcast_f32x4 (__m128 __A)
+_mm512_broadcast_f32x4(__m128 __A)
 {
-  return (__m512) __builtin_ia32_broadcastf32x4_512 ((__v4sf) __A,
-                 (__v16sf)
-                 _mm512_undefined_ps (),
-                 (__mmask16) -1);
+  return (__m512)__builtin_shufflevector((__v4sf)__A, (__v4sf)__A,
+                                         0, 1, 2, 3, 0, 1, 2, 3,
+                                         0, 1, 2, 3, 0, 1, 2, 3);
 }
 
 static __inline__ __m512 __DEFAULT_FN_ATTRS
-_mm512_mask_broadcast_f32x4 (__m512 __O, __mmask16 __M, __m128 __A)
+_mm512_mask_broadcast_f32x4(__m512 __O, __mmask16 __M, __m128 __A)
 {
-  return (__m512) __builtin_ia32_broadcastf32x4_512 ((__v4sf) __A,
-                 (__v16sf) __O,
-                 __M);
+  return (__m512)__builtin_ia32_selectps_512((__mmask16)__M,
+                                           (__v16sf)_mm512_broadcast_f32x4(__A),
+                                           (__v16sf)__O);
 }
 
 static __inline__ __m512 __DEFAULT_FN_ATTRS
-_mm512_maskz_broadcast_f32x4 (__mmask16 __M, __m128 __A)
+_mm512_maskz_broadcast_f32x4(__mmask16 __M, __m128 __A)
 {
-  return (__m512) __builtin_ia32_broadcastf32x4_512 ((__v4sf) __A,
-                 (__v16sf)
-                 _mm512_setzero_ps (),
-                 __M);
+  return (__m512)__builtin_ia32_selectps_512((__mmask16)__M,
+                                           (__v16sf)_mm512_broadcast_f32x4(__A),
+                                           (__v16sf)_mm512_setzero_ps());
 }
 
 static __inline__ __m512d __DEFAULT_FN_ATTRS
-_mm512_broadcast_f64x4 (__m256d __A)
+_mm512_broadcast_f64x4(__m256d __A)
 {
-  return (__m512d) __builtin_ia32_broadcastf64x4_512 ((__v4df) __A,
-                  (__v8df)
-                  _mm512_undefined_pd (),
-                  (__mmask8) -1);
+  return (__m512d)__builtin_shufflevector((__v4df)__A, (__v4df)__A,
+                                          0, 1, 2, 3, 0, 1, 2, 3);
 }
 
 static __inline__ __m512d __DEFAULT_FN_ATTRS
-_mm512_mask_broadcast_f64x4 (__m512d __O, __mmask8 __M, __m256d __A)
+_mm512_mask_broadcast_f64x4(__m512d __O, __mmask8 __M, __m256d __A)
 {
-  return (__m512d) __builtin_ia32_broadcastf64x4_512 ((__v4df) __A,
-                  (__v8df) __O,
-                  __M);
+  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__M,
+                                            (__v8df)_mm512_broadcast_f64x4(__A),
+                                            (__v8df)__O);
 }
 
 static __inline__ __m512d __DEFAULT_FN_ATTRS
-_mm512_maskz_broadcast_f64x4 (__mmask8 __M, __m256d __A)
+_mm512_maskz_broadcast_f64x4(__mmask8 __M, __m256d __A)
 {
-  return (__m512d) __builtin_ia32_broadcastf64x4_512 ((__v4df) __A,
-                  (__v8df)
-                  _mm512_setzero_pd (),
-                  __M);
+  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__M,
+                                            (__v8df)_mm512_broadcast_f64x4(__A),
+                                            (__v8df)_mm512_setzero_pd());
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_broadcast_i32x4 (__m128i __A)
+_mm512_broadcast_i32x4(__m128i __A)
 {
-  return (__m512i) __builtin_ia32_broadcasti32x4_512 ((__v4si) __A,
-                  (__v16si)
-                  _mm512_undefined_epi32 (),
-                  (__mmask16) -1);
+  return (__m512i)__builtin_shufflevector((__v4si)__A, (__v4si)__A,
+                                          0, 1, 2, 3, 0, 1, 2, 3,
+                                          0, 1, 2, 3, 0, 1, 2, 3);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_broadcast_i32x4 (__m512i __O, __mmask16 __M, __m128i __A)
+_mm512_mask_broadcast_i32x4(__m512i __O, __mmask16 __M, __m128i __A)
 {
-  return (__m512i) __builtin_ia32_broadcasti32x4_512 ((__v4si) __A,
-                  (__v16si) __O,
-                  __M);
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
+                                           (__v16si)_mm512_broadcast_i32x4(__A),
+                                           (__v16si)__O);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_broadcast_i32x4 (__mmask16 __M, __m128i __A)
+_mm512_maskz_broadcast_i32x4(__mmask16 __M, __m128i __A)
 {
-  return (__m512i) __builtin_ia32_broadcasti32x4_512 ((__v4si) __A,
-                  (__v16si)
-                  _mm512_setzero_si512 (),
-                  __M);
+  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
+                                           (__v16si)_mm512_broadcast_i32x4(__A),
+                                           (__v16si)_mm512_setzero_si512());
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_broadcast_i64x4 (__m256i __A)
+_mm512_broadcast_i64x4(__m256i __A)
 {
-  return (__m512i) __builtin_ia32_broadcasti64x4_512 ((__v4di) __A,
-                  (__v8di)
-                  _mm512_undefined_epi32 (),
-                  (__mmask8) -1);
+  return (__m512i)__builtin_shufflevector((__v4di)__A, (__v4di)__A,
+                                          0, 1, 2, 3, 0, 1, 2, 3);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_broadcast_i64x4 (__m512i __O, __mmask8 __M, __m256i __A)
+_mm512_mask_broadcast_i64x4(__m512i __O, __mmask8 __M, __m256i __A)
 {
-  return (__m512i) __builtin_ia32_broadcasti64x4_512 ((__v4di) __A,
-                  (__v8di) __O,
-                  __M);
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
+                                            (__v8di)_mm512_broadcast_i64x4(__A),
+                                            (__v8di)__O);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_broadcast_i64x4 (__mmask8 __M, __m256i __A)
+_mm512_maskz_broadcast_i64x4(__mmask8 __M, __m256i __A)
 {
-  return (__m512i) __builtin_ia32_broadcasti64x4_512 ((__v4di) __A,
-                  (__v8di)
-                  _mm512_setzero_si512 (),
-                  __M);
+  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
+                                            (__v8di)_mm512_broadcast_i64x4(__A),
+                                            (__v8di)_mm512_setzero_si512());
 }
 
 static __inline__ __m512d __DEFAULT_FN_ATTRS
@@ -7860,12 +7972,12 @@ _mm512_mask_cvtepi64_storeu_epi16 (void *__P, __mmask8 __M, __m512i __A)
                                    3 + ((imm) & 0x3) * 4); })
 
 #define _mm512_mask_extracti32x4_epi32(W, U, A, imm) __extension__ ({ \
-  (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, \
+  (__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
                                 (__v4si)_mm512_extracti32x4_epi32((A), (imm)), \
-                                (__v4si)__W); })
+                                (__v4si)(W)); })
 
 #define _mm512_maskz_extracti32x4_epi32(U, A, imm) __extension__ ({ \
-  (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, \
+  (__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
                                 (__v4si)_mm512_extracti32x4_epi32((A), (imm)), \
                                 (__v4si)_mm_setzero_si128()); })
 
@@ -7878,12 +7990,12 @@ _mm512_mask_cvtepi64_storeu_epi16 (void *__P, __mmask8 __M, __m512i __A)
                                    ((imm) & 1) ? 7 : 3); })
 
 #define _mm512_mask_extracti64x4_epi64(W, U, A, imm) __extension__ ({ \
-  (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,      \
+  (__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
                                 (__v4di)_mm512_extracti64x4_epi64((A), (imm)), \
-                                (__v4di)__W); })
+                                (__v4di)(W)); })
 
 #define _mm512_maskz_extracti64x4_epi64(U, A, imm) __extension__ ({ \
-  (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,      \
+  (__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
                                 (__v4di)_mm512_extracti64x4_epi64((A), (imm)), \
                                 (__v4di)_mm256_setzero_si256()); })
 
@@ -8159,11 +8271,11 @@ _mm512_maskz_getexp_ps (__mmask16 __U, __m512 __A)
                                        (__v8di)(__m512i)(index), (__mmask8)-1, \
                                        (int)(scale)); })
 
-#define _mm512_mask_i64gather_ps( __v1_old, __mask, __index,\
-                                  __addr, __scale) __extension__({\
-__builtin_ia32_gatherdiv16sf ((__v8sf) __v1_old,\
-                              __addr,(__v8di) __index, __mask, __scale);\
-})
+#define _mm512_mask_i64gather_ps(v1_old, mask, index, addr, scale) __extension__({\
+  (__m256)__builtin_ia32_gatherdiv16sf((__v8sf)(__m256)(v1_old),\
+                                       (float const *)(addr), \
+                                       (__v8di)(__m512i)(index), \
+                                       (__mmask8)(mask), (int)(scale)); })
 
 #define _mm512_i64gather_epi32(index, addr, scale) __extension__ ({\
   (__m256i)__builtin_ia32_gatherdiv16si((__v8si)_mm256_undefined_ps(), \
@@ -8858,6 +8970,8 @@ _mm512_permutexvar_epi32 (__m512i __X, __m512i __Y)
                  (__mmask16) -1);
 }
 
+#define _mm512_permutevar_epi32 _mm512_permutexvar_epi32
+
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_mask_permutexvar_epi32 (__m512i __W, __mmask16 __M, __m512i __X,
              __m512i __Y)
@@ -8868,6 +8982,8 @@ _mm512_mask_permutexvar_epi32 (__m512i __W, __mmask16 __M, __m512i __X,
                  __M);
 }
 
+#define _mm512_mask_permutevar_epi32 _mm512_mask_permutexvar_epi32
+
 static __inline__ __mmask16 __DEFAULT_FN_ATTRS
 _mm512_kand (__mmask16 __A, __mmask16 __B)
 {
@@ -8919,25 +9035,29 @@ _mm512_kxor (__mmask16 __A, __mmask16 __B)
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm512_stream_si512 (__m512i * __P, __m512i __A)
 {
-  __builtin_nontemporal_store((__v8di)__A, (__v8di*)__P);
+  typedef __v8di __v8di_aligned __attribute__((aligned(64)));
+  __builtin_nontemporal_store((__v8di_aligned)__A, (__v8di_aligned*)__P);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_stream_load_si512 (void *__P)
 {
-  return __builtin_ia32_movntdqa512 ((__v8di *)__P);
+  typedef __v8di __v8di_aligned __attribute__((aligned(64)));
+  return (__m512i) __builtin_nontemporal_load((const __v8di_aligned *)__P);
 }
 
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm512_stream_pd (double *__P, __m512d __A)
 {
-  __builtin_nontemporal_store((__v8df)__A, (__v8df*)__P);
+  typedef __v8df __v8df_aligned __attribute__((aligned(64)));
+  __builtin_nontemporal_store((__v8df_aligned)__A, (__v8df_aligned*)__P);
 }
 
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm512_stream_ps (float *__P, __m512 __A)
 {
-  __builtin_nontemporal_store((__v16sf)__A, (__v16sf*)__P);
+  typedef __v16sf __v16sf_aligned __attribute__((aligned(64)));
+  __builtin_nontemporal_store((__v16sf_aligned)__A, (__v16sf_aligned*)__P);
 }
 
 static __inline__ __m512d __DEFAULT_FN_ATTRS
@@ -9101,39 +9221,39 @@ _mm512_maskz_moveldup_ps (__mmask16 __U, __m512 __A)
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_mask_move_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
 {
-  __m128 res = __A; 
+  __m128 res = __A;
   res[0] = (__U & 1) ? __B[0] : __W[0];
-  return res; 
+  return res;
 }
 
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_maskz_move_ss (__mmask8 __U, __m128 __A, __m128 __B)
 {
-  __m128 res = __A; 
-  res[0] = (__U & 1) ? __B[0] : 0; 
-  return res; 
+  __m128 res = __A;
+  res[0] = (__U & 1) ? __B[0] : 0;
+  return res;
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_mask_move_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
 {
-  __m128d res = __A; 
+  __m128d res = __A;
   res[0] = (__U & 1) ? __B[0] : __W[0];
-  return res; 
+  return res;
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_maskz_move_sd (__mmask8 __U, __m128d __A, __m128d __B)
 {
-  __m128d res = __A; 
-  res[0] = (__U & 1) ? __B[0] : 0; 
-  return res; 
+  __m128d res = __A;
+  res[0] = (__U & 1) ? __B[0] : 0;
+  return res;
 }
 
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm_mask_store_ss (float * __W, __mmask8 __U, __m128 __A)
 {
-  __builtin_ia32_storess128_mask ((__v16sf *)__W, 
+  __builtin_ia32_storess128_mask ((__v16sf *)__W,
                 (__v16sf) _mm512_castps128_ps512(__A),
                 (__mmask16) __U & (__mmask16)1);
 }
@@ -9141,7 +9261,7 @@ _mm_mask_store_ss (float * __W, __mmask8 __U, __m128 __A)
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm_mask_store_sd (double * __W, __mmask8 __U, __m128d __A)
 {
-  __builtin_ia32_storesd128_mask ((__v8df *)__W, 
+  __builtin_ia32_storesd128_mask ((__v8df *)__W,
                 (__v8df) _mm512_castpd128_pd512(__A),
                 (__mmask8) __U & 1);
 }
@@ -9490,7 +9610,7 @@ _mm_mask_cvtsd_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128d __B)
 {
   return __builtin_ia32_cvtsd2ss_round_mask ((__v4sf)(__A),
                                              (__v2df)(__B),
-                                             (__v4sf)(__W), 
+                                             (__v4sf)(__W),
                                              (__mmask8)(__U), _MM_FROUND_CUR_DIRECTION);
 }
 
@@ -9499,7 +9619,7 @@ _mm_maskz_cvtsd_ss (__mmask8 __U, __m128 __A, __m128d __B)
 {
   return __builtin_ia32_cvtsd2ss_round_mask ((__v4sf)(__A),
                                              (__v2df)(__B),
-                                             (__v4sf)_mm_setzero_ps(), 
+                                             (__v4sf)_mm_setzero_ps(),
                                              (__mmask8)(__U), _MM_FROUND_CUR_DIRECTION);
 }
 
@@ -9564,7 +9684,7 @@ _mm_mask_cvtss_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128 __B)
   return __builtin_ia32_cvtss2sd_round_mask((__v2df)(__A),
                                               (__v4sf)(__B),
                                               (__v2df)(__W),
-                                              (__mmask8)(__U), _MM_FROUND_CUR_DIRECTION); 
+                                              (__mmask8)(__U), _MM_FROUND_CUR_DIRECTION);
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS
@@ -9572,8 +9692,8 @@ _mm_maskz_cvtss_sd (__mmask8 __U, __m128d __A, __m128 __B)
 {
   return __builtin_ia32_cvtss2sd_round_mask((__v2df)(__A),
                                               (__v4sf)(__B),
-                                              (__v2df)_mm_setzero_pd(), 
-                                              (__mmask8)(__U), _MM_FROUND_CUR_DIRECTION); 
+                                              (__v2df)_mm_setzero_pd(),
+                                              (__mmask8)(__U), _MM_FROUND_CUR_DIRECTION);
 }
 
 static __inline__ __m128d __DEFAULT_FN_ATTRS
@@ -9635,6 +9755,45 @@ _mm512_mask_set1_epi64 (__m512i __O, __mmask8 __M, long long __A)
 }
 #endif
 
+static  __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_set_epi8 (char __e63, char __e62, char __e61, char __e60, char __e59,
+    char __e58, char __e57, char __e56, char __e55, char __e54, char __e53,
+    char __e52, char __e51, char __e50, char __e49, char __e48, char __e47,
+    char __e46, char __e45, char __e44, char __e43, char __e42, char __e41,
+    char __e40, char __e39, char __e38, char __e37, char __e36, char __e35,
+    char __e34, char __e33, char __e32, char __e31, char __e30, char __e29,
+    char __e28, char __e27, char __e26, char __e25, char __e24, char __e23,
+    char __e22, char __e21, char __e20, char __e19, char __e18, char __e17,
+    char __e16, char __e15, char __e14, char __e13, char __e12, char __e11,
+    char __e10, char __e9, char __e8, char __e7, char __e6, char __e5,
+    char __e4, char __e3, char __e2, char __e1, char __e0) {
+
+  return __extension__ (__m512i)(__v64qi)
+    {__e0, __e1, __e2, __e3, __e4, __e5, __e6, __e7,
+     __e8, __e9, __e10, __e11, __e12, __e13, __e14, __e15,
+     __e16, __e17, __e18, __e19, __e20, __e21, __e22, __e23,
+     __e24, __e25, __e26, __e27, __e28, __e29, __e30, __e31,
+     __e32, __e33, __e34, __e35, __e36, __e37, __e38, __e39,
+     __e40, __e41, __e42, __e43, __e44, __e45, __e46, __e47,
+     __e48, __e49, __e50, __e51, __e52, __e53, __e54, __e55,
+     __e56, __e57, __e58, __e59, __e60, __e61, __e62, __e63};
+}
+
+static  __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_set_epi16(short __e31, short __e30, short __e29, short __e28,
+    short __e27, short __e26, short __e25, short __e24, short __e23,
+    short __e22, short __e21, short __e20, short __e19, short __e18,
+    short __e17, short __e16, short __e15, short __e14, short __e13,
+    short __e12, short __e11, short __e10, short __e9, short __e8,
+    short __e7, short __e6, short __e5, short __e4, short __e3,
+    short __e2, short __e1, short __e0) {
+  return __extension__ (__m512i)(__v32hi)
+    {__e0, __e1, __e2, __e3, __e4, __e5, __e6, __e7,
+     __e8, __e9, __e10, __e11, __e12, __e13, __e14, __e15,
+     __e16, __e17, __e18, __e19, __e20, __e21, __e22, __e23,
+     __e24, __e25, __e26, __e27, __e28, __e29, __e30, __e31 };
+}
+
 static __inline __m512i __DEFAULT_FN_ATTRS
 _mm512_set_epi32 (int __A, int __B, int __C, int __D,
      int __E, int __F, int __G, int __H,
@@ -9780,7 +9939,7 @@ static __inline__ double __DEFAULT_FN_ATTRS _mm512_reduce_mul_pd(__m512d __W) {
 }
 
 // Vec512 - Vector with size 512.
-// Vec512Neutral - All vector elements set to the identity element. 
+// Vec512Neutral - All vector elements set to the identity element.
 // Identity element: {+,0},{*,1},{&,0xFFFFFFFFFFFFFFFF},{|,0}
 // Operator - Can be one of following: +,*,&,|
 // Mask - Intrinsic Mask
@@ -9810,19 +9969,19 @@ _mm512_mask_reduce_mul_epi64(__mmask8 __M, __m512i __W) {
 
 static __inline__ long long __DEFAULT_FN_ATTRS
 _mm512_mask_reduce_and_epi64(__mmask8 __M, __m512i __W) {
-  _mm512_mask_reduce_operator_64bit(__W, _mm512_set1_epi64(0xFFFFFFFFFFFFFFFF), 
+  _mm512_mask_reduce_operator_64bit(__W, _mm512_set1_epi64(0xFFFFFFFFFFFFFFFF),
                                     &, __M,  i, i, q);
 }
 
 static __inline__ long long __DEFAULT_FN_ATTRS
 _mm512_mask_reduce_or_epi64(__mmask8 __M, __m512i __W) {
-  _mm512_mask_reduce_operator_64bit(__W, _mm512_set1_epi64(0), |, __M, 
+  _mm512_mask_reduce_operator_64bit(__W, _mm512_set1_epi64(0), |, __M,
                                     i, i, q);
 }
 
 static __inline__ double __DEFAULT_FN_ATTRS
 _mm512_mask_reduce_add_pd(__mmask8 __M, __m512d __W) {
-  _mm512_mask_reduce_operator_64bit(__W, _mm512_set1_pd(0), +, __M, 
+  _mm512_mask_reduce_operator_64bit(__W, _mm512_set1_pd(0), +, __M,
                                     f, d, pd);
 }
 
@@ -9884,17 +10043,17 @@ _mm512_reduce_add_epi32(__m512i __W) {
   _mm512_reduce_operator_32bit(__W, +, i, i);
 }
 
-static __inline__ int __DEFAULT_FN_ATTRS 
+static __inline__ int __DEFAULT_FN_ATTRS
 _mm512_reduce_mul_epi32(__m512i __W) {
   _mm512_reduce_operator_32bit(__W, *, i, i);
 }
 
-static __inline__ int __DEFAULT_FN_ATTRS 
+static __inline__ int __DEFAULT_FN_ATTRS
 _mm512_reduce_and_epi32(__m512i __W) {
   _mm512_reduce_operator_32bit(__W, &, i, i);
 }
 
-static __inline__ int __DEFAULT_FN_ATTRS 
+static __inline__ int __DEFAULT_FN_ATTRS
 _mm512_reduce_or_epi32(__m512i __W) {
   _mm512_reduce_operator_32bit(__W, |, i, i);
 }
@@ -9910,7 +10069,7 @@ _mm512_reduce_mul_ps(__m512 __W) {
 }
 
 // Vec512 - Vector with size 512.
-// Vec512Neutral - All vector elements set to the identity element. 
+// Vec512Neutral - All vector elements set to the identity element.
 // Identity element: {+,0},{*,1},{&,0xFFFFFFFF},{|,0}
 // Operator - Can be one of following: +,*,&,|
 // Mask - Intrinsic Mask
@@ -9940,7 +10099,7 @@ _mm512_mask_reduce_mul_epi32( __mmask16 __M, __m512i __W) {
 
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm512_mask_reduce_and_epi32( __mmask16 __M, __m512i __W) {
-  _mm512_mask_reduce_operator_32bit(__W, _mm512_set1_epi32(0xFFFFFFFF), &, __M, 
+  _mm512_mask_reduce_operator_32bit(__W, _mm512_set1_epi32(0xFFFFFFFF), &, __M,
                                     i, i, d);
 }
 
@@ -10003,7 +10162,7 @@ _mm512_mask_reduce_mul_ps(__mmask16 __M, __m512 __W) {
     return Vec512[0];                                                          \
   })
 
-static __inline__ long long __DEFAULT_FN_ATTRS 
+static __inline__ long long __DEFAULT_FN_ATTRS
 _mm512_reduce_max_epi64(__m512i __V) {
   _mm512_reduce_maxMin_64bit(__V, max_epi64, i, i);
 }
@@ -10013,7 +10172,7 @@ _mm512_reduce_max_epu64(__m512i __V) {
   _mm512_reduce_maxMin_64bit(__V, max_epu64, i, i);
 }
 
-static __inline__ double __DEFAULT_FN_ATTRS 
+static __inline__ double __DEFAULT_FN_ATTRS
 _mm512_reduce_max_pd(__m512d __V) {
   _mm512_reduce_maxMin_64bit(__V, max_pd, d, f);
 }
@@ -10028,7 +10187,7 @@ _mm512_reduce_min_epu64(__m512i __V) {
   _mm512_reduce_maxMin_64bit(__V, min_epu64, i, i);
 }
 
-static __inline__ double __DEFAULT_FN_ATTRS 
+static __inline__ double __DEFAULT_FN_ATTRS
 _mm512_reduce_min_pd(__m512d __V) {
   _mm512_reduce_maxMin_64bit(__V, min_pd, d, f);
 }

@@ -1000,27 +1000,26 @@ _mm256_maskz_broadcast_f32x2 (__mmask8 __M, __m128 __A)
 }
 
 static __inline__ __m256d __DEFAULT_FN_ATTRS
-_mm256_broadcast_f64x2 (__m128d __A)
+_mm256_broadcast_f64x2(__m128d __A)
 {
-  return (__m256d) __builtin_ia32_broadcastf64x2_256_mask ((__v2df) __A,
-                 (__v4df)_mm256_undefined_pd(),
-                 (__mmask8) -1);
+  return (__m256d)__builtin_shufflevector((__v2df)__A, (__v2df)__A,
+                                          0, 1, 0, 1);
 }
 
 static __inline__ __m256d __DEFAULT_FN_ATTRS
-_mm256_mask_broadcast_f64x2 (__m256d __O, __mmask8 __M, __m128d __A)
+_mm256_mask_broadcast_f64x2(__m256d __O, __mmask8 __M, __m128d __A)
 {
-  return (__m256d) __builtin_ia32_broadcastf64x2_256_mask ((__v2df) __A,
-                 (__v4df) __O,
-                 __M);
+  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__M,
+                                            (__v4df)_mm256_broadcast_f64x2(__A),
+                                            (__v4df)__O);
 }
 
 static __inline__ __m256d __DEFAULT_FN_ATTRS
 _mm256_maskz_broadcast_f64x2 (__mmask8 __M, __m128d __A)
 {
-  return (__m256d) __builtin_ia32_broadcastf64x2_256_mask ((__v2df) __A,
-                 (__v4df) _mm256_setzero_ps (),
-                 __M);
+  return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__M,
+                                            (__v4df)_mm256_broadcast_f64x2(__A),
+                                            (__v4df)_mm256_setzero_pd());
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS
@@ -1072,27 +1071,26 @@ _mm256_maskz_broadcast_i32x2 (__mmask8 __M, __m128i __A)
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_broadcast_i64x2 (__m128i __A)
+_mm256_broadcast_i64x2(__m128i __A)
 {
-  return (__m256i) __builtin_ia32_broadcasti64x2_256_mask ((__v2di) __A,
-                 (__v4di)_mm256_undefined_si256(),
-                 (__mmask8) -1);
+  return (__m256i)__builtin_shufflevector((__v2di)__A, (__v2di)__A,
+                                          0, 1, 0, 1);
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_broadcast_i64x2 (__m256i __O, __mmask8 __M, __m128i __A)
+_mm256_mask_broadcast_i64x2(__m256i __O, __mmask8 __M, __m128i __A)
 {
-  return (__m256i) __builtin_ia32_broadcasti64x2_256_mask ((__v2di) __A,
-                 (__v4di) __O,
-                 __M);
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M,
+                                            (__v4di)_mm256_broadcast_i64x2(__A),
+                                            (__v4di)__O);
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_maskz_broadcast_i64x2 (__mmask8 __M, __m128i __A)
 {
-  return (__m256i) __builtin_ia32_broadcasti64x2_256_mask ((__v2di) __A,
-                 (__v4di) _mm256_setzero_si256 (),
-                 __M);
+  return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M,
+                                            (__v4di)_mm256_broadcast_i64x2(__A),
+                                            (__v4di)_mm256_setzero_si256());
 }
 
 #define _mm256_extractf64x2_pd(A, imm) __extension__ ({ \

@@ -7189,52 +7189,49 @@ _mm256_maskz_rsqrt14_ps (__mmask8 __U, __m256 __A)
 }
 
 static __inline__ __m256 __DEFAULT_FN_ATTRS
-_mm256_broadcast_f32x4 (__m128 __A)
+_mm256_broadcast_f32x4(__m128 __A)
 {
-  return (__m256) __builtin_ia32_broadcastf32x4_256_mask ((__v4sf) __A,
-                (__v8sf)_mm256_undefined_pd (),
-                (__mmask8) -1);
+  return (__m256)__builtin_shufflevector((__v4sf)__A, (__v4sf)__A,
+                                         0, 1, 2, 3, 0, 1, 2, 3);
 }
 
 static __inline__ __m256 __DEFAULT_FN_ATTRS
-_mm256_mask_broadcast_f32x4 (__m256 __O, __mmask8 __M, __m128 __A)
+_mm256_mask_broadcast_f32x4(__m256 __O, __mmask8 __M, __m128 __A)
 {
-  return (__m256) __builtin_ia32_broadcastf32x4_256_mask ((__v4sf) __A,
-                (__v8sf) __O,
-                __M);
+  return (__m256)__builtin_ia32_selectps_256((__mmask8)__M,
+                                            (__v8sf)_mm256_broadcast_f32x4(__A),
+                                            (__v8sf)__O);
 }
 
 static __inline__ __m256 __DEFAULT_FN_ATTRS
 _mm256_maskz_broadcast_f32x4 (__mmask8 __M, __m128 __A)
 {
-  return (__m256) __builtin_ia32_broadcastf32x4_256_mask ((__v4sf) __A,
-                (__v8sf) _mm256_setzero_ps (),
-                __M);
+  return (__m256)__builtin_ia32_selectps_256((__mmask8)__M,
+                                            (__v8sf)_mm256_broadcast_f32x4(__A),
+                                            (__v8sf)_mm256_setzero_ps());
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_broadcast_i32x4 (__m128i __A)
+_mm256_broadcast_i32x4(__m128i __A)
 {
-  return (__m256i) __builtin_ia32_broadcasti32x4_256_mask ((__v4si) __A,
-                 (__v8si)_mm256_undefined_si256 (),
-                 (__mmask8) -1);
+  return (__m256i)__builtin_shufflevector((__v4si)__A, (__v4si)__A,
+                                          0, 1, 2, 3, 0, 1, 2, 3);
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_mask_broadcast_i32x4 (__m256i __O, __mmask8 __M, __m128i __A)
+_mm256_mask_broadcast_i32x4(__m256i __O, __mmask8 __M, __m128i __A)
 {
-  return (__m256i) __builtin_ia32_broadcasti32x4_256_mask ((__v4si) __A,
-                 (__v8si)
-                 __O, __M);
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
+                                            (__v8si)_mm256_broadcast_i32x4(__A),
+                                            (__v8si)__O);
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_maskz_broadcast_i32x4 (__mmask8 __M, __m128i __A)
+_mm256_maskz_broadcast_i32x4(__mmask8 __M, __m128i __A)
 {
-  return (__m256i) __builtin_ia32_broadcasti32x4_256_mask ((__v4si)
-                 __A,
-                 (__v8si) _mm256_setzero_si256 (),
-                 __M);
+  return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
+                                            (__v8si)_mm256_broadcast_i32x4(__A),
+                                            (__v8si)_mm256_setzero_si256());
 }
 
 static __inline__ __m256d __DEFAULT_FN_ATTRS

@@ -0,0 +1,70 @@
+/*===------------- avx512vpopcntdqintrin.h - AVX512VPOPCNTDQ intrinsics
+ *------------------===
+ *
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error                                                                         \
+    "Never use <avx512vpopcntdqintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVX512VPOPCNTDQINTRIN_H
+#define __AVX512VPOPCNTDQINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS                                                     \
+  __attribute__((__always_inline__, __nodebug__, __target__("avx512vpopcntd"   \
+                                                            "q")))
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_popcnt_epi64(__m512i __A) {
+  return (__m512i)__builtin_ia32_vpopcntq_512((__v8di)__A);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_popcnt_epi64(__m512i __W, __mmask8 __U, __m512i __A) {
+  return (__m512i)__builtin_ia32_selectq_512(
+      (__mmask8)__U, (__v8di)_mm512_popcnt_epi64(__A), (__v8di)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_popcnt_epi64(__mmask8 __U, __m512i __A) {
+  return _mm512_mask_popcnt_epi64((__m512i)_mm512_setzero_si512(), __U, __A);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_popcnt_epi32(__m512i __A) {
+  return (__m512i)__builtin_ia32_vpopcntd_512((__v16si)__A);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_popcnt_epi32(__m512i __W, __mmask16 __U, __m512i __A) {
+  return (__m512i)__builtin_ia32_selectd_512(
+      (__mmask16)__U, (__v16si)_mm512_popcnt_epi32(__A), (__v16si)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_popcnt_epi32(__mmask16 __U, __m512i __A) {
+  return _mm512_mask_popcnt_epi32((__m512i)_mm512_setzero_si512(), __U, __A);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif

@@ -1458,12 +1458,13 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
 /// \brief Computes two dot products in parallel, using the lower and upper
 ///    halves of two [8 x float] vectors as input to the two computations, and
 ///    returning the two dot products in the lower and upper halves of the
-///    [8 x float] result. The immediate integer operand controls which input
-///    elements will contribute to the dot product, and where the final results
-///    are returned. In general, for each dot product, the four corresponding
-///    elements of the input vectors are multiplied; the first two and second
-///    two products are summed, then the two sums are added to form the final
-///    result.
+///    [8 x float] result.
+///
+///    The immediate integer operand controls which input elements will
+///    contribute to the dot product, and where the final results are returned.
+///    In general, for each dot product, the four corresponding elements of the
+///    input vectors are multiplied; the first two and second two products are
+///    summed, then the two sums are added to form the final result.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -1497,15 +1498,16 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
 
 /* Vector shuffle */
 /// \brief Selects 8 float values from the 256-bit operands of [8 x float], as
-///    specified by the immediate value operand. The four selected elements in
-///    each operand are copied to the destination according to the bits
-///    specified in the immediate operand. The selected elements from the first
-///    256-bit operand are copied to bits [63:0] and bits [191:128] of the
-///    destination, and the selected elements from the second 256-bit operand
-///    are copied to bits [127:64] and bits [255:192] of the destination. For
-///    example, if bits [7:0] of the immediate operand contain a value of 0xFF,
-///    the 256-bit destination vector would contain the following values: b[7],
-///    b[7], a[7], a[7], b[3], b[3], a[3], a[3].
+///    specified by the immediate value operand.
+///
+///    The four selected elements in each operand are copied to the destination
+///    according to the bits specified in the immediate operand. The selected
+///    elements from the first 256-bit operand are copied to bits [63:0] and
+///    bits [191:128] of the destination, and the selected elements from the
+///    second 256-bit operand are copied to bits [127:64] and bits [255:192] of
+///    the destination. For example, if bits [7:0] of the immediate operand
+///    contain a value of 0xFF, the 256-bit destination vector would contain the
+///    following values: b[7], b[7], a[7], a[7], b[3], b[3], a[3], a[3].
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -1557,13 +1559,14 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
                                   12 + (((mask) >> 6) & 0x3)); })
 
 /// \brief Selects four double-precision values from the 256-bit operands of
-///    [4 x double], as specified by the immediate value operand. The selected
-///    elements from the first 256-bit operand are copied to bits [63:0] and
-///    bits [191:128] in the destination, and the selected elements from the
-///    second 256-bit operand are copied to bits [127:64] and bits [255:192] in
-///    the destination. For example, if bits [3:0] of the immediate operand
-///    contain a value of 0xF, the 256-bit destination vector would contain the
-///    following values: b[3], a[3], b[1], a[1].
+///    [4 x double], as specified by the immediate value operand.
+///
+///    The selected elements from the first 256-bit operand are copied to bits
+///    [63:0] and bits [191:128] in the destination, and the selected elements
+///    from the second 256-bit operand are copied to bits [127:64] and bits
+///    [255:192] in the destination. For example, if bits [3:0] of the immediate
+///    operand contain a value of 0xF, the 256-bit destination vector would
+///    contain the following values: b[3], a[3], b[1], a[1].
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -1613,9 +1616,9 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
 #define _CMP_NEQ_UQ   0x04 /* Not-equal (unordered, non-signaling)  */
 #define _CMP_NLT_US   0x05 /* Not-less-than (unordered, signaling)  */
 #define _CMP_NLE_US   0x06 /* Not-less-than-or-equal (unordered, signaling)  */
-#define _CMP_ORD_Q    0x07 /* Ordered (nonsignaling)   */
+#define _CMP_ORD_Q    0x07 /* Ordered (non-signaling)   */
 #define _CMP_EQ_UQ    0x08 /* Equal (unordered, non-signaling)  */
-#define _CMP_NGE_US   0x09 /* Not-greater-than-or-equal (unord, signaling)  */
+#define _CMP_NGE_US   0x09 /* Not-greater-than-or-equal (unordered, signaling)  */
 #define _CMP_NGT_US   0x0a /* Not-greater-than (unordered, signaling)  */
 #define _CMP_FALSE_OQ 0x0b /* False (ordered, non-signaling)  */
 #define _CMP_NEQ_OQ   0x0c /* Not-equal (ordered, non-signaling)  */
@@ -1628,10 +1631,10 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
 #define _CMP_UNORD_S  0x13 /* Unordered (signaling)  */
 #define _CMP_NEQ_US   0x14 /* Not-equal (unordered, signaling)  */
 #define _CMP_NLT_UQ   0x15 /* Not-less-than (unordered, non-signaling)  */
-#define _CMP_NLE_UQ   0x16 /* Not-less-than-or-equal (unord, non-signaling)  */
+#define _CMP_NLE_UQ   0x16 /* Not-less-than-or-equal (unordered, non-signaling)  */
 #define _CMP_ORD_S    0x17 /* Ordered (signaling)  */
 #define _CMP_EQ_US    0x18 /* Equal (unordered, signaling)  */
-#define _CMP_NGE_UQ   0x19 /* Not-greater-than-or-equal (unord, non-sign)  */
+#define _CMP_NGE_UQ   0x19 /* Not-greater-than-or-equal (unordered, non-signaling)  */
 #define _CMP_NGT_UQ   0x1a /* Not-greater-than (unordered, non-signaling)  */
 #define _CMP_FALSE_OS 0x1b /* False (ordered, signaling)  */
 #define _CMP_NEQ_OS   0x1c /* Not-equal (ordered, signaling)  */
@@ -1641,9 +1644,11 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
 
 /// \brief Compares each of the corresponding double-precision values of two
 ///    128-bit vectors of [2 x double], using the operation specified by the
-///    immediate integer operand. Returns a [2 x double] vector consisting of
-///    two doubles corresponding to the two comparison results: zero if the
-///    comparison is false, and all 1's if the comparison is true.
+///    immediate integer operand.
+///
+///    Returns a [2 x double] vector consisting of two doubles corresponding to
+///    the two comparison results: zero if the comparison is false, and all 1's
+///    if the comparison is true.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -1660,17 +1665,38 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
 /// \param c
 ///    An immediate integer operand, with bits [4:0] specifying which comparison
 ///    operation to use: \n
-///    00h, 08h, 10h, 18h: Equal \n
-///    01h, 09h, 11h, 19h: Less than \n
-///    02h, 0Ah, 12h, 1Ah: Less than or equal / Greater than or equal
-///                        (swapped operands) \n
-///    03h, 0Bh, 13h, 1Bh: Unordered \n
-///    04h, 0Ch, 14h, 1Ch: Not equal \n
-///    05h, 0Dh, 15h, 1Dh: Not less than / Not greater than
-///                        (swapped operands) \n
-///    06h, 0Eh, 16h, 1Eh: Not less than or equal / Not greater than or equal
-///                        (swapped operands) \n
-///    07h, 0Fh, 17h, 1Fh: Ordered
+///    0x00 : Equal (ordered, non-signaling)
+///    0x01 : Less-than (ordered, signaling)
+///    0x02 : Less-than-or-equal (ordered, signaling)
+///    0x03 : Unordered (non-signaling)
+///    0x04 : Not-equal (unordered, non-signaling)
+///    0x05 : Not-less-than (unordered, signaling)
+///    0x06 : Not-less-than-or-equal (unordered, signaling)
+///    0x07 : Ordered (non-signaling)
+///    0x08 : Equal (unordered, non-signaling)
+///    0x09 : Not-greater-than-or-equal (unordered, signaling)
+///    0x0a : Not-greater-than (unordered, signaling)
+///    0x0b : False (ordered, non-signaling)
+///    0x0c : Not-equal (ordered, non-signaling)
+///    0x0d : Greater-than-or-equal (ordered, signaling)
+///    0x0e : Greater-than (ordered, signaling)
+///    0x0f : True (unordered, non-signaling)
+///    0x10 : Equal (ordered, signaling)
+///    0x11 : Less-than (ordered, non-signaling)
+///    0x12 : Less-than-or-equal (ordered, non-signaling)
+///    0x13 : Unordered (signaling)
+///    0x14 : Not-equal (unordered, signaling)
+///    0x15 : Not-less-than (unordered, non-signaling)
+///    0x16 : Not-less-than-or-equal (unordered, non-signaling)
+///    0x17 : Ordered (signaling)
+///    0x18 : Equal (unordered, signaling)
+///    0x19 : Not-greater-than-or-equal (unordered, non-signaling)
+///    0x1a : Not-greater-than (unordered, non-signaling)
+///    0x1b : False (ordered, signaling)
+///    0x1c : Not-equal (ordered, signaling)
+///    0x1d : Greater-than-or-equal (ordered, non-signaling)
+///    0x1e : Greater-than (ordered, non-signaling)
+///    0x1f : True (unordered, signaling)
 /// \returns A 128-bit vector of [2 x double] containing the comparison results.
 #define _mm_cmp_pd(a, b, c) __extension__ ({ \
   (__m128d)__builtin_ia32_cmppd((__v2df)(__m128d)(a), \
@@ -1678,9 +1704,11 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
 
 /// \brief Compares each of the corresponding values of two 128-bit vectors of
 ///    [4 x float], using the operation specified by the immediate integer
-///    operand. Returns a [4 x float] vector consisting of four floats
-///    corresponding to the four comparison results: zero if the comparison is
-///    false, and all 1's if the comparison is true.
+///    operand.
+///
+///    Returns a [4 x float] vector consisting of four floats corresponding to
+///    the four comparison results: zero if the comparison is false, and all 1's
+///    if the comparison is true.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -1697,17 +1725,38 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
 /// \param c
 ///    An immediate integer operand, with bits [4:0] specifying which comparison
 ///    operation to use: \n
-///    00h, 08h, 10h, 18h: Equal \n
-///    01h, 09h, 11h, 19h: Less than \n
-///    02h, 0Ah, 12h, 1Ah: Less than or equal / Greater than or equal
-///                        (swapped operands) \n
-///    03h, 0Bh, 13h, 1Bh: Unordered \n
-///    04h, 0Ch, 14h, 1Ch: Not equal \n
-///    05h, 0Dh, 15h, 1Dh: Not less than / Not greater than
-///                        (swapped operands) \n
-///    06h, 0Eh, 16h, 1Eh: Not less than or equal / Not greater than or equal
-///                        (swapped operands) \n
-///    07h, 0Fh, 17h, 1Fh: Ordered
+///    0x00 : Equal (ordered, non-signaling)
+///    0x01 : Less-than (ordered, signaling)
+///    0x02 : Less-than-or-equal (ordered, signaling)
+///    0x03 : Unordered (non-signaling)
+///    0x04 : Not-equal (unordered, non-signaling)
+///    0x05 : Not-less-than (unordered, signaling)
+///    0x06 : Not-less-than-or-equal (unordered, signaling)
+///    0x07 : Ordered (non-signaling)
+///    0x08 : Equal (unordered, non-signaling)
+///    0x09 : Not-greater-than-or-equal (unordered, signaling)
+///    0x0a : Not-greater-than (unordered, signaling)
+///    0x0b : False (ordered, non-signaling)
+///    0x0c : Not-equal (ordered, non-signaling)
+///    0x0d : Greater-than-or-equal (ordered, signaling)
+///    0x0e : Greater-than (ordered, signaling)
+///    0x0f : True (unordered, non-signaling)
+///    0x10 : Equal (ordered, signaling)
+///    0x11 : Less-than (ordered, non-signaling)
+///    0x12 : Less-than-or-equal (ordered, non-signaling)
+///    0x13 : Unordered (signaling)
+///    0x14 : Not-equal (unordered, signaling)
+///    0x15 : Not-less-than (unordered, non-signaling)
+///    0x16 : Not-less-than-or-equal (unordered, non-signaling)
+///    0x17 : Ordered (signaling)
+///    0x18 : Equal (unordered, signaling)
+///    0x19 : Not-greater-than-or-equal (unordered, non-signaling)
+///    0x1a : Not-greater-than (unordered, non-signaling)
+///    0x1b : False (ordered, signaling)
+///    0x1c : Not-equal (ordered, signaling)
+///    0x1d : Greater-than-or-equal (ordered, non-signaling)
+///    0x1e : Greater-than (ordered, non-signaling)
+///    0x1f : True (unordered, signaling)
 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
 #define _mm_cmp_ps(a, b, c) __extension__ ({ \
   (__m128)__builtin_ia32_cmpps((__v4sf)(__m128)(a), \
@@ -1715,9 +1764,11 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
 
 /// \brief Compares each of the corresponding double-precision values of two
 ///    256-bit vectors of [4 x double], using the operation specified by the
-///    immediate integer operand. Returns a [4 x double] vector consisting of
-///    four doubles corresponding to the four comparison results: zero if the
-///    comparison is false, and all 1's if the comparison is true.
+///    immediate integer operand.
+///
+///    Returns a [4 x double] vector consisting of four doubles corresponding to
+///    the four comparison results: zero if the comparison is false, and all 1's
+///    if the comparison is true.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -1734,17 +1785,38 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
 /// \param c
 ///    An immediate integer operand, with bits [4:0] specifying which comparison
 ///    operation to use: \n
-///    00h, 08h, 10h, 18h: Equal \n
-///    01h, 09h, 11h, 19h: Less than \n
-///    02h, 0Ah, 12h, 1Ah: Less than or equal / Greater than or equal
-///                        (swapped operands) \n
-///    03h, 0Bh, 13h, 1Bh: Unordered \n
-///    04h, 0Ch, 14h, 1Ch: Not equal \n
-///    05h, 0Dh, 15h, 1Dh: Not less than / Not greater than
-///                        (swapped operands) \n
-///    06h, 0Eh, 16h, 1Eh: Not less than or equal / Not greater than or equal
-///                        (swapped operands) \n
-///    07h, 0Fh, 17h, 1Fh: Ordered
+///    0x00 : Equal (ordered, non-signaling)
+///    0x01 : Less-than (ordered, signaling)
+///    0x02 : Less-than-or-equal (ordered, signaling)
+///    0x03 : Unordered (non-signaling)
+///    0x04 : Not-equal (unordered, non-signaling)
+///    0x05 : Not-less-than (unordered, signaling)
+///    0x06 : Not-less-than-or-equal (unordered, signaling)
+///    0x07 : Ordered (non-signaling)
+///    0x08 : Equal (unordered, non-signaling)
+///    0x09 : Not-greater-than-or-equal (unordered, signaling)
+///    0x0a : Not-greater-than (unordered, signaling)
+///    0x0b : False (ordered, non-signaling)
+///    0x0c : Not-equal (ordered, non-signaling)
+///    0x0d : Greater-than-or-equal (ordered, signaling)
+///    0x0e : Greater-than (ordered, signaling)
+///    0x0f : True (unordered, non-signaling)
+///    0x10 : Equal (ordered, signaling)
+///    0x11 : Less-than (ordered, non-signaling)
+///    0x12 : Less-than-or-equal (ordered, non-signaling)
+///    0x13 : Unordered (signaling)
+///    0x14 : Not-equal (unordered, signaling)
+///    0x15 : Not-less-than (unordered, non-signaling)
+///    0x16 : Not-less-than-or-equal (unordered, non-signaling)
+///    0x17 : Ordered (signaling)
+///    0x18 : Equal (unordered, signaling)
+///    0x19 : Not-greater-than-or-equal (unordered, non-signaling)
+///    0x1a : Not-greater-than (unordered, non-signaling)
+///    0x1b : False (ordered, signaling)
+///    0x1c : Not-equal (ordered, signaling)
+///    0x1d : Greater-than-or-equal (ordered, non-signaling)
+///    0x1e : Greater-than (ordered, non-signaling)
+///    0x1f : True (unordered, signaling)
 /// \returns A 256-bit vector of [4 x double] containing the comparison results.
 #define _mm256_cmp_pd(a, b, c) __extension__ ({ \
   (__m256d)__builtin_ia32_cmppd256((__v4df)(__m256d)(a), \
@@ -1752,9 +1824,11 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
 
 /// \brief Compares each of the corresponding values of two 256-bit vectors of
 ///    [8 x float], using the operation specified by the immediate integer
-///    operand. Returns a [8 x float] vector consisting of eight floats
-///    corresponding to the eight comparison results: zero if the comparison is
-///    false, and all 1's if the comparison is true.
+///    operand.
+///
+///    Returns a [8 x float] vector consisting of eight floats corresponding to
+///    the eight comparison results: zero if the comparison is false, and all
+///    1's if the comparison is true.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -1771,17 +1845,38 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
 /// \param c
 ///    An immediate integer operand, with bits [4:0] specifying which comparison
 ///    operation to use: \n
-///    00h, 08h, 10h, 18h: Equal \n
-///    01h, 09h, 11h, 19h: Less than \n
-///    02h, 0Ah, 12h, 1Ah: Less than or equal / Greater than or equal
-///                        (swapped operands) \n
-///    03h, 0Bh, 13h, 1Bh: Unordered \n
-///    04h, 0Ch, 14h, 1Ch: Not equal \n
-///    05h, 0Dh, 15h, 1Dh: Not less than / Not greater than
-///                        (swapped operands) \n
-///    06h, 0Eh, 16h, 1Eh: Not less than or equal / Not greater than or equal
-///                        (swapped operands) \n
-///    07h, 0Fh, 17h, 1Fh: Ordered
+///    0x00 : Equal (ordered, non-signaling)
+///    0x01 : Less-than (ordered, signaling)
+///    0x02 : Less-than-or-equal (ordered, signaling)
+///    0x03 : Unordered (non-signaling)
+///    0x04 : Not-equal (unordered, non-signaling)
+///    0x05 : Not-less-than (unordered, signaling)
+///    0x06 : Not-less-than-or-equal (unordered, signaling)
+///    0x07 : Ordered (non-signaling)
+///    0x08 : Equal (unordered, non-signaling)
+///    0x09 : Not-greater-than-or-equal (unordered, signaling)
+///    0x0a : Not-greater-than (unordered, signaling)
+///    0x0b : False (ordered, non-signaling)
+///    0x0c : Not-equal (ordered, non-signaling)
+///    0x0d : Greater-than-or-equal (ordered, signaling)
+///    0x0e : Greater-than (ordered, signaling)
+///    0x0f : True (unordered, non-signaling)
+///    0x10 : Equal (ordered, signaling)
+///    0x11 : Less-than (ordered, non-signaling)
+///    0x12 : Less-than-or-equal (ordered, non-signaling)
+///    0x13 : Unordered (signaling)
+///    0x14 : Not-equal (unordered, signaling)
+///    0x15 : Not-less-than (unordered, non-signaling)
+///    0x16 : Not-less-than-or-equal (unordered, non-signaling)
+///    0x17 : Ordered (signaling)
+///    0x18 : Equal (unordered, signaling)
+///    0x19 : Not-greater-than-or-equal (unordered, non-signaling)
+///    0x1a : Not-greater-than (unordered, non-signaling)
+///    0x1b : False (ordered, signaling)
+///    0x1c : Not-equal (ordered, signaling)
+///    0x1d : Greater-than-or-equal (ordered, non-signaling)
+///    0x1e : Greater-than (ordered, non-signaling)
+///    0x1f : True (unordered, signaling)
 /// \returns A 256-bit vector of [8 x float] containing the comparison results.
 #define _mm256_cmp_ps(a, b, c) __extension__ ({ \
   (__m256)__builtin_ia32_cmpps256((__v8sf)(__m256)(a), \
@@ -1789,8 +1884,10 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
 
 /// \brief Compares each of the corresponding scalar double-precision values of
 ///    two 128-bit vectors of [2 x double], using the operation specified by the
-///    immediate integer operand. If the result is true, all 64 bits of the
-///    destination vector are set; otherwise they are cleared.
+///    immediate integer operand.
+///
+///    If the result is true, all 64 bits of the destination vector are set;
+///    otherwise they are cleared.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -1807,17 +1904,38 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
 /// \param c
 ///    An immediate integer operand, with bits [4:0] specifying which comparison
 ///    operation to use: \n
-///    00h, 08h, 10h, 18h: Equal \n
-///    01h, 09h, 11h, 19h: Less than \n
-///    02h, 0Ah, 12h, 1Ah: Less than or equal / Greater than or equal
-///                        (swapped operands) \n
-///    03h, 0Bh, 13h, 1Bh: Unordered \n
-///    04h, 0Ch, 14h, 1Ch: Not equal \n
-///    05h, 0Dh, 15h, 1Dh: Not less than / Not greater than
-///                        (swapped operands) \n
-///    06h, 0Eh, 16h, 1Eh: Not less than or equal / Not greater than or equal
-///                        (swapped operands) \n
-///    07h, 0Fh, 17h, 1Fh: Ordered
+///    0x00 : Equal (ordered, non-signaling)
+///    0x01 : Less-than (ordered, signaling)
+///    0x02 : Less-than-or-equal (ordered, signaling)
+///    0x03 : Unordered (non-signaling)
+///    0x04 : Not-equal (unordered, non-signaling)
+///    0x05 : Not-less-than (unordered, signaling)
+///    0x06 : Not-less-than-or-equal (unordered, signaling)
+///    0x07 : Ordered (non-signaling)
+///    0x08 : Equal (unordered, non-signaling)
+///    0x09 : Not-greater-than-or-equal (unordered, signaling)
+///    0x0a : Not-greater-than (unordered, signaling)
+///    0x0b : False (ordered, non-signaling)
+///    0x0c : Not-equal (ordered, non-signaling)
+///    0x0d : Greater-than-or-equal (ordered, signaling)
+///    0x0e : Greater-than (ordered, signaling)
+///    0x0f : True (unordered, non-signaling)
+///    0x10 : Equal (ordered, signaling)
+///    0x11 : Less-than (ordered, non-signaling)
+///    0x12 : Less-than-or-equal (ordered, non-signaling)
+///    0x13 : Unordered (signaling)
+///    0x14 : Not-equal (unordered, signaling)
+///    0x15 : Not-less-than (unordered, non-signaling)
+///    0x16 : Not-less-than-or-equal (unordered, non-signaling)
+///    0x17 : Ordered (signaling)
+///    0x18 : Equal (unordered, signaling)
+///    0x19 : Not-greater-than-or-equal (unordered, non-signaling)
+///    0x1a : Not-greater-than (unordered, non-signaling)
+///    0x1b : False (ordered, signaling)
+///    0x1c : Not-equal (ordered, signaling)
+///    0x1d : Greater-than-or-equal (ordered, non-signaling)
+///    0x1e : Greater-than (ordered, non-signaling)
+///    0x1f : True (unordered, signaling)
 /// \returns A 128-bit vector of [2 x double] containing the comparison results.
 #define _mm_cmp_sd(a, b, c) __extension__ ({ \
   (__m128d)__builtin_ia32_cmpsd((__v2df)(__m128d)(a), \
@@ -1825,8 +1943,10 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
 
 /// \brief Compares each of the corresponding scalar values of two 128-bit
 ///    vectors of [4 x float], using the operation specified by the immediate
-///    integer operand. If the result is true, all 32 bits of the destination
-///    vector are set; otherwise they are cleared.
+///    integer operand.
+///
+///    If the result is true, all 32 bits of the destination vector are set;
+///    otherwise they are cleared.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -1843,17 +1963,38 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
 /// \param c
 ///    An immediate integer operand, with bits [4:0] specifying which comparison
 ///    operation to use: \n
-///    00h, 08h, 10h, 18h: Equal \n
-///    01h, 09h, 11h, 19h: Less than \n
-///    02h, 0Ah, 12h, 1Ah: Less than or equal / Greater than or equal
-///                        (swapped operands) \n
-///    03h, 0Bh, 13h, 1Bh: Unordered \n
-///    04h, 0Ch, 14h, 1Ch: Not equal \n
-///    05h, 0Dh, 15h, 1Dh: Not less than / Not greater than
-///                        (swapped operands) \n
-///    06h, 0Eh, 16h, 1Eh: Not less than or equal / Not greater than or equal
-///                        (swapped operands) \n
-///    07h, 0Fh, 17h, 1Fh: Ordered
+///    0x00 : Equal (ordered, non-signaling)
+///    0x01 : Less-than (ordered, signaling)
+///    0x02 : Less-than-or-equal (ordered, signaling)
+///    0x03 : Unordered (non-signaling)
+///    0x04 : Not-equal (unordered, non-signaling)
+///    0x05 : Not-less-than (unordered, signaling)
+///    0x06 : Not-less-than-or-equal (unordered, signaling)
+///    0x07 : Ordered (non-signaling)
+///    0x08 : Equal (unordered, non-signaling)
+///    0x09 : Not-greater-than-or-equal (unordered, signaling)
+///    0x0a : Not-greater-than (unordered, signaling)
+///    0x0b : False (ordered, non-signaling)
+///    0x0c : Not-equal (ordered, non-signaling)
+///    0x0d : Greater-than-or-equal (ordered, signaling)
+///    0x0e : Greater-than (ordered, signaling)
+///    0x0f : True (unordered, non-signaling)
+///    0x10 : Equal (ordered, signaling)
+///    0x11 : Less-than (ordered, non-signaling)
+///    0x12 : Less-than-or-equal (ordered, non-signaling)
+///    0x13 : Unordered (signaling)
+///    0x14 : Not-equal (unordered, signaling)
+///    0x15 : Not-less-than (unordered, non-signaling)
+///    0x16 : Not-less-than-or-equal (unordered, non-signaling)
+///    0x17 : Ordered (signaling)
+///    0x18 : Equal (unordered, signaling)
+///    0x19 : Not-greater-than-or-equal (unordered, non-signaling)
+///    0x1a : Not-greater-than (unordered, non-signaling)
+///    0x1b : False (ordered, signaling)
+///    0x1c : Not-equal (ordered, signaling)
+///    0x1d : Greater-than-or-equal (ordered, non-signaling)
+///    0x1e : Greater-than (ordered, non-signaling)
+///    0x1f : True (unordered, signaling)
 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
 #define _mm_cmp_ss(a, b, c) __extension__ ({ \
   (__m128)__builtin_ia32_cmpss((__v4sf)(__m128)(a), \
@@ -2184,12 +2325,32 @@ _mm256_cvttps_epi32(__m256 __a)
   return (__m256i)__builtin_ia32_cvttps2dq256((__v8sf) __a);
 }
 
+/// \brief Returns the first element of the input vector of [4 x double].
+///
+/// \headerfile <avxintrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+///    instruction.
+///
+/// \param __a
+///    A 256-bit vector of [4 x double].
+/// \returns A 64 bit double containing the first element of the input vector.
 static __inline double __DEFAULT_FN_ATTRS
 _mm256_cvtsd_f64(__m256d __a)
 {
  return __a[0];
 }
 
+/// \brief Returns the first element of the input vector of [8 x i32].
+///
+/// \headerfile <avxintrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+///    instruction.
+///
+/// \param __a
+///    A 256-bit vector of [8 x i32].
+/// \returns A 32 bit integer containing the first element of the input vector.
 static __inline int __DEFAULT_FN_ATTRS
 _mm256_cvtsi256_si32(__m256i __a)
 {
@@ -2197,6 +2358,16 @@ _mm256_cvtsi256_si32(__m256i __a)
  return __b[0];
 }
 
+/// \brief Returns the first element of the input vector of [8 x float].
+///
+/// \headerfile <avxintrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+///    instruction.
+///
+/// \param __a
+///    A 256-bit vector of [8 x float].
+/// \returns A 32 bit float containing the first element of the input vector.
 static __inline float __DEFAULT_FN_ATTRS
 _mm256_cvtss_f32(__m256 __a)
 {
@@ -2380,7 +2551,9 @@ _mm256_unpacklo_ps(__m256 __a, __m256 __b)
 /// \brief Given two 128-bit floating-point vectors of [2 x double], perform an
 ///    element-by-element comparison of the double-precision element in the
 ///    first source vector and the corresponding element in the second source
-///    vector. The EFLAGS register is updated as follows: \n
+///    vector.
+///
+///    The EFLAGS register is updated as follows: \n
 ///    If there is at least one pair of double-precision elements where the
 ///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
 ///    ZF flag is set to 1. \n
@@ -2407,7 +2580,9 @@ _mm_testz_pd(__m128d __a, __m128d __b)
 /// \brief Given two 128-bit floating-point vectors of [2 x double], perform an
 ///    element-by-element comparison of the double-precision element in the
 ///    first source vector and the corresponding element in the second source
-///    vector. The EFLAGS register is updated as follows: \n
+///    vector.
+///
+///    The EFLAGS register is updated as follows: \n
 ///    If there is at least one pair of double-precision elements where the
 ///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
 ///    ZF flag is set to 1. \n
@@ -2434,7 +2609,9 @@ _mm_testc_pd(__m128d __a, __m128d __b)
 /// \brief Given two 128-bit floating-point vectors of [2 x double], perform an
 ///    element-by-element comparison of the double-precision element in the
 ///    first source vector and the corresponding element in the second source
-///    vector. The EFLAGS register is updated as follows: \n
+///    vector.
+///
+///    The EFLAGS register is updated as follows: \n
 ///    If there is at least one pair of double-precision elements where the
 ///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
 ///    ZF flag is set to 1. \n
@@ -2462,7 +2639,9 @@ _mm_testnzc_pd(__m128d __a, __m128d __b)
 /// \brief Given two 128-bit floating-point vectors of [4 x float], perform an
 ///    element-by-element comparison of the single-precision element in the
 ///    first source vector and the corresponding element in the second source
-///    vector. The EFLAGS register is updated as follows: \n
+///    vector.
+///
+///    The EFLAGS register is updated as follows: \n
 ///    If there is at least one pair of single-precision elements where the
 ///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
 ///    ZF flag is set to 1. \n
@@ -2489,7 +2668,9 @@ _mm_testz_ps(__m128 __a, __m128 __b)
 /// \brief Given two 128-bit floating-point vectors of [4 x float], perform an
 ///    element-by-element comparison of the single-precision element in the
 ///    first source vector and the corresponding element in the second source
-///    vector. The EFLAGS register is updated as follows: \n
+///    vector.
+///
+///    The EFLAGS register is updated as follows: \n
 ///    If there is at least one pair of single-precision elements where the
 ///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
 ///    ZF flag is set to 1. \n
@@ -2516,7 +2697,9 @@ _mm_testc_ps(__m128 __a, __m128 __b)
 /// \brief Given two 128-bit floating-point vectors of [4 x float], perform an
 ///    element-by-element comparison of the single-precision element in the
 ///    first source vector and the corresponding element in the second source
-///    vector. The EFLAGS register is updated as follows: \n
+///    vector.
+///
+///    The EFLAGS register is updated as follows: \n
 ///    If there is at least one pair of single-precision elements where the
 ///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
 ///    ZF flag is set to 1. \n
@@ -2544,7 +2727,9 @@ _mm_testnzc_ps(__m128 __a, __m128 __b)
 /// \brief Given two 256-bit floating-point vectors of [4 x double], perform an
 ///    element-by-element comparison of the double-precision elements in the
 ///    first source vector and the corresponding elements in the second source
-///    vector. The EFLAGS register is updated as follows: \n
+///    vector.
+///
+///    The EFLAGS register is updated as follows: \n
 ///    If there is at least one pair of double-precision elements where the
 ///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
 ///    ZF flag is set to 1. \n
@@ -2571,7 +2756,9 @@ _mm256_testz_pd(__m256d __a, __m256d __b)
 /// \brief Given two 256-bit floating-point vectors of [4 x double], perform an
 ///    element-by-element comparison of the double-precision elements in the
 ///    first source vector and the corresponding elements in the second source
-///    vector. The EFLAGS register is updated as follows: \n
+///    vector.
+///
+///    The EFLAGS register is updated as follows: \n
 ///    If there is at least one pair of double-precision elements where the
 ///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
 ///    ZF flag is set to 1. \n
@@ -2598,7 +2785,9 @@ _mm256_testc_pd(__m256d __a, __m256d __b)
 /// \brief Given two 256-bit floating-point vectors of [4 x double], perform an
 ///    element-by-element comparison of the double-precision elements in the
 ///    first source vector and the corresponding elements in the second source
-///    vector. The EFLAGS register is updated as follows: \n
+///    vector.
+///
+///    The EFLAGS register is updated as follows: \n
 ///    If there is at least one pair of double-precision elements where the
 ///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
 ///    ZF flag is set to 1. \n
@@ -2626,7 +2815,9 @@ _mm256_testnzc_pd(__m256d __a, __m256d __b)
 /// \brief Given two 256-bit floating-point vectors of [8 x float], perform an
 ///    element-by-element comparison of the single-precision element in the
 ///    first source vector and the corresponding element in the second source
-///    vector. The EFLAGS register is updated as follows: \n
+///    vector.
+///
+///    The EFLAGS register is updated as follows: \n
 ///    If there is at least one pair of single-precision elements where the
 ///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
 ///    ZF flag is set to 1. \n
@@ -2653,7 +2844,9 @@ _mm256_testz_ps(__m256 __a, __m256 __b)
 /// \brief Given two 256-bit floating-point vectors of [8 x float], perform an
 ///    element-by-element comparison of the single-precision element in the
 ///    first source vector and the corresponding element in the second source
-///    vector. The EFLAGS register is updated as follows: \n
+///    vector.
+///
+///    The EFLAGS register is updated as follows: \n
 ///    If there is at least one pair of single-precision elements where the
 ///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
 ///    ZF flag is set to 1. \n
@@ -2680,7 +2873,9 @@ _mm256_testc_ps(__m256 __a, __m256 __b)
 /// \brief Given two 256-bit floating-point vectors of [8 x float], perform an
 ///    element-by-element comparison of the single-precision elements in the
 ///    first source vector and the corresponding elements in the second source
-///    vector. The EFLAGS register is updated as follows: \n
+///    vector.
+///
+///    The EFLAGS register is updated as follows: \n
 ///    If there is at least one pair of single-precision elements where the
 ///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
 ///    ZF flag is set to 1. \n
@@ -2706,7 +2901,9 @@ _mm256_testnzc_ps(__m256 __a, __m256 __b)
 }
 
 /// \brief Given two 256-bit integer vectors, perform a bit-by-bit comparison
-///    of the two source vectors and update the EFLAGS register as follows: \n
+///    of the two source vectors.
+///
+///    The EFLAGS register is updated as follows: \n
 ///    If there is at least one pair of bits where both bits are 1, the ZF flag
 ///    is set to 0. Otherwise the ZF flag is set to 1. \n
 ///    If there is at least one pair of bits where the bit from the first source
@@ -2730,7 +2927,9 @@ _mm256_testz_si256(__m256i __a, __m256i __b)
 }
 
 /// \brief Given two 256-bit integer vectors, perform a bit-by-bit comparison
-///    of the two source vectors and update the EFLAGS register as follows: \n
+///    of the two source vectors.
+///
+///    The EFLAGS register is updated as follows: \n
 ///    If there is at least one pair of bits where both bits are 1, the ZF flag
 ///    is set to 0. Otherwise the ZF flag is set to 1. \n
 ///    If there is at least one pair of bits where the bit from the first source
@@ -2754,7 +2953,9 @@ _mm256_testc_si256(__m256i __a, __m256i __b)
 }
 
 /// \brief Given two 256-bit integer vectors, perform a bit-by-bit comparison
-///    of the two source vectors and update the EFLAGS register as follows: \n
+///    of the two source vectors.
+///
+///    The EFLAGS register is updated as follows: \n
 ///    If there is at least one pair of bits where both bits are 1, the ZF flag
 ///    is set to 0. Otherwise the ZF flag is set to 1. \n
 ///    If there is at least one pair of bits where the bit from the first source
@@ -3389,7 +3590,8 @@ _mm_maskstore_ps(float *__p, __m128i __m, __m128 __a)
 static __inline void __DEFAULT_FN_ATTRS
 _mm256_stream_si256(__m256i *__a, __m256i __b)
 {
-  __builtin_nontemporal_store((__v4di)__b, (__v4di*)__a);
+  typedef __v4di __v4di_aligned __attribute__((aligned(32)));
+  __builtin_nontemporal_store((__v4di_aligned)__b, (__v4di_aligned*)__a);
 }
 
 /// \brief Moves double-precision values from a 256-bit vector of [4 x double]
@@ -3402,13 +3604,14 @@ _mm256_stream_si256(__m256i *__a, __m256i __b)
 ///
 /// \param __a
 ///    A pointer to a 32-byte aligned memory location that will receive the
-///    integer values.
+///    double-precision floating-point values.
 /// \param __b
 ///    A 256-bit vector of [4 x double] containing the values to be moved.
 static __inline void __DEFAULT_FN_ATTRS
 _mm256_stream_pd(double *__a, __m256d __b)
 {
-  __builtin_nontemporal_store((__v4df)__b, (__v4df*)__a);
+  typedef __v4df __v4df_aligned __attribute__((aligned(32)));
+  __builtin_nontemporal_store((__v4df_aligned)__b, (__v4df_aligned*)__a);
 }
 
 /// \brief Moves single-precision floating point values from a 256-bit vector
@@ -3428,7 +3631,8 @@ _mm256_stream_pd(double *__a, __m256d __b)
 static __inline void __DEFAULT_FN_ATTRS
 _mm256_stream_ps(float *__p, __m256 __a)
 {
-  __builtin_nontemporal_store((__v8sf)__a, (__v8sf*)__p);
+  typedef __v8sf __v8sf_aligned __attribute__((aligned(32)));
+  __builtin_nontemporal_store((__v8sf_aligned)__a, (__v8sf_aligned*)__p);
 }
 
 /* Create vectors */
@@ -4310,9 +4514,10 @@ _mm256_castsi256_si128(__m256i __a)
 }
 
 /// \brief Constructs a 256-bit floating-point vector of [4 x double] from a
-///    128-bit floating-point vector of [2 x double]. The lower 128 bits
-///    contain the value of the source vector. The contents of the upper 128
-///    bits are undefined.
+///    128-bit floating-point vector of [2 x double].
+///
+///    The lower 128 bits contain the value of the source vector. The contents
+///    of the upper 128 bits are undefined.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -4330,9 +4535,10 @@ _mm256_castpd128_pd256(__m128d __a)
 }
 
 /// \brief Constructs a 256-bit floating-point vector of [8 x float] from a
-///    128-bit floating-point vector of [4 x float]. The lower 128 bits contain
-///    the value of the source vector. The contents of the upper 128 bits are
-///    undefined.
+///    128-bit floating-point vector of [4 x float].
+///
+///    The lower 128 bits contain the value of the source vector. The contents
+///    of the upper 128 bits are undefined.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -4350,6 +4556,7 @@ _mm256_castps128_ps256(__m128 __a)
 }
 
 /// \brief Constructs a 256-bit integer vector from a 128-bit integer vector.
+///
 ///    The lower 128 bits contain the value of the source vector. The contents
 ///    of the upper 128 bits are undefined.
 ///
@@ -4367,6 +4574,61 @@ _mm256_castsi128_si256(__m128i __a)
   return __builtin_shufflevector((__v2di)__a, (__v2di)__a, 0, 1, -1, -1);
 }
 
+/// \brief Constructs a 256-bit floating-point vector of [4 x double] from a
+///    128-bit floating-point vector of [2 x double]. The lower 128 bits
+///    contain the value of the source vector. The upper 128 bits are set
+///    to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double].
+/// \returns A 256-bit floating-point vector of [4 x double]. The lower 128 bits
+///    contain the value of the parameter. The upper 128 bits are set to zero.
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_zextpd128_pd256(__m128d __a)
+{
+  return __builtin_shufflevector((__v2df)__a, (__v2df)_mm_setzero_pd(), 0, 1, 2, 3);
+}
+
+/// \brief Constructs a 256-bit floating-point vector of [8 x float] from a
+///    128-bit floating-point vector of [4 x float]. The lower 128 bits contain
+///    the value of the source vector. The upper 128 bits are set to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x float].
+/// \returns A 256-bit floating-point vector of [8 x float]. The lower 128 bits
+///    contain the value of the parameter. The upper 128 bits are set to zero.
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_zextps128_ps256(__m128 __a)
+{
+  return __builtin_shufflevector((__v4sf)__a, (__v4sf)_mm_setzero_ps(), 0, 1, 2, 3, 4, 5, 6, 7);
+}
+
+/// \brief Constructs a 256-bit integer vector from a 128-bit integer vector.
+///    The lower 128 bits contain the value of the source vector. The upper
+///    128 bits are set to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+///    A 128-bit integer vector.
+/// \returns A 256-bit integer vector. The lower 128 bits contain the value of
+///    the parameter. The upper 128 bits are set to zero.
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_zextsi128_si256(__m128i __a)
+{
+  return __builtin_shufflevector((__v2di)__a, (__v2di)_mm_setzero_si128(), 0, 1, 2, 3);
+}
+
 /*
    Vector insert.
    We use macros rather than inlines because we only want to accept
@@ -4375,8 +4637,10 @@ _mm256_castsi128_si256(__m128i __a)
 /// \brief Constructs a new 256-bit vector of [8 x float] by first duplicating
 ///    a 256-bit vector of [8 x float] given in the first parameter, and then
 ///    replacing either the upper or the lower 128 bits with the contents of a
-///    128-bit vector of [4 x float] in the second parameter. The immediate
-///    integer parameter determines between the upper or the lower 128 bits.
+///    128-bit vector of [4 x float] in the second parameter.
+///
+///    The immediate integer parameter determines between the upper or the lower
+///    128 bits.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -4420,8 +4684,10 @@ _mm256_castsi128_si256(__m128i __a)
 /// \brief Constructs a new 256-bit vector of [4 x double] by first duplicating
 ///    a 256-bit vector of [4 x double] given in the first parameter, and then
 ///    replacing either the upper or the lower 128 bits with the contents of a
-///    128-bit vector of [2 x double] in the second parameter. The immediate
-///    integer parameter determines between the upper or the lower 128 bits.
+///    128-bit vector of [2 x double] in the second parameter.
+///
+///    The immediate integer parameter determines between the upper or the lower
+///    128 bits.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -4461,8 +4727,10 @@ _mm256_castsi128_si256(__m128i __a)
 /// \brief Constructs a new 256-bit integer vector by first duplicating a
 ///    256-bit integer vector given in the first parameter, and then replacing
 ///    either the upper or the lower 128 bits with the contents of a 128-bit
-///    integer vector in the second parameter. The immediate integer parameter
-///    determines between the upper or the lower 128 bits.
+///    integer vector in the second parameter.
+///
+///    The immediate integer parameter determines between the upper or the lower
+///    128 bits.
 ///
 /// \headerfile <x86intrin.h>
 ///

@@ -28,107 +28,17 @@
 #ifndef __BMIINTRIN_H
 #define __BMIINTRIN_H
 
-/// \brief Counts the number of trailing zero bits in the operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// unsigned short _tzcnt_u16(unsigned short a);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> TZCNT </c> instruction.
-///
-/// \param a
-///    An unsigned 16-bit integer whose trailing zeros are to be counted.
-/// \returns An unsigned 16-bit integer containing the number of trailing zero
-///    bits in the operand.
 #define _tzcnt_u16(a)     (__tzcnt_u16((a)))
 
-/// \brief Performs a bitwise AND of the second operand with the one's
-///    complement of the first operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// unsigned int _andn_u32(unsigned int a, unsigned int b);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> ANDN </c> instruction.
-///
-/// \param a
-///    An unsigned integer containing one of the operands.
-/// \param b
-///    An unsigned integer containing one of the operands.
-/// \returns An unsigned integer containing the bitwise AND of the second
-///    operand with the one's complement of the first operand.
 #define _andn_u32(a, b)   (__andn_u32((a), (b)))
 
 /* _bextr_u32 != __bextr_u32 */
-/// \brief Clears all bits in the source except for the least significant bit
-///    containing a value of 1 and returns the result.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// unsigned int _blsi_u32(unsigned int a);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> BLSI </c> instruction.
-///
-/// \param a
-///    An unsigned integer whose bits are to be cleared.
-/// \returns An unsigned integer containing the result of clearing the bits from
-///    the source operand.
 #define _blsi_u32(a)      (__blsi_u32((a)))
 
-/// \brief Creates a mask whose bits are set to 1, using bit 0 up to and
-///    including the least siginificant bit that is set to 1 in the source
-///    operand and returns the result.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// unsigned int _blsmsk_u32(unsigned int a);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> BLSMSK </c> instruction.
-///
-/// \param a
-///    An unsigned integer used to create the mask.
-/// \returns An unsigned integer containing the newly created mask.
 #define _blsmsk_u32(a)    (__blsmsk_u32((a)))
 
-/// \brief Clears the least siginificant bit that is set to 1 in the source
-///    operand and returns the result.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// unsigned int _blsr_u32(unsigned int a);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> BLSR </c> instruction.
-///
-/// \param a
-///    An unsigned integer containing the operand to be cleared.
-/// \returns An unsigned integer containing the result of clearing the source
-///    operand.
 #define _blsr_u32(a)      (__blsr_u32((a)))
 
-/// \brief Counts the number of trailing zero bits in the operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// unsigned int _tzcnt_u32(unsigned int a);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> TZCNT </c> instruction.
-///
-/// \param a
-///    An unsigned 32-bit integer whose trailing zeros are to be counted.
-/// \returns An unsigned 32-bit integer containing the number of trailing zero
-///    bits in the operand.
 #define _tzcnt_u32(a)     (__tzcnt_u32((a)))
 
 /* Define the default attributes for the functions in this file. */
@@ -238,7 +148,7 @@ __blsi_u32(unsigned int __X)
 }
 
 /// \brief Creates a mask whose bits are set to 1, using bit 0 up to and
-///    including the least siginificant bit that is set to 1 in the source
+///    including the least significant bit that is set to 1 in the source
 ///    operand and returns the result.
 ///
 /// \headerfile <x86intrin.h>
@@ -254,7 +164,7 @@ __blsmsk_u32(unsigned int __X)
   return __X ^ (__X - 1);
 }
 
-/// \brief Clears the least siginificant bit that is set to 1 in the source
+/// \brief Clears the least significant bit that is set to 1 in the source
 ///    operand and returns the result.
 ///
 /// \headerfile <x86intrin.h>
@@ -305,91 +215,15 @@ _mm_tzcnt_32(unsigned int __X)
 
 #ifdef __x86_64__
 
-/// \brief Performs a bitwise AND of the second operand with the one's
-///    complement of the first operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// unsigned long long _andn_u64 (unsigned long long a, unsigned long long b);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> ANDN </c> instruction.
-///
-/// \param a
-///    An unsigned 64-bit integer containing one of the operands.
-/// \param b
-///    An unsigned 64-bit integer containing one of the operands.
-/// \returns An unsigned 64-bit integer containing the bitwise AND of the second
-///    operand with the one's complement of the first operand.
 #define _andn_u64(a, b)   (__andn_u64((a), (b)))
 
 /* _bextr_u64 != __bextr_u64 */
-/// \brief Clears all bits in the source except for the least significant bit
-///    containing a value of 1 and returns the result.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// unsigned long long _blsi_u64(unsigned long long a);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> BLSI </c> instruction.
-///
-/// \param a
-///    An unsigned 64-bit integer whose bits are to be cleared.
-/// \returns An unsigned 64-bit integer containing the result of clearing the
-///    bits from the source operand.
 #define _blsi_u64(a)      (__blsi_u64((a)))
 
-/// \brief Creates a mask whose bits are set to 1, using bit 0 up to and
-///    including the least siginificant bit that is set to 1 in the source
-///    operand and returns the result.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// unsigned long long _blsmsk_u64(unsigned long long a);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> BLSMSK </c> instruction.
-///
-/// \param a
-///    An unsigned 64-bit integer used to create the mask.
-/// \returns A unsigned 64-bit integer containing the newly created mask.
 #define _blsmsk_u64(a)    (__blsmsk_u64((a)))
 
-/// \brief Clears the least siginificant bit that is set to 1 in the source
-///    operand and returns the result.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// unsigned long long _blsr_u64(unsigned long long a);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> BLSR </c> instruction.
-///
-/// \param a
-///    An unsigned 64-bit integer containing the operand to be cleared.
-/// \returns An unsigned 64-bit integer containing the result of clearing the
-///    source operand.
 #define _blsr_u64(a)      (__blsr_u64((a)))
 
-/// \brief Counts the number of trailing zero bits in the operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// unsigned long long _tzcnt_u64(unsigned long long a);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> TZCNT </c> instruction.
-///
-/// \param a
-///    An unsigned 64-bit integer whose trailing zeros are to be counted.
-/// \returns An unsigned 64-bit integer containing the number of trailing zero
-///    bits in the operand.
 #define _tzcnt_u64(a)     (__tzcnt_u64((a)))
 
 /// \brief Performs a bitwise AND of the second operand with the one's
@@ -475,7 +309,7 @@ __blsi_u64(unsigned long long __X)
 }
 
 /// \brief Creates a mask whose bits are set to 1, using bit 0 up to and
-///    including the least siginificant bit that is set to 1 in the source
+///    including the least significant bit that is set to 1 in the source
 ///    operand and returns the result.
 ///
 /// \headerfile <x86intrin.h>
@@ -484,14 +318,14 @@ __blsi_u64(unsigned long long __X)
 ///
 /// \param __X
 ///    An unsigned 64-bit integer used to create the mask.
-/// \returns A unsigned 64-bit integer containing the newly created mask.
+/// \returns An unsigned 64-bit integer containing the newly created mask.
 static __inline__ unsigned long long __DEFAULT_FN_ATTRS
 __blsmsk_u64(unsigned long long __X)
 {
   return __X ^ (__X - 1);
 }
 
-/// \brief Clears the least siginificant bit that is set to 1 in the source
+/// \brief Clears the least significant bit that is set to 1 in the source
 ///    operand and returns the result.
 ///
 /// \headerfile <x86intrin.h>

@@ -0,0 +1,50 @@
+/*===----------------------- clzerointrin.h - CLZERO ----------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __X86INTRIN_H
+#error "Never use <clzerointrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef _CLZEROINTRIN_H
+#define _CLZEROINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS \
+  __attribute__((__always_inline__, __nodebug__,  __target__("clzero")))
+
+/// \brief Loads the cache line address and zero's out the cacheline
+///
+/// \headerfile <clzerointrin.h>
+///
+/// This intrinsic corresponds to the <c> CLZERO </c> instruction.
+///
+/// \param __line
+///    A pointer to a cacheline which needs to be zeroed out.
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_clzero (void * __line)
+{
+  __builtin_ia32_clzero ((void *)__line);
+}
+
+#undef __DEFAULT_FN_ATTRS 
+
+#endif /* _CLZEROINTRIN_H */

@@ -79,7 +79,7 @@
 #define signature_VORTEX_edx 0x36387865
 #define signature_VORTEX_ecx 0x436f5320
 
-/* Features in %ecx for level 1 */
+/* Features in %ecx for leaf 1 */
 #define bit_SSE3        0x00000001
 #define bit_PCLMULQDQ   0x00000002
 #define bit_PCLMUL      bit_PCLMULQDQ   /* for gcc compat */
@@ -114,7 +114,7 @@
 #define bit_F16C        0x20000000
 #define bit_RDRND       0x40000000
 
-/* Features in %edx for level 1 */
+/* Features in %edx for leaf 1 */
 #define bit_FPU         0x00000001
 #define bit_VME         0x00000002
 #define bit_DE          0x00000004
@@ -147,44 +147,95 @@
 #define bit_TM          0x20000000
 #define bit_PBE         0x80000000
 
-/* Features in %ebx for level 7 sub-leaf 0 */
+/* Features in %ebx for leaf 7 sub-leaf 0 */
 #define bit_FSGSBASE    0x00000001
+#define bit_SGX         0x00000004
+#define bit_BMI         0x00000008
+#define bit_HLE         0x00000010
+#define bit_AVX2        0x00000020
 #define bit_SMEP        0x00000080
+#define bit_BMI2        0x00000100
 #define bit_ENH_MOVSB   0x00000200
+#define bit_RTM         0x00000800
+#define bit_MPX         0x00004000
+#define bit_AVX512F     0x00010000
+#define bit_AVX512DQ    0x00020000
+#define bit_RDSEED      0x00040000
+#define bit_ADX         0x00080000
+#define bit_AVX512IFMA  0x00200000
+#define bit_CLFLUSHOPT  0x00800000
+#define bit_CLWB        0x01000000
+#define bit_AVX512PF    0x04000000
+#define bit_AVX51SER    0x08000000
+#define bit_AVX512CD    0x10000000
+#define bit_SHA         0x20000000
+#define bit_AVX512BW    0x40000000
+#define bit_AVX512VL    0x80000000
+
+/* Features in %ecx for leaf 7 sub-leaf 0 */
+#define bit_PREFTCHWT1  0x00000001
+#define bit_AVX512VBMI  0x00000002
+#define bit_PKU         0x00000004
+#define bit_OSPKE       0x00000010
+#define bit_AVX512VPOPCNTDQ  0x00004000
+#define bit_RDPID       0x00400000
+
+/* Features in %edx for leaf 7 sub-leaf 0 */
+#define bit_AVX5124VNNIW  0x00000004
+#define bit_AVX5124FMAPS  0x00000008
+
+/* Features in %eax for leaf 13 sub-leaf 1 */
+#define bit_XSAVEOPT    0x00000001
+#define bit_XSAVEC      0x00000002
+#define bit_XSAVES      0x00000008
+
+/* Features in %ecx for leaf 0x80000001 */
+#define bit_LAHF_LM     0x00000001
+#define bit_ABM         0x00000020
+#define bit_SSE4a       0x00000040
+#define bit_PRFCHW      0x00000100
+#define bit_XOP         0x00000800
+#define bit_LWP         0x00008000
+#define bit_FMA4        0x00010000
+#define bit_TBM         0x00200000
+#define bit_MWAITX      0x20000000
+
+/* Features in %edx for leaf 0x80000001 */
+#define bit_MMXEXT      0x00400000
+#define bit_LM          0x20000000
+#define bit_3DNOWP      0x40000000
+#define bit_3DNOW       0x80000000
+
+/* Features in %ebx for leaf 0x80000001 */
+#define bit_CLZERO      0x00000001
+
 
 #if __i386__
-#define __cpuid(__level, __eax, __ebx, __ecx, __edx) \
+#define __cpuid(__leaf, __eax, __ebx, __ecx, __edx) \
     __asm("cpuid" : "=a"(__eax), "=b" (__ebx), "=c"(__ecx), "=d"(__edx) \
-                  : "0"(__level))
+                  : "0"(__leaf))
 
-#define __cpuid_count(__level, __count, __eax, __ebx, __ecx, __edx) \
+#define __cpuid_count(__leaf, __count, __eax, __ebx, __ecx, __edx) \
     __asm("cpuid" : "=a"(__eax), "=b" (__ebx), "=c"(__ecx), "=d"(__edx) \
-                  : "0"(__level), "2"(__count))
+                  : "0"(__leaf), "2"(__count))
 #else
 /* x86-64 uses %rbx as the base register, so preserve it. */
-#define __cpuid(__level, __eax, __ebx, __ecx, __edx) \
+#define __cpuid(__leaf, __eax, __ebx, __ecx, __edx) \
     __asm("  xchgq  %%rbx,%q1\n" \
           "  cpuid\n" \
           "  xchgq  %%rbx,%q1" \
         : "=a"(__eax), "=r" (__ebx), "=c"(__ecx), "=d"(__edx) \
-        : "0"(__level))
+        : "0"(__leaf))
 
-#define __cpuid_count(__level, __count, __eax, __ebx, __ecx, __edx) \
+#define __cpuid_count(__leaf, __count, __eax, __ebx, __ecx, __edx) \
     __asm("  xchgq  %%rbx,%q1\n" \
           "  cpuid\n" \
           "  xchgq  %%rbx,%q1" \
         : "=a"(__eax), "=r" (__ebx), "=c"(__ecx), "=d"(__edx) \
-        : "0"(__level), "2"(__count))
+        : "0"(__leaf), "2"(__count))
 #endif
 
-static __inline int __get_cpuid (unsigned int __level, unsigned int *__eax,
-                                 unsigned int *__ebx, unsigned int *__ecx,
-                                 unsigned int *__edx) {
-    __cpuid(__level, *__eax, *__ebx, *__ecx, *__edx);
-    return 1;
-}
-
-static __inline int __get_cpuid_max (unsigned int __level, unsigned int *__sig)
+static __inline int __get_cpuid_max (unsigned int __leaf, unsigned int *__sig)
 {
     unsigned int __eax, __ebx, __ecx, __edx;
 #if __i386__
@@ -208,8 +259,35 @@ static __inline int __get_cpuid_max (unsigned int __level, unsigned int *__sig)
         return 0;
 #endif
 
-    __cpuid(__level, __eax, __ebx, __ecx, __edx);
+    __cpuid(__leaf, __eax, __ebx, __ecx, __edx);
     if (__sig)
         *__sig = __ebx;
     return __eax;
 }
+
+static __inline int __get_cpuid (unsigned int __leaf, unsigned int *__eax,
+                                 unsigned int *__ebx, unsigned int *__ecx,
+                                 unsigned int *__edx)
+{
+    unsigned int __max_leaf = __get_cpuid_max(__leaf & 0x80000000, 0);
+
+    if (__max_leaf == 0 || __max_leaf < __leaf)
+        return 0;
+
+    __cpuid(__leaf, *__eax, *__ebx, *__ecx, *__edx);
+    return 1;
+}
+
+static __inline int __get_cpuid_count (unsigned int __leaf,
+                                       unsigned int __subleaf,
+                                       unsigned int *__eax, unsigned int *__ebx,
+                                       unsigned int *__ecx, unsigned int *__edx)
+{
+    unsigned int __max_leaf = __get_cpuid_max(__leaf & 0x80000000, 0);
+
+    if (__max_leaf == 0 || __max_leaf < __leaf)
+        return 0;
+
+    __cpuid_count(__leaf, __subleaf, *__eax, *__ebx, *__ecx, *__edx);
+    return 1;
+}

@@ -302,7 +302,7 @@ _mm_min_pd(__m128d __a, __m128d __b)
   return __builtin_ia32_minpd((__v2df)__a, (__v2df)__b);
 }
 
-/// \brief Compares lower 64-bits double-precision values of both operands, and
+/// \brief Compares lower 64-bit double-precision values of both operands, and
 ///    returns the greater of the pair of values in the lower 64-bits of the
 ///    result. The upper 64 bits of the result are copied from the upper double-
 ///    precision value of the first operand.
@@ -462,8 +462,9 @@ _mm_cmplt_pd(__m128d __a, __m128d __b)
 
 /// \brief Compares each of the corresponding double-precision values of the
 ///    128-bit vectors of [2 x double] to determine if the values in the first
-///    operand are less than or equal to those in the second operand. Each
-///    comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
+///    operand are less than or equal to those in the second operand.
+///
+///    Each comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -482,8 +483,9 @@ _mm_cmple_pd(__m128d __a, __m128d __b)
 
 /// \brief Compares each of the corresponding double-precision values of the
 ///    128-bit vectors of [2 x double] to determine if the values in the first
-///    operand are greater than those in the second operand. Each comparison
-///    yields 0h for false, FFFFFFFFFFFFFFFFh for true.
+///    operand are greater than those in the second operand.
+///
+///    Each comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -502,8 +504,9 @@ _mm_cmpgt_pd(__m128d __a, __m128d __b)
 
 /// \brief Compares each of the corresponding double-precision values of the
 ///    128-bit vectors of [2 x double] to determine if the values in the first
-///    operand are greater than or equal to those in the second operand. Each
-///    comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
+///    operand are greater than or equal to those in the second operand.
+///
+///    Each comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -522,9 +525,10 @@ _mm_cmpge_pd(__m128d __a, __m128d __b)
 
 /// \brief Compares each of the corresponding double-precision values of the
 ///    128-bit vectors of [2 x double] to determine if the values in the first
-///    operand are ordered with respect to those in the second operand. A pair
-///    of double-precision values are "ordered" with respect to each other if
-///    neither value is a NaN. Each comparison yields 0h for false,
+///    operand are ordered with respect to those in the second operand.
+///
+///    A pair of double-precision values are "ordered" with respect to each
+///    other if neither value is a NaN. Each comparison yields 0h for false,
 ///    FFFFFFFFFFFFFFFFh for true.
 ///
 /// \headerfile <x86intrin.h>
@@ -544,9 +548,10 @@ _mm_cmpord_pd(__m128d __a, __m128d __b)
 
 /// \brief Compares each of the corresponding double-precision values of the
 ///    128-bit vectors of [2 x double] to determine if the values in the first
-///    operand are unordered with respect to those in the second operand. A pair
-///    of double-precision values are "unordered" with respect to each other if
-///    one or both values are NaN. Each comparison yields 0h for false,
+///    operand are unordered with respect to those in the second operand.
+///
+///    A pair of double-precision values are "unordered" with respect to each
+///    other if one or both values are NaN. Each comparison yields 0h for false,
 ///    FFFFFFFFFFFFFFFFh for true.
 ///
 /// \headerfile <x86intrin.h>
@@ -567,8 +572,9 @@ _mm_cmpunord_pd(__m128d __a, __m128d __b)
 
 /// \brief Compares each of the corresponding double-precision values of the
 ///    128-bit vectors of [2 x double] to determine if the values in the first
-///    operand are unequal to those in the second operand. Each comparison
-///    yields 0h for false, FFFFFFFFFFFFFFFFh for true.
+///    operand are unequal to those in the second operand.
+///
+///    Each comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -587,8 +593,9 @@ _mm_cmpneq_pd(__m128d __a, __m128d __b)
 
 /// \brief Compares each of the corresponding double-precision values of the
 ///    128-bit vectors of [2 x double] to determine if the values in the first
-///    operand are not less than those in the second operand. Each comparison
-///    yields 0h for false, FFFFFFFFFFFFFFFFh for true.
+///    operand are not less than those in the second operand.
+///
+///    Each comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -607,8 +614,9 @@ _mm_cmpnlt_pd(__m128d __a, __m128d __b)
 
 /// \brief Compares each of the corresponding double-precision values of the
 ///    128-bit vectors of [2 x double] to determine if the values in the first
-///    operand are not less than or equal to those in the second operand. Each
-///    comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
+///    operand are not less than or equal to those in the second operand.
+///
+///    Each comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -627,8 +635,9 @@ _mm_cmpnle_pd(__m128d __a, __m128d __b)
 
 /// \brief Compares each of the corresponding double-precision values of the
 ///    128-bit vectors of [2 x double] to determine if the values in the first
-///    operand are not greater than those in the second operand. Each
-///    comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
+///    operand are not greater than those in the second operand.
+///
+///    Each comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -648,6 +657,7 @@ _mm_cmpngt_pd(__m128d __a, __m128d __b)
 /// \brief Compares each of the corresponding double-precision values of the
 ///    128-bit vectors of [2 x double] to determine if the values in the first
 ///    operand are not greater than or equal to those in the second operand.
+///
 ///    Each comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
 ///
 /// \headerfile <x86intrin.h>
@@ -666,8 +676,9 @@ _mm_cmpnge_pd(__m128d __a, __m128d __b)
 }
 
 /// \brief Compares the lower double-precision floating-point values in each of
-///    the two 128-bit floating-point vectors of [2 x double] for equality. The
-///    comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
+///    the two 128-bit floating-point vectors of [2 x double] for equality.
+///
+///    The comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -690,8 +701,9 @@ _mm_cmpeq_sd(__m128d __a, __m128d __b)
 /// \brief Compares the lower double-precision floating-point values in each of
 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
 ///    the value in the first parameter is less than the corresponding value in
-///    the second parameter. The comparison yields 0h for false,
-///    FFFFFFFFFFFFFFFFh for true.
+///    the second parameter.
+///
+///    The comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -714,8 +726,9 @@ _mm_cmplt_sd(__m128d __a, __m128d __b)
 /// \brief Compares the lower double-precision floating-point values in each of
 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
 ///    the value in the first parameter is less than or equal to the
-///    corresponding value in the second parameter. The comparison yields 0h for
-///    false, FFFFFFFFFFFFFFFFh for true.
+///    corresponding value in the second parameter.
+///
+///    The comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -738,8 +751,9 @@ _mm_cmple_sd(__m128d __a, __m128d __b)
 /// \brief Compares the lower double-precision floating-point values in each of
 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
 ///    the value in the first parameter is greater than the corresponding value
-///    in the second parameter. The comparison yields 0h for false,
-///    FFFFFFFFFFFFFFFFh for true.
+///    in the second parameter.
+///
+///    The comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -763,8 +777,9 @@ _mm_cmpgt_sd(__m128d __a, __m128d __b)
 /// \brief Compares the lower double-precision floating-point values in each of
 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
 ///    the value in the first parameter is greater than or equal to the
-///    corresponding value in the second parameter. The comparison yields 0h for
-///    false, FFFFFFFFFFFFFFFFh for true.
+///    corresponding value in the second parameter.
+///
+///    The comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -788,9 +803,11 @@ _mm_cmpge_sd(__m128d __a, __m128d __b)
 /// \brief Compares the lower double-precision floating-point values in each of
 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
 ///    the value in the first parameter is "ordered" with respect to the
-///    corresponding value in the second parameter. The comparison yields 0h for
-///    false, FFFFFFFFFFFFFFFFh for true. A pair of double-precision values are
-///    "ordered" with respect to each other if neither value is a NaN.
+///    corresponding value in the second parameter.
+///
+///    The comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true. A pair of
+///    double-precision values are "ordered" with respect to each other if
+///    neither value is a NaN.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -813,9 +830,11 @@ _mm_cmpord_sd(__m128d __a, __m128d __b)
 /// \brief Compares the lower double-precision floating-point values in each of
 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
 ///    the value in the first parameter is "unordered" with respect to the
-///    corresponding value in the second parameter. The comparison yields 0h
-///    for false, FFFFFFFFFFFFFFFFh for true. A pair of double-precision values
-///    are "unordered" with respect to each other if one or both values are NaN.
+///    corresponding value in the second parameter.
+///
+///    The comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true. A pair of
+///    double-precision values are "unordered" with respect to each other if one
+///    or both values are NaN.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -839,8 +858,9 @@ _mm_cmpunord_sd(__m128d __a, __m128d __b)
 /// \brief Compares the lower double-precision floating-point values in each of
 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
 ///    the value in the first parameter is unequal to the corresponding value in
-///    the second parameter. The comparison yields 0h for false,
-///    FFFFFFFFFFFFFFFFh for true.
+///    the second parameter.
+///
+///    The comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -863,8 +883,9 @@ _mm_cmpneq_sd(__m128d __a, __m128d __b)
 /// \brief Compares the lower double-precision floating-point values in each of
 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
 ///    the value in the first parameter is not less than the corresponding
-///    value in the second parameter. The comparison yields 0h for false,
-///    FFFFFFFFFFFFFFFFh for true.
+///    value in the second parameter.
+///
+///    The comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -887,8 +908,9 @@ _mm_cmpnlt_sd(__m128d __a, __m128d __b)
 /// \brief Compares the lower double-precision floating-point values in each of
 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
 ///    the value in the first parameter is not less than or equal to the
-///    corresponding value in the second parameter. The comparison yields 0h
-///    for false, FFFFFFFFFFFFFFFFh for true.
+///    corresponding value in the second parameter.
+///
+///    The comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -911,8 +933,9 @@ _mm_cmpnle_sd(__m128d __a, __m128d __b)
 /// \brief Compares the lower double-precision floating-point values in each of
 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
 ///    the value in the first parameter is not greater than the corresponding
-///    value in the second parameter. The comparison yields 0h for false,
-///    FFFFFFFFFFFFFFFFh for true.
+///    value in the second parameter.
+///
+///    The comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -936,8 +959,9 @@ _mm_cmpngt_sd(__m128d __a, __m128d __b)
 /// \brief Compares the lower double-precision floating-point values in each of
 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
 ///    the value in the first parameter is not greater than or equal to the
-///    corresponding value in the second parameter. The comparison yields 0h
-///    for false, FFFFFFFFFFFFFFFFh for true.
+///    corresponding value in the second parameter.
+///
+///    The comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -982,7 +1006,9 @@ _mm_comieq_sd(__m128d __a, __m128d __b)
 /// \brief Compares the lower double-precision floating-point values in each of
 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
 ///    the value in the first parameter is less than the corresponding value in
-///    the second parameter. The comparison yields 0 for false, 1 for true.
+///    the second parameter.
+///
+///    The comparison yields 0 for false, 1 for true.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -1004,8 +1030,9 @@ _mm_comilt_sd(__m128d __a, __m128d __b)
 /// \brief Compares the lower double-precision floating-point values in each of
 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
 ///    the value in the first parameter is less than or equal to the
-///    corresponding value in the second parameter. The comparison yields 0 for
-///    false, 1 for true.
+///    corresponding value in the second parameter.
+///
+///    The comparison yields 0 for false, 1 for true.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -1027,7 +1054,9 @@ _mm_comile_sd(__m128d __a, __m128d __b)
 /// \brief Compares the lower double-precision floating-point values in each of
 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
 ///    the value in the first parameter is greater than the corresponding value
-///    in the second parameter. The comparison yields 0 for false, 1 for true.
+///    in the second parameter.
+///
+///    The comparison yields 0 for false, 1 for true.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -1049,8 +1078,9 @@ _mm_comigt_sd(__m128d __a, __m128d __b)
 /// \brief Compares the lower double-precision floating-point values in each of
 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
 ///    the value in the first parameter is greater than or equal to the
-///    corresponding value in the second parameter. The comparison yields 0 for
-///    false, 1 for true.
+///    corresponding value in the second parameter.
+///
+///    The comparison yields 0 for false, 1 for true.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -1072,7 +1102,9 @@ _mm_comige_sd(__m128d __a, __m128d __b)
 /// \brief Compares the lower double-precision floating-point values in each of
 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
 ///    the value in the first parameter is unequal to the corresponding value in
-///    the second parameter. The comparison yields 0 for false, 1 for true.
+///    the second parameter.
+///
+///    The comparison yields 0 for false, 1 for true.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -1093,8 +1125,9 @@ _mm_comineq_sd(__m128d __a, __m128d __b)
 
 /// \brief Compares the lower double-precision floating-point values in each of
 ///    the two 128-bit floating-point vectors of [2 x double] for equality. The
-///    comparison yields 0 for false, 1 for true. If either of the two lower
-///    double-precision values is NaN, 1 is returned.
+///    comparison yields 0 for false, 1 for true.
+///
+///    If either of the two lower double-precision values is NaN, 1 is returned.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -1117,8 +1150,10 @@ _mm_ucomieq_sd(__m128d __a, __m128d __b)
 /// \brief Compares the lower double-precision floating-point values in each of
 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
 ///    the value in the first parameter is less than the corresponding value in
-///    the second parameter. The comparison yields 0 for false, 1 for true. If
-///    either of the two lower double-precision values is NaN, 1 is returned.
+///    the second parameter.
+///
+///    The comparison yields 0 for false, 1 for true. If either of the two lower
+///    double-precision values is NaN, 1 is returned.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -1141,9 +1176,10 @@ _mm_ucomilt_sd(__m128d __a, __m128d __b)
 /// \brief Compares the lower double-precision floating-point values in each of
 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
 ///    the value in the first parameter is less than or equal to the
-///    corresponding value in the second parameter. The comparison yields 0 for
-///    false, 1 for true. If either of the two lower double-precision values is
-///    NaN, 1 is returned.
+///    corresponding value in the second parameter.
+///
+///    The comparison yields 0 for false, 1 for true. If either of the two lower
+///    double-precision values is NaN, 1 is returned.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -1166,8 +1202,10 @@ _mm_ucomile_sd(__m128d __a, __m128d __b)
 /// \brief Compares the lower double-precision floating-point values in each of
 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
 ///    the value in the first parameter is greater than the corresponding value
-///    in the second parameter. The comparison yields 0 for false, 1 for true.
-///    If either of the two lower double-precision values is NaN, 0 is returned.
+///    in the second parameter.
+///
+///    The comparison yields 0 for false, 1 for true. If either of the two lower
+///    double-precision values is NaN, 0 is returned.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -1190,9 +1228,10 @@ _mm_ucomigt_sd(__m128d __a, __m128d __b)
 /// \brief Compares the lower double-precision floating-point values in each of
 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
 ///    the value in the first parameter is greater than or equal to the
-///    corresponding value in the second parameter. The comparison yields 0 for
-///    false, 1 for true.  If either of the two lower double-precision values
-///    is NaN, 0 is returned.
+///    corresponding value in the second parameter.
+///
+///    The comparison yields 0 for false, 1 for true.  If either of the two
+///    lower double-precision values is NaN, 0 is returned.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -1215,8 +1254,10 @@ _mm_ucomige_sd(__m128d __a, __m128d __b)
 /// \brief Compares the lower double-precision floating-point values in each of
 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
 ///    the value in the first parameter is unequal to the corresponding value in
-///    the second parameter. The comparison yields 0 for false, 1 for true. If
-///    either of the two lower double-precision values is NaN, 0 is returned.
+///    the second parameter.
+///
+///    The comparison yields 0 for false, 1 for true. If either of the two lower
+///    double-precision values is NaN, 0 is returned.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -1278,8 +1319,9 @@ _mm_cvtps_pd(__m128 __a)
 
 /// \brief Converts the lower two integer elements of a 128-bit vector of
 ///    [4 x i32] into two double-precision floating-point values, returned in a
-///    128-bit vector of [2 x double]. The upper two elements of the input
-///    vector are unused.
+///    128-bit vector of [2 x double].
+///
+///    The upper two elements of the input vector are unused.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -1287,7 +1329,9 @@ _mm_cvtps_pd(__m128 __a)
 ///
 /// \param __a
 ///    A 128-bit integer vector of [4 x i32]. The lower two integer elements are
-///    converted to double-precision values. The upper two elements are unused.
+///    converted to double-precision values.
+///
+///    The upper two elements are unused.
 /// \returns A 128-bit vector of [2 x double] containing the converted values.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_cvtepi32_pd(__m128i __a)
@@ -1409,10 +1453,11 @@ _mm_cvtss_sd(__m128d __a, __m128 __b)
 
 /// \brief Converts the two double-precision floating-point elements of a
 ///    128-bit vector of [2 x double] into two signed 32-bit integer values,
-///    returned in the lower 64 bits of a 128-bit vector of [4 x i32]. If the
-///    result of either conversion is inexact, the result is truncated (rounded
-///    towards zero) regardless of the current MXCSR setting. The upper 64 bits
-///    of the result vector are set to zero.
+///    returned in the lower 64 bits of a 128-bit vector of [4 x i32].
+///
+///    If the result of either conversion is inexact, the result is truncated
+///    (rounded towards zero) regardless of the current MXCSR setting. The upper
+///    64 bits of the result vector are set to zero.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -1466,9 +1511,10 @@ _mm_cvtpd_pi32(__m128d __a)
 
 /// \brief Converts the two double-precision floating-point elements of a
 ///    128-bit vector of [2 x double] into two signed 32-bit integer values,
-///    returned in a 64-bit vector of [2 x i32]. If the result of either
-///    conversion is inexact, the result is truncated (rounded towards zero)
-///    regardless of the current MXCSR setting.
+///    returned in a 64-bit vector of [2 x i32].
+///
+///    If the result of either conversion is inexact, the result is truncated
+///    (rounded towards zero) regardless of the current MXCSR setting.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -1599,6 +1645,17 @@ _mm_loadu_pd(double const *__dp)
   return ((struct __loadu_pd*)__dp)->__v;
 }
 
+/// \brief Loads a 64-bit integer value to the low element of a 128-bit integer
+///    vector and clears the upper element.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
+///
+/// \param __a
+///    A pointer to a 64-bit memory location. The address of the memory
+///    location does not have to be aligned.
+/// \returns A 128-bit vector of [2 x i64] containing the loaded value.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_loadu_si64(void const *__a)
 {
@@ -1609,6 +1666,17 @@ _mm_loadu_si64(void const *__a)
   return (__m128i){__u, 0L};
 }
 
+/// \brief Loads a 64-bit double-precision value to the low element of a
+///    128-bit integer vector and clears the upper element.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVSD / MOVSD </c> instruction.
+///
+/// \param __dp
+///    A pointer to a memory location containing a double-precision value.
+///    The address of the memory location does not have to be aligned.
+/// \returns A 128-bit vector of [2 x double] containing the loaded value.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_load_sd(double const *__dp)
 {
@@ -1728,6 +1796,24 @@ _mm_set1_pd(double __w)
   return (__m128d){ __w, __w };
 }
 
+/// \brief Constructs a 128-bit floating-point vector of [2 x double], with each
+///    of the two double-precision floating-point vector elements set to the
+///    specified double-precision floating-point value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVDDUP / MOVLHPS </c> instruction.
+///
+/// \param __w
+///    A double-precision floating-point value used to initialize each vector
+///    element of the result.
+/// \returns An initialized 128-bit floating-point vector of [2 x double].
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_set_pd1(double __w)
+{
+  return _mm_set1_pd(__w);
+}
+
 /// \brief Constructs a 128-bit floating-point vector of [2 x double]
 ///    initialized with the specified double-precision floating-point values.
 ///
@@ -1787,7 +1873,7 @@ _mm_setzero_pd(void)
 /// \brief Constructs a 128-bit floating-point vector of [2 x double]. The lower
 ///    64 bits are set to the lower 64 bits of the second parameter. The upper
 ///    64 bits are set to the upper 64 bits of the first parameter.
-//
+///
 /// \headerfile <x86intrin.h>
 ///
 /// This intrinsic corresponds to the <c> VBLENDPD / BLENDPD </c> instruction.
@@ -1825,12 +1911,38 @@ _mm_store_sd(double *__dp, __m128d __a)
   ((struct __mm_store_sd_struct*)__dp)->__u = __a[0];
 }
 
+/// \brief Moves packed double-precision values from a 128-bit vector of
+///    [2 x double] to a memory location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c>VMOVAPD / MOVAPS</c> instruction.
+///
+/// \param __dp
+///    A pointer to an aligned memory location that can store two
+///    double-precision values.
+/// \param __a
+///    A packed 128-bit vector of [2 x double] containing the values to be
+///    moved.
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm_store_pd(double *__dp, __m128d __a)
 {
   *(__m128d*)__dp = __a;
 }
 
+/// \brief Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to
+///    the upper and lower 64 bits of a memory location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c>VMOVDDUP + VMOVAPD / MOVLHPS + MOVAPS </c> instruction.
+///
+/// \param __dp
+///    A pointer to a memory location that can store two double-precision
+///    values.
+/// \param __a
+///    A 128-bit vector of [2 x double] whose lower 64 bits are copied to each
+///    of the values in \a dp.
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm_store1_pd(double *__dp, __m128d __a)
 {
@@ -1940,8 +2052,9 @@ _mm_storel_pd(double *__dp, __m128d __a)
 
 /// \brief Adds the corresponding elements of two 128-bit vectors of [16 x i8],
 ///    saving the lower 8 bits of each sum in the corresponding element of a
-///    128-bit result vector of [16 x i8]. The integer elements of both
-///    parameters can be either signed or unsigned.
+///    128-bit result vector of [16 x i8].
+///
+///    The integer elements of both parameters can be either signed or unsigned.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -1961,8 +2074,9 @@ _mm_add_epi8(__m128i __a, __m128i __b)
 
 /// \brief Adds the corresponding elements of two 128-bit vectors of [8 x i16],
 ///    saving the lower 16 bits of each sum in the corresponding element of a
-///    128-bit result vector of [8 x i16]. The integer elements of both
-///    parameters can be either signed or unsigned.
+///    128-bit result vector of [8 x i16].
+///
+///    The integer elements of both parameters can be either signed or unsigned.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -1982,8 +2096,9 @@ _mm_add_epi16(__m128i __a, __m128i __b)
 
 /// \brief Adds the corresponding elements of two 128-bit vectors of [4 x i32],
 ///    saving the lower 32 bits of each sum in the corresponding element of a
-///    128-bit result vector of [4 x i32]. The integer elements of both
-///    parameters can be either signed or unsigned.
+///    128-bit result vector of [4 x i32].
+///
+///    The integer elements of both parameters can be either signed or unsigned.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -2021,8 +2136,9 @@ _mm_add_si64(__m64 __a, __m64 __b)
 
 /// \brief Adds the corresponding elements of two 128-bit vectors of [2 x i64],
 ///    saving the lower 64 bits of each sum in the corresponding element of a
-///    128-bit result vector of [2 x i64]. The integer elements of both
-///    parameters can be either signed or unsigned.
+///    128-bit result vector of [2 x i64].
+///
+///    The integer elements of both parameters can be either signed or unsigned.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -2168,10 +2284,12 @@ _mm_avg_epu16(__m128i __a, __m128i __b)
 /// \brief Multiplies the corresponding elements of two 128-bit signed [8 x i16]
 ///    vectors, producing eight intermediate 32-bit signed integer products, and
 ///    adds the consecutive pairs of 32-bit products to form a 128-bit signed
-///    [4 x i32] vector. For example, bits [15:0] of both parameters are
-///    multiplied producing a 32-bit product, bits [31:16] of both parameters
-///    are multiplied producing a 32-bit product, and the sum of those two
-///    products becomes bits [31:0] of the result.
+///    [4 x i32] vector.
+///
+///    For example, bits [15:0] of both parameters are multiplied producing a
+///    32-bit product, bits [31:16] of both parameters are multiplied producing
+///    a 32-bit product, and the sum of those two products becomes bits [31:0]
+///    of the result.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -2369,7 +2487,7 @@ _mm_mul_epu32(__m128i __a, __m128i __b)
 
 /// \brief Computes the absolute differences of corresponding 8-bit integer
 ///    values in two 128-bit vectors. Sums the first 8 absolute differences, and
-///    separately sums the second 8 absolute differences. Packss these two
+///    separately sums the second 8 absolute differences. Packs these two
 ///    unsigned 16-bit integer sums into the upper and lower elements of a
 ///    [2 x i64] vector.
 ///
@@ -3106,8 +3224,9 @@ _mm_cmpgt_epi8(__m128i __a, __m128i __b)
 
 /// \brief Compares each of the corresponding signed 16-bit values of the
 ///    128-bit integer vectors to determine if the values in the first operand
-///    are greater than those in the second operand. Each comparison yields 0h
-///    for false, FFFFh for true.
+///    are greater than those in the second operand.
+///
+///    Each comparison yields 0h for false, FFFFh for true.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -3126,8 +3245,9 @@ _mm_cmpgt_epi16(__m128i __a, __m128i __b)
 
 /// \brief Compares each of the corresponding signed 32-bit values of the
 ///    128-bit integer vectors to determine if the values in the first operand
-///    are greater than those in the second operand. Each comparison yields 0h
-///    for false, FFFFFFFFh for true.
+///    are greater than those in the second operand.
+///
+///    Each comparison yields 0h for false, FFFFFFFFh for true.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -3146,8 +3266,9 @@ _mm_cmpgt_epi32(__m128i __a, __m128i __b)
 
 /// \brief Compares each of the corresponding signed 8-bit values of the 128-bit
 ///    integer vectors to determine if the values in the first operand are less
-///    than those in the second operand. Each comparison yields 0h for false,
-///    FFh for true.
+///    than those in the second operand.
+///
+///    Each comparison yields 0h for false, FFh for true.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -3166,8 +3287,9 @@ _mm_cmplt_epi8(__m128i __a, __m128i __b)
 
 /// \brief Compares each of the corresponding signed 16-bit values of the
 ///    128-bit integer vectors to determine if the values in the first operand
-///    are less than those in the second operand. Each comparison yields 0h for
-///    false, FFFFh for true.
+///    are less than those in the second operand.
+///
+///    Each comparison yields 0h for false, FFFFh for true.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -3186,8 +3308,9 @@ _mm_cmplt_epi16(__m128i __a, __m128i __b)
 
 /// \brief Compares each of the corresponding signed 32-bit values of the
 ///    128-bit integer vectors to determine if the values in the first operand
-///    are less than those in the second operand. Each comparison yields 0h for
-///    false, FFFFFFFFh for true.
+///    are less than those in the second operand.
+///
+///    Each comparison yields 0h for false, FFFFFFFFh for true.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -3885,10 +4008,11 @@ _mm_storeu_si128(__m128i *__p, __m128i __b)
 
 /// \brief Moves bytes selected by the mask from the first operand to the
 ///    specified unaligned memory location. When a mask bit is 1, the
-///    corresponding byte is written, otherwise it is not written. To minimize
-///    caching, the date is flagged as non-temporal (unlikely to be used again
-///    soon). Exception and trap behavior for elements not selected for storage
-///    to memory are implementation dependent.
+///    corresponding byte is written, otherwise it is not written.
+///
+///    To minimize caching, the date is flagged as non-temporal (unlikely to be
+///    used again soon). Exception and trap behavior for elements not selected
+///    for storage to memory are implementation dependent.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -3932,8 +4056,10 @@ _mm_storel_epi64(__m128i *__p, __m128i __a)
 }
 
 /// \brief Stores a 128-bit floating point vector of [2 x double] to a 128-bit
-///    aligned memory location. To minimize caching, the data is flagged as
-///    non-temporal (unlikely to be used again soon).
+///    aligned memory location.
+///
+///    To minimize caching, the data is flagged as non-temporal (unlikely to be
+///    used again soon).
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -3950,6 +4076,7 @@ _mm_stream_pd(double *__p, __m128d __a)
 }
 
 /// \brief Stores a 128-bit integer vector to a 128-bit aligned memory location.
+///
 ///    To minimize caching, the data is flagged as non-temporal (unlikely to be
 ///    used again soon).
 ///
@@ -3967,8 +4094,9 @@ _mm_stream_si128(__m128i *__p, __m128i __a)
   __builtin_nontemporal_store((__v2di)__a, (__v2di*)__p);
 }
 
-/// \brief Stores a 32-bit integer value in the specified memory location. To
-///    minimize caching, the data is flagged as non-temporal (unlikely to be
+/// \brief Stores a 32-bit integer value in the specified memory location.
+///
+///    To minimize caching, the data is flagged as non-temporal (unlikely to be
 ///    used again soon).
 ///
 /// \headerfile <x86intrin.h>
@@ -3986,8 +4114,9 @@ _mm_stream_si32(int *__p, int __a)
 }
 
 #ifdef __x86_64__
-/// \brief Stores a 64-bit integer value in the specified memory location. To
-///    minimize caching, the data is flagged as non-temporal (unlikely to be
+/// \brief Stores a 64-bit integer value in the specified memory location.
+///
+///    To minimize caching, the data is flagged as non-temporal (unlikely to be
 ///    used again soon).
 ///
 /// \headerfile <x86intrin.h>
@@ -4019,7 +4148,7 @@ extern "C" {
 /// \param __p
 ///    A pointer to the memory location used to identify the cache line to be
 ///    flushed.
-void _mm_clflush(void const *);
+void _mm_clflush(void const * __p);
 
 /// \brief Forces strong memory ordering (serialization) between load
 ///    instructions preceding this instruction and load instructions following
@@ -4141,7 +4270,7 @@ _mm_packus_epi16(__m128i __a, __m128i __b)
 /// \param __a
 ///    A 128-bit integer vector.
 /// \param __imm
-///    An immediate value. Bits [3:0] selects values from \a __a to be assigned
+///    An immediate value. Bits [2:0] selects values from \a __a to be assigned
 ///    to bits[15:0] of the result. \n
 ///    000: assign values from bits [15:0] of \a __a. \n
 ///    001: assign values from bits [31:16] of \a __a. \n
@@ -4788,4 +4917,12 @@ void _mm_pause(void);
 
 #define _MM_SHUFFLE2(x, y) (((x) << 1) | (y))
 
+#define _MM_DENORMALS_ZERO_ON   (0x0040)
+#define _MM_DENORMALS_ZERO_OFF  (0x0000)
+
+#define _MM_DENORMALS_ZERO_MASK (0x0040)
+
+#define _MM_GET_DENORMALS_ZERO_MODE() (_mm_getcsr() & _MM_DENORMALS_ZERO_MASK)
+#define _MM_SET_DENORMALS_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_DENORMALS_ZERO_MASK) | (x)))
+
 #endif /* __EMMINTRIN_H */

@@ -72,9 +72,9 @@ _cvtsh_ss(unsigned short __a)
 ///    011: Truncate \n
 ///    1XX: Use MXCSR.RC for rounding
 /// \returns The converted 16-bit half-precision float value.
-#define _cvtss_sh(a, imm)  \
-  ((unsigned short)(((__v8hi)__builtin_ia32_vcvtps2ph((__v4sf){a, 0, 0, 0}, \
-                                                      (imm)))[0]))
+#define _cvtss_sh(a, imm) __extension__ ({ \
+  (unsigned short)(((__v8hi)__builtin_ia32_vcvtps2ph((__v4sf){a, 0, 0, 0}, \
+                                                     (imm)))[0]); })
 
 /// \brief Converts a 128-bit vector containing 32-bit float values into a
 ///    128-bit vector containing 16-bit half-precision float values.
@@ -99,8 +99,8 @@ _cvtsh_ss(unsigned short __a)
 /// \returns A 128-bit vector containing converted 16-bit half-precision float
 ///    values. The lower 64 bits are used to store the converted 16-bit
 ///    half-precision floating-point values.
-#define _mm_cvtps_ph(a, imm) \
-  ((__m128i)__builtin_ia32_vcvtps2ph((__v4sf)(__m128)(a), (imm)))
+#define _mm_cvtps_ph(a, imm) __extension__ ({ \
+  (__m128i)__builtin_ia32_vcvtps2ph((__v4sf)(__m128)(a), (imm)); })
 
 /// \brief Converts a 128-bit vector containing 16-bit half-precision float
 ///    values into a 128-bit vector containing 32-bit float values.

@@ -33,6 +33,15 @@
  */
 #if (defined(__APPLE__) || (defined(__MINGW32__) || defined(_MSC_VER))) && \
     __STDC_HOSTED__ && __has_include_next(<float.h>)
+
+/* Prior to Apple's 10.7 SDK, float.h SDK header used to apply an extra level
+ * of #include_next<float.h> to keep Metrowerks compilers happy. Avoid this
+ * extra indirection.
+ */
+#ifdef __APPLE__
+#define _FLOAT_H_
+#endif
+
 #  include_next <float.h>
 
 /* Undefine anything that we'll be redefining below. */

@@ -35,14 +35,10 @@
 extern "C" {
 #endif
 
-#define _TEXASR_PTR(TM_BUF) \
-  ((texasr_t *)((TM_BUF)+0))
-#define _TEXASRU_PTR(TM_BUF) \
-  ((texasru_t *)((TM_BUF)+0))
-#define _TEXASRL_PTR(TM_BUF) \
-  ((texasrl_t *)((TM_BUF)+4))
-#define _TFIAR_PTR(TM_BUF) \
-  ((tfiar_t *)((TM_BUF)+8))
+#define _TEXASR_PTR(TM_BUF) ((texasr_t *)((char *)(TM_BUF) + 0))
+#define _TEXASRU_PTR(TM_BUF) ((texasru_t *)((char *)(TM_BUF) + 0))
+#define _TEXASRL_PTR(TM_BUF) ((texasrl_t *)((char *)(TM_BUF) + 4))
+#define _TFIAR_PTR(TM_BUF) ((tfiar_t *)((char *)(TM_BUF) + 8))
 
 typedef char TM_buff_type[16];
 
@@ -178,7 +174,7 @@ extern __inline long
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 __TM_is_conflict(void* const __TM_buff)
 {
-  texasru_t texasru = *_TEXASRU_PTR (TM_buff);
+  texasru_t texasru = *_TEXASRU_PTR (__TM_buff);
   /* Return TEXASR bits 11 (Self-Induced Conflict) through
      14 (Translation Invalidation Conflict).  */
   return (_TEXASRU_EXTRACT_BITS (texasru, 14, 4)) ? 1 : 0;

@@ -146,6 +146,10 @@ _mm256_cvtph_ps(__m128i __a)
 #include <avx512cdintrin.h>
 #endif
 
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512VPOPCNTDQ__)
+#include <avx512vpopcntdqintrin.h>
+#endif
+
 #if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512DQ__)
 #include <avx512dqintrin.h>
 #endif
@@ -208,6 +212,15 @@ _rdrand32_step(unsigned int *__p)
   return __builtin_ia32_rdrand32_step(__p);
 }
 
+#ifdef __x86_64__
+static __inline__ int __attribute__((__always_inline__, __nodebug__, __target__("rdrnd")))
+_rdrand64_step(unsigned long long *__p)
+{
+  return __builtin_ia32_rdrand64_step(__p);
+}
+#endif
+#endif /* __RDRND__ */
+
 /* __bit_scan_forward */
 static __inline__ int __attribute__((__always_inline__, __nodebug__))
 _bit_scan_forward(int __A) {
@@ -220,15 +233,6 @@ _bit_scan_reverse(int __A) {
   return 31 - __builtin_clz(__A);
 }
 
-#ifdef __x86_64__
-static __inline__ int __attribute__((__always_inline__, __nodebug__, __target__("rdrnd")))
-_rdrand64_step(unsigned long long *__p)
-{
-  return __builtin_ia32_rdrand64_step(__p);
-}
-#endif
-#endif /* __RDRND__ */
-
 #if !defined(_MSC_VER) || __has_feature(modules) || defined(__FSGSBASE__)
 #ifdef __x86_64__
 static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))

@@ -69,7 +69,6 @@ static __inline__
 __int64 __emul(int, int);
 static __inline__
 unsigned __int64 __emulu(unsigned int, unsigned int);
-void __cdecl __fastfail(unsigned int);
 unsigned int __getcallerseflags(void);
 static __inline__
 void __halt(void);
@@ -80,16 +79,12 @@ void __incfsdword(unsigned long);
 void __incfsword(unsigned long);
 unsigned long __indword(unsigned short);
 void __indwordstring(unsigned short, unsigned long *, unsigned long);
-void __int2c(void);
 void __invlpg(void *);
 unsigned short __inword(unsigned short);
 void __inwordstring(unsigned short, unsigned short *, unsigned long);
 void __lidt(void *);
 unsigned __int64 __ll_lshift(unsigned __int64, int);
 __int64 __ll_rshift(__int64, int);
-void __llwpcb(void *);
-unsigned char __lwpins32(unsigned int, unsigned int, unsigned int);
-void __lwpval32(unsigned int, unsigned int, unsigned int);
 unsigned int __lzcnt(unsigned int);
 unsigned short __lzcnt16(unsigned short);
 static __inline__
@@ -128,7 +123,6 @@ unsigned __int64 __readmsr(unsigned long);
 unsigned __int64 __readpmc(unsigned long);
 unsigned long __segmentlimit(unsigned long);
 void __sidt(void *);
-void *__slwpcb(void);
 static __inline__
 void __stosb(unsigned char *, unsigned char, size_t);
 static __inline__
@@ -142,7 +136,6 @@ void __svm_stgi(void);
 void __svm_vmload(size_t);
 void __svm_vmrun(size_t);
 void __svm_vmsave(size_t);
-void __ud2(void);
 unsigned __int64 __ull_rshift(unsigned __int64, int);
 void __vmx_off(void);
 void __vmx_vmptrst(unsigned __int64 *);
@@ -176,7 +169,6 @@ void __cdecl _disable(void);
 void __cdecl _enable(void);
 long _InterlockedAddLargeStatistic(__int64 volatile *_Addend, long _Value);
 unsigned char _interlockedbittestandreset(long volatile *, long);
-static __inline__
 unsigned char _interlockedbittestandset(long volatile *, long);
 long _InterlockedCompareExchange_HLEAcquire(long volatile *, long, long);
 long _InterlockedCompareExchange_HLERelease(long volatile *, long, long);
@@ -231,8 +223,6 @@ void __incgsbyte(unsigned long);
 void __incgsdword(unsigned long);
 void __incgsqword(unsigned long);
 void __incgsword(unsigned long);
-unsigned char __lwpins64(unsigned __int64, unsigned int, unsigned int);
-void __lwpval64(unsigned __int64, unsigned int, unsigned int);
 unsigned __int64 __lzcnt64(unsigned __int64);
 static __inline__
 void __movsq(unsigned long long *, unsigned long long const *, size_t);
@@ -372,11 +362,6 @@ _bittestandset(long *_BitBase, long _BitPos) {
   *_BitBase = *_BitBase | (1 << _BitPos);
   return _Res;
 }
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_interlockedbittestandset(long volatile *_BitBase, long _BitPos) {
-  long _PrevVal = __atomic_fetch_or(_BitBase, 1l << _BitPos, __ATOMIC_SEQ_CST);
-  return (_PrevVal >> _BitPos) & 1;
-}
 #if defined(__arm__) || defined(__aarch64__)
 static __inline__ unsigned char __DEFAULT_FN_ATTRS
 _interlockedbittestandset_acq(long volatile *_BitBase, long _BitPos) {
@@ -872,48 +857,7 @@ _InterlockedCompareExchange64_rel(__int64 volatile *_Destination,
   return _Comparand;
 }
 #endif
-/*----------------------------------------------------------------------------*\
-|* readfs, readgs
-|* (Pointers in address space #256 and #257 are relative to the GS and FS
-|* segment registers, respectively.)
-\*----------------------------------------------------------------------------*/
-#define __ptr_to_addr_space(__addr_space_nbr, __type, __offset)              \
-    ((volatile __type __attribute__((__address_space__(__addr_space_nbr)))*) \
-    (__offset))
 
-#ifdef __i386__
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-__readfsbyte(unsigned long __offset) {
-  return *__ptr_to_addr_space(257, unsigned char, __offset);
-}
-static __inline__ unsigned short __DEFAULT_FN_ATTRS
-__readfsword(unsigned long __offset) {
-  return *__ptr_to_addr_space(257, unsigned short, __offset);
-}
-static __inline__ unsigned __int64 __DEFAULT_FN_ATTRS
-__readfsqword(unsigned long __offset) {
-  return *__ptr_to_addr_space(257, unsigned __int64, __offset);
-}
-#endif
-#ifdef __x86_64__
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-__readgsbyte(unsigned long __offset) {
-  return *__ptr_to_addr_space(256, unsigned char, __offset);
-}
-static __inline__ unsigned short __DEFAULT_FN_ATTRS
-__readgsword(unsigned long __offset) {
-  return *__ptr_to_addr_space(256, unsigned short, __offset);
-}
-static __inline__ unsigned long __DEFAULT_FN_ATTRS
-__readgsdword(unsigned long __offset) {
-  return *__ptr_to_addr_space(256, unsigned long, __offset);
-}
-static __inline__ unsigned __int64 __DEFAULT_FN_ATTRS
-__readgsqword(unsigned long __offset) {
-  return *__ptr_to_addr_space(256, unsigned __int64, __offset);
-}
-#endif
-#undef __ptr_to_addr_space
 /*----------------------------------------------------------------------------*\
 |* movs, stos
 \*----------------------------------------------------------------------------*/

@@ -0,0 +1,150 @@
+/*===---- lwpintrin.h - LWP intrinsics -------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __X86INTRIN_H
+#error "Never use <lwpintrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef __LWPINTRIN_H
+#define __LWPINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("lwp")))
+
+/// \brief Parses the LWPCB at the specified address and enables
+///        profiling if valid.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> LLWPCB </c> instruction.
+///
+/// \param __addr
+///    Address to the new Lightweight Profiling Control Block (LWPCB). If the
+///    LWPCB is valid, writes the address into the LWP_CBADDR MSR and enables
+///    Lightweight Profiling.
+static __inline__ void __DEFAULT_FN_ATTRS
+__llwpcb (void *__addr)
+{
+  __builtin_ia32_llwpcb(__addr);
+}
+
+/// \brief Flushes the LWP state to memory and returns the address of the LWPCB.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> SLWPCB </c> instruction.
+///
+/// \return
+///    Address to the current Lightweight Profiling Control Block (LWPCB).
+///    If LWP is not currently enabled, returns NULL.
+static __inline__ void* __DEFAULT_FN_ATTRS
+__slwpcb ()
+{
+  return __builtin_ia32_slwpcb();
+}
+
+/// \brief Inserts programmed event record into the LWP event ring buffer
+///        and advances the ring buffer pointer.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> LWPINS </c> instruction.
+///
+/// \param DATA2
+///    A 32-bit value is zero-extended and inserted into the 64-bit Data2 field.
+/// \param DATA1
+///    A 32-bit value is inserted into the 32-bit Data1 field.
+/// \param FLAGS
+///    A 32-bit immediate value is inserted into the 32-bit Flags field.
+/// \returns If the ring buffer is full and LWP is running in Synchronized Mode,
+///    the event record overwrites the last record in the buffer, the MissedEvents
+///    counter in the LWPCB is incremented, the head pointer is not advanced, and
+///    1 is returned. Otherwise 0 is returned.
+#define __lwpins32(DATA2, DATA1, FLAGS) \
+  (__builtin_ia32_lwpins32((unsigned int) (DATA2), (unsigned int) (DATA1), \
+                           (unsigned int) (FLAGS)))
+
+/// \brief Decrements the LWP programmed value sample event counter. If the result is 
+///        negative, inserts an event record into the LWP event ring buffer in memory
+///        and advances the ring buffer pointer.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> LWPVAL </c> instruction.
+///
+/// \param DATA2
+///    A 32-bit value is zero-extended and inserted into the 64-bit Data2 field.
+/// \param DATA1
+///    A 32-bit value is inserted into the 32-bit Data1 field.
+/// \param FLAGS
+///    A 32-bit immediate value is inserted into the 32-bit Flags field.
+#define __lwpval32(DATA2, DATA1, FLAGS) \
+  (__builtin_ia32_lwpval32((unsigned int) (DATA2), (unsigned int) (DATA1), \
+                           (unsigned int) (FLAGS)))
+
+#ifdef __x86_64__
+
+/// \brief Inserts programmed event record into the LWP event ring buffer
+///        and advances the ring buffer pointer.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> LWPINS </c> instruction.
+///
+/// \param DATA2
+///    A 64-bit value is inserted into the 64-bit Data2 field.
+/// \param DATA1
+///    A 32-bit value is inserted into the 32-bit Data1 field.
+/// \param FLAGS
+///    A 32-bit immediate value is inserted into the 32-bit Flags field.
+/// \returns If the ring buffer is full and LWP is running in Synchronized Mode,
+///    the event record overwrites the last record in the buffer, the MissedEvents
+///    counter in the LWPCB is incremented, the head pointer is not advanced, and
+///    1 is returned. Otherwise 0 is returned.
+#define __lwpins64(DATA2, DATA1, FLAGS) \
+  (__builtin_ia32_lwpins64((unsigned long long) (DATA2), (unsigned int) (DATA1), \
+                           (unsigned int) (FLAGS)))
+
+/// \brief Decrements the LWP programmed value sample event counter. If the result is 
+///        negative, inserts an event record into the LWP event ring buffer in memory
+///        and advances the ring buffer pointer.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> LWPVAL </c> instruction.
+///
+/// \param DATA2
+///    A 64-bit value is and inserted into the 64-bit Data2 field.
+/// \param DATA1
+///    A 32-bit value is inserted into the 32-bit Data1 field.
+/// \param FLAGS
+///    A 32-bit immediate value is inserted into the 32-bit Flags field.
+#define __lwpval64(DATA2, DATA1, FLAGS) \
+  (__builtin_ia32_lwpval64((unsigned long long) (DATA2), (unsigned int) (DATA1), \
+                           (unsigned int) (FLAGS)))
+
+#endif
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __LWPINTRIN_H */

@@ -211,7 +211,7 @@ _mm_packs_pu16(__m64 __m1, __m64 __m2)
 /// This intrinsic corresponds to the <c> PUNPCKHBW </c> instruction.
 ///
 /// \param __m1
-///    A 64-bit integer vector of [8 x i8]. \n 
+///    A 64-bit integer vector of [8 x i8]. \n
 ///    Bits [39:32] are written to bits [7:0] of the result. \n
 ///    Bits [47:40] are written to bits [23:16] of the result. \n
 ///    Bits [55:48] are written to bits [39:32] of the result. \n
@@ -608,10 +608,11 @@ _mm_subs_pi16(__m64 __m1, __m64 __m2)
 
 /// \brief Subtracts each 8-bit unsigned integer element of the second 64-bit
 ///    integer vector of [8 x i8] from the corresponding 8-bit unsigned integer
-///    element of the first 64-bit integer vector of [8 x i8]. If an element of
-///    the first vector is less than the corresponding element of the second
-///    vector, the result is saturated to 0. The results are packed into a
-///    64-bit integer vector of [8 x i8].
+///    element of the first 64-bit integer vector of [8 x i8].
+///
+///    If an element of the first vector is less than the corresponding element
+///    of the second vector, the result is saturated to 0. The results are
+///    packed into a 64-bit integer vector of [8 x i8].
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -631,10 +632,11 @@ _mm_subs_pu8(__m64 __m1, __m64 __m2)
 
 /// \brief Subtracts each 16-bit unsigned integer element of the second 64-bit
 ///    integer vector of [4 x i16] from the corresponding 16-bit unsigned
-///    integer element of the first 64-bit integer vector of [4 x i16]. If an
-///    element of the first vector is less than the corresponding element of the
-///    second vector, the result is saturated to 0. The results are packed into
-///    a 64-bit integer vector of [4 x i16].
+///    integer element of the first 64-bit integer vector of [4 x i16].
+///
+///    If an element of the first vector is less than the corresponding element
+///    of the second vector, the result is saturated to 0. The results are
+///    packed into a 64-bit integer vector of [4 x i16].
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -657,9 +659,11 @@ _mm_subs_pu16(__m64 __m1, __m64 __m2)
 ///    element of the second 64-bit integer vector of [4 x i16] and get four
 ///    32-bit products. Adds adjacent pairs of products to get two 32-bit sums.
 ///    The lower 32 bits of these two sums are packed into a 64-bit integer
-///    vector of [2 x i32]. For example, bits [15:0] of both parameters are
-///    multiplied, bits [31:16] of both parameters are multiplied, and the sum
-///    of both results is written to bits [31:0] of the result.
+///    vector of [2 x i32].
+///
+///    For example, bits [15:0] of both parameters are multiplied, bits [31:16]
+///    of both parameters are multiplied, and the sum of both results is written
+///    to bits [31:0] of the result.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -851,10 +855,11 @@ _mm_slli_si64(__m64 __m, int __count)
 
 /// \brief Right-shifts each 16-bit integer element of the first parameter,
 ///    which is a 64-bit integer vector of [4 x i16], by the number of bits
-///    specified by the second parameter, which is a 64-bit integer. High-order
-///    bits are filled with the sign bit of the initial value of each 16-bit
-///    element. The 16-bit results are packed into a 64-bit integer vector of
-///    [4 x i16].
+///    specified by the second parameter, which is a 64-bit integer.
+///
+///    High-order bits are filled with the sign bit of the initial value of each
+///    16-bit element. The 16-bit results are packed into a 64-bit integer
+///    vector of [4 x i16].
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -874,6 +879,7 @@ _mm_sra_pi16(__m64 __m, __m64 __count)
 
 /// \brief Right-shifts each 16-bit integer element of a 64-bit integer vector
 ///    of [4 x i16] by the number of bits specified by a 32-bit integer.
+///
 ///    High-order bits are filled with the sign bit of the initial value of each
 ///    16-bit element. The 16-bit results are packed into a 64-bit integer
 ///    vector of [4 x i16].
@@ -896,10 +902,11 @@ _mm_srai_pi16(__m64 __m, int __count)
 
 /// \brief Right-shifts each 32-bit integer element of the first parameter,
 ///    which is a 64-bit integer vector of [2 x i32], by the number of bits
-///    specified by the second parameter, which is a 64-bit integer. High-order
-///    bits are filled with the sign bit of the initial value of each 32-bit
-///    element. The 32-bit results are packed into a 64-bit integer vector of
-///    [2 x i32].
+///    specified by the second parameter, which is a 64-bit integer.
+///
+///    High-order bits are filled with the sign bit of the initial value of each
+///    32-bit element. The 32-bit results are packed into a 64-bit integer
+///    vector of [2 x i32].
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -919,6 +926,7 @@ _mm_sra_pi32(__m64 __m, __m64 __count)
 
 /// \brief Right-shifts each 32-bit integer element of a 64-bit integer vector
 ///    of [2 x i32] by the number of bits specified by a 32-bit integer.
+///
 ///    High-order bits are filled with the sign bit of the initial value of each
 ///    32-bit element. The 32-bit results are packed into a 64-bit integer
 ///    vector of [2 x i32].
@@ -941,9 +949,10 @@ _mm_srai_pi32(__m64 __m, int __count)
 
 /// \brief Right-shifts each 16-bit integer element of the first parameter,
 ///    which is a 64-bit integer vector of [4 x i16], by the number of bits
-///    specified by the second parameter, which is a 64-bit integer. High-order
-///    bits are cleared. The 16-bit results are packed into a 64-bit integer
-///    vector of [4 x i16].
+///    specified by the second parameter, which is a 64-bit integer.
+///
+///    High-order bits are cleared. The 16-bit results are packed into a 64-bit
+///    integer vector of [4 x i16].
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -963,6 +972,7 @@ _mm_srl_pi16(__m64 __m, __m64 __count)
 
 /// \brief Right-shifts each 16-bit integer element of a 64-bit integer vector
 ///    of [4 x i16] by the number of bits specified by a 32-bit integer.
+///
 ///    High-order bits are cleared. The 16-bit results are packed into a 64-bit
 ///    integer vector of [4 x i16].
 ///
@@ -984,9 +994,10 @@ _mm_srli_pi16(__m64 __m, int __count)
 
 /// \brief Right-shifts each 32-bit integer element of the first parameter,
 ///    which is a 64-bit integer vector of [2 x i32], by the number of bits
-///    specified by the second parameter, which is a 64-bit integer. High-order
-///    bits are cleared. The 32-bit results are packed into a 64-bit integer
-///    vector of [2 x i32].
+///    specified by the second parameter, which is a 64-bit integer.
+///
+///    High-order bits are cleared. The 32-bit results are packed into a 64-bit
+///    integer vector of [2 x i32].
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -1006,6 +1017,7 @@ _mm_srl_pi32(__m64 __m, __m64 __count)
 
 /// \brief Right-shifts each 32-bit integer element of a 64-bit integer vector
 ///    of [2 x i32] by the number of bits specified by a 32-bit integer.
+///
 ///    High-order bits are cleared. The 32-bit results are packed into a 64-bit
 ///    integer vector of [2 x i32].
 ///
@@ -1026,8 +1038,9 @@ _mm_srli_pi32(__m64 __m, int __count)
 }
 
 /// \brief Right-shifts the first 64-bit integer parameter by the number of bits
-///    specified by the second 64-bit integer parameter. High-order bits are
-///    cleared.
+///    specified by the second 64-bit integer parameter.
+///
+///    High-order bits are cleared.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -1046,7 +1059,9 @@ _mm_srl_si64(__m64 __m, __m64 __count)
 
 /// \brief Right-shifts the first parameter, which is a 64-bit integer, by the
 ///    number of bits specified by the second parameter, which is a 32-bit
-///    integer. High-order bits are cleared.
+///    integer.
+///
+///    High-order bits are cleared.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -1140,8 +1155,9 @@ _mm_xor_si64(__m64 __m1, __m64 __m2)
 
 /// \brief Compares the 8-bit integer elements of two 64-bit integer vectors of
 ///    [8 x i8] to determine if the element of the first vector is equal to the
-///    corresponding element of the second vector. The comparison yields 0 for
-///    false, 0xFF for true.
+///    corresponding element of the second vector.
+///
+///    The comparison yields 0 for false, 0xFF for true.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -1161,8 +1177,9 @@ _mm_cmpeq_pi8(__m64 __m1, __m64 __m2)
 
 /// \brief Compares the 16-bit integer elements of two 64-bit integer vectors of
 ///    [4 x i16] to determine if the element of the first vector is equal to the
-///    corresponding element of the second vector. The comparison yields 0 for
-///    false, 0xFFFF for true.
+///    corresponding element of the second vector.
+///
+///    The comparison yields 0 for false, 0xFFFF for true.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -1182,8 +1199,9 @@ _mm_cmpeq_pi16(__m64 __m1, __m64 __m2)
 
 /// \brief Compares the 32-bit integer elements of two 64-bit integer vectors of
 ///    [2 x i32] to determine if the element of the first vector is equal to the
-///    corresponding element of the second vector. The comparison yields 0 for
-///    false, 0xFFFFFFFF for true.
+///    corresponding element of the second vector.
+///
+///    The comparison yields 0 for false, 0xFFFFFFFF for true.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -1203,8 +1221,9 @@ _mm_cmpeq_pi32(__m64 __m1, __m64 __m2)
 
 /// \brief Compares the 8-bit integer elements of two 64-bit integer vectors of
 ///    [8 x i8] to determine if the element of the first vector is greater than
-///    the corresponding element of the second vector. The comparison yields 0
-///    for false, 0xFF for true.
+///    the corresponding element of the second vector.
+///
+///    The comparison yields 0 for false, 0xFF for true.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -1224,8 +1243,9 @@ _mm_cmpgt_pi8(__m64 __m1, __m64 __m2)
 
 /// \brief Compares the 16-bit integer elements of two 64-bit integer vectors of
 ///    [4 x i16] to determine if the element of the first vector is greater than
-///    the corresponding element of the second vector. The comparison yields 0
-///    for false, 0xFFFF for true.
+///    the corresponding element of the second vector.
+///
+///    The comparison yields 0 for false, 0xFFFF for true.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -1245,8 +1265,9 @@ _mm_cmpgt_pi16(__m64 __m1, __m64 __m2)
 
 /// \brief Compares the 32-bit integer elements of two 64-bit integer vectors of
 ///    [2 x i32] to determine if the element of the first vector is greater than
-///    the corresponding element of the second vector. The comparison yields 0
-///    for false, 0xFFFFFFFF for true.
+///    the corresponding element of the second vector.
+///
+///    The comparison yields 0 for false, 0xFFFFFFFF for true.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -1268,7 +1289,7 @@ _mm_cmpgt_pi32(__m64 __m1, __m64 __m2)
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the the <c> VXORPS / XORPS </c> instruction.
+/// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
 ///
 /// \returns An initialized 64-bit integer vector with all elements set to zero.
 static __inline__ __m64 __DEFAULT_FN_ATTRS

@@ -61,6 +61,7 @@ module _Builtin_intrinsics [system] [extern_c] {
     textual header "xopintrin.h"
     textual header "fma4intrin.h"
     textual header "mwaitxintrin.h"
+    textual header "clzerointrin.h"
 
     explicit module mm_malloc {
       requires !freestanding

@@ -16,6 +16,12 @@
 #endif //cl_khr_depth_images
 #endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
 
+#if __OPENCL_C_VERSION__ < CL_VERSION_2_0
+#ifdef cl_khr_3d_image_writes
+#pragma OPENCL EXTENSION cl_khr_3d_image_writes : enable
+#endif //cl_khr_3d_image_writes
+#endif //__OPENCL_C_VERSION__ < CL_VERSION_2_0
+
 #define __ovld __attribute__((overloadable))
 #define __conv __attribute__((convergent))
 
@@ -6578,777 +6584,85 @@ half16 __ovld __cnfn convert_half16_rtz(double16);
  * OpenCL v1.1/1.2/2.0 s6.2.4.2 - as_type operators
  * Reinterprets a data type as another data type of the same size
  */
-char __ovld __cnfn as_char(char);
-char __ovld __cnfn as_char(uchar);
-
-char2 __ovld __cnfn as_char2(char2);
-char2 __ovld __cnfn as_char2(uchar2);
-char2 __ovld __cnfn as_char2(short);
-char2 __ovld __cnfn as_char2(ushort);
-
-char3 __ovld __cnfn as_char3(char3);
-char3 __ovld __cnfn as_char3(char4);
-char3 __ovld __cnfn as_char3(uchar3);
-char3 __ovld __cnfn as_char3(uchar4);
-char3 __ovld __cnfn as_char3(short2);
-char3 __ovld __cnfn as_char3(ushort2);
-char3 __ovld __cnfn as_char3(int);
-char3 __ovld __cnfn as_char3(uint);
-char3 __ovld __cnfn as_char3(float);
-
-char4 __ovld __cnfn as_char4(char3);
-char4 __ovld __cnfn as_char4(char4);
-char4 __ovld __cnfn as_char4(uchar3);
-char4 __ovld __cnfn as_char4(uchar4);
-char4 __ovld __cnfn as_char4(short2);
-char4 __ovld __cnfn as_char4(ushort2);
-char4 __ovld __cnfn as_char4(int);
-char4 __ovld __cnfn as_char4(uint);
-char4 __ovld __cnfn as_char4(float);
-
-char8 __ovld __cnfn as_char8(char8);
-char8 __ovld __cnfn as_char8(uchar8);
-char8 __ovld __cnfn as_char8(short3);
-char8 __ovld __cnfn as_char8(short4);
-char8 __ovld __cnfn as_char8(ushort3);
-char8 __ovld __cnfn as_char8(ushort4);
-char8 __ovld __cnfn as_char8(int2);
-char8 __ovld __cnfn as_char8(uint2);
-char8 __ovld __cnfn as_char8(long);
-char8 __ovld __cnfn as_char8(ulong);
-char8 __ovld __cnfn as_char8(float2);
-
-char16 __ovld __cnfn as_char16(char16);
-char16 __ovld __cnfn as_char16(uchar16);
-char16 __ovld __cnfn as_char16(short8);
-char16 __ovld __cnfn as_char16(ushort8);
-char16 __ovld __cnfn as_char16(int3);
-char16 __ovld __cnfn as_char16(int4);
-char16 __ovld __cnfn as_char16(uint3);
-char16 __ovld __cnfn as_char16(uint4);
-char16 __ovld __cnfn as_char16(long2);
-char16 __ovld __cnfn as_char16(ulong2);
-char16 __ovld __cnfn as_char16(float3);
-char16 __ovld __cnfn as_char16(float4);
-
-uchar __ovld __cnfn as_uchar(char);
-uchar __ovld __cnfn as_uchar(uchar);
-
-uchar2 __ovld __cnfn as_uchar2(char2);
-uchar2 __ovld __cnfn as_uchar2(uchar2);
-uchar2 __ovld __cnfn as_uchar2(short);
-uchar2 __ovld __cnfn as_uchar2(ushort);
-
-uchar3 __ovld __cnfn as_uchar3(char3);
-uchar3 __ovld __cnfn as_uchar3(char4);
-uchar3 __ovld __cnfn as_uchar3(uchar3);
-uchar3 __ovld __cnfn as_uchar3(uchar4);
-uchar3 __ovld __cnfn as_uchar3(short2);
-uchar3 __ovld __cnfn as_uchar3(ushort2);
-uchar3 __ovld __cnfn as_uchar3(int);
-uchar3 __ovld __cnfn as_uchar3(uint);
-uchar3 __ovld __cnfn as_uchar3(float);
-
-uchar4 __ovld __cnfn as_uchar4(char3);
-uchar4 __ovld __cnfn as_uchar4(char4);
-uchar4 __ovld __cnfn as_uchar4(uchar3);
-uchar4 __ovld __cnfn as_uchar4(uchar4);
-uchar4 __ovld __cnfn as_uchar4(short2);
-uchar4 __ovld __cnfn as_uchar4(ushort2);
-uchar4 __ovld __cnfn as_uchar4(int);
-uchar4 __ovld __cnfn as_uchar4(uint);
-uchar4 __ovld __cnfn as_uchar4(float);
-
-uchar8 __ovld __cnfn as_uchar8(char8);
-uchar8 __ovld __cnfn as_uchar8(uchar8);
-uchar8 __ovld __cnfn as_uchar8(short3);
-uchar8 __ovld __cnfn as_uchar8(short4);
-uchar8 __ovld __cnfn as_uchar8(ushort3);
-uchar8 __ovld __cnfn as_uchar8(ushort4);
-uchar8 __ovld __cnfn as_uchar8(int2);
-uchar8 __ovld __cnfn as_uchar8(uint2);
-uchar8 __ovld __cnfn as_uchar8(long);
-uchar8 __ovld __cnfn as_uchar8(ulong);
-uchar8 __ovld __cnfn as_uchar8(float2);
-
-uchar16 __ovld __cnfn as_uchar16(char16);
-uchar16 __ovld __cnfn as_uchar16(uchar16);
-uchar16 __ovld __cnfn as_uchar16(short8);
-uchar16 __ovld __cnfn as_uchar16(ushort8);
-uchar16 __ovld __cnfn as_uchar16(int3);
-uchar16 __ovld __cnfn as_uchar16(int4);
-uchar16 __ovld __cnfn as_uchar16(uint3);
-uchar16 __ovld __cnfn as_uchar16(uint4);
-uchar16 __ovld __cnfn as_uchar16(long2);
-uchar16 __ovld __cnfn as_uchar16(ulong2);
-uchar16 __ovld __cnfn as_uchar16(float3);
-uchar16 __ovld __cnfn as_uchar16(float4);
-
-short __ovld __cnfn as_short(char2);
-short __ovld __cnfn as_short(uchar2);
-short __ovld __cnfn as_short(short);
-short __ovld __cnfn as_short(ushort);
-
-short2 __ovld __cnfn as_short2(char3);
-short2 __ovld __cnfn as_short2(char4);
-short2 __ovld __cnfn as_short2(uchar3);
-short2 __ovld __cnfn as_short2(uchar4);
-short2 __ovld __cnfn as_short2(short2);
-short2 __ovld __cnfn as_short2(ushort2);
-short2 __ovld __cnfn as_short2(int);
-short2 __ovld __cnfn as_short2(uint);
-short2 __ovld __cnfn as_short2(float);
-
-short3 __ovld __cnfn as_short3(char8);
-short3 __ovld __cnfn as_short3(uchar8);
-short3 __ovld __cnfn as_short3(short3);
-short3 __ovld __cnfn as_short3(short4);
-short3 __ovld __cnfn as_short3(ushort3);
-short3 __ovld __cnfn as_short3(ushort4);
-short3 __ovld __cnfn as_short3(int2);
-short3 __ovld __cnfn as_short3(uint2);
-short3 __ovld __cnfn as_short3(long);
-short3 __ovld __cnfn as_short3(ulong);
-short3 __ovld __cnfn as_short3(float2);
-
-short4 __ovld __cnfn as_short4(char8);
-short4 __ovld __cnfn as_short4(uchar8);
-short4 __ovld __cnfn as_short4(short3);
-short4 __ovld __cnfn as_short4(short4);
-short4 __ovld __cnfn as_short4(ushort3);
-short4 __ovld __cnfn as_short4(ushort4);
-short4 __ovld __cnfn as_short4(int2);
-short4 __ovld __cnfn as_short4(uint2);
-short4 __ovld __cnfn as_short4(long);
-short4 __ovld __cnfn as_short4(ulong);
-short4 __ovld __cnfn as_short4(float2);
-
-short8 __ovld __cnfn as_short8(char16);
-short8 __ovld __cnfn as_short8(uchar16);
-short8 __ovld __cnfn as_short8(short8);
-short8 __ovld __cnfn as_short8(ushort8);
-short8 __ovld __cnfn as_short8(int3);
-short8 __ovld __cnfn as_short8(int4);
-short8 __ovld __cnfn as_short8(uint3);
-short8 __ovld __cnfn as_short8(uint4);
-short8 __ovld __cnfn as_short8(long2);
-short8 __ovld __cnfn as_short8(ulong2);
-short8 __ovld __cnfn as_short8(float3);
-short8 __ovld __cnfn as_short8(float4);
-
-short16 __ovld __cnfn as_short16(short16);
-short16 __ovld __cnfn as_short16(ushort16);
-short16 __ovld __cnfn as_short16(int8);
-short16 __ovld __cnfn as_short16(uint8);
-short16 __ovld __cnfn as_short16(long3);
-short16 __ovld __cnfn as_short16(long4);
-short16 __ovld __cnfn as_short16(ulong3);
-short16 __ovld __cnfn as_short16(ulong4);
-short16 __ovld __cnfn as_short16(float8);
-
-ushort __ovld __cnfn as_ushort(char2);
-ushort __ovld __cnfn as_ushort(uchar2);
-ushort __ovld __cnfn as_ushort(short);
-ushort __ovld __cnfn as_ushort(ushort);
-
-ushort2 __ovld __cnfn as_ushort2(char3);
-ushort2 __ovld __cnfn as_ushort2(char4);
-ushort2 __ovld __cnfn as_ushort2(uchar3);
-ushort2 __ovld __cnfn as_ushort2(uchar4);
-ushort2 __ovld __cnfn as_ushort2(short2);
-ushort2 __ovld __cnfn as_ushort2(ushort2);
-ushort2 __ovld __cnfn as_ushort2(int);
-ushort2 __ovld __cnfn as_ushort2(uint);
-ushort2 __ovld __cnfn as_ushort2(float);
-
-ushort3 __ovld __cnfn as_ushort3(char8);
-ushort3 __ovld __cnfn as_ushort3(uchar8);
-ushort3 __ovld __cnfn as_ushort3(short3);
-ushort3 __ovld __cnfn as_ushort3(short4);
-ushort3 __ovld __cnfn as_ushort3(ushort3);
-ushort3 __ovld __cnfn as_ushort3(ushort4);
-ushort3 __ovld __cnfn as_ushort3(int2);
-ushort3 __ovld __cnfn as_ushort3(uint2);
-ushort3 __ovld __cnfn as_ushort3(long);
-ushort3 __ovld __cnfn as_ushort3(ulong);
-ushort3 __ovld __cnfn as_ushort3(float2);
-
-ushort4 __ovld __cnfn as_ushort4(char8);
-ushort4 __ovld __cnfn as_ushort4(uchar8);
-ushort4 __ovld __cnfn as_ushort4(short3);
-ushort4 __ovld __cnfn as_ushort4(short4);
-ushort4 __ovld __cnfn as_ushort4(ushort3);
-ushort4 __ovld __cnfn as_ushort4(ushort4);
-ushort4 __ovld __cnfn as_ushort4(int2);
-ushort4 __ovld __cnfn as_ushort4(uint2);
-ushort4 __ovld __cnfn as_ushort4(long);
-ushort4 __ovld __cnfn as_ushort4(ulong);
-ushort4 __ovld __cnfn as_ushort4(float2);
-
-ushort8 __ovld __cnfn as_ushort8(char16);
-ushort8 __ovld __cnfn as_ushort8(uchar16);
-ushort8 __ovld __cnfn as_ushort8(short8);
-ushort8 __ovld __cnfn as_ushort8(ushort8);
-ushort8 __ovld __cnfn as_ushort8(int3);
-ushort8 __ovld __cnfn as_ushort8(int4);
-ushort8 __ovld __cnfn as_ushort8(uint3);
-ushort8 __ovld __cnfn as_ushort8(uint4);
-ushort8 __ovld __cnfn as_ushort8(long2);
-ushort8 __ovld __cnfn as_ushort8(ulong2);
-ushort8 __ovld __cnfn as_ushort8(float3);
-ushort8 __ovld __cnfn as_ushort8(float4);
-
-ushort16 __ovld __cnfn as_ushort16(short16);
-ushort16 __ovld __cnfn as_ushort16(ushort16);
-ushort16 __ovld __cnfn as_ushort16(int8);
-ushort16 __ovld __cnfn as_ushort16(uint8);
-ushort16 __ovld __cnfn as_ushort16(long3);
-ushort16 __ovld __cnfn as_ushort16(long4);
-ushort16 __ovld __cnfn as_ushort16(ulong3);
-ushort16 __ovld __cnfn as_ushort16(ulong4);
-ushort16 __ovld __cnfn as_ushort16(float8);
-
-int __ovld __cnfn as_int(char3);
-int __ovld __cnfn as_int(char4);
-int __ovld __cnfn as_int(uchar3);
-int __ovld __cnfn as_int(uchar4);
-int __ovld __cnfn as_int(short2);
-int __ovld __cnfn as_int(ushort2);
-int __ovld __cnfn as_int(int);
-int __ovld __cnfn as_int(uint);
-int __ovld __cnfn as_int(float);
-
-int2 __ovld __cnfn as_int2(char8);
-int2 __ovld __cnfn as_int2(uchar8);
-int2 __ovld __cnfn as_int2(short3);
-int2 __ovld __cnfn as_int2(short4);
-int2 __ovld __cnfn as_int2(ushort3);
-int2 __ovld __cnfn as_int2(ushort4);
-int2 __ovld __cnfn as_int2(int2);
-int2 __ovld __cnfn as_int2(uint2);
-int2 __ovld __cnfn as_int2(long);
-int2 __ovld __cnfn as_int2(ulong);
-int2 __ovld __cnfn as_int2(float2);
-
-int3 __ovld __cnfn as_int3(char16);
-int3 __ovld __cnfn as_int3(uchar16);
-int3 __ovld __cnfn as_int3(short8);
-int3 __ovld __cnfn as_int3(ushort8);
-int3 __ovld __cnfn as_int3(int3);
-int3 __ovld __cnfn as_int3(int4);
-int3 __ovld __cnfn as_int3(uint3);
-int3 __ovld __cnfn as_int3(uint4);
-int3 __ovld __cnfn as_int3(long2);
-int3 __ovld __cnfn as_int3(ulong2);
-int3 __ovld __cnfn as_int3(float3);
-int3 __ovld __cnfn as_int3(float4);
-
-int4 __ovld __cnfn as_int4(char16);
-int4 __ovld __cnfn as_int4(uchar16);
-int4 __ovld __cnfn as_int4(short8);
-int4 __ovld __cnfn as_int4(ushort8);
-int4 __ovld __cnfn as_int4(int3);
-int4 __ovld __cnfn as_int4(int4);
-int4 __ovld __cnfn as_int4(uint3);
-int4 __ovld __cnfn as_int4(uint4);
-int4 __ovld __cnfn as_int4(long2);
-int4 __ovld __cnfn as_int4(ulong2);
-int4 __ovld __cnfn as_int4(float3);
-int4 __ovld __cnfn as_int4(float4);
-
-int8 __ovld __cnfn as_int8(short16);
-int8 __ovld __cnfn as_int8(ushort16);
-int8 __ovld __cnfn as_int8(int8);
-int8 __ovld __cnfn as_int8(uint8);
-int8 __ovld __cnfn as_int8(long3);
-int8 __ovld __cnfn as_int8(long4);
-int8 __ovld __cnfn as_int8(ulong3);
-int8 __ovld __cnfn as_int8(ulong4);
-int8 __ovld __cnfn as_int8(float8);
-
-int16 __ovld __cnfn as_int16(int16);
-int16 __ovld __cnfn as_int16(uint16);
-int16 __ovld __cnfn as_int16(long8);
-int16 __ovld __cnfn as_int16(ulong8);
-int16 __ovld __cnfn as_int16(float16);
-
-uint __ovld __cnfn as_uint(char3);
-uint __ovld __cnfn as_uint(char4);
-uint __ovld __cnfn as_uint(uchar3);
-uint __ovld __cnfn as_uint(uchar4);
-uint __ovld __cnfn as_uint(short2);
-uint __ovld __cnfn as_uint(ushort2);
-uint __ovld __cnfn as_uint(int);
-uint __ovld __cnfn as_uint(uint);
-uint __ovld __cnfn as_uint(float);
-
-uint2 __ovld __cnfn as_uint2(char8);
-uint2 __ovld __cnfn as_uint2(uchar8);
-uint2 __ovld __cnfn as_uint2(short3);
-uint2 __ovld __cnfn as_uint2(short4);
-uint2 __ovld __cnfn as_uint2(ushort3);
-uint2 __ovld __cnfn as_uint2(ushort4);
-uint2 __ovld __cnfn as_uint2(int2);
-uint2 __ovld __cnfn as_uint2(uint2);
-uint2 __ovld __cnfn as_uint2(long);
-uint2 __ovld __cnfn as_uint2(ulong);
-uint2 __ovld __cnfn as_uint2(float2);
-
-uint3 __ovld __cnfn as_uint3(char16);
-uint3 __ovld __cnfn as_uint3(uchar16);
-uint3 __ovld __cnfn as_uint3(short8);
-uint3 __ovld __cnfn as_uint3(ushort8);
-uint3 __ovld __cnfn as_uint3(int3);
-uint3 __ovld __cnfn as_uint3(int4);
-uint3 __ovld __cnfn as_uint3(uint3);
-uint3 __ovld __cnfn as_uint3(uint4);
-uint3 __ovld __cnfn as_uint3(long2);
-uint3 __ovld __cnfn as_uint3(ulong2);
-uint3 __ovld __cnfn as_uint3(float3);
-uint3 __ovld __cnfn as_uint3(float4);
-
-uint4 __ovld __cnfn as_uint4(char16);
-uint4 __ovld __cnfn as_uint4(uchar16);
-uint4 __ovld __cnfn as_uint4(short8);
-uint4 __ovld __cnfn as_uint4(ushort8);
-uint4 __ovld __cnfn as_uint4(int3);
-uint4 __ovld __cnfn as_uint4(int4);
-uint4 __ovld __cnfn as_uint4(uint3);
-uint4 __ovld __cnfn as_uint4(uint4);
-uint4 __ovld __cnfn as_uint4(long2);
-uint4 __ovld __cnfn as_uint4(ulong2);
-uint4 __ovld __cnfn as_uint4(float3);
-uint4 __ovld __cnfn as_uint4(float4);
-
-uint8 __ovld __cnfn as_uint8(short16);
-uint8 __ovld __cnfn as_uint8(ushort16);
-uint8 __ovld __cnfn as_uint8(int8);
-uint8 __ovld __cnfn as_uint8(uint8);
-uint8 __ovld __cnfn as_uint8(long3);
-uint8 __ovld __cnfn as_uint8(long4);
-uint8 __ovld __cnfn as_uint8(ulong3);
-uint8 __ovld __cnfn as_uint8(ulong4);
-uint8 __ovld __cnfn as_uint8(float8);
-
-uint16 __ovld __cnfn as_uint16(int16);
-uint16 __ovld __cnfn as_uint16(uint16);
-uint16 __ovld __cnfn as_uint16(long8);
-uint16 __ovld __cnfn as_uint16(ulong8);
-uint16 __ovld __cnfn as_uint16(float16);
-
-long __ovld __cnfn as_long(char8);
-long __ovld __cnfn as_long(uchar8);
-long __ovld __cnfn as_long(short3);
-long __ovld __cnfn as_long(short4);
-long __ovld __cnfn as_long(ushort3);
-long __ovld __cnfn as_long(ushort4);
-long __ovld __cnfn as_long(int2);
-long __ovld __cnfn as_long(uint2);
-long __ovld __cnfn as_long(long);
-long __ovld __cnfn as_long(ulong);
-long __ovld __cnfn as_long(float2);
-
-long2 __ovld __cnfn as_long2(char16);
-long2 __ovld __cnfn as_long2(uchar16);
-long2 __ovld __cnfn as_long2(short8);
-long2 __ovld __cnfn as_long2(ushort8);
-long2 __ovld __cnfn as_long2(int3);
-long2 __ovld __cnfn as_long2(int4);
-long2 __ovld __cnfn as_long2(uint3);
-long2 __ovld __cnfn as_long2(uint4);
-long2 __ovld __cnfn as_long2(long2);
-long2 __ovld __cnfn as_long2(ulong2);
-long2 __ovld __cnfn as_long2(float3);
-long2 __ovld __cnfn as_long2(float4);
-
-long3 __ovld __cnfn as_long3(short16);
-long3 __ovld __cnfn as_long3(ushort16);
-long3 __ovld __cnfn as_long3(int8);
-long3 __ovld __cnfn as_long3(uint8);
-long3 __ovld __cnfn as_long3(long3);
-long3 __ovld __cnfn as_long3(long4);
-long3 __ovld __cnfn as_long3(ulong3);
-long3 __ovld __cnfn as_long3(ulong4);
-long3 __ovld __cnfn as_long3(float8);
-
-long4 __ovld __cnfn as_long4(short16);
-long4 __ovld __cnfn as_long4(ushort16);
-long4 __ovld __cnfn as_long4(int8);
-long4 __ovld __cnfn as_long4(uint8);
-long4 __ovld __cnfn as_long4(long3);
-long4 __ovld __cnfn as_long4(long4);
-long4 __ovld __cnfn as_long4(ulong3);
-long4 __ovld __cnfn as_long4(ulong4);
-long4 __ovld __cnfn as_long4(float8);
-
-long8 __ovld __cnfn as_long8(int16);
-long8 __ovld __cnfn as_long8(uint16);
-long8 __ovld __cnfn as_long8(long8);
-long8 __ovld __cnfn as_long8(ulong8);
-long8 __ovld __cnfn as_long8(float16);
-
-long16 __ovld __cnfn as_long16(long16);
-long16 __ovld __cnfn as_long16(ulong16);
-
-ulong __ovld __cnfn as_ulong(char8);
-ulong __ovld __cnfn as_ulong(uchar8);
-ulong __ovld __cnfn as_ulong(short3);
-ulong __ovld __cnfn as_ulong(short4);
-ulong __ovld __cnfn as_ulong(ushort3);
-ulong __ovld __cnfn as_ulong(ushort4);
-ulong __ovld __cnfn as_ulong(int2);
-ulong __ovld __cnfn as_ulong(uint2);
-ulong __ovld __cnfn as_ulong(long);
-ulong __ovld __cnfn as_ulong(ulong);
-ulong __ovld __cnfn as_ulong(float2);
-
-ulong2 __ovld __cnfn as_ulong2(char16);
-ulong2 __ovld __cnfn as_ulong2(uchar16);
-ulong2 __ovld __cnfn as_ulong2(short8);
-ulong2 __ovld __cnfn as_ulong2(ushort8);
-ulong2 __ovld __cnfn as_ulong2(int3);
-ulong2 __ovld __cnfn as_ulong2(int4);
-ulong2 __ovld __cnfn as_ulong2(uint3);
-ulong2 __ovld __cnfn as_ulong2(uint4);
-ulong2 __ovld __cnfn as_ulong2(long2);
-ulong2 __ovld __cnfn as_ulong2(ulong2);
-ulong2 __ovld __cnfn as_ulong2(float3);
-ulong2 __ovld __cnfn as_ulong2(float4);
-
-ulong3 __ovld __cnfn as_ulong3(short16);
-ulong3 __ovld __cnfn as_ulong3(ushort16);
-ulong3 __ovld __cnfn as_ulong3(int8);
-ulong3 __ovld __cnfn as_ulong3(uint8);
-ulong3 __ovld __cnfn as_ulong3(long3);
-ulong3 __ovld __cnfn as_ulong3(long4);
-ulong3 __ovld __cnfn as_ulong3(ulong3);
-ulong3 __ovld __cnfn as_ulong3(ulong4);
-ulong3 __ovld __cnfn as_ulong3(float8);
-
-ulong4 __ovld __cnfn as_ulong4(short16);
-ulong4 __ovld __cnfn as_ulong4(ushort16);
-ulong4 __ovld __cnfn as_ulong4(int8);
-ulong4 __ovld __cnfn as_ulong4(uint8);
-ulong4 __ovld __cnfn as_ulong4(long3);
-ulong4 __ovld __cnfn as_ulong4(long4);
-ulong4 __ovld __cnfn as_ulong4(ulong3);
-ulong4 __ovld __cnfn as_ulong4(ulong4);
-ulong4 __ovld __cnfn as_ulong4(float8);
-
-ulong8 __ovld __cnfn as_ulong8(int16);
-ulong8 __ovld __cnfn as_ulong8(uint16);
-ulong8 __ovld __cnfn as_ulong8(long8);
-ulong8 __ovld __cnfn as_ulong8(ulong8);
-ulong8 __ovld __cnfn as_ulong8(float16);
-
-ulong16 __ovld __cnfn as_ulong16(long16);
-ulong16 __ovld __cnfn as_ulong16(ulong16);
-
-float __ovld __cnfn as_float(char3);
-float __ovld __cnfn as_float(char4);
-float __ovld __cnfn as_float(uchar3);
-float __ovld __cnfn as_float(uchar4);
-float __ovld __cnfn as_float(short2);
-float __ovld __cnfn as_float(ushort2);
-float __ovld __cnfn as_float(int);
-float __ovld __cnfn as_float(uint);
-float __ovld __cnfn as_float(float);
-
-float2 __ovld __cnfn as_float2(char8);
-float2 __ovld __cnfn as_float2(uchar8);
-float2 __ovld __cnfn as_float2(short3);
-float2 __ovld __cnfn as_float2(short4);
-float2 __ovld __cnfn as_float2(ushort3);
-float2 __ovld __cnfn as_float2(ushort4);
-float2 __ovld __cnfn as_float2(int2);
-float2 __ovld __cnfn as_float2(uint2);
-float2 __ovld __cnfn as_float2(long);
-float2 __ovld __cnfn as_float2(ulong);
-float2 __ovld __cnfn as_float2(float2);
-
-float3 __ovld __cnfn as_float3(char16);
-float3 __ovld __cnfn as_float3(uchar16);
-float3 __ovld __cnfn as_float3(short8);
-float3 __ovld __cnfn as_float3(ushort8);
-float3 __ovld __cnfn as_float3(int3);
-float3 __ovld __cnfn as_float3(int4);
-float3 __ovld __cnfn as_float3(uint3);
-float3 __ovld __cnfn as_float3(uint4);
-float3 __ovld __cnfn as_float3(long2);
-float3 __ovld __cnfn as_float3(ulong2);
-float3 __ovld __cnfn as_float3(float3);
-float3 __ovld __cnfn as_float3(float4);
-
-float4 __ovld __cnfn as_float4(char16);
-float4 __ovld __cnfn as_float4(uchar16);
-float4 __ovld __cnfn as_float4(short8);
-float4 __ovld __cnfn as_float4(ushort8);
-float4 __ovld __cnfn as_float4(int3);
-float4 __ovld __cnfn as_float4(int4);
-float4 __ovld __cnfn as_float4(uint3);
-float4 __ovld __cnfn as_float4(uint4);
-float4 __ovld __cnfn as_float4(long2);
-float4 __ovld __cnfn as_float4(ulong2);
-float4 __ovld __cnfn as_float4(float3);
-float4 __ovld __cnfn as_float4(float4);
-
-float8 __ovld __cnfn as_float8(short16);
-float8 __ovld __cnfn as_float8(ushort16);
-float8 __ovld __cnfn as_float8(int8);
-float8 __ovld __cnfn as_float8(uint8);
-float8 __ovld __cnfn as_float8(long3);
-float8 __ovld __cnfn as_float8(long4);
-float8 __ovld __cnfn as_float8(ulong3);
-float8 __ovld __cnfn as_float8(ulong4);
-float8 __ovld __cnfn as_float8(float8);
-
-float16 __ovld __cnfn as_float16(int16);
-float16 __ovld __cnfn as_float16(uint16);
-float16 __ovld __cnfn as_float16(long8);
-float16 __ovld __cnfn as_float16(ulong8);
-float16 __ovld __cnfn as_float16(float16);
+#define as_char(x) __builtin_astype((x),   char)
+#define as_char2(x) __builtin_astype((x),  char2)
+#define as_char3(x) __builtin_astype((x),  char3)
+#define as_char4(x) __builtin_astype((x),  char4)
+#define as_char8(x) __builtin_astype((x),  char8)
+#define as_char16(x) __builtin_astype((x), char16)
+
+#define as_uchar(x) __builtin_astype((x),   uchar)
+#define as_uchar2(x) __builtin_astype((x),  uchar2)
+#define as_uchar3(x) __builtin_astype((x),  uchar3)
+#define as_uchar4(x) __builtin_astype((x),  uchar4)
+#define as_uchar8(x) __builtin_astype((x),  uchar8)
+#define as_uchar16(x) __builtin_astype((x), uchar16)
+
+#define as_short(x) __builtin_astype((x),   short)
+#define as_short2(x) __builtin_astype((x),  short2)
+#define as_short3(x) __builtin_astype((x),  short3)
+#define as_short4(x) __builtin_astype((x),  short4)
+#define as_short8(x) __builtin_astype((x),  short8)
+#define as_short16(x) __builtin_astype((x), short16)
+
+#define as_ushort(x) __builtin_astype((x),   ushort)
+#define as_ushort2(x) __builtin_astype((x),  ushort2)
+#define as_ushort3(x) __builtin_astype((x),  ushort3)
+#define as_ushort4(x) __builtin_astype((x),  ushort4)
+#define as_ushort8(x) __builtin_astype((x),  ushort8)
+#define as_ushort16(x) __builtin_astype((x), ushort16)
+
+#define as_int(x) __builtin_astype((x),   int)
+#define as_int2(x) __builtin_astype((x),  int2)
+#define as_int3(x) __builtin_astype((x),  int3)
+#define as_int4(x) __builtin_astype((x),  int4)
+#define as_int8(x) __builtin_astype((x),  int8)
+#define as_int16(x) __builtin_astype((x), int16)
+
+#define as_uint(x) __builtin_astype((x),   uint)
+#define as_uint2(x) __builtin_astype((x),  uint2)
+#define as_uint3(x) __builtin_astype((x),  uint3)
+#define as_uint4(x) __builtin_astype((x),  uint4)
+#define as_uint8(x) __builtin_astype((x),  uint8)
+#define as_uint16(x) __builtin_astype((x), uint16)
+
+#define as_long(x) __builtin_astype((x),   long)
+#define as_long2(x) __builtin_astype((x),  long2)
+#define as_long3(x) __builtin_astype((x),  long3)
+#define as_long4(x) __builtin_astype((x),  long4)
+#define as_long8(x) __builtin_astype((x),  long8)
+#define as_long16(x) __builtin_astype((x), long16)
+
+#define as_ulong(x) __builtin_astype((x),   ulong)
+#define as_ulong2(x) __builtin_astype((x),  ulong2)
+#define as_ulong3(x) __builtin_astype((x),  ulong3)
+#define as_ulong4(x) __builtin_astype((x),  ulong4)
+#define as_ulong8(x) __builtin_astype((x),  ulong8)
+#define as_ulong16(x) __builtin_astype((x), ulong16)
+
+#define as_float(x) __builtin_astype((x),   float)
+#define as_float2(x) __builtin_astype((x),  float2)
+#define as_float3(x) __builtin_astype((x),  float3)
+#define as_float4(x) __builtin_astype((x),  float4)
+#define as_float8(x) __builtin_astype((x),  float8)
+#define as_float16(x) __builtin_astype((x), float16)
 
 #ifdef cl_khr_fp64
-char8 __ovld __cnfn as_char8(double);
-char16 __ovld __cnfn as_char16(double2);
-uchar8 __ovld __cnfn as_uchar8(double);
-uchar16 __ovld __cnfn as_uchar16(double2);
-short3 __ovld __cnfn as_short3(double);
-short4 __ovld __cnfn as_short4(double);
-short8 __ovld __cnfn as_short8(double2);
-short16 __ovld __cnfn as_short16(double3);
-short16 __ovld __cnfn as_short16(double4);
-ushort3 __ovld __cnfn as_ushort3(double);
-ushort4 __ovld __cnfn as_ushort4(double);
-ushort8 __ovld __cnfn as_ushort8(double2);
-ushort16 __ovld __cnfn as_ushort16(double3);
-ushort16 __ovld __cnfn as_ushort16(double4);
-int2 __ovld __cnfn as_int2(double);
-int3 __ovld __cnfn as_int3(double2);
-int4 __ovld __cnfn as_int4(double2);
-int8 __ovld __cnfn as_int8(double3);
-int8 __ovld __cnfn as_int8(double4);
-int16 __ovld __cnfn as_int16(double8);
-uint2 __ovld __cnfn as_uint2(double);
-uint3 __ovld __cnfn as_uint3(double2);
-uint4 __ovld __cnfn as_uint4(double2);
-uint8 __ovld __cnfn as_uint8(double3);
-uint8 __ovld __cnfn as_uint8(double4);
-uint16 __ovld __cnfn as_uint16(double8);
-long __ovld __cnfn as_long(double);
-long2 __ovld __cnfn as_long2(double2);
-long3 __ovld __cnfn as_long3(double3);
-long3 __ovld __cnfn as_long3(double4);
-long4 __ovld __cnfn as_long4(double3);
-long4 __ovld __cnfn as_long4(double4);
-long8 __ovld __cnfn as_long8(double8);
-long16 __ovld __cnfn as_long16(double16);
-ulong __ovld __cnfn as_ulong(double);
-ulong2 __ovld __cnfn as_ulong2(double2);
-ulong3 __ovld __cnfn as_ulong3(double3);
-ulong3 __ovld __cnfn as_ulong3(double4);
-ulong4 __ovld __cnfn as_ulong4(double3);
-ulong4 __ovld __cnfn as_ulong4(double4);
-ulong8 __ovld __cnfn as_ulong8(double8);
-ulong16 __ovld __cnfn as_ulong16(double16);
-float2 __ovld __cnfn as_float2(double);
-float3 __ovld __cnfn as_float3(double2);
-float4 __ovld __cnfn as_float4(double2);
-float8 __ovld __cnfn as_float8(double3);
-float8 __ovld __cnfn as_float8(double4);
-float16 __ovld __cnfn as_float16(double8);
-double __ovld __cnfn as_double(char8);
-double __ovld __cnfn as_double(uchar8);
-double __ovld __cnfn as_double(short3);
-double __ovld __cnfn as_double(short4);
-double __ovld __cnfn as_double(ushort3);
-double __ovld __cnfn as_double(ushort4);
-double __ovld __cnfn as_double(int2);
-double __ovld __cnfn as_double(uint2);
-double __ovld __cnfn as_double(long);
-double __ovld __cnfn as_double(ulong);
-double __ovld __cnfn as_double(float2);
-double __ovld __cnfn as_double(double);
-double2 __ovld __cnfn as_double2(char16);
-double2 __ovld __cnfn as_double2(uchar16);
-double2 __ovld __cnfn as_double2(short8);
-double2 __ovld __cnfn as_double2(ushort8);
-double2 __ovld __cnfn as_double2(int3);
-double2 __ovld __cnfn as_double2(int4);
-double2 __ovld __cnfn as_double2(uint3);
-double2 __ovld __cnfn as_double2(uint4);
-double2 __ovld __cnfn as_double2(long2);
-double2 __ovld __cnfn as_double2(ulong2);
-double2 __ovld __cnfn as_double2(float3);
-double2 __ovld __cnfn as_double2(float4);
-double2 __ovld __cnfn as_double2(double2);
-double3 __ovld __cnfn as_double3(short16);
-double3 __ovld __cnfn as_double3(ushort16);
-double3 __ovld __cnfn as_double3(int8);
-double3 __ovld __cnfn as_double3(uint8);
-double3 __ovld __cnfn as_double3(long3);
-double3 __ovld __cnfn as_double3(long4);
-double3 __ovld __cnfn as_double3(ulong3);
-double3 __ovld __cnfn as_double3(ulong4);
-double3 __ovld __cnfn as_double3(float8);
-double3 __ovld __cnfn as_double3(double3);
-double3 __ovld __cnfn as_double3(double4);
-double4 __ovld __cnfn as_double4(short16);
-double4 __ovld __cnfn as_double4(ushort16);
-double4 __ovld __cnfn as_double4(int8);
-double4 __ovld __cnfn as_double4(uint8);
-double4 __ovld __cnfn as_double4(long3);
-double4 __ovld __cnfn as_double4(long4);
-double4 __ovld __cnfn as_double4(ulong3);
-double4 __ovld __cnfn as_double4(ulong4);
-double4 __ovld __cnfn as_double4(float8);
-double4 __ovld __cnfn as_double4(double3);
-double4 __ovld __cnfn as_double4(double4);
-double8 __ovld __cnfn as_double8(int16);
-double8 __ovld __cnfn as_double8(uint16);
-double8 __ovld __cnfn as_double8(long8);
-double8 __ovld __cnfn as_double8(ulong8);
-double8 __ovld __cnfn as_double8(float16);
-double8 __ovld __cnfn as_double8(double8);
-double16 __ovld __cnfn as_double16(long16);
-double16 __ovld __cnfn as_double16(ulong16);
-double16 __ovld __cnfn as_double16(double16);
+#define as_double(x) __builtin_astype((x),   double)
+#define as_double2(x) __builtin_astype((x),  double2)
+#define as_double3(x) __builtin_astype((x),  double3)
+#define as_double4(x) __builtin_astype((x),  double4)
+#define as_double8(x) __builtin_astype((x),  double8)
+#define as_double16(x) __builtin_astype((x), double16)
 #endif //cl_khr_fp64
 
 #ifdef cl_khr_fp16
-char2 __ovld __cnfn as_char2(half);
-char3 __ovld __cnfn as_char3(half2);
-char4 __ovld __cnfn as_char4(half2);
-char8 __ovld __cnfn as_char8(half3);
-char8 __ovld __cnfn as_char8(half4);
-char16 __ovld __cnfn as_char16(half8);
-uchar2 __ovld __cnfn as_uchar2(half);
-uchar3 __ovld __cnfn as_uchar3(half2);
-uchar4 __ovld __cnfn as_uchar4(half2);
-uchar8 __ovld __cnfn as_uchar8(half3);
-uchar8 __ovld __cnfn as_uchar8(half4);
-uchar16 __ovld __cnfn as_uchar16(half8);
-short __ovld __cnfn as_short(half);
-short2 __ovld __cnfn as_short2(half2);
-short3 __ovld __cnfn as_short3(half3);
-short3 __ovld __cnfn as_short3(half4);
-short4 __ovld __cnfn as_short4(half3);
-short4 __ovld __cnfn as_short4(half4);
-short8 __ovld __cnfn as_short8(half8);
-short16 __ovld __cnfn as_short16(half16);
-ushort __ovld __cnfn as_ushort(half);
-ushort2 __ovld __cnfn as_ushort2(half2);
-ushort3 __ovld __cnfn as_ushort3(half3);
-ushort3 __ovld __cnfn as_ushort3(half4);
-ushort4 __ovld __cnfn as_ushort4(half3);
-ushort4 __ovld __cnfn as_ushort4(half4);
-ushort8 __ovld __cnfn as_ushort8(half8);
-ushort16 __ovld __cnfn as_ushort16(half16);
-int __ovld __cnfn as_int(half2);
-int2 __ovld __cnfn as_int2(half3);
-int2 __ovld __cnfn as_int2(half4);
-int3 __ovld __cnfn as_int3(half8);
-int4 __ovld __cnfn as_int4(half8);
-int8 __ovld __cnfn as_int8(half16);
-uint __ovld __cnfn as_uint(half2);
-uint2 __ovld __cnfn as_uint2(half3);
-uint2 __ovld __cnfn as_uint2(half4);
-uint3 __ovld __cnfn as_uint3(half8);
-uint4 __ovld __cnfn as_uint4(half8);
-uint8 __ovld __cnfn as_uint8(half16);
-long __ovld __cnfn as_long(half3);
-long __ovld __cnfn as_long(half4);
-long2 __ovld __cnfn as_long2(half8);
-long3 __ovld __cnfn as_long3(half16);
-long4 __ovld __cnfn as_long4(half16);
-ulong __ovld __cnfn as_ulong(half3);
-ulong __ovld __cnfn as_ulong(half4);
-ulong2 __ovld __cnfn as_ulong2(half8);
-ulong3 __ovld __cnfn as_ulong3(half16);
-ulong4 __ovld __cnfn as_ulong4(half16);
-half __ovld __cnfn as_half(char2);
-half __ovld __cnfn as_half(uchar2);
-half __ovld __cnfn as_half(short);
-half __ovld __cnfn as_half(ushort);
-half __ovld __cnfn as_half(half);
-half2 __ovld __cnfn as_half2(char3);
-half2 __ovld __cnfn as_half2(char4);
-half2 __ovld __cnfn as_half2(uchar3);
-half2 __ovld __cnfn as_half2(uchar4);
-half2 __ovld __cnfn as_half2(short2);
-half2 __ovld __cnfn as_half2(ushort2);
-half2 __ovld __cnfn as_half2(int);
-half2 __ovld __cnfn as_half2(uint);
-half2 __ovld __cnfn as_half2(half2);
-half2 __ovld __cnfn as_half2(float);
-half3 __ovld __cnfn as_half3(char8);
-half3 __ovld __cnfn as_half3(uchar8);
-half3 __ovld __cnfn as_half3(short3);
-half3 __ovld __cnfn as_half3(short4);
-half3 __ovld __cnfn as_half3(ushort3);
-half3 __ovld __cnfn as_half3(ushort4);
-half3 __ovld __cnfn as_half3(int2);
-half3 __ovld __cnfn as_half3(uint2);
-half3 __ovld __cnfn as_half3(long);
-half3 __ovld __cnfn as_half3(ulong);
-half3 __ovld __cnfn as_half3(half3);
-half3 __ovld __cnfn as_half3(half4);
-half3 __ovld __cnfn as_half3(float2);
-half4 __ovld __cnfn as_half4(char8);
-half4 __ovld __cnfn as_half4(uchar8);
-half4 __ovld __cnfn as_half4(short3);
-half4 __ovld __cnfn as_half4(short4);
-half4 __ovld __cnfn as_half4(ushort3);
-half4 __ovld __cnfn as_half4(ushort4);
-half4 __ovld __cnfn as_half4(int2);
-half4 __ovld __cnfn as_half4(uint2);
-half4 __ovld __cnfn as_half4(long);
-half4 __ovld __cnfn as_half4(ulong);
-half4 __ovld __cnfn as_half4(half3);
-half4 __ovld __cnfn as_half4(half4);
-half4 __ovld __cnfn as_half4(float2);
-half8 __ovld __cnfn as_half8(char16);
-half8 __ovld __cnfn as_half8(uchar16);
-half8 __ovld __cnfn as_half8(short8);
-half8 __ovld __cnfn as_half8(ushort8);
-half8 __ovld __cnfn as_half8(int3);
-half8 __ovld __cnfn as_half8(int4);
-half8 __ovld __cnfn as_half8(uint3);
-half8 __ovld __cnfn as_half8(uint4);
-half8 __ovld __cnfn as_half8(long2);
-half8 __ovld __cnfn as_half8(ulong2);
-half8 __ovld __cnfn as_half8(half8);
-half8 __ovld __cnfn as_half8(float3);
-half8 __ovld __cnfn as_half8(float4);
-half16 __ovld __cnfn as_half16(short16);
-half16 __ovld __cnfn as_half16(ushort16);
-half16 __ovld __cnfn as_half16(int8);
-half16 __ovld __cnfn as_half16(uint8);
-half16 __ovld __cnfn as_half16(long3);
-half16 __ovld __cnfn as_half16(long4);
-half16 __ovld __cnfn as_half16(ulong3);
-half16 __ovld __cnfn as_half16(ulong4);
-half16 __ovld __cnfn as_half16(half16);
-half16 __ovld __cnfn as_half16(float8);
-float __ovld __cnfn as_float(half2);
-float2 __ovld __cnfn as_float2(half3);
-float2 __ovld __cnfn as_float2(half4);
-float3 __ovld __cnfn as_float3(half8);
-float4 __ovld __cnfn as_float4(half8);
-float8 __ovld __cnfn as_float8(half16);
-
-#ifdef cl_khr_fp64
-half3 __ovld __cnfn as_half3(double);
-half4 __ovld __cnfn as_half4(double);
-half8 __ovld __cnfn as_half8(double2);
-half16 __ovld __cnfn as_half16(double3);
-half16 __ovld __cnfn as_half16(double4);
-double __ovld __cnfn as_double(half3);
-double __ovld __cnfn as_double(half4);
-double2 __ovld __cnfn as_double2(half8);
-double3 __ovld __cnfn as_double3(half16);
-double4 __ovld __cnfn as_double4(half16);
-#endif //cl_khr_fp64
+#define as_half(x) __builtin_astype((x),   half)
+#define as_half2(x) __builtin_astype((x),  half2)
+#define as_half3(x) __builtin_astype((x),  half3)
+#define as_half4(x) __builtin_astype((x),  half4)
+#define as_half8(x) __builtin_astype((x),  half8)
+#define as_half16(x) __builtin_astype((x), half16)
 #endif //cl_khr_fp16
 
 // OpenCL v1.1 s6.9, v1.2/2.0 s6.10 - Function qualifiers
@@ -14389,10 +13703,10 @@ float __ovld atomic_xchg(volatile __local float *p, float val);
 
 #if defined(cl_khr_global_int32_base_atomics)
 int __ovld atom_xchg(volatile __global int *p, int val);
-int __ovld atom_xchg(volatile __local int *p, int val);
+unsigned int __ovld atom_xchg(volatile __global unsigned int *p, unsigned int val);
 #endif
 #if defined(cl_khr_local_int32_base_atomics)
-unsigned int __ovld atom_xchg(volatile __global unsigned int *p, unsigned int val);
+int __ovld atom_xchg(volatile __local int *p, int val);
 unsigned int __ovld atom_xchg(volatile __local unsigned int *p, unsigned int val);
 #endif
 
@@ -14509,8 +13823,6 @@ unsigned int __ovld atom_min(volatile __local unsigned int *p, unsigned int val)
 #if defined(cl_khr_int64_extended_atomics)
 long __ovld atom_min(volatile __global long *p, long val);
 unsigned long __ovld atom_min(volatile __global unsigned long *p, unsigned long val);
-#endif
-#if defined(cl_khr_local_int32_extended_atomics)
 long __ovld atom_min(volatile __local long *p, long val);
 unsigned long __ovld atom_min(volatile __local unsigned long *p, unsigned long val);
 #endif
@@ -15650,6 +14962,7 @@ float __purefn __ovld read_imagef(read_only image2d_array_msaa_depth_t image, in
 #endif //cl_khr_gl_msaa_sharing
 
 // OpenCL Extension v2.0 s9.18 - Mipmaps
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
 #ifdef cl_khr_mipmap_image
 
 float4 __purefn __ovld read_imagef(read_only image1d_t image, sampler_t sampler, float coord, float lod);
@@ -15725,6 +15038,7 @@ int4 __purefn __ovld read_imagei(read_only image3d_t image, sampler_t sampler, f
 uint4 __purefn __ovld read_imageui(read_only image3d_t image, sampler_t sampler, float4 coord, float lod);
 
 #endif //cl_khr_mipmap_image
+#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
 
 /**
 * Sampler-less Image Access
@@ -15823,6 +15137,7 @@ float __purefn __ovld read_imagef(read_write image2d_msaa_depth_t image, int2 co
 float __purefn __ovld read_imagef(read_write image2d_array_msaa_depth_t image, int4 coord, int sample);
 #endif //cl_khr_gl_msaa_sharing
 
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
 #ifdef cl_khr_mipmap_image
 float4 __purefn __ovld read_imagef(read_write image1d_t image, sampler_t sampler, float coord, float lod);
 int4 __purefn __ovld read_imagei(read_write image1d_t image, sampler_t sampler, float coord, float lod);
@@ -15896,6 +15211,7 @@ float4 __purefn __ovld read_imagef(read_write image3d_t image, sampler_t sampler
 int4 __purefn __ovld read_imagei(read_write image3d_t image, sampler_t sampler, float4 coord, float lod);
 uint4 __purefn __ovld read_imageui(read_write image3d_t image, sampler_t sampler, float4 coord, float lod);
 #endif //cl_khr_mipmap_image
+#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
 
 // Image read functions returning half4 type
 #ifdef cl_khr_fp16
@@ -15995,9 +15311,11 @@ void __ovld write_imagef(write_only image1d_array_t image_array, int2 coord, flo
 void __ovld write_imagei(write_only image1d_array_t image_array, int2 coord, int4 color);
 void __ovld write_imageui(write_only image1d_array_t image_array, int2 coord, uint4 color);
 
+#ifdef cl_khr_3d_image_writes
 void __ovld write_imagef(write_only image3d_t image, int4 coord, float4 color);
 void __ovld write_imagei(write_only image3d_t image, int4 coord, int4 color);
 void __ovld write_imageui(write_only image3d_t image, int4 coord, uint4 color);
+#endif
 
 #ifdef cl_khr_depth_images
 void __ovld write_imagef(write_only image2d_depth_t image, int2 coord, float color);
@@ -16005,6 +15323,7 @@ void __ovld write_imagef(write_only image2d_array_depth_t image, int4 coord, flo
 #endif //cl_khr_depth_images
 
 // OpenCL Extension v2.0 s9.18 - Mipmaps
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
 #ifdef cl_khr_mipmap_image
 void __ovld write_imagef(write_only image1d_t image, int coord, int lod, float4 color);
 void __ovld write_imagei(write_only image1d_t image, int coord, int lod, int4 color);
@@ -16025,16 +15344,21 @@ void __ovld write_imageui(write_only image2d_array_t image_array, int4 coord, in
 void __ovld write_imagef(write_only image2d_depth_t image, int2 coord, int lod, float color);
 void __ovld write_imagef(write_only image2d_array_depth_t image, int4 coord, int lod, float color);
 
+#ifdef cl_khr_3d_image_writes
 void __ovld write_imagef(write_only image3d_t image, int4 coord, int lod, float4 color);
 void __ovld write_imagei(write_only image3d_t image, int4 coord, int lod, int4 color);
 void __ovld write_imageui(write_only image3d_t image, int4 coord, int lod, uint4 color);
+#endif
 #endif //cl_khr_mipmap_image
+#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
 
 // Image write functions for half4 type
 #ifdef cl_khr_fp16
 void __ovld write_imageh(write_only image1d_t image, int coord, half4 color);
 void __ovld write_imageh(write_only image2d_t image, int2 coord, half4 color);
+#ifdef cl_khr_3d_image_writes
 void __ovld write_imageh(write_only image3d_t image, int4 coord, half4 color);
+#endif
 void __ovld write_imageh(write_only image1d_array_t image, int2 coord, half4 color);
 void __ovld write_imageh(write_only image2d_array_t image, int4 coord, half4 color);
 void __ovld write_imageh(write_only image1d_buffer_t image, int coord, half4 color);
@@ -16062,15 +15386,18 @@ void __ovld write_imagef(read_write image1d_array_t image_array, int2 coord, flo
 void __ovld write_imagei(read_write image1d_array_t image_array, int2 coord, int4 color);
 void __ovld write_imageui(read_write image1d_array_t image_array, int2 coord, uint4 color);
 
+#ifdef cl_khr_3d_image_writes
 void __ovld write_imagef(read_write image3d_t image, int4 coord, float4 color);
 void __ovld write_imagei(read_write image3d_t image, int4 coord, int4 color);
 void __ovld write_imageui(read_write image3d_t image, int4 coord, uint4 color);
+#endif
 
 #ifdef cl_khr_depth_images
 void __ovld write_imagef(read_write image2d_depth_t image, int2 coord, float color);
 void __ovld write_imagef(read_write image2d_array_depth_t image, int4 coord, float color);
 #endif //cl_khr_depth_images
 
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
 #ifdef cl_khr_mipmap_image
 void __ovld write_imagef(read_write image1d_t image, int coord, int lod, float4 color);
 void __ovld write_imagei(read_write image1d_t image, int coord, int lod, int4 color);
@@ -16091,16 +15418,21 @@ void __ovld write_imageui(read_write image2d_array_t image_array, int4 coord, in
 void __ovld write_imagef(read_write image2d_depth_t image, int2 coord, int lod, float color);
 void __ovld write_imagef(read_write image2d_array_depth_t image, int4 coord, int lod, float color);
 
+#ifdef cl_khr_3d_image_writes
 void __ovld write_imagef(read_write image3d_t image, int4 coord, int lod, float4 color);
 void __ovld write_imagei(read_write image3d_t image, int4 coord, int lod, int4 color);
 void __ovld write_imageui(read_write image3d_t image, int4 coord, int lod, uint4 color);
+#endif
 #endif //cl_khr_mipmap_image
+#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
 
 // Image write functions for half4 type
 #ifdef cl_khr_fp16
 void __ovld write_imageh(read_write image1d_t image, int coord, half4 color);
 void __ovld write_imageh(read_write image2d_t image, int2 coord, half4 color);
+#ifdef cl_khr_3d_image_writes
 void __ovld write_imageh(read_write image3d_t image, int4 coord, half4 color);
+#endif
 void __ovld write_imageh(read_write image1d_array_t image, int2 coord, half4 color);
 void __ovld write_imageh(read_write image2d_array_t image, int4 coord, half4 color);
 void __ovld write_imageh(read_write image1d_buffer_t image, int coord, half4 color);
@@ -16118,7 +15450,9 @@ void __ovld write_imageh(read_write image1d_buffer_t image, int coord, half4 col
 int __ovld __cnfn get_image_width(read_only image1d_t image);
 int __ovld __cnfn get_image_width(read_only image1d_buffer_t image);
 int __ovld __cnfn get_image_width(read_only image2d_t image);
+#ifdef cl_khr_3d_image_writes
 int __ovld __cnfn get_image_width(read_only image3d_t image);
+#endif
 int __ovld __cnfn get_image_width(read_only image1d_array_t image);
 int __ovld __cnfn get_image_width(read_only image2d_array_t image);
 #ifdef cl_khr_depth_images
@@ -16135,7 +15469,9 @@ int __ovld __cnfn get_image_width(read_only image2d_array_msaa_depth_t image);
 int __ovld __cnfn get_image_width(write_only image1d_t image);
 int __ovld __cnfn get_image_width(write_only image1d_buffer_t image);
 int __ovld __cnfn get_image_width(write_only image2d_t image);
+#ifdef cl_khr_3d_image_writes
 int __ovld __cnfn get_image_width(write_only image3d_t image);
+#endif
 int __ovld __cnfn get_image_width(write_only image1d_array_t image);
 int __ovld __cnfn get_image_width(write_only image2d_array_t image);
 #ifdef cl_khr_depth_images
@@ -16186,7 +15522,9 @@ int __ovld __cnfn get_image_height(read_only image2d_array_msaa_depth_t image);
 #endif //cl_khr_gl_msaa_sharing
 
 int __ovld __cnfn get_image_height(write_only image2d_t image);
+#ifdef cl_khr_3d_image_writes
 int __ovld __cnfn get_image_height(write_only image3d_t image);
+#endif
 int __ovld __cnfn get_image_height(write_only image2d_array_t image);
 #ifdef cl_khr_depth_images
 int __ovld __cnfn get_image_height(write_only image2d_depth_t image);
@@ -16220,13 +15558,16 @@ int __ovld __cnfn get_image_height(read_write image2d_array_msaa_depth_t image);
  */
 int __ovld __cnfn get_image_depth(read_only image3d_t image);
 
+#ifdef cl_khr_3d_image_writes
 int __ovld __cnfn get_image_depth(write_only image3d_t image);
+#endif
 
 #if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
 int __ovld __cnfn get_image_depth(read_write image3d_t image);
 #endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
 
 // OpenCL Extension v2.0 s9.18 - Mipmaps
+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
 #ifdef cl_khr_mipmap_image
 /**
  * Return the image miplevels.
@@ -16238,13 +15579,13 @@ int __ovld get_image_num_mip_levels(read_only image3d_t image);
 
 int __ovld get_image_num_mip_levels(write_only image1d_t image);
 int __ovld get_image_num_mip_levels(write_only image2d_t image);
+#ifdef cl_khr_3d_image_writes
 int __ovld get_image_num_mip_levels(write_only image3d_t image);
+#endif
 
-#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
 int __ovld get_image_num_mip_levels(read_write image1d_t image);
 int __ovld get_image_num_mip_levels(read_write image2d_t image);
 int __ovld get_image_num_mip_levels(read_write image3d_t image);
-#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
 
 int __ovld get_image_num_mip_levels(read_only image1d_array_t image);
 int __ovld get_image_num_mip_levels(read_only image2d_array_t image);
@@ -16256,14 +15597,13 @@ int __ovld get_image_num_mip_levels(write_only image2d_array_t image);
 int __ovld get_image_num_mip_levels(write_only image2d_array_depth_t image);
 int __ovld get_image_num_mip_levels(write_only image2d_depth_t image);
 
-#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
 int __ovld get_image_num_mip_levels(read_write image1d_array_t image);
 int __ovld get_image_num_mip_levels(read_write image2d_array_t image);
 int __ovld get_image_num_mip_levels(read_write image2d_array_depth_t image);
 int __ovld get_image_num_mip_levels(read_write image2d_depth_t image);
-#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
 
 #endif //cl_khr_mipmap_image
+#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
 
 /**
  * Return the channel data type. Valid values are:
@@ -16324,7 +15664,9 @@ int __ovld __cnfn get_image_channel_data_type(read_only image2d_array_msaa_depth
 int __ovld __cnfn get_image_channel_data_type(write_only image1d_t image);
 int __ovld __cnfn get_image_channel_data_type(write_only image1d_buffer_t image);
 int __ovld __cnfn get_image_channel_data_type(write_only image2d_t image);
+#ifdef cl_khr_3d_image_writes
 int __ovld __cnfn get_image_channel_data_type(write_only image3d_t image);
+#endif
 int __ovld __cnfn get_image_channel_data_type(write_only image1d_array_t image);
 int __ovld __cnfn get_image_channel_data_type(write_only image2d_array_t image);
 #ifdef cl_khr_depth_images
@@ -16418,7 +15760,9 @@ int __ovld __cnfn get_image_channel_order(read_only image2d_array_msaa_depth_t i
 int __ovld __cnfn get_image_channel_order(write_only image1d_t image);
 int __ovld __cnfn get_image_channel_order(write_only image1d_buffer_t image);
 int __ovld __cnfn get_image_channel_order(write_only image2d_t image);
+#ifdef cl_khr_3d_image_writes
 int __ovld __cnfn get_image_channel_order(write_only image3d_t image);
+#endif
 int __ovld __cnfn get_image_channel_order(write_only image1d_array_t image);
 int __ovld __cnfn get_image_channel_order(write_only image2d_array_t image);
 #ifdef cl_khr_depth_images
@@ -16504,7 +15848,9 @@ int2 __ovld __cnfn get_image_dim(read_write image2d_array_msaa_depth_t image);
  * component and the w component is 0.
  */
 int4 __ovld __cnfn get_image_dim(read_only image3d_t image);
+#ifdef cl_khr_3d_image_writes
 int4 __ovld __cnfn get_image_dim(write_only image3d_t image);
+#endif
 #if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
 int4 __ovld __cnfn get_image_dim(read_write image3d_t image);
 #endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
@@ -16714,16 +16060,12 @@ typedef int clk_profiling_info;
 
 #define MAX_WORK_DIM        3
 
-// ToDo: Remove definition of ndrange_t in Clang as an opaque type and add back
-// the following ndrange_t definition.
-#if 0
 typedef struct {
     unsigned int workDimension;
     size_t globalWorkOffset[MAX_WORK_DIM];
     size_t globalWorkSize[MAX_WORK_DIM];
     size_t localWorkSize[MAX_WORK_DIM];
 } ndrange_t;
-#endif
 
 ndrange_t __ovld ndrange_1D(size_t);
 ndrange_t __ovld ndrange_1D(size_t, size_t);

@@ -31,9 +31,11 @@
   __attribute__((__always_inline__, __nodebug__, __target__("sse3")))
 
 /// \brief Loads data from an unaligned memory location to elements in a 128-bit
-///    vector. If the address of the data is not 16-byte aligned, the
-///    instruction may read two adjacent aligned blocks of memory to retrieve
-///    the requested data.
+///    vector.
+///
+///    If the address of the data is not 16-byte aligned, the instruction may
+///    read two adjacent aligned blocks of memory to retrieve the requested
+///    data.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -115,7 +117,7 @@ _mm_hsub_ps(__m128 __a, __m128 __b)
 
 /// \brief Moves and duplicates high-order (odd-indexed) values from a 128-bit
 ///    vector of [4 x float] to float values stored in a 128-bit vector of
-///    [4 x float]. 
+///    [4 x float].
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -136,7 +138,7 @@ _mm_movehdup_ps(__m128 __a)
 }
 
 /// \brief Duplicates low-order (even-indexed) values from a 128-bit vector of
-///    [4 x float] to float values stored in a 128-bit vector of [4 x float]. 
+///    [4 x float] to float values stored in a 128-bit vector of [4 x float].
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -257,14 +259,6 @@ _mm_movedup_pd(__m128d __a)
   return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0);
 }
 
-#define _MM_DENORMALS_ZERO_ON   (0x0040)
-#define _MM_DENORMALS_ZERO_OFF  (0x0000)
-
-#define _MM_DENORMALS_ZERO_MASK (0x0040)
-
-#define _MM_GET_DENORMALS_ZERO_MODE() (_mm_getcsr() & _MM_DENORMALS_ZERO_MASK)
-#define _MM_SET_DENORMALS_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_DENORMALS_ZERO_MASK) | (x)))
-
 /// \brief Establishes a linear address memory range to be monitored and puts
 ///    the processor in the monitor event pending state. Data stored in the
 ///    monitored address range causes the processor to exit the pending state.

@@ -29,12 +29,38 @@
 #define __PRFCHWINTRIN_H
 
 #if defined(__PRFCHW__) || defined(__3dNOW__)
+/// \brief Loads a memory sequence containing the specified memory address into
+///    all data cache levels. The cache-coherency state is set to exclusive.
+///    Data can be read from and written to the cache line without additional
+///    delay.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PREFETCHT0 instruction.
+///
+/// \param __P
+///    A pointer specifying the memory address to be prefetched.
 static __inline__ void __attribute__((__always_inline__, __nodebug__))
 _m_prefetch(void *__P)
 {
   __builtin_prefetch (__P, 0, 3 /* _MM_HINT_T0 */);
 }
 
+/// \brief Loads a memory sequence containing the specified memory address into
+///    the L1 data cache and sets the cache-coherency to modified. This
+///    provides a hint to the processor that the cache line will be modified.
+///    It is intended for use when the cache line will be written to shortly
+///    after the prefetch is performed.
+///
+///    Note that the effect of this intrinsic is dependent on the processor
+///    implementation.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PREFETCHW instruction.
+///
+/// \param __P
+///    A pointer specifying the memory address to be prefetched.
 static __inline__ void __attribute__((__always_inline__, __nodebug__))
 _m_prefetchw(void *__P)
 {

@@ -46,37 +46,379 @@
 #define _MM_FROUND_RINT      (_MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION)
 #define _MM_FROUND_NEARBYINT (_MM_FROUND_NO_EXC | _MM_FROUND_CUR_DIRECTION)
 
+/// \brief Rounds up each element of the 128-bit vector of [4 x float] to an
+///    integer and returns the rounded values in a 128-bit vector of
+///    [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128 _mm_ceil_ps(__m128 X);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VROUNDPS / ROUNDPS </c> instruction.
+///
+/// \param X
+///    A 128-bit vector of [4 x float] values to be rounded up.
+/// \returns A 128-bit vector of [4 x float] containing the rounded values.
 #define _mm_ceil_ps(X)       _mm_round_ps((X), _MM_FROUND_CEIL)
+
+/// \brief Rounds up each element of the 128-bit vector of [2 x double] to an
+///    integer and returns the rounded values in a 128-bit vector of
+///    [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128d _mm_ceil_pd(__m128d X);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VROUNDPD / ROUNDPD </c> instruction.
+///
+/// \param X
+///    A 128-bit vector of [2 x double] values to be rounded up.
+/// \returns A 128-bit vector of [2 x double] containing the rounded values.
 #define _mm_ceil_pd(X)       _mm_round_pd((X), _MM_FROUND_CEIL)
+
+/// \brief Copies three upper elements of the first 128-bit vector operand to
+///    the corresponding three upper elements of the 128-bit result vector of
+///    [4 x float]. Rounds up the lowest element of the second 128-bit vector
+///    operand to an integer and copies it to the lowest element of the 128-bit
+///    result vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128 _mm_ceil_ss(__m128 X, __m128 Y);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VROUNDSS / ROUNDSS </c> instruction.
+///
+/// \param X
+///    A 128-bit vector of [4 x float]. The values stored in bits [127:32] are
+///    copied to the corresponding bits of the result.
+/// \param Y
+///    A 128-bit vector of [4 x float]. The value stored in bits [31:0] is
+///    rounded up to the nearest integer and copied to the corresponding bits
+///    of the result.
+/// \returns A 128-bit vector of [4 x float] containing the copied and rounded
+///    values.
 #define _mm_ceil_ss(X, Y)    _mm_round_ss((X), (Y), _MM_FROUND_CEIL)
+
+/// \brief Copies the upper element of the first 128-bit vector operand to the
+///    corresponding upper element of the 128-bit result vector of [2 x double].
+///    Rounds up the lower element of the second 128-bit vector operand to an
+///    integer and copies it to the lower element of the 128-bit result vector
+///    of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128d _mm_ceil_sd(__m128d X, __m128d Y);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VROUNDSD / ROUNDSD </c> instruction.
+///
+/// \param X
+///    A 128-bit vector of [2 x double]. The value stored in bits [127:64] is
+///    copied to the corresponding bits of the result.
+/// \param Y
+///    A 128-bit vector of [2 x double]. The value stored in bits [63:0] is
+///    rounded up to the nearest integer and copied to the corresponding bits
+///    of the result.
+/// \returns A 128-bit vector of [2 x double] containing the copied and rounded
+///    values.
 #define _mm_ceil_sd(X, Y)    _mm_round_sd((X), (Y), _MM_FROUND_CEIL)
 
+/// \brief Rounds down each element of the 128-bit vector of [4 x float] to an
+///    an integer and returns the rounded values in a 128-bit vector of
+///    [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128 _mm_floor_ps(__m128 X);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VROUNDPS / ROUNDPS </c> instruction.
+///
+/// \param X
+///    A 128-bit vector of [4 x float] values to be rounded down.
+/// \returns A 128-bit vector of [4 x float] containing the rounded values.
 #define _mm_floor_ps(X)      _mm_round_ps((X), _MM_FROUND_FLOOR)
+
+/// \brief Rounds down each element of the 128-bit vector of [2 x double] to an
+///    integer and returns the rounded values in a 128-bit vector of
+///    [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128d _mm_floor_pd(__m128d X);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VROUNDPD / ROUNDPD </c> instruction.
+///
+/// \param X
+///    A 128-bit vector of [2 x double].
+/// \returns A 128-bit vector of [2 x double] containing the rounded values.
 #define _mm_floor_pd(X)      _mm_round_pd((X), _MM_FROUND_FLOOR)
+
+/// \brief Copies three upper elements of the first 128-bit vector operand to
+///    the corresponding three upper elements of the 128-bit result vector of
+///    [4 x float]. Rounds down the lowest element of the second 128-bit vector
+///    operand to an integer and copies it to the lowest element of the 128-bit
+///    result vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128 _mm_floor_ss(__m128 X, __m128 Y);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VROUNDSS / ROUNDSS </c> instruction.
+///
+/// \param X
+///    A 128-bit vector of [4 x float]. The values stored in bits [127:32] are
+///    copied to the corresponding bits of the result.
+/// \param Y
+///    A 128-bit vector of [4 x float]. The value stored in bits [31:0] is
+///    rounded down to the nearest integer and copied to the corresponding bits
+///    of the result.
+/// \returns A 128-bit vector of [4 x float] containing the copied and rounded
+///    values.
 #define _mm_floor_ss(X, Y)   _mm_round_ss((X), (Y), _MM_FROUND_FLOOR)
+
+/// \brief Copies the upper element of the first 128-bit vector operand to the
+///    corresponding upper element of the 128-bit result vector of [2 x double].
+///    Rounds down the lower element of the second 128-bit vector operand to an
+///    integer and copies it to the lower element of the 128-bit result vector
+///    of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128d _mm_floor_sd(__m128d X, __m128d Y);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VROUNDSD / ROUNDSD </c> instruction.
+///
+/// \param X
+///    A 128-bit vector of [2 x double]. The value stored in bits [127:64] is
+///    copied to the corresponding bits of the result.
+/// \param Y
+///    A 128-bit vector of [2 x double]. The value stored in bits [63:0] is
+///    rounded down to the nearest integer and copied to the corresponding bits
+///    of the result.
+/// \returns A 128-bit vector of [2 x double] containing the copied and rounded
+///    values.
 #define _mm_floor_sd(X, Y)   _mm_round_sd((X), (Y), _MM_FROUND_FLOOR)
 
+/// \brief Rounds each element of the 128-bit vector of [4 x float] to an
+///    integer value according to the rounding control specified by the second
+///    argument and returns the rounded values in a 128-bit vector of
+///    [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128 _mm_round_ps(__m128 X, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VROUNDPS / ROUNDPS </c> instruction.
+///
+/// \param X
+///    A 128-bit vector of [4 x float].
+/// \param M
+///    An integer value that specifies the rounding operation. \n
+///    Bits [7:4] are reserved. \n
+///    Bit [3] is a precision exception value: \n
+///      0: A normal PE exception is used \n
+///      1: The PE field is not updated \n
+///    Bit [2] is the rounding control source: \n
+///      0: Use bits [1:0] of \a M \n
+///      1: Use the current MXCSR setting \n
+///    Bits [1:0] contain the rounding control definition: \n
+///      00: Nearest \n
+///      01: Downward (toward negative infinity) \n
+///      10: Upward (toward positive infinity) \n
+///      11: Truncated
+/// \returns A 128-bit vector of [4 x float] containing the rounded values.
 #define _mm_round_ps(X, M) __extension__ ({ \
   (__m128)__builtin_ia32_roundps((__v4sf)(__m128)(X), (M)); })
 
+/// \brief Copies three upper elements of the first 128-bit vector operand to
+///    the corresponding three upper elements of the 128-bit result vector of
+///    [4 x float]. Rounds the lowest element of the second 128-bit vector
+///    operand to an integer value according to the rounding control specified
+///    by the third argument and copies it to the lowest element of the 128-bit
+///    result vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128 _mm_round_ss(__m128 X, __m128 Y, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VROUNDSS / ROUNDSS </c> instruction.
+///
+/// \param X
+///    A 128-bit vector of [4 x float]. The values stored in bits [127:32] are
+///    copied to the corresponding bits of the result.
+/// \param Y
+///    A 128-bit vector of [4 x float]. The value stored in bits [31:0] is
+///    rounded to the nearest integer using the specified rounding control and
+///    copied to the corresponding bits of the result.
+/// \param M
+///    An integer value that specifies the rounding operation. \n
+///    Bits [7:4] are reserved. \n
+///    Bit [3] is a precision exception value: \n
+///      0: A normal PE exception is used \n
+///      1: The PE field is not updated \n
+///    Bit [2] is the rounding control source: \n
+///      0: Use bits [1:0] of \a M \n
+///      1: Use the current MXCSR setting \n
+///    Bits [1:0] contain the rounding control definition: \n
+///      00: Nearest \n
+///      01: Downward (toward negative infinity) \n
+///      10: Upward (toward positive infinity) \n
+///      11: Truncated
+/// \returns A 128-bit vector of [4 x float] containing the copied and rounded
+///    values.
 #define _mm_round_ss(X, Y, M) __extension__ ({ \
   (__m128)__builtin_ia32_roundss((__v4sf)(__m128)(X), \
                                  (__v4sf)(__m128)(Y), (M)); })
 
+/// \brief Rounds each element of the 128-bit vector of [2 x double] to an
+///    integer value according to the rounding control specified by the second
+///    argument and returns the rounded values in a 128-bit vector of
+///    [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128d _mm_round_pd(__m128d X, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VROUNDPD / ROUNDPD </c> instruction.
+///
+/// \param X
+///    A 128-bit vector of [2 x double].
+/// \param M
+///    An integer value that specifies the rounding operation. \n
+///    Bits [7:4] are reserved. \n
+///    Bit [3] is a precision exception value: \n
+///      0: A normal PE exception is used \n
+///      1: The PE field is not updated \n
+///    Bit [2] is the rounding control source: \n
+///      0: Use bits [1:0] of \a M \n
+///      1: Use the current MXCSR setting \n
+///    Bits [1:0] contain the rounding control definition: \n
+///      00: Nearest \n
+///      01: Downward (toward negative infinity) \n
+///      10: Upward (toward positive infinity) \n
+///      11: Truncated
+/// \returns A 128-bit vector of [2 x double] containing the rounded values.
 #define _mm_round_pd(X, M) __extension__ ({ \
   (__m128d)__builtin_ia32_roundpd((__v2df)(__m128d)(X), (M)); })
 
+/// \brief Copies the upper element of the first 128-bit vector operand to the
+///    corresponding upper element of the 128-bit result vector of [2 x double].
+///    Rounds the lower element of the second 128-bit vector operand to an
+///    integer value according to the rounding control specified by the third
+///    argument and copies it to the lower element of the 128-bit result vector
+///    of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128d _mm_round_sd(__m128d X, __m128d Y, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VROUNDSD / ROUNDSD </c> instruction.
+///
+/// \param X
+///    A 128-bit vector of [2 x double]. The value stored in bits [127:64] is
+///    copied to the corresponding bits of the result.
+/// \param Y
+///    A 128-bit vector of [2 x double]. The value stored in bits [63:0] is
+///    rounded to the nearest integer using the specified rounding control and
+///    copied to the corresponding bits of the result.
+/// \param M
+///    An integer value that specifies the rounding operation. \n
+///    Bits [7:4] are reserved. \n
+///    Bit [3] is a precision exception value: \n
+///      0: A normal PE exception is used \n
+///      1: The PE field is not updated \n
+///    Bit [2] is the rounding control source: \n
+///      0: Use bits [1:0] of \a M \n
+///      1: Use the current MXCSR setting \n
+///    Bits [1:0] contain the rounding control definition: \n
+///      00: Nearest \n
+///      01: Downward (toward negative infinity) \n
+///      10: Upward (toward positive infinity) \n
+///      11: Truncated
+/// \returns A 128-bit vector of [2 x double] containing the copied and rounded
+///    values.
 #define _mm_round_sd(X, Y, M) __extension__ ({ \
   (__m128d)__builtin_ia32_roundsd((__v2df)(__m128d)(X), \
                                   (__v2df)(__m128d)(Y), (M)); })
 
 /* SSE4 Packed Blending Intrinsics.  */
+/// \brief Returns a 128-bit vector of [2 x double] where the values are
+///    selected from either the first or second operand as specified by the
+///    third operand, the control mask.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128d _mm_blend_pd(__m128d V1, __m128d V2, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VBLENDPD / BLENDPD </c> instruction.
+///
+/// \param V1
+///    A 128-bit vector of [2 x double].
+/// \param V2
+///    A 128-bit vector of [2 x double].
+/// \param M
+///    An immediate integer operand, with mask bits [1:0] specifying how the
+///    values are to be copied. The position of the mask bit corresponds to the
+///    index of a copied value. When a mask bit is 0, the corresponding 64-bit
+///    element in operand \a V1 is copied to the same position in the result.
+///    When a mask bit is 1, the corresponding 64-bit element in operand \a V2
+///    is copied to the same position in the result.
+/// \returns A 128-bit vector of [2 x double] containing the copied values.
 #define _mm_blend_pd(V1, V2, M) __extension__ ({ \
   (__m128d)__builtin_shufflevector((__v2df)(__m128d)(V1), \
                                    (__v2df)(__m128d)(V2), \
                                    (((M) & 0x01) ? 2 : 0), \
                                    (((M) & 0x02) ? 3 : 1)); })
 
+/// \brief Returns a 128-bit vector of [4 x float] where the values are selected
+///    from either the first or second operand as specified by the third
+///    operand, the control mask.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128 _mm_blend_ps(__m128 V1, __m128 V2, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VBLENDPS / BLENDPS </c> instruction.
+///
+/// \param V1
+///    A 128-bit vector of [4 x float].
+/// \param V2
+///    A 128-bit vector of [4 x float].
+/// \param M
+///    An immediate integer operand, with mask bits [3:0] specifying how the
+///    values are to be copied. The position of the mask bit corresponds to the
+///    index of a copied value. When a mask bit is 0, the corresponding 32-bit
+///    element in operand \a V1 is copied to the same position in the result.
+///    When a mask bit is 1, the corresponding 32-bit element in operand \a V2
+///    is copied to the same position in the result.
+/// \returns A 128-bit vector of [4 x float] containing the copied values.
 #define _mm_blend_ps(V1, V2, M) __extension__ ({ \
   (__m128)__builtin_shufflevector((__v4sf)(__m128)(V1), (__v4sf)(__m128)(V2), \
                                   (((M) & 0x01) ? 4 : 0), \
@@ -84,6 +426,26 @@
                                   (((M) & 0x04) ? 6 : 2), \
                                   (((M) & 0x08) ? 7 : 3)); })
 
+/// \brief Returns a 128-bit vector of [2 x double] where the values are
+///    selected from either the first or second operand as specified by the
+///    third operand, the control mask.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VBLENDVPD / BLENDVPD </c> instruction.
+///
+/// \param __V1
+///    A 128-bit vector of [2 x double].
+/// \param __V2
+///    A 128-bit vector of [2 x double].
+/// \param __M
+///    A 128-bit vector operand, with mask bits 127 and 63 specifying how the
+///    values are to be copied. The position of the mask bit corresponds to the
+///    most significant bit of a copied value. When a mask bit is 0, the
+///    corresponding 64-bit element in operand \a __V1 is copied to the same
+///    position in the result. When a mask bit is 1, the corresponding 64-bit
+///    element in operand \a __V2 is copied to the same position in the result.
+/// \returns A 128-bit vector of [2 x double] containing the copied values.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_blendv_pd (__m128d __V1, __m128d __V2, __m128d __M)
 {
@@ -91,6 +453,26 @@ _mm_blendv_pd (__m128d __V1, __m128d __V2, __m128d __M)
                                             (__v2df)__M);
 }
 
+/// \brief Returns a 128-bit vector of [4 x float] where the values are
+///    selected from either the first or second operand as specified by the
+///    third operand, the control mask.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VBLENDVPS / BLENDVPS </c> instruction.
+///
+/// \param __V1
+///    A 128-bit vector of [4 x float].
+/// \param __V2
+///    A 128-bit vector of [4 x float].
+/// \param __M
+///    A 128-bit vector operand, with mask bits 127, 95, 63, and 31 specifying
+///    how the values are to be copied. The position of the mask bit corresponds
+///    to the most significant bit of a copied value. When a mask bit is 0, the
+///    corresponding 32-bit element in operand \a __V1 is copied to the same
+///    position in the result. When a mask bit is 1, the corresponding 32-bit
+///    element in operand \a __V2 is copied to the same position in the result.
+/// \returns A 128-bit vector of [4 x float] containing the copied values.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_blendv_ps (__m128 __V1, __m128 __V2, __m128 __M)
 {
@@ -98,6 +480,26 @@ _mm_blendv_ps (__m128 __V1, __m128 __V2, __m128 __M)
                                            (__v4sf)__M);
 }
 
+/// \brief Returns a 128-bit vector of [16 x i8] where the values are selected
+///    from either of the first or second operand as specified by the third
+///    operand, the control mask.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPBLENDVB / PBLENDVB </c> instruction.
+///
+/// \param __V1
+///    A 128-bit vector of [16 x i8].
+/// \param __V2
+///    A 128-bit vector of [16 x i8].
+/// \param __M
+///    A 128-bit vector operand, with mask bits 127, 119, 111 ... 7 specifying
+///    how the values are to be copied. The position of the mask bit corresponds
+///    to the most significant bit of a copied value. When a mask bit is 0, the
+///    corresponding 8-bit element in operand \a __V1 is copied to the same
+///    position in the result. When a mask bit is 1, the corresponding 8-bit
+///    element in operand \a __V2 is copied to the same position in the result.
+/// \returns A 128-bit vector of [16 x i8] containing the copied values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_blendv_epi8 (__m128i __V1, __m128i __V2, __m128i __M)
 {
@@ -105,6 +507,30 @@ _mm_blendv_epi8 (__m128i __V1, __m128i __V2, __m128i __M)
                                                (__v16qi)__M);
 }
 
+/// \brief Returns a 128-bit vector of [8 x i16] where the values are selected
+///    from either of the first or second operand as specified by the third
+///    operand, the control mask.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128i _mm_blend_epi16(__m128i V1, __m128i V2, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPBLENDW / PBLENDW </c> instruction.
+///
+/// \param V1
+///    A 128-bit vector of [8 x i16].
+/// \param V2
+///    A 128-bit vector of [8 x i16].
+/// \param M
+///    An immediate integer operand, with mask bits [7:0] specifying how the
+///    values are to be copied. The position of the mask bit corresponds to the
+///    index of a copied value. When a mask bit is 0, the corresponding 16-bit
+///    element in operand \a V1 is copied to the same position in the result.
+///    When a mask bit is 1, the corresponding 16-bit element in operand \a V2
+///    is copied to the same position in the result.
+/// \returns A 128-bit vector of [8 x i16] containing the copied values.
 #define _mm_blend_epi16(V1, V2, M) __extension__ ({ \
   (__m128i)__builtin_shufflevector((__v8hi)(__m128i)(V1), \
                                    (__v8hi)(__m128i)(V2), \
@@ -118,12 +544,39 @@ _mm_blendv_epi8 (__m128i __V1, __m128i __V2, __m128i __M)
                                    (((M) & 0x80) ? 15 : 7)); })
 
 /* SSE4 Dword Multiply Instructions.  */
+/// \brief Multiples corresponding elements of two 128-bit vectors of [4 x i32]
+///    and returns the lower 32 bits of the each product in a 128-bit vector of
+///    [4 x i32].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPMULLD / PMULLD </c> instruction.
+///
+/// \param __V1
+///    A 128-bit integer vector.
+/// \param __V2
+///    A 128-bit integer vector.
+/// \returns A 128-bit integer vector containing the products of both operands.
 static __inline__  __m128i __DEFAULT_FN_ATTRS
 _mm_mullo_epi32 (__m128i __V1, __m128i __V2)
 {
   return (__m128i) ((__v4su)__V1 * (__v4su)__V2);
 }
 
+/// \brief Multiplies corresponding even-indexed elements of two 128-bit
+///    vectors of [4 x i32] and returns a 128-bit vector of [2 x i64]
+///    containing the products.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPMULDQ / PMULDQ </c> instruction.
+///
+/// \param __V1
+///    A 128-bit vector of [4 x i32].
+/// \param __V2
+///    A 128-bit vector of [4 x i32].
+/// \returns A 128-bit vector of [2 x i64] containing the products of both
+///    operands.
 static __inline__  __m128i __DEFAULT_FN_ATTRS
 _mm_mul_epi32 (__m128i __V1, __m128i __V2)
 {
@@ -131,64 +584,243 @@ _mm_mul_epi32 (__m128i __V1, __m128i __V2)
 }
 
 /* SSE4 Floating Point Dot Product Instructions.  */
+/// \brief Computes the dot product of the two 128-bit vectors of [4 x float]
+///    and returns it in the elements of the 128-bit result vector of
+///    [4 x float].
+///
+///    The immediate integer operand controls which input elements
+///    will contribute to the dot product, and where the final results are
+///    returned.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128 _mm_dp_ps(__m128 X, __m128 Y, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VDPPS / DPPS </c> instruction.
+///
+/// \param X
+///    A 128-bit vector of [4 x float].
+/// \param Y
+///    A 128-bit vector of [4 x float].
+/// \param M
+///    An immediate integer operand. Mask bits [7:4] determine which elements
+///    of the input vectors are used, with bit [4] corresponding to the lowest
+///    element and bit [7] corresponding to the highest element of each [4 x
+///    float] vector. If a bit is set, the corresponding elements from the two
+///    input vectors are used as an input for dot product; otherwise that input
+///    is treated as zero. Bits [3:0] determine which elements of the result
+///    will receive a copy of the final dot product, with bit [0] corresponding
+///    to the lowest element and bit [3] corresponding to the highest element of
+///    each [4 x float] subvector. If a bit is set, the dot product is returned
+///    in the corresponding element; otherwise that element is set to zero.
+/// \returns A 128-bit vector of [4 x float] containing the dot product.
 #define _mm_dp_ps(X, Y, M) __extension__ ({ \
   (__m128) __builtin_ia32_dpps((__v4sf)(__m128)(X), \
                                (__v4sf)(__m128)(Y), (M)); })
 
+/// \brief Computes the dot product of the two 128-bit vectors of [2 x double]
+///    and returns it in the elements of the 128-bit result vector of
+///    [2 x double].
+///
+///    The immediate integer operand controls which input
+///    elements will contribute to the dot product, and where the final results
+///    are returned.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128d _mm_dp_pd(__m128d X, __m128d Y, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VDPPD / DPPD </c> instruction.
+///
+/// \param X
+///    A 128-bit vector of [2 x double].
+/// \param Y
+///    A 128-bit vector of [2 x double].
+/// \param M
+///    An immediate integer operand. Mask bits [5:4] determine which elements
+///    of the input vectors are used, with bit [4] corresponding to the lowest
+///    element and bit [5] corresponding to the highest element of each of [2 x
+///    double] vector. If a bit is set, the corresponding elements from the two
+///    input vectors are used as an input for dot product; otherwise that input
+///    is treated as zero. Bits [1:0] determine which elements of the result
+///    will receive a copy of the final dot product, with bit [0] corresponding
+///    to the lowest element and bit [3] corresponding to the highest element of
+///    each [2 x double] vector. If a bit is set, the dot product is returned in
+///    the corresponding element; otherwise that element is set to zero.
 #define _mm_dp_pd(X, Y, M) __extension__ ({\
   (__m128d) __builtin_ia32_dppd((__v2df)(__m128d)(X), \
                                 (__v2df)(__m128d)(Y), (M)); })
 
 /* SSE4 Streaming Load Hint Instruction.  */
+/// \brief Loads integer values from a 128-bit aligned memory location to a
+///    128-bit integer vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVNTDQA / MOVNTDQA </c> instruction.
+///
+/// \param __V
+///    A pointer to a 128-bit aligned memory location that contains the integer
+///    values.
+/// \returns A 128-bit integer vector containing the data stored at the
+///    specified memory location.
 static __inline__  __m128i __DEFAULT_FN_ATTRS
 _mm_stream_load_si128 (__m128i const *__V)
 {
-  return (__m128i) __builtin_ia32_movntdqa ((const __v2di *) __V);
+  return (__m128i) __builtin_nontemporal_load ((const __v2di *) __V);
 }
 
 /* SSE4 Packed Integer Min/Max Instructions.  */
+/// \brief Compares the corresponding elements of two 128-bit vectors of
+///    [16 x i8] and returns a 128-bit vector of [16 x i8] containing the lesser
+///    of the two values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPMINSB / PMINSB </c> instruction.
+///
+/// \param __V1
+///    A 128-bit vector of [16 x i8].
+/// \param __V2
+///    A 128-bit vector of [16 x i8]
+/// \returns A 128-bit vector of [16 x i8] containing the lesser values.
 static __inline__  __m128i __DEFAULT_FN_ATTRS
 _mm_min_epi8 (__m128i __V1, __m128i __V2)
 {
   return (__m128i) __builtin_ia32_pminsb128 ((__v16qi) __V1, (__v16qi) __V2);
 }
 
+/// \brief Compares the corresponding elements of two 128-bit vectors of
+///    [16 x i8] and returns a 128-bit vector of [16 x i8] containing the
+///    greater value of the two.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPMAXSB / PMAXSB </c> instruction.
+///
+/// \param __V1
+///    A 128-bit vector of [16 x i8].
+/// \param __V2
+///    A 128-bit vector of [16 x i8].
+/// \returns A 128-bit vector of [16 x i8] containing the greater values.
 static __inline__  __m128i __DEFAULT_FN_ATTRS
 _mm_max_epi8 (__m128i __V1, __m128i __V2)
 {
   return (__m128i) __builtin_ia32_pmaxsb128 ((__v16qi) __V1, (__v16qi) __V2);
 }
 
+/// \brief Compares the corresponding elements of two 128-bit vectors of
+///    [8 x u16] and returns a 128-bit vector of [8 x u16] containing the lesser
+///    value of the two.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPMINUW / PMINUW </c> instruction.
+///
+/// \param __V1
+///    A 128-bit vector of [8 x u16].
+/// \param __V2
+///    A 128-bit vector of [8 x u16].
+/// \returns A 128-bit vector of [8 x u16] containing the lesser values.
 static __inline__  __m128i __DEFAULT_FN_ATTRS
 _mm_min_epu16 (__m128i __V1, __m128i __V2)
 {
   return (__m128i) __builtin_ia32_pminuw128 ((__v8hi) __V1, (__v8hi) __V2);
 }
 
+/// \brief Compares the corresponding elements of two 128-bit vectors of
+///    [8 x u16] and returns a 128-bit vector of [8 x u16] containing the
+///    greater value of the two.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPMAXUW / PMAXUW </c> instruction.
+///
+/// \param __V1
+///    A 128-bit vector of [8 x u16].
+/// \param __V2
+///    A 128-bit vector of [8 x u16].
+/// \returns A 128-bit vector of [8 x u16] containing the greater values.
 static __inline__  __m128i __DEFAULT_FN_ATTRS
 _mm_max_epu16 (__m128i __V1, __m128i __V2)
 {
   return (__m128i) __builtin_ia32_pmaxuw128 ((__v8hi) __V1, (__v8hi) __V2);
 }
 
+/// \brief Compares the corresponding elements of two 128-bit vectors of
+///    [4 x i32] and returns a 128-bit vector of [4 x i32] containing the lesser
+///    value of the two.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPMINSD / PMINSD </c> instruction.
+///
+/// \param __V1
+///    A 128-bit vector of [4 x i32].
+/// \param __V2
+///    A 128-bit vector of [4 x i32].
+/// \returns A 128-bit vector of [4 x i32] containing the lesser values.
 static __inline__  __m128i __DEFAULT_FN_ATTRS
 _mm_min_epi32 (__m128i __V1, __m128i __V2)
 {
   return (__m128i) __builtin_ia32_pminsd128 ((__v4si) __V1, (__v4si) __V2);
 }
 
+/// \brief Compares the corresponding elements of two 128-bit vectors of
+///    [4 x i32] and returns a 128-bit vector of [4 x i32] containing the
+///    greater value of the two.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPMAXSD / PMAXSD </c> instruction.
+///
+/// \param __V1
+///    A 128-bit vector of [4 x i32].
+/// \param __V2
+///    A 128-bit vector of [4 x i32].
+/// \returns A 128-bit vector of [4 x i32] containing the greater values.
 static __inline__  __m128i __DEFAULT_FN_ATTRS
 _mm_max_epi32 (__m128i __V1, __m128i __V2)
 {
   return (__m128i) __builtin_ia32_pmaxsd128 ((__v4si) __V1, (__v4si) __V2);
 }
 
+/// \brief Compares the corresponding elements of two 128-bit vectors of
+///    [4 x u32] and returns a 128-bit vector of [4 x u32] containing the lesser
+///    value of the two.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPMINUD / PMINUD </c>  instruction.
+///
+/// \param __V1
+///    A 128-bit vector of [4 x u32].
+/// \param __V2
+///    A 128-bit vector of [4 x u32].
+/// \returns A 128-bit vector of [4 x u32] containing the lesser values.
 static __inline__  __m128i __DEFAULT_FN_ATTRS
 _mm_min_epu32 (__m128i __V1, __m128i __V2)
 {
   return (__m128i) __builtin_ia32_pminud128((__v4si) __V1, (__v4si) __V2);
 }
 
+/// \brief Compares the corresponding elements of two 128-bit vectors of
+///    [4 x u32] and returns a 128-bit vector of [4 x u32] containing the
+///    greater value of the two.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPMAXUD / PMAXUD </c> instruction.
+///
+/// \param __V1
+///    A 128-bit vector of [4 x u32].
+/// \param __V2
+///    A 128-bit vector of [4 x u32].
+/// \returns A 128-bit vector of [4 x u32] containing the greater values.
 static __inline__  __m128i __DEFAULT_FN_ATTRS
 _mm_max_epu32 (__m128i __V1, __m128i __V2)
 {
@@ -196,7 +828,70 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2)
 }
 
 /* SSE4 Insertion and Extraction from XMM Register Instructions.  */
+/// \brief Takes the first argument \a X and inserts an element from the second
+///    argument \a Y as selected by the third argument \a N. That result then
+///    has elements zeroed out also as selected by the third argument \a N. The
+///    resulting 128-bit vector of [4 x float] is then returned.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128 _mm_insert_ps(__m128 X, __m128 Y, const int N);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VINSERTPS </c> instruction.
+///
+/// \param X
+///    A 128-bit vector source operand of [4 x float]. With the exception of
+///    those bits in the result copied from parameter \a Y and zeroed by bits
+///    [3:0] of \a N, all bits from this parameter are copied to the result.
+/// \param Y
+///    A 128-bit vector source operand of [4 x float]. One single-precision
+///    floating-point element from this source, as determined by the immediate
+///    parameter, is copied to the result.
+/// \param N
+///    Specifies which bits from operand \a Y will be copied, which bits in the
+///    result they will be be copied to, and which bits in the result will be
+///    cleared. The following assignments are made: \n
+///    Bits [7:6] specify the bits to copy from operand \a Y: \n
+///      00: Selects bits [31:0] from operand \a Y. \n
+///      01: Selects bits [63:32] from operand \a Y. \n
+///      10: Selects bits [95:64] from operand \a Y. \n
+///      11: Selects bits [127:96] from operand \a Y. \n
+///    Bits [5:4] specify the bits in the result to which the selected bits
+///    from operand \a Y are copied: \n
+///      00: Copies the selected bits from \a Y to result bits [31:0]. \n
+///      01: Copies the selected bits from \a Y to result bits [63:32]. \n
+///      10: Copies the selected bits from \a Y to result bits [95:64]. \n
+///      11: Copies the selected bits from \a Y to result bits [127:96]. \n
+///    Bits[3:0]: If any of these bits are set, the corresponding result
+///    element is cleared.
+/// \returns A 128-bit vector of [4 x float] containing the copied single-
+///    precision floating point elements from the operands.
 #define _mm_insert_ps(X, Y, N) __builtin_ia32_insertps128((X), (Y), (N))
+
+/// \brief Extracts a 32-bit integer from a 128-bit vector of [4 x float] and
+///    returns it, using the immediate value parameter \a N as a selector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// int _mm_extract_ps(__m128 X, const int N);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VEXTRACTPS / EXTRACTPS </c>
+/// instruction.
+///
+/// \param X
+///    A 128-bit vector of [4 x float].
+/// \param N
+///    An immediate value. Bits [1:0] determines which bits from the argument
+///    \a X are extracted and returned: \n
+///    00: Bits [31:0] of parameter \a X are returned. \n
+///    01: Bits [63:32] of parameter \a X are returned. \n
+///    10: Bits [95:64] of parameter \a X are returned. \n
+///    11: Bits [127:96] of parameter \a X are returned.
+/// \returns A 32-bit integer containing the extracted 32 bits of float data.
 #define _mm_extract_ps(X, N) (__extension__                      \
                               ({ union { int __i; float __f; } __t;  \
                                  __v4sf __a = (__v4sf)(__m128)(X);       \
@@ -217,15 +912,111 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2)
                                              _MM_MK_INSERTPS_NDX((N), 0, 0x0e))
 
 /* Insert int into packed integer array at index.  */
+/// \brief Constructs a 128-bit vector of [16 x i8] by first making a copy of
+///    the 128-bit integer vector parameter, and then inserting the lower 8 bits
+///    of an integer parameter \a I into an offset specified by the immediate
+///    value parameter \a N.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128i _mm_insert_epi8(__m128i X, int I, const int N);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPINSRB / PINSRB </c> instruction.
+///
+/// \param X
+///    A 128-bit integer vector of [16 x i8]. This vector is copied to the
+///    result and then one of the sixteen elements in the result vector is
+///    replaced by the lower 8 bits of \a I.
+/// \param I
+///    An integer. The lower 8 bits of this operand are written to the result
+///    beginning at the offset specified by \a N.
+/// \param N
+///    An immediate value. Bits [3:0] specify the bit offset in the result at
+///    which the lower 8 bits of \a I are written. \n
+///    0000: Bits [7:0] of the result are used for insertion. \n
+///    0001: Bits [15:8] of the result are used for insertion. \n
+///    0010: Bits [23:16] of the result are used for insertion. \n
+///    0011: Bits [31:24] of the result are used for insertion. \n
+///    0100: Bits [39:32] of the result are used for insertion. \n
+///    0101: Bits [47:40] of the result are used for insertion. \n
+///    0110: Bits [55:48] of the result are used for insertion. \n
+///    0111: Bits [63:56] of the result are used for insertion. \n
+///    1000: Bits [71:64] of the result are used for insertion. \n
+///    1001: Bits [79:72] of the result are used for insertion. \n
+///    1010: Bits [87:80] of the result are used for insertion. \n
+///    1011: Bits [95:88] of the result are used for insertion. \n
+///    1100: Bits [103:96] of the result are used for insertion. \n
+///    1101: Bits [111:104] of the result are used for insertion. \n
+///    1110: Bits [119:112] of the result are used for insertion. \n
+///    1111: Bits [127:120] of the result are used for insertion.
+/// \returns A 128-bit integer vector containing the constructed values.
 #define _mm_insert_epi8(X, I, N) (__extension__                           \
                                   ({ __v16qi __a = (__v16qi)(__m128i)(X); \
                                      __a[(N) & 15] = (I);                 \
                                      (__m128i)__a;}))
+
+/// \brief Constructs a 128-bit vector of [4 x i32] by first making a copy of
+///    the 128-bit integer vector parameter, and then inserting the 32-bit
+///    integer parameter \a I at the offset specified by the immediate value
+///    parameter \a N.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128i _mm_insert_epi32(__m128i X, int I, const int N);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPINSRD / PINSRD </c> instruction.
+///
+/// \param X
+///    A 128-bit integer vector of [4 x i32]. This vector is copied to the
+///    result and then one of the four elements in the result vector is
+///    replaced by \a I.
+/// \param I
+///    A 32-bit integer that is written to the result beginning at the offset
+///    specified by \a N.
+/// \param N
+///    An immediate value. Bits [1:0] specify the bit offset in the result at
+///    which the integer \a I is written. \n
+///    00: Bits [31:0] of the result are used for insertion. \n
+///    01: Bits [63:32] of the result are used for insertion. \n
+///    10: Bits [95:64] of the result are used for insertion. \n
+///    11: Bits [127:96] of the result are used for insertion.
+/// \returns A 128-bit integer vector containing the constructed values.
 #define _mm_insert_epi32(X, I, N) (__extension__                         \
                                    ({ __v4si __a = (__v4si)(__m128i)(X); \
                                       __a[(N) & 3] = (I);                \
                                       (__m128i)__a;}))
+
 #ifdef __x86_64__
+/// \brief Constructs a 128-bit vector of [2 x i64] by first making a copy of
+///    the 128-bit integer vector parameter, and then inserting the 64-bit
+///    integer parameter \a I, using the immediate value parameter \a N as an
+///    insertion location selector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128i _mm_insert_epi64(__m128i X, long long I, const int N);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPINSRQ / PINSRQ </c> instruction.
+///
+/// \param X
+///    A 128-bit integer vector of [2 x i64]. This vector is copied to the
+///    result and then one of the two elements in the result vector is replaced
+///    by \a I.
+/// \param I
+///    A 64-bit integer that is written to the result beginning at the offset
+///    specified by \a N.
+/// \param N
+///    An immediate value. Bit [0] specifies the bit offset in the result at
+///    which the integer \a I is written. \n
+///    0: Bits [63:0] of the result are used for insertion. \n
+///    1: Bits [127:64] of the result are used for insertion. \n
+/// \returns A 128-bit integer vector containing the constructed values.
 #define _mm_insert_epi64(X, I, N) (__extension__                         \
                                    ({ __v2di __a = (__v2di)(__m128i)(X); \
                                       __a[(N) & 1] = (I);                \
@@ -235,42 +1026,219 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2)
 /* Extract int from packed integer array at index.  This returns the element
  * as a zero extended value, so it is unsigned.
  */
+/// \brief Extracts an 8-bit element from the 128-bit integer vector of
+///    [16 x i8], using the immediate value parameter \a N as a selector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// int _mm_extract_epi8(__m128i X, const int N);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPEXTRB / PEXTRB </c> instruction.
+///
+/// \param X
+///    A 128-bit integer vector.
+/// \param N
+///    An immediate value. Bits [3:0] specify which 8-bit vector element from
+///    the argument \a X to extract and copy to the result. \n
+///    0000: Bits [7:0] of parameter \a X are extracted. \n
+///    0001: Bits [15:8] of the parameter \a X are extracted. \n
+///    0010: Bits [23:16] of the parameter \a X are extracted. \n
+///    0011: Bits [31:24] of the parameter \a X are extracted. \n
+///    0100: Bits [39:32] of the parameter \a X are extracted. \n
+///    0101: Bits [47:40] of the parameter \a X are extracted. \n
+///    0110: Bits [55:48] of the parameter \a X are extracted. \n
+///    0111: Bits [63:56] of the parameter \a X are extracted. \n
+///    1000: Bits [71:64] of the parameter \a X are extracted. \n
+///    1001: Bits [79:72] of the parameter \a X are extracted. \n
+///    1010: Bits [87:80] of the parameter \a X are extracted. \n
+///    1011: Bits [95:88] of the parameter \a X are extracted. \n
+///    1100: Bits [103:96] of the parameter \a X are extracted. \n
+///    1101: Bits [111:104] of the parameter \a X are extracted. \n
+///    1110: Bits [119:112] of the parameter \a X are extracted. \n
+///    1111: Bits [127:120] of the parameter \a X are extracted.
+/// \returns  An unsigned integer, whose lower 8 bits are selected from the
+///    128-bit integer vector parameter and the remaining bits are assigned
+///    zeros.
 #define _mm_extract_epi8(X, N) (__extension__                           \
                                 ({ __v16qi __a = (__v16qi)(__m128i)(X); \
                                    (int)(unsigned char) __a[(N) & 15];}))
+
+/// \brief Extracts a 32-bit element from the 128-bit integer vector of
+///    [4 x i32], using the immediate value parameter \a N as a selector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// int _mm_extract_epi32(__m128i X, const int N);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPEXTRD / PEXTRD </c> instruction.
+///
+/// \param X
+///    A 128-bit integer vector.
+/// \param N
+///    An immediate value. Bits [1:0] specify which 32-bit vector element from
+///    the argument \a X to extract and copy to the result. \n
+///    00: Bits [31:0] of the parameter \a X are extracted. \n
+///    01: Bits [63:32] of the parameter \a X are extracted. \n
+///    10: Bits [95:64] of the parameter \a X are extracted. \n
+///    11: Bits [127:96] of the parameter \a X are exracted.
+/// \returns  An integer, whose lower 32 bits are selected from the 128-bit
+///    integer vector parameter and the remaining bits are assigned zeros.
 #define _mm_extract_epi32(X, N) (__extension__                         \
                                  ({ __v4si __a = (__v4si)(__m128i)(X); \
                                     (int)__a[(N) & 3];}))
+
 #ifdef __x86_64__
+/// \brief Extracts a 64-bit element from the 128-bit integer vector of
+///    [2 x i64], using the immediate value parameter \a N as a selector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// long long _mm_extract_epi64(__m128i X, const int N);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPEXTRQ / PEXTRQ </c> instruction.
+///
+/// \param X
+///    A 128-bit integer vector.
+/// \param N
+///    An immediate value. Bit [0] specifies which 64-bit vector element from
+///    the argument \a X to return. \n
+///    0: Bits [63:0] are returned. \n
+///    1: Bits [127:64] are returned. \n
+/// \returns  A 64-bit integer.
 #define _mm_extract_epi64(X, N) (__extension__                         \
                                  ({ __v2di __a = (__v2di)(__m128i)(X); \
                                     (long long)__a[(N) & 1];}))
 #endif /* __x86_64 */
 
 /* SSE4 128-bit Packed Integer Comparisons.  */
+/// \brief Tests whether the specified bits in a 128-bit integer vector are all
+///    zeros.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
+///
+/// \param __M
+///    A 128-bit integer vector containing the bits to be tested.
+/// \param __V
+///    A 128-bit integer vector selecting which bits to test in operand \a __M.
+/// \returns TRUE if the specified bits are all zeros; FALSE otherwise.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_testz_si128(__m128i __M, __m128i __V)
 {
   return __builtin_ia32_ptestz128((__v2di)__M, (__v2di)__V);
 }
 
+/// \brief Tests whether the specified bits in a 128-bit integer vector are all
+///    ones.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
+///
+/// \param __M
+///    A 128-bit integer vector containing the bits to be tested.
+/// \param __V
+///    A 128-bit integer vector selecting which bits to test in operand \a __M.
+/// \returns TRUE if the specified bits are all ones; FALSE otherwise.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_testc_si128(__m128i __M, __m128i __V)
 {
   return __builtin_ia32_ptestc128((__v2di)__M, (__v2di)__V);
 }
 
+/// \brief Tests whether the specified bits in a 128-bit integer vector are
+///    neither all zeros nor all ones.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
+///
+/// \param __M
+///    A 128-bit integer vector containing the bits to be tested.
+/// \param __V
+///    A 128-bit integer vector selecting which bits to test in operand \a __M.
+/// \returns TRUE if the specified bits are neither all zeros nor all ones;
+///    FALSE otherwise.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_testnzc_si128(__m128i __M, __m128i __V)
 {
   return __builtin_ia32_ptestnzc128((__v2di)__M, (__v2di)__V);
 }
 
+/// \brief Tests whether the specified bits in a 128-bit integer vector are all
+///    ones.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// int _mm_test_all_ones(__m128i V);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
+///
+/// \param V
+///    A 128-bit integer vector containing the bits to be tested.
+/// \returns TRUE if the bits specified in the operand are all set to 1; FALSE
+///    otherwise.
 #define _mm_test_all_ones(V) _mm_testc_si128((V), _mm_cmpeq_epi32((V), (V)))
+
+/// \brief Tests whether the specified bits in a 128-bit integer vector are
+///    neither all zeros nor all ones.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// int _mm_test_mix_ones_zeros(__m128i M, __m128i V);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
+///
+/// \param M
+///    A 128-bit integer vector containing the bits to be tested.
+/// \param V
+///    A 128-bit integer vector selecting which bits to test in operand \a M.
+/// \returns TRUE if the specified bits are neither all zeros nor all ones;
+///    FALSE otherwise.
 #define _mm_test_mix_ones_zeros(M, V) _mm_testnzc_si128((M), (V))
+
+/// \brief Tests whether the specified bits in a 128-bit integer vector are all
+///    zeros.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// int _mm_test_all_zeros(__m128i M, __m128i V);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
+///
+/// \param M
+///    A 128-bit integer vector containing the bits to be tested.
+/// \param V
+///    A 128-bit integer vector selecting which bits to test in operand \a M.
+/// \returns TRUE if the specified bits are all zeros; FALSE otherwise.
 #define _mm_test_all_zeros(M, V) _mm_testz_si128 ((M), (V))
 
 /* SSE4 64-bit Packed Integer Comparisons.  */
+/// \brief Compares each of the corresponding 64-bit values of the 128-bit
+///    integer vectors for equality.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPCMPEQQ / PCMPEQQ </c> instruction.
+///
+/// \param __V1
+///    A 128-bit integer vector.
+/// \param __V2
+///    A 128-bit integer vector.
+/// \returns A 128-bit integer vector containing the comparison results.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cmpeq_epi64(__m128i __V1, __m128i __V2)
 {
@@ -278,6 +1246,19 @@ _mm_cmpeq_epi64(__m128i __V1, __m128i __V2)
 }
 
 /* SSE4 Packed Integer Sign-Extension.  */
+/// \brief Sign-extends each of the lower eight 8-bit integer elements of a
+///    128-bit vector of [16 x i8] to 16-bit values and returns them in a
+///    128-bit vector of [8 x i16]. The upper eight elements of the input vector
+///    are unused.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPMOVSXBW / PMOVSXBW </c> instruction.
+///
+/// \param __V
+///    A 128-bit vector of [16 x i8]. The lower eight 8-bit elements are sign-
+///    extended to 16-bit values.
+/// \returns A 128-bit vector of [8 x i16] containing the sign-extended values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cvtepi8_epi16(__m128i __V)
 {
@@ -286,6 +1267,19 @@ _mm_cvtepi8_epi16(__m128i __V)
   return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8hi);
 }
 
+/// \brief Sign-extends each of the lower four 8-bit integer elements of a
+///    128-bit vector of [16 x i8] to 32-bit values and returns them in a
+///    128-bit vector of [4 x i32]. The upper twelve elements of the input
+///    vector are unused.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPMOVSXBD / PMOVSXBD </c> instruction.
+///
+/// \param __V
+///    A 128-bit vector of [16 x i8]. The lower four 8-bit elements are sign-
+///    extended to 32-bit values.
+/// \returns A 128-bit vector of [4 x i32] containing the sign-extended values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cvtepi8_epi32(__m128i __V)
 {
@@ -294,6 +1288,19 @@ _mm_cvtepi8_epi32(__m128i __V)
   return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3), __v4si);
 }
 
+/// \brief Sign-extends each of the lower two 8-bit integer elements of a
+///    128-bit integer vector of [16 x i8] to 64-bit values and returns them in
+///    a 128-bit vector of [2 x i64]. The upper fourteen elements of the input
+///    vector are unused.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPMOVSXBQ / PMOVSXBQ </c> instruction.
+///
+/// \param __V
+///    A 128-bit vector of [16 x i8]. The lower two 8-bit elements are sign-
+///    extended to 64-bit values.
+/// \returns A 128-bit vector of [2 x i64] containing the sign-extended values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cvtepi8_epi64(__m128i __V)
 {
@@ -302,18 +1309,57 @@ _mm_cvtepi8_epi64(__m128i __V)
   return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1), __v2di);
 }
 
+/// \brief Sign-extends each of the lower four 16-bit integer elements of a
+///    128-bit integer vector of [8 x i16] to 32-bit values and returns them in
+///    a 128-bit vector of [4 x i32]. The upper four elements of the input
+///    vector are unused.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPMOVSXWD / PMOVSXWD </c> instruction.
+///
+/// \param __V
+///    A 128-bit vector of [8 x i16]. The lower four 16-bit elements are sign-
+///    extended to 32-bit values.
+/// \returns A 128-bit vector of [4 x i32] containing the sign-extended values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cvtepi16_epi32(__m128i __V)
 {
   return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1, 2, 3), __v4si);
 }
 
+/// \brief Sign-extends each of the lower two 16-bit integer elements of a
+///    128-bit integer vector of [8 x i16] to 64-bit values and returns them in
+///    a 128-bit vector of [2 x i64]. The upper six elements of the input
+///    vector are unused.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPMOVSXWQ / PMOVSXWQ </c> instruction.
+///
+/// \param __V
+///    A 128-bit vector of [8 x i16]. The lower two 16-bit elements are sign-
+///    extended to 64-bit values.
+/// \returns A 128-bit vector of [2 x i64] containing the sign-extended values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cvtepi16_epi64(__m128i __V)
 {
   return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1), __v2di);
 }
 
+/// \brief Sign-extends each of the lower two 32-bit integer elements of a
+///    128-bit integer vector of [4 x i32] to 64-bit values and returns them in
+///    a 128-bit vector of [2 x i64]. The upper two elements of the input vector
+///    are unused.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPMOVSXDQ / PMOVSXDQ </c> instruction.
+///
+/// \param __V
+///    A 128-bit vector of [4 x i32]. The lower two 32-bit elements are sign-
+///    extended to 64-bit values.
+/// \returns A 128-bit vector of [2 x i64] containing the sign-extended values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cvtepi32_epi64(__m128i __V)
 {
@@ -321,36 +1367,114 @@ _mm_cvtepi32_epi64(__m128i __V)
 }
 
 /* SSE4 Packed Integer Zero-Extension.  */
+/// \brief Zero-extends each of the lower eight 8-bit integer elements of a
+///    128-bit vector of [16 x i8] to 16-bit values and returns them in a
+///    128-bit vector of [8 x i16]. The upper eight elements of the input vector
+///    are unused.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPMOVZXBW / PMOVZXBW </c> instruction.
+///
+/// \param __V
+///    A 128-bit vector of [16 x i8]. The lower eight 8-bit elements are zero-
+///    extended to 16-bit values.
+/// \returns A 128-bit vector of [8 x i16] containing the zero-extended values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cvtepu8_epi16(__m128i __V)
 {
   return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8hi);
 }
 
+/// \brief Zero-extends each of the lower four 8-bit integer elements of a
+///    128-bit vector of [16 x i8] to 32-bit values and returns them in a
+///    128-bit vector of [4 x i32]. The upper twelve elements of the input
+///    vector are unused.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPMOVZXBD / PMOVZXBD </c> instruction.
+///
+/// \param __V
+///    A 128-bit vector of [16 x i8]. The lower four 8-bit elements are zero-
+///    extended to 32-bit values.
+/// \returns A 128-bit vector of [4 x i32] containing the zero-extended values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cvtepu8_epi32(__m128i __V)
 {
   return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3), __v4si);
 }
 
+/// \brief Zero-extends each of the lower two 8-bit integer elements of a
+///    128-bit integer vector of [16 x i8] to 64-bit values and returns them in
+///    a 128-bit vector of [2 x i64]. The upper fourteen elements of the input
+///    vector are unused.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPMOVZXBQ / PMOVZXBQ </c> instruction.
+///
+/// \param __V
+///    A 128-bit vector of [16 x i8]. The lower two 8-bit elements are zero-
+///    extended to 64-bit values.
+/// \returns A 128-bit vector of [2 x i64] containing the zero-extended values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cvtepu8_epi64(__m128i __V)
 {
   return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1), __v2di);
 }
 
+/// \brief Zero-extends each of the lower four 16-bit integer elements of a
+///    128-bit integer vector of [8 x i16] to 32-bit values and returns them in
+///    a 128-bit vector of [4 x i32]. The upper four elements of the input
+///    vector are unused.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPMOVZXWD / PMOVZXWD </c> instruction.
+///
+/// \param __V
+///    A 128-bit vector of [8 x i16]. The lower four 16-bit elements are zero-
+///    extended to 32-bit values.
+/// \returns A 128-bit vector of [4 x i32] containing the zero-extended values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cvtepu16_epi32(__m128i __V)
 {
   return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1, 2, 3), __v4si);
 }
 
+/// \brief Zero-extends each of the lower two 16-bit integer elements of a
+///    128-bit integer vector of [8 x i16] to 64-bit values and returns them in
+///    a 128-bit vector of [2 x i64]. The upper six elements of the input vector
+///    are unused.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPMOVZXWQ / PMOVZXWQ </c> instruction.
+///
+/// \param __V
+///    A 128-bit vector of [8 x i16]. The lower two 16-bit elements are zero-
+///    extended to 64-bit values.
+/// \returns A 128-bit vector of [2 x i64] containing the zero-extended values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cvtepu16_epi64(__m128i __V)
 {
   return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1), __v2di);
 }
 
+/// \brief Zero-extends each of the lower two 32-bit integer elements of a
+///    128-bit integer vector of [4 x i32] to 64-bit values and returns them in
+///    a 128-bit vector of [2 x i64]. The upper two elements of the input vector
+///    are unused.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPMOVZXDQ / PMOVZXDQ </c> instruction.
+///
+/// \param __V
+///    A 128-bit vector of [4 x i32]. The lower two 32-bit elements are zero-
+///    extended to 64-bit values.
+/// \returns A 128-bit vector of [2 x i64] containing the zero-extended values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cvtepu32_epi64(__m128i __V)
 {
@@ -358,6 +1482,28 @@ _mm_cvtepu32_epi64(__m128i __V)
 }
 
 /* SSE4 Pack with Unsigned Saturation.  */
+/// \brief Converts 32-bit signed integers from both 128-bit integer vector
+///    operands into 16-bit unsigned integers, and returns the packed result.
+///    Values greater than 0xFFFF are saturated to 0xFFFF. Values less than
+///    0x0000 are saturated to 0x0000.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPACKUSDW / PACKUSDW </c> instruction.
+///
+/// \param __V1
+///    A 128-bit vector of [4 x i32]. Each 32-bit element is treated as a
+///    signed integer and is converted to a 16-bit unsigned integer with
+///    saturation. Values greater than 0xFFFF are saturated to 0xFFFF. Values
+///    less than 0x0000 are saturated to 0x0000. The converted [4 x i16] values
+///    are written to the lower 64 bits of the result.
+/// \param __V2
+///    A 128-bit vector of [4 x i32]. Each 32-bit element is treated as a
+///    signed integer and is converted to a 16-bit unsigned integer with
+///    saturation. Values greater than 0xFFFF are saturated to 0xFFFF. Values
+///    less than 0x0000 are saturated to 0x0000. The converted [4 x i16] values
+///    are written to the higher 64 bits of the result.
+/// \returns A 128-bit vector of [8 x i16] containing the converted values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_packus_epi32(__m128i __V1, __m128i __V2)
 {
@@ -365,10 +1511,58 @@ _mm_packus_epi32(__m128i __V1, __m128i __V2)
 }
 
 /* SSE4 Multiple Packed Sums of Absolute Difference.  */
+/// \brief Subtracts 8-bit unsigned integer values and computes the absolute
+///    values of the differences to the corresponding bits in the destination.
+///    Then sums of the absolute differences are returned according to the bit
+///    fields in the immediate operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128i _mm_mpsadbw_epu8(__m128i X, __m128i Y, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VMPSADBW / MPSADBW </c> instruction.
+///
+/// \param X
+///    A 128-bit vector of [16 x i8].
+/// \param Y
+///    A 128-bit vector of [16 x i8].
+/// \param M
+///    An 8-bit immediate operand specifying how the absolute differences are to
+///    be calculated, according to the following algorithm:
+///    \code
+///    // M2 represents bit 2 of the immediate operand
+///    // M10 represents bits [1:0] of the immediate operand
+///    i = M2 * 4
+///    j = M10 * 4
+///    for (k = 0; k < 8; k = k + 1) {
+///      d0 = abs(X[i + k + 0] - Y[j + 0])
+///      d1 = abs(X[i + k + 1] - Y[j + 1])
+///      d2 = abs(X[i + k + 2] - Y[j + 2])
+///      d3 = abs(X[i + k + 3] - Y[j + 3])
+///      r[k] = d0 + d1 + d2 + d3
+///    }
+///    \endcode
+/// \returns A 128-bit integer vector containing the sums of the sets of
+///    absolute differences between both operands.
 #define _mm_mpsadbw_epu8(X, Y, M) __extension__ ({ \
   (__m128i) __builtin_ia32_mpsadbw128((__v16qi)(__m128i)(X), \
                                       (__v16qi)(__m128i)(Y), (M)); })
 
+/// \brief Finds the minimum unsigned 16-bit element in the input 128-bit
+///    vector of [8 x u16] and returns it and along with its index.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPHMINPOSUW / PHMINPOSUW </c>
+/// instruction.
+///
+/// \param __V
+///    A 128-bit vector of [8 x u16].
+/// \returns A 128-bit value where bits [15:0] contain the minimum value found
+///    in parameter \a __V, bits [18:16] contain the index of the minimum value
+///    and the remaining bits are set to 0.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_minpos_epu16(__m128i __V)
 {
@@ -410,61 +1604,769 @@ _mm_minpos_epu16(__m128i __V)
 #define _SIDD_UNIT_MASK                 0x40
 
 /* SSE4.2 Packed Comparison Intrinsics.  */
+/// \brief Uses the immediate operand \a M to perform a comparison of string
+///    data with implicitly defined lengths that is contained in source operands
+///    \a A and \a B. Returns a 128-bit integer vector representing the result
+///    mask of the comparison.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128i _mm_cmpistrm(__m128i A, __m128i B, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPCMPISTRM / PCMPISTRM </c>
+/// instruction.
+///
+/// \param A
+///    A 128-bit integer vector containing one of the source operands to be
+///    compared.
+/// \param B
+///    A 128-bit integer vector containing one of the source operands to be
+///    compared.
+/// \param M
+///    An 8-bit immediate operand specifying whether the characters are bytes or
+///    words, the type of comparison to perform, and the format of the return
+///    value. \n
+///    Bits [1:0]: Determine source data format. \n
+///      00: 16 unsigned bytes \n
+///      01: 8 unsigned words \n
+///      10: 16 signed bytes \n
+///      11: 8 signed words \n
+///    Bits [3:2]: Determine comparison type and aggregation method. \n
+///      00: Subset: Each character in \a B is compared for equality with all
+///          the characters in \a A. \n
+///      01: Ranges: Each character in \a B is compared to \a A. The comparison
+///          basis is greater than or equal for even-indexed elements in \a A,
+///          and less than or equal for odd-indexed elements in \a A. \n
+///      10: Match: Compare each pair of corresponding characters in \a A and
+///          \a B for equality. \n
+///      11: Substring: Search \a B for substring matches of \a A. \n
+///    Bits [5:4]: Determine whether to perform a one's complement on the bit
+///                mask of the comparison results. \n
+///      00: No effect. \n
+///      01: Negate the bit mask. \n
+///      10: No effect. \n
+///      11: Negate the bit mask only for bits with an index less than or equal
+///          to the size of \a A or \a B. \n
+///    Bit [6]: Determines whether the result is zero-extended or expanded to 16
+///             bytes. \n
+///      0: The result is zero-extended to 16 bytes. \n
+///      1: The result is expanded to 16 bytes (this expansion is performed by
+///         repeating each bit 8 or 16 times).
+/// \returns Returns a 128-bit integer vector representing the result mask of
+///    the comparison.
 #define _mm_cmpistrm(A, B, M) \
   (__m128i)__builtin_ia32_pcmpistrm128((__v16qi)(__m128i)(A), \
                                        (__v16qi)(__m128i)(B), (int)(M))
+
+/// \brief Uses the immediate operand \a M to perform a comparison of string
+///    data with implicitly defined lengths that is contained in source operands
+///    \a A and \a B. Returns an integer representing the result index of the
+///    comparison.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// int _mm_cmpistri(__m128i A, __m128i B, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
+/// instruction.
+///
+/// \param A
+///    A 128-bit integer vector containing one of the source operands to be
+///    compared.
+/// \param B
+///    A 128-bit integer vector containing one of the source operands to be
+///    compared.
+/// \param M
+///    An 8-bit immediate operand specifying whether the characters are bytes or
+///    words, the type of comparison to perform, and the format of the return
+///    value. \n
+///    Bits [1:0]: Determine source data format. \n
+///      00: 16 unsigned bytes \n
+///      01: 8 unsigned words \n
+///      10: 16 signed bytes \n
+///      11: 8 signed words \n
+///    Bits [3:2]: Determine comparison type and aggregation method. \n
+///      00: Subset: Each character in \a B is compared for equality with all
+///          the characters in \a A. \n
+///      01: Ranges: Each character in \a B is compared to \a A. The comparison
+///          basis is greater than or equal for even-indexed elements in \a A,
+///          and less than or equal for odd-indexed elements in \a A. \n
+///      10: Match: Compare each pair of corresponding characters in \a A and
+///          \a B for equality. \n
+///      11: Substring: Search B for substring matches of \a A. \n
+///    Bits [5:4]: Determine whether to perform a one's complement on the bit
+///                mask of the comparison results. \n
+///      00: No effect. \n
+///      01: Negate the bit mask. \n
+///      10: No effect. \n
+///      11: Negate the bit mask only for bits with an index less than or equal
+///          to the size of \a A or \a B. \n
+///    Bit [6]: Determines whether the index of the lowest set bit or the
+///             highest set bit is returned. \n
+///      0: The index of the least significant set bit. \n
+///      1: The index of the most significant set bit. \n
+/// \returns Returns an integer representing the result index of the comparison.
 #define _mm_cmpistri(A, B, M) \
   (int)__builtin_ia32_pcmpistri128((__v16qi)(__m128i)(A), \
                                    (__v16qi)(__m128i)(B), (int)(M))
 
+/// \brief Uses the immediate operand \a M to perform a comparison of string
+///    data with explicitly defined lengths that is contained in source operands
+///    \a A and \a B. Returns a 128-bit integer vector representing the result
+///    mask of the comparison.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128i _mm_cmpestrm(__m128i A, int LA, __m128i B, int LB, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPCMPESTRM / PCMPESTRM </c>
+/// instruction.
+///
+/// \param A
+///    A 128-bit integer vector containing one of the source operands to be
+///    compared.
+/// \param LA
+///    An integer that specifies the length of the string in \a A.
+/// \param B
+///    A 128-bit integer vector containing one of the source operands to be
+///    compared.
+/// \param LB
+///    An integer that specifies the length of the string in \a B.
+/// \param M
+///    An 8-bit immediate operand specifying whether the characters are bytes or
+///    words, the type of comparison to perform, and the format of the return
+///    value. \n
+///    Bits [1:0]: Determine source data format. \n
+///      00: 16 unsigned bytes \n
+///      01: 8 unsigned words \n
+///      10: 16 signed bytes \n
+///      11: 8 signed words \n
+///    Bits [3:2]: Determine comparison type and aggregation method. \n
+///      00: Subset: Each character in \a B is compared for equality with all
+///          the characters in \a A. \n
+///      01: Ranges: Each character in \a B is compared to \a A. The comparison
+///          basis is greater than or equal for even-indexed elements in \a A,
+///          and less than or equal for odd-indexed elements in \a A. \n
+///      10: Match: Compare each pair of corresponding characters in \a A and
+///          \a B for equality. \n
+///      11: Substring: Search \a B for substring matches of \a A. \n
+///    Bits [5:4]: Determine whether to perform a one's complement on the bit
+///                mask of the comparison results. \n
+///      00: No effect. \n
+///      01: Negate the bit mask. \n
+///      10: No effect. \n
+///      11: Negate the bit mask only for bits with an index less than or equal
+///          to the size of \a A or \a B. \n
+///    Bit [6]: Determines whether the result is zero-extended or expanded to 16
+///             bytes. \n
+///      0: The result is zero-extended to 16 bytes. \n
+///      1: The result is expanded to 16 bytes (this expansion is performed by
+///         repeating each bit 8 or 16 times). \n
+/// \returns Returns a 128-bit integer vector representing the result mask of
+///    the comparison.
 #define _mm_cmpestrm(A, LA, B, LB, M) \
   (__m128i)__builtin_ia32_pcmpestrm128((__v16qi)(__m128i)(A), (int)(LA), \
                                        (__v16qi)(__m128i)(B), (int)(LB), \
                                        (int)(M))
+
+/// \brief Uses the immediate operand \a M to perform a comparison of string
+///    data with explicitly defined lengths that is contained in source operands
+///    \a A and \a B. Returns an integer representing the result index of the
+///    comparison.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// int _mm_cmpestri(__m128i A, int LA, __m128i B, int LB, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
+/// instruction.
+///
+/// \param A
+///    A 128-bit integer vector containing one of the source operands to be
+///    compared.
+/// \param LA
+///    An integer that specifies the length of the string in \a A.
+/// \param B
+///    A 128-bit integer vector containing one of the source operands to be
+///    compared.
+/// \param LB
+///    An integer that specifies the length of the string in \a B.
+/// \param M
+///    An 8-bit immediate operand specifying whether the characters are bytes or
+///    words, the type of comparison to perform, and the format of the return
+///    value. \n
+///    Bits [1:0]: Determine source data format. \n
+///      00: 16 unsigned bytes \n
+///      01: 8 unsigned words \n
+///      10: 16 signed bytes \n
+///      11: 8 signed words \n
+///    Bits [3:2]: Determine comparison type and aggregation method. \n
+///      00: Subset: Each character in \a B is compared for equality with all
+///          the characters in \a A. \n
+///      01: Ranges: Each character in \a B is compared to \a A. The comparison
+///          basis is greater than or equal for even-indexed elements in \a A,
+///          and less than or equal for odd-indexed elements in \a A. \n
+///      10: Match: Compare each pair of corresponding characters in \a A and
+///          \a B for equality. \n
+///      11: Substring: Search B for substring matches of \a A. \n
+///    Bits [5:4]: Determine whether to perform a one's complement on the bit
+///                mask of the comparison results. \n
+///      00: No effect. \n
+///      01: Negate the bit mask. \n
+///      10: No effect. \n
+///      11: Negate the bit mask only for bits with an index less than or equal
+///          to the size of \a A or \a B. \n
+///    Bit [6]: Determines whether the index of the lowest set bit or the
+///             highest set bit is returned. \n
+///      0: The index of the least significant set bit. \n
+///      1: The index of the most significant set bit. \n
+/// \returns Returns an integer representing the result index of the comparison.
 #define _mm_cmpestri(A, LA, B, LB, M) \
   (int)__builtin_ia32_pcmpestri128((__v16qi)(__m128i)(A), (int)(LA), \
                                    (__v16qi)(__m128i)(B), (int)(LB), \
                                    (int)(M))
 
 /* SSE4.2 Packed Comparison Intrinsics and EFlag Reading.  */
+/// \brief Uses the immediate operand \a M to perform a comparison of string
+///    data with implicitly defined lengths that is contained in source operands
+///    \a A and \a B. Returns 1 if the bit mask is zero and the length of the
+///    string in \a B is the maximum, otherwise, returns 0.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// int _mm_cmpistra(__m128i A, __m128i B, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
+/// instruction.
+///
+/// \param A
+///    A 128-bit integer vector containing one of the source operands to be
+///    compared.
+/// \param B
+///    A 128-bit integer vector containing one of the source operands to be
+///    compared.
+/// \param M
+///    An 8-bit immediate operand specifying whether the characters are bytes or
+///    words and the type of comparison to perform. \n
+///    Bits [1:0]: Determine source data format. \n
+///      00: 16 unsigned bytes \n
+///      01: 8 unsigned words \n
+///      10: 16 signed bytes \n
+///      11: 8 signed words \n
+///    Bits [3:2]: Determine comparison type and aggregation method. \n
+///      00: Subset: Each character in \a B is compared for equality with all
+///          the characters in \a A. \n
+///      01: Ranges: Each character in \a B is compared to \a A. The comparison
+///          basis is greater than or equal for even-indexed elements in \a A,
+///          and less than or equal for odd-indexed elements in \a A. \n
+///      10: Match: Compare each pair of corresponding characters in \a A and
+///          \a B for equality. \n
+///      11: Substring: Search \a B for substring matches of \a A. \n
+///    Bits [5:4]: Determine whether to perform a one's complement on the bit
+///                mask of the comparison results. \n
+///      00: No effect. \n
+///      01: Negate the bit mask. \n
+///      10: No effect. \n
+///      11: Negate the bit mask only for bits with an index less than or equal
+///          to the size of \a A or \a B. \n
+/// \returns Returns 1 if the bit mask is zero and the length of the string in
+///    \a B is the maximum; otherwise, returns 0.
 #define _mm_cmpistra(A, B, M) \
   (int)__builtin_ia32_pcmpistria128((__v16qi)(__m128i)(A), \
                                     (__v16qi)(__m128i)(B), (int)(M))
+
+/// \brief Uses the immediate operand \a M to perform a comparison of string
+///    data with implicitly defined lengths that is contained in source operands
+///    \a A and \a B. Returns 1 if the bit mask is non-zero, otherwise, returns
+///    0.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// int _mm_cmpistrc(__m128i A, __m128i B, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
+/// instruction.
+///
+/// \param A
+///    A 128-bit integer vector containing one of the source operands to be
+///    compared.
+/// \param B
+///    A 128-bit integer vector containing one of the source operands to be
+///    compared.
+/// \param M
+///    An 8-bit immediate operand specifying whether the characters are bytes or
+///    words and the type of comparison to perform. \n
+///    Bits [1:0]: Determine source data format. \n
+///      00: 16 unsigned bytes \n
+///      01: 8 unsigned words \n
+///      10: 16 signed bytes \n
+///      11: 8 signed words \n
+///    Bits [3:2]: Determine comparison type and aggregation method. \n
+///      00: Subset: Each character in \a B is compared for equality with all
+///          the characters in \a A. \n
+///      01: Ranges: Each character in \a B is compared to \a A. The comparison
+///          basis is greater than or equal for even-indexed elements in \a A,
+///          and less than or equal for odd-indexed elements in \a A. \n
+///      10: Match: Compare each pair of corresponding characters in \a A and
+///          \a B for equality. \n
+///      11: Substring: Search B for substring matches of \a A. \n
+///    Bits [5:4]: Determine whether to perform a one's complement on the bit
+///                mask of the comparison results. \n
+///      00: No effect. \n
+///      01: Negate the bit mask. \n
+///      10: No effect. \n
+///      11: Negate the bit mask only for bits with an index less than or equal
+///          to the size of \a A or \a B.
+/// \returns Returns 1 if the bit mask is non-zero, otherwise, returns 0.
 #define _mm_cmpistrc(A, B, M) \
   (int)__builtin_ia32_pcmpistric128((__v16qi)(__m128i)(A), \
                                     (__v16qi)(__m128i)(B), (int)(M))
+
+/// \brief Uses the immediate operand \a M to perform a comparison of string
+///    data with implicitly defined lengths that is contained in source operands
+///    \a A and \a B. Returns bit 0 of the resulting bit mask.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// int _mm_cmpistro(__m128i A, __m128i B, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
+/// instruction.
+///
+/// \param A
+///    A 128-bit integer vector containing one of the source operands to be
+///    compared.
+/// \param B
+///    A 128-bit integer vector containing one of the source operands to be
+///    compared.
+/// \param M
+///    An 8-bit immediate operand specifying whether the characters are bytes or
+///    words and the type of comparison to perform. \n
+///    Bits [1:0]: Determine source data format. \n
+///      00: 16 unsigned bytes \n
+///      01: 8 unsigned words \n
+///      10: 16 signed bytes \n
+///      11: 8 signed words \n
+///    Bits [3:2]: Determine comparison type and aggregation method. \n
+///      00: Subset: Each character in \a B is compared for equality with all
+///          the characters in \a A. \n
+///      01: Ranges: Each character in \a B is compared to \a A. The comparison
+///          basis is greater than or equal for even-indexed elements in \a A,
+///          and less than or equal for odd-indexed elements in \a A. \n
+///      10: Match: Compare each pair of corresponding characters in \a A and
+///          \a B for equality. \n
+///      11: Substring: Search B for substring matches of \a A. \n
+///    Bits [5:4]: Determine whether to perform a one's complement on the bit
+///                mask of the comparison results. \n
+///      00: No effect. \n
+///      01: Negate the bit mask. \n
+///      10: No effect. \n
+///      11: Negate the bit mask only for bits with an index less than or equal
+///          to the size of \a A or \a B. \n
+/// \returns Returns bit 0 of the resulting bit mask.
 #define _mm_cmpistro(A, B, M) \
   (int)__builtin_ia32_pcmpistrio128((__v16qi)(__m128i)(A), \
                                     (__v16qi)(__m128i)(B), (int)(M))
+
+/// \brief Uses the immediate operand \a M to perform a comparison of string
+///    data with implicitly defined lengths that is contained in source operands
+///    \a A and \a B. Returns 1 if the length of the string in \a A is less than
+///    the maximum, otherwise, returns 0.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// int _mm_cmpistrs(__m128i A, __m128i B, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
+/// instruction.
+///
+/// \param A
+///    A 128-bit integer vector containing one of the source operands to be
+///    compared.
+/// \param B
+///    A 128-bit integer vector containing one of the source operands to be
+///    compared.
+/// \param M
+///    An 8-bit immediate operand specifying whether the characters are bytes or
+///    words and the type of comparison to perform. \n
+///    Bits [1:0]: Determine source data format. \n
+///      00: 16 unsigned bytes \n
+///      01: 8 unsigned words \n
+///      10: 16 signed bytes \n
+///      11: 8 signed words \n
+///    Bits [3:2]: Determine comparison type and aggregation method. \n
+///      00: Subset: Each character in \a B is compared for equality with all
+///          the characters in \a A. \n
+///      01: Ranges: Each character in \a B is compared to \a A. The comparison
+///          basis is greater than or equal for even-indexed elements in \a A,
+///          and less than or equal for odd-indexed elements in \a A. \n
+///      10: Match: Compare each pair of corresponding characters in \a A and
+///          \a B for equality. \n
+///      11: Substring: Search \a B for substring matches of \a A. \n
+///    Bits [5:4]: Determine whether to perform a one's complement on the bit
+///                mask of the comparison results. \n
+///      00: No effect. \n
+///      01: Negate the bit mask. \n
+///      10: No effect. \n
+///      11: Negate the bit mask only for bits with an index less than or equal
+///          to the size of \a A or \a B. \n
+/// \returns Returns 1 if the length of the string in \a A is less than the
+///    maximum, otherwise, returns 0.
 #define _mm_cmpistrs(A, B, M) \
   (int)__builtin_ia32_pcmpistris128((__v16qi)(__m128i)(A), \
                                     (__v16qi)(__m128i)(B), (int)(M))
+
+/// \brief Uses the immediate operand \a M to perform a comparison of string
+///    data with implicitly defined lengths that is contained in source operands
+///    \a A and \a B. Returns 1 if the length of the string in \a B is less than
+///    the maximum, otherwise, returns 0.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// int _mm_cmpistrz(__m128i A, __m128i B, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
+/// instruction.
+///
+/// \param A
+///    A 128-bit integer vector containing one of the source operands to be
+///    compared.
+/// \param B
+///    A 128-bit integer vector containing one of the source operands to be
+///    compared.
+/// \param M
+///    An 8-bit immediate operand specifying whether the characters are bytes or
+///    words and the type of comparison to perform. \n
+///    Bits [1:0]: Determine source data format. \n
+///      00: 16 unsigned bytes \n
+///      01: 8 unsigned words \n
+///      10: 16 signed bytes \n
+///      11: 8 signed words \n
+///    Bits [3:2]: Determine comparison type and aggregation method. \n
+///      00: Subset: Each character in \a B is compared for equality with all
+///          the characters in \a A. \n
+///      01: Ranges: Each character in \a B is compared to \a A. The comparison
+///          basis is greater than or equal for even-indexed elements in \a A,
+///          and less than or equal for odd-indexed elements in \a A. \n
+///      10: Match: Compare each pair of corresponding characters in \a A and
+///          \a B for equality. \n
+///      11: Substring: Search \a B for substring matches of \a A. \n
+///    Bits [5:4]: Determine whether to perform a one's complement on the bit
+///                mask of the comparison results. \n
+///      00: No effect. \n
+///      01: Negate the bit mask. \n
+///      10: No effect. \n
+///      11: Negate the bit mask only for bits with an index less than or equal
+///          to the size of \a A or \a B.
+/// \returns Returns 1 if the length of the string in \a B is less than the
+///    maximum, otherwise, returns 0.
 #define _mm_cmpistrz(A, B, M) \
   (int)__builtin_ia32_pcmpistriz128((__v16qi)(__m128i)(A), \
                                     (__v16qi)(__m128i)(B), (int)(M))
 
+/// \brief Uses the immediate operand \a M to perform a comparison of string
+///    data with explicitly defined lengths that is contained in source operands
+///    \a A and \a B. Returns 1 if the bit mask is zero and the length of the
+///    string in \a B is the maximum, otherwise, returns 0.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// int _mm_cmpestra(__m128i A, int LA, __m128i B, int LB, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
+/// instruction.
+///
+/// \param A
+///    A 128-bit integer vector containing one of the source operands to be
+///    compared.
+/// \param LA
+///    An integer that specifies the length of the string in \a A.
+/// \param B
+///    A 128-bit integer vector containing one of the source operands to be
+///    compared.
+/// \param LB
+///    An integer that specifies the length of the string in \a B.
+/// \param M
+///    An 8-bit immediate operand specifying whether the characters are bytes or
+///    words and the type of comparison to perform. \n
+///    Bits [1:0]: Determine source data format. \n
+///      00: 16 unsigned bytes \n
+///      01: 8 unsigned words \n
+///      10: 16 signed bytes \n
+///      11: 8 signed words \n
+///    Bits [3:2]: Determine comparison type and aggregation method. \n
+///      00: Subset: Each character in \a B is compared for equality with all
+///          the characters in \a A. \n
+///      01: Ranges: Each character in \a B is compared to \a A. The comparison
+///          basis is greater than or equal for even-indexed elements in \a A,
+///          and less than or equal for odd-indexed elements in \a A. \n
+///      10: Match: Compare each pair of corresponding characters in \a A and
+///          \a B for equality. \n
+///      11: Substring: Search \a B for substring matches of \a A. \n
+///    Bits [5:4]: Determine whether to perform a one's complement on the bit
+///                mask of the comparison results. \n
+///      00: No effect. \n
+///      01: Negate the bit mask. \n
+///      10: No effect. \n
+///      11: Negate the bit mask only for bits with an index less than or equal
+///          to the size of \a A or \a B.
+/// \returns Returns 1 if the bit mask is zero and the length of the string in
+///    \a B is the maximum, otherwise, returns 0.
 #define _mm_cmpestra(A, LA, B, LB, M) \
   (int)__builtin_ia32_pcmpestria128((__v16qi)(__m128i)(A), (int)(LA), \
                                     (__v16qi)(__m128i)(B), (int)(LB), \
                                     (int)(M))
+
+/// \brief Uses the immediate operand \a M to perform a comparison of string
+///    data with explicitly defined lengths that is contained in source operands
+///    \a A and \a B. Returns 1 if the resulting mask is non-zero, otherwise,
+///    returns 0.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// int _mm_cmpestrc(__m128i A, int LA, __m128i B, int LB, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
+/// instruction.
+///
+/// \param A
+///    A 128-bit integer vector containing one of the source operands to be
+///    compared.
+/// \param LA
+///    An integer that specifies the length of the string in \a A.
+/// \param B
+///    A 128-bit integer vector containing one of the source operands to be
+///    compared.
+/// \param LB
+///    An integer that specifies the length of the string in \a B.
+/// \param M
+///    An 8-bit immediate operand specifying whether the characters are bytes or
+///    words and the type of comparison to perform. \n
+///    Bits [1:0]: Determine source data format. \n
+///      00: 16 unsigned bytes \n
+///      01: 8 unsigned words \n
+///      10: 16 signed bytes \n
+///      11: 8 signed words \n
+///    Bits [3:2]: Determine comparison type and aggregation method. \n
+///      00: Subset: Each character in \a B is compared for equality with all
+///          the characters in \a A. \n
+///      01: Ranges: Each character in \a B is compared to \a A. The comparison
+///          basis is greater than or equal for even-indexed elements in \a A,
+///          and less than or equal for odd-indexed elements in \a A. \n
+///      10: Match: Compare each pair of corresponding characters in \a A and
+///          \a B for equality. \n
+///      11: Substring: Search \a B for substring matches of \a A. \n
+///    Bits [5:4]: Determine whether to perform a one's complement on the bit
+///                mask of the comparison results. \n
+///      00: No effect. \n
+///      01: Negate the bit mask. \n
+///      10: No effect. \n
+///      11: Negate the bit mask only for bits with an index less than or equal
+///          to the size of \a A or \a B. \n
+/// \returns Returns 1 if the resulting mask is non-zero, otherwise, returns 0.
 #define _mm_cmpestrc(A, LA, B, LB, M) \
   (int)__builtin_ia32_pcmpestric128((__v16qi)(__m128i)(A), (int)(LA), \
                                     (__v16qi)(__m128i)(B), (int)(LB), \
                                     (int)(M))
+
+/// \brief Uses the immediate operand \a M to perform a comparison of string
+///    data with explicitly defined lengths that is contained in source operands
+///    \a A and \a B. Returns bit 0 of the resulting bit mask.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// int _mm_cmpestro(__m128i A, int LA, __m128i B, int LB, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
+/// instruction.
+///
+/// \param A
+///    A 128-bit integer vector containing one of the source operands to be
+///    compared.
+/// \param LA
+///    An integer that specifies the length of the string in \a A.
+/// \param B
+///    A 128-bit integer vector containing one of the source operands to be
+///    compared.
+/// \param LB
+///    An integer that specifies the length of the string in \a B.
+/// \param M
+///    An 8-bit immediate operand specifying whether the characters are bytes or
+///    words and the type of comparison to perform. \n
+///    Bits [1:0]: Determine source data format. \n
+///      00: 16 unsigned bytes \n
+///      01: 8 unsigned words \n
+///      10: 16 signed bytes \n
+///      11: 8 signed words \n
+///    Bits [3:2]: Determine comparison type and aggregation method. \n
+///      00: Subset: Each character in \a B is compared for equality with all
+///          the characters in \a A. \n
+///      01: Ranges: Each character in \a B is compared to \a A. The comparison
+///          basis is greater than or equal for even-indexed elements in \a A,
+///          and less than or equal for odd-indexed elements in \a A. \n
+///      10: Match: Compare each pair of corresponding characters in \a A and
+///          \a B for equality. \n
+///      11: Substring: Search \a B for substring matches of \a A. \n
+///    Bits [5:4]: Determine whether to perform a one's complement on the bit
+///                mask of the comparison results. \n
+///      00: No effect. \n
+///      01: Negate the bit mask. \n
+///      10: No effect. \n
+///      11: Negate the bit mask only for bits with an index less than or equal
+///          to the size of \a A or \a B.
+/// \returns Returns bit 0 of the resulting bit mask.
 #define _mm_cmpestro(A, LA, B, LB, M) \
   (int)__builtin_ia32_pcmpestrio128((__v16qi)(__m128i)(A), (int)(LA), \
                                     (__v16qi)(__m128i)(B), (int)(LB), \
                                     (int)(M))
+
+/// \brief Uses the immediate operand \a M to perform a comparison of string
+///    data with explicitly defined lengths that is contained in source operands
+///    \a A and \a B. Returns 1 if the length of the string in \a A is less than
+///    the maximum, otherwise, returns 0.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// int _mm_cmpestrs(__m128i A, int LA, __m128i B, int LB, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
+/// instruction.
+///
+/// \param A
+///    A 128-bit integer vector containing one of the source operands to be
+///    compared.
+/// \param LA
+///    An integer that specifies the length of the string in \a A.
+/// \param B
+///    A 128-bit integer vector containing one of the source operands to be
+///    compared.
+/// \param LB
+///    An integer that specifies the length of the string in \a B.
+/// \param M
+///    An 8-bit immediate operand specifying whether the characters are bytes or
+///    words and the type of comparison to perform. \n
+///    Bits [1:0]: Determine source data format. \n
+///      00: 16 unsigned bytes \n
+///      01: 8 unsigned words \n
+///      10: 16 signed bytes \n
+///      11: 8 signed words \n
+///    Bits [3:2]: Determine comparison type and aggregation method. \n
+///      00: Subset: Each character in \a B is compared for equality with all
+///          the characters in \a A. \n
+///      01: Ranges: Each character in \a B is compared to \a A. The comparison
+///          basis is greater than or equal for even-indexed elements in \a A,
+///          and less than or equal for odd-indexed elements in \a A. \n
+///      10: Match: Compare each pair of corresponding characters in \a A and
+///          \a B for equality. \n
+///      11: Substring: Search \a B for substring matches of \a A. \n
+///    Bits [5:4]: Determine whether to perform a one's complement in the bit
+///                mask of the comparison results. \n
+///      00: No effect. \n
+///      01: Negate the bit mask. \n
+///      10: No effect. \n
+///      11: Negate the bit mask only for bits with an index less than or equal
+///          to the size of \a A or \a B. \n
+/// \returns Returns 1 if the length of the string in \a A is less than the
+///    maximum, otherwise, returns 0.
 #define _mm_cmpestrs(A, LA, B, LB, M) \
   (int)__builtin_ia32_pcmpestris128((__v16qi)(__m128i)(A), (int)(LA), \
                                     (__v16qi)(__m128i)(B), (int)(LB), \
                                     (int)(M))
+
+/// \brief Uses the immediate operand \a M to perform a comparison of string
+///    data with explicitly defined lengths that is contained in source operands
+///    \a A and \a B. Returns 1 if the length of the string in \a B is less than
+///    the maximum, otherwise, returns 0.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// int _mm_cmpestrz(__m128i A, int LA, __m128i B, int LB, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPCMPESTRI </c> instruction.
+///
+/// \param A
+///    A 128-bit integer vector containing one of the source operands to be
+///    compared.
+/// \param LA
+///    An integer that specifies the length of the string in \a A.
+/// \param B
+///    A 128-bit integer vector containing one of the source operands to be
+///    compared.
+/// \param LB
+///    An integer that specifies the length of the string in \a B.
+/// \param M
+///    An 8-bit immediate operand specifying whether the characters are bytes or
+///    words and the type of comparison to perform. \n
+///    Bits [1:0]: Determine source data format. \n
+///      00: 16 unsigned bytes  \n
+///      01: 8 unsigned words \n
+///      10: 16 signed bytes \n
+///      11: 8 signed words \n
+///    Bits [3:2]: Determine comparison type and aggregation method. \n
+///      00: Subset: Each character in \a B is compared for equality with all
+///          the characters in \a A. \n
+///      01: Ranges: Each character in \a B is compared to \a A. The comparison
+///          basis is greater than or equal for even-indexed elements in \a A,
+///          and less than or equal for odd-indexed elements in \a A. \n
+///      10: Match: Compare each pair of corresponding characters in \a A and
+///          \a B for equality. \n
+///      11: Substring: Search \a B for substring matches of \a A. \n
+///    Bits [5:4]: Determine whether to perform a one's complement on the bit
+///                mask of the comparison results. \n
+///      00: No effect. \n
+///      01: Negate the bit mask. \n
+///      10: No effect. \n
+///      11: Negate the bit mask only for bits with an index less than or equal
+///          to the size of \a A or \a B.
+/// \returns Returns 1 if the length of the string in \a B is less than the
+///    maximum, otherwise, returns 0.
 #define _mm_cmpestrz(A, LA, B, LB, M) \
   (int)__builtin_ia32_pcmpestriz128((__v16qi)(__m128i)(A), (int)(LA), \
                                     (__v16qi)(__m128i)(B), (int)(LB), \
                                     (int)(M))
 
 /* SSE4.2 Compare Packed Data -- Greater Than.  */
+/// \brief Compares each of the corresponding 64-bit values of the 128-bit
+///    integer vectors to determine if the values in the first operand are
+///    greater than those in the second operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPCMPGTQ / PCMPGTQ </c> instruction.
+///
+/// \param __V1
+///    A 128-bit integer vector.
+/// \param __V2
+///    A 128-bit integer vector.
+/// \returns A 128-bit integer vector containing the comparison results.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cmpgt_epi64(__m128i __V1, __m128i __V2)
 {
@@ -472,18 +2374,60 @@ _mm_cmpgt_epi64(__m128i __V1, __m128i __V2)
 }
 
 /* SSE4.2 Accumulate CRC32.  */
+/// \brief Adds the unsigned integer operand to the CRC-32C checksum of the
+///    unsigned char operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> CRC32B </c> instruction.
+///
+/// \param __C
+///    An unsigned integer operand to add to the CRC-32C checksum of operand
+///    \a  __D.
+/// \param __D
+///    An unsigned 8-bit integer operand used to compute the CRC-32C checksum.
+/// \returns The result of adding operand \a __C to the CRC-32C checksum of
+///    operand \a __D.
 static __inline__ unsigned int __DEFAULT_FN_ATTRS
 _mm_crc32_u8(unsigned int __C, unsigned char __D)
 {
   return __builtin_ia32_crc32qi(__C, __D);
 }
 
+/// \brief Adds the unsigned integer operand to the CRC-32C checksum of the
+///    unsigned short operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> CRC32W </c> instruction.
+///
+/// \param __C
+///    An unsigned integer operand to add to the CRC-32C checksum of operand
+///    \a __D.
+/// \param __D
+///    An unsigned 16-bit integer operand used to compute the CRC-32C checksum.
+/// \returns The result of adding operand \a __C to the CRC-32C checksum of
+///    operand \a __D.
 static __inline__ unsigned int __DEFAULT_FN_ATTRS
 _mm_crc32_u16(unsigned int __C, unsigned short __D)
 {
   return __builtin_ia32_crc32hi(__C, __D);
 }
 
+/// \brief Adds the first unsigned integer operand to the CRC-32C checksum of
+///    the second unsigned integer operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> CRC32L </c> instruction.
+///
+/// \param __C
+///    An unsigned integer operand to add to the CRC-32C checksum of operand
+///    \a __D.
+/// \param __D
+///    An unsigned 32-bit integer operand used to compute the CRC-32C checksum.
+/// \returns The result of adding operand \a __C to the CRC-32C checksum of
+///    operand \a __D.
 static __inline__ unsigned int __DEFAULT_FN_ATTRS
 _mm_crc32_u32(unsigned int __C, unsigned int __D)
 {
@@ -491,6 +2435,20 @@ _mm_crc32_u32(unsigned int __C, unsigned int __D)
 }
 
 #ifdef __x86_64__
+/// \brief Adds the unsigned integer operand to the CRC-32C checksum of the
+///    unsigned 64-bit integer operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> CRC32Q </c> instruction.
+///
+/// \param __C
+///    An unsigned integer operand to add to the CRC-32C checksum of operand
+///    \a __D.
+/// \param __D
+///    An unsigned 64-bit integer operand used to compute the CRC-32C checksum.
+/// \returns The result of adding operand \a __C to the CRC-32C checksum of
+///    operand \a __D.
 static __inline__ unsigned long long __DEFAULT_FN_ATTRS
 _mm_crc32_u64(unsigned long long __C, unsigned long long __D)
 {

@@ -43,14 +43,12 @@ typedef __builtin_va_list va_list;
 #define va_copy(dest, src)  __builtin_va_copy(dest, src)
 #endif
 
-/* Hack required to make standard headers work, at least on Ubuntu */
 #ifndef __GNUC_VA_LIST
 #define __GNUC_VA_LIST 1
-#endif
 typedef __builtin_va_list __gnuc_va_list;
-
 /* zig: added because glibc stdio.h was duplicately defining va_list
  */
 #define _VA_LIST_DEFINED
+#endif
 
 #endif /* __STDARG_H */

@@ -40,16 +40,16 @@ extern "C" {
 
 /* 7.17.1 Introduction */
 
-#define ATOMIC_BOOL_LOCK_FREE       __GCC_ATOMIC_BOOL_LOCK_FREE
-#define ATOMIC_CHAR_LOCK_FREE       __GCC_ATOMIC_CHAR_LOCK_FREE
-#define ATOMIC_CHAR16_T_LOCK_FREE   __GCC_ATOMIC_CHAR16_T_LOCK_FREE
-#define ATOMIC_CHAR32_T_LOCK_FREE   __GCC_ATOMIC_CHAR32_T_LOCK_FREE
-#define ATOMIC_WCHAR_T_LOCK_FREE    __GCC_ATOMIC_WCHAR_T_LOCK_FREE
-#define ATOMIC_SHORT_LOCK_FREE      __GCC_ATOMIC_SHORT_LOCK_FREE
-#define ATOMIC_INT_LOCK_FREE        __GCC_ATOMIC_INT_LOCK_FREE
-#define ATOMIC_LONG_LOCK_FREE       __GCC_ATOMIC_LONG_LOCK_FREE
-#define ATOMIC_LLONG_LOCK_FREE      __GCC_ATOMIC_LLONG_LOCK_FREE
-#define ATOMIC_POINTER_LOCK_FREE    __GCC_ATOMIC_POINTER_LOCK_FREE
+#define ATOMIC_BOOL_LOCK_FREE       __CLANG_ATOMIC_BOOL_LOCK_FREE
+#define ATOMIC_CHAR_LOCK_FREE       __CLANG_ATOMIC_CHAR_LOCK_FREE
+#define ATOMIC_CHAR16_T_LOCK_FREE   __CLANG_ATOMIC_CHAR16_T_LOCK_FREE
+#define ATOMIC_CHAR32_T_LOCK_FREE   __CLANG_ATOMIC_CHAR32_T_LOCK_FREE
+#define ATOMIC_WCHAR_T_LOCK_FREE    __CLANG_ATOMIC_WCHAR_T_LOCK_FREE
+#define ATOMIC_SHORT_LOCK_FREE      __CLANG_ATOMIC_SHORT_LOCK_FREE
+#define ATOMIC_INT_LOCK_FREE        __CLANG_ATOMIC_INT_LOCK_FREE
+#define ATOMIC_LONG_LOCK_FREE       __CLANG_ATOMIC_LONG_LOCK_FREE
+#define ATOMIC_LLONG_LOCK_FREE      __CLANG_ATOMIC_LLONG_LOCK_FREE
+#define ATOMIC_POINTER_LOCK_FREE    __CLANG_ATOMIC_POINTER_LOCK_FREE
 
 /* 7.17.2 Initialization */

@@ -255,19 +255,16 @@ typedef __uint_least8_t uint_fast8_t;
  */
 #define __stdint_join3(a,b,c) a ## b ## c
 
-#define  __intn_t(n) __stdint_join3( int, n, _t)
-#define __uintn_t(n) __stdint_join3(uint, n, _t)
-
 #ifndef _INTPTR_T
 #ifndef __intptr_t_defined
-typedef  __intn_t(__INTPTR_WIDTH__)  intptr_t;
+typedef __INTPTR_TYPE__ intptr_t;
 #define __intptr_t_defined
 #define _INTPTR_T
 #endif
 #endif
 
 #ifndef _UINTPTR_T
-typedef __uintn_t(__INTPTR_WIDTH__) uintptr_t;
+typedef __UINTPTR_TYPE__ uintptr_t;
 #define _UINTPTR_T
 #endif
 
@@ -659,12 +656,12 @@ typedef __UINTMAX_TYPE__ uintmax_t;
 /* C99 7.18.2.4 Limits of integer types capable of holding object pointers. */
 /* C99 7.18.3 Limits of other integer types. */
 
-#define  INTPTR_MIN  __INTN_MIN(__INTPTR_WIDTH__)
-#define  INTPTR_MAX  __INTN_MAX(__INTPTR_WIDTH__)
-#define UINTPTR_MAX __UINTN_MAX(__INTPTR_WIDTH__)
-#define PTRDIFF_MIN  __INTN_MIN(__PTRDIFF_WIDTH__)
-#define PTRDIFF_MAX  __INTN_MAX(__PTRDIFF_WIDTH__)
-#define    SIZE_MAX __UINTN_MAX(__SIZE_WIDTH__)
+#define  INTPTR_MIN  (-__INTPTR_MAX__-1)
+#define  INTPTR_MAX    __INTPTR_MAX__
+#define UINTPTR_MAX   __UINTPTR_MAX__
+#define PTRDIFF_MIN (-__PTRDIFF_MAX__-1)
+#define PTRDIFF_MAX   __PTRDIFF_MAX__
+#define    SIZE_MAX      __SIZE_MAX__
 
 /* ISO9899:2011 7.20 (C11 Annex K): Define RSIZE_MAX if __STDC_WANT_LIB_EXT1__
  * is enabled. */
@@ -673,9 +670,9 @@ typedef __UINTMAX_TYPE__ uintmax_t;
 #endif
 
 /* C99 7.18.2.5 Limits of greatest-width integer types. */
-#define INTMAX_MIN   __INTN_MIN(__INTMAX_WIDTH__)
-#define INTMAX_MAX   __INTN_MAX(__INTMAX_WIDTH__)
-#define UINTMAX_MAX __UINTN_MAX(__INTMAX_WIDTH__)
+#define  INTMAX_MIN (-__INTMAX_MAX__-1)
+#define  INTMAX_MAX   __INTMAX_MAX__
+#define UINTMAX_MAX  __UINTMAX_MAX__
 
 /* C99 7.18.3 Limits of other integer types. */
 #define SIG_ATOMIC_MIN __INTN_MIN(__SIG_ATOMIC_WIDTH__)
@@ -700,8 +697,8 @@ typedef __UINTMAX_TYPE__ uintmax_t;
 #endif
 
 /* 7.18.4.2 Macros for greatest-width integer constants. */
-#define INTMAX_C(v)   __INTN_C(__INTMAX_WIDTH__, v)
-#define UINTMAX_C(v) __UINTN_C(__INTMAX_WIDTH__, v)
+#define  INTMAX_C(v) __int_c(v,  __INTMAX_C_SUFFIX__)
+#define UINTMAX_C(v) __int_c(v, __UINTMAX_C_SUFFIX__)
 
 #endif /* __STDC_HOSTED__ */
 #endif /* __CLANG_STDINT_H */

@@ -22,12 +22,21 @@
  *
 \*===----------------------------------------------------------------------===*/
 
-#ifndef __TGMATH_H
-#define __TGMATH_H
+#ifndef __CLANG_TGMATH_H
+#define __CLANG_TGMATH_H
 
 /* C99 7.22 Type-generic math <tgmath.h>. */
 #include <math.h>
 
+/*
+ * Allow additional definitions and implementation-defined values on Apple
+ * platforms. This is done after #include <math.h> to avoid depcycle conflicts
+ * between libcxx and darwin in C++ modules builds.
+ */
+#if defined(__APPLE__) && __STDC_HOSTED__ && __has_include_next(<tgmath.h>)
+#  include_next <tgmath.h>
+#else
+
 /* C++ handles type genericity with overloading in math.h. */
 #ifndef __cplusplus
 #include <complex.h>
@@ -1371,4 +1380,5 @@ static long double
 #undef _TG_ATTRS
 
 #endif /* __cplusplus */
-#endif /* __TGMATH_H */
+#endif /* __has_include_next */
+#endif /* __CLANG_TGMATH_H */

@@ -469,10 +469,11 @@ _mm_hsubs_pi16(__m64 __a, __m64 __b)
 ///    values contained in the first source operand and packed 8-bit signed
 ///    integer values contained in the second source operand, adds pairs of
 ///    contiguous products with signed saturation, and writes the 16-bit sums to
-///    the corresponding bits in the destination. For example, bits [7:0] of
-///    both operands are multiplied, bits [15:8] of both operands are
-///    multiplied, and the sum of both results is written to bits [15:0] of the
-///    destination.
+///    the corresponding bits in the destination.
+///
+///    For example, bits [7:0] of both operands are multiplied, bits [15:8] of
+///    both operands are multiplied, and the sum of both results is written to
+///    bits [15:0] of the destination.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -502,10 +503,11 @@ _mm_maddubs_epi16(__m128i __a, __m128i __b)
 ///    values contained in the first source operand and packed 8-bit signed
 ///    integer values contained in the second source operand, adds pairs of
 ///    contiguous products with signed saturation, and writes the 16-bit sums to
-///    the corresponding bits in the destination. For example, bits [7:0] of
-///    both operands are multiplied, bits [15:8] of both operands are
-///    multiplied, and the sum of both results is written to bits [15:0] of the
-///    destination.
+///    the corresponding bits in the destination.
+///
+///    For example, bits [7:0] of both operands are multiplied, bits [15:8] of
+///    both operands are multiplied, and the sum of both results is written to
+///    bits [15:0] of the destination.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -619,13 +621,14 @@ _mm_shuffle_pi8(__m64 __a, __m64 __b)
 }
 
 /// \brief For each 8-bit integer in the first source operand, perform one of
-///    the following actions as specified by the second source operand: If the
-///    byte in the second source is negative, calculate the two's complement of
-///    the corresponding byte in the first source, and write that value to the
-///    destination. If the byte in the second source is positive, copy the
-///    corresponding byte from the first source to the destination. If the byte
-///    in the second source is zero, clear the corresponding byte in the
-///    destination.
+///    the following actions as specified by the second source operand.
+///
+///    If the byte in the second source is negative, calculate the two's
+///    complement of the corresponding byte in the first source, and write that
+///    value to the destination. If the byte in the second source is positive,
+///    copy the corresponding byte from the first source to the destination. If
+///    the byte in the second source is zero, clear the corresponding byte in
+///    the destination.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -644,13 +647,14 @@ _mm_sign_epi8(__m128i __a, __m128i __b)
 }
 
 /// \brief For each 16-bit integer in the first source operand, perform one of
-///    the following actions as specified by the second source operand: If the
-///    word in the second source is negative, calculate the two's complement of
-///    the corresponding word in the first source, and write that value to the
-///    destination. If the word in the second source is positive, copy the
-///    corresponding word from the first source to the destination. If the word
-///    in the second source is zero, clear the corresponding word in the
-///    destination.
+///    the following actions as specified by the second source operand.
+///
+///    If the word in the second source is negative, calculate the two's
+///    complement of the corresponding word in the first source, and write that
+///    value to the destination. If the word in the second source is positive,
+///    copy the corresponding word from the first source to the destination. If
+///    the word in the second source is zero, clear the corresponding word in
+///    the destination.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -669,8 +673,9 @@ _mm_sign_epi16(__m128i __a, __m128i __b)
 }
 
 /// \brief For each 32-bit integer in the first source operand, perform one of
-///    the following actions as specified by the second source operand: If the
-///    doubleword in the second source is negative, calculate the two's
+///    the following actions as specified by the second source operand.
+///
+///    If the doubleword in the second source is negative, calculate the two's
 ///    complement of the corresponding word in the first source, and write that
 ///    value to the destination. If the doubleword in the second source is
 ///    positive, copy the corresponding word from the first source to the
@@ -694,13 +699,14 @@ _mm_sign_epi32(__m128i __a, __m128i __b)
 }
 
 /// \brief For each 8-bit integer in the first source operand, perform one of
-///    the following actions as specified by the second source operand: If the
-///    byte in the second source is negative, calculate the two's complement of
-///    the corresponding byte in the first source, and write that value to the
-///    destination. If the byte in the second source is positive, copy the
-///    corresponding byte from the first source to the destination. If the byte
-///    in the second source is zero, clear the corresponding byte in the
-///    destination.
+///    the following actions as specified by the second source operand.
+///
+///    If the byte in the second source is negative, calculate the two's
+///    complement of the corresponding byte in the first source, and write that
+///    value to the destination. If the byte in the second source is positive,
+///    copy the corresponding byte from the first source to the destination. If
+///    the byte in the second source is zero, clear the corresponding byte in
+///    the destination.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -719,13 +725,14 @@ _mm_sign_pi8(__m64 __a, __m64 __b)
 }
 
 /// \brief For each 16-bit integer in the first source operand, perform one of
-///    the following actions as specified by the second source operand: If the
-///    word in the second source is negative, calculate the two's complement of
-///    the corresponding word in the first source, and write that value to the
-///    destination. If the word in the second source is positive, copy the
-///    corresponding word from the first source to the destination. If the word
-///    in the second source is zero, clear the corresponding word in the
-///    destination.
+///    the following actions as specified by the second source operand.
+///
+///    If the word in the second source is negative, calculate the two's
+///    complement of the corresponding word in the first source, and write that
+///    value to the destination. If the word in the second source is positive,
+///    copy the corresponding word from the first source to the destination. If
+///    the word in the second source is zero, clear the corresponding word in
+///    the destination.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -744,8 +751,9 @@ _mm_sign_pi16(__m64 __a, __m64 __b)
 }
 
 /// \brief For each 32-bit integer in the first source operand, perform one of
-///    the following actions as specified by the second source operand: If the
-///    doubleword in the second source is negative, calculate the two's
+///    the following actions as specified by the second source operand.
+///
+///    If the doubleword in the second source is negative, calculate the two's
 ///    complement of the corresponding doubleword in the first source, and
 ///    write that value to the destination. If the doubleword in the second
 ///    source is positive, copy the corresponding doubleword from the first

@@ -116,6 +116,13 @@ vec_extract(vector unsigned long long __vec, int __index) {
   return __vec[__index & 1];
 }
 
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai float
+vec_extract(vector float __vec, int __index) {
+  return __vec[__index & 3];
+}
+#endif
+
 static inline __ATTRS_o_ai double
 vec_extract(vector double __vec, int __index) {
   return __vec[__index & 1];
@@ -129,6 +136,7 @@ vec_insert(signed char __scalar, vector signed char __vec, int __index) {
   return __vec;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned char
 vec_insert(unsigned char __scalar, vector bool char __vec, int __index) {
   vector unsigned char __newvec = (vector unsigned char)__vec;
@@ -148,6 +156,7 @@ vec_insert(signed short __scalar, vector signed short __vec, int __index) {
   return __vec;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned short
 vec_insert(unsigned short __scalar, vector bool short __vec, int __index) {
   vector unsigned short __newvec = (vector unsigned short)__vec;
@@ -167,6 +176,7 @@ vec_insert(signed int __scalar, vector signed int __vec, int __index) {
   return __vec;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned int
 vec_insert(unsigned int __scalar, vector bool int __vec, int __index) {
   vector unsigned int __newvec = (vector unsigned int)__vec;
@@ -187,6 +197,7 @@ vec_insert(signed long long __scalar, vector signed long long __vec,
   return __vec;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned long long
 vec_insert(unsigned long long __scalar, vector bool long long __vec,
            int __index) {
@@ -202,6 +213,14 @@ vec_insert(unsigned long long __scalar, vector unsigned long long __vec,
   return __vec;
 }
 
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai vector float
+vec_insert(float __scalar, vector float __vec, int __index) {
+  __vec[__index & 1] = __scalar;
+  return __vec;
+}
+#endif
+
 static inline __ATTRS_o_ai vector double
 vec_insert(double __scalar, vector double __vec, int __index) {
   __vec[__index & 1] = __scalar;
@@ -282,6 +301,16 @@ vec_promote(unsigned long long __scalar, int __index) {
   return __vec;
 }
 
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai vector float
+vec_promote(float __scalar, int __index) {
+  const vector float __zero = (vector float)0;
+  vector float __vec = __builtin_shufflevector(__zero, __zero, -1, -1, -1, -1);
+  __vec[__index & 3] = __scalar;
+  return __vec;
+}
+#endif
+
 static inline __ATTRS_o_ai vector double
 vec_promote(double __scalar, int __index) {
   const vector double __zero = (vector double)0;
@@ -348,6 +377,15 @@ vec_insert_and_zero(const unsigned long long *__ptr) {
   return __vec;
 }
 
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai vector float
+vec_insert_and_zero(const float *__ptr) {
+  vector float __vec = (vector float)0;
+  __vec[0] = *__ptr;
+  return __vec;
+}
+#endif
+
 static inline __ATTRS_o_ai vector double
 vec_insert_and_zero(const double *__ptr) {
   vector double __vec = (vector double)0;
@@ -441,6 +479,15 @@ vec_perm(vector bool long long __a, vector bool long long __b,
            (vector unsigned char)__a, (vector unsigned char)__b, __c);
 }
 
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai vector float
+vec_perm(vector float __a, vector float __b,
+         vector unsigned char __c) {
+  return (vector float)__builtin_s390_vperm(
+           (vector unsigned char)__a, (vector unsigned char)__b, __c);
+}
+#endif
+
 static inline __ATTRS_o_ai vector double
 vec_perm(vector double __a, vector double __b,
          vector unsigned char __c) {
@@ -450,18 +497,22 @@ vec_perm(vector double __a, vector double __b,
 
 /*-- vec_permi --------------------------------------------------------------*/
 
+// This prototype is deprecated.
 extern __ATTRS_o vector signed long long
 vec_permi(vector signed long long __a, vector signed long long __b, int __c)
   __constant_range(__c, 0, 3);
 
+// This prototype is deprecated.
 extern __ATTRS_o vector unsigned long long
 vec_permi(vector unsigned long long __a, vector unsigned long long __b, int __c)
   __constant_range(__c, 0, 3);
 
+// This prototype is deprecated.
 extern __ATTRS_o vector bool long long
 vec_permi(vector bool long long __a, vector bool long long __b, int __c)
   __constant_range(__c, 0, 3);
 
+// This prototype is deprecated.
 extern __ATTRS_o vector double
 vec_permi(vector double __a, vector double __b, int __c)
   __constant_range(__c, 0, 3);
@@ -471,6 +522,15 @@ vec_permi(vector double __a, vector double __b, int __c)
                       (vector unsigned long long)(Y), \
                       (((Z) & 2) << 1) | ((Z) & 1)))
 
+/*-- vec_bperm_u128 ---------------------------------------------------------*/
+
+#if __ARCH__ >= 12
+static inline __ATTRS_ai vector unsigned long long
+vec_bperm_u128(vector unsigned char __a, vector unsigned char __b) {
+  return __builtin_s390_vbperm(__a, __b);
+}
+#endif
+
 /*-- vec_sel ----------------------------------------------------------------*/
 
 static inline __ATTRS_o_ai vector signed char
@@ -614,6 +674,22 @@ vec_sel(vector unsigned long long __a, vector unsigned long long __b,
           (~(vector unsigned long long)__c & __a));
 }
 
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai vector float
+vec_sel(vector float __a, vector float __b, vector unsigned int __c) {
+  return (vector float)((__c & (vector unsigned int)__b) |
+                        (~__c & (vector unsigned int)__a));
+}
+
+static inline __ATTRS_o_ai vector float
+vec_sel(vector float __a, vector float __b, vector bool int __c) {
+  vector unsigned int __ac = (vector unsigned int)__a;
+  vector unsigned int __bc = (vector unsigned int)__b;
+  vector unsigned int __cc = (vector unsigned int)__c;
+  return (vector float)((__cc & __bc) | (~__cc & __ac));
+}
+#endif
+
 static inline __ATTRS_o_ai vector double
 vec_sel(vector double __a, vector double __b, vector unsigned long long __c) {
   return (vector double)((__c & (vector unsigned long long)__b) |
@@ -687,6 +763,17 @@ vec_gather_element(vector unsigned long long __vec,
   return __vec;
 }
 
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai vector float
+vec_gather_element(vector float __vec, vector unsigned int __offset,
+                   const float *__ptr, int __index)
+  __constant_range(__index, 0, 3) {
+  __vec[__index] = *(const float *)(
+    (__INTPTR_TYPE__)__ptr + (__INTPTR_TYPE__)__offset[__index]);
+  return __vec;
+}
+#endif
+
 static inline __ATTRS_o_ai vector double
 vec_gather_element(vector double __vec, vector unsigned long long __offset,
                    const double *__ptr, int __index)
@@ -749,6 +836,16 @@ vec_scatter_element(vector unsigned long long __vec,
     __vec[__index];
 }
 
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai void
+vec_scatter_element(vector float __vec, vector unsigned int __offset,
+                    float *__ptr, int __index)
+  __constant_range(__index, 0, 3) {
+  *(float *)((__INTPTR_TYPE__)__ptr + __offset[__index]) =
+    __vec[__index];
+}
+#endif
+
 static inline __ATTRS_o_ai void
 vec_scatter_element(vector double __vec, vector unsigned long long __offset,
                     double *__ptr, int __index)
@@ -757,48 +854,111 @@ vec_scatter_element(vector double __vec, vector unsigned long long __offset,
     __vec[__index];
 }
 
+/*-- vec_xl -----------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed char
+vec_xl(long __offset, const signed char *__ptr) {
+  return *(const vector signed char *)((__INTPTR_TYPE__)__ptr + __offset);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_xl(long __offset, const unsigned char *__ptr) {
+  return *(const vector unsigned char *)((__INTPTR_TYPE__)__ptr + __offset);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_xl(long __offset, const signed short *__ptr) {
+  return *(const vector signed short *)((__INTPTR_TYPE__)__ptr + __offset);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_xl(long __offset, const unsigned short *__ptr) {
+  return *(const vector unsigned short *)((__INTPTR_TYPE__)__ptr + __offset);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_xl(long __offset, const signed int *__ptr) {
+  return *(const vector signed int *)((__INTPTR_TYPE__)__ptr + __offset);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_xl(long __offset, const unsigned int *__ptr) {
+  return *(const vector unsigned int *)((__INTPTR_TYPE__)__ptr + __offset);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_xl(long __offset, const signed long long *__ptr) {
+  return *(const vector signed long long *)((__INTPTR_TYPE__)__ptr + __offset);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_xl(long __offset, const unsigned long long *__ptr) {
+  return *(const vector unsigned long long *)((__INTPTR_TYPE__)__ptr + __offset);
+}
+
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai vector float
+vec_xl(long __offset, const float *__ptr) {
+  return *(const vector float *)((__INTPTR_TYPE__)__ptr + __offset);
+}
+#endif
+
+static inline __ATTRS_o_ai vector double
+vec_xl(long __offset, const double *__ptr) {
+  return *(const vector double *)((__INTPTR_TYPE__)__ptr + __offset);
+}
+
 /*-- vec_xld2 ---------------------------------------------------------------*/
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed char
 vec_xld2(long __offset, const signed char *__ptr) {
   return *(const vector signed char *)((__INTPTR_TYPE__)__ptr + __offset);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned char
 vec_xld2(long __offset, const unsigned char *__ptr) {
   return *(const vector unsigned char *)((__INTPTR_TYPE__)__ptr + __offset);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed short
 vec_xld2(long __offset, const signed short *__ptr) {
   return *(const vector signed short *)((__INTPTR_TYPE__)__ptr + __offset);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned short
 vec_xld2(long __offset, const unsigned short *__ptr) {
   return *(const vector unsigned short *)((__INTPTR_TYPE__)__ptr + __offset);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed int
 vec_xld2(long __offset, const signed int *__ptr) {
   return *(const vector signed int *)((__INTPTR_TYPE__)__ptr + __offset);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned int
 vec_xld2(long __offset, const unsigned int *__ptr) {
   return *(const vector unsigned int *)((__INTPTR_TYPE__)__ptr + __offset);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed long long
 vec_xld2(long __offset, const signed long long *__ptr) {
   return *(const vector signed long long *)((__INTPTR_TYPE__)__ptr + __offset);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned long long
 vec_xld2(long __offset, const unsigned long long *__ptr) {
   return *(const vector unsigned long long *)((__INTPTR_TYPE__)__ptr + __offset);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector double
 vec_xld2(long __offset, const double *__ptr) {
   return *(const vector double *)((__INTPTR_TYPE__)__ptr + __offset);
@@ -806,74 +966,145 @@ vec_xld2(long __offset, const double *__ptr) {
 
 /*-- vec_xlw4 ---------------------------------------------------------------*/
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed char
 vec_xlw4(long __offset, const signed char *__ptr) {
   return *(const vector signed char *)((__INTPTR_TYPE__)__ptr + __offset);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned char
 vec_xlw4(long __offset, const unsigned char *__ptr) {
   return *(const vector unsigned char *)((__INTPTR_TYPE__)__ptr + __offset);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed short
 vec_xlw4(long __offset, const signed short *__ptr) {
   return *(const vector signed short *)((__INTPTR_TYPE__)__ptr + __offset);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned short
 vec_xlw4(long __offset, const unsigned short *__ptr) {
   return *(const vector unsigned short *)((__INTPTR_TYPE__)__ptr + __offset);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed int
 vec_xlw4(long __offset, const signed int *__ptr) {
   return *(const vector signed int *)((__INTPTR_TYPE__)__ptr + __offset);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned int
 vec_xlw4(long __offset, const unsigned int *__ptr) {
   return *(const vector unsigned int *)((__INTPTR_TYPE__)__ptr + __offset);
 }
 
+/*-- vec_xst ----------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai void
+vec_xst(vector signed char __vec, long __offset, signed char *__ptr) {
+  *(vector signed char *)((__INTPTR_TYPE__)__ptr + __offset) = __vec;
+}
+
+static inline __ATTRS_o_ai void
+vec_xst(vector unsigned char __vec, long __offset, unsigned char *__ptr) {
+  *(vector unsigned char *)((__INTPTR_TYPE__)__ptr + __offset) = __vec;
+}
+
+static inline __ATTRS_o_ai void
+vec_xst(vector signed short __vec, long __offset, signed short *__ptr) {
+  *(vector signed short *)((__INTPTR_TYPE__)__ptr + __offset) = __vec;
+}
+
+static inline __ATTRS_o_ai void
+vec_xst(vector unsigned short __vec, long __offset, unsigned short *__ptr) {
+  *(vector unsigned short *)((__INTPTR_TYPE__)__ptr + __offset) = __vec;
+}
+
+static inline __ATTRS_o_ai void
+vec_xst(vector signed int __vec, long __offset, signed int *__ptr) {
+  *(vector signed int *)((__INTPTR_TYPE__)__ptr + __offset) = __vec;
+}
+
+static inline __ATTRS_o_ai void
+vec_xst(vector unsigned int __vec, long __offset, unsigned int *__ptr) {
+  *(vector unsigned int *)((__INTPTR_TYPE__)__ptr + __offset) = __vec;
+}
+
+static inline __ATTRS_o_ai void
+vec_xst(vector signed long long __vec, long __offset,
+          signed long long *__ptr) {
+  *(vector signed long long *)((__INTPTR_TYPE__)__ptr + __offset) = __vec;
+}
+
+static inline __ATTRS_o_ai void
+vec_xst(vector unsigned long long __vec, long __offset,
+          unsigned long long *__ptr) {
+  *(vector unsigned long long *)((__INTPTR_TYPE__)__ptr + __offset) =
+    __vec;
+}
+
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai void
+vec_xst(vector float __vec, long __offset, float *__ptr) {
+  *(vector float *)((__INTPTR_TYPE__)__ptr + __offset) = __vec;
+}
+#endif
+
+static inline __ATTRS_o_ai void
+vec_xst(vector double __vec, long __offset, double *__ptr) {
+  *(vector double *)((__INTPTR_TYPE__)__ptr + __offset) = __vec;
+}
+
 /*-- vec_xstd2 --------------------------------------------------------------*/
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai void
 vec_xstd2(vector signed char __vec, long __offset, signed char *__ptr) {
   *(vector signed char *)((__INTPTR_TYPE__)__ptr + __offset) = __vec;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai void
 vec_xstd2(vector unsigned char __vec, long __offset, unsigned char *__ptr) {
   *(vector unsigned char *)((__INTPTR_TYPE__)__ptr + __offset) = __vec;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai void
 vec_xstd2(vector signed short __vec, long __offset, signed short *__ptr) {
   *(vector signed short *)((__INTPTR_TYPE__)__ptr + __offset) = __vec;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai void
 vec_xstd2(vector unsigned short __vec, long __offset, unsigned short *__ptr) {
   *(vector unsigned short *)((__INTPTR_TYPE__)__ptr + __offset) = __vec;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai void
 vec_xstd2(vector signed int __vec, long __offset, signed int *__ptr) {
   *(vector signed int *)((__INTPTR_TYPE__)__ptr + __offset) = __vec;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai void
 vec_xstd2(vector unsigned int __vec, long __offset, unsigned int *__ptr) {
   *(vector unsigned int *)((__INTPTR_TYPE__)__ptr + __offset) = __vec;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai void
 vec_xstd2(vector signed long long __vec, long __offset,
           signed long long *__ptr) {
   *(vector signed long long *)((__INTPTR_TYPE__)__ptr + __offset) = __vec;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai void
 vec_xstd2(vector unsigned long long __vec, long __offset,
           unsigned long long *__ptr) {
@@ -881,6 +1112,7 @@ vec_xstd2(vector unsigned long long __vec, long __offset,
     __vec;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai void
 vec_xstd2(vector double __vec, long __offset, double *__ptr) {
   *(vector double *)((__INTPTR_TYPE__)__ptr + __offset) = __vec;
@@ -888,31 +1120,37 @@ vec_xstd2(vector double __vec, long __offset, double *__ptr) {
 
 /*-- vec_xstw4 --------------------------------------------------------------*/
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai void
 vec_xstw4(vector signed char __vec, long __offset, signed char *__ptr) {
   *(vector signed char *)((__INTPTR_TYPE__)__ptr + __offset) = __vec;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai void
 vec_xstw4(vector unsigned char __vec, long __offset, unsigned char *__ptr) {
   *(vector unsigned char *)((__INTPTR_TYPE__)__ptr + __offset) = __vec;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai void
 vec_xstw4(vector signed short __vec, long __offset, signed short *__ptr) {
   *(vector signed short *)((__INTPTR_TYPE__)__ptr + __offset) = __vec;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai void
 vec_xstw4(vector unsigned short __vec, long __offset, unsigned short *__ptr) {
   *(vector unsigned short *)((__INTPTR_TYPE__)__ptr + __offset) = __vec;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai void
 vec_xstw4(vector signed int __vec, long __offset, signed int *__ptr) {
   *(vector signed int *)((__INTPTR_TYPE__)__ptr + __offset) = __vec;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai void
 vec_xstw4(vector unsigned int __vec, long __offset, unsigned int *__ptr) {
   *(vector unsigned int *)((__INTPTR_TYPE__)__ptr + __offset) = __vec;
@@ -952,6 +1190,12 @@ extern __ATTRS_o vector unsigned long long
 vec_load_bndry(const unsigned long long *__ptr, unsigned short __len)
   __constant_pow2_range(__len, 64, 4096);
 
+#if __ARCH__ >= 12
+extern __ATTRS_o vector float
+vec_load_bndry(const float *__ptr, unsigned short __len)
+  __constant_pow2_range(__len, 64, 4096);
+#endif
+
 extern __ATTRS_o vector double
 vec_load_bndry(const double *__ptr, unsigned short __len)
   __constant_pow2_range(__len, 64, 4096);
@@ -1007,11 +1251,27 @@ vec_load_len(const unsigned long long *__ptr, unsigned int __len) {
   return (vector unsigned long long)__builtin_s390_vll(__len, __ptr);
 }
 
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai vector float
+vec_load_len(const float *__ptr, unsigned int __len) {
+  return (vector float)__builtin_s390_vll(__len, __ptr);
+}
+#endif
+
 static inline __ATTRS_o_ai vector double
 vec_load_len(const double *__ptr, unsigned int __len) {
   return (vector double)__builtin_s390_vll(__len, __ptr);
 }
 
+/*-- vec_load_len_r ---------------------------------------------------------*/
+
+#if __ARCH__ >= 12
+static inline __ATTRS_ai vector unsigned char
+vec_load_len_r(const unsigned char *__ptr, unsigned int __len) {
+  return (vector unsigned char)__builtin_s390_vlrl(__len, __ptr);
+}
+#endif
+
 /*-- vec_store_len ----------------------------------------------------------*/
 
 static inline __ATTRS_o_ai void
@@ -1062,12 +1322,30 @@ vec_store_len(vector unsigned long long __vec, unsigned long long *__ptr,
   __builtin_s390_vstl((vector signed char)__vec, __len, __ptr);
 }
 
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai void
+vec_store_len(vector float __vec, float *__ptr,
+              unsigned int __len) {
+  __builtin_s390_vstl((vector signed char)__vec, __len, __ptr);
+}
+#endif
+
 static inline __ATTRS_o_ai void
 vec_store_len(vector double __vec, double *__ptr,
               unsigned int __len) {
   __builtin_s390_vstl((vector signed char)__vec, __len, __ptr);
 }
 
+/*-- vec_store_len_r --------------------------------------------------------*/
+
+#if __ARCH__ >= 12
+static inline __ATTRS_ai void
+vec_store_len_r(vector unsigned char __vec, unsigned char *__ptr,
+                unsigned int __len) {
+  __builtin_s390_vstrl((vector signed char)__vec, __len, __ptr);
+}
+#endif
+
 /*-- vec_load_pair ----------------------------------------------------------*/
 
 static inline __ATTRS_o_ai vector signed long long
@@ -1232,6 +1510,14 @@ vec_splat(vector unsigned long long __vec, int __index)
   return (vector unsigned long long)__vec[__index];
 }
 
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai vector float
+vec_splat(vector float __vec, int __index)
+  __constant_range(__index, 0, 3) {
+  return (vector float)__vec[__index];
+}
+#endif
+
 static inline __ATTRS_o_ai vector double
 vec_splat(vector double __vec, int __index)
   __constant_range(__index, 0, 1) {
@@ -1332,6 +1618,13 @@ vec_splats(unsigned long long __scalar) {
   return (vector unsigned long long)__scalar;
 }
 
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai vector float
+vec_splats(float __scalar) {
+  return (vector float)__scalar;
+}
+#endif
+
 static inline __ATTRS_o_ai vector double
 vec_splats(double __scalar) {
   return (vector double)__scalar;
@@ -1425,6 +1718,13 @@ vec_mergeh(vector unsigned long long __a, vector unsigned long long __b) {
   return (vector unsigned long long)(__a[0], __b[0]);
 }
 
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai vector float
+vec_mergeh(vector float __a, vector float __b) {
+  return (vector float)(__a[0], __b[0], __a[1], __b[1]);
+}
+#endif
+
 static inline __ATTRS_o_ai vector double
 vec_mergeh(vector double __a, vector double __b) {
   return (vector double)(__a[0], __b[0]);
@@ -1501,6 +1801,13 @@ vec_mergel(vector unsigned long long __a, vector unsigned long long __b) {
   return (vector unsigned long long)(__a[1], __b[1]);
 }
 
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai vector float
+vec_mergel(vector float __a, vector float __b) {
+  return (vector float)(__a[2], __b[2], __a[3], __b[3]);
+}
+#endif
+
 static inline __ATTRS_o_ai vector double
 vec_mergel(vector double __a, vector double __b) {
   return (vector double)(__a[1], __b[1]);
@@ -1866,6 +2173,13 @@ vec_cmpeq(vector unsigned long long __a, vector unsigned long long __b) {
   return (vector bool long long)(__a == __b);
 }
 
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai vector bool int
+vec_cmpeq(vector float __a, vector float __b) {
+  return (vector bool int)(__a == __b);
+}
+#endif
+
 static inline __ATTRS_o_ai vector bool long long
 vec_cmpeq(vector double __a, vector double __b) {
   return (vector bool long long)(__a == __b);
@@ -1913,6 +2227,13 @@ vec_cmpge(vector unsigned long long __a, vector unsigned long long __b) {
   return (vector bool long long)(__a >= __b);
 }
 
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai vector bool int
+vec_cmpge(vector float __a, vector float __b) {
+  return (vector bool int)(__a >= __b);
+}
+#endif
+
 static inline __ATTRS_o_ai vector bool long long
 vec_cmpge(vector double __a, vector double __b) {
   return (vector bool long long)(__a >= __b);
@@ -1960,6 +2281,13 @@ vec_cmpgt(vector unsigned long long __a, vector unsigned long long __b) {
   return (vector bool long long)(__a > __b);
 }
 
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai vector bool int
+vec_cmpgt(vector float __a, vector float __b) {
+  return (vector bool int)(__a > __b);
+}
+#endif
+
 static inline __ATTRS_o_ai vector bool long long
 vec_cmpgt(vector double __a, vector double __b) {
   return (vector bool long long)(__a > __b);
@@ -2007,6 +2335,13 @@ vec_cmple(vector unsigned long long __a, vector unsigned long long __b) {
   return (vector bool long long)(__a <= __b);
 }
 
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai vector bool int
+vec_cmple(vector float __a, vector float __b) {
+  return (vector bool int)(__a <= __b);
+}
+#endif
+
 static inline __ATTRS_o_ai vector bool long long
 vec_cmple(vector double __a, vector double __b) {
   return (vector bool long long)(__a <= __b);
@@ -2054,6 +2389,13 @@ vec_cmplt(vector unsigned long long __a, vector unsigned long long __b) {
   return (vector bool long long)(__a < __b);
 }
 
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai vector bool int
+vec_cmplt(vector float __a, vector float __b) {
+  return (vector bool int)(__a < __b);
+}
+#endif
+
 static inline __ATTRS_o_ai vector bool long long
 vec_cmplt(vector double __a, vector double __b) {
   return (vector bool long long)(__a < __b);
@@ -2068,6 +2410,7 @@ vec_all_eq(vector signed char __a, vector signed char __b) {
   return __cc == 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_eq(vector signed char __a, vector bool char __b) {
   int __cc;
@@ -2075,6 +2418,7 @@ vec_all_eq(vector signed char __a, vector bool char __b) {
   return __cc == 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_eq(vector bool char __a, vector signed char __b) {
   int __cc;
@@ -2090,6 +2434,7 @@ vec_all_eq(vector unsigned char __a, vector unsigned char __b) {
   return __cc == 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_eq(vector unsigned char __a, vector bool char __b) {
   int __cc;
@@ -2098,6 +2443,7 @@ vec_all_eq(vector unsigned char __a, vector bool char __b) {
   return __cc == 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_eq(vector bool char __a, vector unsigned char __b) {
   int __cc;
@@ -2121,6 +2467,7 @@ vec_all_eq(vector signed short __a, vector signed short __b) {
   return __cc == 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_eq(vector signed short __a, vector bool short __b) {
   int __cc;
@@ -2128,6 +2475,7 @@ vec_all_eq(vector signed short __a, vector bool short __b) {
   return __cc == 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_eq(vector bool short __a, vector signed short __b) {
   int __cc;
@@ -2143,6 +2491,7 @@ vec_all_eq(vector unsigned short __a, vector unsigned short __b) {
   return __cc == 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_eq(vector unsigned short __a, vector bool short __b) {
   int __cc;
@@ -2151,6 +2500,7 @@ vec_all_eq(vector unsigned short __a, vector bool short __b) {
   return __cc == 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_eq(vector bool short __a, vector unsigned short __b) {
   int __cc;
@@ -2174,6 +2524,7 @@ vec_all_eq(vector signed int __a, vector signed int __b) {
   return __cc == 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_eq(vector signed int __a, vector bool int __b) {
   int __cc;
@@ -2181,6 +2532,7 @@ vec_all_eq(vector signed int __a, vector bool int __b) {
   return __cc == 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_eq(vector bool int __a, vector signed int __b) {
   int __cc;
@@ -2196,6 +2548,7 @@ vec_all_eq(vector unsigned int __a, vector unsigned int __b) {
   return __cc == 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_eq(vector unsigned int __a, vector bool int __b) {
   int __cc;
@@ -2204,6 +2557,7 @@ vec_all_eq(vector unsigned int __a, vector bool int __b) {
   return __cc == 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_eq(vector bool int __a, vector unsigned int __b) {
   int __cc;
@@ -2227,6 +2581,7 @@ vec_all_eq(vector signed long long __a, vector signed long long __b) {
   return __cc == 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_eq(vector signed long long __a, vector bool long long __b) {
   int __cc;
@@ -2234,6 +2589,7 @@ vec_all_eq(vector signed long long __a, vector bool long long __b) {
   return __cc == 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_eq(vector bool long long __a, vector signed long long __b) {
   int __cc;
@@ -2249,6 +2605,7 @@ vec_all_eq(vector unsigned long long __a, vector unsigned long long __b) {
   return __cc == 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_eq(vector unsigned long long __a, vector bool long long __b) {
   int __cc;
@@ -2257,6 +2614,7 @@ vec_all_eq(vector unsigned long long __a, vector bool long long __b) {
   return __cc == 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_eq(vector bool long long __a, vector unsigned long long __b) {
   int __cc;
@@ -2273,6 +2631,15 @@ vec_all_eq(vector bool long long __a, vector bool long long __b) {
   return __cc == 0;
 }
 
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai int
+vec_all_eq(vector float __a, vector float __b) {
+  int __cc;
+  __builtin_s390_vfcesbs(__a, __b, &__cc);
+  return __cc == 0;
+}
+#endif
+
 static inline __ATTRS_o_ai int
 vec_all_eq(vector double __a, vector double __b) {
   int __cc;
@@ -2289,6 +2656,7 @@ vec_all_ne(vector signed char __a, vector signed char __b) {
   return __cc == 3;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_ne(vector signed char __a, vector bool char __b) {
   int __cc;
@@ -2296,6 +2664,7 @@ vec_all_ne(vector signed char __a, vector bool char __b) {
   return __cc == 3;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_ne(vector bool char __a, vector signed char __b) {
   int __cc;
@@ -2311,6 +2680,7 @@ vec_all_ne(vector unsigned char __a, vector unsigned char __b) {
   return __cc == 3;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_ne(vector unsigned char __a, vector bool char __b) {
   int __cc;
@@ -2319,6 +2689,7 @@ vec_all_ne(vector unsigned char __a, vector bool char __b) {
   return __cc == 3;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_ne(vector bool char __a, vector unsigned char __b) {
   int __cc;
@@ -2342,6 +2713,7 @@ vec_all_ne(vector signed short __a, vector signed short __b) {
   return __cc == 3;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_ne(vector signed short __a, vector bool short __b) {
   int __cc;
@@ -2349,6 +2721,7 @@ vec_all_ne(vector signed short __a, vector bool short __b) {
   return __cc == 3;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_ne(vector bool short __a, vector signed short __b) {
   int __cc;
@@ -2364,6 +2737,7 @@ vec_all_ne(vector unsigned short __a, vector unsigned short __b) {
   return __cc == 3;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_ne(vector unsigned short __a, vector bool short __b) {
   int __cc;
@@ -2372,6 +2746,7 @@ vec_all_ne(vector unsigned short __a, vector bool short __b) {
   return __cc == 3;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_ne(vector bool short __a, vector unsigned short __b) {
   int __cc;
@@ -2395,6 +2770,7 @@ vec_all_ne(vector signed int __a, vector signed int __b) {
   return __cc == 3;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_ne(vector signed int __a, vector bool int __b) {
   int __cc;
@@ -2402,6 +2778,7 @@ vec_all_ne(vector signed int __a, vector bool int __b) {
   return __cc == 3;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_ne(vector bool int __a, vector signed int __b) {
   int __cc;
@@ -2417,6 +2794,7 @@ vec_all_ne(vector unsigned int __a, vector unsigned int __b) {
   return __cc == 3;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_ne(vector unsigned int __a, vector bool int __b) {
   int __cc;
@@ -2425,6 +2803,7 @@ vec_all_ne(vector unsigned int __a, vector bool int __b) {
   return __cc == 3;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_ne(vector bool int __a, vector unsigned int __b) {
   int __cc;
@@ -2448,6 +2827,7 @@ vec_all_ne(vector signed long long __a, vector signed long long __b) {
   return __cc == 3;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_ne(vector signed long long __a, vector bool long long __b) {
   int __cc;
@@ -2455,6 +2835,7 @@ vec_all_ne(vector signed long long __a, vector bool long long __b) {
   return __cc == 3;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_ne(vector bool long long __a, vector signed long long __b) {
   int __cc;
@@ -2470,6 +2851,7 @@ vec_all_ne(vector unsigned long long __a, vector unsigned long long __b) {
   return __cc == 3;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_ne(vector unsigned long long __a, vector bool long long __b) {
   int __cc;
@@ -2478,6 +2860,7 @@ vec_all_ne(vector unsigned long long __a, vector bool long long __b) {
   return __cc == 3;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_ne(vector bool long long __a, vector unsigned long long __b) {
   int __cc;
@@ -2494,6 +2877,15 @@ vec_all_ne(vector bool long long __a, vector bool long long __b) {
   return __cc == 3;
 }
 
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai int
+vec_all_ne(vector float __a, vector float __b) {
+  int __cc;
+  __builtin_s390_vfcesbs(__a, __b, &__cc);
+  return __cc == 3;
+}
+#endif
+
 static inline __ATTRS_o_ai int
 vec_all_ne(vector double __a, vector double __b) {
   int __cc;
@@ -2510,6 +2902,7 @@ vec_all_ge(vector signed char __a, vector signed char __b) {
   return __cc == 3;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_ge(vector signed char __a, vector bool char __b) {
   int __cc;
@@ -2517,6 +2910,7 @@ vec_all_ge(vector signed char __a, vector bool char __b) {
   return __cc == 3;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_ge(vector bool char __a, vector signed char __b) {
   int __cc;
@@ -2531,6 +2925,7 @@ vec_all_ge(vector unsigned char __a, vector unsigned char __b) {
   return __cc == 3;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_ge(vector unsigned char __a, vector bool char __b) {
   int __cc;
@@ -2538,6 +2933,7 @@ vec_all_ge(vector unsigned char __a, vector bool char __b) {
   return __cc == 3;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_ge(vector bool char __a, vector unsigned char __b) {
   int __cc;
@@ -2545,6 +2941,7 @@ vec_all_ge(vector bool char __a, vector unsigned char __b) {
   return __cc == 3;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_ge(vector bool char __a, vector bool char __b) {
   int __cc;
@@ -2560,6 +2957,7 @@ vec_all_ge(vector signed short __a, vector signed short __b) {
   return __cc == 3;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_ge(vector signed short __a, vector bool short __b) {
   int __cc;
@@ -2567,6 +2965,7 @@ vec_all_ge(vector signed short __a, vector bool short __b) {
   return __cc == 3;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_ge(vector bool short __a, vector signed short __b) {
   int __cc;
@@ -2581,6 +2980,7 @@ vec_all_ge(vector unsigned short __a, vector unsigned short __b) {
   return __cc == 3;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_ge(vector unsigned short __a, vector bool short __b) {
   int __cc;
@@ -2588,6 +2988,7 @@ vec_all_ge(vector unsigned short __a, vector bool short __b) {
   return __cc == 3;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_ge(vector bool short __a, vector unsigned short __b) {
   int __cc;
@@ -2595,6 +2996,7 @@ vec_all_ge(vector bool short __a, vector unsigned short __b) {
   return __cc == 3;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_ge(vector bool short __a, vector bool short __b) {
   int __cc;
@@ -2610,6 +3012,7 @@ vec_all_ge(vector signed int __a, vector signed int __b) {
   return __cc == 3;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_ge(vector signed int __a, vector bool int __b) {
   int __cc;
@@ -2617,6 +3020,7 @@ vec_all_ge(vector signed int __a, vector bool int __b) {
   return __cc == 3;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_ge(vector bool int __a, vector signed int __b) {
   int __cc;
@@ -2631,6 +3035,7 @@ vec_all_ge(vector unsigned int __a, vector unsigned int __b) {
   return __cc == 3;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_ge(vector unsigned int __a, vector bool int __b) {
   int __cc;
@@ -2638,6 +3043,7 @@ vec_all_ge(vector unsigned int __a, vector bool int __b) {
   return __cc == 3;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_ge(vector bool int __a, vector unsigned int __b) {
   int __cc;
@@ -2645,6 +3051,7 @@ vec_all_ge(vector bool int __a, vector unsigned int __b) {
   return __cc == 3;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_ge(vector bool int __a, vector bool int __b) {
   int __cc;
@@ -2660,6 +3067,7 @@ vec_all_ge(vector signed long long __a, vector signed long long __b) {
   return __cc == 3;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_ge(vector signed long long __a, vector bool long long __b) {
   int __cc;
@@ -2667,6 +3075,7 @@ vec_all_ge(vector signed long long __a, vector bool long long __b) {
   return __cc == 3;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_ge(vector bool long long __a, vector signed long long __b) {
   int __cc;
@@ -2681,6 +3090,7 @@ vec_all_ge(vector unsigned long long __a, vector unsigned long long __b) {
   return __cc == 3;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_ge(vector unsigned long long __a, vector bool long long __b) {
   int __cc;
@@ -2688,6 +3098,7 @@ vec_all_ge(vector unsigned long long __a, vector bool long long __b) {
   return __cc == 3;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_ge(vector bool long long __a, vector unsigned long long __b) {
   int __cc;
@@ -2695,6 +3106,7 @@ vec_all_ge(vector bool long long __a, vector unsigned long long __b) {
   return __cc == 3;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_ge(vector bool long long __a, vector bool long long __b) {
   int __cc;
@@ -2703,6 +3115,15 @@ vec_all_ge(vector bool long long __a, vector bool long long __b) {
   return __cc == 3;
 }
 
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai int
+vec_all_ge(vector float __a, vector float __b) {
+  int __cc;
+  __builtin_s390_vfchesbs(__a, __b, &__cc);
+  return __cc == 0;
+}
+#endif
+
 static inline __ATTRS_o_ai int
 vec_all_ge(vector double __a, vector double __b) {
   int __cc;
@@ -2719,6 +3140,7 @@ vec_all_gt(vector signed char __a, vector signed char __b) {
   return __cc == 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_gt(vector signed char __a, vector bool char __b) {
   int __cc;
@@ -2726,6 +3148,7 @@ vec_all_gt(vector signed char __a, vector bool char __b) {
   return __cc == 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_gt(vector bool char __a, vector signed char __b) {
   int __cc;
@@ -2740,6 +3163,7 @@ vec_all_gt(vector unsigned char __a, vector unsigned char __b) {
   return __cc == 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_gt(vector unsigned char __a, vector bool char __b) {
   int __cc;
@@ -2747,6 +3171,7 @@ vec_all_gt(vector unsigned char __a, vector bool char __b) {
   return __cc == 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_gt(vector bool char __a, vector unsigned char __b) {
   int __cc;
@@ -2754,6 +3179,7 @@ vec_all_gt(vector bool char __a, vector unsigned char __b) {
   return __cc == 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_gt(vector bool char __a, vector bool char __b) {
   int __cc;
@@ -2769,6 +3195,7 @@ vec_all_gt(vector signed short __a, vector signed short __b) {
   return __cc == 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_gt(vector signed short __a, vector bool short __b) {
   int __cc;
@@ -2776,6 +3203,7 @@ vec_all_gt(vector signed short __a, vector bool short __b) {
   return __cc == 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_gt(vector bool short __a, vector signed short __b) {
   int __cc;
@@ -2790,6 +3218,7 @@ vec_all_gt(vector unsigned short __a, vector unsigned short __b) {
   return __cc == 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_gt(vector unsigned short __a, vector bool short __b) {
   int __cc;
@@ -2797,6 +3226,7 @@ vec_all_gt(vector unsigned short __a, vector bool short __b) {
   return __cc == 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_gt(vector bool short __a, vector unsigned short __b) {
   int __cc;
@@ -2804,6 +3234,7 @@ vec_all_gt(vector bool short __a, vector unsigned short __b) {
   return __cc == 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_gt(vector bool short __a, vector bool short __b) {
   int __cc;
@@ -2819,6 +3250,7 @@ vec_all_gt(vector signed int __a, vector signed int __b) {
   return __cc == 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_gt(vector signed int __a, vector bool int __b) {
   int __cc;
@@ -2826,6 +3258,7 @@ vec_all_gt(vector signed int __a, vector bool int __b) {
   return __cc == 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_gt(vector bool int __a, vector signed int __b) {
   int __cc;
@@ -2840,6 +3273,7 @@ vec_all_gt(vector unsigned int __a, vector unsigned int __b) {
   return __cc == 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_gt(vector unsigned int __a, vector bool int __b) {
   int __cc;
@@ -2847,6 +3281,7 @@ vec_all_gt(vector unsigned int __a, vector bool int __b) {
   return __cc == 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_gt(vector bool int __a, vector unsigned int __b) {
   int __cc;
@@ -2854,6 +3289,7 @@ vec_all_gt(vector bool int __a, vector unsigned int __b) {
   return __cc == 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_gt(vector bool int __a, vector bool int __b) {
   int __cc;
@@ -2869,6 +3305,7 @@ vec_all_gt(vector signed long long __a, vector signed long long __b) {
   return __cc == 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_gt(vector signed long long __a, vector bool long long __b) {
   int __cc;
@@ -2876,6 +3313,7 @@ vec_all_gt(vector signed long long __a, vector bool long long __b) {
   return __cc == 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_gt(vector bool long long __a, vector signed long long __b) {
   int __cc;
@@ -2890,6 +3328,7 @@ vec_all_gt(vector unsigned long long __a, vector unsigned long long __b) {
   return __cc == 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_gt(vector unsigned long long __a, vector bool long long __b) {
   int __cc;
@@ -2897,6 +3336,7 @@ vec_all_gt(vector unsigned long long __a, vector bool long long __b) {
   return __cc == 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_gt(vector bool long long __a, vector unsigned long long __b) {
   int __cc;
@@ -2904,6 +3344,7 @@ vec_all_gt(vector bool long long __a, vector unsigned long long __b) {
   return __cc == 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_gt(vector bool long long __a, vector bool long long __b) {
   int __cc;
@@ -2912,6 +3353,15 @@ vec_all_gt(vector bool long long __a, vector bool long long __b) {
   return __cc == 0;
 }
 
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai int
+vec_all_gt(vector float __a, vector float __b) {
+  int __cc;
+  __builtin_s390_vfchsbs(__a, __b, &__cc);
+  return __cc == 0;
+}
+#endif
+
 static inline __ATTRS_o_ai int
 vec_all_gt(vector double __a, vector double __b) {
   int __cc;
@@ -2928,6 +3378,7 @@ vec_all_le(vector signed char __a, vector signed char __b) {
   return __cc == 3;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_le(vector signed char __a, vector bool char __b) {
   int __cc;
@@ -2935,6 +3386,7 @@ vec_all_le(vector signed char __a, vector bool char __b) {
   return __cc == 3;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_le(vector bool char __a, vector signed char __b) {
   int __cc;
@@ -2949,6 +3401,7 @@ vec_all_le(vector unsigned char __a, vector unsigned char __b) {
   return __cc == 3;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_le(vector unsigned char __a, vector bool char __b) {
   int __cc;
@@ -2956,6 +3409,7 @@ vec_all_le(vector unsigned char __a, vector bool char __b) {
   return __cc == 3;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_le(vector bool char __a, vector unsigned char __b) {
   int __cc;
@@ -2963,6 +3417,7 @@ vec_all_le(vector bool char __a, vector unsigned char __b) {
   return __cc == 3;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_le(vector bool char __a, vector bool char __b) {
   int __cc;
@@ -2978,6 +3433,7 @@ vec_all_le(vector signed short __a, vector signed short __b) {
   return __cc == 3;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_le(vector signed short __a, vector bool short __b) {
   int __cc;
@@ -2985,6 +3441,7 @@ vec_all_le(vector signed short __a, vector bool short __b) {
   return __cc == 3;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_le(vector bool short __a, vector signed short __b) {
   int __cc;
@@ -2999,6 +3456,7 @@ vec_all_le(vector unsigned short __a, vector unsigned short __b) {
   return __cc == 3;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_le(vector unsigned short __a, vector bool short __b) {
   int __cc;
@@ -3006,6 +3464,7 @@ vec_all_le(vector unsigned short __a, vector bool short __b) {
   return __cc == 3;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_le(vector bool short __a, vector unsigned short __b) {
   int __cc;
@@ -3013,6 +3472,7 @@ vec_all_le(vector bool short __a, vector unsigned short __b) {
   return __cc == 3;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_le(vector bool short __a, vector bool short __b) {
   int __cc;
@@ -3028,6 +3488,7 @@ vec_all_le(vector signed int __a, vector signed int __b) {
   return __cc == 3;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_le(vector signed int __a, vector bool int __b) {
   int __cc;
@@ -3035,6 +3496,7 @@ vec_all_le(vector signed int __a, vector bool int __b) {
   return __cc == 3;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_le(vector bool int __a, vector signed int __b) {
   int __cc;
@@ -3049,6 +3511,7 @@ vec_all_le(vector unsigned int __a, vector unsigned int __b) {
   return __cc == 3;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_le(vector unsigned int __a, vector bool int __b) {
   int __cc;
@@ -3056,6 +3519,7 @@ vec_all_le(vector unsigned int __a, vector bool int __b) {
   return __cc == 3;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_le(vector bool int __a, vector unsigned int __b) {
   int __cc;
@@ -3063,6 +3527,7 @@ vec_all_le(vector bool int __a, vector unsigned int __b) {
   return __cc == 3;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_le(vector bool int __a, vector bool int __b) {
   int __cc;
@@ -3078,6 +3543,7 @@ vec_all_le(vector signed long long __a, vector signed long long __b) {
   return __cc == 3;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_le(vector signed long long __a, vector bool long long __b) {
   int __cc;
@@ -3085,6 +3551,7 @@ vec_all_le(vector signed long long __a, vector bool long long __b) {
   return __cc == 3;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_le(vector bool long long __a, vector signed long long __b) {
   int __cc;
@@ -3099,6 +3566,7 @@ vec_all_le(vector unsigned long long __a, vector unsigned long long __b) {
   return __cc == 3;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_le(vector unsigned long long __a, vector bool long long __b) {
   int __cc;
@@ -3106,6 +3574,7 @@ vec_all_le(vector unsigned long long __a, vector bool long long __b) {
   return __cc == 3;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_le(vector bool long long __a, vector unsigned long long __b) {
   int __cc;
@@ -3113,6 +3582,7 @@ vec_all_le(vector bool long long __a, vector unsigned long long __b) {
   return __cc == 3;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_le(vector bool long long __a, vector bool long long __b) {
   int __cc;
@@ -3121,6 +3591,15 @@ vec_all_le(vector bool long long __a, vector bool long long __b) {
   return __cc == 3;
 }
 
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai int
+vec_all_le(vector float __a, vector float __b) {
+  int __cc;
+  __builtin_s390_vfchesbs(__b, __a, &__cc);
+  return __cc == 0;
+}
+#endif
+
 static inline __ATTRS_o_ai int
 vec_all_le(vector double __a, vector double __b) {
   int __cc;
@@ -3137,6 +3616,7 @@ vec_all_lt(vector signed char __a, vector signed char __b) {
   return __cc == 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_lt(vector signed char __a, vector bool char __b) {
   int __cc;
@@ -3144,6 +3624,7 @@ vec_all_lt(vector signed char __a, vector bool char __b) {
   return __cc == 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_lt(vector bool char __a, vector signed char __b) {
   int __cc;
@@ -3158,6 +3639,7 @@ vec_all_lt(vector unsigned char __a, vector unsigned char __b) {
   return __cc == 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_lt(vector unsigned char __a, vector bool char __b) {
   int __cc;
@@ -3165,6 +3647,7 @@ vec_all_lt(vector unsigned char __a, vector bool char __b) {
   return __cc == 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_lt(vector bool char __a, vector unsigned char __b) {
   int __cc;
@@ -3172,6 +3655,7 @@ vec_all_lt(vector bool char __a, vector unsigned char __b) {
   return __cc == 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_lt(vector bool char __a, vector bool char __b) {
   int __cc;
@@ -3187,6 +3671,7 @@ vec_all_lt(vector signed short __a, vector signed short __b) {
   return __cc == 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_lt(vector signed short __a, vector bool short __b) {
   int __cc;
@@ -3194,6 +3679,7 @@ vec_all_lt(vector signed short __a, vector bool short __b) {
   return __cc == 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_lt(vector bool short __a, vector signed short __b) {
   int __cc;
@@ -3208,6 +3694,7 @@ vec_all_lt(vector unsigned short __a, vector unsigned short __b) {
   return __cc == 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_lt(vector unsigned short __a, vector bool short __b) {
   int __cc;
@@ -3215,6 +3702,7 @@ vec_all_lt(vector unsigned short __a, vector bool short __b) {
   return __cc == 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_lt(vector bool short __a, vector unsigned short __b) {
   int __cc;
@@ -3222,6 +3710,7 @@ vec_all_lt(vector bool short __a, vector unsigned short __b) {
   return __cc == 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_lt(vector bool short __a, vector bool short __b) {
   int __cc;
@@ -3237,6 +3726,7 @@ vec_all_lt(vector signed int __a, vector signed int __b) {
   return __cc == 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_lt(vector signed int __a, vector bool int __b) {
   int __cc;
@@ -3244,6 +3734,7 @@ vec_all_lt(vector signed int __a, vector bool int __b) {
   return __cc == 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_lt(vector bool int __a, vector signed int __b) {
   int __cc;
@@ -3258,6 +3749,7 @@ vec_all_lt(vector unsigned int __a, vector unsigned int __b) {
   return __cc == 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_lt(vector unsigned int __a, vector bool int __b) {
   int __cc;
@@ -3265,6 +3757,7 @@ vec_all_lt(vector unsigned int __a, vector bool int __b) {
   return __cc == 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_lt(vector bool int __a, vector unsigned int __b) {
   int __cc;
@@ -3272,6 +3765,7 @@ vec_all_lt(vector bool int __a, vector unsigned int __b) {
   return __cc == 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_lt(vector bool int __a, vector bool int __b) {
   int __cc;
@@ -3287,6 +3781,7 @@ vec_all_lt(vector signed long long __a, vector signed long long __b) {
   return __cc == 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_lt(vector signed long long __a, vector bool long long __b) {
   int __cc;
@@ -3294,6 +3789,7 @@ vec_all_lt(vector signed long long __a, vector bool long long __b) {
   return __cc == 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_lt(vector bool long long __a, vector signed long long __b) {
   int __cc;
@@ -3308,6 +3804,7 @@ vec_all_lt(vector unsigned long long __a, vector unsigned long long __b) {
   return __cc == 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_lt(vector unsigned long long __a, vector bool long long __b) {
   int __cc;
@@ -3315,6 +3812,7 @@ vec_all_lt(vector unsigned long long __a, vector bool long long __b) {
   return __cc == 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_lt(vector bool long long __a, vector unsigned long long __b) {
   int __cc;
@@ -3322,6 +3820,7 @@ vec_all_lt(vector bool long long __a, vector unsigned long long __b) {
   return __cc == 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_all_lt(vector bool long long __a, vector bool long long __b) {
   int __cc;
@@ -3330,6 +3829,15 @@ vec_all_lt(vector bool long long __a, vector bool long long __b) {
   return __cc == 0;
 }
 
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai int
+vec_all_lt(vector float __a, vector float __b) {
+  int __cc;
+  __builtin_s390_vfchsbs(__b, __a, &__cc);
+  return __cc == 0;
+}
+#endif
+
 static inline __ATTRS_o_ai int
 vec_all_lt(vector double __a, vector double __b) {
   int __cc;
@@ -3339,7 +3847,16 @@ vec_all_lt(vector double __a, vector double __b) {
 
 /*-- vec_all_nge ------------------------------------------------------------*/
 
-static inline __ATTRS_ai int
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai int
+vec_all_nge(vector float __a, vector float __b) {
+  int __cc;
+  __builtin_s390_vfchesbs(__a, __b, &__cc);
+  return __cc == 3;
+}
+#endif
+
+static inline __ATTRS_o_ai int
 vec_all_nge(vector double __a, vector double __b) {
   int __cc;
   __builtin_s390_vfchedbs(__a, __b, &__cc);
@@ -3348,7 +3865,16 @@ vec_all_nge(vector double __a, vector double __b) {
 
 /*-- vec_all_ngt ------------------------------------------------------------*/
 
-static inline __ATTRS_ai int
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai int
+vec_all_ngt(vector float __a, vector float __b) {
+  int __cc;
+  __builtin_s390_vfchsbs(__a, __b, &__cc);
+  return __cc == 3;
+}
+#endif
+
+static inline __ATTRS_o_ai int
 vec_all_ngt(vector double __a, vector double __b) {
   int __cc;
   __builtin_s390_vfchdbs(__a, __b, &__cc);
@@ -3357,7 +3883,16 @@ vec_all_ngt(vector double __a, vector double __b) {
 
 /*-- vec_all_nle ------------------------------------------------------------*/
 
-static inline __ATTRS_ai int
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai int
+vec_all_nle(vector float __a, vector float __b) {
+  int __cc;
+  __builtin_s390_vfchesbs(__b, __a, &__cc);
+  return __cc == 3;
+}
+#endif
+
+static inline __ATTRS_o_ai int
 vec_all_nle(vector double __a, vector double __b) {
   int __cc;
   __builtin_s390_vfchedbs(__b, __a, &__cc);
@@ -3366,7 +3901,16 @@ vec_all_nle(vector double __a, vector double __b) {
 
 /*-- vec_all_nlt ------------------------------------------------------------*/
 
-static inline __ATTRS_ai int
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai int
+vec_all_nlt(vector float __a, vector float __b) {
+  int __cc;
+  __builtin_s390_vfchsbs(__b, __a, &__cc);
+  return __cc == 3;
+}
+#endif
+
+static inline __ATTRS_o_ai int
 vec_all_nlt(vector double __a, vector double __b) {
   int __cc;
   __builtin_s390_vfchdbs(__b, __a, &__cc);
@@ -3375,7 +3919,16 @@ vec_all_nlt(vector double __a, vector double __b) {
 
 /*-- vec_all_nan ------------------------------------------------------------*/
 
-static inline __ATTRS_ai int
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai int
+vec_all_nan(vector float __a) {
+  int __cc;
+  __builtin_s390_vftcisb(__a, 15, &__cc);
+  return __cc == 0;
+}
+#endif
+
+static inline __ATTRS_o_ai int
 vec_all_nan(vector double __a) {
   int __cc;
   __builtin_s390_vftcidb(__a, 15, &__cc);
@@ -3384,7 +3937,16 @@ vec_all_nan(vector double __a) {
 
 /*-- vec_all_numeric --------------------------------------------------------*/
 
-static inline __ATTRS_ai int
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai int
+vec_all_numeric(vector float __a) {
+  int __cc;
+  __builtin_s390_vftcisb(__a, 15, &__cc);
+  return __cc == 3;
+}
+#endif
+
+static inline __ATTRS_o_ai int
 vec_all_numeric(vector double __a) {
   int __cc;
   __builtin_s390_vftcidb(__a, 15, &__cc);
@@ -3400,6 +3962,7 @@ vec_any_eq(vector signed char __a, vector signed char __b) {
   return __cc <= 1;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_eq(vector signed char __a, vector bool char __b) {
   int __cc;
@@ -3407,6 +3970,7 @@ vec_any_eq(vector signed char __a, vector bool char __b) {
   return __cc <= 1;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_eq(vector bool char __a, vector signed char __b) {
   int __cc;
@@ -3422,6 +3986,7 @@ vec_any_eq(vector unsigned char __a, vector unsigned char __b) {
   return __cc <= 1;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_eq(vector unsigned char __a, vector bool char __b) {
   int __cc;
@@ -3430,6 +3995,7 @@ vec_any_eq(vector unsigned char __a, vector bool char __b) {
   return __cc <= 1;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_eq(vector bool char __a, vector unsigned char __b) {
   int __cc;
@@ -3453,6 +4019,7 @@ vec_any_eq(vector signed short __a, vector signed short __b) {
   return __cc <= 1;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_eq(vector signed short __a, vector bool short __b) {
   int __cc;
@@ -3460,6 +4027,7 @@ vec_any_eq(vector signed short __a, vector bool short __b) {
   return __cc <= 1;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_eq(vector bool short __a, vector signed short __b) {
   int __cc;
@@ -3475,6 +4043,7 @@ vec_any_eq(vector unsigned short __a, vector unsigned short __b) {
   return __cc <= 1;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_eq(vector unsigned short __a, vector bool short __b) {
   int __cc;
@@ -3483,6 +4052,7 @@ vec_any_eq(vector unsigned short __a, vector bool short __b) {
   return __cc <= 1;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_eq(vector bool short __a, vector unsigned short __b) {
   int __cc;
@@ -3506,6 +4076,7 @@ vec_any_eq(vector signed int __a, vector signed int __b) {
   return __cc <= 1;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_eq(vector signed int __a, vector bool int __b) {
   int __cc;
@@ -3513,6 +4084,7 @@ vec_any_eq(vector signed int __a, vector bool int __b) {
   return __cc <= 1;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_eq(vector bool int __a, vector signed int __b) {
   int __cc;
@@ -3528,6 +4100,7 @@ vec_any_eq(vector unsigned int __a, vector unsigned int __b) {
   return __cc <= 1;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_eq(vector unsigned int __a, vector bool int __b) {
   int __cc;
@@ -3536,6 +4109,7 @@ vec_any_eq(vector unsigned int __a, vector bool int __b) {
   return __cc <= 1;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_eq(vector bool int __a, vector unsigned int __b) {
   int __cc;
@@ -3559,6 +4133,7 @@ vec_any_eq(vector signed long long __a, vector signed long long __b) {
   return __cc <= 1;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_eq(vector signed long long __a, vector bool long long __b) {
   int __cc;
@@ -3566,6 +4141,7 @@ vec_any_eq(vector signed long long __a, vector bool long long __b) {
   return __cc <= 1;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_eq(vector bool long long __a, vector signed long long __b) {
   int __cc;
@@ -3581,6 +4157,7 @@ vec_any_eq(vector unsigned long long __a, vector unsigned long long __b) {
   return __cc <= 1;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_eq(vector unsigned long long __a, vector bool long long __b) {
   int __cc;
@@ -3589,6 +4166,7 @@ vec_any_eq(vector unsigned long long __a, vector bool long long __b) {
   return __cc <= 1;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_eq(vector bool long long __a, vector unsigned long long __b) {
   int __cc;
@@ -3605,6 +4183,15 @@ vec_any_eq(vector bool long long __a, vector bool long long __b) {
   return __cc <= 1;
 }
 
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai int
+vec_any_eq(vector float __a, vector float __b) {
+  int __cc;
+  __builtin_s390_vfcesbs(__a, __b, &__cc);
+  return __cc <= 1;
+}
+#endif
+
 static inline __ATTRS_o_ai int
 vec_any_eq(vector double __a, vector double __b) {
   int __cc;
@@ -3621,6 +4208,7 @@ vec_any_ne(vector signed char __a, vector signed char __b) {
   return __cc != 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_ne(vector signed char __a, vector bool char __b) {
   int __cc;
@@ -3628,6 +4216,7 @@ vec_any_ne(vector signed char __a, vector bool char __b) {
   return __cc != 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_ne(vector bool char __a, vector signed char __b) {
   int __cc;
@@ -3643,6 +4232,7 @@ vec_any_ne(vector unsigned char __a, vector unsigned char __b) {
   return __cc != 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_ne(vector unsigned char __a, vector bool char __b) {
   int __cc;
@@ -3651,6 +4241,7 @@ vec_any_ne(vector unsigned char __a, vector bool char __b) {
   return __cc != 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_ne(vector bool char __a, vector unsigned char __b) {
   int __cc;
@@ -3674,6 +4265,7 @@ vec_any_ne(vector signed short __a, vector signed short __b) {
   return __cc != 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_ne(vector signed short __a, vector bool short __b) {
   int __cc;
@@ -3681,6 +4273,7 @@ vec_any_ne(vector signed short __a, vector bool short __b) {
   return __cc != 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_ne(vector bool short __a, vector signed short __b) {
   int __cc;
@@ -3696,6 +4289,7 @@ vec_any_ne(vector unsigned short __a, vector unsigned short __b) {
   return __cc != 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_ne(vector unsigned short __a, vector bool short __b) {
   int __cc;
@@ -3704,6 +4298,7 @@ vec_any_ne(vector unsigned short __a, vector bool short __b) {
   return __cc != 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_ne(vector bool short __a, vector unsigned short __b) {
   int __cc;
@@ -3727,6 +4322,7 @@ vec_any_ne(vector signed int __a, vector signed int __b) {
   return __cc != 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_ne(vector signed int __a, vector bool int __b) {
   int __cc;
@@ -3734,6 +4330,7 @@ vec_any_ne(vector signed int __a, vector bool int __b) {
   return __cc != 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_ne(vector bool int __a, vector signed int __b) {
   int __cc;
@@ -3749,6 +4346,7 @@ vec_any_ne(vector unsigned int __a, vector unsigned int __b) {
   return __cc != 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_ne(vector unsigned int __a, vector bool int __b) {
   int __cc;
@@ -3757,6 +4355,7 @@ vec_any_ne(vector unsigned int __a, vector bool int __b) {
   return __cc != 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_ne(vector bool int __a, vector unsigned int __b) {
   int __cc;
@@ -3780,6 +4379,7 @@ vec_any_ne(vector signed long long __a, vector signed long long __b) {
   return __cc != 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_ne(vector signed long long __a, vector bool long long __b) {
   int __cc;
@@ -3787,6 +4387,7 @@ vec_any_ne(vector signed long long __a, vector bool long long __b) {
   return __cc != 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_ne(vector bool long long __a, vector signed long long __b) {
   int __cc;
@@ -3802,6 +4403,7 @@ vec_any_ne(vector unsigned long long __a, vector unsigned long long __b) {
   return __cc != 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_ne(vector unsigned long long __a, vector bool long long __b) {
   int __cc;
@@ -3810,6 +4412,7 @@ vec_any_ne(vector unsigned long long __a, vector bool long long __b) {
   return __cc != 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_ne(vector bool long long __a, vector unsigned long long __b) {
   int __cc;
@@ -3826,6 +4429,15 @@ vec_any_ne(vector bool long long __a, vector bool long long __b) {
   return __cc != 0;
 }
 
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai int
+vec_any_ne(vector float __a, vector float __b) {
+  int __cc;
+  __builtin_s390_vfcesbs(__a, __b, &__cc);
+  return __cc != 0;
+}
+#endif
+
 static inline __ATTRS_o_ai int
 vec_any_ne(vector double __a, vector double __b) {
   int __cc;
@@ -3842,6 +4454,7 @@ vec_any_ge(vector signed char __a, vector signed char __b) {
   return __cc != 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_ge(vector signed char __a, vector bool char __b) {
   int __cc;
@@ -3849,6 +4462,7 @@ vec_any_ge(vector signed char __a, vector bool char __b) {
   return __cc != 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_ge(vector bool char __a, vector signed char __b) {
   int __cc;
@@ -3863,6 +4477,7 @@ vec_any_ge(vector unsigned char __a, vector unsigned char __b) {
   return __cc != 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_ge(vector unsigned char __a, vector bool char __b) {
   int __cc;
@@ -3870,6 +4485,7 @@ vec_any_ge(vector unsigned char __a, vector bool char __b) {
   return __cc != 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_ge(vector bool char __a, vector unsigned char __b) {
   int __cc;
@@ -3877,6 +4493,7 @@ vec_any_ge(vector bool char __a, vector unsigned char __b) {
   return __cc != 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_ge(vector bool char __a, vector bool char __b) {
   int __cc;
@@ -3892,6 +4509,7 @@ vec_any_ge(vector signed short __a, vector signed short __b) {
   return __cc != 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_ge(vector signed short __a, vector bool short __b) {
   int __cc;
@@ -3899,6 +4517,7 @@ vec_any_ge(vector signed short __a, vector bool short __b) {
   return __cc != 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_ge(vector bool short __a, vector signed short __b) {
   int __cc;
@@ -3913,6 +4532,7 @@ vec_any_ge(vector unsigned short __a, vector unsigned short __b) {
   return __cc != 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_ge(vector unsigned short __a, vector bool short __b) {
   int __cc;
@@ -3920,6 +4540,7 @@ vec_any_ge(vector unsigned short __a, vector bool short __b) {
   return __cc != 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_ge(vector bool short __a, vector unsigned short __b) {
   int __cc;
@@ -3927,6 +4548,7 @@ vec_any_ge(vector bool short __a, vector unsigned short __b) {
   return __cc != 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_ge(vector bool short __a, vector bool short __b) {
   int __cc;
@@ -3942,6 +4564,7 @@ vec_any_ge(vector signed int __a, vector signed int __b) {
   return __cc != 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_ge(vector signed int __a, vector bool int __b) {
   int __cc;
@@ -3949,6 +4572,7 @@ vec_any_ge(vector signed int __a, vector bool int __b) {
   return __cc != 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_ge(vector bool int __a, vector signed int __b) {
   int __cc;
@@ -3963,6 +4587,7 @@ vec_any_ge(vector unsigned int __a, vector unsigned int __b) {
   return __cc != 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_ge(vector unsigned int __a, vector bool int __b) {
   int __cc;
@@ -3970,6 +4595,7 @@ vec_any_ge(vector unsigned int __a, vector bool int __b) {
   return __cc != 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_ge(vector bool int __a, vector unsigned int __b) {
   int __cc;
@@ -3977,6 +4603,7 @@ vec_any_ge(vector bool int __a, vector unsigned int __b) {
   return __cc != 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_ge(vector bool int __a, vector bool int __b) {
   int __cc;
@@ -3992,6 +4619,7 @@ vec_any_ge(vector signed long long __a, vector signed long long __b) {
   return __cc != 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_ge(vector signed long long __a, vector bool long long __b) {
   int __cc;
@@ -3999,6 +4627,7 @@ vec_any_ge(vector signed long long __a, vector bool long long __b) {
   return __cc != 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_ge(vector bool long long __a, vector signed long long __b) {
   int __cc;
@@ -4013,6 +4642,7 @@ vec_any_ge(vector unsigned long long __a, vector unsigned long long __b) {
   return __cc != 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_ge(vector unsigned long long __a, vector bool long long __b) {
   int __cc;
@@ -4020,6 +4650,7 @@ vec_any_ge(vector unsigned long long __a, vector bool long long __b) {
   return __cc != 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_ge(vector bool long long __a, vector unsigned long long __b) {
   int __cc;
@@ -4027,6 +4658,7 @@ vec_any_ge(vector bool long long __a, vector unsigned long long __b) {
   return __cc != 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_ge(vector bool long long __a, vector bool long long __b) {
   int __cc;
@@ -4035,6 +4667,15 @@ vec_any_ge(vector bool long long __a, vector bool long long __b) {
   return __cc != 0;
 }
 
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai int
+vec_any_ge(vector float __a, vector float __b) {
+  int __cc;
+  __builtin_s390_vfchesbs(__a, __b, &__cc);
+  return __cc <= 1;
+}
+#endif
+
 static inline __ATTRS_o_ai int
 vec_any_ge(vector double __a, vector double __b) {
   int __cc;
@@ -4051,6 +4692,7 @@ vec_any_gt(vector signed char __a, vector signed char __b) {
   return __cc <= 1;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_gt(vector signed char __a, vector bool char __b) {
   int __cc;
@@ -4058,6 +4700,7 @@ vec_any_gt(vector signed char __a, vector bool char __b) {
   return __cc <= 1;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_gt(vector bool char __a, vector signed char __b) {
   int __cc;
@@ -4072,6 +4715,7 @@ vec_any_gt(vector unsigned char __a, vector unsigned char __b) {
   return __cc <= 1;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_gt(vector unsigned char __a, vector bool char __b) {
   int __cc;
@@ -4079,6 +4723,7 @@ vec_any_gt(vector unsigned char __a, vector bool char __b) {
   return __cc <= 1;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_gt(vector bool char __a, vector unsigned char __b) {
   int __cc;
@@ -4086,6 +4731,7 @@ vec_any_gt(vector bool char __a, vector unsigned char __b) {
   return __cc <= 1;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_gt(vector bool char __a, vector bool char __b) {
   int __cc;
@@ -4101,6 +4747,7 @@ vec_any_gt(vector signed short __a, vector signed short __b) {
   return __cc <= 1;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_gt(vector signed short __a, vector bool short __b) {
   int __cc;
@@ -4108,6 +4755,7 @@ vec_any_gt(vector signed short __a, vector bool short __b) {
   return __cc <= 1;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_gt(vector bool short __a, vector signed short __b) {
   int __cc;
@@ -4122,6 +4770,7 @@ vec_any_gt(vector unsigned short __a, vector unsigned short __b) {
   return __cc <= 1;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_gt(vector unsigned short __a, vector bool short __b) {
   int __cc;
@@ -4129,6 +4778,7 @@ vec_any_gt(vector unsigned short __a, vector bool short __b) {
   return __cc <= 1;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_gt(vector bool short __a, vector unsigned short __b) {
   int __cc;
@@ -4136,6 +4786,7 @@ vec_any_gt(vector bool short __a, vector unsigned short __b) {
   return __cc <= 1;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_gt(vector bool short __a, vector bool short __b) {
   int __cc;
@@ -4151,6 +4802,7 @@ vec_any_gt(vector signed int __a, vector signed int __b) {
   return __cc <= 1;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_gt(vector signed int __a, vector bool int __b) {
   int __cc;
@@ -4158,6 +4810,7 @@ vec_any_gt(vector signed int __a, vector bool int __b) {
   return __cc <= 1;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_gt(vector bool int __a, vector signed int __b) {
   int __cc;
@@ -4172,6 +4825,7 @@ vec_any_gt(vector unsigned int __a, vector unsigned int __b) {
   return __cc <= 1;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_gt(vector unsigned int __a, vector bool int __b) {
   int __cc;
@@ -4179,6 +4833,7 @@ vec_any_gt(vector unsigned int __a, vector bool int __b) {
   return __cc <= 1;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_gt(vector bool int __a, vector unsigned int __b) {
   int __cc;
@@ -4186,6 +4841,7 @@ vec_any_gt(vector bool int __a, vector unsigned int __b) {
   return __cc <= 1;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_gt(vector bool int __a, vector bool int __b) {
   int __cc;
@@ -4201,6 +4857,7 @@ vec_any_gt(vector signed long long __a, vector signed long long __b) {
   return __cc <= 1;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_gt(vector signed long long __a, vector bool long long __b) {
   int __cc;
@@ -4208,6 +4865,7 @@ vec_any_gt(vector signed long long __a, vector bool long long __b) {
   return __cc <= 1;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_gt(vector bool long long __a, vector signed long long __b) {
   int __cc;
@@ -4222,6 +4880,7 @@ vec_any_gt(vector unsigned long long __a, vector unsigned long long __b) {
   return __cc <= 1;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_gt(vector unsigned long long __a, vector bool long long __b) {
   int __cc;
@@ -4229,6 +4888,7 @@ vec_any_gt(vector unsigned long long __a, vector bool long long __b) {
   return __cc <= 1;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_gt(vector bool long long __a, vector unsigned long long __b) {
   int __cc;
@@ -4236,6 +4896,7 @@ vec_any_gt(vector bool long long __a, vector unsigned long long __b) {
   return __cc <= 1;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_gt(vector bool long long __a, vector bool long long __b) {
   int __cc;
@@ -4244,6 +4905,15 @@ vec_any_gt(vector bool long long __a, vector bool long long __b) {
   return __cc <= 1;
 }
 
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai int
+vec_any_gt(vector float __a, vector float __b) {
+  int __cc;
+  __builtin_s390_vfchsbs(__a, __b, &__cc);
+  return __cc <= 1;
+}
+#endif
+
 static inline __ATTRS_o_ai int
 vec_any_gt(vector double __a, vector double __b) {
   int __cc;
@@ -4260,6 +4930,7 @@ vec_any_le(vector signed char __a, vector signed char __b) {
   return __cc != 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_le(vector signed char __a, vector bool char __b) {
   int __cc;
@@ -4267,6 +4938,7 @@ vec_any_le(vector signed char __a, vector bool char __b) {
   return __cc != 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_le(vector bool char __a, vector signed char __b) {
   int __cc;
@@ -4281,6 +4953,7 @@ vec_any_le(vector unsigned char __a, vector unsigned char __b) {
   return __cc != 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_le(vector unsigned char __a, vector bool char __b) {
   int __cc;
@@ -4288,6 +4961,7 @@ vec_any_le(vector unsigned char __a, vector bool char __b) {
   return __cc != 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_le(vector bool char __a, vector unsigned char __b) {
   int __cc;
@@ -4295,6 +4969,7 @@ vec_any_le(vector bool char __a, vector unsigned char __b) {
   return __cc != 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_le(vector bool char __a, vector bool char __b) {
   int __cc;
@@ -4310,6 +4985,7 @@ vec_any_le(vector signed short __a, vector signed short __b) {
   return __cc != 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_le(vector signed short __a, vector bool short __b) {
   int __cc;
@@ -4317,6 +4993,7 @@ vec_any_le(vector signed short __a, vector bool short __b) {
   return __cc != 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_le(vector bool short __a, vector signed short __b) {
   int __cc;
@@ -4331,6 +5008,7 @@ vec_any_le(vector unsigned short __a, vector unsigned short __b) {
   return __cc != 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_le(vector unsigned short __a, vector bool short __b) {
   int __cc;
@@ -4338,6 +5016,7 @@ vec_any_le(vector unsigned short __a, vector bool short __b) {
   return __cc != 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_le(vector bool short __a, vector unsigned short __b) {
   int __cc;
@@ -4345,6 +5024,7 @@ vec_any_le(vector bool short __a, vector unsigned short __b) {
   return __cc != 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_le(vector bool short __a, vector bool short __b) {
   int __cc;
@@ -4360,6 +5040,7 @@ vec_any_le(vector signed int __a, vector signed int __b) {
   return __cc != 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_le(vector signed int __a, vector bool int __b) {
   int __cc;
@@ -4367,6 +5048,7 @@ vec_any_le(vector signed int __a, vector bool int __b) {
   return __cc != 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_le(vector bool int __a, vector signed int __b) {
   int __cc;
@@ -4381,6 +5063,7 @@ vec_any_le(vector unsigned int __a, vector unsigned int __b) {
   return __cc != 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_le(vector unsigned int __a, vector bool int __b) {
   int __cc;
@@ -4388,6 +5071,7 @@ vec_any_le(vector unsigned int __a, vector bool int __b) {
   return __cc != 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_le(vector bool int __a, vector unsigned int __b) {
   int __cc;
@@ -4395,6 +5079,7 @@ vec_any_le(vector bool int __a, vector unsigned int __b) {
   return __cc != 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_le(vector bool int __a, vector bool int __b) {
   int __cc;
@@ -4410,6 +5095,7 @@ vec_any_le(vector signed long long __a, vector signed long long __b) {
   return __cc != 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_le(vector signed long long __a, vector bool long long __b) {
   int __cc;
@@ -4417,6 +5103,7 @@ vec_any_le(vector signed long long __a, vector bool long long __b) {
   return __cc != 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_le(vector bool long long __a, vector signed long long __b) {
   int __cc;
@@ -4431,6 +5118,7 @@ vec_any_le(vector unsigned long long __a, vector unsigned long long __b) {
   return __cc != 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_le(vector unsigned long long __a, vector bool long long __b) {
   int __cc;
@@ -4438,6 +5126,7 @@ vec_any_le(vector unsigned long long __a, vector bool long long __b) {
   return __cc != 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_le(vector bool long long __a, vector unsigned long long __b) {
   int __cc;
@@ -4445,6 +5134,7 @@ vec_any_le(vector bool long long __a, vector unsigned long long __b) {
   return __cc != 0;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_le(vector bool long long __a, vector bool long long __b) {
   int __cc;
@@ -4453,6 +5143,15 @@ vec_any_le(vector bool long long __a, vector bool long long __b) {
   return __cc != 0;
 }
 
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai int
+vec_any_le(vector float __a, vector float __b) {
+  int __cc;
+  __builtin_s390_vfchesbs(__b, __a, &__cc);
+  return __cc <= 1;
+}
+#endif
+
 static inline __ATTRS_o_ai int
 vec_any_le(vector double __a, vector double __b) {
   int __cc;
@@ -4469,6 +5168,7 @@ vec_any_lt(vector signed char __a, vector signed char __b) {
   return __cc <= 1;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_lt(vector signed char __a, vector bool char __b) {
   int __cc;
@@ -4476,6 +5176,7 @@ vec_any_lt(vector signed char __a, vector bool char __b) {
   return __cc <= 1;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_lt(vector bool char __a, vector signed char __b) {
   int __cc;
@@ -4490,6 +5191,7 @@ vec_any_lt(vector unsigned char __a, vector unsigned char __b) {
   return __cc <= 1;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_lt(vector unsigned char __a, vector bool char __b) {
   int __cc;
@@ -4497,6 +5199,7 @@ vec_any_lt(vector unsigned char __a, vector bool char __b) {
   return __cc <= 1;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_lt(vector bool char __a, vector unsigned char __b) {
   int __cc;
@@ -4504,6 +5207,7 @@ vec_any_lt(vector bool char __a, vector unsigned char __b) {
   return __cc <= 1;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_lt(vector bool char __a, vector bool char __b) {
   int __cc;
@@ -4519,6 +5223,7 @@ vec_any_lt(vector signed short __a, vector signed short __b) {
   return __cc <= 1;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_lt(vector signed short __a, vector bool short __b) {
   int __cc;
@@ -4526,6 +5231,7 @@ vec_any_lt(vector signed short __a, vector bool short __b) {
   return __cc <= 1;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_lt(vector bool short __a, vector signed short __b) {
   int __cc;
@@ -4540,6 +5246,7 @@ vec_any_lt(vector unsigned short __a, vector unsigned short __b) {
   return __cc <= 1;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_lt(vector unsigned short __a, vector bool short __b) {
   int __cc;
@@ -4547,6 +5254,7 @@ vec_any_lt(vector unsigned short __a, vector bool short __b) {
   return __cc <= 1;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_lt(vector bool short __a, vector unsigned short __b) {
   int __cc;
@@ -4554,6 +5262,7 @@ vec_any_lt(vector bool short __a, vector unsigned short __b) {
   return __cc <= 1;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_lt(vector bool short __a, vector bool short __b) {
   int __cc;
@@ -4569,6 +5278,7 @@ vec_any_lt(vector signed int __a, vector signed int __b) {
   return __cc <= 1;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_lt(vector signed int __a, vector bool int __b) {
   int __cc;
@@ -4576,6 +5286,7 @@ vec_any_lt(vector signed int __a, vector bool int __b) {
   return __cc <= 1;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_lt(vector bool int __a, vector signed int __b) {
   int __cc;
@@ -4590,6 +5301,7 @@ vec_any_lt(vector unsigned int __a, vector unsigned int __b) {
   return __cc <= 1;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_lt(vector unsigned int __a, vector bool int __b) {
   int __cc;
@@ -4597,6 +5309,7 @@ vec_any_lt(vector unsigned int __a, vector bool int __b) {
   return __cc <= 1;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_lt(vector bool int __a, vector unsigned int __b) {
   int __cc;
@@ -4604,6 +5317,7 @@ vec_any_lt(vector bool int __a, vector unsigned int __b) {
   return __cc <= 1;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_lt(vector bool int __a, vector bool int __b) {
   int __cc;
@@ -4619,6 +5333,7 @@ vec_any_lt(vector signed long long __a, vector signed long long __b) {
   return __cc <= 1;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_lt(vector signed long long __a, vector bool long long __b) {
   int __cc;
@@ -4626,6 +5341,7 @@ vec_any_lt(vector signed long long __a, vector bool long long __b) {
   return __cc <= 1;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_lt(vector bool long long __a, vector signed long long __b) {
   int __cc;
@@ -4640,6 +5356,7 @@ vec_any_lt(vector unsigned long long __a, vector unsigned long long __b) {
   return __cc <= 1;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_lt(vector unsigned long long __a, vector bool long long __b) {
   int __cc;
@@ -4647,6 +5364,7 @@ vec_any_lt(vector unsigned long long __a, vector bool long long __b) {
   return __cc <= 1;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_lt(vector bool long long __a, vector unsigned long long __b) {
   int __cc;
@@ -4654,6 +5372,7 @@ vec_any_lt(vector bool long long __a, vector unsigned long long __b) {
   return __cc <= 1;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai int
 vec_any_lt(vector bool long long __a, vector bool long long __b) {
   int __cc;
@@ -4662,6 +5381,15 @@ vec_any_lt(vector bool long long __a, vector bool long long __b) {
   return __cc <= 1;
 }
 
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai int
+vec_any_lt(vector float __a, vector float __b) {
+  int __cc;
+  __builtin_s390_vfchsbs(__b, __a, &__cc);
+  return __cc <= 1;
+}
+#endif
+
 static inline __ATTRS_o_ai int
 vec_any_lt(vector double __a, vector double __b) {
   int __cc;
@@ -4671,7 +5399,16 @@ vec_any_lt(vector double __a, vector double __b) {
 
 /*-- vec_any_nge ------------------------------------------------------------*/
 
-static inline __ATTRS_ai int
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai int
+vec_any_nge(vector float __a, vector float __b) {
+  int __cc;
+  __builtin_s390_vfchesbs(__a, __b, &__cc);
+  return __cc != 0;
+}
+#endif
+
+static inline __ATTRS_o_ai int
 vec_any_nge(vector double __a, vector double __b) {
   int __cc;
   __builtin_s390_vfchedbs(__a, __b, &__cc);
@@ -4680,7 +5417,16 @@ vec_any_nge(vector double __a, vector double __b) {
 
 /*-- vec_any_ngt ------------------------------------------------------------*/
 
-static inline __ATTRS_ai int
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai int
+vec_any_ngt(vector float __a, vector float __b) {
+  int __cc;
+  __builtin_s390_vfchsbs(__a, __b, &__cc);
+  return __cc != 0;
+}
+#endif
+
+static inline __ATTRS_o_ai int
 vec_any_ngt(vector double __a, vector double __b) {
   int __cc;
   __builtin_s390_vfchdbs(__a, __b, &__cc);
@@ -4689,7 +5435,16 @@ vec_any_ngt(vector double __a, vector double __b) {
 
 /*-- vec_any_nle ------------------------------------------------------------*/
 
-static inline __ATTRS_ai int
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai int
+vec_any_nle(vector float __a, vector float __b) {
+  int __cc;
+  __builtin_s390_vfchesbs(__b, __a, &__cc);
+  return __cc != 0;
+}
+#endif
+
+static inline __ATTRS_o_ai int
 vec_any_nle(vector double __a, vector double __b) {
   int __cc;
   __builtin_s390_vfchedbs(__b, __a, &__cc);
@@ -4698,7 +5453,16 @@ vec_any_nle(vector double __a, vector double __b) {
 
 /*-- vec_any_nlt ------------------------------------------------------------*/
 
-static inline __ATTRS_ai int
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai int
+vec_any_nlt(vector float __a, vector float __b) {
+  int __cc;
+  __builtin_s390_vfchsbs(__b, __a, &__cc);
+  return __cc != 0;
+}
+#endif
+
+static inline __ATTRS_o_ai int
 vec_any_nlt(vector double __a, vector double __b) {
   int __cc;
   __builtin_s390_vfchdbs(__b, __a, &__cc);
@@ -4707,7 +5471,16 @@ vec_any_nlt(vector double __a, vector double __b) {
 
 /*-- vec_any_nan ------------------------------------------------------------*/
 
-static inline __ATTRS_ai int
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai int
+vec_any_nan(vector float __a) {
+  int __cc;
+  __builtin_s390_vftcisb(__a, 15, &__cc);
+  return __cc != 3;
+}
+#endif
+
+static inline __ATTRS_o_ai int
 vec_any_nan(vector double __a) {
   int __cc;
   __builtin_s390_vftcidb(__a, 15, &__cc);
@@ -4716,7 +5489,16 @@ vec_any_nan(vector double __a) {
 
 /*-- vec_any_numeric --------------------------------------------------------*/
 
-static inline __ATTRS_ai int
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai int
+vec_any_numeric(vector float __a) {
+  int __cc;
+  __builtin_s390_vftcisb(__a, 15, &__cc);
+  return __cc != 0;
+}
+#endif
+
+static inline __ATTRS_o_ai int
 vec_any_numeric(vector double __a) {
   int __cc;
   __builtin_s390_vftcidb(__a, 15, &__cc);
@@ -4735,11 +5517,13 @@ vec_andc(vector signed char __a, vector signed char __b) {
   return __a & ~__b;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed char
 vec_andc(vector bool char __a, vector signed char __b) {
   return __a & ~__b;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed char
 vec_andc(vector signed char __a, vector bool char __b) {
   return __a & ~__b;
@@ -4750,11 +5534,13 @@ vec_andc(vector unsigned char __a, vector unsigned char __b) {
   return __a & ~__b;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned char
 vec_andc(vector bool char __a, vector unsigned char __b) {
   return __a & ~__b;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned char
 vec_andc(vector unsigned char __a, vector bool char __b) {
   return __a & ~__b;
@@ -4770,11 +5556,13 @@ vec_andc(vector signed short __a, vector signed short __b) {
   return __a & ~__b;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed short
 vec_andc(vector bool short __a, vector signed short __b) {
   return __a & ~__b;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed short
 vec_andc(vector signed short __a, vector bool short __b) {
   return __a & ~__b;
@@ -4785,11 +5573,13 @@ vec_andc(vector unsigned short __a, vector unsigned short __b) {
   return __a & ~__b;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned short
 vec_andc(vector bool short __a, vector unsigned short __b) {
   return __a & ~__b;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned short
 vec_andc(vector unsigned short __a, vector bool short __b) {
   return __a & ~__b;
@@ -4805,11 +5595,13 @@ vec_andc(vector signed int __a, vector signed int __b) {
   return __a & ~__b;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed int
 vec_andc(vector bool int __a, vector signed int __b) {
   return __a & ~__b;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed int
 vec_andc(vector signed int __a, vector bool int __b) {
   return __a & ~__b;
@@ -4820,11 +5612,13 @@ vec_andc(vector unsigned int __a, vector unsigned int __b) {
   return __a & ~__b;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned int
 vec_andc(vector bool int __a, vector unsigned int __b) {
   return __a & ~__b;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned int
 vec_andc(vector unsigned int __a, vector bool int __b) {
   return __a & ~__b;
@@ -4840,11 +5634,13 @@ vec_andc(vector signed long long __a, vector signed long long __b) {
   return __a & ~__b;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed long long
 vec_andc(vector bool long long __a, vector signed long long __b) {
   return __a & ~__b;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed long long
 vec_andc(vector signed long long __a, vector bool long long __b) {
   return __a & ~__b;
@@ -4855,28 +5651,40 @@ vec_andc(vector unsigned long long __a, vector unsigned long long __b) {
   return __a & ~__b;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned long long
 vec_andc(vector bool long long __a, vector unsigned long long __b) {
   return __a & ~__b;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned long long
 vec_andc(vector unsigned long long __a, vector bool long long __b) {
   return __a & ~__b;
 }
 
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai vector float
+vec_andc(vector float __a, vector float __b) {
+  return (vector float)((vector unsigned int)__a &
+                         ~(vector unsigned int)__b);
+}
+#endif
+
 static inline __ATTRS_o_ai vector double
 vec_andc(vector double __a, vector double __b) {
   return (vector double)((vector unsigned long long)__a &
                          ~(vector unsigned long long)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector double
 vec_andc(vector bool long long __a, vector double __b) {
   return (vector double)((vector unsigned long long)__a &
                          ~(vector unsigned long long)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector double
 vec_andc(vector double __a, vector bool long long __b) {
   return (vector double)((vector unsigned long long)__a &
@@ -4895,11 +5703,13 @@ vec_nor(vector signed char __a, vector signed char __b) {
   return ~(__a | __b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed char
 vec_nor(vector bool char __a, vector signed char __b) {
   return ~(__a | __b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed char
 vec_nor(vector signed char __a, vector bool char __b) {
   return ~(__a | __b);
@@ -4910,11 +5720,13 @@ vec_nor(vector unsigned char __a, vector unsigned char __b) {
   return ~(__a | __b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned char
 vec_nor(vector bool char __a, vector unsigned char __b) {
   return ~(__a | __b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned char
 vec_nor(vector unsigned char __a, vector bool char __b) {
   return ~(__a | __b);
@@ -4930,11 +5742,13 @@ vec_nor(vector signed short __a, vector signed short __b) {
   return ~(__a | __b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed short
 vec_nor(vector bool short __a, vector signed short __b) {
   return ~(__a | __b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed short
 vec_nor(vector signed short __a, vector bool short __b) {
   return ~(__a | __b);
@@ -4945,11 +5759,13 @@ vec_nor(vector unsigned short __a, vector unsigned short __b) {
   return ~(__a | __b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned short
 vec_nor(vector bool short __a, vector unsigned short __b) {
   return ~(__a | __b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned short
 vec_nor(vector unsigned short __a, vector bool short __b) {
   return ~(__a | __b);
@@ -4965,11 +5781,13 @@ vec_nor(vector signed int __a, vector signed int __b) {
   return ~(__a | __b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed int
 vec_nor(vector bool int __a, vector signed int __b) {
   return ~(__a | __b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed int
 vec_nor(vector signed int __a, vector bool int __b) {
   return ~(__a | __b);
@@ -4980,11 +5798,13 @@ vec_nor(vector unsigned int __a, vector unsigned int __b) {
   return ~(__a | __b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned int
 vec_nor(vector bool int __a, vector unsigned int __b) {
   return ~(__a | __b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned int
 vec_nor(vector unsigned int __a, vector bool int __b) {
   return ~(__a | __b);
@@ -5000,11 +5820,13 @@ vec_nor(vector signed long long __a, vector signed long long __b) {
   return ~(__a | __b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed long long
 vec_nor(vector bool long long __a, vector signed long long __b) {
   return ~(__a | __b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed long long
 vec_nor(vector signed long long __a, vector bool long long __b) {
   return ~(__a | __b);
@@ -5015,34 +5837,274 @@ vec_nor(vector unsigned long long __a, vector unsigned long long __b) {
   return ~(__a | __b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned long long
 vec_nor(vector bool long long __a, vector unsigned long long __b) {
   return ~(__a | __b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned long long
 vec_nor(vector unsigned long long __a, vector bool long long __b) {
   return ~(__a | __b);
 }
 
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai vector float
+vec_nor(vector float __a, vector float __b) {
+  return (vector float)~((vector unsigned int)__a |
+                         (vector unsigned int)__b);
+}
+#endif
+
 static inline __ATTRS_o_ai vector double
 vec_nor(vector double __a, vector double __b) {
   return (vector double)~((vector unsigned long long)__a |
                           (vector unsigned long long)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector double
 vec_nor(vector bool long long __a, vector double __b) {
   return (vector double)~((vector unsigned long long)__a |
                           (vector unsigned long long)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector double
 vec_nor(vector double __a, vector bool long long __b) {
   return (vector double)~((vector unsigned long long)__a |
                           (vector unsigned long long)__b);
 }
 
+/*-- vec_orc ----------------------------------------------------------------*/
+
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai vector bool char
+vec_orc(vector bool char __a, vector bool char __b) {
+  return __a | ~__b;
+}
+
+static inline __ATTRS_o_ai vector signed char
+vec_orc(vector signed char __a, vector signed char __b) {
+  return __a | ~__b;
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_orc(vector unsigned char __a, vector unsigned char __b) {
+  return __a | ~__b;
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_orc(vector bool short __a, vector bool short __b) {
+  return __a | ~__b;
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_orc(vector signed short __a, vector signed short __b) {
+  return __a | ~__b;
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_orc(vector unsigned short __a, vector unsigned short __b) {
+  return __a | ~__b;
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_orc(vector bool int __a, vector bool int __b) {
+  return __a | ~__b;
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_orc(vector signed int __a, vector signed int __b) {
+  return __a | ~__b;
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_orc(vector unsigned int __a, vector unsigned int __b) {
+  return __a | ~__b;
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_orc(vector bool long long __a, vector bool long long __b) {
+  return __a | ~__b;
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_orc(vector signed long long __a, vector signed long long __b) {
+  return __a | ~__b;
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_orc(vector unsigned long long __a, vector unsigned long long __b) {
+  return __a | ~__b;
+}
+
+static inline __ATTRS_o_ai vector float
+vec_orc(vector float __a, vector float __b) {
+  return (vector float)((vector unsigned int)__a &
+                        ~(vector unsigned int)__b);
+}
+
+static inline __ATTRS_o_ai vector double
+vec_orc(vector double __a, vector double __b) {
+  return (vector double)((vector unsigned long long)__a &
+                         ~(vector unsigned long long)__b);
+}
+#endif
+
+/*-- vec_nand ---------------------------------------------------------------*/
+
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai vector bool char
+vec_nand(vector bool char __a, vector bool char __b) {
+  return ~(__a & __b);
+}
+
+static inline __ATTRS_o_ai vector signed char
+vec_nand(vector signed char __a, vector signed char __b) {
+  return ~(__a & __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_nand(vector unsigned char __a, vector unsigned char __b) {
+  return ~(__a & __b);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_nand(vector bool short __a, vector bool short __b) {
+  return ~(__a & __b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_nand(vector signed short __a, vector signed short __b) {
+  return ~(__a & __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_nand(vector unsigned short __a, vector unsigned short __b) {
+  return ~(__a & __b);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_nand(vector bool int __a, vector bool int __b) {
+  return ~(__a & __b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_nand(vector signed int __a, vector signed int __b) {
+  return ~(__a & __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_nand(vector unsigned int __a, vector unsigned int __b) {
+  return ~(__a & __b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_nand(vector bool long long __a, vector bool long long __b) {
+  return ~(__a & __b);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_nand(vector signed long long __a, vector signed long long __b) {
+  return ~(__a & __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_nand(vector unsigned long long __a, vector unsigned long long __b) {
+  return ~(__a & __b);
+}
+
+static inline __ATTRS_o_ai vector float
+vec_nand(vector float __a, vector float __b) {
+  return (vector float)~((vector unsigned int)__a &
+                         (vector unsigned int)__b);
+}
+
+static inline __ATTRS_o_ai vector double
+vec_nand(vector double __a, vector double __b) {
+  return (vector double)~((vector unsigned long long)__a &
+                          (vector unsigned long long)__b);
+}
+#endif
+
+/*-- vec_eqv ----------------------------------------------------------------*/
+
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai vector bool char
+vec_eqv(vector bool char __a, vector bool char __b) {
+  return ~(__a ^ __b);
+}
+
+static inline __ATTRS_o_ai vector signed char
+vec_eqv(vector signed char __a, vector signed char __b) {
+  return ~(__a ^ __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned char
+vec_eqv(vector unsigned char __a, vector unsigned char __b) {
+  return ~(__a ^ __b);
+}
+
+static inline __ATTRS_o_ai vector bool short
+vec_eqv(vector bool short __a, vector bool short __b) {
+  return ~(__a ^ __b);
+}
+
+static inline __ATTRS_o_ai vector signed short
+vec_eqv(vector signed short __a, vector signed short __b) {
+  return ~(__a ^ __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned short
+vec_eqv(vector unsigned short __a, vector unsigned short __b) {
+  return ~(__a ^ __b);
+}
+
+static inline __ATTRS_o_ai vector bool int
+vec_eqv(vector bool int __a, vector bool int __b) {
+  return ~(__a ^ __b);
+}
+
+static inline __ATTRS_o_ai vector signed int
+vec_eqv(vector signed int __a, vector signed int __b) {
+  return ~(__a ^ __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned int
+vec_eqv(vector unsigned int __a, vector unsigned int __b) {
+  return ~(__a ^ __b);
+}
+
+static inline __ATTRS_o_ai vector bool long long
+vec_eqv(vector bool long long __a, vector bool long long __b) {
+  return ~(__a ^ __b);
+}
+
+static inline __ATTRS_o_ai vector signed long long
+vec_eqv(vector signed long long __a, vector signed long long __b) {
+  return ~(__a ^ __b);
+}
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_eqv(vector unsigned long long __a, vector unsigned long long __b) {
+  return ~(__a ^ __b);
+}
+
+static inline __ATTRS_o_ai vector float
+vec_eqv(vector float __a, vector float __b) {
+  return (vector float)~((vector unsigned int)__a ^
+                         (vector unsigned int)__b);
+}
+
+static inline __ATTRS_o_ai vector double
+vec_eqv(vector double __a, vector double __b) {
+  return (vector double)~((vector unsigned long long)__a ^
+                          (vector unsigned long long)__b);
+}
+#endif
+
 /*-- vec_cntlz --------------------------------------------------------------*/
 
 static inline __ATTRS_o_ai vector unsigned char
@@ -5323,30 +6385,35 @@ vec_sll(vector signed char __a, vector unsigned char __b) {
     (vector unsigned char)__a, __b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed char
 vec_sll(vector signed char __a, vector unsigned short __b) {
   return (vector signed char)__builtin_s390_vsl(
     (vector unsigned char)__a, (vector unsigned char)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed char
 vec_sll(vector signed char __a, vector unsigned int __b) {
   return (vector signed char)__builtin_s390_vsl(
     (vector unsigned char)__a, (vector unsigned char)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector bool char
 vec_sll(vector bool char __a, vector unsigned char __b) {
   return (vector bool char)__builtin_s390_vsl(
     (vector unsigned char)__a, __b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector bool char
 vec_sll(vector bool char __a, vector unsigned short __b) {
   return (vector bool char)__builtin_s390_vsl(
     (vector unsigned char)__a, (vector unsigned char)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector bool char
 vec_sll(vector bool char __a, vector unsigned int __b) {
   return (vector bool char)__builtin_s390_vsl(
@@ -5358,11 +6425,13 @@ vec_sll(vector unsigned char __a, vector unsigned char __b) {
   return __builtin_s390_vsl(__a, __b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned char
 vec_sll(vector unsigned char __a, vector unsigned short __b) {
   return __builtin_s390_vsl(__a, (vector unsigned char)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned char
 vec_sll(vector unsigned char __a, vector unsigned int __b) {
   return __builtin_s390_vsl(__a, (vector unsigned char)__b);
@@ -5374,30 +6443,35 @@ vec_sll(vector signed short __a, vector unsigned char __b) {
     (vector unsigned char)__a, __b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed short
 vec_sll(vector signed short __a, vector unsigned short __b) {
   return (vector signed short)__builtin_s390_vsl(
     (vector unsigned char)__a, (vector unsigned char)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed short
 vec_sll(vector signed short __a, vector unsigned int __b) {
   return (vector signed short)__builtin_s390_vsl(
     (vector unsigned char)__a, (vector unsigned char)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector bool short
 vec_sll(vector bool short __a, vector unsigned char __b) {
   return (vector bool short)__builtin_s390_vsl(
     (vector unsigned char)__a, __b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector bool short
 vec_sll(vector bool short __a, vector unsigned short __b) {
   return (vector bool short)__builtin_s390_vsl(
     (vector unsigned char)__a, (vector unsigned char)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector bool short
 vec_sll(vector bool short __a, vector unsigned int __b) {
   return (vector bool short)__builtin_s390_vsl(
@@ -5410,12 +6484,14 @@ vec_sll(vector unsigned short __a, vector unsigned char __b) {
     (vector unsigned char)__a, __b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned short
 vec_sll(vector unsigned short __a, vector unsigned short __b) {
   return (vector unsigned short)__builtin_s390_vsl(
     (vector unsigned char)__a, (vector unsigned char)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned short
 vec_sll(vector unsigned short __a, vector unsigned int __b) {
   return (vector unsigned short)__builtin_s390_vsl(
@@ -5428,30 +6504,35 @@ vec_sll(vector signed int __a, vector unsigned char __b) {
     (vector unsigned char)__a, __b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed int
 vec_sll(vector signed int __a, vector unsigned short __b) {
   return (vector signed int)__builtin_s390_vsl(
     (vector unsigned char)__a, (vector unsigned char)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed int
 vec_sll(vector signed int __a, vector unsigned int __b) {
   return (vector signed int)__builtin_s390_vsl(
     (vector unsigned char)__a, (vector unsigned char)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector bool int
 vec_sll(vector bool int __a, vector unsigned char __b) {
   return (vector bool int)__builtin_s390_vsl(
     (vector unsigned char)__a, __b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector bool int
 vec_sll(vector bool int __a, vector unsigned short __b) {
   return (vector bool int)__builtin_s390_vsl(
     (vector unsigned char)__a, (vector unsigned char)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector bool int
 vec_sll(vector bool int __a, vector unsigned int __b) {
   return (vector bool int)__builtin_s390_vsl(
@@ -5464,12 +6545,14 @@ vec_sll(vector unsigned int __a, vector unsigned char __b) {
     (vector unsigned char)__a, __b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned int
 vec_sll(vector unsigned int __a, vector unsigned short __b) {
   return (vector unsigned int)__builtin_s390_vsl(
     (vector unsigned char)__a, (vector unsigned char)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned int
 vec_sll(vector unsigned int __a, vector unsigned int __b) {
   return (vector unsigned int)__builtin_s390_vsl(
@@ -5482,30 +6565,35 @@ vec_sll(vector signed long long __a, vector unsigned char __b) {
     (vector unsigned char)__a, __b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed long long
 vec_sll(vector signed long long __a, vector unsigned short __b) {
   return (vector signed long long)__builtin_s390_vsl(
     (vector unsigned char)__a, (vector unsigned char)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed long long
 vec_sll(vector signed long long __a, vector unsigned int __b) {
   return (vector signed long long)__builtin_s390_vsl(
     (vector unsigned char)__a, (vector unsigned char)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector bool long long
 vec_sll(vector bool long long __a, vector unsigned char __b) {
   return (vector bool long long)__builtin_s390_vsl(
     (vector unsigned char)__a, __b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector bool long long
 vec_sll(vector bool long long __a, vector unsigned short __b) {
   return (vector bool long long)__builtin_s390_vsl(
     (vector unsigned char)__a, (vector unsigned char)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector bool long long
 vec_sll(vector bool long long __a, vector unsigned int __b) {
   return (vector bool long long)__builtin_s390_vsl(
@@ -5518,12 +6606,14 @@ vec_sll(vector unsigned long long __a, vector unsigned char __b) {
     (vector unsigned char)__a, __b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned long long
 vec_sll(vector unsigned long long __a, vector unsigned short __b) {
   return (vector unsigned long long)__builtin_s390_vsl(
     (vector unsigned char)__a, (vector unsigned char)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned long long
 vec_sll(vector unsigned long long __a, vector unsigned int __b) {
   return (vector unsigned long long)__builtin_s390_vsl(
@@ -5626,6 +6716,20 @@ vec_slb(vector unsigned long long __a, vector unsigned long long __b) {
     (vector unsigned char)__a, (vector unsigned char)__b);
 }
 
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai vector float
+vec_slb(vector float __a, vector signed int __b) {
+  return (vector float)__builtin_s390_vslb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector float
+vec_slb(vector float __a, vector unsigned int __b) {
+  return (vector float)__builtin_s390_vslb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+#endif
+
 static inline __ATTRS_o_ai vector double
 vec_slb(vector double __a, vector signed long long __b) {
   return (vector double)__builtin_s390_vslb(
@@ -5644,6 +6748,10 @@ extern __ATTRS_o vector signed char
 vec_sld(vector signed char __a, vector signed char __b, int __c)
   __constant_range(__c, 0, 15);
 
+extern __ATTRS_o vector bool char
+vec_sld(vector bool char __a, vector bool char __b, int __c)
+  __constant_range(__c, 0, 15);
+
 extern __ATTRS_o vector unsigned char
 vec_sld(vector unsigned char __a, vector unsigned char __b, int __c)
   __constant_range(__c, 0, 15);
@@ -5652,6 +6760,10 @@ extern __ATTRS_o vector signed short
 vec_sld(vector signed short __a, vector signed short __b, int __c)
   __constant_range(__c, 0, 15);
 
+extern __ATTRS_o vector bool short
+vec_sld(vector bool short __a, vector bool short __b, int __c)
+  __constant_range(__c, 0, 15);
+
 extern __ATTRS_o vector unsigned short
 vec_sld(vector unsigned short __a, vector unsigned short __b, int __c)
   __constant_range(__c, 0, 15);
@@ -5660,6 +6772,10 @@ extern __ATTRS_o vector signed int
 vec_sld(vector signed int __a, vector signed int __b, int __c)
   __constant_range(__c, 0, 15);
 
+extern __ATTRS_o vector bool int
+vec_sld(vector bool int __a, vector bool int __b, int __c)
+  __constant_range(__c, 0, 15);
+
 extern __ATTRS_o vector unsigned int
 vec_sld(vector unsigned int __a, vector unsigned int __b, int __c)
   __constant_range(__c, 0, 15);
@@ -5668,10 +6784,20 @@ extern __ATTRS_o vector signed long long
 vec_sld(vector signed long long __a, vector signed long long __b, int __c)
   __constant_range(__c, 0, 15);
 
+extern __ATTRS_o vector bool long long
+vec_sld(vector bool long long __a, vector bool long long __b, int __c)
+  __constant_range(__c, 0, 15);
+
 extern __ATTRS_o vector unsigned long long
 vec_sld(vector unsigned long long __a, vector unsigned long long __b, int __c)
   __constant_range(__c, 0, 15);
 
+#if __ARCH__ >= 12
+extern __ATTRS_o vector float
+vec_sld(vector float __a, vector float __b, int __c)
+  __constant_range(__c, 0, 15);
+#endif
+
 extern __ATTRS_o vector double
 vec_sld(vector double __a, vector double __b, int __c)
   __constant_range(__c, 0, 15);
@@ -5714,6 +6840,7 @@ extern __ATTRS_o vector unsigned long long
 vec_sldw(vector unsigned long long __a, vector unsigned long long __b, int __c)
   __constant_range(__c, 0, 3);
 
+// This prototype is deprecated.
 extern __ATTRS_o vector double
 vec_sldw(vector double __a, vector double __b, int __c)
   __constant_range(__c, 0, 3);
@@ -5730,30 +6857,35 @@ vec_sral(vector signed char __a, vector unsigned char __b) {
     (vector unsigned char)__a, __b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed char
 vec_sral(vector signed char __a, vector unsigned short __b) {
   return (vector signed char)__builtin_s390_vsra(
     (vector unsigned char)__a, (vector unsigned char)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed char
 vec_sral(vector signed char __a, vector unsigned int __b) {
   return (vector signed char)__builtin_s390_vsra(
     (vector unsigned char)__a, (vector unsigned char)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector bool char
 vec_sral(vector bool char __a, vector unsigned char __b) {
   return (vector bool char)__builtin_s390_vsra(
     (vector unsigned char)__a, __b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector bool char
 vec_sral(vector bool char __a, vector unsigned short __b) {
   return (vector bool char)__builtin_s390_vsra(
     (vector unsigned char)__a, (vector unsigned char)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector bool char
 vec_sral(vector bool char __a, vector unsigned int __b) {
   return (vector bool char)__builtin_s390_vsra(
@@ -5765,11 +6897,13 @@ vec_sral(vector unsigned char __a, vector unsigned char __b) {
   return __builtin_s390_vsra(__a, __b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned char
 vec_sral(vector unsigned char __a, vector unsigned short __b) {
   return __builtin_s390_vsra(__a, (vector unsigned char)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned char
 vec_sral(vector unsigned char __a, vector unsigned int __b) {
   return __builtin_s390_vsra(__a, (vector unsigned char)__b);
@@ -5781,30 +6915,35 @@ vec_sral(vector signed short __a, vector unsigned char __b) {
     (vector unsigned char)__a, __b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed short
 vec_sral(vector signed short __a, vector unsigned short __b) {
   return (vector signed short)__builtin_s390_vsra(
     (vector unsigned char)__a, (vector unsigned char)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed short
 vec_sral(vector signed short __a, vector unsigned int __b) {
   return (vector signed short)__builtin_s390_vsra(
     (vector unsigned char)__a, (vector unsigned char)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector bool short
 vec_sral(vector bool short __a, vector unsigned char __b) {
   return (vector bool short)__builtin_s390_vsra(
     (vector unsigned char)__a, __b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector bool short
 vec_sral(vector bool short __a, vector unsigned short __b) {
   return (vector bool short)__builtin_s390_vsra(
     (vector unsigned char)__a, (vector unsigned char)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector bool short
 vec_sral(vector bool short __a, vector unsigned int __b) {
   return (vector bool short)__builtin_s390_vsra(
@@ -5817,12 +6956,14 @@ vec_sral(vector unsigned short __a, vector unsigned char __b) {
     (vector unsigned char)__a, __b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned short
 vec_sral(vector unsigned short __a, vector unsigned short __b) {
   return (vector unsigned short)__builtin_s390_vsra(
     (vector unsigned char)__a, (vector unsigned char)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned short
 vec_sral(vector unsigned short __a, vector unsigned int __b) {
   return (vector unsigned short)__builtin_s390_vsra(
@@ -5835,30 +6976,35 @@ vec_sral(vector signed int __a, vector unsigned char __b) {
     (vector unsigned char)__a, __b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed int
 vec_sral(vector signed int __a, vector unsigned short __b) {
   return (vector signed int)__builtin_s390_vsra(
     (vector unsigned char)__a, (vector unsigned char)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed int
 vec_sral(vector signed int __a, vector unsigned int __b) {
   return (vector signed int)__builtin_s390_vsra(
     (vector unsigned char)__a, (vector unsigned char)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector bool int
 vec_sral(vector bool int __a, vector unsigned char __b) {
   return (vector bool int)__builtin_s390_vsra(
     (vector unsigned char)__a, __b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector bool int
 vec_sral(vector bool int __a, vector unsigned short __b) {
   return (vector bool int)__builtin_s390_vsra(
     (vector unsigned char)__a, (vector unsigned char)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector bool int
 vec_sral(vector bool int __a, vector unsigned int __b) {
   return (vector bool int)__builtin_s390_vsra(
@@ -5871,12 +7017,14 @@ vec_sral(vector unsigned int __a, vector unsigned char __b) {
     (vector unsigned char)__a, __b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned int
 vec_sral(vector unsigned int __a, vector unsigned short __b) {
   return (vector unsigned int)__builtin_s390_vsra(
     (vector unsigned char)__a, (vector unsigned char)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned int
 vec_sral(vector unsigned int __a, vector unsigned int __b) {
   return (vector unsigned int)__builtin_s390_vsra(
@@ -5889,30 +7037,35 @@ vec_sral(vector signed long long __a, vector unsigned char __b) {
     (vector unsigned char)__a, __b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed long long
 vec_sral(vector signed long long __a, vector unsigned short __b) {
   return (vector signed long long)__builtin_s390_vsra(
     (vector unsigned char)__a, (vector unsigned char)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed long long
 vec_sral(vector signed long long __a, vector unsigned int __b) {
   return (vector signed long long)__builtin_s390_vsra(
     (vector unsigned char)__a, (vector unsigned char)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector bool long long
 vec_sral(vector bool long long __a, vector unsigned char __b) {
   return (vector bool long long)__builtin_s390_vsra(
     (vector unsigned char)__a, __b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector bool long long
 vec_sral(vector bool long long __a, vector unsigned short __b) {
   return (vector bool long long)__builtin_s390_vsra(
     (vector unsigned char)__a, (vector unsigned char)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector bool long long
 vec_sral(vector bool long long __a, vector unsigned int __b) {
   return (vector bool long long)__builtin_s390_vsra(
@@ -5925,12 +7078,14 @@ vec_sral(vector unsigned long long __a, vector unsigned char __b) {
     (vector unsigned char)__a, __b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned long long
 vec_sral(vector unsigned long long __a, vector unsigned short __b) {
   return (vector unsigned long long)__builtin_s390_vsra(
     (vector unsigned char)__a, (vector unsigned char)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned long long
 vec_sral(vector unsigned long long __a, vector unsigned int __b) {
   return (vector unsigned long long)__builtin_s390_vsra(
@@ -6033,6 +7188,20 @@ vec_srab(vector unsigned long long __a, vector unsigned long long __b) {
     (vector unsigned char)__a, (vector unsigned char)__b);
 }
 
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai vector float
+vec_srab(vector float __a, vector signed int __b) {
+  return (vector float)__builtin_s390_vsrab(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector float
+vec_srab(vector float __a, vector unsigned int __b) {
+  return (vector float)__builtin_s390_vsrab(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+#endif
+
 static inline __ATTRS_o_ai vector double
 vec_srab(vector double __a, vector signed long long __b) {
   return (vector double)__builtin_s390_vsrab(
@@ -6053,30 +7222,35 @@ vec_srl(vector signed char __a, vector unsigned char __b) {
     (vector unsigned char)__a, __b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed char
 vec_srl(vector signed char __a, vector unsigned short __b) {
   return (vector signed char)__builtin_s390_vsrl(
     (vector unsigned char)__a, (vector unsigned char)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed char
 vec_srl(vector signed char __a, vector unsigned int __b) {
   return (vector signed char)__builtin_s390_vsrl(
     (vector unsigned char)__a, (vector unsigned char)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector bool char
 vec_srl(vector bool char __a, vector unsigned char __b) {
   return (vector bool char)__builtin_s390_vsrl(
     (vector unsigned char)__a, __b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector bool char
 vec_srl(vector bool char __a, vector unsigned short __b) {
   return (vector bool char)__builtin_s390_vsrl(
     (vector unsigned char)__a, (vector unsigned char)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector bool char
 vec_srl(vector bool char __a, vector unsigned int __b) {
   return (vector bool char)__builtin_s390_vsrl(
@@ -6088,11 +7262,13 @@ vec_srl(vector unsigned char __a, vector unsigned char __b) {
   return __builtin_s390_vsrl(__a, __b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned char
 vec_srl(vector unsigned char __a, vector unsigned short __b) {
   return __builtin_s390_vsrl(__a, (vector unsigned char)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned char
 vec_srl(vector unsigned char __a, vector unsigned int __b) {
   return __builtin_s390_vsrl(__a, (vector unsigned char)__b);
@@ -6104,30 +7280,35 @@ vec_srl(vector signed short __a, vector unsigned char __b) {
     (vector unsigned char)__a, __b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed short
 vec_srl(vector signed short __a, vector unsigned short __b) {
   return (vector signed short)__builtin_s390_vsrl(
     (vector unsigned char)__a, (vector unsigned char)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed short
 vec_srl(vector signed short __a, vector unsigned int __b) {
   return (vector signed short)__builtin_s390_vsrl(
     (vector unsigned char)__a, (vector unsigned char)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector bool short
 vec_srl(vector bool short __a, vector unsigned char __b) {
   return (vector bool short)__builtin_s390_vsrl(
     (vector unsigned char)__a, __b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector bool short
 vec_srl(vector bool short __a, vector unsigned short __b) {
   return (vector bool short)__builtin_s390_vsrl(
     (vector unsigned char)__a, (vector unsigned char)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector bool short
 vec_srl(vector bool short __a, vector unsigned int __b) {
   return (vector bool short)__builtin_s390_vsrl(
@@ -6140,12 +7321,14 @@ vec_srl(vector unsigned short __a, vector unsigned char __b) {
     (vector unsigned char)__a, __b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned short
 vec_srl(vector unsigned short __a, vector unsigned short __b) {
   return (vector unsigned short)__builtin_s390_vsrl(
     (vector unsigned char)__a, (vector unsigned char)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned short
 vec_srl(vector unsigned short __a, vector unsigned int __b) {
   return (vector unsigned short)__builtin_s390_vsrl(
@@ -6158,30 +7341,35 @@ vec_srl(vector signed int __a, vector unsigned char __b) {
     (vector unsigned char)__a, __b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed int
 vec_srl(vector signed int __a, vector unsigned short __b) {
   return (vector signed int)__builtin_s390_vsrl(
     (vector unsigned char)__a, (vector unsigned char)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed int
 vec_srl(vector signed int __a, vector unsigned int __b) {
   return (vector signed int)__builtin_s390_vsrl(
     (vector unsigned char)__a, (vector unsigned char)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector bool int
 vec_srl(vector bool int __a, vector unsigned char __b) {
   return (vector bool int)__builtin_s390_vsrl(
     (vector unsigned char)__a, __b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector bool int
 vec_srl(vector bool int __a, vector unsigned short __b) {
   return (vector bool int)__builtin_s390_vsrl(
     (vector unsigned char)__a, (vector unsigned char)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector bool int
 vec_srl(vector bool int __a, vector unsigned int __b) {
   return (vector bool int)__builtin_s390_vsrl(
@@ -6194,12 +7382,14 @@ vec_srl(vector unsigned int __a, vector unsigned char __b) {
     (vector unsigned char)__a, __b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned int
 vec_srl(vector unsigned int __a, vector unsigned short __b) {
   return (vector unsigned int)__builtin_s390_vsrl(
     (vector unsigned char)__a, (vector unsigned char)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned int
 vec_srl(vector unsigned int __a, vector unsigned int __b) {
   return (vector unsigned int)__builtin_s390_vsrl(
@@ -6212,30 +7402,35 @@ vec_srl(vector signed long long __a, vector unsigned char __b) {
     (vector unsigned char)__a, __b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed long long
 vec_srl(vector signed long long __a, vector unsigned short __b) {
   return (vector signed long long)__builtin_s390_vsrl(
     (vector unsigned char)__a, (vector unsigned char)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed long long
 vec_srl(vector signed long long __a, vector unsigned int __b) {
   return (vector signed long long)__builtin_s390_vsrl(
     (vector unsigned char)__a, (vector unsigned char)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector bool long long
 vec_srl(vector bool long long __a, vector unsigned char __b) {
   return (vector bool long long)__builtin_s390_vsrl(
     (vector unsigned char)__a, __b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector bool long long
 vec_srl(vector bool long long __a, vector unsigned short __b) {
   return (vector bool long long)__builtin_s390_vsrl(
     (vector unsigned char)__a, (vector unsigned char)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector bool long long
 vec_srl(vector bool long long __a, vector unsigned int __b) {
   return (vector bool long long)__builtin_s390_vsrl(
@@ -6248,12 +7443,14 @@ vec_srl(vector unsigned long long __a, vector unsigned char __b) {
     (vector unsigned char)__a, __b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned long long
 vec_srl(vector unsigned long long __a, vector unsigned short __b) {
   return (vector unsigned long long)__builtin_s390_vsrl(
     (vector unsigned char)__a, (vector unsigned char)__b);
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned long long
 vec_srl(vector unsigned long long __a, vector unsigned int __b) {
   return (vector unsigned long long)__builtin_s390_vsrl(
@@ -6356,6 +7553,20 @@ vec_srb(vector unsigned long long __a, vector unsigned long long __b) {
     (vector unsigned char)__a, (vector unsigned char)__b);
 }
 
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai vector float
+vec_srb(vector float __a, vector signed int __b) {
+  return (vector float)__builtin_s390_vsrlb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+
+static inline __ATTRS_o_ai vector float
+vec_srb(vector float __a, vector unsigned int __b) {
+  return (vector float)__builtin_s390_vsrlb(
+    (vector unsigned char)__a, (vector unsigned char)__b);
+}
+#endif
+
 static inline __ATTRS_o_ai vector double
 vec_srb(vector double __a, vector signed long long __b) {
   return (vector double)__builtin_s390_vsrlb(
@@ -6390,6 +7601,13 @@ vec_abs(vector signed long long __a) {
   return vec_sel(__a, -__a, vec_cmplt(__a, (vector signed long long)0));
 }
 
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai vector float
+vec_abs(vector float __a) {
+  return __builtin_s390_vflpsb(__a);
+}
+#endif
+
 static inline __ATTRS_o_ai vector double
 vec_abs(vector double __a) {
   return __builtin_s390_vflpdb(__a);
@@ -6397,7 +7615,14 @@ vec_abs(vector double __a) {
 
 /*-- vec_nabs ---------------------------------------------------------------*/
 
-static inline __ATTRS_ai vector double
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai vector float
+vec_nabs(vector float __a) {
+  return __builtin_s390_vflnsb(__a);
+}
+#endif
+
+static inline __ATTRS_o_ai vector double
 vec_nabs(vector double __a) {
   return __builtin_s390_vflndb(__a);
 }
@@ -6409,12 +7634,14 @@ vec_max(vector signed char __a, vector signed char __b) {
   return vec_sel(__b, __a, vec_cmpgt(__a, __b));
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed char
 vec_max(vector signed char __a, vector bool char __b) {
   vector signed char __bc = (vector signed char)__b;
   return vec_sel(__bc, __a, vec_cmpgt(__a, __bc));
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed char
 vec_max(vector bool char __a, vector signed char __b) {
   vector signed char __ac = (vector signed char)__a;
@@ -6426,12 +7653,14 @@ vec_max(vector unsigned char __a, vector unsigned char __b) {
   return vec_sel(__b, __a, vec_cmpgt(__a, __b));
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned char
 vec_max(vector unsigned char __a, vector bool char __b) {
   vector unsigned char __bc = (vector unsigned char)__b;
   return vec_sel(__bc, __a, vec_cmpgt(__a, __bc));
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned char
 vec_max(vector bool char __a, vector unsigned char __b) {
   vector unsigned char __ac = (vector unsigned char)__a;
@@ -6443,12 +7672,14 @@ vec_max(vector signed short __a, vector signed short __b) {
   return vec_sel(__b, __a, vec_cmpgt(__a, __b));
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed short
 vec_max(vector signed short __a, vector bool short __b) {
   vector signed short __bc = (vector signed short)__b;
   return vec_sel(__bc, __a, vec_cmpgt(__a, __bc));
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed short
 vec_max(vector bool short __a, vector signed short __b) {
   vector signed short __ac = (vector signed short)__a;
@@ -6460,12 +7691,14 @@ vec_max(vector unsigned short __a, vector unsigned short __b) {
   return vec_sel(__b, __a, vec_cmpgt(__a, __b));
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned short
 vec_max(vector unsigned short __a, vector bool short __b) {
   vector unsigned short __bc = (vector unsigned short)__b;
   return vec_sel(__bc, __a, vec_cmpgt(__a, __bc));
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned short
 vec_max(vector bool short __a, vector unsigned short __b) {
   vector unsigned short __ac = (vector unsigned short)__a;
@@ -6477,12 +7710,14 @@ vec_max(vector signed int __a, vector signed int __b) {
   return vec_sel(__b, __a, vec_cmpgt(__a, __b));
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed int
 vec_max(vector signed int __a, vector bool int __b) {
   vector signed int __bc = (vector signed int)__b;
   return vec_sel(__bc, __a, vec_cmpgt(__a, __bc));
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed int
 vec_max(vector bool int __a, vector signed int __b) {
   vector signed int __ac = (vector signed int)__a;
@@ -6494,12 +7729,14 @@ vec_max(vector unsigned int __a, vector unsigned int __b) {
   return vec_sel(__b, __a, vec_cmpgt(__a, __b));
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned int
 vec_max(vector unsigned int __a, vector bool int __b) {
   vector unsigned int __bc = (vector unsigned int)__b;
   return vec_sel(__bc, __a, vec_cmpgt(__a, __bc));
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned int
 vec_max(vector bool int __a, vector unsigned int __b) {
   vector unsigned int __ac = (vector unsigned int)__a;
@@ -6511,12 +7748,14 @@ vec_max(vector signed long long __a, vector signed long long __b) {
   return vec_sel(__b, __a, vec_cmpgt(__a, __b));
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed long long
 vec_max(vector signed long long __a, vector bool long long __b) {
   vector signed long long __bc = (vector signed long long)__b;
   return vec_sel(__bc, __a, vec_cmpgt(__a, __bc));
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed long long
 vec_max(vector bool long long __a, vector signed long long __b) {
   vector signed long long __ac = (vector signed long long)__a;
@@ -6528,21 +7767,34 @@ vec_max(vector unsigned long long __a, vector unsigned long long __b) {
   return vec_sel(__b, __a, vec_cmpgt(__a, __b));
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned long long
 vec_max(vector unsigned long long __a, vector bool long long __b) {
   vector unsigned long long __bc = (vector unsigned long long)__b;
   return vec_sel(__bc, __a, vec_cmpgt(__a, __bc));
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned long long
 vec_max(vector bool long long __a, vector unsigned long long __b) {
   vector unsigned long long __ac = (vector unsigned long long)__a;
   return vec_sel(__b, __ac, vec_cmpgt(__ac, __b));
 }
 
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai vector float
+vec_max(vector float __a, vector float __b) {
+  return __builtin_s390_vfmaxsb(__a, __b, 0);
+}
+#endif
+
 static inline __ATTRS_o_ai vector double
 vec_max(vector double __a, vector double __b) {
+#if __ARCH__ >= 12
+  return __builtin_s390_vfmaxdb(__a, __b, 0);
+#else
   return vec_sel(__b, __a, vec_cmpgt(__a, __b));
+#endif
 }
 
 /*-- vec_min ----------------------------------------------------------------*/
@@ -6552,12 +7804,14 @@ vec_min(vector signed char __a, vector signed char __b) {
   return vec_sel(__a, __b, vec_cmpgt(__a, __b));
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed char
 vec_min(vector signed char __a, vector bool char __b) {
   vector signed char __bc = (vector signed char)__b;
   return vec_sel(__a, __bc, vec_cmpgt(__a, __bc));
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed char
 vec_min(vector bool char __a, vector signed char __b) {
   vector signed char __ac = (vector signed char)__a;
@@ -6569,12 +7823,14 @@ vec_min(vector unsigned char __a, vector unsigned char __b) {
   return vec_sel(__a, __b, vec_cmpgt(__a, __b));
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned char
 vec_min(vector unsigned char __a, vector bool char __b) {
   vector unsigned char __bc = (vector unsigned char)__b;
   return vec_sel(__a, __bc, vec_cmpgt(__a, __bc));
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned char
 vec_min(vector bool char __a, vector unsigned char __b) {
   vector unsigned char __ac = (vector unsigned char)__a;
@@ -6586,12 +7842,14 @@ vec_min(vector signed short __a, vector signed short __b) {
   return vec_sel(__a, __b, vec_cmpgt(__a, __b));
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed short
 vec_min(vector signed short __a, vector bool short __b) {
   vector signed short __bc = (vector signed short)__b;
   return vec_sel(__a, __bc, vec_cmpgt(__a, __bc));
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed short
 vec_min(vector bool short __a, vector signed short __b) {
   vector signed short __ac = (vector signed short)__a;
@@ -6603,12 +7861,14 @@ vec_min(vector unsigned short __a, vector unsigned short __b) {
   return vec_sel(__a, __b, vec_cmpgt(__a, __b));
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned short
 vec_min(vector unsigned short __a, vector bool short __b) {
   vector unsigned short __bc = (vector unsigned short)__b;
   return vec_sel(__a, __bc, vec_cmpgt(__a, __bc));
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned short
 vec_min(vector bool short __a, vector unsigned short __b) {
   vector unsigned short __ac = (vector unsigned short)__a;
@@ -6620,12 +7880,14 @@ vec_min(vector signed int __a, vector signed int __b) {
   return vec_sel(__a, __b, vec_cmpgt(__a, __b));
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed int
 vec_min(vector signed int __a, vector bool int __b) {
   vector signed int __bc = (vector signed int)__b;
   return vec_sel(__a, __bc, vec_cmpgt(__a, __bc));
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed int
 vec_min(vector bool int __a, vector signed int __b) {
   vector signed int __ac = (vector signed int)__a;
@@ -6637,12 +7899,14 @@ vec_min(vector unsigned int __a, vector unsigned int __b) {
   return vec_sel(__a, __b, vec_cmpgt(__a, __b));
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned int
 vec_min(vector unsigned int __a, vector bool int __b) {
   vector unsigned int __bc = (vector unsigned int)__b;
   return vec_sel(__a, __bc, vec_cmpgt(__a, __bc));
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned int
 vec_min(vector bool int __a, vector unsigned int __b) {
   vector unsigned int __ac = (vector unsigned int)__a;
@@ -6654,12 +7918,14 @@ vec_min(vector signed long long __a, vector signed long long __b) {
   return vec_sel(__a, __b, vec_cmpgt(__a, __b));
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed long long
 vec_min(vector signed long long __a, vector bool long long __b) {
   vector signed long long __bc = (vector signed long long)__b;
   return vec_sel(__a, __bc, vec_cmpgt(__a, __bc));
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed long long
 vec_min(vector bool long long __a, vector signed long long __b) {
   vector signed long long __ac = (vector signed long long)__a;
@@ -6671,21 +7937,34 @@ vec_min(vector unsigned long long __a, vector unsigned long long __b) {
   return vec_sel(__a, __b, vec_cmpgt(__a, __b));
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned long long
 vec_min(vector unsigned long long __a, vector bool long long __b) {
   vector unsigned long long __bc = (vector unsigned long long)__b;
   return vec_sel(__a, __bc, vec_cmpgt(__a, __bc));
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned long long
 vec_min(vector bool long long __a, vector unsigned long long __b) {
   vector unsigned long long __ac = (vector unsigned long long)__a;
   return vec_sel(__ac, __b, vec_cmpgt(__ac, __b));
 }
 
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai vector float
+vec_min(vector float __a, vector float __b) {
+  return __builtin_s390_vfminsb(__a, __b, 0);
+}
+#endif
+
 static inline __ATTRS_o_ai vector double
 vec_min(vector double __a, vector double __b) {
+#if __ARCH__ >= 12
+  return __builtin_s390_vfmindb(__a, __b, 0);
+#else
   return vec_sel(__a, __b, vec_cmpgt(__a, __b));
+#endif
 }
 
 /*-- vec_add_u128 -----------------------------------------------------------*/
@@ -7126,6 +8405,13 @@ vec_mulo(vector unsigned int __a, vector unsigned int __b) {
   return __builtin_s390_vmlof(__a, __b);
 }
 
+/*-- vec_msum_u128 ----------------------------------------------------------*/
+
+#if __ARCH__ >= 12
+#define vec_msum_u128(X, Y, Z, W) \
+  ((vector unsigned char)__builtin_s390_vmslg((X), (Y), (Z), (W)));
+#endif
+
 /*-- vec_sub_u128 -----------------------------------------------------------*/
 
 static inline __ATTRS_ai vector unsigned char
@@ -7263,6 +8549,14 @@ vec_test_mask(vector unsigned long long __a, vector unsigned long long __b) {
                             (vector unsigned char)__b);
 }
 
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai int
+vec_test_mask(vector float __a, vector unsigned int __b) {
+  return __builtin_s390_vtm((vector unsigned char)__a,
+                            (vector unsigned char)__b);
+}
+#endif
+
 static inline __ATTRS_o_ai int
 vec_test_mask(vector double __a, vector unsigned long long __b) {
   return __builtin_s390_vtm((vector unsigned char)__a,
@@ -7271,27 +8565,77 @@ vec_test_mask(vector double __a, vector unsigned long long __b) {
 
 /*-- vec_madd ---------------------------------------------------------------*/
 
-static inline __ATTRS_ai vector double
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai vector float
+vec_madd(vector float __a, vector float __b, vector float __c) {
+  return __builtin_s390_vfmasb(__a, __b, __c);
+}
+#endif
+
+static inline __ATTRS_o_ai vector double
 vec_madd(vector double __a, vector double __b, vector double __c) {
   return __builtin_s390_vfmadb(__a, __b, __c);
 }
 
 /*-- vec_msub ---------------------------------------------------------------*/
 
-static inline __ATTRS_ai vector double
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai vector float
+vec_msub(vector float __a, vector float __b, vector float __c) {
+  return __builtin_s390_vfmssb(__a, __b, __c);
+}
+#endif
+
+static inline __ATTRS_o_ai vector double
 vec_msub(vector double __a, vector double __b, vector double __c) {
   return __builtin_s390_vfmsdb(__a, __b, __c);
 }
 
+/*-- vec_nmadd ---------------------------------------------------------------*/
+
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai vector float
+vec_nmadd(vector float __a, vector float __b, vector float __c) {
+  return __builtin_s390_vfnmasb(__a, __b, __c);
+}
+
+static inline __ATTRS_o_ai vector double
+vec_nmadd(vector double __a, vector double __b, vector double __c) {
+  return __builtin_s390_vfnmadb(__a, __b, __c);
+}
+#endif
+
+/*-- vec_nmsub ---------------------------------------------------------------*/
+
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai vector float
+vec_nmsub(vector float __a, vector float __b, vector float __c) {
+  return __builtin_s390_vfnmssb(__a, __b, __c);
+}
+
+static inline __ATTRS_o_ai vector double
+vec_nmsub(vector double __a, vector double __b, vector double __c) {
+  return __builtin_s390_vfnmsdb(__a, __b, __c);
+}
+#endif
+
 /*-- vec_sqrt ---------------------------------------------------------------*/
 
-static inline __ATTRS_ai vector double
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai vector float
+vec_sqrt(vector float __a) {
+  return __builtin_s390_vfsqsb(__a);
+}
+#endif
+
+static inline __ATTRS_o_ai vector double
 vec_sqrt(vector double __a) {
   return __builtin_s390_vfsqdb(__a);
 }
 
 /*-- vec_ld2f ---------------------------------------------------------------*/
 
+// This prototype is deprecated.
 static inline __ATTRS_ai vector double
 vec_ld2f(const float *__ptr) {
   typedef float __v2f32 __attribute__((__vector_size__(8)));
@@ -7300,6 +8644,7 @@ vec_ld2f(const float *__ptr) {
 
 /*-- vec_st2f ---------------------------------------------------------------*/
 
+// This prototype is deprecated.
 static inline __ATTRS_ai void
 vec_st2f(vector double __a, float *__ptr) {
   typedef float __v2f32 __attribute__((__vector_size__(8)));
@@ -7308,6 +8653,7 @@ vec_st2f(vector double __a, float *__ptr) {
 
 /*-- vec_ctd ----------------------------------------------------------------*/
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector double
 vec_ctd(vector signed long long __a, int __b)
   __constant_range(__b, 0, 31) {
@@ -7316,6 +8662,7 @@ vec_ctd(vector signed long long __a, int __b)
   return __conv;
 }
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector double
 vec_ctd(vector unsigned long long __a, int __b)
   __constant_range(__b, 0, 31) {
@@ -7326,6 +8673,7 @@ vec_ctd(vector unsigned long long __a, int __b)
 
 /*-- vec_ctsl ---------------------------------------------------------------*/
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector signed long long
 vec_ctsl(vector double __a, int __b)
   __constant_range(__b, 0, 31) {
@@ -7335,6 +8683,7 @@ vec_ctsl(vector double __a, int __b)
 
 /*-- vec_ctul ---------------------------------------------------------------*/
 
+// This prototype is deprecated.
 static inline __ATTRS_o_ai vector unsigned long long
 vec_ctul(vector double __a, int __b)
   __constant_range(__b, 0, 31) {
@@ -7342,16 +8691,79 @@ vec_ctul(vector double __a, int __b)
   return __builtin_convertvector(__a, vector unsigned long long);
 }
 
-/*-- vec_roundp -------------------------------------------------------------*/
+/*-- vec_doublee ------------------------------------------------------------*/
 
+#if __ARCH__ >= 12
 static inline __ATTRS_ai vector double
+vec_doublee(vector float __a) {
+  typedef float __v2f32 __attribute__((__vector_size__(8)));
+  __v2f32 __pack = __builtin_shufflevector(__a, __a, 0, 2);
+  return __builtin_convertvector(__pack, vector double);
+}
+#endif
+
+/*-- vec_floate -------------------------------------------------------------*/
+
+#if __ARCH__ >= 12
+static inline __ATTRS_ai vector float
+vec_floate(vector double __a) {
+  typedef float __v2f32 __attribute__((__vector_size__(8)));
+  __v2f32 __pack = __builtin_convertvector(__a, __v2f32);
+  return __builtin_shufflevector(__pack, __pack, 0, -1, 1, -1);
+}
+#endif
+
+/*-- vec_double -------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector double
+vec_double(vector signed long long __a) {
+  return __builtin_convertvector(__a, vector double);
+}
+
+static inline __ATTRS_o_ai vector double
+vec_double(vector unsigned long long __a) {
+  return __builtin_convertvector(__a, vector double);
+}
+
+/*-- vec_signed -------------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector signed long long
+vec_signed(vector double __a) {
+  return __builtin_convertvector(__a, vector signed long long);
+}
+
+/*-- vec_unsigned -----------------------------------------------------------*/
+
+static inline __ATTRS_o_ai vector unsigned long long
+vec_unsigned(vector double __a) {
+  return __builtin_convertvector(__a, vector unsigned long long);
+}
+
+/*-- vec_roundp -------------------------------------------------------------*/
+
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai vector float
+vec_roundp(vector float __a) {
+  return __builtin_s390_vfisb(__a, 4, 6);
+}
+#endif
+
+static inline __ATTRS_o_ai vector double
 vec_roundp(vector double __a) {
   return __builtin_s390_vfidb(__a, 4, 6);
 }
 
 /*-- vec_ceil ---------------------------------------------------------------*/
 
-static inline __ATTRS_ai vector double
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai vector float
+vec_ceil(vector float __a) {
+  // On this platform, vec_ceil never triggers the IEEE-inexact exception.
+  return __builtin_s390_vfisb(__a, 4, 6);
+}
+#endif
+
+static inline __ATTRS_o_ai vector double
 vec_ceil(vector double __a) {
   // On this platform, vec_ceil never triggers the IEEE-inexact exception.
   return __builtin_s390_vfidb(__a, 4, 6);
@@ -7359,14 +8771,29 @@ vec_ceil(vector double __a) {
 
 /*-- vec_roundm -------------------------------------------------------------*/
 
-static inline __ATTRS_ai vector double
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai vector float
+vec_roundm(vector float __a) {
+  return __builtin_s390_vfisb(__a, 4, 7);
+}
+#endif
+
+static inline __ATTRS_o_ai vector double
 vec_roundm(vector double __a) {
   return __builtin_s390_vfidb(__a, 4, 7);
 }
 
 /*-- vec_floor --------------------------------------------------------------*/
 
-static inline __ATTRS_ai vector double
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai vector float
+vec_floor(vector float __a) {
+  // On this platform, vec_floor never triggers the IEEE-inexact exception.
+  return __builtin_s390_vfisb(__a, 4, 7);
+}
+#endif
+
+static inline __ATTRS_o_ai vector double
 vec_floor(vector double __a) {
   // On this platform, vec_floor never triggers the IEEE-inexact exception.
   return __builtin_s390_vfidb(__a, 4, 7);
@@ -7374,14 +8801,29 @@ vec_floor(vector double __a) {
 
 /*-- vec_roundz -------------------------------------------------------------*/
 
-static inline __ATTRS_ai vector double
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai vector float
+vec_roundz(vector float __a) {
+  return __builtin_s390_vfisb(__a, 4, 5);
+}
+#endif
+
+static inline __ATTRS_o_ai vector double
 vec_roundz(vector double __a) {
   return __builtin_s390_vfidb(__a, 4, 5);
 }
 
 /*-- vec_trunc --------------------------------------------------------------*/
 
-static inline __ATTRS_ai vector double
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai vector float
+vec_trunc(vector float __a) {
+  // On this platform, vec_trunc never triggers the IEEE-inexact exception.
+  return __builtin_s390_vfisb(__a, 4, 5);
+}
+#endif
+
+static inline __ATTRS_o_ai vector double
 vec_trunc(vector double __a) {
   // On this platform, vec_trunc never triggers the IEEE-inexact exception.
   return __builtin_s390_vfidb(__a, 4, 5);
@@ -7389,22 +8831,104 @@ vec_trunc(vector double __a) {
 
 /*-- vec_roundc -------------------------------------------------------------*/
 
-static inline __ATTRS_ai vector double
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai vector float
+vec_roundc(vector float __a) {
+  return __builtin_s390_vfisb(__a, 4, 0);
+}
+#endif
+
+static inline __ATTRS_o_ai vector double
 vec_roundc(vector double __a) {
   return __builtin_s390_vfidb(__a, 4, 0);
 }
 
+/*-- vec_rint ---------------------------------------------------------------*/
+
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai vector float
+vec_rint(vector float __a) {
+  // vec_rint may trigger the IEEE-inexact exception.
+  return __builtin_s390_vfisb(__a, 0, 0);
+}
+#endif
+
+static inline __ATTRS_o_ai vector double
+vec_rint(vector double __a) {
+  // vec_rint may trigger the IEEE-inexact exception.
+  return __builtin_s390_vfidb(__a, 0, 0);
+}
+
 /*-- vec_round --------------------------------------------------------------*/
 
-static inline __ATTRS_ai vector double
+#if __ARCH__ >= 12
+static inline __ATTRS_o_ai vector float
+vec_round(vector float __a) {
+  return __builtin_s390_vfisb(__a, 4, 4);
+}
+#endif
+
+static inline __ATTRS_o_ai vector double
 vec_round(vector double __a) {
   return __builtin_s390_vfidb(__a, 4, 4);
 }
 
 /*-- vec_fp_test_data_class -------------------------------------------------*/
 
+#if __ARCH__ >= 12
+extern __ATTRS_o vector bool int
+vec_fp_test_data_class(vector float __a, int __b, int *__c)
+  __constant_range(__b, 0, 4095);
+
+extern __ATTRS_o vector bool long long
+vec_fp_test_data_class(vector double __a, int __b, int *__c)
+  __constant_range(__b, 0, 4095);
+
+#define vec_fp_test_data_class(X, Y, Z) \
+  ((__typeof__((vec_fp_test_data_class)((X), (Y), (Z)))) \
+   __extension__ ({ \
+     vector unsigned char __res; \
+     vector unsigned char __x = (vector unsigned char)(X); \
+     int *__z = (Z); \
+     switch (sizeof ((X)[0])) { \
+     case 4:  __res = (vector unsigned char) \
+                      __builtin_s390_vftcisb((vector float)__x, (Y), __z); \
+              break; \
+     default: __res = (vector unsigned char) \
+                      __builtin_s390_vftcidb((vector double)__x, (Y), __z); \
+              break; \
+     } __res; }))
+#else
 #define vec_fp_test_data_class(X, Y, Z) \
   ((vector bool long long)__builtin_s390_vftcidb((X), (Y), (Z)))
+#endif
+
+#define __VEC_CLASS_FP_ZERO_P (1 << 11)
+#define __VEC_CLASS_FP_ZERO_N (1 << 10)
+#define __VEC_CLASS_FP_ZERO (__VEC_CLASS_FP_ZERO_P | __VEC_CLASS_FP_ZERO_N)
+#define __VEC_CLASS_FP_NORMAL_P (1 << 9)
+#define __VEC_CLASS_FP_NORMAL_N (1 << 8)
+#define __VEC_CLASS_FP_NORMAL (__VEC_CLASS_FP_NORMAL_P | \
+                               __VEC_CLASS_FP_NORMAL_N)
+#define __VEC_CLASS_FP_SUBNORMAL_P (1 << 7)
+#define __VEC_CLASS_FP_SUBNORMAL_N (1 << 6)
+#define __VEC_CLASS_FP_SUBNORMAL (__VEC_CLASS_FP_SUBNORMAL_P | \
+                                  __VEC_CLASS_FP_SUBNORMAL_N)
+#define __VEC_CLASS_FP_INFINITY_P (1 << 5)
+#define __VEC_CLASS_FP_INFINITY_N (1 << 4)
+#define __VEC_CLASS_FP_INFINITY (__VEC_CLASS_FP_INFINITY_P | \
+                                 __VEC_CLASS_FP_INFINITY_N)
+#define __VEC_CLASS_FP_QNAN_P (1 << 3)
+#define __VEC_CLASS_FP_QNAN_N (1 << 2)
+#define __VEC_CLASS_FP_QNAN (__VEC_CLASS_FP_QNAN_P | __VEC_CLASS_FP_QNAN_N)
+#define __VEC_CLASS_FP_SNAN_P (1 << 1)
+#define __VEC_CLASS_FP_SNAN_N (1 << 0)
+#define __VEC_CLASS_FP_SNAN (__VEC_CLASS_FP_SNAN_P | __VEC_CLASS_FP_SNAN_N)
+#define __VEC_CLASS_FP_NAN (__VEC_CLASS_FP_QNAN | __VEC_CLASS_FP_SNAN)
+#define __VEC_CLASS_FP_NOT_NORMAL (__VEC_CLASS_FP_NAN | \
+                                   __VEC_CLASS_FP_SUBNORMAL | \
+                                   __VEC_CLASS_FP_ZERO | \
+                                   __VEC_CLASS_FP_INFINITY)
 
 /*-- vec_cp_until_zero ------------------------------------------------------*/

@@ -72,6 +72,10 @@
 #include <tbmintrin.h>
 #endif
 
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__LWP__)
+#include <lwpintrin.h>
+#endif
+
 #if !defined(_MSC_VER) || __has_feature(modules) || defined(__F16C__)
 #include <f16cintrin.h>
 #endif
@@ -80,6 +84,8 @@
 #include <mwaitxintrin.h>
 #endif
 
-/* FIXME: LWP */
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__CLZERO__)
+#include <clzerointrin.h>
+#endif
 
 #endif /* __X86INTRIN_H */

@@ -2067,7 +2067,7 @@ _mm_storer_ps(float *__p, __m128 __a)
 ///    _MM_HINT_T1: Move data using the T1 hint. The PREFETCHT1 instruction will
 ///    be generated. \n
 ///    _MM_HINT_T2: Move data using the T2 hint. The PREFETCHT2 instruction will
-///    be generated.                                   
+///    be generated.
 #define _mm_prefetch(a, sel) (__builtin_prefetch((void *)(a), 0, (sel)))
 #endif
 
@@ -2099,7 +2099,7 @@ _mm_stream_pi(__m64 *__p, __m64 __a)
 ///
 /// \param __p
 ///    A pointer to a 128-bit aligned memory location that will receive the
-///    integer values.
+///    single-precision floating-point values.
 /// \param __a
 ///    A 128-bit vector of [4 x float] containing the values to be moved.
 static __inline__ void __DEFAULT_FN_ATTRS
@@ -2133,7 +2133,7 @@ void _mm_sfence(void);
 /// \headerfile <x86intrin.h>
 ///
 /// \code
-/// void _mm_extract_pi(__m64 a, int n);
+/// int _mm_extract_pi16(__m64 a, int n);
 /// \endcode
 ///
 /// This intrinsic corresponds to the <c> VPEXTRW / PEXTRW </c> instruction.
@@ -2157,7 +2157,7 @@ void _mm_sfence(void);
 /// \headerfile <x86intrin.h>
 ///
 /// \code
-/// void _mm_insert_pi(__m64 a, int d, int n);
+/// __m64 _mm_insert_pi16(__m64 a, int d, int n);
 /// \endcode
 ///
 /// This intrinsic corresponds to the <c> VPINSRW / PINSRW </c> instruction.
@@ -2331,8 +2331,10 @@ _mm_mulhi_pu16(__m64 __a, __m64 __b)
 /// \brief Conditionally copies the values from each 8-bit element in the first
 ///    64-bit integer vector operand to the specified memory location, as
 ///    specified by the most significant bit in the corresponding element in the
-///    second 64-bit integer vector operand. To minimize caching, the data is
-///    flagged as non-temporal (unlikely to be used again soon).
+///    second 64-bit integer vector operand.
+///
+///    To minimize caching, the data is flagged as non-temporal
+///    (unlikely to be used again soon).
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -2435,17 +2437,17 @@ extern "C" {
 ///      For checking exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW,
 ///      _MM_MASK_INVALID, _MM_MASK_DENORM, _MM_MASK_DIV_ZERO, _MM_MASK_INEXACT.
 ///      There is a convenience wrapper _MM_GET_EXCEPTION_MASK().
-///    </li>            
+///    </li>
 ///    <li>
 ///      For checking rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN,
 ///      _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO. There is a convenience wrapper
 ///      _MM_GET_ROUNDING_MODE(x) where x is one of these macros.
 ///    </li>
-///    <li> 
+///    <li>
 ///      For checking flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF.
 ///      There is a convenience wrapper _MM_GET_FLUSH_ZERO_MODE().
 ///    </li>
-///    <li> 
+///    <li>
 ///      For checking denormals-are-zero mode: _MM_DENORMALS_ZERO_ON,
 ///      _MM_DENORMALS_ZERO_OFF. There is a convenience wrapper
 ///      _MM_GET_DENORMALS_ZERO_MODE().
@@ -2468,11 +2470,11 @@ extern "C" {
 unsigned int _mm_getcsr(void);
 
 /// \brief Sets the MXCSR register with the 32-bit unsigned integer value.
-///   
+///
 ///    There are several groups of macros associated with this intrinsic,
 ///    including:
 ///    <ul>
-///    <li> 
+///    <li>
 ///      For setting exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO,
 ///      _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW,
 ///      _MM_EXCEPT_INEXACT. There is a convenience wrapper
@@ -2517,7 +2519,7 @@ unsigned int _mm_getcsr(void);
 ///
 /// \param __i
 ///    A 32-bit unsigned integer value to be written to the MXCSR register.
-void _mm_setcsr(unsigned int);
+void _mm_setcsr(unsigned int __i);
 
 #if defined(__cplusplus)
 } // extern "C"
@@ -2540,7 +2542,7 @@ void _mm_setcsr(unsigned int);
 ///    A 128-bit vector of [4 x float].
 /// \param mask
 ///    An immediate value containing an 8-bit value specifying which elements to
-///    copy from \ a and \a b. \n
+///    copy from \a a and \a b. \n
 ///    Bits [3:0] specify the values copied from operand \a a. \n
 ///    Bits [7:4] specify the values copied from operand \a b. \n
 ///    The destinations within the 128-bit destination are assigned values as
@@ -2678,8 +2680,7 @@ _mm_movelh_ps(__m128 __a, __m128 __b)
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the <c> CVTPI2PS + \c COMPOSITE </c>
-///   instruction.
+/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
 ///
 /// \param __a
 ///    A 64-bit vector of [4 x i16]. The elements of the destination are copied
@@ -2709,8 +2710,7 @@ _mm_cvtpi16_ps(__m64 __a)
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the <c> CVTPI2PS + \c COMPOSITE </c>
-///   instruction.
+/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
 ///
 /// \param __a
 ///    A 64-bit vector of 16-bit unsigned integer values. The elements of the
@@ -2739,8 +2739,7 @@ _mm_cvtpu16_ps(__m64 __a)
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the <c> CVTPI2PS + \c COMPOSITE </c>
-///   instruction.
+/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
 ///
 /// \param __a
 ///    A 64-bit vector of [8 x i8]. The elements of the destination are copied
@@ -2764,8 +2763,7 @@ _mm_cvtpi8_ps(__m64 __a)
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the <c> CVTPI2PS + \c COMPOSITE </c>
-///   instruction.
+/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
 ///
 /// \param __a
 ///    A 64-bit vector of unsigned 8-bit integer values. The elements of the
@@ -2789,8 +2787,7 @@ _mm_cvtpu8_ps(__m64 __a)
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the <c> CVTPI2PS + \c COMPOSITE </c>
-///   instruction.
+/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
 ///
 /// \param __a
 ///    A 64-bit vector of [2 x i32]. The lower elements of the destination are
@@ -2815,16 +2812,16 @@ _mm_cvtpi32x2_ps(__m64 __a, __m64 __b)
 
 /// \brief Converts each single-precision floating-point element of a 128-bit
 ///    floating-point vector of [4 x float] into a 16-bit signed integer, and
-///    packs the results into a 64-bit integer vector of [4 x i16]. If the
-///    floating-point element is NaN or infinity, or if the floating-point
-///    element is greater than 0x7FFFFFFF or less than -0x8000, it is converted
-///    to 0x8000. Otherwise if the floating-point element is greater than
-///    0x7FFF, it is converted to 0x7FFF.
+///    packs the results into a 64-bit integer vector of [4 x i16].
+///
+///    If the floating-point element is NaN or infinity, or if the
+///    floating-point element is greater than 0x7FFFFFFF or less than -0x8000,
+///    it is converted to 0x8000. Otherwise if the floating-point element is
+///    greater than 0x7FFF, it is converted to 0x7FFF.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the <c> CVTPS2PI + \c COMPOSITE </c>
-///   instruction.
+/// This intrinsic corresponds to the <c> CVTPS2PI + COMPOSITE </c> instruction.
 ///
 /// \param __a
 ///    A 128-bit floating-point vector of [4 x float].
@@ -2845,16 +2842,16 @@ _mm_cvtps_pi16(__m128 __a)
 /// \brief Converts each single-precision floating-point element of a 128-bit
 ///    floating-point vector of [4 x float] into an 8-bit signed integer, and
 ///    packs the results into the lower 32 bits of a 64-bit integer vector of
-///    [8 x i8]. The upper 32 bits of the vector are set to 0. If the
-///    floating-point element is NaN or infinity, or if the floating-point
-///    element is greater than 0x7FFFFFFF or less than -0x80, it is converted
-///    to 0x80. Otherwise if the floating-point element is greater than 0x7F,
-///    it is converted to 0x7F.
+///    [8 x i8]. The upper 32 bits of the vector are set to 0.
+///
+///    If the floating-point element is NaN or infinity, or if the
+///    floating-point element is greater than 0x7FFFFFFF or less than -0x80, it
+///    is converted to 0x80. Otherwise if the floating-point element is greater
+///    than 0x7F, it is converted to 0x7F.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the <c> CVTPS2PI + \c COMPOSITE </c>
-///   instruction.
+/// This intrinsic corresponds to the <c> CVTPS2PI + COMPOSITE </c> instruction.
 ///
 /// \param __a
 ///    128-bit floating-point vector of [4 x float].

@@ -198,13 +198,13 @@ _mm_hsubq_epi32(__m128i __A)
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cmov_si128(__m128i __A, __m128i __B, __m128i __C)
 {
-  return (__m128i)__builtin_ia32_vpcmov((__v2di)__A, (__v2di)__B, (__v2di)__C);
+  return (__m128i)(((__v2du)__A & (__v2du)__C) | ((__v2du)__B & ~(__v2du)__C));
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS
 _mm256_cmov_si256(__m256i __A, __m256i __B, __m256i __C)
 {
-  return (__m256i)__builtin_ia32_vpcmov_256((__v4di)__A, (__v4di)__B, (__v4di)__C);
+  return (__m256i)(((__v4du)__A & (__v4du)__C) | ((__v4du)__B & ~(__v4du)__C));
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS

@@ -2,7 +2,7 @@
 
 Create bootstrap code in std/bootstrap.zig and add conditional compilation
 logic. This code is responsible for the real executable entry point, calling
-main(argc, argv, env) and making the exit syscall when main returns.
+main() and making the exit syscall when main returns.
 
 How to pass a byvalue struct parameter in the C calling convention is
 target-specific. Add logic for how to do function prototypes and function calls

@@ -410,10 +410,12 @@ install(FILES "${CMAKE_SOURCE_DIR}/c_headers/avx512vlbwintrin.h" DESTINATION "${
 install(FILES "${CMAKE_SOURCE_DIR}/c_headers/avx512vlcdintrin.h" DESTINATION "${C_HEADERS_DEST}")
 install(FILES "${CMAKE_SOURCE_DIR}/c_headers/avx512vldqintrin.h" DESTINATION "${C_HEADERS_DEST}")
 install(FILES "${CMAKE_SOURCE_DIR}/c_headers/avx512vlintrin.h" DESTINATION "${C_HEADERS_DEST}")
+install(FILES "${CMAKE_SOURCE_DIR}/c_headers/avx512vpopcntdqintrin.h" DESTINATION "${C_HEADERS_DEST}")
 install(FILES "${CMAKE_SOURCE_DIR}/c_headers/avxintrin.h" DESTINATION "${C_HEADERS_DEST}")
 install(FILES "${CMAKE_SOURCE_DIR}/c_headers/bmi2intrin.h" DESTINATION "${C_HEADERS_DEST}")
 install(FILES "${CMAKE_SOURCE_DIR}/c_headers/bmiintrin.h" DESTINATION "${C_HEADERS_DEST}")
 install(FILES "${CMAKE_SOURCE_DIR}/c_headers/clflushoptintrin.h" DESTINATION "${C_HEADERS_DEST}")
+install(FILES "${CMAKE_SOURCE_DIR}/c_headers/clzerointrin.h" DESTINATION "${C_HEADERS_DEST}")
 install(FILES "${CMAKE_SOURCE_DIR}/c_headers/cpuid.h" DESTINATION "${C_HEADERS_DEST}")
 install(FILES "${CMAKE_SOURCE_DIR}/c_headers/cuda_wrappers/algorithm" DESTINATION "${C_HEADERS_DEST}/cuda_wrappers")
 install(FILES "${CMAKE_SOURCE_DIR}/c_headers/cuda_wrappers/complex" DESTINATION "${C_HEADERS_DEST}/cuda_wrappers")
@@ -432,6 +434,7 @@ install(FILES "${CMAKE_SOURCE_DIR}/c_headers/intrin.h" DESTINATION "${C_HEADERS_
 install(FILES "${CMAKE_SOURCE_DIR}/c_headers/inttypes.h" DESTINATION "${C_HEADERS_DEST}")
 install(FILES "${CMAKE_SOURCE_DIR}/c_headers/iso646.h" DESTINATION "${C_HEADERS_DEST}")
 install(FILES "${CMAKE_SOURCE_DIR}/c_headers/limits.h" DESTINATION "${C_HEADERS_DEST}")
+install(FILES "${CMAKE_SOURCE_DIR}/c_headers/lwpintrin.h" DESTINATION "${C_HEADERS_DEST}")
 install(FILES "${CMAKE_SOURCE_DIR}/c_headers/lzcntintrin.h" DESTINATION "${C_HEADERS_DEST}")
 install(FILES "${CMAKE_SOURCE_DIR}/c_headers/mm3dnow.h" DESTINATION "${C_HEADERS_DEST}")
 install(FILES "${CMAKE_SOURCE_DIR}/c_headers/mm_malloc.h" DESTINATION "${C_HEADERS_DEST}")

Commit cd58b40011

Commit `cd58b40011`