   1/*===---- avx512fintrin.h - AVX512F intrinsics -----------------------------===
   2 *
   3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4 * See https://llvm.org/LICENSE.txt for license information.
   5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6 *
   7 *===-----------------------------------------------------------------------===
   8 */
   9#ifndef __IMMINTRIN_H
  10#error "Never use <avx512fintrin.h> directly; include <immintrin.h> instead."
  11#endif
  12
  13#ifndef __AVX512FINTRIN_H
  14#define __AVX512FINTRIN_H
  15
/* Internal 512-bit (64-byte) vector types.  The __vNxy names encode element
 * type and lane count, e.g. __v16si = 16 x signed int. */
typedef char __v64qi __attribute__((__vector_size__(64)));
typedef short __v32hi __attribute__((__vector_size__(64)));
typedef double __v8df __attribute__((__vector_size__(64)));
typedef float __v16sf __attribute__((__vector_size__(64)));
typedef long long __v8di __attribute__((__vector_size__(64)));
typedef int __v16si __attribute__((__vector_size__(64)));

/* Unsigned types */
typedef unsigned char __v64qu __attribute__((__vector_size__(64)));
typedef unsigned short __v32hu __attribute__((__vector_size__(64)));
typedef unsigned long long __v8du __attribute__((__vector_size__(64)));
typedef unsigned int __v16su __attribute__((__vector_size__(64)));

/* We need an explicitly signed variant for char. Note that this shouldn't
 * appear in the interface though. */
typedef signed char __v64qs __attribute__((__vector_size__(64)));

/* Public 512-bit vector types, 64-byte aligned. */
typedef float __m512 __attribute__((__vector_size__(64), __aligned__(64)));
typedef double __m512d __attribute__((__vector_size__(64), __aligned__(64)));
typedef long long __m512i __attribute__((__vector_size__(64), __aligned__(64)));

/* Byte-aligned variants of the public types, for unaligned memory access. */
typedef float __m512_u __attribute__((__vector_size__(64), __aligned__(1)));
typedef double __m512d_u __attribute__((__vector_size__(64), __aligned__(1)));
typedef long long __m512i_u __attribute__((__vector_size__(64), __aligned__(1)));

/* Write-mask types: one bit per lane (8 x 64-bit lanes, 16 x 32-bit lanes). */
typedef unsigned char __mmask8;
typedef unsigned short __mmask16;
  43
/* Rounding mode macros.  Passed as the rounding-control operand of the
 * "_round" intrinsic forms; _MM_FROUND_CUR_DIRECTION uses the current MXCSR
 * rounding mode instead of an embedded one. */
#define _MM_FROUND_TO_NEAREST_INT   0x00
#define _MM_FROUND_TO_NEG_INF       0x01
#define _MM_FROUND_TO_POS_INF       0x02
#define _MM_FROUND_TO_ZERO          0x03
#define _MM_FROUND_CUR_DIRECTION    0x04
  50
/* Constants for integer comparison predicates */
/* Enumerator values follow the hardware predicate encoding (0..7);
 * GE/GT are spelled as aliases of NLT/NLE via #define so they can be
 * used interchangeably without adding duplicate enumerators. */
typedef enum {
    _MM_CMPINT_EQ,      /* Equal */
    _MM_CMPINT_LT,      /* Less than */
    _MM_CMPINT_LE,      /* Less than or Equal */
    _MM_CMPINT_UNUSED,
    _MM_CMPINT_NE,      /* Not Equal */
    _MM_CMPINT_NLT,     /* Not Less than */
#define _MM_CMPINT_GE   _MM_CMPINT_NLT  /* Greater than or Equal */
    _MM_CMPINT_NLE      /* Not Less than or Equal */
#define _MM_CMPINT_GT   _MM_CMPINT_NLE  /* Greater than */
} _MM_CMPINT_ENUM;
  63
/* 8-bit shuffle-control immediates for the 4-element permute intrinsics
 * (e.g. _mm512_shuffle_epi32).  Each letter selects a source element
 * (A=0, B=1, C=2, D=3); the name reads from the highest selector to the
 * lowest, two bits per selector. */
typedef enum
{
  _MM_PERM_AAAA = 0x00, _MM_PERM_AAAB = 0x01, _MM_PERM_AAAC = 0x02,
  _MM_PERM_AAAD = 0x03, _MM_PERM_AABA = 0x04, _MM_PERM_AABB = 0x05,
  _MM_PERM_AABC = 0x06, _MM_PERM_AABD = 0x07, _MM_PERM_AACA = 0x08,
  _MM_PERM_AACB = 0x09, _MM_PERM_AACC = 0x0A, _MM_PERM_AACD = 0x0B,
  _MM_PERM_AADA = 0x0C, _MM_PERM_AADB = 0x0D, _MM_PERM_AADC = 0x0E,
  _MM_PERM_AADD = 0x0F, _MM_PERM_ABAA = 0x10, _MM_PERM_ABAB = 0x11,
  _MM_PERM_ABAC = 0x12, _MM_PERM_ABAD = 0x13, _MM_PERM_ABBA = 0x14,
  _MM_PERM_ABBB = 0x15, _MM_PERM_ABBC = 0x16, _MM_PERM_ABBD = 0x17,
  _MM_PERM_ABCA = 0x18, _MM_PERM_ABCB = 0x19, _MM_PERM_ABCC = 0x1A,
  _MM_PERM_ABCD = 0x1B, _MM_PERM_ABDA = 0x1C, _MM_PERM_ABDB = 0x1D,
  _MM_PERM_ABDC = 0x1E, _MM_PERM_ABDD = 0x1F, _MM_PERM_ACAA = 0x20,
  _MM_PERM_ACAB = 0x21, _MM_PERM_ACAC = 0x22, _MM_PERM_ACAD = 0x23,
  _MM_PERM_ACBA = 0x24, _MM_PERM_ACBB = 0x25, _MM_PERM_ACBC = 0x26,
  _MM_PERM_ACBD = 0x27, _MM_PERM_ACCA = 0x28, _MM_PERM_ACCB = 0x29,
  _MM_PERM_ACCC = 0x2A, _MM_PERM_ACCD = 0x2B, _MM_PERM_ACDA = 0x2C,
  _MM_PERM_ACDB = 0x2D, _MM_PERM_ACDC = 0x2E, _MM_PERM_ACDD = 0x2F,
  _MM_PERM_ADAA = 0x30, _MM_PERM_ADAB = 0x31, _MM_PERM_ADAC = 0x32,
  _MM_PERM_ADAD = 0x33, _MM_PERM_ADBA = 0x34, _MM_PERM_ADBB = 0x35,
  _MM_PERM_ADBC = 0x36, _MM_PERM_ADBD = 0x37, _MM_PERM_ADCA = 0x38,
  _MM_PERM_ADCB = 0x39, _MM_PERM_ADCC = 0x3A, _MM_PERM_ADCD = 0x3B,
  _MM_PERM_ADDA = 0x3C, _MM_PERM_ADDB = 0x3D, _MM_PERM_ADDC = 0x3E,
  _MM_PERM_ADDD = 0x3F, _MM_PERM_BAAA = 0x40, _MM_PERM_BAAB = 0x41,
  _MM_PERM_BAAC = 0x42, _MM_PERM_BAAD = 0x43, _MM_PERM_BABA = 0x44,
  _MM_PERM_BABB = 0x45, _MM_PERM_BABC = 0x46, _MM_PERM_BABD = 0x47,
  _MM_PERM_BACA = 0x48, _MM_PERM_BACB = 0x49, _MM_PERM_BACC = 0x4A,
  _MM_PERM_BACD = 0x4B, _MM_PERM_BADA = 0x4C, _MM_PERM_BADB = 0x4D,
  _MM_PERM_BADC = 0x4E, _MM_PERM_BADD = 0x4F, _MM_PERM_BBAA = 0x50,
  _MM_PERM_BBAB = 0x51, _MM_PERM_BBAC = 0x52, _MM_PERM_BBAD = 0x53,
  _MM_PERM_BBBA = 0x54, _MM_PERM_BBBB = 0x55, _MM_PERM_BBBC = 0x56,
  _MM_PERM_BBBD = 0x57, _MM_PERM_BBCA = 0x58, _MM_PERM_BBCB = 0x59,
  _MM_PERM_BBCC = 0x5A, _MM_PERM_BBCD = 0x5B, _MM_PERM_BBDA = 0x5C,
  _MM_PERM_BBDB = 0x5D, _MM_PERM_BBDC = 0x5E, _MM_PERM_BBDD = 0x5F,
  _MM_PERM_BCAA = 0x60, _MM_PERM_BCAB = 0x61, _MM_PERM_BCAC = 0x62,
  _MM_PERM_BCAD = 0x63, _MM_PERM_BCBA = 0x64, _MM_PERM_BCBB = 0x65,
  _MM_PERM_BCBC = 0x66, _MM_PERM_BCBD = 0x67, _MM_PERM_BCCA = 0x68,
  _MM_PERM_BCCB = 0x69, _MM_PERM_BCCC = 0x6A, _MM_PERM_BCCD = 0x6B,
  _MM_PERM_BCDA = 0x6C, _MM_PERM_BCDB = 0x6D, _MM_PERM_BCDC = 0x6E,
  _MM_PERM_BCDD = 0x6F, _MM_PERM_BDAA = 0x70, _MM_PERM_BDAB = 0x71,
  _MM_PERM_BDAC = 0x72, _MM_PERM_BDAD = 0x73, _MM_PERM_BDBA = 0x74,
  _MM_PERM_BDBB = 0x75, _MM_PERM_BDBC = 0x76, _MM_PERM_BDBD = 0x77,
  _MM_PERM_BDCA = 0x78, _MM_PERM_BDCB = 0x79, _MM_PERM_BDCC = 0x7A,
  _MM_PERM_BDCD = 0x7B, _MM_PERM_BDDA = 0x7C, _MM_PERM_BDDB = 0x7D,
  _MM_PERM_BDDC = 0x7E, _MM_PERM_BDDD = 0x7F, _MM_PERM_CAAA = 0x80,
  _MM_PERM_CAAB = 0x81, _MM_PERM_CAAC = 0x82, _MM_PERM_CAAD = 0x83,
  _MM_PERM_CABA = 0x84, _MM_PERM_CABB = 0x85, _MM_PERM_CABC = 0x86,
  _MM_PERM_CABD = 0x87, _MM_PERM_CACA = 0x88, _MM_PERM_CACB = 0x89,
  _MM_PERM_CACC = 0x8A, _MM_PERM_CACD = 0x8B, _MM_PERM_CADA = 0x8C,
  _MM_PERM_CADB = 0x8D, _MM_PERM_CADC = 0x8E, _MM_PERM_CADD = 0x8F,
  _MM_PERM_CBAA = 0x90, _MM_PERM_CBAB = 0x91, _MM_PERM_CBAC = 0x92,
  _MM_PERM_CBAD = 0x93, _MM_PERM_CBBA = 0x94, _MM_PERM_CBBB = 0x95,
  _MM_PERM_CBBC = 0x96, _MM_PERM_CBBD = 0x97, _MM_PERM_CBCA = 0x98,
  _MM_PERM_CBCB = 0x99, _MM_PERM_CBCC = 0x9A, _MM_PERM_CBCD = 0x9B,
  _MM_PERM_CBDA = 0x9C, _MM_PERM_CBDB = 0x9D, _MM_PERM_CBDC = 0x9E,
  _MM_PERM_CBDD = 0x9F, _MM_PERM_CCAA = 0xA0, _MM_PERM_CCAB = 0xA1,
  _MM_PERM_CCAC = 0xA2, _MM_PERM_CCAD = 0xA3, _MM_PERM_CCBA = 0xA4,
  _MM_PERM_CCBB = 0xA5, _MM_PERM_CCBC = 0xA6, _MM_PERM_CCBD = 0xA7,
  _MM_PERM_CCCA = 0xA8, _MM_PERM_CCCB = 0xA9, _MM_PERM_CCCC = 0xAA,
  _MM_PERM_CCCD = 0xAB, _MM_PERM_CCDA = 0xAC, _MM_PERM_CCDB = 0xAD,
  _MM_PERM_CCDC = 0xAE, _MM_PERM_CCDD = 0xAF, _MM_PERM_CDAA = 0xB0,
  _MM_PERM_CDAB = 0xB1, _MM_PERM_CDAC = 0xB2, _MM_PERM_CDAD = 0xB3,
  _MM_PERM_CDBA = 0xB4, _MM_PERM_CDBB = 0xB5, _MM_PERM_CDBC = 0xB6,
  _MM_PERM_CDBD = 0xB7, _MM_PERM_CDCA = 0xB8, _MM_PERM_CDCB = 0xB9,
  _MM_PERM_CDCC = 0xBA, _MM_PERM_CDCD = 0xBB, _MM_PERM_CDDA = 0xBC,
  _MM_PERM_CDDB = 0xBD, _MM_PERM_CDDC = 0xBE, _MM_PERM_CDDD = 0xBF,
  _MM_PERM_DAAA = 0xC0, _MM_PERM_DAAB = 0xC1, _MM_PERM_DAAC = 0xC2,
  _MM_PERM_DAAD = 0xC3, _MM_PERM_DABA = 0xC4, _MM_PERM_DABB = 0xC5,
  _MM_PERM_DABC = 0xC6, _MM_PERM_DABD = 0xC7, _MM_PERM_DACA = 0xC8,
  _MM_PERM_DACB = 0xC9, _MM_PERM_DACC = 0xCA, _MM_PERM_DACD = 0xCB,
  _MM_PERM_DADA = 0xCC, _MM_PERM_DADB = 0xCD, _MM_PERM_DADC = 0xCE,
  _MM_PERM_DADD = 0xCF, _MM_PERM_DBAA = 0xD0, _MM_PERM_DBAB = 0xD1,
  _MM_PERM_DBAC = 0xD2, _MM_PERM_DBAD = 0xD3, _MM_PERM_DBBA = 0xD4,
  _MM_PERM_DBBB = 0xD5, _MM_PERM_DBBC = 0xD6, _MM_PERM_DBBD = 0xD7,
  _MM_PERM_DBCA = 0xD8, _MM_PERM_DBCB = 0xD9, _MM_PERM_DBCC = 0xDA,
  _MM_PERM_DBCD = 0xDB, _MM_PERM_DBDA = 0xDC, _MM_PERM_DBDB = 0xDD,
  _MM_PERM_DBDC = 0xDE, _MM_PERM_DBDD = 0xDF, _MM_PERM_DCAA = 0xE0,
  _MM_PERM_DCAB = 0xE1, _MM_PERM_DCAC = 0xE2, _MM_PERM_DCAD = 0xE3,
  _MM_PERM_DCBA = 0xE4, _MM_PERM_DCBB = 0xE5, _MM_PERM_DCBC = 0xE6,
  _MM_PERM_DCBD = 0xE7, _MM_PERM_DCCA = 0xE8, _MM_PERM_DCCB = 0xE9,
  _MM_PERM_DCCC = 0xEA, _MM_PERM_DCCD = 0xEB, _MM_PERM_DCDA = 0xEC,
  _MM_PERM_DCDB = 0xED, _MM_PERM_DCDC = 0xEE, _MM_PERM_DCDD = 0xEF,
  _MM_PERM_DDAA = 0xF0, _MM_PERM_DDAB = 0xF1, _MM_PERM_DDAC = 0xF2,
  _MM_PERM_DDAD = 0xF3, _MM_PERM_DDBA = 0xF4, _MM_PERM_DDBB = 0xF5,
  _MM_PERM_DDBC = 0xF6, _MM_PERM_DDBD = 0xF7, _MM_PERM_DDCA = 0xF8,
  _MM_PERM_DDCB = 0xF9, _MM_PERM_DDCC = 0xFA, _MM_PERM_DDCD = 0xFB,
  _MM_PERM_DDDA = 0xFC, _MM_PERM_DDDB = 0xFD, _MM_PERM_DDDC = 0xFE,
  _MM_PERM_DDDD = 0xFF
} _MM_PERM_ENUM;
 153
/* Normalization-interval selector for the getmant intrinsics. */
typedef enum
{
  _MM_MANT_NORM_1_2,    /* interval [1, 2)      */
  _MM_MANT_NORM_p5_2,   /* interval [0.5, 2)    */
  _MM_MANT_NORM_p5_1,   /* interval [0.5, 1)    */
  _MM_MANT_NORM_p75_1p5   /* interval [0.75, 1.5) */
} _MM_MANTISSA_NORM_ENUM;

/* Sign-control selector for the getmant intrinsics. */
typedef enum
{
  _MM_MANT_SIGN_src,    /* sign = sign(SRC)     */
  _MM_MANT_SIGN_zero,   /* sign = 0             */
  _MM_MANT_SIGN_nan   /* DEST = NaN if sign(SRC) = 1 */
} _MM_MANTISSA_SIGN_ENUM;
 168
/* Define the default attributes for the functions in this file. */
/* The 512-bit variant requires the evex512 feature; the 128-bit and
 * width-agnostic variants explicitly disable it (no-evex512) so they can be
 * used when only 256-bit EVEX encodings are available. */
#define __DEFAULT_FN_ATTRS512 __attribute__((__always_inline__, __nodebug__, __target__("avx512f,evex512"), __min_vector_width__(512)))
#define __DEFAULT_FN_ATTRS128                                                  \
  __attribute__((__always_inline__, __nodebug__,                               \
                 __target__("avx512f,no-evex512"), __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS                                                     \
  __attribute__((__always_inline__, __nodebug__,                               \
                 __target__("avx512f,no-evex512")))
 177
 178#if defined(__cplusplus) && (__cplusplus >= 201103L)
 179#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS constexpr
 180#define __DEFAULT_FN_ATTRS512_CONSTEXPR __DEFAULT_FN_ATTRS512 constexpr
 181#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 constexpr
 182#else
 183#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS128
 184#define __DEFAULT_FN_ATTRS512_CONSTEXPR __DEFAULT_FN_ATTRS512
 185#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS
 186#endif
 187
 188/* Create vectors with repeated elements */
 189
/// Returns a 512-bit integer vector with all elements set to zero.
static __inline __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
_mm512_setzero_si512(void) {
  return __extension__(__m512i)(__v8di){0, 0, 0, 0, 0, 0, 0, 0};
}

/* Historical alias; identical to _mm512_setzero_si512. */
#define _mm512_setzero_epi32 _mm512_setzero_si512
 196
/// Returns a 512-bit vector of [8 x double] with unspecified contents.
/// The program must not rely on any particular value.
static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_undefined_pd(void)
{
  return (__m512d)__builtin_ia32_undef512();
}

/// Returns a 512-bit vector of [16 x float] with unspecified contents.
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_undefined(void)
{
  return (__m512)__builtin_ia32_undef512();
}

/// Returns a 512-bit vector of [16 x float] with unspecified contents.
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_undefined_ps(void)
{
  return (__m512)__builtin_ia32_undef512();
}

/// Returns a 512-bit integer vector with unspecified contents.
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_undefined_epi32(void)
{
  return (__m512i)__builtin_ia32_undef512();
}
 220
/// Broadcasts the low 32-bit element of \a __A to all 16 lanes of the result.
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_broadcastd_epi32 (__m128i __A)
{
  return (__m512i)__builtin_shufflevector((__v4si) __A, (__v4si) __A,
                                          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
}

/// Masked broadcast: lane i is the broadcast value when mask bit i is set,
/// otherwise it is taken from \a __O.
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_broadcastd_epi32 (__m512i __O, __mmask16 __M, __m128i __A)
{
  return (__m512i)__builtin_ia32_selectd_512(__M,
                                             (__v16si) _mm512_broadcastd_epi32(__A),
                                             (__v16si) __O);
}

/// Zero-masked broadcast: lane i is the broadcast value when mask bit i is
/// set, otherwise zero.
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_broadcastd_epi32 (__mmask16 __M, __m128i __A)
{
  return (__m512i)__builtin_ia32_selectd_512(__M,
                                             (__v16si) _mm512_broadcastd_epi32(__A),
                                             (__v16si) _mm512_setzero_si512());
}
 243
/// Broadcasts the low 64-bit element of \a __A to all 8 lanes of the result.
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_broadcastq_epi64 (__m128i __A)
{
  return (__m512i)__builtin_shufflevector((__v2di) __A, (__v2di) __A,
                                          0, 0, 0, 0, 0, 0, 0, 0);
}

/// Masked broadcast: lane i is the broadcast value when mask bit i is set,
/// otherwise it is taken from \a __O.
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_broadcastq_epi64 (__m512i __O, __mmask8 __M, __m128i __A)
{
  return (__m512i)__builtin_ia32_selectq_512(__M,
                                             (__v8di) _mm512_broadcastq_epi64(__A),
                                             (__v8di) __O);

}

/// Zero-masked broadcast: lane i is the broadcast value when mask bit i is
/// set, otherwise zero.
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_broadcastq_epi64 (__mmask8 __M, __m128i __A)
{
  return (__m512i)__builtin_ia32_selectq_512(__M,
                                             (__v8di) _mm512_broadcastq_epi64(__A),
                                             (__v8di) _mm512_setzero_si512());
}
 267
/// Returns a 512-bit vector of [16 x float] with all elements set to 0.0f.
static __inline __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_setzero_ps(void) {
  return __extension__(__m512){0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
                               0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f};
}

/* Historical alias; identical to _mm512_setzero_ps. */
#define _mm512_setzero _mm512_setzero_ps

/// Returns a 512-bit vector of [8 x double] with all elements set to 0.0.
static __inline __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR
_mm512_setzero_pd(void) {
  return __extension__(__m512d){0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
}
 279
/// Returns a 512-bit vector of [16 x float] with all elements set to \a __w.
static __inline __m512 __DEFAULT_FN_ATTRS512
_mm512_set1_ps(float __w)
{
  return __extension__ (__m512){ __w, __w, __w, __w, __w, __w, __w, __w,
                                 __w, __w, __w, __w, __w, __w, __w, __w  };
}

/// Returns a 512-bit vector of [8 x double] with all elements set to \a __w.
static __inline __m512d __DEFAULT_FN_ATTRS512
_mm512_set1_pd(double __w)
{
  return __extension__ (__m512d){ __w, __w, __w, __w, __w, __w, __w, __w };
}
 292
/// Returns a 512-bit integer vector with all 64 byte elements set to \a __w.
static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_set1_epi8(char __w)
{
  return __extension__ (__m512i)(__v64qi){
    __w, __w, __w, __w, __w, __w, __w, __w,
    __w, __w, __w, __w, __w, __w, __w, __w,
    __w, __w, __w, __w, __w, __w, __w, __w,
    __w, __w, __w, __w, __w, __w, __w, __w,
    __w, __w, __w, __w, __w, __w, __w, __w,
    __w, __w, __w, __w, __w, __w, __w, __w,
    __w, __w, __w, __w, __w, __w, __w, __w,
    __w, __w, __w, __w, __w, __w, __w, __w  };
}

/// Returns a 512-bit integer vector with all 32 16-bit elements set to \a __w.
static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_set1_epi16(short __w)
{
  return __extension__ (__m512i)(__v32hi){
    __w, __w, __w, __w, __w, __w, __w, __w,
    __w, __w, __w, __w, __w, __w, __w, __w,
    __w, __w, __w, __w, __w, __w, __w, __w,
    __w, __w, __w, __w, __w, __w, __w, __w };
}

/// Returns a 512-bit integer vector with all 16 32-bit elements set to \a __s.
static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_set1_epi32(int __s)
{
  return __extension__ (__m512i)(__v16si){
    __s, __s, __s, __s, __s, __s, __s, __s,
    __s, __s, __s, __s, __s, __s, __s, __s };
}
 324
/// Zero-masked set1: 32-bit lane i is \a __A when mask bit i is set, else 0.
static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_set1_epi32(__mmask16 __M, int __A)
{
  return (__m512i)__builtin_ia32_selectd_512(__M,
                                             (__v16si)_mm512_set1_epi32(__A),
                                             (__v16si)_mm512_setzero_si512());
}

/// Returns a 512-bit integer vector with all 8 64-bit elements set to \a __d.
static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_set1_epi64(long long __d)
{
  return __extension__(__m512i)(__v8di){ __d, __d, __d, __d, __d, __d, __d, __d };
}

/// Zero-masked set1: 64-bit lane i is \a __A when mask bit i is set, else 0.
static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_set1_epi64(__mmask8 __M, long long __A)
{
  return (__m512i)__builtin_ia32_selectq_512(__M,
                                             (__v8di)_mm512_set1_epi64(__A),
                                             (__v8di)_mm512_setzero_si512());
}
 346
/// Broadcasts the low single-precision element of \a __A to all 16 lanes.
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_broadcastss_ps(__m128 __A)
{
  return (__m512)__builtin_shufflevector((__v4sf) __A, (__v4sf) __A,
                                         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
}
 353
/// Repeats the 4-element pattern (A,B,C,D) four times across the 16 32-bit
/// lanes.  Note: elements are listed D-first because the initializer is in
/// memory order while the arguments are in "set" (high-to-low) order.
static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_set4_epi32 (int __A, int __B, int __C, int __D)
{
  return __extension__ (__m512i)(__v16si)
   { __D, __C, __B, __A, __D, __C, __B, __A,
     __D, __C, __B, __A, __D, __C, __B, __A };
}

/// Repeats the 4-element pattern (A,B,C,D) twice across the 8 64-bit lanes.
static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_set4_epi64 (long long __A, long long __B, long long __C,
       long long __D)
{
  return __extension__ (__m512i) (__v8di)
   { __D, __C, __B, __A, __D, __C, __B, __A };
}

/// Repeats the 4-element pattern (A,B,C,D) twice across the 8 double lanes.
static __inline __m512d __DEFAULT_FN_ATTRS512
_mm512_set4_pd (double __A, double __B, double __C, double __D)
{
  return __extension__ (__m512d)
   { __D, __C, __B, __A, __D, __C, __B, __A };
}

/// Repeats the 4-element pattern (A,B,C,D) four times across the 16 float
/// lanes.
static __inline __m512 __DEFAULT_FN_ATTRS512
_mm512_set4_ps (float __A, float __B, float __C, float __D)
{
  return __extension__ (__m512)
   { __D, __C, __B, __A, __D, __C, __B, __A,
     __D, __C, __B, __A, __D, __C, __B, __A };
}

/* "setr4" variants take the elements in memory (reversed) order and simply
 * forward to the corresponding set4 intrinsic with arguments swapped. */
#define _mm512_setr4_epi32(e0,e1,e2,e3)               \
  _mm512_set4_epi32((e3),(e2),(e1),(e0))

#define _mm512_setr4_epi64(e0,e1,e2,e3)               \
  _mm512_set4_epi64((e3),(e2),(e1),(e0))

#define _mm512_setr4_pd(e0,e1,e2,e3)                \
  _mm512_set4_pd((e3),(e2),(e1),(e0))

#define _mm512_setr4_ps(e0,e1,e2,e3)                \
  _mm512_set4_ps((e3),(e2),(e1),(e0))
 396
/// Broadcasts the low double-precision element of \a __A to all 8 lanes.
static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_broadcastsd_pd(__m128d __A)
{
  return (__m512d)__builtin_shufflevector((__v2df) __A, (__v2df) __A,
                                          0, 0, 0, 0, 0, 0, 0, 0);
}
 403
 404/* Cast between vector types */
 405
/// Widens a 256-bit double vector to 512 bits; the upper 256 bits are
/// unspecified (__builtin_nondeterministic_value), matching the "cast"
/// intrinsic contract of generating no instructions.
static __inline __m512d __DEFAULT_FN_ATTRS512
_mm512_castpd256_pd512(__m256d __a)
{
  return __builtin_shufflevector(__a, __builtin_nondeterministic_value(__a), 0,
                                 1, 2, 3, 4, 5, 6, 7);
}

/// Widens a 256-bit float vector to 512 bits; the upper 256 bits are
/// unspecified.
static __inline __m512 __DEFAULT_FN_ATTRS512
_mm512_castps256_ps512(__m256 __a)
{
  return __builtin_shufflevector(__a, __builtin_nondeterministic_value(__a), 0,
                                 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
}

/// Returns the low 128 bits of \a __a as a vector of [2 x double].
static __inline __m128d __DEFAULT_FN_ATTRS512
_mm512_castpd512_pd128(__m512d __a)
{
  return __builtin_shufflevector(__a, __a, 0, 1);
}

/// Returns the low 256 bits of \a __A as a vector of [4 x double].
static __inline __m256d __DEFAULT_FN_ATTRS512
_mm512_castpd512_pd256 (__m512d __A)
{
  return __builtin_shufflevector(__A, __A, 0, 1, 2, 3);
}

/// Returns the low 128 bits of \a __a as a vector of [4 x float].
static __inline __m128 __DEFAULT_FN_ATTRS512
_mm512_castps512_ps128(__m512 __a)
{
  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3);
}

/// Returns the low 256 bits of \a __A as a vector of [8 x float].
static __inline __m256 __DEFAULT_FN_ATTRS512
_mm512_castps512_ps256 (__m512 __A)
{
  return __builtin_shufflevector(__A, __A, 0, 1, 2, 3, 4, 5, 6, 7);
}
 443
/// Bit-casts a vector of [8 x double] to [16 x float]; no instructions
/// are generated.
static __inline __m512 __DEFAULT_FN_ATTRS512
_mm512_castpd_ps (__m512d __A)
{
  return (__m512) (__A);
}

/// Bit-casts a vector of [8 x double] to a 512-bit integer vector.
static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_castpd_si512 (__m512d __A)
{
  return (__m512i) (__A);
}
 455
/// Widens a 128-bit double vector to 512 bits in two shuffle steps
/// (128->256->512); all upper bits are unspecified
/// (__builtin_nondeterministic_value), so no instructions are generated.
static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_castpd128_pd512 (__m128d __A)
{
  __m256d __B = __builtin_nondeterministic_value(__B);
  return __builtin_shufflevector(
      __builtin_shufflevector(__A, __builtin_nondeterministic_value(__A), 0, 1, 2, 3),
      __B, 0, 1, 2, 3, 4, 5, 6, 7);
}
 464
/// Bit-casts a vector of [16 x float] to [8 x double]; no instructions
/// are generated.
static __inline __m512d __DEFAULT_FN_ATTRS512
_mm512_castps_pd (__m512 __A)
{
  return (__m512d) (__A);
}

/// Bit-casts a vector of [16 x float] to a 512-bit integer vector.
static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_castps_si512 (__m512 __A)
{
  return (__m512i) (__A);
}
 476
/// Widens a 128-bit float vector to 512 bits; all upper bits are unspecified
/// (__builtin_nondeterministic_value), so no instructions are generated.
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_castps128_ps512 (__m128 __A)
{
  __m256 __B = __builtin_nondeterministic_value(__B);
  return __builtin_shufflevector(
      __builtin_shufflevector(__A, __builtin_nondeterministic_value(__A), 0, 1, 2, 3, 4, 5, 6, 7),
      __B, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
}

/// Widens a 128-bit integer vector to 512 bits; upper bits unspecified.
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_castsi128_si512 (__m128i __A)
{
  __m256i __B = __builtin_nondeterministic_value(__B);
  return __builtin_shufflevector(
      __builtin_shufflevector(__A, __builtin_nondeterministic_value(__A), 0, 1, 2, 3),
      __B, 0, 1, 2, 3, 4, 5, 6, 7);
}

/// Widens a 256-bit integer vector to 512 bits; upper bits unspecified.
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_castsi256_si512 (__m256i __A)
{
   return  __builtin_shufflevector( __A, __builtin_nondeterministic_value(__A), 0, 1, 2, 3, 4, 5, 6, 7);
}
 500
/// Bit-casts a 512-bit integer vector to [16 x float]; no instructions
/// are generated.
static __inline __m512 __DEFAULT_FN_ATTRS512
_mm512_castsi512_ps (__m512i __A)
{
  return (__m512) (__A);
}

/// Bit-casts a 512-bit integer vector to [8 x double].
static __inline __m512d __DEFAULT_FN_ATTRS512
_mm512_castsi512_pd (__m512i __A)
{
  return (__m512d) (__A);
}

/// Returns the low 128 bits of \a __A as a 128-bit integer vector.
static __inline __m128i __DEFAULT_FN_ATTRS512
_mm512_castsi512_si128 (__m512i __A)
{
  return (__m128i)__builtin_shufflevector(__A, __A , 0, 1);
}

/// Returns the low 256 bits of \a __A as a 256-bit integer vector.
static __inline __m256i __DEFAULT_FN_ATTRS512
_mm512_castsi512_si256 (__m512i __A)
{
  return (__m256i)__builtin_shufflevector(__A, __A , 0, 1, 2, 3);
}
 524
/// Converts the low 16 bits of an int to a 16-bit write mask.
static __inline__ __mmask16 __DEFAULT_FN_ATTRS
_mm512_int2mask(int __a)
{
  return (__mmask16)__a;
}

/// Converts a 16-bit write mask to an int (zero-extended).
static __inline__ int __DEFAULT_FN_ATTRS
_mm512_mask2int(__mmask16 __a)
{
  return (int)__a;
}
 536
/// Constructs a 512-bit floating-point vector of [8 x double] from a
///    128-bit floating-point vector of [2 x double]. The lower 128 bits
///    contain the value of the source vector. The upper 384 bits are set
///    to zero.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic has no corresponding instruction.
///
/// \param __a
///    A 128-bit vector of [2 x double].
/// \returns A 512-bit floating-point vector of [8 x double]. The lower 128 bits
///    contain the value of the parameter. The upper 384 bits are set to zero.
static __inline __m512d __DEFAULT_FN_ATTRS512
_mm512_zextpd128_pd512(__m128d __a)
{
  /* Indices >= 2 select elements of the appended zero vector. */
  return __builtin_shufflevector((__v2df)__a, (__v2df)_mm_setzero_pd(), 0, 1, 2, 3, 2, 3, 2, 3);
}
 555
/// Constructs a 512-bit floating-point vector of [8 x double] from a
///    256-bit floating-point vector of [4 x double]. The lower 256 bits
///    contain the value of the source vector. The upper 256 bits are set
///    to zero.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic has no corresponding instruction.
///
/// \param __a
///    A 256-bit vector of [4 x double].
/// \returns A 512-bit floating-point vector of [8 x double]. The lower 256 bits
///    contain the value of the parameter. The upper 256 bits are set to zero.
static __inline __m512d __DEFAULT_FN_ATTRS512
_mm512_zextpd256_pd512(__m256d __a)
{
  /* Indices 4..7 select elements of the appended zero vector. */
  return __builtin_shufflevector((__v4df)__a, (__v4df)_mm256_setzero_pd(), 0, 1, 2, 3, 4, 5, 6, 7);
}
 574
/// Constructs a 512-bit floating-point vector of [16 x float] from a
///    128-bit floating-point vector of [4 x float]. The lower 128 bits contain
///    the value of the source vector. The upper 384 bits are set to zero.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic has no corresponding instruction.
///
/// \param __a
///    A 128-bit vector of [4 x float].
/// \returns A 512-bit floating-point vector of [16 x float]. The lower 128 bits
///    contain the value of the parameter. The upper 384 bits are set to zero.
static __inline __m512 __DEFAULT_FN_ATTRS512
_mm512_zextps128_ps512(__m128 __a)
{
  /* Indices >= 4 select elements of the appended zero vector. */
  return __builtin_shufflevector((__v4sf)__a, (__v4sf)_mm_setzero_ps(), 0, 1, 2, 3, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7);
}
 592
/// Constructs a 512-bit floating-point vector of [16 x float] from a
///    256-bit floating-point vector of [8 x float]. The lower 256 bits contain
///    the value of the source vector. The upper 256 bits are set to zero.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic has no corresponding instruction.
///
/// \param __a
///    A 256-bit vector of [8 x float].
/// \returns A 512-bit floating-point vector of [16 x float]. The lower 256 bits
///    contain the value of the parameter. The upper 256 bits are set to zero.
static __inline __m512 __DEFAULT_FN_ATTRS512
_mm512_zextps256_ps512(__m256 __a)
{
  /* Indices 8..15 select elements of the appended zero vector. */
  return __builtin_shufflevector((__v8sf)__a, (__v8sf)_mm256_setzero_ps(), 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
}
 610
/// Constructs a 512-bit integer vector from a 128-bit integer vector.
///    The lower 128 bits contain the value of the source vector. The upper
///    384 bits are set to zero.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic has no corresponding instruction.
///
/// \param __a
///    A 128-bit integer vector.
/// \returns A 512-bit integer vector. The lower 128 bits contain the value of
///    the parameter. The upper 384 bits are set to zero.
static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_zextsi128_si512(__m128i __a)
{
  /* Indices >= 2 select elements of the appended zero vector. */
  return __builtin_shufflevector((__v2di)__a, (__v2di)_mm_setzero_si128(), 0, 1, 2, 3, 2, 3, 2, 3);
}
 628
/// Constructs a 512-bit integer vector from a 256-bit integer vector.
///    The lower 256 bits contain the value of the source vector. The upper
///    256 bits are set to zero.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic has no corresponding instruction.
///
/// \param __a
///    A 256-bit integer vector.
/// \returns A 512-bit integer vector. The lower 256 bits contain the value of
///    the parameter. The upper 256 bits are set to zero.
static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_zextsi256_si512(__m256i __a)
{
  /* Indices 4..7 select elements of the appended zero vector. */
  return __builtin_shufflevector((__v4di)__a, (__v4di)_mm256_setzero_si256(), 0, 1, 2, 3, 4, 5, 6, 7);
}
 646
/* Bitwise operators */
/// Bitwise AND, viewed as 16 x 32-bit lanes (lane view matters only for the
/// masked forms).
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_and_epi32(__m512i __a, __m512i __b)
{
  return (__m512i)((__v16su)__a & (__v16su)__b);
}

/// Masked AND: 32-bit lane i is (__a & __b) when mask bit i is set,
/// otherwise it is taken from \a __src.
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_and_epi32(__m512i __src, __mmask16 __k, __m512i __a, __m512i __b)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__k,
                (__v16si) _mm512_and_epi32(__a, __b),
                (__v16si) __src);
}

/// Zero-masked AND: lane i is (__a & __b) when mask bit i is set, else 0.
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_and_epi32(__mmask16 __k, __m512i __a, __m512i __b)
{
  return (__m512i) _mm512_mask_and_epi32(_mm512_setzero_si512 (),
                                         __k, __a, __b);
}

/// Bitwise AND, viewed as 8 x 64-bit lanes.
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_and_epi64(__m512i __a, __m512i __b)
{
  return (__m512i)((__v8du)__a & (__v8du)__b);
}

/// Masked AND: 64-bit lane i is (__a & __b) when mask bit i is set,
/// otherwise it is taken from \a __src.
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_and_epi64(__m512i __src, __mmask8 __k, __m512i __a, __m512i __b)
{
    return (__m512i) __builtin_ia32_selectq_512 ((__mmask8) __k,
                (__v8di) _mm512_and_epi64(__a, __b),
                (__v8di) __src);
}

/// Zero-masked AND: 64-bit lane i is (__a & __b) when mask bit i is set,
/// else 0.
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_and_epi64(__mmask8 __k, __m512i __a, __m512i __b)
{
  return (__m512i) _mm512_mask_and_epi64(_mm512_setzero_si512 (),
                                         __k, __a, __b);
}
 689
/// Computes (~__A) & __B across the full 512 bits.
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_andnot_si512 (__m512i __A, __m512i __B)
{
  return (__m512i)(~(__v8du)__A & (__v8du)__B);
}

/// Computes (~__A) & __B, viewed as 16 x 32-bit lanes.
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_andnot_epi32 (__m512i __A, __m512i __B)
{
  return (__m512i)(~(__v16su)__A & (__v16su)__B);
}

/// Masked ANDNOT: 32-bit lane i is (~__A & __B) when mask bit i is set,
/// otherwise it is taken from \a __W.
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_andnot_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
                                         (__v16si)_mm512_andnot_epi32(__A, __B),
                                         (__v16si)__W);
}

/// Zero-masked ANDNOT: 32-bit lane i is (~__A & __B) when mask bit i is set,
/// else 0.
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_andnot_epi32(__mmask16 __U, __m512i __A, __m512i __B)
{
  return (__m512i)_mm512_mask_andnot_epi32(_mm512_setzero_si512(),
                                           __U, __A, __B);
}

/// Computes (~__A) & __B, viewed as 8 x 64-bit lanes.
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_andnot_epi64(__m512i __A, __m512i __B)
{
  return (__m512i)(~(__v8du)__A & (__v8du)__B);
}

/// Masked ANDNOT: 64-bit lane i is (~__A & __B) when mask bit i is set,
/// otherwise it is taken from \a __W.
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_andnot_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
                                          (__v8di)_mm512_andnot_epi64(__A, __B),
                                          (__v8di)__W);
}

/// Zero-masked ANDNOT: 64-bit lane i is (~__A & __B) when mask bit i is set,
/// else 0.
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_andnot_epi64(__mmask8 __U, __m512i __A, __m512i __B)
{
  return (__m512i)_mm512_mask_andnot_epi64(_mm512_setzero_si512(),
                                           __U, __A, __B);
}
 737
/// Bitwise OR, viewed as 16 x 32-bit lanes.
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_or_epi32(__m512i __a, __m512i __b)
{
  return (__m512i)((__v16su)__a | (__v16su)__b);
}

/// Masked OR: 32-bit lane i is (__a | __b) when mask bit i is set,
/// otherwise it is taken from \a __src.
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_or_epi32(__m512i __src, __mmask16 __k, __m512i __a, __m512i __b)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__k,
                                             (__v16si)_mm512_or_epi32(__a, __b),
                                             (__v16si)__src);
}

/// Zero-masked OR: 32-bit lane i is (__a | __b) when mask bit i is set,
/// else 0.
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_or_epi32(__mmask16 __k, __m512i __a, __m512i __b)
{
  return (__m512i)_mm512_mask_or_epi32(_mm512_setzero_si512(), __k, __a, __b);
}

/// Bitwise OR, viewed as 8 x 64-bit lanes.
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_or_epi64(__m512i __a, __m512i __b)
{
  return (__m512i)((__v8du)__a | (__v8du)__b);
}

/// Masked OR: 64-bit lane i is (__a | __b) when mask bit i is set,
/// otherwise it is taken from \a __src.
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_or_epi64(__m512i __src, __mmask8 __k, __m512i __a, __m512i __b)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__k,
                                             (__v8di)_mm512_or_epi64(__a, __b),
                                             (__v8di)__src);
}

/// Zero-masked OR: 64-bit lane i is (__a | __b) when mask bit i is set,
/// else 0.
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_or_epi64(__mmask8 __k, __m512i __a, __m512i __b)
{
  return (__m512i)_mm512_mask_or_epi64(_mm512_setzero_si512(), __k, __a, __b);
}
 777
/* Bitwise XOR of the sixteen 32-bit lanes of __a and __b. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_xor_epi32(__m512i __a, __m512i __b)
{
  return (__m512i)((__v16su)__a ^ (__v16su)__b);
}

/* Merge-masked XOR: lane i = (__a ^ __b)[i] if __k bit i is set, else __src[i]. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_xor_epi32(__m512i __src, __mmask16 __k, __m512i __a, __m512i __b)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__k,
                                            (__v16si)_mm512_xor_epi32(__a, __b),
                                            (__v16si)__src);
}

/* Zero-masked XOR: lane i = (__a ^ __b)[i] if __k bit i is set, else 0. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_xor_epi32(__mmask16 __k, __m512i __a, __m512i __b)
{
  return (__m512i)_mm512_mask_xor_epi32(_mm512_setzero_si512(), __k, __a, __b);
}
 797
/* Bitwise XOR of the eight 64-bit lanes of __a and __b. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_xor_epi64(__m512i __a, __m512i __b)
{
  return (__m512i)((__v8du)__a ^ (__v8du)__b);
}

/* Merge-masked XOR: lane i = (__a ^ __b)[i] if __k bit i is set, else __src[i]. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_xor_epi64(__m512i __src, __mmask8 __k, __m512i __a, __m512i __b)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__k,
                                             (__v8di)_mm512_xor_epi64(__a, __b),
                                             (__v8di)__src);
}

/* Zero-masked XOR: lane i = (__a ^ __b)[i] if __k bit i is set, else 0. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_xor_epi64(__mmask8 __k, __m512i __a, __m512i __b)
{
  return (__m512i)_mm512_mask_xor_epi64(_mm512_setzero_si512(), __k, __a, __b);
}
 817
/* Whole-register bitwise AND of two 512-bit values.  The 64-bit element
 * view is arbitrary; the operation is bitwise over all 512 bits. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_and_si512(__m512i __a, __m512i __b)
{
  return (__m512i)((__v8du)__a & (__v8du)__b);
}

/* Whole-register bitwise OR of two 512-bit values. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_or_si512(__m512i __a, __m512i __b)
{
  return (__m512i)((__v8du)__a | (__v8du)__b);
}

/* Whole-register bitwise XOR of two 512-bit values. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_xor_si512(__m512i __a, __m512i __b)
{
  return (__m512i)((__v8du)__a ^ (__v8du)__b);
}
 835
 836/* Arithmetic */
 837
/* Lanewise double-precision add of the eight lanes of __a and __b. */
static __inline __m512d __DEFAULT_FN_ATTRS512
_mm512_add_pd(__m512d __a, __m512d __b)
{
  return (__m512d)((__v8df)__a + (__v8df)__b);
}

/* Lanewise single-precision add of the sixteen lanes of __a and __b. */
static __inline __m512 __DEFAULT_FN_ATTRS512
_mm512_add_ps(__m512 __a, __m512 __b)
{
  return (__m512)((__v16sf)__a + (__v16sf)__b);
}

/* Lanewise double-precision multiply. */
static __inline __m512d __DEFAULT_FN_ATTRS512
_mm512_mul_pd(__m512d __a, __m512d __b)
{
  return (__m512d)((__v8df)__a * (__v8df)__b);
}

/* Lanewise single-precision multiply. */
static __inline __m512 __DEFAULT_FN_ATTRS512
_mm512_mul_ps(__m512 __a, __m512 __b)
{
  return (__m512)((__v16sf)__a * (__v16sf)__b);
}

/* Lanewise double-precision subtract (__a - __b). */
static __inline __m512d __DEFAULT_FN_ATTRS512
_mm512_sub_pd(__m512d __a, __m512d __b)
{
  return (__m512d)((__v8df)__a - (__v8df)__b);
}

/* Lanewise single-precision subtract (__a - __b). */
static __inline __m512 __DEFAULT_FN_ATTRS512
_mm512_sub_ps(__m512 __a, __m512 __b)
{
  return (__m512)((__v16sf)__a - (__v16sf)__b);
}
 873
/* Lanewise 64-bit add.  Unsigned element type (__v8du) so that lane
 * overflow wraps instead of being signed-overflow UB in C. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_add_epi64 (__m512i __A, __m512i __B)
{
  return (__m512i) ((__v8du) __A + (__v8du) __B);
}

/* Merge-masked add: lane i = (__A + __B)[i] if __U bit i is set, else __W[i]. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_add_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
                                             (__v8di)_mm512_add_epi64(__A, __B),
                                             (__v8di)__W);
}

/* Zero-masked add: lane i = (__A + __B)[i] if __U bit i is set, else 0. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_add_epi64(__mmask8 __U, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
                                             (__v8di)_mm512_add_epi64(__A, __B),
                                             (__v8di)_mm512_setzero_si512());
}
 895
/* Lanewise 64-bit subtract (__A - __B); unsigned type keeps wraparound
 * well-defined. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_sub_epi64 (__m512i __A, __m512i __B)
{
  return (__m512i) ((__v8du) __A - (__v8du) __B);
}

/* Merge-masked subtract: lane i = (__A - __B)[i] if __U bit i is set, else __W[i]. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_sub_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
                                             (__v8di)_mm512_sub_epi64(__A, __B),
                                             (__v8di)__W);
}

/* Zero-masked subtract: lane i = (__A - __B)[i] if __U bit i is set, else 0. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_sub_epi64(__mmask8 __U, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
                                             (__v8di)_mm512_sub_epi64(__A, __B),
                                             (__v8di)_mm512_setzero_si512());
}
 917
/* Lanewise 32-bit add; unsigned type keeps wraparound well-defined. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_add_epi32 (__m512i __A, __m512i __B)
{
  return (__m512i) ((__v16su) __A + (__v16su) __B);
}

/* Merge-masked add: lane i = (__A + __B)[i] if __U bit i is set, else __W[i]. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_add_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
                                             (__v16si)_mm512_add_epi32(__A, __B),
                                             (__v16si)__W);
}

/* Zero-masked add: lane i = (__A + __B)[i] if __U bit i is set, else 0. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_add_epi32 (__mmask16 __U, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
                                             (__v16si)_mm512_add_epi32(__A, __B),
                                             (__v16si)_mm512_setzero_si512());
}
 939
/* Lanewise 32-bit subtract (__A - __B); unsigned type keeps wraparound
 * well-defined. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_sub_epi32 (__m512i __A, __m512i __B)
{
  return (__m512i) ((__v16su) __A - (__v16su) __B);
}

/* Merge-masked subtract: lane i = (__A - __B)[i] if __U bit i is set, else __W[i]. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_sub_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
                                             (__v16si)_mm512_sub_epi32(__A, __B),
                                             (__v16si)__W);
}

/* Zero-masked subtract: lane i = (__A - __B)[i] if __U bit i is set, else 0. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_sub_epi32(__mmask16 __U, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
                                             (__v16si)_mm512_sub_epi32(__A, __B),
                                             (__v16si)_mm512_setzero_si512());
}
 961
/* Lanewise double-precision max with explicit rounding/SAE control R.
 * Macro form so R can remain a compile-time constant for the builtin. */
#define _mm512_max_round_pd(A, B, R) \
  ((__m512d)__builtin_ia32_maxpd512((__v8df)(__m512d)(A), \
                                    (__v8df)(__m512d)(B), (int)(R)))

/* Merge-masked variant: unselected lanes come from W. */
#define _mm512_mask_max_round_pd(W, U, A, B, R) \
  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
                                   (__v8df)_mm512_max_round_pd((A), (B), (R)), \
                                   (__v8df)(W)))

/* Zero-masked variant: unselected lanes are zeroed. */
#define _mm512_maskz_max_round_pd(U, A, B, R) \
  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
                                   (__v8df)_mm512_max_round_pd((A), (B), (R)), \
                                   (__v8df)_mm512_setzero_pd()))
 975
/* Lanewise double-precision max, using the current rounding direction. */
static  __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_max_pd(__m512d __A, __m512d __B)
{
  return (__m512d) __builtin_ia32_maxpd512((__v8df) __A, (__v8df) __B,
                                           _MM_FROUND_CUR_DIRECTION);
}

/* Merge-masked max: lane i = max result if __U bit i is set, else __W[i]. */
static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_max_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
{
  return (__m512d)__builtin_ia32_selectpd_512(__U,
                                              (__v8df)_mm512_max_pd(__A, __B),
                                              (__v8df)__W);
}

/* Zero-masked max: lane i = max result if __U bit i is set, else 0.0. */
static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_maskz_max_pd (__mmask8 __U, __m512d __A, __m512d __B)
{
  return (__m512d)__builtin_ia32_selectpd_512(__U,
                                              (__v8df)_mm512_max_pd(__A, __B),
                                              (__v8df)_mm512_setzero_pd());
}
 998
/* Lanewise single-precision max with explicit rounding/SAE control R.
 * Macro form so R can remain a compile-time constant for the builtin. */
#define _mm512_max_round_ps(A, B, R) \
  ((__m512)__builtin_ia32_maxps512((__v16sf)(__m512)(A), \
                                   (__v16sf)(__m512)(B), (int)(R)))

/* Merge-masked variant: unselected lanes come from W. */
#define _mm512_mask_max_round_ps(W, U, A, B, R) \
  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
                                  (__v16sf)_mm512_max_round_ps((A), (B), (R)), \
                                  (__v16sf)(W)))

/* Zero-masked variant: unselected lanes are zeroed. */
#define _mm512_maskz_max_round_ps(U, A, B, R) \
  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
                                  (__v16sf)_mm512_max_round_ps((A), (B), (R)), \
                                  (__v16sf)_mm512_setzero_ps()))
1012
/* Lanewise single-precision max, using the current rounding direction. */
static  __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_max_ps(__m512 __A, __m512 __B)
{
  return (__m512) __builtin_ia32_maxps512((__v16sf) __A, (__v16sf) __B,
                                          _MM_FROUND_CUR_DIRECTION);
}

/* Merge-masked max: lane i = max result if __U bit i is set, else __W[i]. */
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_max_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
{
  return (__m512)__builtin_ia32_selectps_512(__U,
                                             (__v16sf)_mm512_max_ps(__A, __B),
                                             (__v16sf)__W);
}

/* Zero-masked max: lane i = max result if __U bit i is set, else 0.0f. */
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_max_ps (__mmask16 __U, __m512 __A, __m512 __B)
{
  return (__m512)__builtin_ia32_selectps_512(__U,
                                             (__v16sf)_mm512_max_ps(__A, __B),
                                             (__v16sf)_mm512_setzero_ps());
}
1035
/* Merge-masked scalar single-precision max of the low lanes of __A/__B,
 * selected against __W by mask bit 0; uses the current rounding direction. */
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_mask_max_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) {
  return (__m128) __builtin_ia32_maxss_round_mask ((__v4sf) __A,
                (__v4sf) __B,
                (__v4sf) __W,
                (__mmask8) __U,
                _MM_FROUND_CUR_DIRECTION);
}

/* Zero-masked scalar single-precision max: the merge source is all zeros. */
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_maskz_max_ss(__mmask8 __U,__m128 __A, __m128 __B) {
  return (__m128) __builtin_ia32_maxss_round_mask ((__v4sf) __A,
                (__v4sf) __B,
                (__v4sf)  _mm_setzero_ps (),
                (__mmask8) __U,
                _MM_FROUND_CUR_DIRECTION);
}
1053
/* Scalar single-precision max with explicit rounding/SAE control R;
 * unmasked form passes an all-ones mask. */
#define _mm_max_round_ss(A, B, R) \
  ((__m128)__builtin_ia32_maxss_round_mask((__v4sf)(__m128)(A), \
                                           (__v4sf)(__m128)(B), \
                                           (__v4sf)_mm_setzero_ps(), \
                                           (__mmask8)-1, (int)(R)))

/* Merge-masked variant: merge source is W. */
#define _mm_mask_max_round_ss(W, U, A, B, R) \
  ((__m128)__builtin_ia32_maxss_round_mask((__v4sf)(__m128)(A), \
                                           (__v4sf)(__m128)(B), \
                                           (__v4sf)(__m128)(W), (__mmask8)(U), \
                                           (int)(R)))

/* Zero-masked variant: merge source is all zeros. */
#define _mm_maskz_max_round_ss(U, A, B, R) \
  ((__m128)__builtin_ia32_maxss_round_mask((__v4sf)(__m128)(A), \
                                           (__v4sf)(__m128)(B), \
                                           (__v4sf)_mm_setzero_ps(), \
                                           (__mmask8)(U), (int)(R)))
1071
/* Merge-masked scalar double-precision max of the low lanes of __A/__B,
 * selected against __W by mask bit 0; uses the current rounding direction. */
static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_mask_max_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) {
  return (__m128d) __builtin_ia32_maxsd_round_mask ((__v2df) __A,
                (__v2df) __B,
                (__v2df) __W,
                (__mmask8) __U,
                _MM_FROUND_CUR_DIRECTION);
}

/* Zero-masked scalar double-precision max: the merge source is all zeros. */
static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_maskz_max_sd(__mmask8 __U,__m128d __A, __m128d __B) {
  return (__m128d) __builtin_ia32_maxsd_round_mask ((__v2df) __A,
                (__v2df) __B,
                (__v2df)  _mm_setzero_pd (),
                (__mmask8) __U,
                _MM_FROUND_CUR_DIRECTION);
}
1089
/* Scalar double-precision max with explicit rounding/SAE control R;
 * unmasked form passes an all-ones mask. */
#define _mm_max_round_sd(A, B, R) \
  ((__m128d)__builtin_ia32_maxsd_round_mask((__v2df)(__m128d)(A), \
                                            (__v2df)(__m128d)(B), \
                                            (__v2df)_mm_setzero_pd(), \
                                            (__mmask8)-1, (int)(R)))

/* Merge-masked variant: merge source is W. */
#define _mm_mask_max_round_sd(W, U, A, B, R) \
  ((__m128d)__builtin_ia32_maxsd_round_mask((__v2df)(__m128d)(A), \
                                            (__v2df)(__m128d)(B), \
                                            (__v2df)(__m128d)(W), \
                                            (__mmask8)(U), (int)(R)))

/* Zero-masked variant: merge source is all zeros. */
#define _mm_maskz_max_round_sd(U, A, B, R) \
  ((__m128d)__builtin_ia32_maxsd_round_mask((__v2df)(__m128d)(A), \
                                            (__v2df)(__m128d)(B), \
                                            (__v2df)_mm_setzero_pd(), \
                                            (__mmask8)(U), (int)(R)))
1107
/* Lanewise signed 32-bit max via the generic elementwise builtin. */
static __inline __m512i
__DEFAULT_FN_ATTRS512
_mm512_max_epi32(__m512i __A, __m512i __B)
{
  return (__m512i)__builtin_elementwise_max((__v16si)__A, (__v16si)__B);
}

/* Merge-masked max: lane i = max(__A,__B)[i] if __M bit i is set, else __W[i]. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_max_epi32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
                                            (__v16si)_mm512_max_epi32(__A, __B),
                                            (__v16si)__W);
}

/* Zero-masked max: lane i = max(__A,__B)[i] if __M bit i is set, else 0. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_max_epi32 (__mmask16 __M, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
                                            (__v16si)_mm512_max_epi32(__A, __B),
                                            (__v16si)_mm512_setzero_si512());
}
1130
/* Lanewise unsigned 32-bit max (unsigned vector type selects the
 * unsigned comparison). */
static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_max_epu32(__m512i __A, __m512i __B)
{
  return (__m512i)__builtin_elementwise_max((__v16su)__A, (__v16su)__B);
}

/* Merge-masked max: lane i = max(__A,__B)[i] if __M bit i is set, else __W[i]. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_max_epu32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
                                            (__v16si)_mm512_max_epu32(__A, __B),
                                            (__v16si)__W);
}

/* Zero-masked max: lane i = max(__A,__B)[i] if __M bit i is set, else 0. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_max_epu32 (__mmask16 __M, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
                                            (__v16si)_mm512_max_epu32(__A, __B),
                                            (__v16si)_mm512_setzero_si512());
}
1152
/* Lanewise signed 64-bit max. */
static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_max_epi64(__m512i __A, __m512i __B)
{
  return (__m512i)__builtin_elementwise_max((__v8di)__A, (__v8di)__B);
}

/* Merge-masked max: lane i = max(__A,__B)[i] if __M bit i is set, else __W[i]. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_max_epi64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
                                             (__v8di)_mm512_max_epi64(__A, __B),
                                             (__v8di)__W);
}

/* Zero-masked max: lane i = max(__A,__B)[i] if __M bit i is set, else 0. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_max_epi64 (__mmask8 __M, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
                                             (__v8di)_mm512_max_epi64(__A, __B),
                                             (__v8di)_mm512_setzero_si512());
}
1174
/* Lanewise unsigned 64-bit max. */
static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_max_epu64(__m512i __A, __m512i __B)
{
  return (__m512i)__builtin_elementwise_max((__v8du)__A, (__v8du)__B);
}

/* Merge-masked max: lane i = max(__A,__B)[i] if __M bit i is set, else __W[i]. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_max_epu64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
                                             (__v8di)_mm512_max_epu64(__A, __B),
                                             (__v8di)__W);
}

/* Zero-masked max: lane i = max(__A,__B)[i] if __M bit i is set, else 0. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_max_epu64 (__mmask8 __M, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
                                             (__v8di)_mm512_max_epu64(__A, __B),
                                             (__v8di)_mm512_setzero_si512());
}
1196
/* Lanewise double-precision min with explicit rounding/SAE control R.
 * Macro form so R can remain a compile-time constant for the builtin. */
#define _mm512_min_round_pd(A, B, R) \
  ((__m512d)__builtin_ia32_minpd512((__v8df)(__m512d)(A), \
                                    (__v8df)(__m512d)(B), (int)(R)))

/* Merge-masked variant: unselected lanes come from W. */
#define _mm512_mask_min_round_pd(W, U, A, B, R) \
  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
                                   (__v8df)_mm512_min_round_pd((A), (B), (R)), \
                                   (__v8df)(W)))

/* Zero-masked variant: unselected lanes are zeroed. */
#define _mm512_maskz_min_round_pd(U, A, B, R) \
  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
                                   (__v8df)_mm512_min_round_pd((A), (B), (R)), \
                                   (__v8df)_mm512_setzero_pd()))
1210
/* Lanewise double-precision min, using the current rounding direction. */
static  __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_min_pd(__m512d __A, __m512d __B)
{
  return (__m512d) __builtin_ia32_minpd512((__v8df) __A, (__v8df) __B,
                                           _MM_FROUND_CUR_DIRECTION);
}

/* Merge-masked min: lane i = min result if __U bit i is set, else __W[i]. */
static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_min_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
{
  return (__m512d)__builtin_ia32_selectpd_512(__U,
                                              (__v8df)_mm512_min_pd(__A, __B),
                                              (__v8df)__W);
}

/* Zero-masked min: lane i = min result if __U bit i is set, else 0.0. */
static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_maskz_min_pd (__mmask8 __U, __m512d __A, __m512d __B)
{
  return (__m512d)__builtin_ia32_selectpd_512(__U,
                                              (__v8df)_mm512_min_pd(__A, __B),
                                              (__v8df)_mm512_setzero_pd());
}
1233
/* Lanewise single-precision min with explicit rounding/SAE control R.
 * Macro form so R can remain a compile-time constant for the builtin. */
#define _mm512_min_round_ps(A, B, R) \
  ((__m512)__builtin_ia32_minps512((__v16sf)(__m512)(A), \
                                   (__v16sf)(__m512)(B), (int)(R)))

/* Merge-masked variant: unselected lanes come from W. */
#define _mm512_mask_min_round_ps(W, U, A, B, R) \
  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
                                  (__v16sf)_mm512_min_round_ps((A), (B), (R)), \
                                  (__v16sf)(W)))

/* Zero-masked variant: unselected lanes are zeroed. */
#define _mm512_maskz_min_round_ps(U, A, B, R) \
  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
                                  (__v16sf)_mm512_min_round_ps((A), (B), (R)), \
                                  (__v16sf)_mm512_setzero_ps()))
1247
/* Lanewise single-precision min, using the current rounding direction. */
static  __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_min_ps(__m512 __A, __m512 __B)
{
  return (__m512) __builtin_ia32_minps512((__v16sf) __A, (__v16sf) __B,
                                          _MM_FROUND_CUR_DIRECTION);
}

/* Merge-masked min: lane i = min result if __U bit i is set, else __W[i]. */
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_min_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
{
  return (__m512)__builtin_ia32_selectps_512(__U,
                                             (__v16sf)_mm512_min_ps(__A, __B),
                                             (__v16sf)__W);
}

/* Zero-masked min: lane i = min result if __U bit i is set, else 0.0f. */
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_min_ps (__mmask16 __U, __m512 __A, __m512 __B)
{
  return (__m512)__builtin_ia32_selectps_512(__U,
                                             (__v16sf)_mm512_min_ps(__A, __B),
                                             (__v16sf)_mm512_setzero_ps());
}
1270
/* Merge-masked scalar single-precision min of the low lanes of __A/__B,
 * selected against __W by mask bit 0; uses the current rounding direction. */
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_mask_min_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) {
  return (__m128) __builtin_ia32_minss_round_mask ((__v4sf) __A,
                (__v4sf) __B,
                (__v4sf) __W,
                (__mmask8) __U,
                _MM_FROUND_CUR_DIRECTION);
}

/* Zero-masked scalar single-precision min: the merge source is all zeros. */
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_maskz_min_ss(__mmask8 __U,__m128 __A, __m128 __B) {
  return (__m128) __builtin_ia32_minss_round_mask ((__v4sf) __A,
                (__v4sf) __B,
                (__v4sf)  _mm_setzero_ps (),
                (__mmask8) __U,
                _MM_FROUND_CUR_DIRECTION);
}
1288
/* Scalar single-precision min with explicit rounding/SAE control R;
 * unmasked form passes an all-ones mask. */
#define _mm_min_round_ss(A, B, R) \
  ((__m128)__builtin_ia32_minss_round_mask((__v4sf)(__m128)(A), \
                                           (__v4sf)(__m128)(B), \
                                           (__v4sf)_mm_setzero_ps(), \
                                           (__mmask8)-1, (int)(R)))

/* Merge-masked variant: merge source is W. */
#define _mm_mask_min_round_ss(W, U, A, B, R) \
  ((__m128)__builtin_ia32_minss_round_mask((__v4sf)(__m128)(A), \
                                           (__v4sf)(__m128)(B), \
                                           (__v4sf)(__m128)(W), (__mmask8)(U), \
                                           (int)(R)))

/* Zero-masked variant: merge source is all zeros. */
#define _mm_maskz_min_round_ss(U, A, B, R) \
  ((__m128)__builtin_ia32_minss_round_mask((__v4sf)(__m128)(A), \
                                           (__v4sf)(__m128)(B), \
                                           (__v4sf)_mm_setzero_ps(), \
                                           (__mmask8)(U), (int)(R)))
1306
/* Merge-masked scalar double-precision min of the low lanes of __A/__B,
 * selected against __W by mask bit 0; uses the current rounding direction. */
static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_mask_min_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) {
  return (__m128d) __builtin_ia32_minsd_round_mask ((__v2df) __A,
                (__v2df) __B,
                (__v2df) __W,
                (__mmask8) __U,
                _MM_FROUND_CUR_DIRECTION);
}

/* Zero-masked scalar double-precision min: the merge source is all zeros. */
static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_maskz_min_sd(__mmask8 __U,__m128d __A, __m128d __B) {
  return (__m128d) __builtin_ia32_minsd_round_mask ((__v2df) __A,
                (__v2df) __B,
                (__v2df)  _mm_setzero_pd (),
                (__mmask8) __U,
                _MM_FROUND_CUR_DIRECTION);
}
1324
/* Scalar double-precision min with explicit rounding/SAE control R;
 * unmasked form passes an all-ones mask. */
#define _mm_min_round_sd(A, B, R) \
  ((__m128d)__builtin_ia32_minsd_round_mask((__v2df)(__m128d)(A), \
                                            (__v2df)(__m128d)(B), \
                                            (__v2df)_mm_setzero_pd(), \
                                            (__mmask8)-1, (int)(R)))

/* Merge-masked variant: merge source is W. */
#define _mm_mask_min_round_sd(W, U, A, B, R) \
  ((__m128d)__builtin_ia32_minsd_round_mask((__v2df)(__m128d)(A), \
                                            (__v2df)(__m128d)(B), \
                                            (__v2df)(__m128d)(W), \
                                            (__mmask8)(U), (int)(R)))

/* Zero-masked variant: merge source is all zeros. */
#define _mm_maskz_min_round_sd(U, A, B, R) \
  ((__m128d)__builtin_ia32_minsd_round_mask((__v2df)(__m128d)(A), \
                                            (__v2df)(__m128d)(B), \
                                            (__v2df)_mm_setzero_pd(), \
                                            (__mmask8)(U), (int)(R)))
1342
/* Lanewise signed 32-bit min via the generic elementwise builtin. */
static __inline __m512i
__DEFAULT_FN_ATTRS512
_mm512_min_epi32(__m512i __A, __m512i __B)
{
  return (__m512i)__builtin_elementwise_min((__v16si)__A, (__v16si)__B);
}

/* Merge-masked min: lane i = min(__A,__B)[i] if __M bit i is set, else __W[i]. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_min_epi32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
                                            (__v16si)_mm512_min_epi32(__A, __B),
                                            (__v16si)__W);
}

/* Zero-masked min: lane i = min(__A,__B)[i] if __M bit i is set, else 0. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_min_epi32 (__mmask16 __M, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
                                            (__v16si)_mm512_min_epi32(__A, __B),
                                            (__v16si)_mm512_setzero_si512());
}
1365
/* Lanewise unsigned 32-bit min (unsigned vector type selects the
 * unsigned comparison). */
static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_min_epu32(__m512i __A, __m512i __B)
{
  return (__m512i)__builtin_elementwise_min((__v16su)__A, (__v16su)__B);
}

/* Merge-masked min: lane i = min(__A,__B)[i] if __M bit i is set, else __W[i]. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_min_epu32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
                                            (__v16si)_mm512_min_epu32(__A, __B),
                                            (__v16si)__W);
}

/* Zero-masked min: lane i = min(__A,__B)[i] if __M bit i is set, else 0. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_min_epu32 (__mmask16 __M, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
                                            (__v16si)_mm512_min_epu32(__A, __B),
                                            (__v16si)_mm512_setzero_si512());
}
1387
/* Lanewise signed 64-bit min. */
static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_min_epi64(__m512i __A, __m512i __B)
{
  return (__m512i)__builtin_elementwise_min((__v8di)__A, (__v8di)__B);
}

/* Merge-masked min: lane i = min(__A,__B)[i] if __M bit i is set, else __W[i]. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_min_epi64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
                                             (__v8di)_mm512_min_epi64(__A, __B),
                                             (__v8di)__W);
}

/* Zero-masked min: lane i = min(__A,__B)[i] if __M bit i is set, else 0. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_min_epi64 (__mmask8 __M, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
                                             (__v8di)_mm512_min_epi64(__A, __B),
                                             (__v8di)_mm512_setzero_si512());
}
1409
/* Lanewise unsigned 64-bit min. */
static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_min_epu64(__m512i __A, __m512i __B)
{
  return (__m512i)__builtin_elementwise_min((__v8du)__A, (__v8du)__B);
}

/* Merge-masked min: lane i = min(__A,__B)[i] if __M bit i is set, else __W[i]. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_min_epu64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
                                             (__v8di)_mm512_min_epu64(__A, __B),
                                             (__v8di)__W);
}

/* Zero-masked min: lane i = min(__A,__B)[i] if __M bit i is set, else 0. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_min_epu64 (__mmask8 __M, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
                                             (__v8di)_mm512_min_epu64(__A, __B),
                                             (__v8di)_mm512_setzero_si512());
}
1431
/* Signed 32x32 -> 64-bit widening multiply of the low 32-bit half of each
 * 64-bit lane (pmuldq semantics), yielding eight 64-bit products. */
static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_mul_epi32(__m512i __X, __m512i __Y)
{
  return (__m512i)__builtin_ia32_pmuldq512((__v16si)__X, (__v16si) __Y);
}

/* Merge-masked widening multiply: 64-bit lane i comes from the product when
 * __M bit i is set, else from __W. */
static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_mul_epi32(__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
                                             (__v8di)_mm512_mul_epi32(__X, __Y),
                                             (__v8di)__W);
}

/* Zero-masked widening multiply: unselected 64-bit lanes are zeroed. */
static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_mul_epi32(__mmask8 __M, __m512i __X, __m512i __Y)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
                                             (__v8di)_mm512_mul_epi32(__X, __Y),
                                             (__v8di)_mm512_setzero_si512 ());
}
1453
/* Unsigned 32x32 -> 64-bit widening multiply of the low 32-bit half of each
 * 64-bit lane (pmuludq semantics), yielding eight 64-bit products. */
static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_mul_epu32(__m512i __X, __m512i __Y)
{
  return (__m512i)__builtin_ia32_pmuludq512((__v16si)__X, (__v16si)__Y);
}

/* Merge-masked widening multiply: 64-bit lane i comes from the product when
 * __M bit i is set, else from __W. */
static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_mul_epu32(__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
                                             (__v8di)_mm512_mul_epu32(__X, __Y),
                                             (__v8di)__W);
}

/* Zero-masked widening multiply: unselected 64-bit lanes are zeroed. */
static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_mul_epu32(__mmask8 __M, __m512i __X, __m512i __Y)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
                                             (__v8di)_mm512_mul_epu32(__X, __Y),
                                             (__v8di)_mm512_setzero_si512 ());
}
1475
/* Lanewise 32-bit multiply keeping the low 32 bits of each product;
 * unsigned type keeps wraparound well-defined. */
static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_mullo_epi32 (__m512i __A, __m512i __B)
{
  return (__m512i) ((__v16su) __A * (__v16su) __B);
}

/* Zero-masked multiply: lane i = (__A * __B)[i] if __M bit i is set, else 0.
 * (Defined before the merge-masked form here, unlike the other families.) */
static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_mullo_epi32(__mmask16 __M, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
                                             (__v16si)_mm512_mullo_epi32(__A, __B),
                                             (__v16si)_mm512_setzero_si512());
}

/* Merge-masked multiply: lane i = (__A * __B)[i] if __M bit i is set, else __W[i]. */
static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_mullo_epi32(__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
                                             (__v16si)_mm512_mullo_epi32(__A, __B),
                                             (__v16si)__W);
}
1497
/* Lanewise 64-bit multiply keeping the low 64 bits of each product,
 * expressed as plain vector arithmetic; unsigned type keeps wraparound
 * well-defined. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mullox_epi64 (__m512i __A, __m512i __B) {
  return (__m512i) ((__v8du) __A * (__v8du) __B);
}

/* Merge-masked multiply: lane i = (__A * __B)[i] if __U bit i is set, else __W[i]. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_mullox_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) {
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
                                             (__v8di)_mm512_mullox_epi64(__A, __B),
                                             (__v8di)__W);
}
1509
/* Lanewise double-precision square root with explicit rounding control R.
 * Macro form so R can remain a compile-time constant for the builtin. */
#define _mm512_sqrt_round_pd(A, R) \
  ((__m512d)__builtin_ia32_sqrtpd512((__v8df)(__m512d)(A), (int)(R)))

/* Merge-masked variant: unselected lanes come from W. */
#define _mm512_mask_sqrt_round_pd(W, U, A, R) \
  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
                                       (__v8df)_mm512_sqrt_round_pd((A), (R)), \
                                       (__v8df)(__m512d)(W)))

/* Zero-masked variant: unselected lanes are zeroed. */
#define _mm512_maskz_sqrt_round_pd(U, A, R) \
  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
                                       (__v8df)_mm512_sqrt_round_pd((A), (R)), \
                                       (__v8df)_mm512_setzero_pd()))
1522
/* Lanewise double-precision square root, using the current rounding
 * direction. */
static  __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_sqrt_pd(__m512d __A)
{
  return (__m512d)__builtin_ia32_sqrtpd512((__v8df)__A,
                                           _MM_FROUND_CUR_DIRECTION);
}

/* Merge-masked sqrt: lane i = sqrt(__A)[i] if __U bit i is set, else __W[i]. */
static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_sqrt_pd (__m512d __W, __mmask8 __U, __m512d __A)
{
  return (__m512d)__builtin_ia32_selectpd_512(__U,
                                              (__v8df)_mm512_sqrt_pd(__A),
                                              (__v8df)__W);
}

/* Zero-masked sqrt: lane i = sqrt(__A)[i] if __U bit i is set, else 0.0. */
static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_maskz_sqrt_pd (__mmask8 __U, __m512d __A)
{
  return (__m512d)__builtin_ia32_selectpd_512(__U,
                                              (__v8df)_mm512_sqrt_pd(__A),
                                              (__v8df)_mm512_setzero_pd());
}
1545
/* Square root of packed floats with an explicit rounding mode R; a macro
 * because the rounding immediate must be a compile-time constant. */
#define _mm512_sqrt_round_ps(A, R) \
  ((__m512)__builtin_ia32_sqrtps512((__v16sf)(__m512)(A), (int)(R)))

/* Merge-masked form: lanes whose bit in U is clear are copied from W. */
#define _mm512_mask_sqrt_round_ps(W, U, A, R) \
  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
                                      (__v16sf)_mm512_sqrt_round_ps((A), (R)), \
                                      (__v16sf)(__m512)(W)))

/* Zero-masked form: lanes whose bit in U is clear are zeroed. */
#define _mm512_maskz_sqrt_round_ps(U, A, R) \
  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
                                      (__v16sf)_mm512_sqrt_round_ps((A), (R)), \
                                      (__v16sf)_mm512_setzero_ps()))

/* Square root of packed floats using the current rounding direction. */
static  __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_sqrt_ps(__m512 __A)
{
  return (__m512)__builtin_ia32_sqrtps512((__v16sf)__A,
                                          _MM_FROUND_CUR_DIRECTION);
}

/* Merge-masked sqrt: lanes with clear bits in __U come from __W. */
static  __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_sqrt_ps(__m512 __W, __mmask16 __U, __m512 __A)
{
  return (__m512)__builtin_ia32_selectps_512(__U,
                                             (__v16sf)_mm512_sqrt_ps(__A),
                                             (__v16sf)__W);
}

/* Zero-masked sqrt: lanes with clear bits in __U are zeroed. */
static  __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_sqrt_ps( __mmask16 __U, __m512 __A)
{
  return (__m512)__builtin_ia32_selectps_512(__U,
                                             (__v16sf)_mm512_sqrt_ps(__A),
                                             (__v16sf)_mm512_setzero_ps());
}
1581
1582static  __inline__ __m512d __DEFAULT_FN_ATTRS512
1583_mm512_rsqrt14_pd(__m512d __A)
1584{
1585  return (__m512d) __builtin_ia32_rsqrt14pd512_mask ((__v8df) __A,
1586                 (__v8df)
1587                 _mm512_setzero_pd (),
1588                 (__mmask8) -1);}
1589
/* Merge-masked rsqrt14 of packed doubles: lanes with clear bits in __U are
 * taken from __W. */
static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_rsqrt14_pd (__m512d __W, __mmask8 __U, __m512d __A)
{
  return (__m512d) __builtin_ia32_rsqrt14pd512_mask ((__v8df) __A,
                  (__v8df) __W,
                  (__mmask8) __U);
}

/* Zero-masked rsqrt14 of packed doubles: lanes with clear bits in __U are
 * zeroed. */
static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_maskz_rsqrt14_pd (__mmask8 __U, __m512d __A)
{
  return (__m512d) __builtin_ia32_rsqrt14pd512_mask ((__v8df) __A,
                  (__v8df)
                  _mm512_setzero_pd (),
                  (__mmask8) __U);
}

/* Approximate reciprocal square root of packed floats (all lanes; the
 * pass-through operand is a dummy zero vector). */
static  __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_rsqrt14_ps(__m512 __A)
{
  return (__m512) __builtin_ia32_rsqrt14ps512_mask ((__v16sf) __A,
                (__v16sf)
                _mm512_setzero_ps (),
                (__mmask16) -1);
}

/* Merge-masked rsqrt14 of packed floats. */
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_rsqrt14_ps (__m512 __W, __mmask16 __U, __m512 __A)
{
  return (__m512) __builtin_ia32_rsqrt14ps512_mask ((__v16sf) __A,
                 (__v16sf) __W,
                 (__mmask16) __U);
}

/* Zero-masked rsqrt14 of packed floats. */
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_rsqrt14_ps (__mmask16 __U, __m512 __A)
{
  return (__m512) __builtin_ia32_rsqrt14ps512_mask ((__v16sf) __A,
                 (__v16sf)
                 _mm512_setzero_ps (),
                 (__mmask16) __U);
}
1632
/* Scalar rsqrt14: element 0 gets the approximate reciprocal square root of
 * element 0 of __B; upper elements are copied from __A. */
static  __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_rsqrt14_ss(__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_rsqrt14ss_mask ((__v4sf) __A,
             (__v4sf) __B,
             (__v4sf)
             _mm_setzero_ps (),
             (__mmask8) -1);
}

/* Merge-masked scalar rsqrt14: element 0 comes from __W when bit 0 of __U
 * is clear. */
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_mask_rsqrt14_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
{
 return (__m128) __builtin_ia32_rsqrt14ss_mask ((__v4sf) __A,
          (__v4sf) __B,
          (__v4sf) __W,
          (__mmask8) __U);
}

/* Zero-masked scalar rsqrt14: element 0 is zeroed when bit 0 of __U is
 * clear. */
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_maskz_rsqrt14_ss (__mmask8 __U, __m128 __A, __m128 __B)
{
 return (__m128) __builtin_ia32_rsqrt14ss_mask ((__v4sf) __A,
          (__v4sf) __B,
          (__v4sf) _mm_setzero_ps (),
          (__mmask8) __U);
}

/* Scalar double rsqrt14: element 0 from __B, upper element from __A. */
static  __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_rsqrt14_sd(__m128d __A, __m128d __B)
{
  return (__m128d) __builtin_ia32_rsqrt14sd_mask ((__v2df) __A,
              (__v2df) __B,
              (__v2df)
              _mm_setzero_pd (),
              (__mmask8) -1);
}

/* Merge-masked scalar double rsqrt14. */
static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_mask_rsqrt14_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
{
 return (__m128d) __builtin_ia32_rsqrt14sd_mask ( (__v2df) __A,
          (__v2df) __B,
          (__v2df) __W,
          (__mmask8) __U);
}

/* Zero-masked scalar double rsqrt14. */
static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_maskz_rsqrt14_sd (__mmask8 __U, __m128d __A, __m128d __B)
{
 return (__m128d) __builtin_ia32_rsqrt14sd_mask ( (__v2df) __A,
          (__v2df) __B,
          (__v2df) _mm_setzero_pd (),
          (__mmask8) __U);
}
1688
/* Approximate reciprocal of packed doubles (all lanes; the pass-through
 * operand is a dummy zero vector). */
static  __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_rcp14_pd(__m512d __A)
{
  return (__m512d) __builtin_ia32_rcp14pd512_mask ((__v8df) __A,
               (__v8df)
               _mm512_setzero_pd (),
               (__mmask8) -1);
}

/* Merge-masked rcp14: lanes with clear bits in __U come from __W. */
static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_rcp14_pd (__m512d __W, __mmask8 __U, __m512d __A)
{
  return (__m512d) __builtin_ia32_rcp14pd512_mask ((__v8df) __A,
                (__v8df) __W,
                (__mmask8) __U);
}

/* Zero-masked rcp14: lanes with clear bits in __U are zeroed. */
static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_maskz_rcp14_pd (__mmask8 __U, __m512d __A)
{
  return (__m512d) __builtin_ia32_rcp14pd512_mask ((__v8df) __A,
                (__v8df)
                _mm512_setzero_pd (),
                (__mmask8) __U);
}

/* Approximate reciprocal of packed floats (all lanes). */
static  __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_rcp14_ps(__m512 __A)
{
  return (__m512) __builtin_ia32_rcp14ps512_mask ((__v16sf) __A,
              (__v16sf)
              _mm512_setzero_ps (),
              (__mmask16) -1);
}

/* Merge-masked rcp14 of packed floats. */
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_rcp14_ps (__m512 __W, __mmask16 __U, __m512 __A)
{
  return (__m512) __builtin_ia32_rcp14ps512_mask ((__v16sf) __A,
                   (__v16sf) __W,
                   (__mmask16) __U);
}

/* Zero-masked rcp14 of packed floats. */
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_rcp14_ps (__mmask16 __U, __m512 __A)
{
  return (__m512) __builtin_ia32_rcp14ps512_mask ((__v16sf) __A,
                   (__v16sf)
                   _mm512_setzero_ps (),
                   (__mmask16) __U);
}
1740
/* Scalar rcp14: element 0 gets the approximate reciprocal of element 0 of
 * __B; upper elements are copied from __A. */
static  __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_rcp14_ss(__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_rcp14ss_mask ((__v4sf) __A,
                 (__v4sf) __B,
                 (__v4sf)
                 _mm_setzero_ps (),
                 (__mmask8) -1);
}

/* Merge-masked scalar rcp14: element 0 comes from __W when bit 0 of __U is
 * clear. */
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_mask_rcp14_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
{
 return (__m128) __builtin_ia32_rcp14ss_mask ((__v4sf) __A,
          (__v4sf) __B,
          (__v4sf) __W,
          (__mmask8) __U);
}

/* Zero-masked scalar rcp14: element 0 is zeroed when bit 0 of __U is
 * clear. */
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_maskz_rcp14_ss (__mmask8 __U, __m128 __A, __m128 __B)
{
 return (__m128) __builtin_ia32_rcp14ss_mask ((__v4sf) __A,
          (__v4sf) __B,
          (__v4sf) _mm_setzero_ps (),
          (__mmask8) __U);
}

/* Scalar double rcp14: element 0 from __B, upper element from __A. */
static  __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_rcp14_sd(__m128d __A, __m128d __B)
{
  return (__m128d) __builtin_ia32_rcp14sd_mask ((__v2df) __A,
            (__v2df) __B,
            (__v2df)
            _mm_setzero_pd (),
            (__mmask8) -1);
}

/* Merge-masked scalar double rcp14. */
static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_mask_rcp14_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
{
 return (__m128d) __builtin_ia32_rcp14sd_mask ( (__v2df) __A,
          (__v2df) __B,
          (__v2df) __W,
          (__mmask8) __U);
}

/* Zero-masked scalar double rcp14. */
static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_maskz_rcp14_sd (__mmask8 __U, __m128d __A, __m128d __B)
{
 return (__m128d) __builtin_ia32_rcp14sd_mask ( (__v2df) __A,
          (__v2df) __B,
          (__v2df) _mm_setzero_pd (),
          (__mmask8) __U);
}
1796
1797static __inline __m512 __DEFAULT_FN_ATTRS512
1798_mm512_floor_ps(__m512 __A)
1799{
1800  return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A,
1801                                                  _MM_FROUND_FLOOR,
1802                                                  (__v16sf) __A, (unsigned short)-1,
1803                                                  _MM_FROUND_CUR_DIRECTION);
1804}
1805
1806static __inline__ __m512 __DEFAULT_FN_ATTRS512
1807_mm512_mask_floor_ps (__m512 __W, __mmask16 __U, __m512 __A)
1808{
1809  return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A,
1810                   _MM_FROUND_FLOOR,
1811                   (__v16sf) __W, __U,
1812                   _MM_FROUND_CUR_DIRECTION);
1813}
1814
1815static __inline __m512d __DEFAULT_FN_ATTRS512
1816_mm512_floor_pd(__m512d __A)
1817{
1818  return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A,
1819                                                   _MM_FROUND_FLOOR,
1820                                                   (__v8df) __A, (unsigned char)-1,
1821                                                   _MM_FROUND_CUR_DIRECTION);
1822}
1823
1824static __inline__ __m512d __DEFAULT_FN_ATTRS512
1825_mm512_mask_floor_pd (__m512d __W, __mmask8 __U, __m512d __A)
1826{
1827  return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A,
1828                _MM_FROUND_FLOOR,
1829                (__v8df) __W, __U,
1830                _MM_FROUND_CUR_DIRECTION);
1831}
1832
1833static __inline__ __m512 __DEFAULT_FN_ATTRS512
1834_mm512_mask_ceil_ps (__m512 __W, __mmask16 __U, __m512 __A)
1835{
1836  return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A,
1837                   _MM_FROUND_CEIL,
1838                   (__v16sf) __W, __U,
1839                   _MM_FROUND_CUR_DIRECTION);
1840}
1841
1842static __inline __m512 __DEFAULT_FN_ATTRS512
1843_mm512_ceil_ps(__m512 __A)
1844{
1845  return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A,
1846                                                  _MM_FROUND_CEIL,
1847                                                  (__v16sf) __A, (unsigned short)-1,
1848                                                  _MM_FROUND_CUR_DIRECTION);
1849}
1850
1851static __inline __m512d __DEFAULT_FN_ATTRS512
1852_mm512_ceil_pd(__m512d __A)
1853{
1854  return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A,
1855                                                   _MM_FROUND_CEIL,
1856                                                   (__v8df) __A, (unsigned char)-1,
1857                                                   _MM_FROUND_CUR_DIRECTION);
1858}
1859
1860static __inline__ __m512d __DEFAULT_FN_ATTRS512
1861_mm512_mask_ceil_pd (__m512d __W, __mmask8 __U, __m512d __A)
1862{
1863  return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A,
1864                _MM_FROUND_CEIL,
1865                (__v8df) __W, __U,
1866                _MM_FROUND_CUR_DIRECTION);
1867}
1868
/* Absolute value of packed 64-bit integers. */
static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_abs_epi64(__m512i __A)
{
  return (__m512i)__builtin_elementwise_abs((__v8di)__A);
}

/* Merge-masked abs: lanes with clear bits in __U come from __W. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_abs_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
                                             (__v8di)_mm512_abs_epi64(__A),
                                             (__v8di)__W);
}

/* Zero-masked abs: lanes with clear bits in __U are zeroed. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_abs_epi64 (__mmask8 __U, __m512i __A)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
                                             (__v8di)_mm512_abs_epi64(__A),
                                             (__v8di)_mm512_setzero_si512());
}

/* Absolute value of packed 32-bit integers. */
static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_abs_epi32(__m512i __A)
{
  return (__m512i)__builtin_elementwise_abs((__v16si) __A);
}

/* Merge-masked abs: lanes with clear bits in __U come from __W. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_abs_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
{
  return (__m512i)__builtin_ia32_selectd_512(__U,
                                             (__v16si)_mm512_abs_epi32(__A),
                                             (__v16si)__W);
}

/* Zero-masked abs: lanes with clear bits in __U are zeroed. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_abs_epi32 (__mmask16 __U, __m512i __A)
{
  return (__m512i)__builtin_ia32_selectd_512(__U,
                                             (__v16si)_mm512_abs_epi32(__A),
                                             (__v16si)_mm512_setzero_si512());
}
1912
1913static __inline__ __m128 __DEFAULT_FN_ATTRS128
1914_mm_mask_add_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) {
1915  __A = _mm_add_ss(__A, __B);
1916  return __builtin_ia32_selectss_128(__U, __A, __W);
1917}
1918
1919static __inline__ __m128 __DEFAULT_FN_ATTRS128
1920_mm_maskz_add_ss(__mmask8 __U,__m128 __A, __m128 __B) {
1921  __A = _mm_add_ss(__A, __B);
1922  return __builtin_ia32_selectss_128(__U, __A, _mm_setzero_ps());
1923}
1924
/* Scalar float add with explicit rounding mode R (macro: R must be a
 * compile-time immediate). */
#define _mm_add_round_ss(A, B, R) \
  ((__m128)__builtin_ia32_addss_round_mask((__v4sf)(__m128)(A), \
                                           (__v4sf)(__m128)(B), \
                                           (__v4sf)_mm_setzero_ps(), \
                                           (__mmask8)-1, (int)(R)))

/* Merge-masked rounded scalar add: element 0 comes from W when bit 0 of U
 * is clear. */
#define _mm_mask_add_round_ss(W, U, A, B, R) \
  ((__m128)__builtin_ia32_addss_round_mask((__v4sf)(__m128)(A), \
                                           (__v4sf)(__m128)(B), \
                                           (__v4sf)(__m128)(W), (__mmask8)(U), \
                                           (int)(R)))

/* Zero-masked rounded scalar add: element 0 is zeroed when bit 0 of U is
 * clear. */
#define _mm_maskz_add_round_ss(U, A, B, R) \
  ((__m128)__builtin_ia32_addss_round_mask((__v4sf)(__m128)(A), \
                                           (__v4sf)(__m128)(B), \
                                           (__v4sf)_mm_setzero_ps(), \
                                           (__mmask8)(U), (int)(R)))

/* Merge-masked scalar double add: element 0 from __W when bit 0 of __U is
 * clear; upper element follows _mm_add_sd (from __A). */
static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_mask_add_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) {
  __A = _mm_add_sd(__A, __B);
  return __builtin_ia32_selectsd_128(__U, __A, __W);
}

/* Zero-masked scalar double add: element 0 zeroed when bit 0 of __U is
 * clear. */
static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_maskz_add_sd(__mmask8 __U,__m128d __A, __m128d __B) {
  __A = _mm_add_sd(__A, __B);
  return __builtin_ia32_selectsd_128(__U, __A, _mm_setzero_pd());
}
/* Scalar double add with explicit rounding mode R. */
#define _mm_add_round_sd(A, B, R) \
  ((__m128d)__builtin_ia32_addsd_round_mask((__v2df)(__m128d)(A), \
                                            (__v2df)(__m128d)(B), \
                                            (__v2df)_mm_setzero_pd(), \
                                            (__mmask8)-1, (int)(R)))

/* Merge-masked rounded scalar double add. */
#define _mm_mask_add_round_sd(W, U, A, B, R) \
  ((__m128d)__builtin_ia32_addsd_round_mask((__v2df)(__m128d)(A), \
                                            (__v2df)(__m128d)(B), \
                                            (__v2df)(__m128d)(W), \
                                            (__mmask8)(U), (int)(R)))

/* Zero-masked rounded scalar double add. */
#define _mm_maskz_add_round_sd(U, A, B, R) \
  ((__m128d)__builtin_ia32_addsd_round_mask((__v2df)(__m128d)(A), \
                                            (__v2df)(__m128d)(B), \
                                            (__v2df)_mm_setzero_pd(), \
                                            (__mmask8)(U), (int)(R)))
1971
/* Merge-masked packed double add: lanes with clear bits in __U come from
 * __W. */
static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_add_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
                                              (__v8df)_mm512_add_pd(__A, __B),
                                              (__v8df)__W);
}

/* Zero-masked packed double add: lanes with clear bits in __U are zeroed. */
static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_maskz_add_pd(__mmask8 __U, __m512d __A, __m512d __B) {
  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
                                              (__v8df)_mm512_add_pd(__A, __B),
                                              (__v8df)_mm512_setzero_pd());
}

/* Merge-masked packed float add. */
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_add_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
                                             (__v16sf)_mm512_add_ps(__A, __B),
                                             (__v16sf)__W);
}

/* Zero-masked packed float add. */
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_add_ps(__mmask16 __U, __m512 __A, __m512 __B) {
  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
                                             (__v16sf)_mm512_add_ps(__A, __B),
                                             (__v16sf)_mm512_setzero_ps());
}

/* Packed add with explicit rounding mode R (macros: R must be a
 * compile-time immediate), plus merge- and zero-masked forms. */
#define _mm512_add_round_pd(A, B, R) \
  ((__m512d)__builtin_ia32_addpd512((__v8df)(__m512d)(A), \
                                    (__v8df)(__m512d)(B), (int)(R)))

#define _mm512_mask_add_round_pd(W, U, A, B, R) \
  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
                                   (__v8df)_mm512_add_round_pd((A), (B), (R)), \
                                   (__v8df)(__m512d)(W)))

#define _mm512_maskz_add_round_pd(U, A, B, R) \
  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
                                   (__v8df)_mm512_add_round_pd((A), (B), (R)), \
                                   (__v8df)_mm512_setzero_pd()))

#define _mm512_add_round_ps(A, B, R) \
  ((__m512)__builtin_ia32_addps512((__v16sf)(__m512)(A), \
                                   (__v16sf)(__m512)(B), (int)(R)))

#define _mm512_mask_add_round_ps(W, U, A, B, R) \
  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
                                  (__v16sf)_mm512_add_round_ps((A), (B), (R)), \
                                  (__v16sf)(__m512)(W)))

#define _mm512_maskz_add_round_ps(U, A, B, R) \
  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
                                  (__v16sf)_mm512_add_round_ps((A), (B), (R)), \
                                  (__v16sf)_mm512_setzero_ps()))
2027
/* Merge-masked scalar float subtract: element 0 from __W when bit 0 of __U
 * is clear; upper elements follow _mm_sub_ss (from __A). */
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_mask_sub_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) {
  __A = _mm_sub_ss(__A, __B);
  return __builtin_ia32_selectss_128(__U, __A, __W);
}

/* Zero-masked scalar float subtract. */
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_maskz_sub_ss(__mmask8 __U,__m128 __A, __m128 __B) {
  __A = _mm_sub_ss(__A, __B);
  return __builtin_ia32_selectss_128(__U, __A, _mm_setzero_ps());
}
/* Scalar float subtract with explicit rounding mode R, plus masked forms. */
#define _mm_sub_round_ss(A, B, R) \
  ((__m128)__builtin_ia32_subss_round_mask((__v4sf)(__m128)(A), \
                                           (__v4sf)(__m128)(B), \
                                           (__v4sf)_mm_setzero_ps(), \
                                           (__mmask8)-1, (int)(R)))

#define _mm_mask_sub_round_ss(W, U, A, B, R) \
  ((__m128)__builtin_ia32_subss_round_mask((__v4sf)(__m128)(A), \
                                           (__v4sf)(__m128)(B), \
                                           (__v4sf)(__m128)(W), (__mmask8)(U), \
                                           (int)(R)))

#define _mm_maskz_sub_round_ss(U, A, B, R) \
  ((__m128)__builtin_ia32_subss_round_mask((__v4sf)(__m128)(A), \
                                           (__v4sf)(__m128)(B), \
                                           (__v4sf)_mm_setzero_ps(), \
                                           (__mmask8)(U), (int)(R)))

/* Merge-masked scalar double subtract. */
static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_mask_sub_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) {
  __A = _mm_sub_sd(__A, __B);
  return __builtin_ia32_selectsd_128(__U, __A, __W);
}

/* Zero-masked scalar double subtract. */
static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_maskz_sub_sd(__mmask8 __U,__m128d __A, __m128d __B) {
  __A = _mm_sub_sd(__A, __B);
  return __builtin_ia32_selectsd_128(__U, __A, _mm_setzero_pd());
}

/* Scalar double subtract with explicit rounding mode R, plus masked forms. */
#define _mm_sub_round_sd(A, B, R) \
  ((__m128d)__builtin_ia32_subsd_round_mask((__v2df)(__m128d)(A), \
                                            (__v2df)(__m128d)(B), \
                                            (__v2df)_mm_setzero_pd(), \
                                            (__mmask8)-1, (int)(R)))

#define _mm_mask_sub_round_sd(W, U, A, B, R) \
  ((__m128d)__builtin_ia32_subsd_round_mask((__v2df)(__m128d)(A), \
                                            (__v2df)(__m128d)(B), \
                                            (__v2df)(__m128d)(W), \
                                            (__mmask8)(U), (int)(R)))

#define _mm_maskz_sub_round_sd(U, A, B, R) \
  ((__m128d)__builtin_ia32_subsd_round_mask((__v2df)(__m128d)(A), \
                                            (__v2df)(__m128d)(B), \
                                            (__v2df)_mm_setzero_pd(), \
                                            (__mmask8)(U), (int)(R)))
2086
/* Merge-masked packed double subtract: lanes with clear bits in __U come
 * from __W. */
static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_sub_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
                                              (__v8df)_mm512_sub_pd(__A, __B),
                                              (__v8df)__W);
}

/* Zero-masked packed double subtract. */
static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_maskz_sub_pd(__mmask8 __U, __m512d __A, __m512d __B) {
  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
                                              (__v8df)_mm512_sub_pd(__A, __B),
                                              (__v8df)_mm512_setzero_pd());
}

/* Merge-masked packed float subtract. */
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_sub_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
                                             (__v16sf)_mm512_sub_ps(__A, __B),
                                             (__v16sf)__W);
}

/* Zero-masked packed float subtract. */
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_sub_ps(__mmask16 __U, __m512 __A, __m512 __B) {
  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
                                             (__v16sf)_mm512_sub_ps(__A, __B),
                                             (__v16sf)_mm512_setzero_ps());
}

/* Packed subtract with explicit rounding mode R, plus masked forms. */
#define _mm512_sub_round_pd(A, B, R) \
  ((__m512d)__builtin_ia32_subpd512((__v8df)(__m512d)(A), \
                                    (__v8df)(__m512d)(B), (int)(R)))

#define _mm512_mask_sub_round_pd(W, U, A, B, R) \
  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
                                   (__v8df)_mm512_sub_round_pd((A), (B), (R)), \
                                   (__v8df)(__m512d)(W)))

#define _mm512_maskz_sub_round_pd(U, A, B, R) \
  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
                                   (__v8df)_mm512_sub_round_pd((A), (B), (R)), \
                                   (__v8df)_mm512_setzero_pd()))

#define _mm512_sub_round_ps(A, B, R) \
  ((__m512)__builtin_ia32_subps512((__v16sf)(__m512)(A), \
                                   (__v16sf)(__m512)(B), (int)(R)))

#define _mm512_mask_sub_round_ps(W, U, A, B, R) \
  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
                                  (__v16sf)_mm512_sub_round_ps((A), (B), (R)), \
                                  (__v16sf)(__m512)(W)))

#define _mm512_maskz_sub_round_ps(U, A, B, R) \
  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
                                  (__v16sf)_mm512_sub_round_ps((A), (B), (R)), \
                                  (__v16sf)_mm512_setzero_ps()))
2142
/* Merge-masked scalar float multiply: element 0 from __W when bit 0 of __U
 * is clear; upper elements follow _mm_mul_ss (from __A). */
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_mask_mul_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) {
  __A = _mm_mul_ss(__A, __B);
  return __builtin_ia32_selectss_128(__U, __A, __W);
}

/* Zero-masked scalar float multiply. */
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_maskz_mul_ss(__mmask8 __U,__m128 __A, __m128 __B) {
  __A = _mm_mul_ss(__A, __B);
  return __builtin_ia32_selectss_128(__U, __A, _mm_setzero_ps());
}
/* Scalar float multiply with explicit rounding mode R, plus masked forms. */
#define _mm_mul_round_ss(A, B, R) \
  ((__m128)__builtin_ia32_mulss_round_mask((__v4sf)(__m128)(A), \
                                           (__v4sf)(__m128)(B), \
                                           (__v4sf)_mm_setzero_ps(), \
                                           (__mmask8)-1, (int)(R)))

#define _mm_mask_mul_round_ss(W, U, A, B, R) \
  ((__m128)__builtin_ia32_mulss_round_mask((__v4sf)(__m128)(A), \
                                           (__v4sf)(__m128)(B), \
                                           (__v4sf)(__m128)(W), (__mmask8)(U), \
                                           (int)(R)))

#define _mm_maskz_mul_round_ss(U, A, B, R) \
  ((__m128)__builtin_ia32_mulss_round_mask((__v4sf)(__m128)(A), \
                                           (__v4sf)(__m128)(B), \
                                           (__v4sf)_mm_setzero_ps(), \
                                           (__mmask8)(U), (int)(R)))

/* Merge-masked scalar double multiply. */
static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_mask_mul_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) {
  __A = _mm_mul_sd(__A, __B);
  return __builtin_ia32_selectsd_128(__U, __A, __W);
}

/* Zero-masked scalar double multiply. */
static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_maskz_mul_sd(__mmask8 __U,__m128d __A, __m128d __B) {
  __A = _mm_mul_sd(__A, __B);
  return __builtin_ia32_selectsd_128(__U, __A, _mm_setzero_pd());
}

/* Scalar double multiply with explicit rounding mode R, plus masked
 * forms. */
#define _mm_mul_round_sd(A, B, R) \
  ((__m128d)__builtin_ia32_mulsd_round_mask((__v2df)(__m128d)(A), \
                                            (__v2df)(__m128d)(B), \
                                            (__v2df)_mm_setzero_pd(), \
                                            (__mmask8)-1, (int)(R)))

#define _mm_mask_mul_round_sd(W, U, A, B, R) \
  ((__m128d)__builtin_ia32_mulsd_round_mask((__v2df)(__m128d)(A), \
                                            (__v2df)(__m128d)(B), \
                                            (__v2df)(__m128d)(W), \
                                            (__mmask8)(U), (int)(R)))

#define _mm_maskz_mul_round_sd(U, A, B, R) \
  ((__m128d)__builtin_ia32_mulsd_round_mask((__v2df)(__m128d)(A), \
                                            (__v2df)(__m128d)(B), \
                                            (__v2df)_mm_setzero_pd(), \
                                            (__mmask8)(U), (int)(R)))
2201
/* Merge-masked packed double multiply: lanes with clear bits in __U come
 * from __W. */
static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_mul_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
                                              (__v8df)_mm512_mul_pd(__A, __B),
                                              (__v8df)__W);
}

/* Zero-masked packed double multiply. */
static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_maskz_mul_pd(__mmask8 __U, __m512d __A, __m512d __B) {
  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
                                              (__v8df)_mm512_mul_pd(__A, __B),
                                              (__v8df)_mm512_setzero_pd());
}

/* Merge-masked packed float multiply. */
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_mul_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
                                             (__v16sf)_mm512_mul_ps(__A, __B),
                                             (__v16sf)__W);
}

/* Zero-masked packed float multiply. */
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_mul_ps(__mmask16 __U, __m512 __A, __m512 __B) {
  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
                                             (__v16sf)_mm512_mul_ps(__A, __B),
                                             (__v16sf)_mm512_setzero_ps());
}

/* Packed multiply with explicit rounding mode R, plus masked forms. */
#define _mm512_mul_round_pd(A, B, R) \
  ((__m512d)__builtin_ia32_mulpd512((__v8df)(__m512d)(A), \
                                    (__v8df)(__m512d)(B), (int)(R)))

#define _mm512_mask_mul_round_pd(W, U, A, B, R) \
  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
                                   (__v8df)_mm512_mul_round_pd((A), (B), (R)), \
                                   (__v8df)(__m512d)(W)))

#define _mm512_maskz_mul_round_pd(U, A, B, R) \
  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
                                   (__v8df)_mm512_mul_round_pd((A), (B), (R)), \
                                   (__v8df)_mm512_setzero_pd()))

#define _mm512_mul_round_ps(A, B, R) \
  ((__m512)__builtin_ia32_mulps512((__v16sf)(__m512)(A), \
                                  (__v16sf)(__m512)(B), (int)(R)))

#define _mm512_mask_mul_round_ps(W, U, A, B, R) \
  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
                                  (__v16sf)_mm512_mul_round_ps((A), (B), (R)), \
                                  (__v16sf)(__m512)(W)))

#define _mm512_maskz_mul_round_ps(U, A, B, R) \
  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
                                  (__v16sf)_mm512_mul_round_ps((A), (B), (R)), \
                                  (__v16sf)_mm512_setzero_ps()))
2257
2258static __inline__ __m128 __DEFAULT_FN_ATTRS128
2259_mm_mask_div_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) {
2260  __A = _mm_div_ss(__A, __B);
2261  return __builtin_ia32_selectss_128(__U, __A, __W);
2262}
2263
2264static __inline__ __m128 __DEFAULT_FN_ATTRS128
2265_mm_maskz_div_ss(__mmask8 __U,__m128 __A, __m128 __B) {
2266  __A = _mm_div_ss(__A, __B);
2267  return __builtin_ia32_selectss_128(__U, __A, _mm_setzero_ps());
2268}
2269
/* Scalar single-precision divide with rounding-mode immediate R.  The masked
 * builtin always takes a passthrough vector and mask; the unmasked form uses
 * mask -1 (always write) with a dummy zero passthrough. */
#define _mm_div_round_ss(A, B, R) \
  ((__m128)__builtin_ia32_divss_round_mask((__v4sf)(__m128)(A), \
                                           (__v4sf)(__m128)(B), \
                                           (__v4sf)_mm_setzero_ps(), \
                                           (__mmask8)-1, (int)(R)))

/* Merge-masked: lane 0 is A[0]/B[0] when bit 0 of U is set, else W[0]. */
#define _mm_mask_div_round_ss(W, U, A, B, R) \
  ((__m128)__builtin_ia32_divss_round_mask((__v4sf)(__m128)(A), \
                                           (__v4sf)(__m128)(B), \
                                           (__v4sf)(__m128)(W), (__mmask8)(U), \
                                           (int)(R)))

/* Zero-masked: lane 0 is A[0]/B[0] when bit 0 of U is set, else 0.0f. */
#define _mm_maskz_div_round_ss(U, A, B, R) \
  ((__m128)__builtin_ia32_divss_round_mask((__v4sf)(__m128)(A), \
                                           (__v4sf)(__m128)(B), \
                                           (__v4sf)_mm_setzero_ps(), \
                                           (__mmask8)(U), (int)(R)))
2287
/* Merge-masked scalar double divide: lane 0 of the result is __A[0]/__B[0]
 * when bit 0 of __U is set, else __W[0].  The upper lane follows the
 * _mm_div_sd convention and is taken from __A. */
static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_mask_div_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) {
  __A = _mm_div_sd(__A, __B);
  return __builtin_ia32_selectsd_128(__U, __A, __W);
}

/* Zero-masked scalar double divide: lane 0 is __A[0]/__B[0] when bit 0 of
 * __U is set, else 0.0; the upper lane comes from __A. */
static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_maskz_div_sd(__mmask8 __U,__m128d __A, __m128d __B) {
  __A = _mm_div_sd(__A, __B);
  return __builtin_ia32_selectsd_128(__U, __A, _mm_setzero_pd());
}
2299
/* Scalar double divide with rounding-mode immediate R.  Unmasked form uses
 * mask -1 (always write) with a dummy zero passthrough. */
#define _mm_div_round_sd(A, B, R) \
  ((__m128d)__builtin_ia32_divsd_round_mask((__v2df)(__m128d)(A), \
                                            (__v2df)(__m128d)(B), \
                                            (__v2df)_mm_setzero_pd(), \
                                            (__mmask8)-1, (int)(R)))

/* Merge-masked: lane 0 is A[0]/B[0] when bit 0 of U is set, else W[0]. */
#define _mm_mask_div_round_sd(W, U, A, B, R) \
  ((__m128d)__builtin_ia32_divsd_round_mask((__v2df)(__m128d)(A), \
                                            (__v2df)(__m128d)(B), \
                                            (__v2df)(__m128d)(W), \
                                            (__mmask8)(U), (int)(R)))

/* Zero-masked: lane 0 is A[0]/B[0] when bit 0 of U is set, else 0.0. */
#define _mm_maskz_div_round_sd(U, A, B, R) \
  ((__m128d)__builtin_ia32_divsd_round_mask((__v2df)(__m128d)(A), \
                                            (__v2df)(__m128d)(B), \
                                            (__v2df)_mm_setzero_pd(), \
                                            (__mmask8)(U), (int)(R)))
2317
2318static __inline __m512d __DEFAULT_FN_ATTRS512
2319_mm512_div_pd(__m512d __a, __m512d __b)
2320{
2321  return (__m512d)((__v8df)__a/(__v8df)__b);
2322}
2323
/* Merge-masked 512-bit double divide: element i is __A[i]/__B[i] when mask
 * bit __U[i] is set, otherwise __W[i]. */
static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_div_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
                                              (__v8df)_mm512_div_pd(__A, __B),
                                              (__v8df)__W);
}

/* Zero-masked 512-bit double divide: element i is __A[i]/__B[i] when mask
 * bit __U[i] is set, otherwise 0.0. */
static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_maskz_div_pd(__mmask8 __U, __m512d __A, __m512d __B) {
  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
                                              (__v8df)_mm512_div_pd(__A, __B),
                                              (__v8df)_mm512_setzero_pd());
}
2337
2338static __inline __m512 __DEFAULT_FN_ATTRS512
2339_mm512_div_ps(__m512 __a, __m512 __b)
2340{
2341  return (__m512)((__v16sf)__a/(__v16sf)__b);
2342}
2343
/* Merge-masked 512-bit float divide: element i is __A[i]/__B[i] when mask
 * bit __U[i] is set, otherwise __W[i]. */
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_div_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
                                             (__v16sf)_mm512_div_ps(__A, __B),
                                             (__v16sf)__W);
}

/* Zero-masked 512-bit float divide: element i is __A[i]/__B[i] when mask
 * bit __U[i] is set, otherwise 0.0f. */
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_div_ps(__mmask16 __U, __m512 __A, __m512 __B) {
  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
                                             (__v16sf)_mm512_div_ps(__A, __B),
                                             (__v16sf)_mm512_setzero_ps());
}
2357
/* 512-bit double divide with explicit rounding-mode immediate R
 * (_MM_FROUND_*). */
#define _mm512_div_round_pd(A, B, R) \
  ((__m512d)__builtin_ia32_divpd512((__v8df)(__m512d)(A), \
                                    (__v8df)(__m512d)(B), (int)(R)))

/* Merge-masked: element i is A[i]/B[i] when U[i] is set, else W[i]. */
#define _mm512_mask_div_round_pd(W, U, A, B, R) \
  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
                                   (__v8df)_mm512_div_round_pd((A), (B), (R)), \
                                   (__v8df)(__m512d)(W)))

/* Zero-masked: element i is A[i]/B[i] when U[i] is set, else 0.0. */
#define _mm512_maskz_div_round_pd(U, A, B, R) \
  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
                                   (__v8df)_mm512_div_round_pd((A), (B), (R)), \
                                   (__v8df)_mm512_setzero_pd()))

/* 512-bit single-precision divide with rounding-mode immediate R. */
#define _mm512_div_round_ps(A, B, R) \
  ((__m512)__builtin_ia32_divps512((__v16sf)(__m512)(A), \
                                   (__v16sf)(__m512)(B), (int)(R)))

/* Merge-masked: element i is A[i]/B[i] when U[i] is set, else W[i]. */
#define _mm512_mask_div_round_ps(W, U, A, B, R) \
  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
                                  (__v16sf)_mm512_div_round_ps((A), (B), (R)), \
                                  (__v16sf)(__m512)(W)))

/* Zero-masked: element i is A[i]/B[i] when U[i] is set, else 0.0f. */
#define _mm512_maskz_div_round_ps(U, A, B, R) \
  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
                                  (__v16sf)_mm512_div_round_ps((A), (B), (R)), \
                                  (__v16sf)_mm512_setzero_ps()))
2385
/* VRNDSCALE family: round each element according to the immediate, which
 * encodes both the number of fraction bits to keep and the rounding
 * behavior.  NOTE the unconventional argument order of the masked forms:
 * in _mm512_mask_roundscale_*(A, B, C, imm), A is the passthrough vector,
 * B the mask, and C the source operand. */
#define _mm512_roundscale_ps(A, B) \
  ((__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(A), (int)(B), \
                                          (__v16sf)_mm512_undefined_ps(), \
                                          (__mmask16)-1, \
                                          _MM_FROUND_CUR_DIRECTION))

/* Merge-masked roundscale: unselected elements come from A (passthrough). */
#define _mm512_mask_roundscale_ps(A, B, C, imm) \
  ((__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(C), (int)(imm), \
                                         (__v16sf)(__m512)(A), (__mmask16)(B), \
                                         _MM_FROUND_CUR_DIRECTION))

/* Zero-masked roundscale of B under mask A. */
#define _mm512_maskz_roundscale_ps(A, B, imm) \
  ((__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(B), (int)(imm), \
                                          (__v16sf)_mm512_setzero_ps(), \
                                          (__mmask16)(A), \
                                          _MM_FROUND_CUR_DIRECTION))

/* _round_* variants additionally take an explicit rounding/SAE immediate R
 * instead of _MM_FROUND_CUR_DIRECTION. */
#define _mm512_mask_roundscale_round_ps(A, B, C, imm, R) \
  ((__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(C), (int)(imm), \
                                         (__v16sf)(__m512)(A), (__mmask16)(B), \
                                         (int)(R)))

#define _mm512_maskz_roundscale_round_ps(A, B, imm, R) \
  ((__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(B), (int)(imm), \
                                          (__v16sf)_mm512_setzero_ps(), \
                                          (__mmask16)(A), (int)(R)))

#define _mm512_roundscale_round_ps(A, imm, R) \
  ((__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(A), (int)(imm), \
                                          (__v16sf)_mm512_undefined_ps(), \
                                          (__mmask16)-1, (int)(R)))

/* Double-precision counterparts; same argument-order caveat as above. */
#define _mm512_roundscale_pd(A, B) \
  ((__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(A), (int)(B), \
                                           (__v8df)_mm512_undefined_pd(), \
                                           (__mmask8)-1, \
                                           _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_roundscale_pd(A, B, C, imm) \
  ((__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(C), (int)(imm), \
                                          (__v8df)(__m512d)(A), (__mmask8)(B), \
                                          _MM_FROUND_CUR_DIRECTION))

#define _mm512_maskz_roundscale_pd(A, B, imm) \
  ((__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(B), (int)(imm), \
                                           (__v8df)_mm512_setzero_pd(), \
                                           (__mmask8)(A), \
                                           _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_roundscale_round_pd(A, B, C, imm, R) \
  ((__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(C), (int)(imm), \
                                          (__v8df)(__m512d)(A), (__mmask8)(B), \
                                          (int)(R)))

#define _mm512_maskz_roundscale_round_pd(A, B, imm, R) \
  ((__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(B), (int)(imm), \
                                           (__v8df)_mm512_setzero_pd(), \
                                           (__mmask8)(A), (int)(R)))

#define _mm512_roundscale_round_pd(A, imm, R) \
  ((__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(A), (int)(imm), \
                                           (__v8df)_mm512_undefined_pd(), \
                                           (__mmask8)-1, (int)(R)))
2449
/* Fused multiply-add family for 512-bit double vectors with explicit
 * rounding control R (_MM_FROUND_*).  All variants lower to the same
 * vfmaddpd512 builtins; fmsub/fnmadd/fnmsub are expressed by negating the
 * C and/or first operand.  Masking convention: _mask merges unselected
 * elements from operand A, _mask3 from operand C, _maskz zeroes them. */
/* A*B + C, unmasked. */
#define _mm512_fmadd_round_pd(A, B, C, R) \
  ((__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \
                                            (__v8df)(__m512d)(B), \
                                            (__v8df)(__m512d)(C), \
                                            (__mmask8)-1, (int)(R)))


/* A*B + C, merge into A where U is clear. */
#define _mm512_mask_fmadd_round_pd(A, U, B, C, R) \
  ((__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \
                                            (__v8df)(__m512d)(B), \
                                            (__v8df)(__m512d)(C), \
                                            (__mmask8)(U), (int)(R)))


/* A*B + C, merge into C where U is clear. */
#define _mm512_mask3_fmadd_round_pd(A, B, C, U, R) \
  ((__m512d)__builtin_ia32_vfmaddpd512_mask3((__v8df)(__m512d)(A), \
                                             (__v8df)(__m512d)(B), \
                                             (__v8df)(__m512d)(C), \
                                             (__mmask8)(U), (int)(R)))


/* A*B + C, zero where U is clear. */
#define _mm512_maskz_fmadd_round_pd(U, A, B, C, R) \
  ((__m512d)__builtin_ia32_vfmaddpd512_maskz((__v8df)(__m512d)(A), \
                                             (__v8df)(__m512d)(B), \
                                             (__v8df)(__m512d)(C), \
                                             (__mmask8)(U), (int)(R)))


/* A*B - C (C negated), unmasked. */
#define _mm512_fmsub_round_pd(A, B, C, R) \
  ((__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \
                                            (__v8df)(__m512d)(B), \
                                            -(__v8df)(__m512d)(C), \
                                            (__mmask8)-1, (int)(R)))


/* A*B - C, merge into A. */
#define _mm512_mask_fmsub_round_pd(A, U, B, C, R) \
  ((__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \
                                            (__v8df)(__m512d)(B), \
                                            -(__v8df)(__m512d)(C), \
                                            (__mmask8)(U), (int)(R)))


/* A*B - C, zero-masked. */
#define _mm512_maskz_fmsub_round_pd(U, A, B, C, R) \
  ((__m512d)__builtin_ia32_vfmaddpd512_maskz((__v8df)(__m512d)(A), \
                                             (__v8df)(__m512d)(B), \
                                             -(__v8df)(__m512d)(C), \
                                             (__mmask8)(U), (int)(R)))


/* -(A*B) + C (A negated), unmasked. */
#define _mm512_fnmadd_round_pd(A, B, C, R) \
  ((__m512d)__builtin_ia32_vfmaddpd512_mask(-(__v8df)(__m512d)(A), \
                                            (__v8df)(__m512d)(B), \
                                            (__v8df)(__m512d)(C), \
                                            (__mmask8)-1, (int)(R)))


/* -(A*B) + C, merge into C. */
#define _mm512_mask3_fnmadd_round_pd(A, B, C, U, R) \
  ((__m512d)__builtin_ia32_vfmaddpd512_mask3(-(__v8df)(__m512d)(A), \
                                             (__v8df)(__m512d)(B), \
                                             (__v8df)(__m512d)(C), \
                                             (__mmask8)(U), (int)(R)))


/* -(A*B) + C, zero-masked. */
#define _mm512_maskz_fnmadd_round_pd(U, A, B, C, R) \
  ((__m512d)__builtin_ia32_vfmaddpd512_maskz(-(__v8df)(__m512d)(A), \
                                             (__v8df)(__m512d)(B), \
                                             (__v8df)(__m512d)(C), \
                                             (__mmask8)(U), (int)(R)))


/* -(A*B) - C, unmasked. */
#define _mm512_fnmsub_round_pd(A, B, C, R) \
  ((__m512d)__builtin_ia32_vfmaddpd512_mask(-(__v8df)(__m512d)(A), \
                                            (__v8df)(__m512d)(B), \
                                            -(__v8df)(__m512d)(C), \
                                            (__mmask8)-1, (int)(R)))


/* -(A*B) - C, zero-masked. */
#define _mm512_maskz_fnmsub_round_pd(U, A, B, C, R) \
  ((__m512d)__builtin_ia32_vfmaddpd512_maskz(-(__v8df)(__m512d)(A), \
                                             (__v8df)(__m512d)(B), \
                                             -(__v8df)(__m512d)(C), \
                                             (__mmask8)(U), (int)(R)))
2532
2533
/* FMA family for 512-bit double vectors using the current rounding mode.
 * Same sign-flip and masking conventions as the _round_pd macros above:
 * _mask merges from __A, _mask3 from __C, _maskz zeroes. */
/* __A*__B + __C, unmasked. */
static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_fmadd_pd(__m512d __A, __m512d __B, __m512d __C)
{
  return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
                                                    (__v8df) __B,
                                                    (__v8df) __C,
                                                    (__mmask8) -1,
                                                    _MM_FROUND_CUR_DIRECTION);
}

/* __A*__B + __C, merge into __A where __U is clear. */
static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_fmadd_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
{
  return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
                                                    (__v8df) __B,
                                                    (__v8df) __C,
                                                    (__mmask8) __U,
                                                    _MM_FROUND_CUR_DIRECTION);
}

/* __A*__B + __C, merge into __C where __U is clear. */
static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask3_fmadd_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
{
  return (__m512d) __builtin_ia32_vfmaddpd512_mask3 ((__v8df) __A,
                                                     (__v8df) __B,
                                                     (__v8df) __C,
                                                     (__mmask8) __U,
                                                     _MM_FROUND_CUR_DIRECTION);
}

/* __A*__B + __C, zero where __U is clear. */
static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_maskz_fmadd_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
{
  return (__m512d) __builtin_ia32_vfmaddpd512_maskz ((__v8df) __A,
                                                     (__v8df) __B,
                                                     (__v8df) __C,
                                                     (__mmask8) __U,
                                                     _MM_FROUND_CUR_DIRECTION);
}

/* __A*__B - __C, unmasked. */
static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_fmsub_pd(__m512d __A, __m512d __B, __m512d __C)
{
  return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
                                                    (__v8df) __B,
                                                    -(__v8df) __C,
                                                    (__mmask8) -1,
                                                    _MM_FROUND_CUR_DIRECTION);
}

/* __A*__B - __C, merge into __A. */
static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_fmsub_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
{
  return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
                                                    (__v8df) __B,
                                                    -(__v8df) __C,
                                                    (__mmask8) __U,
                                                    _MM_FROUND_CUR_DIRECTION);
}

/* __A*__B - __C, zero-masked. */
static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_maskz_fmsub_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
{
  return (__m512d) __builtin_ia32_vfmaddpd512_maskz ((__v8df) __A,
                                                     (__v8df) __B,
                                                     -(__v8df) __C,
                                                     (__mmask8) __U,
                                                     _MM_FROUND_CUR_DIRECTION);
}

/* -(__A*__B) + __C, unmasked (negating __B is equivalent to negating __A). */
static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_fnmadd_pd(__m512d __A, __m512d __B, __m512d __C)
{
  return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
                                                    -(__v8df) __B,
                                                    (__v8df) __C,
                                                    (__mmask8) -1,
                                                    _MM_FROUND_CUR_DIRECTION);
}

/* -(__A*__B) + __C, merge into __C. */
static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask3_fnmadd_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
{
  return (__m512d) __builtin_ia32_vfmaddpd512_mask3 (-(__v8df) __A,
                                                     (__v8df) __B,
                                                     (__v8df) __C,
                                                     (__mmask8) __U,
                                                     _MM_FROUND_CUR_DIRECTION);
}

/* -(__A*__B) + __C, zero-masked. */
static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_maskz_fnmadd_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
{
  return (__m512d) __builtin_ia32_vfmaddpd512_maskz (-(__v8df) __A,
                                                     (__v8df) __B,
                                                     (__v8df) __C,
                                                     (__mmask8) __U,
                                                     _MM_FROUND_CUR_DIRECTION);
}

/* -(__A*__B) - __C, unmasked. */
static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_fnmsub_pd(__m512d __A, __m512d __B, __m512d __C)
{
  return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
                                                    -(__v8df) __B,
                                                    -(__v8df) __C,
                                                    (__mmask8) -1,
                                                    _MM_FROUND_CUR_DIRECTION);
}

/* -(__A*__B) - __C, zero-masked. */
static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_maskz_fnmsub_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
{
  return (__m512d) __builtin_ia32_vfmaddpd512_maskz (-(__v8df) __A,
                                                     (__v8df) __B,
                                                     -(__v8df) __C,
                                                     (__mmask8) __U,
                                                     _MM_FROUND_CUR_DIRECTION);
}
2653
/* Single-precision counterparts of the _round_pd FMA macros above; same
 * sign-flip encoding and masking conventions (_mask merges from A,
 * _mask3 from C, _maskz zeroes). */
/* A*B + C, unmasked. */
#define _mm512_fmadd_round_ps(A, B, C, R) \
  ((__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \
                                           (__v16sf)(__m512)(B), \
                                           (__v16sf)(__m512)(C), \
                                           (__mmask16)-1, (int)(R)))


/* A*B + C, merge into A. */
#define _mm512_mask_fmadd_round_ps(A, U, B, C, R) \
  ((__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \
                                           (__v16sf)(__m512)(B), \
                                           (__v16sf)(__m512)(C), \
                                           (__mmask16)(U), (int)(R)))


/* A*B + C, merge into C. */
#define _mm512_mask3_fmadd_round_ps(A, B, C, U, R) \
  ((__m512)__builtin_ia32_vfmaddps512_mask3((__v16sf)(__m512)(A), \
                                            (__v16sf)(__m512)(B), \
                                            (__v16sf)(__m512)(C), \
                                            (__mmask16)(U), (int)(R)))


/* A*B + C, zero-masked. */
#define _mm512_maskz_fmadd_round_ps(U, A, B, C, R) \
  ((__m512)__builtin_ia32_vfmaddps512_maskz((__v16sf)(__m512)(A), \
                                            (__v16sf)(__m512)(B), \
                                            (__v16sf)(__m512)(C), \
                                            (__mmask16)(U), (int)(R)))


/* A*B - C, unmasked. */
#define _mm512_fmsub_round_ps(A, B, C, R) \
  ((__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \
                                           (__v16sf)(__m512)(B), \
                                           -(__v16sf)(__m512)(C), \
                                           (__mmask16)-1, (int)(R)))


/* A*B - C, merge into A. */
#define _mm512_mask_fmsub_round_ps(A, U, B, C, R) \
  ((__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \
                                           (__v16sf)(__m512)(B), \
                                           -(__v16sf)(__m512)(C), \
                                           (__mmask16)(U), (int)(R)))


/* A*B - C, zero-masked. */
#define _mm512_maskz_fmsub_round_ps(U, A, B, C, R) \
  ((__m512)__builtin_ia32_vfmaddps512_maskz((__v16sf)(__m512)(A), \
                                            (__v16sf)(__m512)(B), \
                                            -(__v16sf)(__m512)(C), \
                                            (__mmask16)(U), (int)(R)))


/* -(A*B) + C, unmasked (B negated here; equivalent to negating A). */
#define _mm512_fnmadd_round_ps(A, B, C, R) \
  ((__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \
                                           -(__v16sf)(__m512)(B), \
                                           (__v16sf)(__m512)(C), \
                                           (__mmask16)-1, (int)(R)))


/* -(A*B) + C, merge into C. */
#define _mm512_mask3_fnmadd_round_ps(A, B, C, U, R) \
  ((__m512)__builtin_ia32_vfmaddps512_mask3(-(__v16sf)(__m512)(A), \
                                            (__v16sf)(__m512)(B), \
                                            (__v16sf)(__m512)(C), \
                                            (__mmask16)(U), (int)(R)))


/* -(A*B) + C, zero-masked. */
#define _mm512_maskz_fnmadd_round_ps(U, A, B, C, R) \
  ((__m512)__builtin_ia32_vfmaddps512_maskz(-(__v16sf)(__m512)(A), \
                                            (__v16sf)(__m512)(B), \
                                            (__v16sf)(__m512)(C), \
                                            (__mmask16)(U), (int)(R)))


/* -(A*B) - C, unmasked. */
#define _mm512_fnmsub_round_ps(A, B, C, R) \
  ((__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \
                                           -(__v16sf)(__m512)(B), \
                                           -(__v16sf)(__m512)(C), \
                                           (__mmask16)-1, (int)(R)))


/* -(A*B) - C, zero-masked. */
#define _mm512_maskz_fnmsub_round_ps(U, A, B, C, R) \
  ((__m512)__builtin_ia32_vfmaddps512_maskz(-(__v16sf)(__m512)(A), \
                                            (__v16sf)(__m512)(B), \
                                            -(__v16sf)(__m512)(C), \
                                            (__mmask16)(U), (int)(R)))
2736
2737
/* FMA family for 512-bit float vectors using the current rounding mode;
 * same conventions as the double-precision functions above. */
/* __A*__B + __C, unmasked. */
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_fmadd_ps(__m512 __A, __m512 __B, __m512 __C)
{
  return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
                                                   (__v16sf) __B,
                                                   (__v16sf) __C,
                                                   (__mmask16) -1,
                                                   _MM_FROUND_CUR_DIRECTION);
}

/* __A*__B + __C, merge into __A where __U is clear. */
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_fmadd_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
{
  return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
                                                   (__v16sf) __B,
                                                   (__v16sf) __C,
                                                   (__mmask16) __U,
                                                   _MM_FROUND_CUR_DIRECTION);
}

/* __A*__B + __C, merge into __C where __U is clear. */
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask3_fmadd_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
{
  return (__m512) __builtin_ia32_vfmaddps512_mask3 ((__v16sf) __A,
                                                    (__v16sf) __B,
                                                    (__v16sf) __C,
                                                    (__mmask16) __U,
                                                    _MM_FROUND_CUR_DIRECTION);
}

/* __A*__B + __C, zero where __U is clear. */
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_fmadd_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
{
  return (__m512) __builtin_ia32_vfmaddps512_maskz ((__v16sf) __A,
                                                    (__v16sf) __B,
                                                    (__v16sf) __C,
                                                    (__mmask16) __U,
                                                    _MM_FROUND_CUR_DIRECTION);
}

/* __A*__B - __C, unmasked. */
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_fmsub_ps(__m512 __A, __m512 __B, __m512 __C)
{
  return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
                                                   (__v16sf) __B,
                                                   -(__v16sf) __C,
                                                   (__mmask16) -1,
                                                   _MM_FROUND_CUR_DIRECTION);
}

/* __A*__B - __C, merge into __A. */
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_fmsub_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
{
  return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
                                                   (__v16sf) __B,
                                                   -(__v16sf) __C,
                                                   (__mmask16) __U,
                                                   _MM_FROUND_CUR_DIRECTION);
}

/* __A*__B - __C, zero-masked. */
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_fmsub_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
{
  return (__m512) __builtin_ia32_vfmaddps512_maskz ((__v16sf) __A,
                                                    (__v16sf) __B,
                                                    -(__v16sf) __C,
                                                    (__mmask16) __U,
                                                    _MM_FROUND_CUR_DIRECTION);
}

/* -(__A*__B) + __C, unmasked. */
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_fnmadd_ps(__m512 __A, __m512 __B, __m512 __C)
{
  return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
                                                   -(__v16sf) __B,
                                                   (__v16sf) __C,
                                                   (__mmask16) -1,
                                                   _MM_FROUND_CUR_DIRECTION);
}

/* -(__A*__B) + __C, merge into __C. */
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask3_fnmadd_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
{
  return (__m512) __builtin_ia32_vfmaddps512_mask3 (-(__v16sf) __A,
                                                    (__v16sf) __B,
                                                    (__v16sf) __C,
                                                    (__mmask16) __U,
                                                    _MM_FROUND_CUR_DIRECTION);
}

/* -(__A*__B) + __C, zero-masked. */
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_fnmadd_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
{
  return (__m512) __builtin_ia32_vfmaddps512_maskz (-(__v16sf) __A,
                                                    (__v16sf) __B,
                                                    (__v16sf) __C,
                                                    (__mmask16) __U,
                                                    _MM_FROUND_CUR_DIRECTION);
}

/* -(__A*__B) - __C, unmasked. */
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_fnmsub_ps(__m512 __A, __m512 __B, __m512 __C)
{
  return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
                                                   -(__v16sf) __B,
                                                   -(__v16sf) __C,
                                                   (__mmask16) -1,
                                                   _MM_FROUND_CUR_DIRECTION);
}

/* -(__A*__B) - __C, zero-masked. */
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_fnmsub_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
{
  return (__m512) __builtin_ia32_vfmaddps512_maskz (-(__v16sf) __A,
                                                    (__v16sf) __B,
                                                    -(__v16sf) __C,
                                                    (__mmask16) __U,
                                                    _MM_FROUND_CUR_DIRECTION);
}
2857
/* FMADDSUB family (VFMADDSUBPD): computes A*B with C alternately subtracted
 * and added across even/odd elements.  fmsubadd negates C to obtain the
 * opposite alternation.  Masking conventions as elsewhere: _mask merges
 * from A, _mask3 from C, _maskz zeroes.  R is the _MM_FROUND_* immediate. */
#define _mm512_fmaddsub_round_pd(A, B, C, R) \
  ((__m512d)__builtin_ia32_vfmaddsubpd512_mask((__v8df)(__m512d)(A), \
                                               (__v8df)(__m512d)(B), \
                                               (__v8df)(__m512d)(C), \
                                               (__mmask8)-1, (int)(R)))


#define _mm512_mask_fmaddsub_round_pd(A, U, B, C, R) \
  ((__m512d)__builtin_ia32_vfmaddsubpd512_mask((__v8df)(__m512d)(A), \
                                               (__v8df)(__m512d)(B), \
                                               (__v8df)(__m512d)(C), \
                                               (__mmask8)(U), (int)(R)))


#define _mm512_mask3_fmaddsub_round_pd(A, B, C, U, R) \
  ((__m512d)__builtin_ia32_vfmaddsubpd512_mask3((__v8df)(__m512d)(A), \
                                                (__v8df)(__m512d)(B), \
                                                (__v8df)(__m512d)(C), \
                                                (__mmask8)(U), (int)(R)))


#define _mm512_maskz_fmaddsub_round_pd(U, A, B, C, R) \
  ((__m512d)__builtin_ia32_vfmaddsubpd512_maskz((__v8df)(__m512d)(A), \
                                                (__v8df)(__m512d)(B), \
                                                (__v8df)(__m512d)(C), \
                                                (__mmask8)(U), (int)(R)))


/* fmsubadd = fmaddsub with C negated (opposite add/sub alternation). */
#define _mm512_fmsubadd_round_pd(A, B, C, R) \
  ((__m512d)__builtin_ia32_vfmaddsubpd512_mask((__v8df)(__m512d)(A), \
                                               (__v8df)(__m512d)(B), \
                                               -(__v8df)(__m512d)(C), \
                                               (__mmask8)-1, (int)(R)))


#define _mm512_mask_fmsubadd_round_pd(A, U, B, C, R) \
  ((__m512d)__builtin_ia32_vfmaddsubpd512_mask((__v8df)(__m512d)(A), \
                                               (__v8df)(__m512d)(B), \
                                               -(__v8df)(__m512d)(C), \
                                               (__mmask8)(U), (int)(R)))


#define _mm512_maskz_fmsubadd_round_pd(U, A, B, C, R) \
  ((__m512d)__builtin_ia32_vfmaddsubpd512_maskz((__v8df)(__m512d)(A), \
                                                (__v8df)(__m512d)(B), \
                                                -(__v8df)(__m512d)(C), \
                                                (__mmask8)(U), (int)(R)))
2905
2906
/* Non-rounding (current-direction) forms of the fmaddsub/fmsubadd pd
 * intrinsics above: identical builtin calls with
 * _MM_FROUND_CUR_DIRECTION as the rounding argument. */

/* Unmasked fmaddsub: all-ones mask (__mmask8)-1. */
static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_fmaddsub_pd(__m512d __A, __m512d __B, __m512d __C)
{
  return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A,
                                                      (__v8df) __B,
                                                      (__v8df) __C,
                                                      (__mmask8) -1,
                                                      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_fmaddsub_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
{
  return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A,
                                                      (__v8df) __B,
                                                      (__v8df) __C,
                                                      (__mmask8) __U,
                                                      _MM_FROUND_CUR_DIRECTION);
}

/* mask3 form: note the mask comes last in the intrinsic signature. */
static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask3_fmaddsub_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
{
  return (__m512d) __builtin_ia32_vfmaddsubpd512_mask3 ((__v8df) __A,
                                                       (__v8df) __B,
                                                       (__v8df) __C,
                                                       (__mmask8) __U,
                                                       _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_maskz_fmaddsub_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
{
  return (__m512d) __builtin_ia32_vfmaddsubpd512_maskz ((__v8df) __A,
                                                       (__v8df) __B,
                                                       (__v8df) __C,
                                                       (__mmask8) __U,
                                                       _MM_FROUND_CUR_DIRECTION);
}

/* fmsubadd = fmaddsub with C negated (flips the per-lane add/sub parity). */
static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_fmsubadd_pd(__m512d __A, __m512d __B, __m512d __C)
{
  return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A,
                                                       (__v8df) __B,
                                                       -(__v8df) __C,
                                                       (__mmask8) -1,
                                                       _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_fmsubadd_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
{
  return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A,
                                                       (__v8df) __B,
                                                       -(__v8df) __C,
                                                       (__mmask8) __U,
                                                       _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_maskz_fmsubadd_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
{
  return (__m512d) __builtin_ia32_vfmaddsubpd512_maskz ((__v8df) __A,
                                                        (__v8df) __B,
                                                        -(__v8df) __C,
                                                        (__mmask8) __U,
                                                        _MM_FROUND_CUR_DIRECTION);
}
2976
/* Single-precision counterparts of the fmaddsub/fmsubadd round macros:
 * 16 float lanes, __mmask16 masks, explicit rounding mode R.
 * fmsubadd is again implemented as fmaddsub with C negated. */
#define _mm512_fmaddsub_round_ps(A, B, C, R) \
  ((__m512)__builtin_ia32_vfmaddsubps512_mask((__v16sf)(__m512)(A), \
                                              (__v16sf)(__m512)(B), \
                                              (__v16sf)(__m512)(C), \
                                              (__mmask16)-1, (int)(R)))


#define _mm512_mask_fmaddsub_round_ps(A, U, B, C, R) \
  ((__m512)__builtin_ia32_vfmaddsubps512_mask((__v16sf)(__m512)(A), \
                                              (__v16sf)(__m512)(B), \
                                              (__v16sf)(__m512)(C), \
                                              (__mmask16)(U), (int)(R)))


#define _mm512_mask3_fmaddsub_round_ps(A, B, C, U, R) \
  ((__m512)__builtin_ia32_vfmaddsubps512_mask3((__v16sf)(__m512)(A), \
                                               (__v16sf)(__m512)(B), \
                                               (__v16sf)(__m512)(C), \
                                               (__mmask16)(U), (int)(R)))


#define _mm512_maskz_fmaddsub_round_ps(U, A, B, C, R) \
  ((__m512)__builtin_ia32_vfmaddsubps512_maskz((__v16sf)(__m512)(A), \
                                               (__v16sf)(__m512)(B), \
                                               (__v16sf)(__m512)(C), \
                                               (__mmask16)(U), (int)(R)))


#define _mm512_fmsubadd_round_ps(A, B, C, R) \
  ((__m512)__builtin_ia32_vfmaddsubps512_mask((__v16sf)(__m512)(A), \
                                              (__v16sf)(__m512)(B), \
                                              -(__v16sf)(__m512)(C), \
                                              (__mmask16)-1, (int)(R)))


#define _mm512_mask_fmsubadd_round_ps(A, U, B, C, R) \
  ((__m512)__builtin_ia32_vfmaddsubps512_mask((__v16sf)(__m512)(A), \
                                              (__v16sf)(__m512)(B), \
                                              -(__v16sf)(__m512)(C), \
                                              (__mmask16)(U), (int)(R)))


#define _mm512_maskz_fmsubadd_round_ps(U, A, B, C, R) \
  ((__m512)__builtin_ia32_vfmaddsubps512_maskz((__v16sf)(__m512)(A), \
                                               (__v16sf)(__m512)(B), \
                                               -(__v16sf)(__m512)(C), \
                                               (__mmask16)(U), (int)(R)))
3024
3025
/* Current-rounding-direction forms of the single-precision fmaddsub /
 * fmsubadd intrinsics.  Same structure as the pd functions above. */

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_fmaddsub_ps(__m512 __A, __m512 __B, __m512 __C)
{
  return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A,
                                                      (__v16sf) __B,
                                                      (__v16sf) __C,
                                                      (__mmask16) -1,
                                                      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_fmaddsub_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
{
  return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A,
                                                      (__v16sf) __B,
                                                      (__v16sf) __C,
                                                      (__mmask16) __U,
                                                      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask3_fmaddsub_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
{
  return (__m512) __builtin_ia32_vfmaddsubps512_mask3 ((__v16sf) __A,
                                                       (__v16sf) __B,
                                                       (__v16sf) __C,
                                                       (__mmask16) __U,
                                                       _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_fmaddsub_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
{
  return (__m512) __builtin_ia32_vfmaddsubps512_maskz ((__v16sf) __A,
                                                       (__v16sf) __B,
                                                       (__v16sf) __C,
                                                       (__mmask16) __U,
                                                       _MM_FROUND_CUR_DIRECTION);
}

/* fmsubadd = fmaddsub with C negated. */
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_fmsubadd_ps(__m512 __A, __m512 __B, __m512 __C)
{
  return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A,
                                                      (__v16sf) __B,
                                                      -(__v16sf) __C,
                                                      (__mmask16) -1,
                                                      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_fmsubadd_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
{
  return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A,
                                                      (__v16sf) __B,
                                                      -(__v16sf) __C,
                                                      (__mmask16) __U,
                                                      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_fmsubadd_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
{
  return (__m512) __builtin_ia32_vfmaddsubps512_maskz ((__v16sf) __A,
                                                       (__v16sf) __B,
                                                       -(__v16sf) __C,
                                                       (__mmask16) __U,
                                                       _MM_FROUND_CUR_DIRECTION);
}
3095
/* mask3 fmsub / fmsubadd: these use dedicated vfmsub*/vfmsubadd* builtins
 * rather than a negated-C vfmadd form, because the mask3 merge source is C
 * itself — negating C would corrupt the merged (masked-off) lanes. */
#define _mm512_mask3_fmsub_round_pd(A, B, C, U, R) \
  ((__m512d)__builtin_ia32_vfmsubpd512_mask3((__v8df)(__m512d)(A), \
                                             (__v8df)(__m512d)(B), \
                                             (__v8df)(__m512d)(C), \
                                             (__mmask8)(U), (int)(R)))


static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask3_fmsub_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
{
  return (__m512d)__builtin_ia32_vfmsubpd512_mask3 ((__v8df) __A,
                                                    (__v8df) __B,
                                                    (__v8df) __C,
                                                    (__mmask8) __U,
                                                    _MM_FROUND_CUR_DIRECTION);
}

#define _mm512_mask3_fmsub_round_ps(A, B, C, U, R) \
  ((__m512)__builtin_ia32_vfmsubps512_mask3((__v16sf)(__m512)(A), \
                                            (__v16sf)(__m512)(B), \
                                            (__v16sf)(__m512)(C), \
                                            (__mmask16)(U), (int)(R)))

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask3_fmsub_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
{
  return (__m512)__builtin_ia32_vfmsubps512_mask3 ((__v16sf) __A,
                                                   (__v16sf) __B,
                                                   (__v16sf) __C,
                                                   (__mmask16) __U,
                                                   _MM_FROUND_CUR_DIRECTION);
}

#define _mm512_mask3_fmsubadd_round_pd(A, B, C, U, R) \
  ((__m512d)__builtin_ia32_vfmsubaddpd512_mask3((__v8df)(__m512d)(A), \
                                                (__v8df)(__m512d)(B), \
                                                (__v8df)(__m512d)(C), \
                                                (__mmask8)(U), (int)(R)))


static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask3_fmsubadd_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
{
  return (__m512d)__builtin_ia32_vfmsubaddpd512_mask3 ((__v8df) __A,
                                                       (__v8df) __B,
                                                       (__v8df) __C,
                                                       (__mmask8) __U,
                                                       _MM_FROUND_CUR_DIRECTION);
}

#define _mm512_mask3_fmsubadd_round_ps(A, B, C, U, R) \
  ((__m512)__builtin_ia32_vfmsubaddps512_mask3((__v16sf)(__m512)(A), \
                                               (__v16sf)(__m512)(B), \
                                               (__v16sf)(__m512)(C), \
                                               (__mmask16)(U), (int)(R)))


static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask3_fmsubadd_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
{
  return (__m512)__builtin_ia32_vfmsubaddps512_mask3 ((__v16sf) __A,
                                                      (__v16sf) __B,
                                                      (__v16sf) __C,
                                                      (__mmask16) __U,
                                                      _MM_FROUND_CUR_DIRECTION);
}
3162
/* fnmadd / fnmsub family, built from vfmadd/vfmsub builtins by negating
 * operands:
 *   fnmadd(A,B,C) = (-A)*B + C   -> vfmadd with B negated (equivalent,
 *                                   and keeps A intact as the _mask merge
 *                                   source)
 *   fnmsub(A,B,C) = (-A)*B - C   -> _mask form negates B and C of vfmadd;
 *                                   _mask3 form negates A of vfmsub so C
 *                                   stays intact as the merge source. */
#define _mm512_mask_fnmadd_round_pd(A, U, B, C, R) \
  ((__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \
                                            -(__v8df)(__m512d)(B), \
                                            (__v8df)(__m512d)(C), \
                                            (__mmask8)(U), (int)(R)))


static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_fnmadd_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
{
  return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
                                                    -(__v8df) __B,
                                                    (__v8df) __C,
                                                    (__mmask8) __U,
                                                    _MM_FROUND_CUR_DIRECTION);
}

#define _mm512_mask_fnmadd_round_ps(A, U, B, C, R) \
  ((__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \
                                           -(__v16sf)(__m512)(B), \
                                           (__v16sf)(__m512)(C), \
                                           (__mmask16)(U), (int)(R)))


static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_fnmadd_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
{
  return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
                                                   -(__v16sf) __B,
                                                   (__v16sf) __C,
                                                   (__mmask16) __U,
                                                   _MM_FROUND_CUR_DIRECTION);
}

#define _mm512_mask_fnmsub_round_pd(A, U, B, C, R) \
  ((__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \
                                            -(__v8df)(__m512d)(B), \
                                            -(__v8df)(__m512d)(C), \
                                            (__mmask8)(U), (int)(R)))


#define _mm512_mask3_fnmsub_round_pd(A, B, C, U, R) \
  ((__m512d)__builtin_ia32_vfmsubpd512_mask3(-(__v8df)(__m512d)(A), \
                                             (__v8df)(__m512d)(B), \
                                             (__v8df)(__m512d)(C), \
                                             (__mmask8)(U), (int)(R)))


static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_fnmsub_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
{
  return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
                                                    -(__v8df) __B,
                                                    -(__v8df) __C,
                                                    (__mmask8) __U,
                                                    _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask3_fnmsub_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
{
  return (__m512d) __builtin_ia32_vfmsubpd512_mask3 (-(__v8df) __A,
                                                     (__v8df) __B,
                                                     (__v8df) __C,
                                                     (__mmask8) __U,
                                                     _MM_FROUND_CUR_DIRECTION);
}

#define _mm512_mask_fnmsub_round_ps(A, U, B, C, R) \
  ((__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \
                                           -(__v16sf)(__m512)(B), \
                                           -(__v16sf)(__m512)(C), \
                                           (__mmask16)(U), (int)(R)))


#define _mm512_mask3_fnmsub_round_ps(A, B, C, U, R) \
  ((__m512)__builtin_ia32_vfmsubps512_mask3(-(__v16sf)(__m512)(A), \
                                            (__v16sf)(__m512)(B), \
                                            (__v16sf)(__m512)(C), \
                                            (__mmask16)(U), (int)(R)))


static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_fnmsub_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
{
  return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
                                                   -(__v16sf) __B,
                                                   -(__v16sf) __C,
                                                   (__mmask16) __U,
                                                   _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask3_fnmsub_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
{
  return (__m512) __builtin_ia32_vfmsubps512_mask3 (-(__v16sf) __A,
                                                    (__v16sf) __B,
                                                    (__v16sf) __C,
                                                    (__mmask16) __U,
                                                    _MM_FROUND_CUR_DIRECTION);
}
3264
3265
3266
3267/* Vector permutations */
3268
/* Two-source lane permute (VPERMI2D/VPERMI2Q): each result element is
 * selected from the concatenation of __A and __B by the corresponding
 * index in __I.  The masked variants compute the full permute and then use
 * the select builtin to choose, per mask bit, between the permuted result
 * and a merge source: __A (_mask), __I (_mask2), or zero (_maskz) — the
 * merge source is visible as the third select operand below. */
static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_permutex2var_epi32(__m512i __A, __m512i __I, __m512i __B)
{
  return (__m512i)__builtin_ia32_vpermi2vard512((__v16si)__A, (__v16si) __I,
                                                (__v16si) __B);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_permutex2var_epi32(__m512i __A, __mmask16 __U, __m512i __I,
                               __m512i __B)
{
  return (__m512i)__builtin_ia32_selectd_512(__U,
                              (__v16si)_mm512_permutex2var_epi32(__A, __I, __B),
                              (__v16si)__A);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask2_permutex2var_epi32(__m512i __A, __m512i __I, __mmask16 __U,
                                __m512i __B)
{
  return (__m512i)__builtin_ia32_selectd_512(__U,
                              (__v16si)_mm512_permutex2var_epi32(__A, __I, __B),
                              (__v16si)__I);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_permutex2var_epi32(__mmask16 __U, __m512i __A, __m512i __I,
                                __m512i __B)
{
  return (__m512i)__builtin_ia32_selectd_512(__U,
                              (__v16si)_mm512_permutex2var_epi32(__A, __I, __B),
                              (__v16si)_mm512_setzero_si512());
}

/* 64-bit element versions: 8 lanes, __mmask8. */
static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_permutex2var_epi64(__m512i __A, __m512i __I, __m512i __B)
{
  return (__m512i)__builtin_ia32_vpermi2varq512((__v8di)__A, (__v8di) __I,
                                                (__v8di) __B);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_permutex2var_epi64(__m512i __A, __mmask8 __U, __m512i __I,
                               __m512i __B)
{
  return (__m512i)__builtin_ia32_selectq_512(__U,
                               (__v8di)_mm512_permutex2var_epi64(__A, __I, __B),
                               (__v8di)__A);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask2_permutex2var_epi64(__m512i __A, __m512i __I, __mmask8 __U,
                                __m512i __B)
{
  return (__m512i)__builtin_ia32_selectq_512(__U,
                               (__v8di)_mm512_permutex2var_epi64(__A, __I, __B),
                               (__v8di)__I);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_permutex2var_epi64(__mmask8 __U, __m512i __A, __m512i __I,
                                __m512i __B)
{
  return (__m512i)__builtin_ia32_selectq_512(__U,
                               (__v8di)_mm512_permutex2var_epi64(__A, __I, __B),
                               (__v8di)_mm512_setzero_si512());
}
3336
/* Concatenate-and-shift (VALIGNQ/VALIGND): conceptually concatenates A:B
 * and extracts a 512-bit window shifted right by I elements.  I must be a
 * compile-time constant (these are macros so it reaches the builtin as an
 * immediate).  Masked forms merge with W; maskz forms zero masked lanes. */
#define _mm512_alignr_epi64(A, B, I) \
  ((__m512i)__builtin_ia32_alignq512((__v8di)(__m512i)(A), \
                                     (__v8di)(__m512i)(B), (int)(I)))

#define _mm512_mask_alignr_epi64(W, U, A, B, imm) \
  ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
                                  (__v8di)_mm512_alignr_epi64((A), (B), (imm)), \
                                  (__v8di)(__m512i)(W)))

#define _mm512_maskz_alignr_epi64(U, A, B, imm) \
  ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
                                  (__v8di)_mm512_alignr_epi64((A), (B), (imm)), \
                                  (__v8di)_mm512_setzero_si512()))

#define _mm512_alignr_epi32(A, B, I) \
  ((__m512i)__builtin_ia32_alignd512((__v16si)(__m512i)(A), \
                                     (__v16si)(__m512i)(B), (int)(I)))

#define _mm512_mask_alignr_epi32(W, U, A, B, imm) \
  ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
                                 (__v16si)_mm512_alignr_epi32((A), (B), (imm)), \
                                 (__v16si)(__m512i)(W)))

#define _mm512_maskz_alignr_epi32(U, A, B, imm) \
  ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
                                 (__v16si)_mm512_alignr_epi32((A), (B), (imm)), \
                                 (__v16si)_mm512_setzero_si512()))
/* Vector Extract */

/* Extract a 256-bit (4 x double) or 128-bit (4 x float) sub-vector selected
 * by the immediate index I/imm.  The unmasked forms pass an undefined
 * passthrough vector and an all-ones mask; masked forms merge with W,
 * maskz forms pass a zero vector. */
#define _mm512_extractf64x4_pd(A, I) \
  ((__m256d)__builtin_ia32_extractf64x4_mask((__v8df)(__m512d)(A), (int)(I), \
                                             (__v4df)_mm256_undefined_pd(), \
                                             (__mmask8)-1))

#define _mm512_mask_extractf64x4_pd(W, U, A, imm) \
  ((__m256d)__builtin_ia32_extractf64x4_mask((__v8df)(__m512d)(A), (int)(imm), \
                                             (__v4df)(__m256d)(W), \
                                             (__mmask8)(U)))

#define _mm512_maskz_extractf64x4_pd(U, A, imm) \
  ((__m256d)__builtin_ia32_extractf64x4_mask((__v8df)(__m512d)(A), (int)(imm), \
                                             (__v4df)_mm256_setzero_pd(), \
                                             (__mmask8)(U)))

#define _mm512_extractf32x4_ps(A, I) \
  ((__m128)__builtin_ia32_extractf32x4_mask((__v16sf)(__m512)(A), (int)(I), \
                                            (__v4sf)_mm_undefined_ps(), \
                                            (__mmask8)-1))

#define _mm512_mask_extractf32x4_ps(W, U, A, imm) \
  ((__m128)__builtin_ia32_extractf32x4_mask((__v16sf)(__m512)(A), (int)(imm), \
                                            (__v4sf)(__m128)(W), \
                                            (__mmask8)(U)))

#define _mm512_maskz_extractf32x4_ps(U, A, imm) \
  ((__m128)__builtin_ia32_extractf32x4_mask((__v16sf)(__m512)(A), (int)(imm), \
                                            (__v4sf)_mm_setzero_ps(), \
                                            (__mmask8)(U)))
3395
/* Vector Blend */

/* Per-lane select between two vectors: the select builtin takes (mask,
 * true-source, false-source), so lanes with their mask bit set come from
 * __W and the rest from __A.  Note the argument swap relative to the
 * intrinsic signature (__W is passed first to the builtin). */
static __inline __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_blend_pd(__mmask8 __U, __m512d __A, __m512d __W)
{
  return (__m512d) __builtin_ia32_selectpd_512 ((__mmask8) __U,
                 (__v8df) __W,
                 (__v8df) __A);
}

static __inline __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_blend_ps(__mmask16 __U, __m512 __A, __m512 __W)
{
  return (__m512) __builtin_ia32_selectps_512 ((__mmask16) __U,
                (__v16sf) __W,
                (__v16sf) __A);
}

static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_blend_epi64(__mmask8 __U, __m512i __A, __m512i __W)
{
  return (__m512i) __builtin_ia32_selectq_512 ((__mmask8) __U,
                (__v8di) __W,
                (__v8di) __A);
}

static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_blend_epi32(__mmask16 __U, __m512i __A, __m512i __W)
{
  return (__m512i) __builtin_ia32_selectd_512 ((__mmask16) __U,
                (__v16si) __W,
                (__v16si) __A);
}
3429
/* Compare */

/* Packed float comparison producing a 16-bit mask (one bit per lane).
 * P is a _CMP_* predicate immediate; R is a rounding/SAE control.  The
 * masked form ANDs the result with U (lanes cleared in U yield 0). */
#define _mm512_cmp_round_ps_mask(A, B, P, R) \
  ((__mmask16)__builtin_ia32_cmpps512_mask((__v16sf)(__m512)(A), \
                                           (__v16sf)(__m512)(B), (int)(P), \
                                           (__mmask16)-1, (int)(R)))

#define _mm512_mask_cmp_round_ps_mask(U, A, B, P, R) \
  ((__mmask16)__builtin_ia32_cmpps512_mask((__v16sf)(__m512)(A), \
                                           (__v16sf)(__m512)(B), (int)(P), \
                                           (__mmask16)(U), (int)(R)))

#define _mm512_cmp_ps_mask(A, B, P) \
  _mm512_cmp_round_ps_mask((A), (B), (P), _MM_FROUND_CUR_DIRECTION)
#define _mm512_mask_cmp_ps_mask(U, A, B, P) \
  _mm512_mask_cmp_round_ps_mask((U), (A), (B), (P), _MM_FROUND_CUR_DIRECTION)

/* Named-predicate conveniences.  Predicate suffixes: O=ordered,
 * U=unordered, S=signaling, Q=quiet (NaN behavior per the _CMP_* table). */
#define _mm512_cmpeq_ps_mask(A, B) \
    _mm512_cmp_ps_mask((A), (B), _CMP_EQ_OQ)
#define _mm512_mask_cmpeq_ps_mask(k, A, B) \
    _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_EQ_OQ)

#define _mm512_cmplt_ps_mask(A, B) \
    _mm512_cmp_ps_mask((A), (B), _CMP_LT_OS)
#define _mm512_mask_cmplt_ps_mask(k, A, B) \
    _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_LT_OS)

#define _mm512_cmple_ps_mask(A, B) \
    _mm512_cmp_ps_mask((A), (B), _CMP_LE_OS)
#define _mm512_mask_cmple_ps_mask(k, A, B) \
    _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_LE_OS)

#define _mm512_cmpunord_ps_mask(A, B) \
    _mm512_cmp_ps_mask((A), (B), _CMP_UNORD_Q)
#define _mm512_mask_cmpunord_ps_mask(k, A, B) \
    _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_UNORD_Q)

#define _mm512_cmpneq_ps_mask(A, B) \
    _mm512_cmp_ps_mask((A), (B), _CMP_NEQ_UQ)
#define _mm512_mask_cmpneq_ps_mask(k, A, B) \
    _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_NEQ_UQ)

#define _mm512_cmpnlt_ps_mask(A, B) \
    _mm512_cmp_ps_mask((A), (B), _CMP_NLT_US)
#define _mm512_mask_cmpnlt_ps_mask(k, A, B) \
    _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_NLT_US)

#define _mm512_cmpnle_ps_mask(A, B) \
    _mm512_cmp_ps_mask((A), (B), _CMP_NLE_US)
#define _mm512_mask_cmpnle_ps_mask(k, A, B) \
    _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_NLE_US)

#define _mm512_cmpord_ps_mask(A, B) \
    _mm512_cmp_ps_mask((A), (B), _CMP_ORD_Q)
#define _mm512_mask_cmpord_ps_mask(k, A, B) \
    _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_ORD_Q)
3486
/* Double-precision counterparts of the comparison macros above:
 * 8 lanes, __mmask8 result, same _CMP_* predicate scheme. */
#define _mm512_cmp_round_pd_mask(A, B, P, R) \
  ((__mmask8)__builtin_ia32_cmppd512_mask((__v8df)(__m512d)(A), \
                                          (__v8df)(__m512d)(B), (int)(P), \
                                          (__mmask8)-1, (int)(R)))

#define _mm512_mask_cmp_round_pd_mask(U, A, B, P, R) \
  ((__mmask8)__builtin_ia32_cmppd512_mask((__v8df)(__m512d)(A), \
                                          (__v8df)(__m512d)(B), (int)(P), \
                                          (__mmask8)(U), (int)(R)))

#define _mm512_cmp_pd_mask(A, B, P) \
  _mm512_cmp_round_pd_mask((A), (B), (P), _MM_FROUND_CUR_DIRECTION)
#define _mm512_mask_cmp_pd_mask(U, A, B, P) \
  _mm512_mask_cmp_round_pd_mask((U), (A), (B), (P), _MM_FROUND_CUR_DIRECTION)

#define _mm512_cmpeq_pd_mask(A, B) \
    _mm512_cmp_pd_mask((A), (B), _CMP_EQ_OQ)
#define _mm512_mask_cmpeq_pd_mask(k, A, B) \
    _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_EQ_OQ)

#define _mm512_cmplt_pd_mask(A, B) \
    _mm512_cmp_pd_mask((A), (B), _CMP_LT_OS)
#define _mm512_mask_cmplt_pd_mask(k, A, B) \
    _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_LT_OS)

#define _mm512_cmple_pd_mask(A, B) \
    _mm512_cmp_pd_mask((A), (B), _CMP_LE_OS)
#define _mm512_mask_cmple_pd_mask(k, A, B) \
    _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_LE_OS)

#define _mm512_cmpunord_pd_mask(A, B) \
    _mm512_cmp_pd_mask((A), (B), _CMP_UNORD_Q)
#define _mm512_mask_cmpunord_pd_mask(k, A, B) \
    _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_UNORD_Q)

#define _mm512_cmpneq_pd_mask(A, B) \
    _mm512_cmp_pd_mask((A), (B), _CMP_NEQ_UQ)
#define _mm512_mask_cmpneq_pd_mask(k, A, B) \
    _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_NEQ_UQ)

#define _mm512_cmpnlt_pd_mask(A, B) \
    _mm512_cmp_pd_mask((A), (B), _CMP_NLT_US)
#define _mm512_mask_cmpnlt_pd_mask(k, A, B) \
    _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_NLT_US)

#define _mm512_cmpnle_pd_mask(A, B) \
    _mm512_cmp_pd_mask((A), (B), _CMP_NLE_US)
#define _mm512_mask_cmpnle_pd_mask(k, A, B) \
    _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_NLE_US)

#define _mm512_cmpord_pd_mask(A, B) \
    _mm512_cmp_pd_mask((A), (B), _CMP_ORD_Q)
#define _mm512_mask_cmpord_pd_mask(k, A, B) \
    _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_ORD_Q)
3541
/* Conversion */

/* Truncating (round-toward-zero) conversion of 16 packed floats to
 * unsigned 32-bit integers.  The _round macros take an explicit SAE/round
 * control R; the functions below use _MM_FROUND_CUR_DIRECTION.
 * Passthrough vectors: undefined for the unmasked macro, W for _mask,
 * zero for _maskz. */
#define _mm512_cvtt_roundps_epu32(A, R) \
  ((__m512i)__builtin_ia32_cvttps2udq512_mask((__v16sf)(__m512)(A), \
                                              (__v16si)_mm512_undefined_epi32(), \
                                              (__mmask16)-1, (int)(R)))

#define _mm512_mask_cvtt_roundps_epu32(W, U, A, R) \
  ((__m512i)__builtin_ia32_cvttps2udq512_mask((__v16sf)(__m512)(A), \
                                              (__v16si)(__m512i)(W), \
                                              (__mmask16)(U), (int)(R)))

#define _mm512_maskz_cvtt_roundps_epu32(U, A, R) \
  ((__m512i)__builtin_ia32_cvttps2udq512_mask((__v16sf)(__m512)(A), \
                                              (__v16si)_mm512_setzero_si512(), \
                                              (__mmask16)(U), (int)(R)))


/* Note: unlike the macro form, the unmasked function passes a zero
 * passthrough (irrelevant under the all-ones mask). */
static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_cvttps_epu32(__m512 __A)
{
  return (__m512i) __builtin_ia32_cvttps2udq512_mask ((__v16sf) __A,
                  (__v16si)
                  _mm512_setzero_si512 (),
                  (__mmask16) -1,
                  _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvttps_epu32 (__m512i __W, __mmask16 __U, __m512 __A)
{
  return (__m512i) __builtin_ia32_cvttps2udq512_mask ((__v16sf) __A,
                   (__v16si) __W,
                   (__mmask16) __U,
                   _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvttps_epu32 (__mmask16 __U, __m512 __A)
{
  return (__m512i) __builtin_ia32_cvttps2udq512_mask ((__v16sf) __A,
                   (__v16si) _mm512_setzero_si512 (),
                   (__mmask16) __U,
                   _MM_FROUND_CUR_DIRECTION);
}
3587
/* Signed 32-bit integer -> float conversion with explicit rounding mode R. */
#define _mm512_cvt_roundepi32_ps(A, R) \
  ((__m512)__builtin_ia32_cvtdq2ps512_mask((__v16si)(__m512i)(A), \
                                           (__v16sf)_mm512_setzero_ps(), \
                                           (__mmask16)-1, (int)(R)))

#define _mm512_mask_cvt_roundepi32_ps(W, U, A, R) \
  ((__m512)__builtin_ia32_cvtdq2ps512_mask((__v16si)(__m512i)(A), \
                                           (__v16sf)(__m512)(W), \
                                           (__mmask16)(U), (int)(R)))

#define _mm512_maskz_cvt_roundepi32_ps(U, A, R) \
  ((__m512)__builtin_ia32_cvtdq2ps512_mask((__v16si)(__m512i)(A), \
                                           (__v16sf)_mm512_setzero_ps(), \
                                           (__mmask16)(U), (int)(R)))

/* Unsigned 32-bit integer -> float conversion with explicit rounding mode. */
#define _mm512_cvt_roundepu32_ps(A, R) \
  ((__m512)__builtin_ia32_cvtudq2ps512_mask((__v16si)(__m512i)(A), \
                                            (__v16sf)_mm512_setzero_ps(), \
                                            (__mmask16)-1, (int)(R)))

#define _mm512_mask_cvt_roundepu32_ps(W, U, A, R) \
  ((__m512)__builtin_ia32_cvtudq2ps512_mask((__v16si)(__m512i)(A), \
                                            (__v16sf)(__m512)(W), \
                                            (__mmask16)(U), (int)(R)))

#define _mm512_maskz_cvt_roundepu32_ps(U, A, R) \
  ((__m512)__builtin_ia32_cvtudq2ps512_mask((__v16si)(__m512i)(A), \
                                            (__v16sf)_mm512_setzero_ps(), \
                                            (__mmask16)(U), (int)(R)))
3617
3618static __inline__ __m512 __DEFAULT_FN_ATTRS512
3619_mm512_cvtepu32_ps (__m512i __A)
3620{
3621  return (__m512)__builtin_convertvector((__v16su)__A, __v16sf);
3622}
3623
3624static __inline__ __m512 __DEFAULT_FN_ATTRS512
3625_mm512_mask_cvtepu32_ps (__m512 __W, __mmask16 __U, __m512i __A)
3626{
3627  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
3628                                             (__v16sf)_mm512_cvtepu32_ps(__A),
3629                                             (__v16sf)__W);
3630}
3631
3632static __inline__ __m512 __DEFAULT_FN_ATTRS512
3633_mm512_maskz_cvtepu32_ps (__mmask16 __U, __m512i __A)
3634{
3635  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
3636                                             (__v16sf)_mm512_cvtepu32_ps(__A),
3637                                             (__v16sf)_mm512_setzero_ps());
3638}
3639
3640static __inline __m512d __DEFAULT_FN_ATTRS512
3641_mm512_cvtepi32_pd(__m256i __A)
3642{
3643  return (__m512d)__builtin_convertvector((__v8si)__A, __v8df);
3644}
3645
3646static __inline__ __m512d __DEFAULT_FN_ATTRS512
3647_mm512_mask_cvtepi32_pd (__m512d __W, __mmask8 __U, __m256i __A)
3648{
3649  return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
3650                                              (__v8df)_mm512_cvtepi32_pd(__A),
3651                                              (__v8df)__W);
3652}
3653
3654static __inline__ __m512d __DEFAULT_FN_ATTRS512
3655_mm512_maskz_cvtepi32_pd (__mmask8 __U, __m256i __A)
3656{
3657  return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
3658                                              (__v8df)_mm512_cvtepi32_pd(__A),
3659                                              (__v8df)_mm512_setzero_pd());
3660}
3661
/* Convert the low 8 signed 32-bit elements (low 256 bits) of __A to
 * 8 doubles. */
static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_cvtepi32lo_pd(__m512i __A)
{
  return (__m512d) _mm512_cvtepi32_pd(_mm512_castsi512_si256(__A));
}

/* Merge-masked variant of the above: lanes with a clear bit in __U come
 * from __W. */
static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepi32lo_pd(__m512d __W, __mmask8 __U,__m512i __A)
{
  return (__m512d) _mm512_mask_cvtepi32_pd(__W, __U, _mm512_castsi512_si256(__A));
}
3673
3674static __inline__ __m512 __DEFAULT_FN_ATTRS512
3675_mm512_cvtepi32_ps (__m512i __A)
3676{
3677  return (__m512)__builtin_convertvector((__v16si)__A, __v16sf);
3678}
3679
3680static __inline__ __m512 __DEFAULT_FN_ATTRS512
3681_mm512_mask_cvtepi32_ps (__m512 __W, __mmask16 __U, __m512i __A)
3682{
3683  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
3684                                             (__v16sf)_mm512_cvtepi32_ps(__A),
3685                                             (__v16sf)__W);
3686}
3687
3688static __inline__ __m512 __DEFAULT_FN_ATTRS512
3689_mm512_maskz_cvtepi32_ps (__mmask16 __U, __m512i __A)
3690{
3691  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
3692                                             (__v16sf)_mm512_cvtepi32_ps(__A),
3693                                             (__v16sf)_mm512_setzero_ps());
3694}
3695
3696static __inline __m512d __DEFAULT_FN_ATTRS512
3697_mm512_cvtepu32_pd(__m256i __A)
3698{
3699  return (__m512d)__builtin_convertvector((__v8su)__A, __v8df);
3700}
3701
3702static __inline__ __m512d __DEFAULT_FN_ATTRS512
3703_mm512_mask_cvtepu32_pd (__m512d __W, __mmask8 __U, __m256i __A)
3704{
3705  return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
3706                                              (__v8df)_mm512_cvtepu32_pd(__A),
3707                                              (__v8df)__W);
3708}
3709
3710static __inline__ __m512d __DEFAULT_FN_ATTRS512
3711_mm512_maskz_cvtepu32_pd (__mmask8 __U, __m256i __A)
3712{
3713  return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
3714                                              (__v8df)_mm512_cvtepu32_pd(__A),
3715                                              (__v8df)_mm512_setzero_pd());
3716}
3717
/* Convert the low 8 unsigned 32-bit elements (low 256 bits) of __A to
 * 8 doubles. */
static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_cvtepu32lo_pd(__m512i __A)
{
  return (__m512d) _mm512_cvtepu32_pd(_mm512_castsi512_si256(__A));
}

/* Merge-masked variant of the above: lanes with a clear bit in __U come
 * from __W. */
static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepu32lo_pd(__m512d __W, __mmask8 __U,__m512i __A)
{
  return (__m512d) _mm512_mask_cvtepu32_pd(__W, __U, _mm512_castsi512_si256(__A));
}
3729
/* Narrowing double -> float conversion (512-bit source, 256-bit result)
 * with explicit rounding mode R. */
#define _mm512_cvt_roundpd_ps(A, R) \
  ((__m256)__builtin_ia32_cvtpd2ps512_mask((__v8df)(__m512d)(A), \
                                           (__v8sf)_mm256_setzero_ps(), \
                                           (__mmask8)-1, (int)(R)))

#define _mm512_mask_cvt_roundpd_ps(W, U, A, R) \
  ((__m256)__builtin_ia32_cvtpd2ps512_mask((__v8df)(__m512d)(A), \
                                           (__v8sf)(__m256)(W), (__mmask8)(U), \
                                           (int)(R)))

#define _mm512_maskz_cvt_roundpd_ps(U, A, R) \
  ((__m256)__builtin_ia32_cvtpd2ps512_mask((__v8df)(__m512d)(A), \
                                           (__v8sf)_mm256_setzero_ps(), \
                                           (__mmask8)(U), (int)(R)))
3744
/* Convert 8 doubles to 8 floats (VCVTPD2PS), under the current rounding
 * direction.  Result is 256 bits wide. */
static __inline__ __m256 __DEFAULT_FN_ATTRS512
_mm512_cvtpd_ps (__m512d __A)
{
  return (__m256) __builtin_ia32_cvtpd2ps512_mask ((__v8df) __A,
                (__v8sf) _mm256_undefined_ps (),
                (__mmask8) -1,
                _MM_FROUND_CUR_DIRECTION);
}

/* Merge-masked variant: lanes with a clear bit in __U come from __W. */
static __inline__ __m256 __DEFAULT_FN_ATTRS512
_mm512_mask_cvtpd_ps (__m256 __W, __mmask8 __U, __m512d __A)
{
  return (__m256) __builtin_ia32_cvtpd2ps512_mask ((__v8df) __A,
                (__v8sf) __W,
                (__mmask8) __U,
                _MM_FROUND_CUR_DIRECTION);
}

/* Zero-masked variant: lanes with a clear bit in __U are zeroed. */
static __inline__ __m256 __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtpd_ps (__mmask8 __U, __m512d __A)
{
  return (__m256) __builtin_ia32_cvtpd2ps512_mask ((__v8df) __A,
                (__v8sf) _mm256_setzero_ps (),
                (__mmask8) __U,
                _MM_FROUND_CUR_DIRECTION);
}
3771
/* Convert 8 doubles to floats, placing them in the low 256 bits of a
 * 512-bit result; the upper 8 float lanes are zeroed (shuffle indices 8-15
 * select from the zero vector). */
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_cvtpd_pslo (__m512d __A)
{
  return (__m512) __builtin_shufflevector((__v8sf) _mm512_cvtpd_ps(__A),
                (__v8sf) _mm256_setzero_ps (),
                0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
}

/* Merge-masked variant: the masked conversion uses the low half of __W as
 * passthrough; the upper 8 lanes of the result are zeroed. */
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_cvtpd_pslo (__m512 __W, __mmask8 __U,__m512d __A)
{
  return (__m512) __builtin_shufflevector (
                (__v8sf) _mm512_mask_cvtpd_ps (_mm512_castps512_ps256(__W),
                                               __U, __A),
                (__v8sf) _mm256_setzero_ps (),
                0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
}
3789
/* Float -> half-precision conversion; I selects the rounding/imm8 control
 * for VCVTPS2PH. */
#define _mm512_cvt_roundps_ph(A, I) \
  ((__m256i)__builtin_ia32_vcvtps2ph512_mask((__v16sf)(__m512)(A), (int)(I), \
                                             (__v16hi)_mm256_undefined_si256(), \
                                             (__mmask16)-1))

/* NOTE(review): here U is the source/passthrough vector and W is the mask
 * -- the parameter names are swapped relative to the usual (W, U) order,
 * but the expansion is consistent with it. */
#define _mm512_mask_cvt_roundps_ph(U, W, A, I) \
  ((__m256i)__builtin_ia32_vcvtps2ph512_mask((__v16sf)(__m512)(A), (int)(I), \
                                             (__v16hi)(__m256i)(U), \
                                             (__mmask16)(W)))

/* Zero-masked: W is the mask; cleared lanes are zeroed. */
#define _mm512_maskz_cvt_roundps_ph(W, A, I) \
  ((__m256i)__builtin_ia32_vcvtps2ph512_mask((__v16sf)(__m512)(A), (int)(I), \
                                             (__v16hi)_mm256_setzero_si256(), \
                                             (__mmask16)(W)))

/* The non-round names are plain aliases; both take the imm8 control. */
#define _mm512_cvtps_ph       _mm512_cvt_roundps_ph
#define _mm512_mask_cvtps_ph  _mm512_mask_cvt_roundps_ph
#define _mm512_maskz_cvtps_ph _mm512_maskz_cvt_roundps_ph
3808
/* Half -> float widening conversion with explicit rounding/exception
 * control R. */
#define _mm512_cvt_roundph_ps(A, R) \
  ((__m512)__builtin_ia32_vcvtph2ps512_mask((__v16hi)(__m256i)(A), \
                                            (__v16sf)_mm512_undefined_ps(), \
                                            (__mmask16)-1, (int)(R)))

#define _mm512_mask_cvt_roundph_ps(W, U, A, R) \
  ((__m512)__builtin_ia32_vcvtph2ps512_mask((__v16hi)(__m256i)(A), \
                                            (__v16sf)(__m512)(W), \
                                            (__mmask16)(U), (int)(R)))

#define _mm512_maskz_cvt_roundph_ps(U, A, R) \
  ((__m512)__builtin_ia32_vcvtph2ps512_mask((__v16hi)(__m256i)(A), \
                                            (__v16sf)_mm512_setzero_ps(), \
                                            (__mmask16)(U), (int)(R)))
3823
3824
/* Convert 16 half-precision values to 16 floats (VCVTPH2PS), under the
 * current rounding direction. */
static  __inline __m512 __DEFAULT_FN_ATTRS512
_mm512_cvtph_ps(__m256i __A)
{
  return (__m512) __builtin_ia32_vcvtph2ps512_mask ((__v16hi) __A,
                (__v16sf)
                _mm512_setzero_ps (),
                (__mmask16) -1,
                _MM_FROUND_CUR_DIRECTION);
}

/* Merge-masked variant: lanes with a clear bit in __U come from __W. */
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_cvtph_ps (__m512 __W, __mmask16 __U, __m256i __A)
{
  return (__m512) __builtin_ia32_vcvtph2ps512_mask ((__v16hi) __A,
                 (__v16sf) __W,
                 (__mmask16) __U,
                 _MM_FROUND_CUR_DIRECTION);
}

/* Zero-masked variant: lanes with a clear bit in __U are zeroed. */
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtph_ps (__mmask16 __U, __m256i __A)
{
  return (__m512) __builtin_ia32_vcvtph2ps512_mask ((__v16hi) __A,
                 (__v16sf) _mm512_setzero_ps (),
                 (__mmask16) __U,
                 _MM_FROUND_CUR_DIRECTION);
}
3852
/* Truncating double -> signed 32-bit conversion (256-bit result) with
 * explicit rounding/exception control R. */
#define _mm512_cvtt_roundpd_epi32(A, R) \
  ((__m256i)__builtin_ia32_cvttpd2dq512_mask((__v8df)(__m512d)(A), \
                                             (__v8si)_mm256_setzero_si256(), \
                                             (__mmask8)-1, (int)(R)))

#define _mm512_mask_cvtt_roundpd_epi32(W, U, A, R) \
  ((__m256i)__builtin_ia32_cvttpd2dq512_mask((__v8df)(__m512d)(A), \
                                             (__v8si)(__m256i)(W), \
                                             (__mmask8)(U), (int)(R)))

#define _mm512_maskz_cvtt_roundpd_epi32(U, A, R) \
  ((__m256i)__builtin_ia32_cvttpd2dq512_mask((__v8df)(__m512d)(A), \
                                             (__v8si)_mm256_setzero_si256(), \
                                             (__mmask8)(U), (int)(R)))
3867
/* Convert 8 doubles to 8 signed 32-bit integers with truncation toward zero
 * (VCVTTPD2DQ).  Result is 256 bits wide. */
static __inline __m256i __DEFAULT_FN_ATTRS512
_mm512_cvttpd_epi32(__m512d __a)
{
  return (__m256i)__builtin_ia32_cvttpd2dq512_mask((__v8df) __a,
                                                   (__v8si)_mm256_setzero_si256(),
                                                   (__mmask8) -1,
                                                    _MM_FROUND_CUR_DIRECTION);
}

/* Merge-masked variant: lanes with a clear bit in __U come from __W. */
static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_mask_cvttpd_epi32 (__m256i __W, __mmask8 __U, __m512d __A)
{
  return (__m256i) __builtin_ia32_cvttpd2dq512_mask ((__v8df) __A,
                  (__v8si) __W,
                  (__mmask8) __U,
                  _MM_FROUND_CUR_DIRECTION);
}

/* Zero-masked variant: lanes with a clear bit in __U are zeroed. */
static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvttpd_epi32 (__mmask8 __U, __m512d __A)
{
  return (__m256i) __builtin_ia32_cvttpd2dq512_mask ((__v8df) __A,
                  (__v8si) _mm256_setzero_si256 (),
                  (__mmask8) __U,
                  _MM_FROUND_CUR_DIRECTION);
}
3894
/* Truncating float -> signed 32-bit conversion with explicit
 * rounding/exception control R. */
#define _mm512_cvtt_roundps_epi32(A, R) \
  ((__m512i)__builtin_ia32_cvttps2dq512_mask((__v16sf)(__m512)(A), \
                                             (__v16si)_mm512_setzero_si512(), \
                                             (__mmask16)-1, (int)(R)))

#define _mm512_mask_cvtt_roundps_epi32(W, U, A, R) \
  ((__m512i)__builtin_ia32_cvttps2dq512_mask((__v16sf)(__m512)(A), \
                                             (__v16si)(__m512i)(W), \
                                             (__mmask16)(U), (int)(R)))

#define _mm512_maskz_cvtt_roundps_epi32(U, A, R) \
  ((__m512i)__builtin_ia32_cvttps2dq512_mask((__v16sf)(__m512)(A), \
                                             (__v16si)_mm512_setzero_si512(), \
                                             (__mmask16)(U), (int)(R)))
3909
/* Convert 16 floats to 16 signed 32-bit integers with truncation toward
 * zero (VCVTTPS2DQ). */
static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_cvttps_epi32(__m512 __a)
{
  return (__m512i)
    __builtin_ia32_cvttps2dq512_mask((__v16sf) __a,
                                     (__v16si) _mm512_setzero_si512 (),
                                     (__mmask16) -1, _MM_FROUND_CUR_DIRECTION);
}

/* Merge-masked variant: lanes with a clear bit in __U come from __W. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvttps_epi32 (__m512i __W, __mmask16 __U, __m512 __A)
{
  return (__m512i) __builtin_ia32_cvttps2dq512_mask ((__v16sf) __A,
                  (__v16si) __W,
                  (__mmask16) __U,
                  _MM_FROUND_CUR_DIRECTION);
}

/* Zero-masked variant: lanes with a clear bit in __U are zeroed. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvttps_epi32 (__mmask16 __U, __m512 __A)
{
  return (__m512i) __builtin_ia32_cvttps2dq512_mask ((__v16sf) __A,
                  (__v16si) _mm512_setzero_si512 (),
                  (__mmask16) __U,
                  _MM_FROUND_CUR_DIRECTION);
}
3936
/* Float -> signed 32-bit conversion with explicit rounding mode R. */
#define _mm512_cvt_roundps_epi32(A, R) \
  ((__m512i)__builtin_ia32_cvtps2dq512_mask((__v16sf)(__m512)(A), \
                                            (__v16si)_mm512_setzero_si512(), \
                                            (__mmask16)-1, (int)(R)))

#define _mm512_mask_cvt_roundps_epi32(W, U, A, R) \
  ((__m512i)__builtin_ia32_cvtps2dq512_mask((__v16sf)(__m512)(A), \
                                            (__v16si)(__m512i)(W), \
                                            (__mmask16)(U), (int)(R)))

#define _mm512_maskz_cvt_roundps_epi32(U, A, R) \
  ((__m512i)__builtin_ia32_cvtps2dq512_mask((__v16sf)(__m512)(A), \
                                            (__v16si)_mm512_setzero_si512(), \
                                            (__mmask16)(U), (int)(R)))
3951
/* Convert 16 floats to 16 signed 32-bit integers (VCVTPS2DQ), rounding
 * according to the current direction. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvtps_epi32 (__m512 __A)
{
  return (__m512i) __builtin_ia32_cvtps2dq512_mask ((__v16sf) __A,
                 (__v16si) _mm512_undefined_epi32 (),
                 (__mmask16) -1,
                 _MM_FROUND_CUR_DIRECTION);
}

/* Merge-masked variant: lanes with a clear bit in __U come from __W. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtps_epi32 (__m512i __W, __mmask16 __U, __m512 __A)
{
  return (__m512i) __builtin_ia32_cvtps2dq512_mask ((__v16sf) __A,
                 (__v16si) __W,
                 (__mmask16) __U,
                 _MM_FROUND_CUR_DIRECTION);
}

/* Zero-masked variant: lanes with a clear bit in __U are zeroed. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtps_epi32 (__mmask16 __U, __m512 __A)
{
  return (__m512i) __builtin_ia32_cvtps2dq512_mask ((__v16sf) __A,
                 (__v16si)
                 _mm512_setzero_si512 (),
                 (__mmask16) __U,
                 _MM_FROUND_CUR_DIRECTION);
}
3979
/* Double -> signed 32-bit conversion (256-bit result) with explicit
 * rounding mode R. */
#define _mm512_cvt_roundpd_epi32(A, R) \
  ((__m256i)__builtin_ia32_cvtpd2dq512_mask((__v8df)(__m512d)(A), \
                                            (__v8si)_mm256_setzero_si256(), \
                                            (__mmask8)-1, (int)(R)))

#define _mm512_mask_cvt_roundpd_epi32(W, U, A, R) \
  ((__m256i)__builtin_ia32_cvtpd2dq512_mask((__v8df)(__m512d)(A), \
                                            (__v8si)(__m256i)(W), \
                                            (__mmask8)(U), (int)(R)))

#define _mm512_maskz_cvt_roundpd_epi32(U, A, R) \
  ((__m256i)__builtin_ia32_cvtpd2dq512_mask((__v8df)(__m512d)(A), \
                                            (__v8si)_mm256_setzero_si256(), \
                                            (__mmask8)(U), (int)(R)))
3994
/* Convert 8 doubles to 8 signed 32-bit integers (VCVTPD2DQ), rounding
 * according to the current direction.  Result is 256 bits wide. */
static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_cvtpd_epi32 (__m512d __A)
{
  return (__m256i) __builtin_ia32_cvtpd2dq512_mask ((__v8df) __A,
                 (__v8si)
                 _mm256_undefined_si256 (),
                 (__mmask8) -1,
                 _MM_FROUND_CUR_DIRECTION);
}

/* Merge-masked variant: lanes with a clear bit in __U come from __W. */
static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtpd_epi32 (__m256i __W, __mmask8 __U, __m512d __A)
{
  return (__m256i) __builtin_ia32_cvtpd2dq512_mask ((__v8df) __A,
                 (__v8si) __W,
                 (__mmask8) __U,
                 _MM_FROUND_CUR_DIRECTION);
}

/* Zero-masked variant: lanes with a clear bit in __U are zeroed. */
static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtpd_epi32 (__mmask8 __U, __m512d __A)
{
  return (__m256i) __builtin_ia32_cvtpd2dq512_mask ((__v8df) __A,
                 (__v8si)
                 _mm256_setzero_si256 (),
                 (__mmask8) __U,
                 _MM_FROUND_CUR_DIRECTION);
}
4023
/* Float -> unsigned 32-bit conversion with explicit rounding mode R. */
#define _mm512_cvt_roundps_epu32(A, R) \
  ((__m512i)__builtin_ia32_cvtps2udq512_mask((__v16sf)(__m512)(A), \
                                             (__v16si)_mm512_setzero_si512(), \
                                             (__mmask16)-1, (int)(R)))

#define _mm512_mask_cvt_roundps_epu32(W, U, A, R) \
  ((__m512i)__builtin_ia32_cvtps2udq512_mask((__v16sf)(__m512)(A), \
                                             (__v16si)(__m512i)(W), \
                                             (__mmask16)(U), (int)(R)))

#define _mm512_maskz_cvt_roundps_epu32(U, A, R) \
  ((__m512i)__builtin_ia32_cvtps2udq512_mask((__v16sf)(__m512)(A), \
                                             (__v16si)_mm512_setzero_si512(), \
                                             (__mmask16)(U), (int)(R)))
4038
4039static __inline__ __m512i __DEFAULT_FN_ATTRS512
4040_mm512_cvtps_epu32 ( __m512 __A)
4041{
4042  return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A,\
4043                  (__v16si)\
4044                  _mm512_undefined_epi32 (),
4045                  (__mmask16) -1,\
4046                  _MM_FROUND_CUR_DIRECTION);
4047}
4048
4049static __inline__ __m512i __DEFAULT_FN_ATTRS512
4050_mm512_mask_cvtps_epu32 (__m512i __W, __mmask16 __U, __m512 __A)
4051{
4052  return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A,
4053                  (__v16si) __W,
4054                  (__mmask16) __U,
4055                  _MM_FROUND_CUR_DIRECTION);
4056}
4057
4058static __inline__ __m512i __DEFAULT_FN_ATTRS512
4059_mm512_maskz_cvtps_epu32 ( __mmask16 __U, __m512 __A)
4060{
4061  return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A,
4062                  (__v16si)
4063                  _mm512_setzero_si512 (),
4064                  (__mmask16) __U ,
4065                  _MM_FROUND_CUR_DIRECTION);
4066}
4067
/* Double -> unsigned 32-bit conversion (256-bit result) with explicit
 * rounding mode R. */
#define _mm512_cvt_roundpd_epu32(A, R) \
  ((__m256i)__builtin_ia32_cvtpd2udq512_mask((__v8df)(__m512d)(A), \
                                             (__v8si)_mm256_setzero_si256(), \
                                             (__mmask8)-1, (int)(R)))

#define _mm512_mask_cvt_roundpd_epu32(W, U, A, R) \
  ((__m256i)__builtin_ia32_cvtpd2udq512_mask((__v8df)(__m512d)(A), \
                                             (__v8si)(__m256i)(W), \
                                             (__mmask8)(U), (int)(R)))

#define _mm512_maskz_cvt_roundpd_epu32(U, A, R) \
  ((__m256i)__builtin_ia32_cvtpd2udq512_mask((__v8df)(__m512d)(A), \
                                             (__v8si)_mm256_setzero_si256(), \
                                             (__mmask8)(U), (int)(R)))
4082
/* Convert 8 doubles to 8 unsigned 32-bit integers (VCVTPD2UDQ), rounding
 * according to the current direction.  Result is 256 bits wide. */
static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_cvtpd_epu32 (__m512d __A)
{
  return (__m256i) __builtin_ia32_cvtpd2udq512_mask ((__v8df) __A,
                  (__v8si)
                  _mm256_undefined_si256 (),
                  (__mmask8) -1,
                  _MM_FROUND_CUR_DIRECTION);
}

/* Merge-masked variant: lanes with a clear bit in __U come from __W. */
static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtpd_epu32 (__m256i __W, __mmask8 __U, __m512d __A)
{
  return (__m256i) __builtin_ia32_cvtpd2udq512_mask ((__v8df) __A,
                  (__v8si) __W,
                  (__mmask8) __U,
                  _MM_FROUND_CUR_DIRECTION);
}

/* Zero-masked variant: lanes with a clear bit in __U are zeroed. */
static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtpd_epu32 (__mmask8 __U, __m512d __A)
{
  return (__m256i) __builtin_ia32_cvtpd2udq512_mask ((__v8df) __A,
                  (__v8si)
                  _mm256_setzero_si256 (),
                  (__mmask8) __U,
                  _MM_FROUND_CUR_DIRECTION);
}
4111
/* Extract the lowest double-precision element of __a as a scalar. */
static __inline__ double __DEFAULT_FN_ATTRS512
_mm512_cvtsd_f64(__m512d __a)
{
  return __a[0];
}

/* Extract the lowest single-precision element of __a as a scalar. */
static __inline__ float __DEFAULT_FN_ATTRS512
_mm512_cvtss_f32(__m512 __a)
{
  return __a[0];
}
4123
4124/* Unpack and Interleave */
4125
4126static __inline __m512d __DEFAULT_FN_ATTRS512
4127_mm512_unpackhi_pd(__m512d __a, __m512d __b)
4128{
4129  return (__m512d)__builtin_shufflevector((__v8df)__a, (__v8df)__b,
4130                                          1, 9, 1+2, 9+2, 1+4, 9+4, 1+6, 9+6);
4131}
4132
4133static __inline__ __m512d __DEFAULT_FN_ATTRS512
4134_mm512_mask_unpackhi_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
4135{
4136  return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
4137                                           (__v8df)_mm512_unpackhi_pd(__A, __B),
4138                                           (__v8df)__W);
4139}
4140
4141static __inline__ __m512d __DEFAULT_FN_ATTRS512
4142_mm512_maskz_unpackhi_pd(__mmask8 __U, __m512d __A, __m512d __B)
4143{
4144  return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
4145                                           (__v8df)_mm512_unpackhi_pd(__A, __B),
4146                                           (__v8df)_mm512_setzero_pd());
4147}
4148
/* Interleave the low (even-index) double of each 128-bit lane of __a with
 * the corresponding double of __b. */
static __inline __m512d __DEFAULT_FN_ATTRS512
_mm512_unpacklo_pd(__m512d __a, __m512d __b)
{
  return (__m512d)__builtin_shufflevector((__v8df)__a, (__v8df)__b,
                                          0, 8, 0+2, 8+2, 0+4, 8+4, 0+6, 8+6);
}

/* Merge-masked variant: lanes with a clear bit in __U come from __W. */
static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_unpacklo_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
{
  return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
                                           (__v8df)_mm512_unpacklo_pd(__A, __B),
                                           (__v8df)__W);
}

/* Zero-masked variant: lanes with a clear bit in __U are zeroed. */
static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_maskz_unpacklo_pd (__mmask8 __U, __m512d __A, __m512d __B)
{
  return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
                                           (__v8df)_mm512_unpacklo_pd(__A, __B),
                                           (__v8df)_mm512_setzero_pd());
}
4171
/* Interleave the high two floats of each 128-bit lane of __a and __b:
 * per lane the result is { a2, b2, a3, b3 }. */
static __inline __m512 __DEFAULT_FN_ATTRS512
_mm512_unpackhi_ps(__m512 __a, __m512 __b)
{
  return (__m512)__builtin_shufflevector((__v16sf)__a, (__v16sf)__b,
                                         2,    18,    3,    19,
                                         2+4,  18+4,  3+4,  19+4,
                                         2+8,  18+8,  3+8,  19+8,
                                         2+12, 18+12, 3+12, 19+12);
}

/* Merge-masked variant: lanes with a clear bit in __U come from __W. */
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_unpackhi_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
{
  return (__m512)__builtin_ia32_selectps_512((__mmask16) __U,
                                          (__v16sf)_mm512_unpackhi_ps(__A, __B),
                                          (__v16sf)__W);
}

/* Zero-masked variant: lanes with a clear bit in __U are zeroed. */
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_unpackhi_ps (__mmask16 __U, __m512 __A, __m512 __B)
{
  return (__m512)__builtin_ia32_selectps_512((__mmask16) __U,
                                          (__v16sf)_mm512_unpackhi_ps(__A, __B),
                                          (__v16sf)_mm512_setzero_ps());
}
4197
4198static __inline __m512 __DEFAULT_FN_ATTRS512
4199_mm512_unpacklo_ps(__m512 __a, __m512 __b)
4200{
4201  return (__m512)__builtin_shufflevector((__v16sf)__a, (__v16sf)__b,
4202                                         0,    16,    1,    17,
4203                                         0+4,  16+4,  1+4,  17+4,
4204                                         0+8,  16+8,  1+8,  17+8,
4205                                         0+12, 16+12, 1+12, 17+12);
4206}
4207
4208static __inline__ __m512 __DEFAULT_FN_ATTRS512
4209_mm512_mask_unpacklo_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
4210{
4211  return (__m512)__builtin_ia32_selectps_512((__mmask16) __U,
4212                                          (__v16sf)_mm512_unpacklo_ps(__A, __B),
4213                                          (__v16sf)__W);
4214}
4215
4216static __inline__ __m512 __DEFAULT_FN_ATTRS512
4217_mm512_maskz_unpacklo_ps (__mmask16 __U, __m512 __A, __m512 __B)
4218{
4219  return (__m512)__builtin_ia32_selectps_512((__mmask16) __U,
4220                                          (__v16sf)_mm512_unpacklo_ps(__A, __B),
4221                                          (__v16sf)_mm512_setzero_ps());
4222}
4223
4224static __inline__ __m512i __DEFAULT_FN_ATTRS512
4225_mm512_unpackhi_epi32(__m512i __A, __m512i __B)
4226{
4227  return (__m512i)__builtin_shufflevector((__v16si)__A, (__v16si)__B,
4228                                          2,    18,    3,    19,
4229                                          2+4,  18+4,  3+4,  19+4,
4230                                          2+8,  18+8,  3+8,  19+8,
4231                                          2+12, 18+12, 3+12, 19+12);
4232}
4233
4234static __inline__ __m512i __DEFAULT_FN_ATTRS512
4235_mm512_mask_unpackhi_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
4236{
4237  return (__m512i)__builtin_ia32_selectd_512((__mmask16) __U,
4238                                       (__v16si)_mm512_unpackhi_epi32(__A, __B),
4239                                       (__v16si)__W);
4240}
4241
4242static __inline__ __m512i __DEFAULT_FN_ATTRS512
4243_mm512_maskz_unpackhi_epi32(__mmask16 __U, __m512i __A, __m512i __B)
4244{
4245  return (__m512i)__builtin_ia32_selectd_512((__mmask16) __U,
4246                                       (__v16si)_mm512_unpackhi_epi32(__A, __B),
4247                                       (__v16si)_mm512_setzero_si512());
4248}
4249
/* Interleave the low two dwords of each 128-bit lane of __A and __B:
 * per lane the result is { a0, b0, a1, b1 }. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_unpacklo_epi32(__m512i __A, __m512i __B)
{
  return (__m512i)__builtin_shufflevector((__v16si)__A, (__v16si)__B,
                                          0,    16,    1,    17,
                                          0+4,  16+4,  1+4,  17+4,
                                          0+8,  16+8,  1+8,  17+8,
                                          0+12, 16+12, 1+12, 17+12);
}

/* Merge-masked variant: lanes with a clear bit in __U come from __W. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_unpacklo_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16) __U,
                                       (__v16si)_mm512_unpacklo_epi32(__A, __B),
                                       (__v16si)__W);
}

/* Zero-masked variant: lanes with a clear bit in __U are zeroed. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_unpacklo_epi32(__mmask16 __U, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16) __U,
                                       (__v16si)_mm512_unpacklo_epi32(__A, __B),
                                       (__v16si)_mm512_setzero_si512());
}
4275
/* Interleave the high (odd-index) qword of each 128-bit lane of __A with
 * the corresponding qword of __B. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_unpackhi_epi64(__m512i __A, __m512i __B)
{
  return (__m512i)__builtin_shufflevector((__v8di)__A, (__v8di)__B,
                                          1, 9, 1+2, 9+2, 1+4, 9+4, 1+6, 9+6);
}

/* Merge-masked variant: lanes with a clear bit in __U come from __W. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_unpackhi_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8) __U,
                                        (__v8di)_mm512_unpackhi_epi64(__A, __B),
                                        (__v8di)__W);
}

/* Zero-masked variant: lanes with a clear bit in __U are zeroed. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_unpackhi_epi64(__mmask8 __U, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8) __U,
                                        (__v8di)_mm512_unpackhi_epi64(__A, __B),
                                        (__v8di)_mm512_setzero_si512());
}
4298
/* Interleave the low 64-bit element of each 128-bit lane of __A and __B
   (vpunpcklqdq-style shuffle, expressed as a generic shufflevector). */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_unpacklo_epi64 (__m512i __A, __m512i __B)
{
  return (__m512i)__builtin_shufflevector((__v8di)__A, (__v8di)__B,
                                          0, 8, 0+2, 8+2, 0+4, 8+4, 0+6, 8+6);
}

/* Merge-masked variant: unselected lanes keep __W. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_unpacklo_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8) __U,
                                        (__v8di)_mm512_unpacklo_epi64(__A, __B),
                                        (__v8di)__W);
}

/* Zero-masked variant: unselected lanes are zeroed. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_unpacklo_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8) __U,
                                        (__v8di)_mm512_unpacklo_epi64(__A, __B),
                                        (__v8di)_mm512_setzero_si512());
}
4321
4322
4323/* SIMD load ops */
4324
/* Unaligned 512-bit integer load.  The packed, may_alias wrapper struct
   tells the compiler the pointer may be arbitrarily aligned and may alias
   any type, so the access is legal for any byte address. */
static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_loadu_si512 (void const *__P)
{
  struct __loadu_si512 {
    __m512i_u __v;
  } __attribute__((__packed__, __may_alias__));
  return ((const struct __loadu_si512*)__P)->__v;
}

/* Unaligned 512-bit load, epi32-named alias of the same operation. */
static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_loadu_epi32 (void const *__P)
{
  struct __loadu_epi32 {
    __m512i_u __v;
  } __attribute__((__packed__, __may_alias__));
  return ((const struct __loadu_epi32*)__P)->__v;
}
4342
/* Masked unaligned load of 16 x i32: elements whose mask bit is clear
   come from __W instead of memory. */
static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_loadu_epi32 (__m512i __W, __mmask16 __U, void const *__P)
{
  return (__m512i) __builtin_ia32_loaddqusi512_mask ((const int *) __P,
                  (__v16si) __W,
                  (__mmask16) __U);
}


/* Zero-masked unaligned load of 16 x i32: unselected elements are zeroed. */
static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_loadu_epi32(__mmask16 __U, void const *__P)
{
  return (__m512i) __builtin_ia32_loaddqusi512_mask ((const int *)__P,
                                                     (__v16si)
                                                     _mm512_setzero_si512 (),
                                                     (__mmask16) __U);
}
4360
/* Unaligned 512-bit load of 8 x i64 via a packed may_alias wrapper. */
static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_loadu_epi64 (void const *__P)
{
  struct __loadu_epi64 {
    __m512i_u __v;
  } __attribute__((__packed__, __may_alias__));
  return ((const struct __loadu_epi64*)__P)->__v;
}

/* Masked unaligned load of 8 x i64: unselected elements keep __W. */
static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_loadu_epi64 (__m512i __W, __mmask8 __U, void const *__P)
{
  return (__m512i) __builtin_ia32_loaddqudi512_mask ((const long long *) __P,
                  (__v8di) __W,
                  (__mmask8) __U);
}

/* Zero-masked unaligned load of 8 x i64: unselected elements are zeroed. */
static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_loadu_epi64(__mmask8 __U, void const *__P)
{
  return (__m512i) __builtin_ia32_loaddqudi512_mask ((const long long *)__P,
                                                     (__v8di)
                                                     _mm512_setzero_si512 (),
                                                     (__mmask8) __U);
}
4386
/* Masked unaligned load of 16 x float: unselected elements keep __W. */
static __inline __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_loadu_ps (__m512 __W, __mmask16 __U, void const *__P)
{
  return (__m512) __builtin_ia32_loadups512_mask ((const float *) __P,
                   (__v16sf) __W,
                   (__mmask16) __U);
}

/* Zero-masked unaligned load of 16 x float: unselected elements are zeroed. */
static __inline __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_loadu_ps(__mmask16 __U, void const *__P)
{
  return (__m512) __builtin_ia32_loadups512_mask ((const float *)__P,
                                                  (__v16sf)
                                                  _mm512_setzero_ps (),
                                                  (__mmask16) __U);
}
4403
/* Masked unaligned load of 8 x double: unselected elements keep __W. */
static __inline __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_loadu_pd (__m512d __W, __mmask8 __U, void const *__P)
{
  return (__m512d) __builtin_ia32_loadupd512_mask ((const double *) __P,
                (__v8df) __W,
                (__mmask8) __U);
}

/* Zero-masked unaligned load of 8 x double: unselected elements are zeroed. */
static __inline __m512d __DEFAULT_FN_ATTRS512
_mm512_maskz_loadu_pd(__mmask8 __U, void const *__P)
{
  return (__m512d) __builtin_ia32_loadupd512_mask ((const double *)__P,
                                                   (__v8df)
                                                   _mm512_setzero_pd (),
                                                   (__mmask8) __U);
}
4420
/* Unaligned load of 8 x double via a packed may_alias wrapper struct. */
static __inline __m512d __DEFAULT_FN_ATTRS512
_mm512_loadu_pd(void const *__p)
{
  struct __loadu_pd {
    __m512d_u __v;
  } __attribute__((__packed__, __may_alias__));
  return ((const struct __loadu_pd*)__p)->__v;
}

/* Unaligned load of 16 x float via a packed may_alias wrapper struct. */
static __inline __m512 __DEFAULT_FN_ATTRS512
_mm512_loadu_ps(void const *__p)
{
  struct __loadu_ps {
    __m512_u __v;
  } __attribute__((__packed__, __may_alias__));
  return ((const struct __loadu_ps*)__p)->__v;
}
4438
/* Aligned load of 16 x float; __p must be 64-byte aligned (the direct
   __m512 dereference carries that alignment requirement). */
static __inline __m512 __DEFAULT_FN_ATTRS512
_mm512_load_ps(void const *__p)
{
  return *(const __m512*)__p;
}

/* Masked aligned load of 16 x float: unselected elements keep __W. */
static __inline __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_load_ps (__m512 __W, __mmask16 __U, void const *__P)
{
  return (__m512) __builtin_ia32_loadaps512_mask ((const __v16sf *) __P,
                   (__v16sf) __W,
                   (__mmask16) __U);
}

/* Zero-masked aligned load of 16 x float: unselected elements are zeroed. */
static __inline __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_load_ps(__mmask16 __U, void const *__P)
{
  return (__m512) __builtin_ia32_loadaps512_mask ((const __v16sf *)__P,
                                                  (__v16sf)
                                                  _mm512_setzero_ps (),
                                                  (__mmask16) __U);
}
4461
/* Aligned load of 8 x double; __p must be 64-byte aligned. */
static __inline __m512d __DEFAULT_FN_ATTRS512
_mm512_load_pd(void const *__p)
{
  return *(const __m512d*)__p;
}

/* Masked aligned load of 8 x double: unselected elements keep __W. */
static __inline __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_load_pd (__m512d __W, __mmask8 __U, void const *__P)
{
  return (__m512d) __builtin_ia32_loadapd512_mask ((const __v8df *) __P,
                          (__v8df) __W,
                          (__mmask8) __U);
}

/* Zero-masked aligned load of 8 x double: unselected elements are zeroed. */
static __inline __m512d __DEFAULT_FN_ATTRS512
_mm512_maskz_load_pd(__mmask8 __U, void const *__P)
{
  return (__m512d) __builtin_ia32_loadapd512_mask ((const __v8df *)__P,
                                                   (__v8df)
                                                   _mm512_setzero_pd (),
                                                   (__mmask8) __U);
}
4484
/* Aligned 512-bit integer loads; __P must be 64-byte aligned.  The three
   names are aliases for the same full-register load. */
static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_load_si512 (void const *__P)
{
  return *(const __m512i *) __P;
}

static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_load_epi32 (void const *__P)
{
  return *(const __m512i *) __P;
}

static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_load_epi64 (void const *__P)
{
  return *(const __m512i *) __P;
}
4502
4503/* SIMD store ops */
4504
/* Unaligned 512-bit store of 8 x i64 via a packed may_alias wrapper. */
static __inline void __DEFAULT_FN_ATTRS512
_mm512_storeu_epi64 (void *__P, __m512i __A)
{
  struct __storeu_epi64 {
    __m512i_u __v;
  } __attribute__((__packed__, __may_alias__));
  ((struct __storeu_epi64*)__P)->__v = __A;
}

/* Masked unaligned store of 8 x i64: only elements whose mask bit is set
   are written to memory. */
static __inline void __DEFAULT_FN_ATTRS512
_mm512_mask_storeu_epi64(void *__P, __mmask8 __U, __m512i __A)
{
  __builtin_ia32_storedqudi512_mask ((long long *)__P, (__v8di) __A,
                                     (__mmask8) __U);
}
4520
/* Unaligned 512-bit integer store via a packed may_alias wrapper. */
static __inline void __DEFAULT_FN_ATTRS512
_mm512_storeu_si512 (void *__P, __m512i __A)
{
  struct __storeu_si512 {
    __m512i_u __v;
  } __attribute__((__packed__, __may_alias__));
  ((struct __storeu_si512*)__P)->__v = __A;
}

/* Unaligned 512-bit store, epi32-named alias of the same operation. */
static __inline void __DEFAULT_FN_ATTRS512
_mm512_storeu_epi32 (void *__P, __m512i __A)
{
  struct __storeu_epi32 {
    __m512i_u __v;
  } __attribute__((__packed__, __may_alias__));
  ((struct __storeu_epi32*)__P)->__v = __A;
}

/* Masked unaligned store of 16 x i32: only selected elements are written. */
static __inline void __DEFAULT_FN_ATTRS512
_mm512_mask_storeu_epi32(void *__P, __mmask16 __U, __m512i __A)
{
  __builtin_ia32_storedqusi512_mask ((int *)__P, (__v16si) __A,
                                     (__mmask16) __U);
}
4545
/* Masked unaligned store of 8 x double: only selected elements are written. */
static __inline void __DEFAULT_FN_ATTRS512
_mm512_mask_storeu_pd(void *__P, __mmask8 __U, __m512d __A)
{
  __builtin_ia32_storeupd512_mask ((double *)__P, (__v8df) __A, (__mmask8) __U);
}

/* Unaligned store of 8 x double via a packed may_alias wrapper. */
static __inline void __DEFAULT_FN_ATTRS512
_mm512_storeu_pd(void *__P, __m512d __A)
{
  struct __storeu_pd {
    __m512d_u __v;
  } __attribute__((__packed__, __may_alias__));
  ((struct __storeu_pd*)__P)->__v = __A;
}

/* Masked unaligned store of 16 x float: only selected elements are written. */
static __inline void __DEFAULT_FN_ATTRS512
_mm512_mask_storeu_ps(void *__P, __mmask16 __U, __m512 __A)
{
  __builtin_ia32_storeups512_mask ((float *)__P, (__v16sf) __A,
                                   (__mmask16) __U);
}

/* Unaligned store of 16 x float via a packed may_alias wrapper. */
static __inline void __DEFAULT_FN_ATTRS512
_mm512_storeu_ps(void *__P, __m512 __A)
{
  struct __storeu_ps {
    __m512_u __v;
  } __attribute__((__packed__, __may_alias__));
  ((struct __storeu_ps*)__P)->__v = __A;
}
4576
/* Masked aligned store of 8 x double: only selected elements are written. */
static __inline void __DEFAULT_FN_ATTRS512
_mm512_mask_store_pd(void *__P, __mmask8 __U, __m512d __A)
{
  __builtin_ia32_storeapd512_mask ((__v8df *)__P, (__v8df) __A, (__mmask8) __U);
}

/* Aligned store of 8 x double; __P must be 64-byte aligned. */
static __inline void __DEFAULT_FN_ATTRS512
_mm512_store_pd(void *__P, __m512d __A)
{
  *(__m512d*)__P = __A;
}

/* Masked aligned store of 16 x float: only selected elements are written. */
static __inline void __DEFAULT_FN_ATTRS512
_mm512_mask_store_ps(void *__P, __mmask16 __U, __m512 __A)
{
  __builtin_ia32_storeaps512_mask ((__v16sf *)__P, (__v16sf) __A,
                                   (__mmask16) __U);
}

/* Aligned store of 16 x float; __P must be 64-byte aligned. */
static __inline void __DEFAULT_FN_ATTRS512
_mm512_store_ps(void *__P, __m512 __A)
{
  *(__m512*)__P = __A;
}
4601
/* Aligned 512-bit integer stores; __P must be 64-byte aligned.  The three
   names are aliases for the same full-register store. */
static __inline void __DEFAULT_FN_ATTRS512
_mm512_store_si512 (void *__P, __m512i __A)
{
  *(__m512i *) __P = __A;
}

static __inline void __DEFAULT_FN_ATTRS512
_mm512_store_epi32 (void *__P, __m512i __A)
{
  *(__m512i *) __P = __A;
}

static __inline void __DEFAULT_FN_ATTRS512
_mm512_store_epi64 (void *__P, __m512i __A)
{
  *(__m512i *) __P = __A;
}
4619
4620/* Mask ops */
4621
/* Bitwise NOT of a 16-bit mask register value. */
static __inline __mmask16 __DEFAULT_FN_ATTRS
_mm512_knot(__mmask16 __M)
{
  return __builtin_ia32_knothi(__M);
}
4627
4628/* Integer compare */
4629
/* Signed 32-bit comparison shorthands, expressed via the generic
   _mm512_cmp_epi32_mask with a fixed predicate. */
#define _mm512_cmpeq_epi32_mask(A, B) \
    _mm512_cmp_epi32_mask((A), (B), _MM_CMPINT_EQ)
#define _mm512_mask_cmpeq_epi32_mask(k, A, B) \
    _mm512_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_EQ)
#define _mm512_cmpge_epi32_mask(A, B) \
    _mm512_cmp_epi32_mask((A), (B), _MM_CMPINT_GE)
#define _mm512_mask_cmpge_epi32_mask(k, A, B) \
    _mm512_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_GE)
#define _mm512_cmpgt_epi32_mask(A, B) \
    _mm512_cmp_epi32_mask((A), (B), _MM_CMPINT_GT)
#define _mm512_mask_cmpgt_epi32_mask(k, A, B) \
    _mm512_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_GT)
#define _mm512_cmple_epi32_mask(A, B) \
    _mm512_cmp_epi32_mask((A), (B), _MM_CMPINT_LE)
#define _mm512_mask_cmple_epi32_mask(k, A, B) \
    _mm512_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_LE)
#define _mm512_cmplt_epi32_mask(A, B) \
    _mm512_cmp_epi32_mask((A), (B), _MM_CMPINT_LT)
#define _mm512_mask_cmplt_epi32_mask(k, A, B) \
    _mm512_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_LT)
#define _mm512_cmpneq_epi32_mask(A, B) \
    _mm512_cmp_epi32_mask((A), (B), _MM_CMPINT_NE)
#define _mm512_mask_cmpneq_epi32_mask(k, A, B) \
    _mm512_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_NE)
4654
/* Unsigned 32-bit comparison shorthands (predicate fixed, via
   _mm512_cmp_epu32_mask). */
#define _mm512_cmpeq_epu32_mask(A, B) \
    _mm512_cmp_epu32_mask((A), (B), _MM_CMPINT_EQ)
#define _mm512_mask_cmpeq_epu32_mask(k, A, B) \
    _mm512_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_EQ)
#define _mm512_cmpge_epu32_mask(A, B) \
    _mm512_cmp_epu32_mask((A), (B), _MM_CMPINT_GE)
#define _mm512_mask_cmpge_epu32_mask(k, A, B) \
    _mm512_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_GE)
#define _mm512_cmpgt_epu32_mask(A, B) \
    _mm512_cmp_epu32_mask((A), (B), _MM_CMPINT_GT)
#define _mm512_mask_cmpgt_epu32_mask(k, A, B) \
    _mm512_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_GT)
#define _mm512_cmple_epu32_mask(A, B) \
    _mm512_cmp_epu32_mask((A), (B), _MM_CMPINT_LE)
#define _mm512_mask_cmple_epu32_mask(k, A, B) \
    _mm512_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_LE)
#define _mm512_cmplt_epu32_mask(A, B) \
    _mm512_cmp_epu32_mask((A), (B), _MM_CMPINT_LT)
#define _mm512_mask_cmplt_epu32_mask(k, A, B) \
    _mm512_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_LT)
#define _mm512_cmpneq_epu32_mask(A, B) \
    _mm512_cmp_epu32_mask((A), (B), _MM_CMPINT_NE)
#define _mm512_mask_cmpneq_epu32_mask(k, A, B) \
    _mm512_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_NE)
4679
/* Signed 64-bit comparison shorthands (predicate fixed, via
   _mm512_cmp_epi64_mask). */
#define _mm512_cmpeq_epi64_mask(A, B) \
    _mm512_cmp_epi64_mask((A), (B), _MM_CMPINT_EQ)
#define _mm512_mask_cmpeq_epi64_mask(k, A, B) \
    _mm512_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_EQ)
#define _mm512_cmpge_epi64_mask(A, B) \
    _mm512_cmp_epi64_mask((A), (B), _MM_CMPINT_GE)
#define _mm512_mask_cmpge_epi64_mask(k, A, B) \
    _mm512_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_GE)
#define _mm512_cmpgt_epi64_mask(A, B) \
    _mm512_cmp_epi64_mask((A), (B), _MM_CMPINT_GT)
#define _mm512_mask_cmpgt_epi64_mask(k, A, B) \
    _mm512_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_GT)
#define _mm512_cmple_epi64_mask(A, B) \
    _mm512_cmp_epi64_mask((A), (B), _MM_CMPINT_LE)
#define _mm512_mask_cmple_epi64_mask(k, A, B) \
    _mm512_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_LE)
#define _mm512_cmplt_epi64_mask(A, B) \
    _mm512_cmp_epi64_mask((A), (B), _MM_CMPINT_LT)
#define _mm512_mask_cmplt_epi64_mask(k, A, B) \
    _mm512_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_LT)
#define _mm512_cmpneq_epi64_mask(A, B) \
    _mm512_cmp_epi64_mask((A), (B), _MM_CMPINT_NE)
#define _mm512_mask_cmpneq_epi64_mask(k, A, B) \
    _mm512_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_NE)
4704
/* Unsigned 64-bit comparison shorthands (predicate fixed, via
   _mm512_cmp_epu64_mask). */
#define _mm512_cmpeq_epu64_mask(A, B) \
    _mm512_cmp_epu64_mask((A), (B), _MM_CMPINT_EQ)
#define _mm512_mask_cmpeq_epu64_mask(k, A, B) \
    _mm512_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_EQ)
#define _mm512_cmpge_epu64_mask(A, B) \
    _mm512_cmp_epu64_mask((A), (B), _MM_CMPINT_GE)
#define _mm512_mask_cmpge_epu64_mask(k, A, B) \
    _mm512_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_GE)
#define _mm512_cmpgt_epu64_mask(A, B) \
    _mm512_cmp_epu64_mask((A), (B), _MM_CMPINT_GT)
#define _mm512_mask_cmpgt_epu64_mask(k, A, B) \
    _mm512_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_GT)
#define _mm512_cmple_epu64_mask(A, B) \
    _mm512_cmp_epu64_mask((A), (B), _MM_CMPINT_LE)
#define _mm512_mask_cmple_epu64_mask(k, A, B) \
    _mm512_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_LE)
#define _mm512_cmplt_epu64_mask(A, B) \
    _mm512_cmp_epu64_mask((A), (B), _MM_CMPINT_LT)
#define _mm512_mask_cmplt_epu64_mask(k, A, B) \
    _mm512_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_LT)
#define _mm512_cmpneq_epu64_mask(A, B) \
    _mm512_cmp_epu64_mask((A), (B), _MM_CMPINT_NE)
#define _mm512_mask_cmpneq_epu64_mask(k, A, B) \
    _mm512_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_NE)
4729
/* Sign-extend 16 x i8 to 16 x i32. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvtepi8_epi32(__m128i __A)
{
  /* This function always performs a signed extension, but __v16qi is a char
     which may be signed or unsigned, so use __v16qs. */
  return (__m512i)__builtin_convertvector((__v16qs)__A, __v16si);
}

/* Merge-masked variant: unselected lanes keep __W. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepi8_epi32(__m512i __W, __mmask16 __U, __m128i __A)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
                                             (__v16si)_mm512_cvtepi8_epi32(__A),
                                             (__v16si)__W);
}

/* Zero-masked variant: unselected lanes are zeroed. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtepi8_epi32(__mmask16 __U, __m128i __A)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
                                             (__v16si)_mm512_cvtepi8_epi32(__A),
                                             (__v16si)_mm512_setzero_si512());
}
4753
/* Sign-extend the low 8 x i8 of __A to 8 x i64 (the shufflevector first
   narrows to the low 8 bytes, then convertvector widens). */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvtepi8_epi64(__m128i __A)
{
  /* This function always performs a signed extension, but __v16qi is a char
     which may be signed or unsigned, so use __v16qs. */
  return (__m512i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__A, (__v16qs)__A, 0, 1, 2, 3, 4, 5, 6, 7), __v8di);
}

/* Merge-masked variant: unselected lanes keep __W. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepi8_epi64(__m512i __W, __mmask8 __U, __m128i __A)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
                                             (__v8di)_mm512_cvtepi8_epi64(__A),
                                             (__v8di)__W);
}

/* Zero-masked variant: unselected lanes are zeroed. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtepi8_epi64(__mmask8 __U, __m128i __A)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
                                             (__v8di)_mm512_cvtepi8_epi64(__A),
                                             (__v8di)_mm512_setzero_si512 ());
}
4777
/* Sign-extend 8 x i32 to 8 x i64. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvtepi32_epi64(__m256i __X)
{
  return (__m512i)__builtin_convertvector((__v8si)__X, __v8di);
}

/* Merge-masked variant: unselected lanes keep __W. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepi32_epi64(__m512i __W, __mmask8 __U, __m256i __X)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
                                             (__v8di)_mm512_cvtepi32_epi64(__X),
                                             (__v8di)__W);
}

/* Zero-masked variant: unselected lanes are zeroed. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtepi32_epi64(__mmask8 __U, __m256i __X)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
                                             (__v8di)_mm512_cvtepi32_epi64(__X),
                                             (__v8di)_mm512_setzero_si512());
}
4799
/* Sign-extend 16 x i16 to 16 x i32. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvtepi16_epi32(__m256i __A)
{
  return (__m512i)__builtin_convertvector((__v16hi)__A, __v16si);
}

/* Merge-masked variant: unselected lanes keep __W. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepi16_epi32(__m512i __W, __mmask16 __U, __m256i __A)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
                                            (__v16si)_mm512_cvtepi16_epi32(__A),
                                            (__v16si)__W);
}

/* Zero-masked variant: unselected lanes are zeroed. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtepi16_epi32(__mmask16 __U, __m256i __A)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
                                            (__v16si)_mm512_cvtepi16_epi32(__A),
                                            (__v16si)_mm512_setzero_si512 ());
}
4821
/* Sign-extend 8 x i16 to 8 x i64. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvtepi16_epi64(__m128i __A)
{
  return (__m512i)__builtin_convertvector((__v8hi)__A, __v8di);
}

/* Merge-masked variant: unselected lanes keep __W. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepi16_epi64(__m512i __W, __mmask8 __U, __m128i __A)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
                                             (__v8di)_mm512_cvtepi16_epi64(__A),
                                             (__v8di)__W);
}

/* Zero-masked variant: unselected lanes are zeroed. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtepi16_epi64(__mmask8 __U, __m128i __A)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
                                             (__v8di)_mm512_cvtepi16_epi64(__A),
                                             (__v8di)_mm512_setzero_si512());
}
4843
/* Zero-extend 16 x u8 to 16 x i32 (the __v16qu source forces an unsigned
   extension regardless of plain char's signedness). */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvtepu8_epi32(__m128i __A)
{
  return (__m512i)__builtin_convertvector((__v16qu)__A, __v16si);
}

/* Merge-masked variant: unselected lanes keep __W. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepu8_epi32(__m512i __W, __mmask16 __U, __m128i __A)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
                                             (__v16si)_mm512_cvtepu8_epi32(__A),
                                             (__v16si)__W);
}

/* Zero-masked variant: unselected lanes are zeroed. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtepu8_epi32(__mmask16 __U, __m128i __A)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
                                             (__v16si)_mm512_cvtepu8_epi32(__A),
                                             (__v16si)_mm512_setzero_si512());
}
4865
/* Zero-extend the low 8 x u8 of __A to 8 x i64. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvtepu8_epi64(__m128i __A)
{
  return (__m512i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__A, (__v16qu)__A, 0, 1, 2, 3, 4, 5, 6, 7), __v8di);
}

/* Merge-masked variant: unselected lanes keep __W. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepu8_epi64(__m512i __W, __mmask8 __U, __m128i __A)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
                                             (__v8di)_mm512_cvtepu8_epi64(__A),
                                             (__v8di)__W);
}

/* Zero-masked variant: unselected lanes are zeroed. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtepu8_epi64(__mmask8 __U, __m128i __A)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
                                             (__v8di)_mm512_cvtepu8_epi64(__A),
                                             (__v8di)_mm512_setzero_si512());
}
4887
/* Zero-extend 8 x u32 to 8 x i64. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvtepu32_epi64(__m256i __X)
{
  return (__m512i)__builtin_convertvector((__v8su)__X, __v8di);
}

/* Merge-masked variant: unselected lanes keep __W. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepu32_epi64(__m512i __W, __mmask8 __U, __m256i __X)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
                                             (__v8di)_mm512_cvtepu32_epi64(__X),
                                             (__v8di)__W);
}

/* Zero-masked variant: unselected lanes are zeroed. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtepu32_epi64(__mmask8 __U, __m256i __X)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
                                             (__v8di)_mm512_cvtepu32_epi64(__X),
                                             (__v8di)_mm512_setzero_si512());
}
4909
/* Zero-extend 16 x u16 to 16 x i32. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvtepu16_epi32(__m256i __A)
{
  return (__m512i)__builtin_convertvector((__v16hu)__A, __v16si);
}

/* Merge-masked variant: unselected lanes keep __W. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepu16_epi32(__m512i __W, __mmask16 __U, __m256i __A)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
                                            (__v16si)_mm512_cvtepu16_epi32(__A),
                                            (__v16si)__W);
}

/* Zero-masked variant: unselected lanes are zeroed. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtepu16_epi32(__mmask16 __U, __m256i __A)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
                                            (__v16si)_mm512_cvtepu16_epi32(__A),
                                            (__v16si)_mm512_setzero_si512());
}
4931
/* Zero-extend 8 x u16 to 8 x i64. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvtepu16_epi64(__m128i __A)
{
  return (__m512i)__builtin_convertvector((__v8hu)__A, __v8di);
}

/* Merge-masked variant: unselected lanes keep __W. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepu16_epi64(__m512i __W, __mmask8 __U, __m128i __A)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
                                             (__v8di)_mm512_cvtepu16_epi64(__A),
                                             (__v8di)__W);
}

/* Zero-masked variant: unselected lanes are zeroed. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtepu16_epi64(__mmask8 __U, __m128i __A)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
                                             (__v8di)_mm512_cvtepu16_epi64(__A),
                                             (__v8di)_mm512_setzero_si512());
}
4953
/* Per-lane variable rotate right of 16 x i32; rotate counts come from the
   corresponding lanes of __B. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_rorv_epi32 (__m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_prorvd512((__v16si)__A, (__v16si)__B);
}

/* Merge-masked variant: unselected lanes keep __W. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_rorv_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectd_512(__U,
                                           (__v16si)_mm512_rorv_epi32(__A, __B),
                                           (__v16si)__W);
}

/* Zero-masked variant: unselected lanes are zeroed. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_rorv_epi32 (__mmask16 __U, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectd_512(__U,
                                           (__v16si)_mm512_rorv_epi32(__A, __B),
                                           (__v16si)_mm512_setzero_si512());
}
4975
/* Per-lane variable rotate right of 8 x i64; counts come from __B. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_rorv_epi64 (__m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_prorvq512((__v8di)__A, (__v8di)__B);
}

/* Merge-masked variant: unselected lanes keep __W. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_rorv_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectq_512(__U,
                                            (__v8di)_mm512_rorv_epi64(__A, __B),
                                            (__v8di)__W);
}

/* Zero-masked variant: unselected lanes are zeroed. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_rorv_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectq_512(__U,
                                            (__v8di)_mm512_rorv_epi64(__A, __B),
                                            (__v8di)_mm512_setzero_si512());
}
4997
4998
4999
/* Generic predicated integer compares producing a mask.  The predicate p
   must be an immediate (one of the _MM_CMPINT_* values), hence the macro
   form.  The unmasked variants pass an all-ones write mask; the masked
   variants AND the result with m. */
#define _mm512_cmp_epi32_mask(a, b, p) \
  ((__mmask16)__builtin_ia32_cmpd512_mask((__v16si)(__m512i)(a), \
                                          (__v16si)(__m512i)(b), (int)(p), \
                                          (__mmask16)-1))

#define _mm512_cmp_epu32_mask(a, b, p) \
  ((__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)(__m512i)(a), \
                                           (__v16si)(__m512i)(b), (int)(p), \
                                           (__mmask16)-1))

#define _mm512_cmp_epi64_mask(a, b, p) \
  ((__mmask8)__builtin_ia32_cmpq512_mask((__v8di)(__m512i)(a), \
                                         (__v8di)(__m512i)(b), (int)(p), \
                                         (__mmask8)-1))

#define _mm512_cmp_epu64_mask(a, b, p) \
  ((__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)(__m512i)(a), \
                                          (__v8di)(__m512i)(b), (int)(p), \
                                          (__mmask8)-1))

#define _mm512_mask_cmp_epi32_mask(m, a, b, p) \
  ((__mmask16)__builtin_ia32_cmpd512_mask((__v16si)(__m512i)(a), \
                                          (__v16si)(__m512i)(b), (int)(p), \
                                          (__mmask16)(m)))

#define _mm512_mask_cmp_epu32_mask(m, a, b, p) \
  ((__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)(__m512i)(a), \
                                           (__v16si)(__m512i)(b), (int)(p), \
                                           (__mmask16)(m)))

#define _mm512_mask_cmp_epi64_mask(m, a, b, p) \
  ((__mmask8)__builtin_ia32_cmpq512_mask((__v8di)(__m512i)(a), \
                                         (__v8di)(__m512i)(b), (int)(p), \
                                         (__mmask8)(m)))

#define _mm512_mask_cmp_epu64_mask(m, a, b, p) \
  ((__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)(__m512i)(a), \
                                          (__v8di)(__m512i)(b), (int)(p), \
                                          (__mmask8)(m)))
5039
/* Rotate left by an immediate count.  Macros because the count must be a
   compile-time constant for the underlying builtin. */
#define _mm512_rol_epi32(a, b) \
  ((__m512i)__builtin_ia32_prold512((__v16si)(__m512i)(a), (int)(b)))

#define _mm512_mask_rol_epi32(W, U, a, b) \
  ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
                                       (__v16si)_mm512_rol_epi32((a), (b)), \
                                       (__v16si)(__m512i)(W)))

#define _mm512_maskz_rol_epi32(U, a, b) \
  ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
                                       (__v16si)_mm512_rol_epi32((a), (b)), \
                                       (__v16si)_mm512_setzero_si512()))

#define _mm512_rol_epi64(a, b) \
  ((__m512i)__builtin_ia32_prolq512((__v8di)(__m512i)(a), (int)(b)))

#define _mm512_mask_rol_epi64(W, U, a, b) \
  ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
                                       (__v8di)_mm512_rol_epi64((a), (b)), \
                                       (__v8di)(__m512i)(W)))

#define _mm512_maskz_rol_epi64(U, a, b) \
  ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
                                       (__v8di)_mm512_rol_epi64((a), (b)), \
                                       (__v8di)_mm512_setzero_si512()))
5065
/* Per-lane variable rotate left of 16 x i32; counts come from __B. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_rolv_epi32 (__m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_prolvd512((__v16si)__A, (__v16si)__B);
}

/* Merge-masked variant: unselected lanes keep __W. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_rolv_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectd_512(__U,
                                           (__v16si)_mm512_rolv_epi32(__A, __B),
                                           (__v16si)__W);
}

/* Zero-masked variant: unselected lanes are zeroed. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_rolv_epi32 (__mmask16 __U, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectd_512(__U,
                                           (__v16si)_mm512_rolv_epi32(__A, __B),
                                           (__v16si)_mm512_setzero_si512());
}
5087
/* Per-lane variable rotate left of 8 x i64; counts come from __B. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_rolv_epi64 (__m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_prolvq512((__v8di)__A, (__v8di)__B);
}

/* Merge-masked variant: unselected lanes keep __W. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_rolv_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectq_512(__U,
                                            (__v8di)_mm512_rolv_epi64(__A, __B),
                                            (__v8di)__W);
}

/* Zero-masked variant: unselected lanes are zeroed. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_rolv_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
{
  return (__m512i)__builtin_ia32_selectq_512(__U,
                                            (__v8di)_mm512_rolv_epi64(__A, __B),
                                            (__v8di)_mm512_setzero_si512());
}
5109
/* Rotate each 32-bit element of (A) right by the immediate count (B)
 * (vprord builtin).  Implemented as macros because (B) must be an
 * integer-constant expression. */
#define _mm512_ror_epi32(A, B) \
  ((__m512i)__builtin_ia32_prord512((__v16si)(__m512i)(A), (int)(B)))

/* Merge-masked form: lanes whose bit in (U) is clear come from (W). */
#define _mm512_mask_ror_epi32(W, U, A, B) \
  ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
                                       (__v16si)_mm512_ror_epi32((A), (B)), \
                                       (__v16si)(__m512i)(W)))

/* Zero-masked form: lanes whose bit in (U) is clear are zeroed. */
#define _mm512_maskz_ror_epi32(U, A, B) \
  ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
                                       (__v16si)_mm512_ror_epi32((A), (B)), \
                                       (__v16si)_mm512_setzero_si512()))

/* Rotate each 64-bit element of (A) right by the immediate count (B)
 * (vprorq builtin). */
#define _mm512_ror_epi64(A, B) \
  ((__m512i)__builtin_ia32_prorq512((__v8di)(__m512i)(A), (int)(B)))

/* Merge-masked form: lanes whose bit in (U) is clear come from (W). */
#define _mm512_mask_ror_epi64(W, U, A, B) \
  ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
                                       (__v8di)_mm512_ror_epi64((A), (B)), \
                                       (__v8di)(__m512i)(W)))

/* Zero-masked form: lanes whose bit in (U) is clear are zeroed. */
#define _mm512_maskz_ror_epi64(U, A, B) \
  ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
                                       (__v8di)_mm512_ror_epi64((A), (B)), \
                                       (__v8di)_mm512_setzero_si512()))
5135
/* Shift each 32-bit element of __A left by __B bits, shifting in zeros
 * (vpslld, immediate-count builtin). */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_slli_epi32(__m512i __A, unsigned int __B)
{
  return (__m512i)__builtin_ia32_pslldi512((__v16si)__A, (int)__B);
}

/* Merge-masked form: lanes whose bit in __U is clear are taken from __W. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_slli_epi32(__m512i __W, __mmask16 __U, __m512i __A,
                       unsigned int __B)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
                                         (__v16si)_mm512_slli_epi32(__A, __B),
                                         (__v16si)__W);
}

/* Zero-masked form: lanes whose bit in __U is clear are zeroed. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_slli_epi32(__mmask16 __U, __m512i __A, unsigned int __B) {
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
                                         (__v16si)_mm512_slli_epi32(__A, __B),
                                         (__v16si)_mm512_setzero_si512());
}
5157
/* Shift each 64-bit element of __A left by __B bits, shifting in zeros
 * (vpsllq, immediate-count builtin). */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_slli_epi64(__m512i __A, unsigned int __B)
{
  return (__m512i)__builtin_ia32_psllqi512((__v8di)__A, (int)__B);
}

/* Merge-masked form: lanes whose bit in __U is clear are taken from __W. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_slli_epi64(__m512i __W, __mmask8 __U, __m512i __A, unsigned int __B)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
                                          (__v8di)_mm512_slli_epi64(__A, __B),
                                          (__v8di)__W);
}

/* Zero-masked form: lanes whose bit in __U is clear are zeroed. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_slli_epi64(__mmask8 __U, __m512i __A, unsigned int __B)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
                                          (__v8di)_mm512_slli_epi64(__A, __B),
                                          (__v8di)_mm512_setzero_si512());
}
5179
/* Shift each 32-bit element of __A right logically by __B bits
 * (vpsrld, immediate-count builtin). */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_srli_epi32(__m512i __A, unsigned int __B)
{
  return (__m512i)__builtin_ia32_psrldi512((__v16si)__A, (int)__B);
}

/* Merge-masked form: lanes whose bit in __U is clear are taken from __W. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_srli_epi32(__m512i __W, __mmask16 __U, __m512i __A,
                       unsigned int __B)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
                                         (__v16si)_mm512_srli_epi32(__A, __B),
                                         (__v16si)__W);
}

/* Zero-masked form: lanes whose bit in __U is clear are zeroed. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_srli_epi32(__mmask16 __U, __m512i __A, unsigned int __B) {
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
                                         (__v16si)_mm512_srli_epi32(__A, __B),
                                         (__v16si)_mm512_setzero_si512());
}
5201
/* Shift each 64-bit element of __A right logically by __B bits
 * (vpsrlq, immediate-count builtin). */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_srli_epi64(__m512i __A, unsigned int __B)
{
  return (__m512i)__builtin_ia32_psrlqi512((__v8di)__A, (int)__B);
}

/* Merge-masked form: lanes whose bit in __U is clear are taken from __W. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_srli_epi64(__m512i __W, __mmask8 __U, __m512i __A,
                       unsigned int __B)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
                                          (__v8di)_mm512_srli_epi64(__A, __B),
                                          (__v8di)__W);
}

/* Zero-masked form: lanes whose bit in __U is clear are zeroed. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_srli_epi64(__mmask8 __U, __m512i __A,
                        unsigned int __B)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
                                          (__v8di)_mm512_srli_epi64(__A, __B),
                                          (__v8di)_mm512_setzero_si512());
}
5225
/* Masked aligned load of 16 dwords from __P (vmovdqa32 builtin); lanes
 * whose bit in __U is clear keep the value from __W.  NOTE(review): the
 * movdqa-family builtin implies __P must be 64-byte aligned — confirm
 * against the Intel intrinsics guide. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_load_epi32 (__m512i __W, __mmask16 __U, void const *__P)
{
  return (__m512i) __builtin_ia32_movdqa32load512_mask ((const __v16si *) __P,
              (__v16si) __W,
              (__mmask16) __U);
}

/* Zero-masked aligned load: lanes whose bit in __U is clear are zeroed. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_load_epi32 (__mmask16 __U, void const *__P)
{
  return (__m512i) __builtin_ia32_movdqa32load512_mask ((const __v16si *) __P,
              (__v16si)
              _mm512_setzero_si512 (),
              (__mmask16) __U);
}

/* Masked aligned store of 16 dwords to __P: only lanes whose bit in __U
 * is set are written. */
static __inline__ void __DEFAULT_FN_ATTRS512
_mm512_mask_store_epi32 (void *__P, __mmask16 __U, __m512i __A)
{
  __builtin_ia32_movdqa32store512_mask ((__v16si *) __P, (__v16si) __A,
          (__mmask16) __U);
}
5249
/* Per-lane 32-bit move under mask: lanes whose bit in __U is set come
 * from __A, the rest keep __W. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_mov_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
{
  return (__m512i) __builtin_ia32_selectd_512 ((__mmask16) __U,
                 (__v16si) __A,
                 (__v16si) __W);
}

/* Per-lane 32-bit move with zeroing: lanes whose bit in __U is clear are
 * zeroed. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_mov_epi32 (__mmask16 __U, __m512i __A)
{
  return (__m512i) __builtin_ia32_selectd_512 ((__mmask16) __U,
                 (__v16si) __A,
                 (__v16si) _mm512_setzero_si512 ());
}

/* Per-lane 64-bit move under mask: lanes whose bit in __U is set come
 * from __A, the rest keep __W. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_mov_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
{
  return (__m512i) __builtin_ia32_selectq_512 ((__mmask8) __U,
                 (__v8di) __A,
                 (__v8di) __W);
}

/* Per-lane 64-bit move with zeroing: lanes whose bit in __U is clear are
 * zeroed. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_mov_epi64 (__mmask8 __U, __m512i __A)
{
  return (__m512i) __builtin_ia32_selectq_512 ((__mmask8) __U,
                 (__v8di) __A,
                 (__v8di) _mm512_setzero_si512 ());
}
5281
/* Masked aligned load of 8 qwords from __P (vmovdqa64 builtin); lanes
 * whose bit in __U is clear keep the value from __W.  NOTE(review): the
 * movdqa-family builtin implies __P must be 64-byte aligned — confirm
 * against the Intel intrinsics guide. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_load_epi64 (__m512i __W, __mmask8 __U, void const *__P)
{
  return (__m512i) __builtin_ia32_movdqa64load512_mask ((const __v8di *) __P,
              (__v8di) __W,
              (__mmask8) __U);
}

/* Zero-masked aligned load: lanes whose bit in __U is clear are zeroed. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_load_epi64 (__mmask8 __U, void const *__P)
{
  return (__m512i) __builtin_ia32_movdqa64load512_mask ((const __v8di *) __P,
              (__v8di)
              _mm512_setzero_si512 (),
              (__mmask8) __U);
}

/* Masked aligned store of 8 qwords to __P: only lanes whose bit in __U
 * is set are written. */
static __inline__ void __DEFAULT_FN_ATTRS512
_mm512_mask_store_epi64 (void *__P, __mmask8 __U, __m512i __A)
{
  __builtin_ia32_movdqa64store512_mask ((__v8di *) __P, (__v8di) __A,
          (__mmask8) __U);
}
5305
/* Duplicate the even-indexed double of each pair: the shuffle indices
 * 0,0,2,2,4,4,6,6 give { a0,a0, a2,a2, a4,a4, a6,a6 } (vmovddup). */
static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_movedup_pd (__m512d __A)
{
  return (__m512d)__builtin_shufflevector((__v8df)__A, (__v8df)__A,
                                          0, 0, 2, 2, 4, 4, 6, 6);
}

/* Merge-masked form: lanes whose bit in __U is clear are taken from __W. */
static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_movedup_pd (__m512d __W, __mmask8 __U, __m512d __A)
{
  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
                                              (__v8df)_mm512_movedup_pd(__A),
                                              (__v8df)__W);
}

/* Zero-masked form: lanes whose bit in __U is clear are zeroed. */
static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_maskz_movedup_pd (__mmask8 __U, __m512d __A)
{
  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
                                              (__v8df)_mm512_movedup_pd(__A),
                                              (__v8df)_mm512_setzero_pd());
}
5328
/* vfixupimmpd family: fix up special double-precision values in (A)/(B)
 * according to the per-element lookup table in the integer vector (C) and
 * the immediate (imm).  "_round" variants take an explicit SAE/rounding
 * control (R); the others use _MM_FROUND_CUR_DIRECTION.  "_mask" merges
 * unselected lanes from (A); "_maskz" zeroes them.  (imm) and (R) must be
 * integer-constant expressions, hence macros. */
#define _mm512_fixupimm_round_pd(A, B, C, imm, R) \
  ((__m512d)__builtin_ia32_fixupimmpd512_mask((__v8df)(__m512d)(A), \
                                              (__v8df)(__m512d)(B), \
                                              (__v8di)(__m512i)(C), (int)(imm), \
                                              (__mmask8)-1, (int)(R)))

#define _mm512_mask_fixupimm_round_pd(A, U, B, C, imm, R) \
  ((__m512d)__builtin_ia32_fixupimmpd512_mask((__v8df)(__m512d)(A), \
                                              (__v8df)(__m512d)(B), \
                                              (__v8di)(__m512i)(C), (int)(imm), \
                                              (__mmask8)(U), (int)(R)))

#define _mm512_fixupimm_pd(A, B, C, imm) \
  ((__m512d)__builtin_ia32_fixupimmpd512_mask((__v8df)(__m512d)(A), \
                                              (__v8df)(__m512d)(B), \
                                              (__v8di)(__m512i)(C), (int)(imm), \
                                              (__mmask8)-1, \
                                              _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_fixupimm_pd(A, U, B, C, imm) \
  ((__m512d)__builtin_ia32_fixupimmpd512_mask((__v8df)(__m512d)(A), \
                                              (__v8df)(__m512d)(B), \
                                              (__v8di)(__m512i)(C), (int)(imm), \
                                              (__mmask8)(U), \
                                              _MM_FROUND_CUR_DIRECTION))

#define _mm512_maskz_fixupimm_round_pd(U, A, B, C, imm, R) \
  ((__m512d)__builtin_ia32_fixupimmpd512_maskz((__v8df)(__m512d)(A), \
                                               (__v8df)(__m512d)(B), \
                                               (__v8di)(__m512i)(C), \
                                               (int)(imm), (__mmask8)(U), \
                                               (int)(R)))

#define _mm512_maskz_fixupimm_pd(U, A, B, C, imm) \
  ((__m512d)__builtin_ia32_fixupimmpd512_maskz((__v8df)(__m512d)(A), \
                                               (__v8df)(__m512d)(B), \
                                               (__v8di)(__m512i)(C), \
                                               (int)(imm), (__mmask8)(U), \
                                               _MM_FROUND_CUR_DIRECTION))
5368
/* vfixupimmps family: single-precision counterpart of the fixupimm_pd
 * macros above in layout — fix up special float values in (A)/(B) per the
 * lookup table in (C) and immediate (imm); "_round" takes explicit (R),
 * "_mask" merges from (A), "_maskz" zeroes unselected lanes. */
#define _mm512_fixupimm_round_ps(A, B, C, imm, R) \
  ((__m512)__builtin_ia32_fixupimmps512_mask((__v16sf)(__m512)(A), \
                                             (__v16sf)(__m512)(B), \
                                             (__v16si)(__m512i)(C), (int)(imm), \
                                             (__mmask16)-1, (int)(R)))

#define _mm512_mask_fixupimm_round_ps(A, U, B, C, imm, R) \
  ((__m512)__builtin_ia32_fixupimmps512_mask((__v16sf)(__m512)(A), \
                                             (__v16sf)(__m512)(B), \
                                             (__v16si)(__m512i)(C), (int)(imm), \
                                             (__mmask16)(U), (int)(R)))

#define _mm512_fixupimm_ps(A, B, C, imm) \
  ((__m512)__builtin_ia32_fixupimmps512_mask((__v16sf)(__m512)(A), \
                                             (__v16sf)(__m512)(B), \
                                             (__v16si)(__m512i)(C), (int)(imm), \
                                             (__mmask16)-1, \
                                             _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_fixupimm_ps(A, U, B, C, imm) \
  ((__m512)__builtin_ia32_fixupimmps512_mask((__v16sf)(__m512)(A), \
                                             (__v16sf)(__m512)(B), \
                                             (__v16si)(__m512i)(C), (int)(imm), \
                                             (__mmask16)(U), \
                                             _MM_FROUND_CUR_DIRECTION))

#define _mm512_maskz_fixupimm_round_ps(U, A, B, C, imm, R) \
  ((__m512)__builtin_ia32_fixupimmps512_maskz((__v16sf)(__m512)(A), \
                                              (__v16sf)(__m512)(B), \
                                              (__v16si)(__m512i)(C), \
                                              (int)(imm), (__mmask16)(U), \
                                              (int)(R)))

#define _mm512_maskz_fixupimm_ps(U, A, B, C, imm) \
  ((__m512)__builtin_ia32_fixupimmps512_maskz((__v16sf)(__m512)(A), \
                                              (__v16sf)(__m512)(B), \
                                              (__v16si)(__m512i)(C), \
                                              (int)(imm), (__mmask16)(U), \
                                              _MM_FROUND_CUR_DIRECTION))
5408
/* vfixupimmsd family: scalar double fixup on the low element of (A)/(B)
 * per the lookup table in (C) and immediate (imm).  "_round" variants
 * take explicit SAE control (R); "_maskz" zeroes the low lane when the
 * low bit of (U) is clear. */
#define _mm_fixupimm_round_sd(A, B, C, imm, R) \
  ((__m128d)__builtin_ia32_fixupimmsd_mask((__v2df)(__m128d)(A), \
                                           (__v2df)(__m128d)(B), \
                                           (__v2di)(__m128i)(C), (int)(imm), \
                                           (__mmask8)-1, (int)(R)))

#define _mm_mask_fixupimm_round_sd(A, U, B, C, imm, R) \
  ((__m128d)__builtin_ia32_fixupimmsd_mask((__v2df)(__m128d)(A), \
                                           (__v2df)(__m128d)(B), \
                                           (__v2di)(__m128i)(C), (int)(imm), \
                                           (__mmask8)(U), (int)(R)))

#define _mm_fixupimm_sd(A, B, C, imm) \
  ((__m128d)__builtin_ia32_fixupimmsd_mask((__v2df)(__m128d)(A), \
                                           (__v2df)(__m128d)(B), \
                                           (__v2di)(__m128i)(C), (int)(imm), \
                                           (__mmask8)-1, \
                                           _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_fixupimm_sd(A, U, B, C, imm) \
  ((__m128d)__builtin_ia32_fixupimmsd_mask((__v2df)(__m128d)(A), \
                                           (__v2df)(__m128d)(B), \
                                           (__v2di)(__m128i)(C), (int)(imm), \
                                           (__mmask8)(U), \
                                           _MM_FROUND_CUR_DIRECTION))

#define _mm_maskz_fixupimm_round_sd(U, A, B, C, imm, R) \
  ((__m128d)__builtin_ia32_fixupimmsd_maskz((__v2df)(__m128d)(A), \
                                            (__v2df)(__m128d)(B), \
                                            (__v2di)(__m128i)(C), (int)(imm), \
                                            (__mmask8)(U), (int)(R)))

#define _mm_maskz_fixupimm_sd(U, A, B, C, imm) \
  ((__m128d)__builtin_ia32_fixupimmsd_maskz((__v2df)(__m128d)(A), \
                                            (__v2df)(__m128d)(B), \
                                            (__v2di)(__m128i)(C), (int)(imm), \
                                            (__mmask8)(U), \
                                            _MM_FROUND_CUR_DIRECTION))
5447
/* vfixupimmss family: scalar single-precision counterpart of the
 * fixupimm_sd macros above — fixup of the low float of (A)/(B) per the
 * lookup table in (C) and immediate (imm). */
#define _mm_fixupimm_round_ss(A, B, C, imm, R) \
  ((__m128)__builtin_ia32_fixupimmss_mask((__v4sf)(__m128)(A), \
                                          (__v4sf)(__m128)(B), \
                                          (__v4si)(__m128i)(C), (int)(imm), \
                                          (__mmask8)-1, (int)(R)))

#define _mm_mask_fixupimm_round_ss(A, U, B, C, imm, R) \
  ((__m128)__builtin_ia32_fixupimmss_mask((__v4sf)(__m128)(A), \
                                          (__v4sf)(__m128)(B), \
                                          (__v4si)(__m128i)(C), (int)(imm), \
                                          (__mmask8)(U), (int)(R)))

#define _mm_fixupimm_ss(A, B, C, imm) \
  ((__m128)__builtin_ia32_fixupimmss_mask((__v4sf)(__m128)(A), \
                                          (__v4sf)(__m128)(B), \
                                          (__v4si)(__m128i)(C), (int)(imm), \
                                          (__mmask8)-1, \
                                          _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_fixupimm_ss(A, U, B, C, imm) \
  ((__m128)__builtin_ia32_fixupimmss_mask((__v4sf)(__m128)(A), \
                                          (__v4sf)(__m128)(B), \
                                          (__v4si)(__m128i)(C), (int)(imm), \
                                          (__mmask8)(U), \
                                          _MM_FROUND_CUR_DIRECTION))

#define _mm_maskz_fixupimm_round_ss(U, A, B, C, imm, R) \
  ((__m128)__builtin_ia32_fixupimmss_maskz((__v4sf)(__m128)(A), \
                                           (__v4sf)(__m128)(B), \
                                           (__v4si)(__m128i)(C), (int)(imm), \
                                           (__mmask8)(U), (int)(R)))

#define _mm_maskz_fixupimm_ss(U, A, B, C, imm) \
  ((__m128)__builtin_ia32_fixupimmss_maskz((__v4sf)(__m128)(A), \
                                           (__v4sf)(__m128)(B), \
                                           (__v4si)(__m128i)(C), (int)(imm), \
                                           (__mmask8)(U), \
                                           _MM_FROUND_CUR_DIRECTION))
5486
/* vgetexpsd family: scalar getexp on the low double.  Macro variants take
 * an explicit SAE/rounding control (R); the inline functions use
 * _MM_FROUND_CUR_DIRECTION.  Mask/maskz follow the usual merge/zero
 * convention on the low lane. */
#define _mm_getexp_round_sd(A, B, R) \
  ((__m128d)__builtin_ia32_getexpsd128_round_mask((__v2df)(__m128d)(A), \
                                                  (__v2df)(__m128d)(B), \
                                                  (__v2df)_mm_setzero_pd(), \
                                                  (__mmask8)-1, (int)(R)))


static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_getexp_sd (__m128d __A, __m128d __B)
{
  return (__m128d) __builtin_ia32_getexpsd128_round_mask ((__v2df) __A,
                 (__v2df) __B, (__v2df) _mm_setzero_pd(), (__mmask8) -1, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_mask_getexp_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
{
 return (__m128d) __builtin_ia32_getexpsd128_round_mask ( (__v2df) __A,
          (__v2df) __B,
          (__v2df) __W,
          (__mmask8) __U,
          _MM_FROUND_CUR_DIRECTION);
}

#define _mm_mask_getexp_round_sd(W, U, A, B, R) \
  ((__m128d)__builtin_ia32_getexpsd128_round_mask((__v2df)(__m128d)(A), \
                                                  (__v2df)(__m128d)(B), \
                                                  (__v2df)(__m128d)(W), \
                                                  (__mmask8)(U), (int)(R)))

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_maskz_getexp_sd (__mmask8 __U, __m128d __A, __m128d __B)
{
 return (__m128d) __builtin_ia32_getexpsd128_round_mask ( (__v2df) __A,
          (__v2df) __B,
          (__v2df) _mm_setzero_pd (),
          (__mmask8) __U,
          _MM_FROUND_CUR_DIRECTION);
}

#define _mm_maskz_getexp_round_sd(U, A, B, R) \
  ((__m128d)__builtin_ia32_getexpsd128_round_mask((__v2df)(__m128d)(A), \
                                                  (__v2df)(__m128d)(B), \
                                                  (__v2df)_mm_setzero_pd(), \
                                                  (__mmask8)(U), (int)(R)))
5532
/* vgetexpss family: scalar getexp on the low float; same layout as the
 * _sd group above (macros take explicit (R), functions use the current
 * rounding direction; mask/maskz merge/zero the low lane). */
#define _mm_getexp_round_ss(A, B, R) \
  ((__m128)__builtin_ia32_getexpss128_round_mask((__v4sf)(__m128)(A), \
                                                 (__v4sf)(__m128)(B), \
                                                 (__v4sf)_mm_setzero_ps(), \
                                                 (__mmask8)-1, (int)(R)))

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_getexp_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_getexpss128_round_mask ((__v4sf) __A,
                (__v4sf) __B, (__v4sf)  _mm_setzero_ps(), (__mmask8) -1, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_mask_getexp_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
{
 return (__m128) __builtin_ia32_getexpss128_round_mask ((__v4sf) __A,
          (__v4sf) __B,
          (__v4sf) __W,
          (__mmask8) __U,
          _MM_FROUND_CUR_DIRECTION);
}

#define _mm_mask_getexp_round_ss(W, U, A, B, R) \
  ((__m128)__builtin_ia32_getexpss128_round_mask((__v4sf)(__m128)(A), \
                                                 (__v4sf)(__m128)(B), \
                                                 (__v4sf)(__m128)(W), \
                                                 (__mmask8)(U), (int)(R)))

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_maskz_getexp_ss (__mmask8 __U, __m128 __A, __m128 __B)
{
 return (__m128) __builtin_ia32_getexpss128_round_mask ((__v4sf) __A,
          (__v4sf) __B,
          (__v4sf) _mm_setzero_ps (),
          (__mmask8) __U,
          _MM_FROUND_CUR_DIRECTION);
}

#define _mm_maskz_getexp_round_ss(U, A, B, R) \
  ((__m128)__builtin_ia32_getexpss128_round_mask((__v4sf)(__m128)(A), \
                                                 (__v4sf)(__m128)(B), \
                                                 (__v4sf)_mm_setzero_ps(), \
                                                 (__mmask8)(U), (int)(R)))
5577
/* vgetmantsd family: extract the mantissa of the low double.  (C) and (D)
 * are the interval and sign-control selectors, packed into the single
 * hardware immediate as ((D)<<2)|(C); both must be integer-constant
 * expressions, hence macros.  "_round" takes explicit SAE control (R). */
#define _mm_getmant_round_sd(A, B, C, D, R) \
  ((__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \
                                                (__v2df)(__m128d)(B), \
                                                (int)(((D)<<2) | (C)), \
                                                (__v2df)_mm_setzero_pd(), \
                                                (__mmask8)-1, (int)(R)))

#define _mm_getmant_sd(A, B, C, D)  \
  ((__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \
                                                (__v2df)(__m128d)(B), \
                                                (int)(((D)<<2) | (C)), \
                                                (__v2df)_mm_setzero_pd(), \
                                                (__mmask8)-1, \
                                                _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_getmant_sd(W, U, A, B, C, D) \
  ((__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \
                                                (__v2df)(__m128d)(B), \
                                                (int)(((D)<<2) | (C)), \
                                                (__v2df)(__m128d)(W), \
                                                (__mmask8)(U), \
                                                _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_getmant_round_sd(W, U, A, B, C, D, R) \
  ((__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \
                                                (__v2df)(__m128d)(B), \
                                                (int)(((D)<<2) | (C)), \
                                                (__v2df)(__m128d)(W), \
                                                (__mmask8)(U), (int)(R)))

#define _mm_maskz_getmant_sd(U, A, B, C, D) \
  ((__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \
                                                (__v2df)(__m128d)(B), \
                                                (int)(((D)<<2) | (C)), \
                                                (__v2df)_mm_setzero_pd(), \
                                                (__mmask8)(U), \
                                                _MM_FROUND_CUR_DIRECTION))

#define _mm_maskz_getmant_round_sd(U, A, B, C, D, R) \
  ((__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \
                                                (__v2df)(__m128d)(B), \
                                                (int)(((D)<<2) | (C)), \
                                                (__v2df)_mm_setzero_pd(), \
                                                (__mmask8)(U), (int)(R)))
5622
/* vgetmantss family: single-precision counterpart of the getmant_sd
 * macros above; (C)/(D) interval and sign-control selectors are packed as
 * ((D)<<2)|(C). */
#define _mm_getmant_round_ss(A, B, C, D, R) \
  ((__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \
                                               (__v4sf)(__m128)(B), \
                                               (int)(((D)<<2) | (C)), \
                                               (__v4sf)_mm_setzero_ps(), \
                                               (__mmask8)-1, (int)(R)))

#define _mm_getmant_ss(A, B, C, D) \
  ((__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \
                                               (__v4sf)(__m128)(B), \
                                               (int)(((D)<<2) | (C)), \
                                               (__v4sf)_mm_setzero_ps(), \
                                               (__mmask8)-1, \
                                               _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_getmant_ss(W, U, A, B, C, D) \
  ((__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \
                                               (__v4sf)(__m128)(B), \
                                               (int)(((D)<<2) | (C)), \
                                               (__v4sf)(__m128)(W), \
                                               (__mmask8)(U), \
                                               _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_getmant_round_ss(W, U, A, B, C, D, R) \
  ((__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \
                                               (__v4sf)(__m128)(B), \
                                               (int)(((D)<<2) | (C)), \
                                               (__v4sf)(__m128)(W), \
                                               (__mmask8)(U), (int)(R)))

#define _mm_maskz_getmant_ss(U, A, B, C, D) \
  ((__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \
                                               (__v4sf)(__m128)(B), \
                                               (int)(((D)<<2) | (C)), \
                                               (__v4sf)_mm_setzero_ps(), \
                                               (__mmask8)(U), \
                                               _MM_FROUND_CUR_DIRECTION))

#define _mm_maskz_getmant_round_ss(U, A, B, C, D, R) \
  ((__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \
                                               (__v4sf)(__m128)(B), \
                                               (int)(((D)<<2) | (C)), \
                                               (__v4sf)_mm_setzero_ps(), \
                                               (__mmask8)(U), (int)(R)))
5667
5668static __inline__ __mmask16 __DEFAULT_FN_ATTRS
5669_mm512_kmov (__mmask16 __A)
5670{
5671  return  __A;
5672}
5673
/* Compare the low double/float of (A) and (B) with comparison predicate
 * (P) under SAE/rounding control (R), returning the integer result of the
 * vcomisd/vcomiss builtin.  (P) and (R) must be integer-constant
 * expressions, hence macros. */
#define _mm_comi_round_sd(A, B, P, R) \
  ((int)__builtin_ia32_vcomisd((__v2df)(__m128d)(A), (__v2df)(__m128d)(B), \
                               (int)(P), (int)(R)))

#define _mm_comi_round_ss(A, B, P, R) \
  ((int)__builtin_ia32_vcomiss((__v4sf)(__m128)(A), (__v4sf)(__m128)(B), \
                               (int)(P), (int)(R)))

#ifdef __x86_64__
/* Convert the low double of (A) to a signed 64-bit integer using rounding
 * control (R) (vcvtsd2si); only available on 64-bit targets. */
#define _mm_cvt_roundsd_si64(A, R) \
  ((long long)__builtin_ia32_vcvtsd2si64((__v2df)(__m128d)(A), (int)(R)))
#endif
5686
/* Shift each 32-bit element of __A left by the count supplied in __B
 * (vpslld builtin, register-count form), shifting in zeros. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_sll_epi32(__m512i __A, __m128i __B)
{
  return (__m512i)__builtin_ia32_pslld512((__v16si) __A, (__v4si)__B);
}
5692
5693static __inline__ __m512i __DEFAULT_FN_ATTRS512
5694_mm512_mask_sll_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m128i __B)
5695{
5696  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
5697                                          (__v16si)_mm512_sll_epi32(__A, __B),
5698                                          (__v16si)__W);
5699}
5700
5701static __inline__ __m512i __DEFAULT_FN_ATTRS512
5702_mm512_maskz_sll_epi32(__mmask16 __U, __m512i __A, __m128i __B)
5703{
5704  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
5705                                          (__v16si)_mm512_sll_epi32(__A, __B),
5706                                          (__v16si)_mm512_setzero_si512());
5707}
5708
/* Shift each 64-bit element of __A left by the count supplied in __B
 * (vpsllq builtin, register-count form), shifting in zeros. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_sll_epi64(__m512i __A, __m128i __B)
{
  return (__m512i)__builtin_ia32_psllq512((__v8di)__A, (__v2di)__B);
}

/* Merge-masked form: lanes whose bit in __U is clear are taken from __W. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_sll_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m128i __B)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
                                             (__v8di)_mm512_sll_epi64(__A, __B),
                                             (__v8di)__W);
}

/* Zero-masked form: lanes whose bit in __U is clear are zeroed. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_sll_epi64(__mmask8 __U, __m512i __A, __m128i __B)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
                                           (__v8di)_mm512_sll_epi64(__A, __B),
                                           (__v8di)_mm512_setzero_si512());
}
5730
/* Shift each 32-bit element of __X left by the count in the corresponding
 * element of __Y (vpsllvd builtin, per-element variable counts). */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_sllv_epi32(__m512i __X, __m512i __Y)
{
  return (__m512i)__builtin_ia32_psllv16si((__v16si)__X, (__v16si)__Y);
}

/* Merge-masked form: lanes whose bit in __U is clear are taken from __W. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_sllv_epi32(__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
                                           (__v16si)_mm512_sllv_epi32(__X, __Y),
                                           (__v16si)__W);
}

/* Zero-masked form: lanes whose bit in __U is clear are zeroed. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_sllv_epi32(__mmask16 __U, __m512i __X, __m512i __Y)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
                                           (__v16si)_mm512_sllv_epi32(__X, __Y),
                                           (__v16si)_mm512_setzero_si512());
}
5752
/* Shift each 64-bit element of __X left by the count in the corresponding
 * element of __Y (vpsllvq builtin, per-element variable counts). */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_sllv_epi64(__m512i __X, __m512i __Y)
{
  return (__m512i)__builtin_ia32_psllv8di((__v8di)__X, (__v8di)__Y);
}

/* Merge-masked form: lanes whose bit in __U is clear are taken from __W. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_sllv_epi64(__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
                                            (__v8di)_mm512_sllv_epi64(__X, __Y),
                                            (__v8di)__W);
}

/* Zero-masked form: lanes whose bit in __U is clear are zeroed. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_sllv_epi64(__mmask8 __U, __m512i __X, __m512i __Y)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
                                            (__v8di)_mm512_sllv_epi64(__X, __Y),
                                            (__v8di)_mm512_setzero_si512());
}
5774
/* Arithmetic right shift of each 32-bit element of __A by the scalar count in
   __B, sign-extending (VPSRAD). */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_sra_epi32(__m512i __A, __m128i __B)
{
  return (__m512i)__builtin_ia32_psrad512((__v16si) __A, (__v4si)__B);
}

/* Merge-masked form: elements with a clear bit in __U are taken from __W. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_sra_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m128i __B)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
                                          (__v16si)_mm512_sra_epi32(__A, __B),
                                          (__v16si)__W);
}

/* Zero-masked form: elements with a clear bit in __U are zeroed. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_sra_epi32(__mmask16 __U, __m512i __A, __m128i __B)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
                                          (__v16si)_mm512_sra_epi32(__A, __B),
                                          (__v16si)_mm512_setzero_si512());
}
5796
/* Arithmetic right shift of each 64-bit element of __A by the scalar count in
   __B, sign-extending (VPSRAQ, new to AVX-512). */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_sra_epi64(__m512i __A, __m128i __B)
{
  return (__m512i)__builtin_ia32_psraq512((__v8di)__A, (__v2di)__B);
}

/* Merge-masked form: elements with a clear bit in __U are taken from __W. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_sra_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m128i __B)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
                                           (__v8di)_mm512_sra_epi64(__A, __B),
                                           (__v8di)__W);
}

/* Zero-masked form: elements with a clear bit in __U are zeroed. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_sra_epi64(__mmask8 __U, __m512i __A, __m128i __B)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
                                           (__v8di)_mm512_sra_epi64(__A, __B),
                                           (__v8di)_mm512_setzero_si512());
}
5818
/* Variable arithmetic right shift: shift each 32-bit element of __X by the
   count in the corresponding element of __Y (VPSRAVD). */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_srav_epi32(__m512i __X, __m512i __Y)
{
  return (__m512i)__builtin_ia32_psrav16si((__v16si)__X, (__v16si)__Y);
}

/* Merge-masked form: elements with a clear bit in __U are taken from __W. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_srav_epi32(__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
                                           (__v16si)_mm512_srav_epi32(__X, __Y),
                                           (__v16si)__W);
}

/* Zero-masked form: elements with a clear bit in __U are zeroed. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_srav_epi32(__mmask16 __U, __m512i __X, __m512i __Y)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
                                           (__v16si)_mm512_srav_epi32(__X, __Y),
                                           (__v16si)_mm512_setzero_si512());
}
5840
/* Variable arithmetic right shift: shift each 64-bit element of __X by the
   count in the corresponding element of __Y (VPSRAVQ). */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_srav_epi64(__m512i __X, __m512i __Y)
{
  return (__m512i)__builtin_ia32_psrav8di((__v8di)__X, (__v8di)__Y);
}

/* Merge-masked form: elements with a clear bit in __U are taken from __W. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_srav_epi64(__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
                                            (__v8di)_mm512_srav_epi64(__X, __Y),
                                            (__v8di)__W);
}

/* Zero-masked form: elements with a clear bit in __U are zeroed. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_srav_epi64(__mmask8 __U, __m512i __X, __m512i __Y)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
                                            (__v8di)_mm512_srav_epi64(__X, __Y),
                                            (__v8di)_mm512_setzero_si512());
}
5862
/* Logical right shift of each 32-bit element of __A by the scalar count in
   __B, zero-filling (VPSRLD). */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_srl_epi32(__m512i __A, __m128i __B)
{
  return (__m512i)__builtin_ia32_psrld512((__v16si) __A, (__v4si)__B);
}

/* Merge-masked form: elements with a clear bit in __U are taken from __W. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_srl_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m128i __B)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
                                          (__v16si)_mm512_srl_epi32(__A, __B),
                                          (__v16si)__W);
}

/* Zero-masked form: elements with a clear bit in __U are zeroed. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_srl_epi32(__mmask16 __U, __m512i __A, __m128i __B)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
                                          (__v16si)_mm512_srl_epi32(__A, __B),
                                          (__v16si)_mm512_setzero_si512());
}
5884
/* Logical right shift of each 64-bit element of __A by the scalar count in
   __B, zero-filling (VPSRLQ). */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_srl_epi64(__m512i __A, __m128i __B)
{
  return (__m512i)__builtin_ia32_psrlq512((__v8di)__A, (__v2di)__B);
}

/* Merge-masked form: elements with a clear bit in __U are taken from __W. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_srl_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m128i __B)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
                                           (__v8di)_mm512_srl_epi64(__A, __B),
                                           (__v8di)__W);
}

/* Zero-masked form: elements with a clear bit in __U are zeroed. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_srl_epi64(__mmask8 __U, __m512i __A, __m128i __B)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
                                           (__v8di)_mm512_srl_epi64(__A, __B),
                                           (__v8di)_mm512_setzero_si512());
}
5906
/* Variable logical right shift: shift each 32-bit element of __X by the count
   in the corresponding element of __Y (VPSRLVD). */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_srlv_epi32(__m512i __X, __m512i __Y)
{
  return (__m512i)__builtin_ia32_psrlv16si((__v16si)__X, (__v16si)__Y);
}

/* Merge-masked form: elements with a clear bit in __U are taken from __W. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_srlv_epi32(__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
                                           (__v16si)_mm512_srlv_epi32(__X, __Y),
                                           (__v16si)__W);
}

/* Zero-masked form: elements with a clear bit in __U are zeroed. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_srlv_epi32(__mmask16 __U, __m512i __X, __m512i __Y)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
                                           (__v16si)_mm512_srlv_epi32(__X, __Y),
                                           (__v16si)_mm512_setzero_si512());
}
5928
/* Variable logical right shift: shift each 64-bit element of __X by the count
   in the corresponding element of __Y (VPSRLVQ). */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_srlv_epi64 (__m512i __X, __m512i __Y)
{
  return (__m512i)__builtin_ia32_psrlv8di((__v8di)__X, (__v8di)__Y);
}

/* Merge-masked form: elements with a clear bit in __U are taken from __W. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_srlv_epi64(__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
                                            (__v8di)_mm512_srlv_epi64(__X, __Y),
                                            (__v8di)__W);
}

/* Zero-masked form: elements with a clear bit in __U are zeroed. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_srlv_epi64(__mmask8 __U, __m512i __X, __m512i __Y)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
                                            (__v8di)_mm512_srlv_epi64(__X, __Y),
                                            (__v8di)_mm512_setzero_si512());
}
5950
/// \enum _MM_TERNLOG_ENUM
///    A helper to represent the ternary logic operations among vector \a A,
///    \a B and \a C. The representation is passed to \a imm.
typedef enum {
  _MM_TERNLOG_A = 0xF0, /* truth-table byte that selects operand A */
  _MM_TERNLOG_B = 0xCC, /* truth-table byte that selects operand B */
  _MM_TERNLOG_C = 0xAA  /* truth-table byte that selects operand C */
} _MM_TERNLOG_ENUM;

/* Bitwise ternary logic (VPTERNLOGD/VPTERNLOGQ): imm is an 8-bit truth table
 * over the corresponding bits of A, B and C; e.g. compose it from the
 * _MM_TERNLOG_* constants ((_MM_TERNLOG_A & _MM_TERNLOG_B) | _MM_TERNLOG_C).
 * The masked forms select result bits per 32-/64-bit element under U; the
 * _mask variants merge with A, the _maskz variants zero unselected elements.
 */
#define _mm512_ternarylogic_epi32(A, B, C, imm)                                \
  ((__m512i)__builtin_ia32_pternlogd512_mask(                                  \
      (__v16si)(__m512i)(A), (__v16si)(__m512i)(B), (__v16si)(__m512i)(C),     \
      (unsigned char)(imm), (__mmask16)-1))

#define _mm512_mask_ternarylogic_epi32(A, U, B, C, imm)                        \
  ((__m512i)__builtin_ia32_pternlogd512_mask(                                  \
      (__v16si)(__m512i)(A), (__v16si)(__m512i)(B), (__v16si)(__m512i)(C),     \
      (unsigned char)(imm), (__mmask16)(U)))

#define _mm512_maskz_ternarylogic_epi32(U, A, B, C, imm)                       \
  ((__m512i)__builtin_ia32_pternlogd512_maskz(                                 \
      (__v16si)(__m512i)(A), (__v16si)(__m512i)(B), (__v16si)(__m512i)(C),     \
      (unsigned char)(imm), (__mmask16)(U)))

#define _mm512_ternarylogic_epi64(A, B, C, imm)                                \
  ((__m512i)__builtin_ia32_pternlogq512_mask(                                  \
      (__v8di)(__m512i)(A), (__v8di)(__m512i)(B), (__v8di)(__m512i)(C),        \
      (unsigned char)(imm), (__mmask8)-1))

#define _mm512_mask_ternarylogic_epi64(A, U, B, C, imm)                        \
  ((__m512i)__builtin_ia32_pternlogq512_mask(                                  \
      (__v8di)(__m512i)(A), (__v8di)(__m512i)(B), (__v8di)(__m512i)(C),        \
      (unsigned char)(imm), (__mmask8)(U)))

#define _mm512_maskz_ternarylogic_epi64(U, A, B, C, imm)                       \
  ((__m512i)__builtin_ia32_pternlogq512_maskz(                                 \
      (__v8di)(__m512i)(A), (__v8di)(__m512i)(B), (__v8di)(__m512i)(C),        \
      (unsigned char)(imm), (__mmask8)(U)))
5989
/* Convert the low double of A to a signed/unsigned integer; R supplies an
 * explicit rounding mode (_MM_FROUND_*). The i64 form is 64-bit-only. */
#ifdef __x86_64__
#define _mm_cvt_roundsd_i64(A, R) \
  ((long long)__builtin_ia32_vcvtsd2si64((__v2df)(__m128d)(A), (int)(R)))
#endif

#define _mm_cvt_roundsd_si32(A, R) \
  ((int)__builtin_ia32_vcvtsd2si32((__v2df)(__m128d)(A), (int)(R)))

/* _i32 is an alias spelling of _si32. */
#define _mm_cvt_roundsd_i32(A, R) \
  ((int)__builtin_ia32_vcvtsd2si32((__v2df)(__m128d)(A), (int)(R)))

#define _mm_cvt_roundsd_u32(A, R) \
  ((unsigned int)__builtin_ia32_vcvtsd2usi32((__v2df)(__m128d)(A), (int)(R)))

/* Convert the low double of __A to unsigned 32-bit using the current
 * rounding mode (VCVTSD2USI). */
static __inline__ unsigned __DEFAULT_FN_ATTRS128
_mm_cvtsd_u32 (__m128d __A)
{
  return (unsigned) __builtin_ia32_vcvtsd2usi32 ((__v2df) __A,
             _MM_FROUND_CUR_DIRECTION);
}
6010
#ifdef __x86_64__
/* Convert the low double of A to unsigned 64-bit with explicit rounding R. */
#define _mm_cvt_roundsd_u64(A, R) \
  ((unsigned long long)__builtin_ia32_vcvtsd2usi64((__v2df)(__m128d)(A), \
                                                   (int)(R)))

/* Convert the low double of __A to unsigned 64-bit using the current
 * rounding mode (VCVTSD2USI, 64-bit-only). */
static __inline__ unsigned long long __DEFAULT_FN_ATTRS128
_mm_cvtsd_u64 (__m128d __A)
{
  return (unsigned long long) __builtin_ia32_vcvtsd2usi64 ((__v2df)
                 __A,
                 _MM_FROUND_CUR_DIRECTION);
}
#endif
6024
/* Convert the low float of A to a signed/unsigned integer with explicit
 * rounding R; _i32/_i64 are alias spellings of _si32/_si64. */
#define _mm_cvt_roundss_si32(A, R) \
  ((int)__builtin_ia32_vcvtss2si32((__v4sf)(__m128)(A), (int)(R)))

#define _mm_cvt_roundss_i32(A, R) \
  ((int)__builtin_ia32_vcvtss2si32((__v4sf)(__m128)(A), (int)(R)))

#ifdef __x86_64__
#define _mm_cvt_roundss_si64(A, R) \
  ((long long)__builtin_ia32_vcvtss2si64((__v4sf)(__m128)(A), (int)(R)))

#define _mm_cvt_roundss_i64(A, R) \
  ((long long)__builtin_ia32_vcvtss2si64((__v4sf)(__m128)(A), (int)(R)))
#endif

#define _mm_cvt_roundss_u32(A, R) \
  ((unsigned int)__builtin_ia32_vcvtss2usi32((__v4sf)(__m128)(A), (int)(R)))

/* Convert the low float of __A to unsigned 32-bit using the current
 * rounding mode (VCVTSS2USI). */
static __inline__ unsigned __DEFAULT_FN_ATTRS128
_mm_cvtss_u32 (__m128 __A)
{
  return (unsigned) __builtin_ia32_vcvtss2usi32 ((__v4sf) __A,
             _MM_FROUND_CUR_DIRECTION);
}
6048
#ifdef __x86_64__
/* Convert the low float of A to unsigned 64-bit with explicit rounding R. */
#define _mm_cvt_roundss_u64(A, R) \
  ((unsigned long long)__builtin_ia32_vcvtss2usi64((__v4sf)(__m128)(A), \
                                                   (int)(R)))

/* Convert the low float of __A to unsigned 64-bit using the current
 * rounding mode (VCVTSS2USI, 64-bit-only). */
static __inline__ unsigned long long __DEFAULT_FN_ATTRS128
_mm_cvtss_u64 (__m128 __A)
{
  return (unsigned long long) __builtin_ia32_vcvtss2usi64 ((__v4sf)
                 __A,
                 _MM_FROUND_CUR_DIRECTION);
}
#endif
6062
/* Truncating (round-toward-zero) conversion of the low double of A to a
 * signed 32-bit integer; R may suppress exceptions (_MM_FROUND_NO_EXC). */
#define _mm_cvtt_roundsd_i32(A, R) \
  ((int)__builtin_ia32_vcvttsd2si32((__v2df)(__m128d)(A), (int)(R)))

#define _mm_cvtt_roundsd_si32(A, R) \
  ((int)__builtin_ia32_vcvttsd2si32((__v2df)(__m128d)(A), (int)(R)))

static __inline__ int __DEFAULT_FN_ATTRS128
_mm_cvttsd_i32 (__m128d __A)
{
  return (int) __builtin_ia32_vcvttsd2si32 ((__v2df) __A,
              _MM_FROUND_CUR_DIRECTION);
}
6075
#ifdef __x86_64__
/* Truncating conversion of the low double of A to a signed 64-bit integer
 * (64-bit-only); R may suppress exceptions. */
#define _mm_cvtt_roundsd_si64(A, R) \
  ((long long)__builtin_ia32_vcvttsd2si64((__v2df)(__m128d)(A), (int)(R)))

#define _mm_cvtt_roundsd_i64(A, R) \
  ((long long)__builtin_ia32_vcvttsd2si64((__v2df)(__m128d)(A), (int)(R)))

static __inline__ long long __DEFAULT_FN_ATTRS128
_mm_cvttsd_i64 (__m128d __A)
{
  return (long long) __builtin_ia32_vcvttsd2si64 ((__v2df) __A,
              _MM_FROUND_CUR_DIRECTION);
}
#endif
6090
/* Truncating conversion of the low double of A to unsigned 32-bit. */
#define _mm_cvtt_roundsd_u32(A, R) \
  ((unsigned int)__builtin_ia32_vcvttsd2usi32((__v2df)(__m128d)(A), (int)(R)))

static __inline__ unsigned __DEFAULT_FN_ATTRS128
_mm_cvttsd_u32 (__m128d __A)
{
  return (unsigned) __builtin_ia32_vcvttsd2usi32 ((__v2df) __A,
              _MM_FROUND_CUR_DIRECTION);
}
6100
#ifdef __x86_64__
/* Truncating conversion of the low double of A to unsigned 64-bit
 * (64-bit-only). */
#define _mm_cvtt_roundsd_u64(A, R) \
  ((unsigned long long)__builtin_ia32_vcvttsd2usi64((__v2df)(__m128d)(A), \
                                                    (int)(R)))

static __inline__ unsigned long long __DEFAULT_FN_ATTRS128
_mm_cvttsd_u64 (__m128d __A)
{
  return (unsigned long long) __builtin_ia32_vcvttsd2usi64 ((__v2df)
                  __A,
                  _MM_FROUND_CUR_DIRECTION);
}
#endif
6114
/* Truncating conversion of the low float of A to a signed 32-bit integer. */
#define _mm_cvtt_roundss_i32(A, R) \
  ((int)__builtin_ia32_vcvttss2si32((__v4sf)(__m128)(A), (int)(R)))

#define _mm_cvtt_roundss_si32(A, R) \
  ((int)__builtin_ia32_vcvttss2si32((__v4sf)(__m128)(A), (int)(R)))

static __inline__ int __DEFAULT_FN_ATTRS128
_mm_cvttss_i32 (__m128 __A)
{
  return (int) __builtin_ia32_vcvttss2si32 ((__v4sf) __A,
              _MM_FROUND_CUR_DIRECTION);
}
6127
#ifdef __x86_64__
/* Truncating conversion of the low float of A to a signed 64-bit integer
 * (64-bit-only). */
#define _mm_cvtt_roundss_i64(A, R) \
  ((long long)__builtin_ia32_vcvttss2si64((__v4sf)(__m128)(A), (int)(R)))

#define _mm_cvtt_roundss_si64(A, R) \
  ((long long)__builtin_ia32_vcvttss2si64((__v4sf)(__m128)(A), (int)(R)))

static __inline__ long long __DEFAULT_FN_ATTRS128
_mm_cvttss_i64 (__m128 __A)
{
  return (long long) __builtin_ia32_vcvttss2si64 ((__v4sf) __A,
              _MM_FROUND_CUR_DIRECTION);
}
#endif
6142
/* Truncating conversion of the low float of A to unsigned 32-bit. */
#define _mm_cvtt_roundss_u32(A, R) \
  ((unsigned int)__builtin_ia32_vcvttss2usi32((__v4sf)(__m128)(A), (int)(R)))

static __inline__ unsigned __DEFAULT_FN_ATTRS128
_mm_cvttss_u32 (__m128 __A)
{
  return (unsigned) __builtin_ia32_vcvttss2usi32 ((__v4sf) __A,
              _MM_FROUND_CUR_DIRECTION);
}
6152
#ifdef __x86_64__
/* Truncating conversion of the low float of A to unsigned 64-bit
 * (64-bit-only). */
#define _mm_cvtt_roundss_u64(A, R) \
  ((unsigned long long)__builtin_ia32_vcvttss2usi64((__v4sf)(__m128)(A), \
                                                    (int)(R)))

static __inline__ unsigned long long __DEFAULT_FN_ATTRS128
_mm_cvttss_u64 (__m128 __A)
{
  return (unsigned long long) __builtin_ia32_vcvttss2usi64 ((__v4sf)
                  __A,
                  _MM_FROUND_CUR_DIRECTION);
}
#endif
6166
/* Shuffle elements of X under control of immediate C (VPERMILPD/VPERMILPS);
 * the mask forms merge unselected elements from W, the maskz forms zero them. */
#define _mm512_permute_pd(X, C) \
  ((__m512d)__builtin_ia32_vpermilpd512((__v8df)(__m512d)(X), (int)(C)))

#define _mm512_mask_permute_pd(W, U, X, C) \
  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
                                        (__v8df)_mm512_permute_pd((X), (C)), \
                                        (__v8df)(__m512d)(W)))

#define _mm512_maskz_permute_pd(U, X, C) \
  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
                                        (__v8df)_mm512_permute_pd((X), (C)), \
                                        (__v8df)_mm512_setzero_pd()))

#define _mm512_permute_ps(X, C) \
  ((__m512)__builtin_ia32_vpermilps512((__v16sf)(__m512)(X), (int)(C)))

#define _mm512_mask_permute_ps(W, U, X, C) \
  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
                                       (__v16sf)_mm512_permute_ps((X), (C)), \
                                       (__v16sf)(__m512)(W)))

#define _mm512_maskz_permute_ps(U, X, C) \
  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
                                       (__v16sf)_mm512_permute_ps((X), (C)), \
                                       (__v16sf)_mm512_setzero_ps()))
6192
/* Shuffle double elements of __A under control of the vector __C
   (VPERMILPD, variable form). */
static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_permutevar_pd(__m512d __A, __m512i __C)
{
  return (__m512d)__builtin_ia32_vpermilvarpd512((__v8df)__A, (__v8di)__C);
}

/* Merge-masked form: elements with a clear bit in __U are taken from __W. */
static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_permutevar_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512i __C)
{
  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
                                         (__v8df)_mm512_permutevar_pd(__A, __C),
                                         (__v8df)__W);
}

/* Zero-masked form: elements with a clear bit in __U are zeroed. */
static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_maskz_permutevar_pd(__mmask8 __U, __m512d __A, __m512i __C)
{
  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
                                         (__v8df)_mm512_permutevar_pd(__A, __C),
                                         (__v8df)_mm512_setzero_pd());
}
6214
/* Shuffle float elements of __A under control of the vector __C
   (VPERMILPS, variable form). */
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_permutevar_ps(__m512 __A, __m512i __C)
{
  return (__m512)__builtin_ia32_vpermilvarps512((__v16sf)__A, (__v16si)__C);
}

/* Merge-masked form: elements with a clear bit in __U are taken from __W. */
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_permutevar_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512i __C)
{
  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
                                        (__v16sf)_mm512_permutevar_ps(__A, __C),
                                        (__v16sf)__W);
}

/* Zero-masked form: elements with a clear bit in __U are zeroed. */
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_permutevar_ps(__mmask16 __U, __m512 __A, __m512i __C)
{
  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
                                        (__v16sf)_mm512_permutevar_ps(__A, __C),
                                        (__v16sf)_mm512_setzero_ps());
}
6236
/* Two-source permute (VPERMI2PD): select doubles from the concatenation of
   __A and __B as indexed by __I. */
static __inline __m512d __DEFAULT_FN_ATTRS512
_mm512_permutex2var_pd(__m512d __A, __m512i __I, __m512d __B)
{
  return (__m512d)__builtin_ia32_vpermi2varpd512((__v8df)__A, (__v8di)__I,
                                                 (__v8df)__B);
}

/* Merge-masked form: elements with a clear bit in __U keep __A's values. */
static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_permutex2var_pd(__m512d __A, __mmask8 __U, __m512i __I, __m512d __B)
{
  return (__m512d)__builtin_ia32_selectpd_512(__U,
                                  (__v8df)_mm512_permutex2var_pd(__A, __I, __B),
                                  (__v8df)__A);
}

/* mask2 form: unselected elements take the index operand __I, reinterpreted
   as doubles. */
static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask2_permutex2var_pd(__m512d __A, __m512i __I, __mmask8 __U,
                             __m512d __B)
{
  return (__m512d)__builtin_ia32_selectpd_512(__U,
                                  (__v8df)_mm512_permutex2var_pd(__A, __I, __B),
                                  (__v8df)(__m512d)__I);
}

/* Zero-masked form: elements with a clear bit in __U are zeroed. */
static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_maskz_permutex2var_pd(__mmask8 __U, __m512d __A, __m512i __I,
                             __m512d __B)
{
  return (__m512d)__builtin_ia32_selectpd_512(__U,
                                  (__v8df)_mm512_permutex2var_pd(__A, __I, __B),
                                  (__v8df)_mm512_setzero_pd());
}
6269
/* Two-source permute (VPERMI2PS): select floats from the concatenation of
   __A and __B as indexed by __I. */
static __inline __m512 __DEFAULT_FN_ATTRS512
_mm512_permutex2var_ps(__m512 __A, __m512i __I, __m512 __B)
{
  return (__m512)__builtin_ia32_vpermi2varps512((__v16sf)__A, (__v16si)__I,
                                                (__v16sf) __B);
}

/* Merge-masked form: elements with a clear bit in __U keep __A's values. */
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_permutex2var_ps(__m512 __A, __mmask16 __U, __m512i __I, __m512 __B)
{
  return (__m512)__builtin_ia32_selectps_512(__U,
                                 (__v16sf)_mm512_permutex2var_ps(__A, __I, __B),
                                 (__v16sf)__A);
}

/* mask2 form: unselected elements take the index operand __I, reinterpreted
   as floats. */
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask2_permutex2var_ps(__m512 __A, __m512i __I, __mmask16 __U, __m512 __B)
{
  return (__m512)__builtin_ia32_selectps_512(__U,
                                 (__v16sf)_mm512_permutex2var_ps(__A, __I, __B),
                                 (__v16sf)(__m512)__I);
}

/* Zero-masked form: elements with a clear bit in __U are zeroed. */
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_permutex2var_ps(__mmask16 __U, __m512 __A, __m512i __I, __m512 __B)
{
  return (__m512)__builtin_ia32_selectps_512(__U,
                                 (__v16sf)_mm512_permutex2var_ps(__A, __I, __B),
                                 (__v16sf)_mm512_setzero_ps());
}
6300
6301
/* Truncating conversion of 8 doubles in A to 8 unsigned 32-bit ints
 * (VCVTTPD2UDQ); R may suppress exceptions. Unmasked form leaves unselected
 * semantics moot (mask of -1); mask/maskz merge from W or zero. */
#define _mm512_cvtt_roundpd_epu32(A, R) \
  ((__m256i)__builtin_ia32_cvttpd2udq512_mask((__v8df)(__m512d)(A), \
                                              (__v8si)_mm256_undefined_si256(), \
                                              (__mmask8)-1, (int)(R)))

#define _mm512_mask_cvtt_roundpd_epu32(W, U, A, R) \
  ((__m256i)__builtin_ia32_cvttpd2udq512_mask((__v8df)(__m512d)(A), \
                                              (__v8si)(__m256i)(W), \
                                              (__mmask8)(U), (int)(R)))

#define _mm512_maskz_cvtt_roundpd_epu32(U, A, R) \
  ((__m256i)__builtin_ia32_cvttpd2udq512_mask((__v8df)(__m512d)(A), \
                                              (__v8si)_mm256_setzero_si256(), \
                                              (__mmask8)(U), (int)(R)))
6316
/* Truncating conversion of 8 doubles in __A to 8 unsigned 32-bit ints;
   unmasked result elements come from an undefined vector. */
static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_cvttpd_epu32 (__m512d __A)
{
  return (__m256i) __builtin_ia32_cvttpd2udq512_mask ((__v8df) __A,
                  (__v8si)
                  _mm256_undefined_si256 (),
                  (__mmask8) -1,
                  _MM_FROUND_CUR_DIRECTION);
}

/* Merge-masked form: elements with a clear bit in __U are taken from __W. */
static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_mask_cvttpd_epu32 (__m256i __W, __mmask8 __U, __m512d __A)
{
  return (__m256i) __builtin_ia32_cvttpd2udq512_mask ((__v8df) __A,
                  (__v8si) __W,
                  (__mmask8) __U,
                  _MM_FROUND_CUR_DIRECTION);
}

/* Zero-masked form: elements with a clear bit in __U are zeroed. */
static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvttpd_epu32 (__mmask8 __U, __m512d __A)
{
  return (__m256i) __builtin_ia32_cvttpd2udq512_mask ((__v8df) __A,
                  (__v8si)
                  _mm256_setzero_si256 (),
                  (__mmask8) __U,
                  _MM_FROUND_CUR_DIRECTION);
}
6345
/* Scalar round-to-a-number-of-fraction-bits (VRNDSCALESD/VRNDSCALESS):
 * rounds the low element of B per immediate `imm`, passing the upper
 * elements of A through. `_round_` variants take an explicit rounding/SAE
 * argument R; the others use _MM_FROUND_CUR_DIRECTION. mask forms merge the
 * low element from W when the mask bit is clear; maskz forms zero it. */
#define _mm_roundscale_round_sd(A, B, imm, R) \
  ((__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \
                                                 (__v2df)(__m128d)(B), \
                                                 (__v2df)_mm_setzero_pd(), \
                                                 (__mmask8)-1, (int)(imm), \
                                                 (int)(R)))

#define _mm_roundscale_sd(A, B, imm) \
  ((__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \
                                                 (__v2df)(__m128d)(B), \
                                                 (__v2df)_mm_setzero_pd(), \
                                                 (__mmask8)-1, (int)(imm), \
                                                 _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_roundscale_sd(W, U, A, B, imm) \
  ((__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \
                                                 (__v2df)(__m128d)(B), \
                                                 (__v2df)(__m128d)(W), \
                                                 (__mmask8)(U), (int)(imm), \
                                                 _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_roundscale_round_sd(W, U, A, B, I, R) \
  ((__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \
                                                 (__v2df)(__m128d)(B), \
                                                 (__v2df)(__m128d)(W), \
                                                 (__mmask8)(U), (int)(I), \
                                                 (int)(R)))

#define _mm_maskz_roundscale_sd(U, A, B, I) \
  ((__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \
                                                 (__v2df)(__m128d)(B), \
                                                 (__v2df)_mm_setzero_pd(), \
                                                 (__mmask8)(U), (int)(I), \
                                                 _MM_FROUND_CUR_DIRECTION))

#define _mm_maskz_roundscale_round_sd(U, A, B, I, R) \
  ((__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \
                                                 (__v2df)(__m128d)(B), \
                                                 (__v2df)_mm_setzero_pd(), \
                                                 (__mmask8)(U), (int)(I), \
                                                 (int)(R)))

#define _mm_roundscale_round_ss(A, B, imm, R) \
  ((__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \
                                                (__v4sf)(__m128)(B), \
                                                (__v4sf)_mm_setzero_ps(), \
                                                (__mmask8)-1, (int)(imm), \
                                                (int)(R)))

#define _mm_roundscale_ss(A, B, imm) \
  ((__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \
                                                (__v4sf)(__m128)(B), \
                                                (__v4sf)_mm_setzero_ps(), \
                                                (__mmask8)-1, (int)(imm), \
                                                _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_roundscale_ss(W, U, A, B, I) \
  ((__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \
                                                (__v4sf)(__m128)(B), \
                                                (__v4sf)(__m128)(W), \
                                                (__mmask8)(U), (int)(I), \
                                                _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_roundscale_round_ss(W, U, A, B, I, R) \
  ((__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \
                                                (__v4sf)(__m128)(B), \
                                                (__v4sf)(__m128)(W), \
                                                (__mmask8)(U), (int)(I), \
                                                (int)(R)))

#define _mm_maskz_roundscale_ss(U, A, B, I) \
  ((__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \
                                                (__v4sf)(__m128)(B), \
                                                (__v4sf)_mm_setzero_ps(), \
                                                (__mmask8)(U), (int)(I), \
                                                _MM_FROUND_CUR_DIRECTION))

#define _mm_maskz_roundscale_round_ss(U, A, B, I, R) \
  ((__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \
                                                (__v4sf)(__m128)(B), \
                                                (__v4sf)_mm_setzero_ps(), \
                                                (__mmask8)(U), (int)(I), \
                                                (int)(R)))
6429
/* VSCALEFPD with explicit rounding R: per element, A * 2^floor(B). */
#define _mm512_scalef_round_pd(A, B, R) \
  ((__m512d)__builtin_ia32_scalefpd512_mask((__v8df)(__m512d)(A), \
                                            (__v8df)(__m512d)(B), \
                                            (__v8df)_mm512_undefined_pd(), \
                                            (__mmask8)-1, (int)(R)))

#define _mm512_mask_scalef_round_pd(W, U, A, B, R) \
  ((__m512d)__builtin_ia32_scalefpd512_mask((__v8df)(__m512d)(A), \
                                            (__v8df)(__m512d)(B), \
                                            (__v8df)(__m512d)(W), \
                                            (__mmask8)(U), (int)(R)))

#define _mm512_maskz_scalef_round_pd(U, A, B, R) \
  ((__m512d)__builtin_ia32_scalefpd512_mask((__v8df)(__m512d)(A), \
                                            (__v8df)(__m512d)(B), \
                                            (__v8df)_mm512_setzero_pd(), \
                                            (__mmask8)(U), (int)(R)))
6447
/* VSCALEFPD with the current rounding mode: per element, __A * 2^floor(__B). */
static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_scalef_pd (__m512d __A, __m512d __B)
{
  return (__m512d) __builtin_ia32_scalefpd512_mask ((__v8df) __A,
                (__v8df) __B,
                (__v8df)
                _mm512_undefined_pd (),
                (__mmask8) -1,
                _MM_FROUND_CUR_DIRECTION);
}

/* Merge-masked form: elements with a clear bit in __U are taken from __W. */
static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_scalef_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
{
  return (__m512d) __builtin_ia32_scalefpd512_mask ((__v8df) __A,
                (__v8df) __B,
                (__v8df) __W,
                (__mmask8) __U,
                _MM_FROUND_CUR_DIRECTION);
}

/* Zero-masked form: elements with a clear bit in __U are zeroed. */
static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_maskz_scalef_pd (__mmask8 __U, __m512d __A, __m512d __B)
{
  return (__m512d) __builtin_ia32_scalefpd512_mask ((__v8df) __A,
                (__v8df) __B,
                (__v8df)
                _mm512_setzero_pd (),
                (__mmask8) __U,
                _MM_FROUND_CUR_DIRECTION);
}
6479
/* VSCALEFPS with explicit rounding R: per element, A * 2^floor(B). */
#define _mm512_scalef_round_ps(A, B, R) \
  ((__m512)__builtin_ia32_scalefps512_mask((__v16sf)(__m512)(A), \
                                           (__v16sf)(__m512)(B), \
                                           (__v16sf)_mm512_undefined_ps(), \
                                           (__mmask16)-1, (int)(R)))

#define _mm512_mask_scalef_round_ps(W, U, A, B, R) \
  ((__m512)__builtin_ia32_scalefps512_mask((__v16sf)(__m512)(A), \
                                           (__v16sf)(__m512)(B), \
                                           (__v16sf)(__m512)(W), \
                                           (__mmask16)(U), (int)(R)))

#define _mm512_maskz_scalef_round_ps(U, A, B, R) \
  ((__m512)__builtin_ia32_scalefps512_mask((__v16sf)(__m512)(A), \
                                           (__v16sf)(__m512)(B), \
                                           (__v16sf)_mm512_setzero_ps(), \
                                           (__mmask16)(U), (int)(R)))
6497
6498static __inline__ __m512 __DEFAULT_FN_ATTRS512
6499_mm512_scalef_ps (__m512 __A, __m512 __B)
6500{
6501  return (__m512) __builtin_ia32_scalefps512_mask ((__v16sf) __A,
6502               (__v16sf) __B,
6503               (__v16sf)
6504               _mm512_undefined_ps (),
6505               (__mmask16) -1,
6506               _MM_FROUND_CUR_DIRECTION);
6507}
6508
6509static __inline__ __m512 __DEFAULT_FN_ATTRS512
6510_mm512_mask_scalef_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
6511{
6512  return (__m512) __builtin_ia32_scalefps512_mask ((__v16sf) __A,
6513               (__v16sf) __B,
6514               (__v16sf) __W,
6515               (__mmask16) __U,
6516               _MM_FROUND_CUR_DIRECTION);
6517}
6518
6519static __inline__ __m512 __DEFAULT_FN_ATTRS512
6520_mm512_maskz_scalef_ps (__mmask16 __U, __m512 __A, __m512 __B)
6521{
6522  return (__m512) __builtin_ia32_scalefps512_mask ((__v16sf) __A,
6523               (__v16sf) __B,
6524               (__v16sf)
6525               _mm512_setzero_ps (),
6526               (__mmask16) __U,
6527               _MM_FROUND_CUR_DIRECTION);
6528}
6529
/* Scale the low double of A by 2^floor(low double of B) with rounding mode R;
   the upper element is copied from A. */
#define _mm_scalef_round_sd(A, B, R) \
  ((__m128d)__builtin_ia32_scalefsd_round_mask((__v2df)(__m128d)(A), \
                                               (__v2df)(__m128d)(B), \
                                               (__v2df)_mm_setzero_pd(), \
                                               (__mmask8)-1, (int)(R)))
6535
6536static __inline__ __m128d __DEFAULT_FN_ATTRS128
6537_mm_scalef_sd (__m128d __A, __m128d __B)
6538{
6539  return (__m128d) __builtin_ia32_scalefsd_round_mask ((__v2df) __A,
6540              (__v2df)( __B), (__v2df) _mm_setzero_pd(),
6541              (__mmask8) -1,
6542              _MM_FROUND_CUR_DIRECTION);
6543}
6544
6545static __inline__ __m128d __DEFAULT_FN_ATTRS128
6546_mm_mask_scalef_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
6547{
6548 return (__m128d) __builtin_ia32_scalefsd_round_mask ( (__v2df) __A,
6549                 (__v2df) __B,
6550                (__v2df) __W,
6551                (__mmask8) __U,
6552                _MM_FROUND_CUR_DIRECTION);
6553}
6554
/* Merge-masked scalar scalef with explicit rounding mode R; the low result
   lane comes from W when bit 0 of U is clear. */
#define _mm_mask_scalef_round_sd(W, U, A, B, R) \
  ((__m128d)__builtin_ia32_scalefsd_round_mask((__v2df)(__m128d)(A), \
                                               (__v2df)(__m128d)(B), \
                                               (__v2df)(__m128d)(W), \
                                               (__mmask8)(U), (int)(R)))
6560
6561static __inline__ __m128d __DEFAULT_FN_ATTRS128
6562_mm_maskz_scalef_sd (__mmask8 __U, __m128d __A, __m128d __B)
6563{
6564 return (__m128d) __builtin_ia32_scalefsd_round_mask ( (__v2df) __A,
6565                 (__v2df) __B,
6566                (__v2df) _mm_setzero_pd (),
6567                (__mmask8) __U,
6568                _MM_FROUND_CUR_DIRECTION);
6569}
6570
/* Zero-masked scalar scalef with explicit rounding mode R. */
#define _mm_maskz_scalef_round_sd(U, A, B, R) \
  ((__m128d)__builtin_ia32_scalefsd_round_mask((__v2df)(__m128d)(A), \
                                               (__v2df)(__m128d)(B), \
                                               (__v2df)_mm_setzero_pd(), \
                                               (__mmask8)(U), (int)(R)))
6576
/* Scale the low float of A by 2^floor(low float of B) with rounding mode R;
   the upper three elements are copied from A. */
#define _mm_scalef_round_ss(A, B, R) \
  ((__m128)__builtin_ia32_scalefss_round_mask((__v4sf)(__m128)(A), \
                                              (__v4sf)(__m128)(B), \
                                              (__v4sf)_mm_setzero_ps(), \
                                              (__mmask8)-1, (int)(R)))
6582
6583static __inline__ __m128 __DEFAULT_FN_ATTRS128
6584_mm_scalef_ss (__m128 __A, __m128 __B)
6585{
6586  return (__m128) __builtin_ia32_scalefss_round_mask ((__v4sf) __A,
6587             (__v4sf)( __B), (__v4sf) _mm_setzero_ps(),
6588             (__mmask8) -1,
6589             _MM_FROUND_CUR_DIRECTION);
6590}
6591
6592static __inline__ __m128 __DEFAULT_FN_ATTRS128
6593_mm_mask_scalef_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
6594{
6595 return (__m128) __builtin_ia32_scalefss_round_mask ( (__v4sf) __A,
6596                (__v4sf) __B,
6597                (__v4sf) __W,
6598                (__mmask8) __U,
6599                _MM_FROUND_CUR_DIRECTION);
6600}
6601
/* Merge-masked scalar scalef (float) with explicit rounding mode R. */
#define _mm_mask_scalef_round_ss(W, U, A, B, R) \
  ((__m128)__builtin_ia32_scalefss_round_mask((__v4sf)(__m128)(A), \
                                              (__v4sf)(__m128)(B), \
                                              (__v4sf)(__m128)(W), \
                                              (__mmask8)(U), (int)(R)))
6607
6608static __inline__ __m128 __DEFAULT_FN_ATTRS128
6609_mm_maskz_scalef_ss (__mmask8 __U, __m128 __A, __m128 __B)
6610{
6611 return (__m128) __builtin_ia32_scalefss_round_mask ( (__v4sf) __A,
6612                 (__v4sf) __B,
6613                (__v4sf) _mm_setzero_ps (),
6614                (__mmask8) __U,
6615                _MM_FROUND_CUR_DIRECTION);
6616}
6617
/* Zero-masked scalar scalef (float) with explicit rounding mode R. */
#define _mm_maskz_scalef_round_ss(U, A, B, R) \
  ((__m128)__builtin_ia32_scalefss_round_mask((__v4sf)(__m128)(A), \
                                              (__v4sf)(__m128)(B), \
                                              (__v4sf)_mm_setzero_ps(), \
                                              (__mmask8)(U), \
                                              (int)(R)))
6624
6625static __inline__ __m512i __DEFAULT_FN_ATTRS512
6626_mm512_srai_epi32(__m512i __A, unsigned int __B)
6627{
6628  return (__m512i)__builtin_ia32_psradi512((__v16si)__A, (int)__B);
6629}
6630
6631static __inline__ __m512i __DEFAULT_FN_ATTRS512
6632_mm512_mask_srai_epi32(__m512i __W, __mmask16 __U, __m512i __A,
6633                       unsigned int __B)
6634{
6635  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
6636                                         (__v16si)_mm512_srai_epi32(__A, __B),
6637                                         (__v16si)__W);
6638}
6639
6640static __inline__ __m512i __DEFAULT_FN_ATTRS512
6641_mm512_maskz_srai_epi32(__mmask16 __U, __m512i __A,
6642                        unsigned int __B) {
6643  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
6644                                         (__v16si)_mm512_srai_epi32(__A, __B),
6645                                         (__v16si)_mm512_setzero_si512());
6646}
6647
6648static __inline__ __m512i __DEFAULT_FN_ATTRS512
6649_mm512_srai_epi64(__m512i __A, unsigned int __B)
6650{
6651  return (__m512i)__builtin_ia32_psraqi512((__v8di)__A, (int)__B);
6652}
6653
6654static __inline__ __m512i __DEFAULT_FN_ATTRS512
6655_mm512_mask_srai_epi64(__m512i __W, __mmask8 __U, __m512i __A, unsigned int __B)
6656{
6657  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
6658                                          (__v8di)_mm512_srai_epi64(__A, __B),
6659                                          (__v8di)__W);
6660}
6661
6662static __inline__ __m512i __DEFAULT_FN_ATTRS512
6663_mm512_maskz_srai_epi64(__mmask8 __U, __m512i __A, unsigned int __B)
6664{
6665  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
6666                                          (__v8di)_mm512_srai_epi64(__A, __B),
6667                                          (__v8di)_mm512_setzero_si512());
6668}
6669
/* vshuff32x4: select 128-bit lanes (two from A, two from B) per imm. */
#define _mm512_shuffle_f32x4(A, B, imm) \
  ((__m512)__builtin_ia32_shuf_f32x4((__v16sf)(__m512)(A), \
                                     (__v16sf)(__m512)(B), (int)(imm)))

/* Merge-masked variant: elements with a clear bit in U come from W. */
#define _mm512_mask_shuffle_f32x4(W, U, A, B, imm) \
  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
                                       (__v16sf)_mm512_shuffle_f32x4((A), (B), (imm)), \
                                       (__v16sf)(__m512)(W)))

/* Zero-masked variant: elements with a clear bit in U are zeroed. */
#define _mm512_maskz_shuffle_f32x4(U, A, B, imm) \
  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
                                       (__v16sf)_mm512_shuffle_f32x4((A), (B), (imm)), \
                                       (__v16sf)_mm512_setzero_ps()))
6683
/* vshuff64x2: select 128-bit lanes (two from A, two from B) per imm. */
#define _mm512_shuffle_f64x2(A, B, imm) \
  ((__m512d)__builtin_ia32_shuf_f64x2((__v8df)(__m512d)(A), \
                                      (__v8df)(__m512d)(B), (int)(imm)))

/* Merge-masked variant: elements with a clear bit in U come from W. */
#define _mm512_mask_shuffle_f64x2(W, U, A, B, imm) \
  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
                                        (__v8df)_mm512_shuffle_f64x2((A), (B), (imm)), \
                                        (__v8df)(__m512d)(W)))

/* Zero-masked variant: elements with a clear bit in U are zeroed. */
#define _mm512_maskz_shuffle_f64x2(U, A, B, imm) \
  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
                                        (__v8df)_mm512_shuffle_f64x2((A), (B), (imm)), \
                                        (__v8df)_mm512_setzero_pd()))
6697
/* vshufi32x4: select 128-bit lanes (two from A, two from B) per imm. */
#define _mm512_shuffle_i32x4(A, B, imm) \
  ((__m512i)__builtin_ia32_shuf_i32x4((__v16si)(__m512i)(A), \
                                      (__v16si)(__m512i)(B), (int)(imm)))

/* Merge-masked variant: elements with a clear bit in U come from W. */
#define _mm512_mask_shuffle_i32x4(W, U, A, B, imm) \
  ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
                                       (__v16si)_mm512_shuffle_i32x4((A), (B), (imm)), \
                                       (__v16si)(__m512i)(W)))

/* Zero-masked variant: elements with a clear bit in U are zeroed. */
#define _mm512_maskz_shuffle_i32x4(U, A, B, imm) \
  ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
                                       (__v16si)_mm512_shuffle_i32x4((A), (B), (imm)), \
                                       (__v16si)_mm512_setzero_si512()))
6711
/* vshufi64x2: select 128-bit lanes (two from A, two from B) per imm. */
#define _mm512_shuffle_i64x2(A, B, imm) \
  ((__m512i)__builtin_ia32_shuf_i64x2((__v8di)(__m512i)(A), \
                                      (__v8di)(__m512i)(B), (int)(imm)))

/* Merge-masked variant: elements with a clear bit in U come from W. */
#define _mm512_mask_shuffle_i64x2(W, U, A, B, imm) \
  ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
                                       (__v8di)_mm512_shuffle_i64x2((A), (B), (imm)), \
                                       (__v8di)(__m512i)(W)))

/* Zero-masked variant: elements with a clear bit in U are zeroed. */
#define _mm512_maskz_shuffle_i64x2(U, A, B, imm) \
  ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
                                       (__v8di)_mm512_shuffle_i64x2((A), (B), (imm)), \
                                       (__v8di)_mm512_setzero_si512()))
6725
/* vshufpd: per-128-bit-lane interleave of doubles from A and B selected by
   the bits of M. */
#define _mm512_shuffle_pd(A, B, M) \
  ((__m512d)__builtin_ia32_shufpd512((__v8df)(__m512d)(A), \
                                     (__v8df)(__m512d)(B), (int)(M)))

/* Merge-masked variant: elements with a clear bit in U come from W. */
#define _mm512_mask_shuffle_pd(W, U, A, B, M) \
  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
                                        (__v8df)_mm512_shuffle_pd((A), (B), (M)), \
                                        (__v8df)(__m512d)(W)))

/* Zero-masked variant: elements with a clear bit in U are zeroed. */
#define _mm512_maskz_shuffle_pd(U, A, B, M) \
  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
                                        (__v8df)_mm512_shuffle_pd((A), (B), (M)), \
                                        (__v8df)_mm512_setzero_pd()))
6739
/* vshufps: per-128-bit-lane shuffle of floats from A and B selected by M. */
#define _mm512_shuffle_ps(A, B, M) \
  ((__m512)__builtin_ia32_shufps512((__v16sf)(__m512)(A), \
                                    (__v16sf)(__m512)(B), (int)(M)))

/* Merge-masked variant: elements with a clear bit in U come from W. */
#define _mm512_mask_shuffle_ps(W, U, A, B, M) \
  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
                                       (__v16sf)_mm512_shuffle_ps((A), (B), (M)), \
                                       (__v16sf)(__m512)(W)))

/* Zero-masked variant: elements with a clear bit in U are zeroed. */
#define _mm512_maskz_shuffle_ps(U, A, B, M) \
  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
                                       (__v16sf)_mm512_shuffle_ps((A), (B), (M)), \
                                       (__v16sf)_mm512_setzero_ps()))
6753
/* Square root of the low double of B with rounding mode R; the upper element
   is copied from A. */
#define _mm_sqrt_round_sd(A, B, R) \
  ((__m128d)__builtin_ia32_sqrtsd_round_mask((__v2df)(__m128d)(A), \
                                             (__v2df)(__m128d)(B), \
                                             (__v2df)_mm_setzero_pd(), \
                                             (__mmask8)-1, (int)(R)))
6759
6760static __inline__ __m128d __DEFAULT_FN_ATTRS128
6761_mm_mask_sqrt_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
6762{
6763 return (__m128d) __builtin_ia32_sqrtsd_round_mask ( (__v2df) __A,
6764                 (__v2df) __B,
6765                (__v2df) __W,
6766                (__mmask8) __U,
6767                _MM_FROUND_CUR_DIRECTION);
6768}
6769
/* Merge-masked scalar sqrt (double) with explicit rounding mode R. */
#define _mm_mask_sqrt_round_sd(W, U, A, B, R) \
  ((__m128d)__builtin_ia32_sqrtsd_round_mask((__v2df)(__m128d)(A), \
                                             (__v2df)(__m128d)(B), \
                                             (__v2df)(__m128d)(W), \
                                             (__mmask8)(U), (int)(R)))
6775
6776static __inline__ __m128d __DEFAULT_FN_ATTRS128
6777_mm_maskz_sqrt_sd (__mmask8 __U, __m128d __A, __m128d __B)
6778{
6779 return (__m128d) __builtin_ia32_sqrtsd_round_mask ( (__v2df) __A,
6780                 (__v2df) __B,
6781                (__v2df) _mm_setzero_pd (),
6782                (__mmask8) __U,
6783                _MM_FROUND_CUR_DIRECTION);
6784}
6785
/* Zero-masked scalar sqrt (double) with explicit rounding mode R. */
#define _mm_maskz_sqrt_round_sd(U, A, B, R) \
  ((__m128d)__builtin_ia32_sqrtsd_round_mask((__v2df)(__m128d)(A), \
                                             (__v2df)(__m128d)(B), \
                                             (__v2df)_mm_setzero_pd(), \
                                             (__mmask8)(U), (int)(R)))
6791
/* Square root of the low float of B with rounding mode R; the upper three
   elements are copied from A. */
#define _mm_sqrt_round_ss(A, B, R) \
  ((__m128)__builtin_ia32_sqrtss_round_mask((__v4sf)(__m128)(A), \
                                            (__v4sf)(__m128)(B), \
                                            (__v4sf)_mm_setzero_ps(), \
                                            (__mmask8)-1, (int)(R)))
6797
6798static __inline__ __m128 __DEFAULT_FN_ATTRS128
6799_mm_mask_sqrt_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
6800{
6801 return (__m128) __builtin_ia32_sqrtss_round_mask ( (__v4sf) __A,
6802                 (__v4sf) __B,
6803                (__v4sf) __W,
6804                (__mmask8) __U,
6805                _MM_FROUND_CUR_DIRECTION);
6806}
6807
/* Merge-masked scalar sqrt (float) with explicit rounding mode R. */
#define _mm_mask_sqrt_round_ss(W, U, A, B, R) \
  ((__m128)__builtin_ia32_sqrtss_round_mask((__v4sf)(__m128)(A), \
                                            (__v4sf)(__m128)(B), \
                                            (__v4sf)(__m128)(W), (__mmask8)(U), \
                                            (int)(R)))
6813
6814static __inline__ __m128 __DEFAULT_FN_ATTRS128
6815_mm_maskz_sqrt_ss (__mmask8 __U, __m128 __A, __m128 __B)
6816{
6817 return (__m128) __builtin_ia32_sqrtss_round_mask ( (__v4sf) __A,
6818                 (__v4sf) __B,
6819                (__v4sf) _mm_setzero_ps (),
6820                (__mmask8) __U,
6821                _MM_FROUND_CUR_DIRECTION);
6822}
6823
/* Zero-masked scalar sqrt (float) with explicit rounding mode R. */
#define _mm_maskz_sqrt_round_ss(U, A, B, R) \
  ((__m128)__builtin_ia32_sqrtss_round_mask((__v4sf)(__m128)(A), \
                                            (__v4sf)(__m128)(B), \
                                            (__v4sf)_mm_setzero_ps(), \
                                            (__mmask8)(U), (int)(R)))
6829
6830static __inline__ __m512 __DEFAULT_FN_ATTRS512
6831_mm512_broadcast_f32x4(__m128 __A)
6832{
6833  return (__m512)__builtin_shufflevector((__v4sf)__A, (__v4sf)__A,
6834                                         0, 1, 2, 3, 0, 1, 2, 3,
6835                                         0, 1, 2, 3, 0, 1, 2, 3);
6836}
6837
6838static __inline__ __m512 __DEFAULT_FN_ATTRS512
6839_mm512_mask_broadcast_f32x4(__m512 __O, __mmask16 __M, __m128 __A)
6840{
6841  return (__m512)__builtin_ia32_selectps_512((__mmask16)__M,
6842                                           (__v16sf)_mm512_broadcast_f32x4(__A),
6843                                           (__v16sf)__O);
6844}
6845
6846static __inline__ __m512 __DEFAULT_FN_ATTRS512
6847_mm512_maskz_broadcast_f32x4(__mmask16 __M, __m128 __A)
6848{
6849  return (__m512)__builtin_ia32_selectps_512((__mmask16)__M,
6850                                           (__v16sf)_mm512_broadcast_f32x4(__A),
6851                                           (__v16sf)_mm512_setzero_ps());
6852}
6853
6854static __inline__ __m512d __DEFAULT_FN_ATTRS512
6855_mm512_broadcast_f64x4(__m256d __A)
6856{
6857  return (__m512d)__builtin_shufflevector((__v4df)__A, (__v4df)__A,
6858                                          0, 1, 2, 3, 0, 1, 2, 3);
6859}
6860
6861static __inline__ __m512d __DEFAULT_FN_ATTRS512
6862_mm512_mask_broadcast_f64x4(__m512d __O, __mmask8 __M, __m256d __A)
6863{
6864  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__M,
6865                                            (__v8df)_mm512_broadcast_f64x4(__A),
6866                                            (__v8df)__O);
6867}
6868
6869static __inline__ __m512d __DEFAULT_FN_ATTRS512
6870_mm512_maskz_broadcast_f64x4(__mmask8 __M, __m256d __A)
6871{
6872  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__M,
6873                                            (__v8df)_mm512_broadcast_f64x4(__A),
6874                                            (__v8df)_mm512_setzero_pd());
6875}
6876
6877static __inline__ __m512i __DEFAULT_FN_ATTRS512
6878_mm512_broadcast_i32x4(__m128i __A)
6879{
6880  return (__m512i)__builtin_shufflevector((__v4si)__A, (__v4si)__A,
6881                                          0, 1, 2, 3, 0, 1, 2, 3,
6882                                          0, 1, 2, 3, 0, 1, 2, 3);
6883}
6884
6885static __inline__ __m512i __DEFAULT_FN_ATTRS512
6886_mm512_mask_broadcast_i32x4(__m512i __O, __mmask16 __M, __m128i __A)
6887{
6888  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
6889                                           (__v16si)_mm512_broadcast_i32x4(__A),
6890                                           (__v16si)__O);
6891}
6892
6893static __inline__ __m512i __DEFAULT_FN_ATTRS512
6894_mm512_maskz_broadcast_i32x4(__mmask16 __M, __m128i __A)
6895{
6896  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
6897                                           (__v16si)_mm512_broadcast_i32x4(__A),
6898                                           (__v16si)_mm512_setzero_si512());
6899}
6900
6901static __inline__ __m512i __DEFAULT_FN_ATTRS512
6902_mm512_broadcast_i64x4(__m256i __A)
6903{
6904  return (__m512i)__builtin_shufflevector((__v4di)__A, (__v4di)__A,
6905                                          0, 1, 2, 3, 0, 1, 2, 3);
6906}
6907
6908static __inline__ __m512i __DEFAULT_FN_ATTRS512
6909_mm512_mask_broadcast_i64x4(__m512i __O, __mmask8 __M, __m256i __A)
6910{
6911  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
6912                                            (__v8di)_mm512_broadcast_i64x4(__A),
6913                                            (__v8di)__O);
6914}
6915
6916static __inline__ __m512i __DEFAULT_FN_ATTRS512
6917_mm512_maskz_broadcast_i64x4(__mmask8 __M, __m256i __A)
6918{
6919  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
6920                                            (__v8di)_mm512_broadcast_i64x4(__A),
6921                                            (__v8di)_mm512_setzero_si512());
6922}
6923
6924static __inline__ __m512d __DEFAULT_FN_ATTRS512
6925_mm512_mask_broadcastsd_pd (__m512d __O, __mmask8 __M, __m128d __A)
6926{
6927  return (__m512d)__builtin_ia32_selectpd_512(__M,
6928                                              (__v8df) _mm512_broadcastsd_pd(__A),
6929                                              (__v8df) __O);
6930}
6931
6932static __inline__ __m512d __DEFAULT_FN_ATTRS512
6933_mm512_maskz_broadcastsd_pd (__mmask8 __M, __m128d __A)
6934{
6935  return (__m512d)__builtin_ia32_selectpd_512(__M,
6936                                              (__v8df) _mm512_broadcastsd_pd(__A),
6937                                              (__v8df) _mm512_setzero_pd());
6938}
6939
6940static __inline__ __m512 __DEFAULT_FN_ATTRS512
6941_mm512_mask_broadcastss_ps (__m512 __O, __mmask16 __M, __m128 __A)
6942{
6943  return (__m512)__builtin_ia32_selectps_512(__M,
6944                                             (__v16sf) _mm512_broadcastss_ps(__A),
6945                                             (__v16sf) __O);
6946}
6947
6948static __inline__ __m512 __DEFAULT_FN_ATTRS512
6949_mm512_maskz_broadcastss_ps (__mmask16 __M, __m128 __A)
6950{
6951  return (__m512)__builtin_ia32_selectps_512(__M,
6952                                             (__v16sf) _mm512_broadcastss_ps(__A),
6953                                             (__v16sf) _mm512_setzero_ps());
6954}
6955
6956static __inline__ __m128i __DEFAULT_FN_ATTRS512
6957_mm512_cvtsepi32_epi8 (__m512i __A)
6958{
6959  return (__m128i) __builtin_ia32_pmovsdb512_mask ((__v16si) __A,
6960               (__v16qi) _mm_undefined_si128 (),
6961               (__mmask16) -1);
6962}
6963
6964static __inline__ __m128i __DEFAULT_FN_ATTRS512
6965_mm512_mask_cvtsepi32_epi8 (__m128i __O, __mmask16 __M, __m512i __A)
6966{
6967  return (__m128i) __builtin_ia32_pmovsdb512_mask ((__v16si) __A,
6968               (__v16qi) __O, __M);
6969}
6970
6971static __inline__ __m128i __DEFAULT_FN_ATTRS512
6972_mm512_maskz_cvtsepi32_epi8 (__mmask16 __M, __m512i __A)
6973{
6974  return (__m128i) __builtin_ia32_pmovsdb512_mask ((__v16si) __A,
6975               (__v16qi) _mm_setzero_si128 (),
6976               __M);
6977}
6978
6979static __inline__ void __DEFAULT_FN_ATTRS512
6980_mm512_mask_cvtsepi32_storeu_epi8 (void * __P, __mmask16 __M, __m512i __A)
6981{
6982  __builtin_ia32_pmovsdb512mem_mask ((__v16qi *) __P, (__v16si) __A, __M);
6983}
6984
6985static __inline__ __m256i __DEFAULT_FN_ATTRS512
6986_mm512_cvtsepi32_epi16 (__m512i __A)
6987{
6988  return (__m256i) __builtin_ia32_pmovsdw512_mask ((__v16si) __A,
6989               (__v16hi) _mm256_undefined_si256 (),
6990               (__mmask16) -1);
6991}
6992
6993static __inline__ __m256i __DEFAULT_FN_ATTRS512
6994_mm512_mask_cvtsepi32_epi16 (__m256i __O, __mmask16 __M, __m512i __A)
6995{
6996  return (__m256i) __builtin_ia32_pmovsdw512_mask ((__v16si) __A,
6997               (__v16hi) __O, __M);
6998}
6999
7000static __inline__ __m256i __DEFAULT_FN_ATTRS512
7001_mm512_maskz_cvtsepi32_epi16 (__mmask16 __M, __m512i __A)
7002{
7003  return (__m256i) __builtin_ia32_pmovsdw512_mask ((__v16si) __A,
7004               (__v16hi) _mm256_setzero_si256 (),
7005               __M);
7006}
7007
7008static __inline__ void __DEFAULT_FN_ATTRS512
7009_mm512_mask_cvtsepi32_storeu_epi16 (void *__P, __mmask16 __M, __m512i __A)
7010{
7011  __builtin_ia32_pmovsdw512mem_mask ((__v16hi*) __P, (__v16si) __A, __M);
7012}
7013
7014static __inline__ __m128i __DEFAULT_FN_ATTRS512
7015_mm512_cvtsepi64_epi8 (__m512i __A)
7016{
7017  return (__m128i) __builtin_ia32_pmovsqb512_mask ((__v8di) __A,
7018               (__v16qi) _mm_undefined_si128 (),
7019               (__mmask8) -1);
7020}
7021
7022static __inline__ __m128i __DEFAULT_FN_ATTRS512
7023_mm512_mask_cvtsepi64_epi8 (__m128i __O, __mmask8 __M, __m512i __A)
7024{
7025  return (__m128i) __builtin_ia32_pmovsqb512_mask ((__v8di) __A,
7026               (__v16qi) __O, __M);
7027}
7028
7029static __inline__ __m128i __DEFAULT_FN_ATTRS512
7030_mm512_maskz_cvtsepi64_epi8 (__mmask8 __M, __m512i __A)
7031{
7032  return (__m128i) __builtin_ia32_pmovsqb512_mask ((__v8di) __A,
7033               (__v16qi) _mm_setzero_si128 (),
7034               __M);
7035}
7036
7037static __inline__ void __DEFAULT_FN_ATTRS512
7038_mm512_mask_cvtsepi64_storeu_epi8 (void * __P, __mmask8 __M, __m512i __A)
7039{
7040  __builtin_ia32_pmovsqb512mem_mask ((__v16qi *) __P, (__v8di) __A, __M);
7041}
7042
7043static __inline__ __m256i __DEFAULT_FN_ATTRS512
7044_mm512_cvtsepi64_epi32 (__m512i __A)
7045{
7046  return (__m256i) __builtin_ia32_pmovsqd512_mask ((__v8di) __A,
7047               (__v8si) _mm256_undefined_si256 (),
7048               (__mmask8) -1);
7049}
7050
7051static __inline__ __m256i __DEFAULT_FN_ATTRS512
7052_mm512_mask_cvtsepi64_epi32 (__m256i __O, __mmask8 __M, __m512i __A)
7053{
7054  return (__m256i) __builtin_ia32_pmovsqd512_mask ((__v8di) __A,
7055               (__v8si) __O, __M);
7056}
7057
7058static __inline__ __m256i __DEFAULT_FN_ATTRS512
7059_mm512_maskz_cvtsepi64_epi32 (__mmask8 __M, __m512i __A)
7060{
7061  return (__m256i) __builtin_ia32_pmovsqd512_mask ((__v8di) __A,
7062               (__v8si) _mm256_setzero_si256 (),
7063               __M);
7064}
7065
7066static __inline__ void __DEFAULT_FN_ATTRS512
7067_mm512_mask_cvtsepi64_storeu_epi32 (void *__P, __mmask8 __M, __m512i __A)
7068{
7069  __builtin_ia32_pmovsqd512mem_mask ((__v8si *) __P, (__v8di) __A, __M);
7070}
7071
7072static __inline__ __m128i __DEFAULT_FN_ATTRS512
7073_mm512_cvtsepi64_epi16 (__m512i __A)
7074{
7075  return (__m128i) __builtin_ia32_pmovsqw512_mask ((__v8di) __A,
7076               (__v8hi) _mm_undefined_si128 (),
7077               (__mmask8) -1);
7078}
7079
7080static __inline__ __m128i __DEFAULT_FN_ATTRS512
7081_mm512_mask_cvtsepi64_epi16 (__m128i __O, __mmask8 __M, __m512i __A)
7082{
7083  return (__m128i) __builtin_ia32_pmovsqw512_mask ((__v8di) __A,
7084               (__v8hi) __O, __M);
7085}
7086
7087static __inline__ __m128i __DEFAULT_FN_ATTRS512
7088_mm512_maskz_cvtsepi64_epi16 (__mmask8 __M, __m512i __A)
7089{
7090  return (__m128i) __builtin_ia32_pmovsqw512_mask ((__v8di) __A,
7091               (__v8hi) _mm_setzero_si128 (),
7092               __M);
7093}
7094
7095static __inline__ void __DEFAULT_FN_ATTRS512
7096_mm512_mask_cvtsepi64_storeu_epi16 (void * __P, __mmask8 __M, __m512i __A)
7097{
7098  __builtin_ia32_pmovsqw512mem_mask ((__v8hi *) __P, (__v8di) __A, __M);
7099}
7100
7101static __inline__ __m128i __DEFAULT_FN_ATTRS512
7102_mm512_cvtusepi32_epi8 (__m512i __A)
7103{
7104  return (__m128i) __builtin_ia32_pmovusdb512_mask ((__v16si) __A,
7105                (__v16qi) _mm_undefined_si128 (),
7106                (__mmask16) -1);
7107}
7108
7109static __inline__ __m128i __DEFAULT_FN_ATTRS512
7110_mm512_mask_cvtusepi32_epi8 (__m128i __O, __mmask16 __M, __m512i __A)
7111{
7112  return (__m128i) __builtin_ia32_pmovusdb512_mask ((__v16si) __A,
7113                (__v16qi) __O,
7114                __M);
7115}
7116
7117static __inline__ __m128i __DEFAULT_FN_ATTRS512
7118_mm512_maskz_cvtusepi32_epi8 (__mmask16 __M, __m512i __A)
7119{
7120  return (__m128i) __builtin_ia32_pmovusdb512_mask ((__v16si) __A,
7121                (__v16qi) _mm_setzero_si128 (),
7122                __M);
7123}
7124
7125static __inline__ void __DEFAULT_FN_ATTRS512
7126_mm512_mask_cvtusepi32_storeu_epi8 (void * __P, __mmask16 __M, __m512i __A)
7127{
7128  __builtin_ia32_pmovusdb512mem_mask ((__v16qi *) __P, (__v16si) __A, __M);
7129}
7130
7131static __inline__ __m256i __DEFAULT_FN_ATTRS512
7132_mm512_cvtusepi32_epi16 (__m512i __A)
7133{
7134  return (__m256i) __builtin_ia32_pmovusdw512_mask ((__v16si) __A,
7135                (__v16hi) _mm256_undefined_si256 (),
7136                (__mmask16) -1);
7137}
7138
7139static __inline__ __m256i __DEFAULT_FN_ATTRS512
7140_mm512_mask_cvtusepi32_epi16 (__m256i __O, __mmask16 __M, __m512i __A)
7141{
7142  return (__m256i) __builtin_ia32_pmovusdw512_mask ((__v16si) __A,
7143                (__v16hi) __O,
7144                __M);
7145}
7146
7147static __inline__ __m256i __DEFAULT_FN_ATTRS512
7148_mm512_maskz_cvtusepi32_epi16 (__mmask16 __M, __m512i __A)
7149{
7150  return (__m256i) __builtin_ia32_pmovusdw512_mask ((__v16si) __A,
7151                (__v16hi) _mm256_setzero_si256 (),
7152                __M);
7153}
7154
7155static __inline__ void __DEFAULT_FN_ATTRS512
7156_mm512_mask_cvtusepi32_storeu_epi16 (void *__P, __mmask16 __M, __m512i __A)
7157{
7158  __builtin_ia32_pmovusdw512mem_mask ((__v16hi*) __P, (__v16si) __A, __M);
7159}
7160
7161static __inline__ __m128i __DEFAULT_FN_ATTRS512
7162_mm512_cvtusepi64_epi8 (__m512i __A)
7163{
7164  return (__m128i) __builtin_ia32_pmovusqb512_mask ((__v8di) __A,
7165                (__v16qi) _mm_undefined_si128 (),
7166                (__mmask8) -1);
7167}
7168
7169static __inline__ __m128i __DEFAULT_FN_ATTRS512
7170_mm512_mask_cvtusepi64_epi8 (__m128i __O, __mmask8 __M, __m512i __A)
7171{
7172  return (__m128i) __builtin_ia32_pmovusqb512_mask ((__v8di) __A,
7173                (__v16qi) __O,
7174                __M);
7175}
7176
7177static __inline__ __m128i __DEFAULT_FN_ATTRS512
7178_mm512_maskz_cvtusepi64_epi8 (__mmask8 __M, __m512i __A)
7179{
7180  return (__m128i) __builtin_ia32_pmovusqb512_mask ((__v8di) __A,
7181                (__v16qi) _mm_setzero_si128 (),
7182                __M);
7183}
7184
7185static __inline__ void __DEFAULT_FN_ATTRS512
7186_mm512_mask_cvtusepi64_storeu_epi8 (void * __P, __mmask8 __M, __m512i __A)
7187{
7188  __builtin_ia32_pmovusqb512mem_mask ((__v16qi *) __P, (__v8di) __A, __M);
7189}
7190
7191static __inline__ __m256i __DEFAULT_FN_ATTRS512
7192_mm512_cvtusepi64_epi32 (__m512i __A)
7193{
7194  return (__m256i) __builtin_ia32_pmovusqd512_mask ((__v8di) __A,
7195                (__v8si) _mm256_undefined_si256 (),
7196                (__mmask8) -1);
7197}
7198
7199static __inline__ __m256i __DEFAULT_FN_ATTRS512
7200_mm512_mask_cvtusepi64_epi32 (__m256i __O, __mmask8 __M, __m512i __A)
7201{
7202  return (__m256i) __builtin_ia32_pmovusqd512_mask ((__v8di) __A,
7203                (__v8si) __O, __M);
7204}
7205
7206static __inline__ __m256i __DEFAULT_FN_ATTRS512
7207_mm512_maskz_cvtusepi64_epi32 (__mmask8 __M, __m512i __A)
7208{
7209  return (__m256i) __builtin_ia32_pmovusqd512_mask ((__v8di) __A,
7210                (__v8si) _mm256_setzero_si256 (),
7211                __M);
7212}
7213
7214static __inline__ void __DEFAULT_FN_ATTRS512
7215_mm512_mask_cvtusepi64_storeu_epi32 (void* __P, __mmask8 __M, __m512i __A)
7216{
7217  __builtin_ia32_pmovusqd512mem_mask ((__v8si*) __P, (__v8di) __A, __M);
7218}
7219
7220static __inline__ __m128i __DEFAULT_FN_ATTRS512
7221_mm512_cvtusepi64_epi16 (__m512i __A)
7222{
7223  return (__m128i) __builtin_ia32_pmovusqw512_mask ((__v8di) __A,
7224                (__v8hi) _mm_undefined_si128 (),
7225                (__mmask8) -1);
7226}
7227
7228static __inline__ __m128i __DEFAULT_FN_ATTRS512
7229_mm512_mask_cvtusepi64_epi16 (__m128i __O, __mmask8 __M, __m512i __A)
7230{
7231  return (__m128i) __builtin_ia32_pmovusqw512_mask ((__v8di) __A,
7232                (__v8hi) __O, __M);
7233}
7234
7235static __inline__ __m128i __DEFAULT_FN_ATTRS512
7236_mm512_maskz_cvtusepi64_epi16 (__mmask8 __M, __m512i __A)
7237{
7238  return (__m128i) __builtin_ia32_pmovusqw512_mask ((__v8di) __A,
7239                (__v8hi) _mm_setzero_si128 (),
7240                __M);
7241}
7242
7243static __inline__ void __DEFAULT_FN_ATTRS512
7244_mm512_mask_cvtusepi64_storeu_epi16 (void *__P, __mmask8 __M, __m512i __A)
7245{
7246  __builtin_ia32_pmovusqw512mem_mask ((__v8hi*) __P, (__v8di) __A, __M);
7247}
7248
/* VPMOVDB: truncate 16 signed 32-bit elements of __A to 8 bits
 * (plain truncation, no saturation).  Unmasked form. */
static __inline__ __m128i __DEFAULT_FN_ATTRS512
_mm512_cvtepi32_epi8 (__m512i __A)
{
  return (__m128i) __builtin_ia32_pmovdb512_mask ((__v16si) __A,
              (__v16qi) _mm_undefined_si128 (),
              (__mmask16) -1);
}

/* Merge-masking: inactive lanes take the element from __O. */
static __inline__ __m128i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepi32_epi8 (__m128i __O, __mmask16 __M, __m512i __A)
{
  return (__m128i) __builtin_ia32_pmovdb512_mask ((__v16si) __A,
              (__v16qi) __O, __M);
}

/* Zero-masking: inactive lanes are zeroed. */
static __inline__ __m128i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtepi32_epi8 (__mmask16 __M, __m512i __A)
{
  return (__m128i) __builtin_ia32_pmovdb512_mask ((__v16si) __A,
              (__v16qi) _mm_setzero_si128 (),
              __M);
}

/* Masked store of the truncated bytes to unaligned memory. */
static __inline__ void __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepi32_storeu_epi8 (void * __P, __mmask16 __M, __m512i __A)
{
  __builtin_ia32_pmovdb512mem_mask ((__v16qi *) __P, (__v16si) __A, __M);
}
7277
/* VPMOVDW: truncate 16 signed 32-bit elements of __A to 16 bits
 * (plain truncation).  Unmasked form. */
static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_cvtepi32_epi16 (__m512i __A)
{
  return (__m256i) __builtin_ia32_pmovdw512_mask ((__v16si) __A,
              (__v16hi) _mm256_undefined_si256 (),
              (__mmask16) -1);
}

/* Merge-masking: inactive lanes take the element from __O. */
static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepi32_epi16 (__m256i __O, __mmask16 __M, __m512i __A)
{
  return (__m256i) __builtin_ia32_pmovdw512_mask ((__v16si) __A,
              (__v16hi) __O, __M);
}

/* Zero-masking: inactive lanes are zeroed. */
static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtepi32_epi16 (__mmask16 __M, __m512i __A)
{
  return (__m256i) __builtin_ia32_pmovdw512_mask ((__v16si) __A,
              (__v16hi) _mm256_setzero_si256 (),
              __M);
}

/* Masked store of the truncated words to unaligned memory. */
static __inline__ void __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepi32_storeu_epi16 (void * __P, __mmask16 __M, __m512i __A)
{
  __builtin_ia32_pmovdw512mem_mask ((__v16hi *) __P, (__v16si) __A, __M);
}
7306
/* VPMOVQB: truncate 8 signed 64-bit elements of __A to 8 bits; the
 * result occupies the low 64 bits of the __m128i.  Unmasked form. */
static __inline__ __m128i __DEFAULT_FN_ATTRS512
_mm512_cvtepi64_epi8 (__m512i __A)
{
  return (__m128i) __builtin_ia32_pmovqb512_mask ((__v8di) __A,
              (__v16qi) _mm_undefined_si128 (),
              (__mmask8) -1);
}

/* Merge-masking: inactive lanes take the element from __O. */
static __inline__ __m128i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepi64_epi8 (__m128i __O, __mmask8 __M, __m512i __A)
{
  return (__m128i) __builtin_ia32_pmovqb512_mask ((__v8di) __A,
              (__v16qi) __O, __M);
}

/* Zero-masking: inactive lanes are zeroed. */
static __inline__ __m128i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtepi64_epi8 (__mmask8 __M, __m512i __A)
{
  return (__m128i) __builtin_ia32_pmovqb512_mask ((__v8di) __A,
              (__v16qi) _mm_setzero_si128 (),
              __M);
}

/* Masked store of the truncated bytes to unaligned memory. */
static __inline__ void __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepi64_storeu_epi8 (void * __P, __mmask8 __M, __m512i __A)
{
  __builtin_ia32_pmovqb512mem_mask ((__v16qi *) __P, (__v8di) __A, __M);
}
7335
/* VPMOVQD: truncate 8 signed 64-bit elements of __A to 32 bits
 * (plain truncation).  Unmasked form. */
static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_cvtepi64_epi32 (__m512i __A)
{
  return (__m256i) __builtin_ia32_pmovqd512_mask ((__v8di) __A,
              (__v8si) _mm256_undefined_si256 (),
              (__mmask8) -1);
}

/* Merge-masking: inactive lanes take the element from __O. */
static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepi64_epi32 (__m256i __O, __mmask8 __M, __m512i __A)
{
  return (__m256i) __builtin_ia32_pmovqd512_mask ((__v8di) __A,
              (__v8si) __O, __M);
}

/* Zero-masking: inactive lanes are zeroed. */
static __inline__ __m256i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtepi64_epi32 (__mmask8 __M, __m512i __A)
{
  return (__m256i) __builtin_ia32_pmovqd512_mask ((__v8di) __A,
              (__v8si) _mm256_setzero_si256 (),
              __M);
}

/* Masked store of the truncated dwords to unaligned memory. */
static __inline__ void __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepi64_storeu_epi32 (void* __P, __mmask8 __M, __m512i __A)
{
  __builtin_ia32_pmovqd512mem_mask ((__v8si *) __P, (__v8di) __A, __M);
}
7364
/* VPMOVQW: truncate 8 signed 64-bit elements of __A to 16 bits
 * (plain truncation).  Unmasked form. */
static __inline__ __m128i __DEFAULT_FN_ATTRS512
_mm512_cvtepi64_epi16 (__m512i __A)
{
  return (__m128i) __builtin_ia32_pmovqw512_mask ((__v8di) __A,
              (__v8hi) _mm_undefined_si128 (),
              (__mmask8) -1);
}

/* Merge-masking: inactive lanes take the element from __O. */
static __inline__ __m128i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepi64_epi16 (__m128i __O, __mmask8 __M, __m512i __A)
{
  return (__m128i) __builtin_ia32_pmovqw512_mask ((__v8di) __A,
              (__v8hi) __O, __M);
}

/* Zero-masking: inactive lanes are zeroed. */
static __inline__ __m128i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtepi64_epi16 (__mmask8 __M, __m512i __A)
{
  return (__m128i) __builtin_ia32_pmovqw512_mask ((__v8di) __A,
              (__v8hi) _mm_setzero_si128 (),
              __M);
}

/* Masked store of the truncated words to unaligned memory. */
static __inline__ void __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepi64_storeu_epi16 (void *__P, __mmask8 __M, __m512i __A)
{
  __builtin_ia32_pmovqw512mem_mask ((__v8hi *) __P, (__v8di) __A, __M);
}
7393
/* VEXTRACTI32X4: extract the 128-bit lane selected by imm (0-3) from A.
 * Unmasked / merge-masked (blend with W) / zero-masked variants. */
#define _mm512_extracti32x4_epi32(A, imm) \
  ((__m128i)__builtin_ia32_extracti32x4_mask((__v16si)(__m512i)(A), (int)(imm), \
                                             (__v4si)_mm_undefined_si128(), \
                                             (__mmask8)-1))

#define _mm512_mask_extracti32x4_epi32(W, U, A, imm) \
  ((__m128i)__builtin_ia32_extracti32x4_mask((__v16si)(__m512i)(A), (int)(imm), \
                                             (__v4si)(__m128i)(W), \
                                             (__mmask8)(U)))

#define _mm512_maskz_extracti32x4_epi32(U, A, imm) \
  ((__m128i)__builtin_ia32_extracti32x4_mask((__v16si)(__m512i)(A), (int)(imm), \
                                             (__v4si)_mm_setzero_si128(), \
                                             (__mmask8)(U)))

/* VEXTRACTI64X4: extract the 256-bit half selected by imm (0-1) from A. */
#define _mm512_extracti64x4_epi64(A, imm) \
  ((__m256i)__builtin_ia32_extracti64x4_mask((__v8di)(__m512i)(A), (int)(imm), \
                                             (__v4di)_mm256_undefined_si256(), \
                                             (__mmask8)-1))

#define _mm512_mask_extracti64x4_epi64(W, U, A, imm) \
  ((__m256i)__builtin_ia32_extracti64x4_mask((__v8di)(__m512i)(A), (int)(imm), \
                                             (__v4di)(__m256i)(W), \
                                             (__mmask8)(U)))

#define _mm512_maskz_extracti64x4_epi64(U, A, imm) \
  ((__m256i)__builtin_ia32_extracti64x4_mask((__v8di)(__m512i)(A), (int)(imm), \
                                             (__v4di)_mm256_setzero_si256(), \
                                             (__mmask8)(U)))
7423
/* VINSERTF64X4: insert 256-bit B into the half of A selected by imm.
 * The masked forms are built from the unmasked insert plus a per-element
 * select, blending with W (mask) or zero (maskz). */
#define _mm512_insertf64x4(A, B, imm) \
  ((__m512d)__builtin_ia32_insertf64x4((__v8df)(__m512d)(A), \
                                       (__v4df)(__m256d)(B), (int)(imm)))

#define _mm512_mask_insertf64x4(W, U, A, B, imm) \
  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
                                   (__v8df)_mm512_insertf64x4((A), (B), (imm)), \
                                   (__v8df)(__m512d)(W)))

#define _mm512_maskz_insertf64x4(U, A, B, imm) \
  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
                                   (__v8df)_mm512_insertf64x4((A), (B), (imm)), \
                                   (__v8df)_mm512_setzero_pd()))

/* VINSERTI64X4: integer counterpart of the above. */
#define _mm512_inserti64x4(A, B, imm) \
  ((__m512i)__builtin_ia32_inserti64x4((__v8di)(__m512i)(A), \
                                       (__v4di)(__m256i)(B), (int)(imm)))

#define _mm512_mask_inserti64x4(W, U, A, B, imm) \
  ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
                                   (__v8di)_mm512_inserti64x4((A), (B), (imm)), \
                                   (__v8di)(__m512i)(W)))

#define _mm512_maskz_inserti64x4(U, A, B, imm) \
  ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
                                   (__v8di)_mm512_inserti64x4((A), (B), (imm)), \
                                   (__v8di)_mm512_setzero_si512()))

/* VINSERTF32X4: insert 128-bit B into the lane of A selected by imm (0-3). */
#define _mm512_insertf32x4(A, B, imm) \
  ((__m512)__builtin_ia32_insertf32x4((__v16sf)(__m512)(A), \
                                      (__v4sf)(__m128)(B), (int)(imm)))

#define _mm512_mask_insertf32x4(W, U, A, B, imm) \
  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
                                  (__v16sf)_mm512_insertf32x4((A), (B), (imm)), \
                                  (__v16sf)(__m512)(W)))

#define _mm512_maskz_insertf32x4(U, A, B, imm) \
  ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
                                  (__v16sf)_mm512_insertf32x4((A), (B), (imm)), \
                                  (__v16sf)_mm512_setzero_ps()))

/* VINSERTI32X4: integer counterpart of the above. */
#define _mm512_inserti32x4(A, B, imm) \
  ((__m512i)__builtin_ia32_inserti32x4((__v16si)(__m512i)(A), \
                                       (__v4si)(__m128i)(B), (int)(imm)))

#define _mm512_mask_inserti32x4(W, U, A, B, imm) \
  ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
                                  (__v16si)_mm512_inserti32x4((A), (B), (imm)), \
                                  (__v16si)(__m512i)(W)))

#define _mm512_maskz_inserti32x4(U, A, B, imm) \
  ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
                                  (__v16si)_mm512_inserti32x4((A), (B), (imm)), \
                                  (__v16si)_mm512_setzero_si512()))
7479
/* VGETMANTPD/VGETMANTPS: extract the normalized mantissa of each element.
 * B is the interval specifier and C the sign control; they are packed into
 * the builtin's immediate as ((C)<<2)|(B).  R is the rounding/SAE control
 * for the _round_ forms; the non-round forms use _MM_FROUND_CUR_DIRECTION. */
#define _mm512_getmant_round_pd(A, B, C, R) \
  ((__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \
                                             (int)(((C)<<2) | (B)), \
                                             (__v8df)_mm512_undefined_pd(), \
                                             (__mmask8)-1, (int)(R)))

#define _mm512_mask_getmant_round_pd(W, U, A, B, C, R) \
  ((__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \
                                             (int)(((C)<<2) | (B)), \
                                             (__v8df)(__m512d)(W), \
                                             (__mmask8)(U), (int)(R)))

#define _mm512_maskz_getmant_round_pd(U, A, B, C, R) \
  ((__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \
                                             (int)(((C)<<2) | (B)), \
                                             (__v8df)_mm512_setzero_pd(), \
                                             (__mmask8)(U), (int)(R)))

/* Note: with an all-ones mask the passthrough argument is never used, so
 * passing setzero here (vs. undefined in the ps variant) is harmless. */
#define _mm512_getmant_pd(A, B, C) \
  ((__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \
                                             (int)(((C)<<2) | (B)), \
                                             (__v8df)_mm512_setzero_pd(), \
                                             (__mmask8)-1, \
                                             _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_getmant_pd(W, U, A, B, C) \
  ((__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \
                                             (int)(((C)<<2) | (B)), \
                                             (__v8df)(__m512d)(W), \
                                             (__mmask8)(U), \
                                             _MM_FROUND_CUR_DIRECTION))

#define _mm512_maskz_getmant_pd(U, A, B, C) \
  ((__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \
                                             (int)(((C)<<2) | (B)), \
                                             (__v8df)_mm512_setzero_pd(), \
                                             (__mmask8)(U), \
                                             _MM_FROUND_CUR_DIRECTION))

/* Single-precision variants (VGETMANTPS), same immediate packing. */
#define _mm512_getmant_round_ps(A, B, C, R) \
  ((__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \
                                            (int)(((C)<<2) | (B)), \
                                            (__v16sf)_mm512_undefined_ps(), \
                                            (__mmask16)-1, (int)(R)))

#define _mm512_mask_getmant_round_ps(W, U, A, B, C, R) \
  ((__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \
                                            (int)(((C)<<2) | (B)), \
                                            (__v16sf)(__m512)(W), \
                                            (__mmask16)(U), (int)(R)))

#define _mm512_maskz_getmant_round_ps(U, A, B, C, R) \
  ((__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \
                                            (int)(((C)<<2) | (B)), \
                                            (__v16sf)_mm512_setzero_ps(), \
                                            (__mmask16)(U), (int)(R)))

#define _mm512_getmant_ps(A, B, C) \
  ((__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \
                                            (int)(((C)<<2)|(B)), \
                                            (__v16sf)_mm512_undefined_ps(), \
                                            (__mmask16)-1, \
                                            _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_getmant_ps(W, U, A, B, C) \
  ((__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \
                                            (int)(((C)<<2)|(B)), \
                                            (__v16sf)(__m512)(W), \
                                            (__mmask16)(U), \
                                            _MM_FROUND_CUR_DIRECTION))

#define _mm512_maskz_getmant_ps(U, A, B, C) \
  ((__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \
                                            (int)(((C)<<2)|(B)), \
                                            (__v16sf)_mm512_setzero_ps(), \
                                            (__mmask16)(U), \
                                            _MM_FROUND_CUR_DIRECTION))
7557
/* VGETEXPPD with explicit SAE/rounding control R: extract the biased
 * exponent of each double element as a double. */
#define _mm512_getexp_round_pd(A, R) \
  ((__m512d)__builtin_ia32_getexppd512_mask((__v8df)(__m512d)(A), \
                                            (__v8df)_mm512_undefined_pd(), \
                                            (__mmask8)-1, (int)(R)))

#define _mm512_mask_getexp_round_pd(W, U, A, R) \
  ((__m512d)__builtin_ia32_getexppd512_mask((__v8df)(__m512d)(A), \
                                            (__v8df)(__m512d)(W), \
                                            (__mmask8)(U), (int)(R)))

#define _mm512_maskz_getexp_round_pd(U, A, R) \
  ((__m512d)__builtin_ia32_getexppd512_mask((__v8df)(__m512d)(A), \
                                            (__v8df)_mm512_setzero_pd(), \
                                            (__mmask8)(U), (int)(R)))
7572
/* VGETEXPPD with the current rounding mode (no explicit R argument).
 * Unmasked / merge-masked / zero-masked variants. */
static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_getexp_pd (__m512d __A)
{
  return (__m512d) __builtin_ia32_getexppd512_mask ((__v8df) __A,
                (__v8df) _mm512_undefined_pd (),
                (__mmask8) -1,
                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_getexp_pd (__m512d __W, __mmask8 __U, __m512d __A)
{
  return (__m512d) __builtin_ia32_getexppd512_mask ((__v8df) __A,
                (__v8df) __W,
                (__mmask8) __U,
                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_maskz_getexp_pd (__mmask8 __U, __m512d __A)
{
  return (__m512d) __builtin_ia32_getexppd512_mask ((__v8df) __A,
                (__v8df) _mm512_setzero_pd (),
                (__mmask8) __U,
                _MM_FROUND_CUR_DIRECTION);
}
7599
/* VGETEXPPS with explicit SAE/rounding control R (single-precision
 * counterpart of _mm512_getexp_round_pd above). */
#define _mm512_getexp_round_ps(A, R) \
  ((__m512)__builtin_ia32_getexpps512_mask((__v16sf)(__m512)(A), \
                                           (__v16sf)_mm512_undefined_ps(), \
                                           (__mmask16)-1, (int)(R)))

#define _mm512_mask_getexp_round_ps(W, U, A, R) \
  ((__m512)__builtin_ia32_getexpps512_mask((__v16sf)(__m512)(A), \
                                           (__v16sf)(__m512)(W), \
                                           (__mmask16)(U), (int)(R)))

#define _mm512_maskz_getexp_round_ps(U, A, R) \
  ((__m512)__builtin_ia32_getexpps512_mask((__v16sf)(__m512)(A), \
                                           (__v16sf)_mm512_setzero_ps(), \
                                           (__mmask16)(U), (int)(R)))
7614
/* VGETEXPPS with the current rounding mode.
 * Unmasked / merge-masked / zero-masked variants. */
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_getexp_ps (__m512 __A)
{
  return (__m512) __builtin_ia32_getexpps512_mask ((__v16sf) __A,
               (__v16sf) _mm512_undefined_ps (),
               (__mmask16) -1,
               _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_getexp_ps (__m512 __W, __mmask16 __U, __m512 __A)
{
  return (__m512) __builtin_ia32_getexpps512_mask ((__v16sf) __A,
               (__v16sf) __W,
               (__mmask16) __U,
               _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_getexp_ps (__mmask16 __U, __m512 __A)
{
  return (__m512) __builtin_ia32_getexpps512_mask ((__v16sf) __A,
               (__v16sf) _mm512_setzero_ps (),
               (__mmask16) __U,
               _MM_FROUND_CUR_DIRECTION);
}
7641
/* Gathers with 64-bit indices: load 8 elements from addr + index[i]*scale.
 * Narrowing results (ps / epi32) come back in a 256-bit vector; the masked
 * forms leave inactive lanes equal to v1_old. */
#define _mm512_i64gather_ps(index, addr, scale) \
  ((__m256)__builtin_ia32_gatherdiv16sf((__v8sf)_mm256_undefined_ps(), \
                                        (void const *)(addr), \
                                        (__v8di)(__m512i)(index), (__mmask8)-1, \
                                        (int)(scale)))

#define _mm512_mask_i64gather_ps(v1_old, mask, index, addr, scale) \
  ((__m256)__builtin_ia32_gatherdiv16sf((__v8sf)(__m256)(v1_old),\
                                        (void const *)(addr), \
                                        (__v8di)(__m512i)(index), \
                                        (__mmask8)(mask), (int)(scale)))

#define _mm512_i64gather_epi32(index, addr, scale) \
  ((__m256i)__builtin_ia32_gatherdiv16si((__v8si)_mm256_undefined_si256(), \
                                         (void const *)(addr), \
                                         (__v8di)(__m512i)(index), \
                                         (__mmask8)-1, (int)(scale)))

#define _mm512_mask_i64gather_epi32(v1_old, mask, index, addr, scale) \
  ((__m256i)__builtin_ia32_gatherdiv16si((__v8si)(__m256i)(v1_old), \
                                         (void const *)(addr), \
                                         (__v8di)(__m512i)(index), \
                                         (__mmask8)(mask), (int)(scale)))

/* 64-bit element gathers (pd / epi64) fill a full 512-bit result. */
#define _mm512_i64gather_pd(index, addr, scale) \
  ((__m512d)__builtin_ia32_gatherdiv8df((__v8df)_mm512_undefined_pd(), \
                                        (void const *)(addr), \
                                        (__v8di)(__m512i)(index), (__mmask8)-1, \
                                        (int)(scale)))

#define _mm512_mask_i64gather_pd(v1_old, mask, index, addr, scale) \
  ((__m512d)__builtin_ia32_gatherdiv8df((__v8df)(__m512d)(v1_old), \
                                        (void const *)(addr), \
                                        (__v8di)(__m512i)(index), \
                                        (__mmask8)(mask), (int)(scale)))

#define _mm512_i64gather_epi64(index, addr, scale) \
  ((__m512i)__builtin_ia32_gatherdiv8di((__v8di)_mm512_undefined_epi32(), \
                                        (void const *)(addr), \
                                        (__v8di)(__m512i)(index), (__mmask8)-1, \
                                        (int)(scale)))

#define _mm512_mask_i64gather_epi64(v1_old, mask, index, addr, scale) \
  ((__m512i)__builtin_ia32_gatherdiv8di((__v8di)(__m512i)(v1_old), \
                                        (void const *)(addr), \
                                        (__v8di)(__m512i)(index), \
                                        (__mmask8)(mask), (int)(scale)))
7689
/* VGATHERDPS: gather 16 floats from addr + index[i]*scale using 16 signed
 * 32-bit indices.  The index operand is a __m512i; cast it through its
 * declared type before the element-vector reinterpret, matching every
 * other gather macro in this file.  (The previous (__m512) cast was a
 * typo — both casts are 512-bit bitcasts so codegen is identical, but
 * __m512i is the documented signature.) */
#define _mm512_i32gather_ps(index, addr, scale) \
  ((__m512)__builtin_ia32_gathersiv16sf((__v16sf)_mm512_undefined_ps(), \
                                        (void const *)(addr), \
                                        (__v16si)(__m512i)(index), \
                                        (__mmask16)-1, (int)(scale)))

/* Masked form: inactive lanes keep v1_old. */
#define _mm512_mask_i32gather_ps(v1_old, mask, index, addr, scale) \
  ((__m512)__builtin_ia32_gathersiv16sf((__v16sf)(__m512)(v1_old), \
                                        (void const *)(addr), \
                                        (__v16si)(__m512i)(index), \
                                        (__mmask16)(mask), (int)(scale)))
7701
/* Gathers with 32-bit indices.  epi32 uses 16 indices from a __m512i;
 * the pd/epi64 forms use 8 indices from a __m256i and produce a full
 * 512-bit result.  Masked forms leave inactive lanes equal to v1_old. */
#define _mm512_i32gather_epi32(index, addr, scale) \
  ((__m512i)__builtin_ia32_gathersiv16si((__v16si)_mm512_undefined_epi32(), \
                                         (void const *)(addr), \
                                         (__v16si)(__m512i)(index), \
                                         (__mmask16)-1, (int)(scale)))

#define _mm512_mask_i32gather_epi32(v1_old, mask, index, addr, scale) \
  ((__m512i)__builtin_ia32_gathersiv16si((__v16si)(__m512i)(v1_old), \
                                         (void const *)(addr), \
                                         (__v16si)(__m512i)(index), \
                                         (__mmask16)(mask), (int)(scale)))

#define _mm512_i32gather_pd(index, addr, scale) \
  ((__m512d)__builtin_ia32_gathersiv8df((__v8df)_mm512_undefined_pd(), \
                                        (void const *)(addr), \
                                        (__v8si)(__m256i)(index), (__mmask8)-1, \
                                        (int)(scale)))

#define _mm512_mask_i32gather_pd(v1_old, mask, index, addr, scale) \
  ((__m512d)__builtin_ia32_gathersiv8df((__v8df)(__m512d)(v1_old), \
                                        (void const *)(addr), \
                                        (__v8si)(__m256i)(index), \
                                        (__mmask8)(mask), (int)(scale)))

#define _mm512_i32gather_epi64(index, addr, scale) \
  ((__m512i)__builtin_ia32_gathersiv8di((__v8di)_mm512_undefined_epi32(), \
                                        (void const *)(addr), \
                                        (__v8si)(__m256i)(index), (__mmask8)-1, \
                                        (int)(scale)))

#define _mm512_mask_i32gather_epi64(v1_old, mask, index, addr, scale) \
  ((__m512i)__builtin_ia32_gathersiv8di((__v8di)(__m512i)(v1_old), \
                                        (void const *)(addr), \
                                        (__v8si)(__m256i)(index), \
                                        (__mmask8)(mask), (int)(scale)))
7737
/* Scatters with 64-bit indices: store 8 elements of v1 to
 * addr + index[i]*scale.  Masked forms store only lanes selected by mask. */
#define _mm512_i64scatter_ps(addr, index, v1, scale) \
  __builtin_ia32_scatterdiv16sf((void *)(addr), (__mmask8)-1, \
                                (__v8di)(__m512i)(index), \
                                (__v8sf)(__m256)(v1), (int)(scale))

#define _mm512_mask_i64scatter_ps(addr, mask, index, v1, scale) \
  __builtin_ia32_scatterdiv16sf((void *)(addr), (__mmask8)(mask), \
                                (__v8di)(__m512i)(index), \
                                (__v8sf)(__m256)(v1), (int)(scale))

#define _mm512_i64scatter_epi32(addr, index, v1, scale) \
  __builtin_ia32_scatterdiv16si((void *)(addr), (__mmask8)-1, \
                                (__v8di)(__m512i)(index), \
                                (__v8si)(__m256i)(v1), (int)(scale))

#define _mm512_mask_i64scatter_epi32(addr, mask, index, v1, scale) \
  __builtin_ia32_scatterdiv16si((void *)(addr), (__mmask8)(mask), \
                                (__v8di)(__m512i)(index), \
                                (__v8si)(__m256i)(v1), (int)(scale))

#define _mm512_i64scatter_pd(addr, index, v1, scale) \
  __builtin_ia32_scatterdiv8df((void *)(addr), (__mmask8)-1, \
                               (__v8di)(__m512i)(index), \
                               (__v8df)(__m512d)(v1), (int)(scale))

#define _mm512_mask_i64scatter_pd(addr, mask, index, v1, scale) \
  __builtin_ia32_scatterdiv8df((void *)(addr), (__mmask8)(mask), \
                               (__v8di)(__m512i)(index), \
                               (__v8df)(__m512d)(v1), (int)(scale))

#define _mm512_i64scatter_epi64(addr, index, v1, scale) \
  __builtin_ia32_scatterdiv8di((void *)(addr), (__mmask8)-1, \
                               (__v8di)(__m512i)(index), \
                               (__v8di)(__m512i)(v1), (int)(scale))

#define _mm512_mask_i64scatter_epi64(addr, mask, index, v1, scale) \
  __builtin_ia32_scatterdiv8di((void *)(addr), (__mmask8)(mask), \
                               (__v8di)(__m512i)(index), \
                               (__v8di)(__m512i)(v1), (int)(scale))
7777
/* Scatters with 32-bit indices.  ps/epi32 store 16 elements (indices from
 * a __m512i); pd/epi64 store 8 elements (indices from a __m256i).  Masked
 * forms store only lanes selected by mask. */
#define _mm512_i32scatter_ps(addr, index, v1, scale) \
  __builtin_ia32_scattersiv16sf((void *)(addr), (__mmask16)-1, \
                                (__v16si)(__m512i)(index), \
                                (__v16sf)(__m512)(v1), (int)(scale))

#define _mm512_mask_i32scatter_ps(addr, mask, index, v1, scale) \
  __builtin_ia32_scattersiv16sf((void *)(addr), (__mmask16)(mask), \
                                (__v16si)(__m512i)(index), \
                                (__v16sf)(__m512)(v1), (int)(scale))

#define _mm512_i32scatter_epi32(addr, index, v1, scale) \
  __builtin_ia32_scattersiv16si((void *)(addr), (__mmask16)-1, \
                                (__v16si)(__m512i)(index), \
                                (__v16si)(__m512i)(v1), (int)(scale))

#define _mm512_mask_i32scatter_epi32(addr, mask, index, v1, scale) \
  __builtin_ia32_scattersiv16si((void *)(addr), (__mmask16)(mask), \
                                (__v16si)(__m512i)(index), \
                                (__v16si)(__m512i)(v1), (int)(scale))

#define _mm512_i32scatter_pd(addr, index, v1, scale) \
  __builtin_ia32_scattersiv8df((void *)(addr), (__mmask8)-1, \
                               (__v8si)(__m256i)(index), \
                               (__v8df)(__m512d)(v1), (int)(scale))

#define _mm512_mask_i32scatter_pd(addr, mask, index, v1, scale) \
  __builtin_ia32_scattersiv8df((void *)(addr), (__mmask8)(mask), \
                               (__v8si)(__m256i)(index), \
                               (__v8df)(__m512d)(v1), (int)(scale))

#define _mm512_i32scatter_epi64(addr, index, v1, scale) \
  __builtin_ia32_scattersiv8di((void *)(addr), (__mmask8)-1, \
                               (__v8si)(__m256i)(index), \
                               (__v8di)(__m512i)(v1), (int)(scale))

#define _mm512_mask_i32scatter_epi64(addr, mask, index, v1, scale) \
  __builtin_ia32_scattersiv8di((void *)(addr), (__mmask8)(mask), \
                               (__v8si)(__m256i)(index), \
                               (__v8di)(__m512i)(v1), (int)(scale))
7817
/* Masked scalar FMA on element 0 (VFMADD*SS).  The first operand (__W) is
 * passed first to the _mask builtin; per the masked-SS semantics it also
 * supplies the result when the low mask bit is clear.  Upper lanes pass
 * through from the first operand — see the Intel intrinsics guide. */
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_mask_fmadd_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
{
  return __builtin_ia32_vfmaddss3_mask((__v4sf)__W,
                                       (__v4sf)__A,
                                       (__v4sf)__B,
                                       (__mmask8)__U,
                                       _MM_FROUND_CUR_DIRECTION);
}

/* Unmasked round variant: all-ones mask, explicit rounding control R. */
#define _mm_fmadd_round_ss(A, B, C, R) \
  ((__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(A), \
                                         (__v4sf)(__m128)(B), \
                                         (__v4sf)(__m128)(C), (__mmask8)-1, \
                                         (int)(R)))

#define _mm_mask_fmadd_round_ss(W, U, A, B, R) \
  ((__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(W), \
                                         (__v4sf)(__m128)(A), \
                                         (__v4sf)(__m128)(B), (__mmask8)(U), \
                                         (int)(R)))

/* Zero-masking: low lane zeroed when the low mask bit is clear. */
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_maskz_fmadd_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
{
  return __builtin_ia32_vfmaddss3_maskz((__v4sf)__A,
                                        (__v4sf)__B,
                                        (__v4sf)__C,
                                        (__mmask8)__U,
                                        _MM_FROUND_CUR_DIRECTION);
}

#define _mm_maskz_fmadd_round_ss(U, A, B, C, R) \
  ((__m128)__builtin_ia32_vfmaddss3_maskz((__v4sf)(__m128)(A), \
                                          (__v4sf)(__m128)(B), \
                                          (__v4sf)(__m128)(C), (__mmask8)(U), \
                                          (int)(R)))

/* mask3 form: the third operand (__Y) supplies the masked-off result. */
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_mask3_fmadd_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U)
{
  return __builtin_ia32_vfmaddss3_mask3((__v4sf)__W,
                                        (__v4sf)__X,
                                        (__v4sf)__Y,
                                        (__mmask8)__U,
                                        _MM_FROUND_CUR_DIRECTION);
}

#define _mm_mask3_fmadd_round_ss(W, X, Y, U, R) \
  ((__m128)__builtin_ia32_vfmaddss3_mask3((__v4sf)(__m128)(W), \
                                          (__v4sf)(__m128)(X), \
                                          (__v4sf)(__m128)(Y), (__mmask8)(U), \
                                          (int)(R)))
7871
/* Masked scalar-single fused multiply-subtract: implemented with the
 * fused multiply-ADD builtin by negating the addend operand __B;
 * rounding follows the current direction.  Masking semantics come from
 * the vfmaddss3_mask builtin (write-mask __U, destination __W). */
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_mask_fmsub_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
{
  return __builtin_ia32_vfmaddss3_mask((__v4sf)__W,
                                       (__v4sf)__A,
                                       -(__v4sf)__B,
                                       (__mmask8)__U,
                                       _MM_FROUND_CUR_DIRECTION);
}

/* Unmasked fmsub with explicit rounding R: all-ones mask (__mmask8)-1,
 * addend C negated. */
#define _mm_fmsub_round_ss(A, B, C, R) \
  ((__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(A), \
                                         (__v4sf)(__m128)(B), \
                                         -(__v4sf)(__m128)(C), (__mmask8)-1, \
                                         (int)(R)))

/* Masked fmsub with explicit rounding R. */
#define _mm_mask_fmsub_round_ss(W, U, A, B, R) \
  ((__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(W), \
                                         (__v4sf)(__m128)(A), \
                                         -(__v4sf)(__m128)(B), (__mmask8)(U), \
                                         (int)(R)))

/* Zero-masked fmsub (vfmaddss3_maskz builtin), current rounding mode. */
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_maskz_fmsub_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
{
  return __builtin_ia32_vfmaddss3_maskz((__v4sf)__A,
                                        (__v4sf)__B,
                                        -(__v4sf)__C,
                                        (__mmask8)__U,
                                        _MM_FROUND_CUR_DIRECTION);
}

/* Zero-masked fmsub with explicit rounding R. */
#define _mm_maskz_fmsub_round_ss(U, A, B, C, R) \
  ((__m128)__builtin_ia32_vfmaddss3_maskz((__v4sf)(__m128)(A), \
                                          (__v4sf)(__m128)(B), \
                                          -(__v4sf)(__m128)(C), (__mmask8)(U), \
                                          (int)(R)))
7909
/* mask3 fmsub uses the dedicated vfmsubss3_mask3 builtin: no operand is
 * negated here because the subtraction is encoded in the builtin itself
 * (the mask3 form merges into the addend operand __Y). */
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_mask3_fmsub_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U)
{
  return __builtin_ia32_vfmsubss3_mask3((__v4sf)__W,
                                        (__v4sf)__X,
                                        (__v4sf)__Y,
                                        (__mmask8)__U,
                                        _MM_FROUND_CUR_DIRECTION);
}

/* mask3 fmsub with explicit rounding R. */
#define _mm_mask3_fmsub_round_ss(W, X, Y, U, R) \
  ((__m128)__builtin_ia32_vfmsubss3_mask3((__v4sf)(__m128)(W), \
                                          (__v4sf)(__m128)(X), \
                                          (__v4sf)(__m128)(Y), (__mmask8)(U), \
                                          (int)(R)))
7925
/* Masked scalar-single fused negate-multiply-add (fnmadd): built from
 * the fused multiply-add builtin by negating the first multiplicand
 * (-A*B + C pattern); current rounding direction. */
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_mask_fnmadd_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
{
  return __builtin_ia32_vfmaddss3_mask((__v4sf)__W,
                                       -(__v4sf)__A,
                                       (__v4sf)__B,
                                       (__mmask8)__U,
                                       _MM_FROUND_CUR_DIRECTION);
}

/* Unmasked fnmadd with explicit rounding R (all-ones mask). */
#define _mm_fnmadd_round_ss(A, B, C, R) \
  ((__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(A), \
                                         -(__v4sf)(__m128)(B), \
                                         (__v4sf)(__m128)(C), (__mmask8)-1, \
                                         (int)(R)))

/* Masked fnmadd with explicit rounding R. */
#define _mm_mask_fnmadd_round_ss(W, U, A, B, R) \
  ((__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(W), \
                                         -(__v4sf)(__m128)(A), \
                                         (__v4sf)(__m128)(B), (__mmask8)(U), \
                                         (int)(R)))

/* Zero-masked fnmadd, current rounding mode. */
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_maskz_fnmadd_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
{
  return __builtin_ia32_vfmaddss3_maskz((__v4sf)__A,
                                        -(__v4sf)__B,
                                        (__v4sf)__C,
                                        (__mmask8)__U,
                                        _MM_FROUND_CUR_DIRECTION);
}

/* Zero-masked fnmadd with explicit rounding R. */
#define _mm_maskz_fnmadd_round_ss(U, A, B, C, R) \
  ((__m128)__builtin_ia32_vfmaddss3_maskz((__v4sf)(__m128)(A), \
                                          -(__v4sf)(__m128)(B), \
                                          (__v4sf)(__m128)(C), (__mmask8)(U), \
                                          (int)(R)))

/* mask3 fnmadd, current rounding mode (merges into operand __Y). */
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_mask3_fnmadd_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U)
{
  return __builtin_ia32_vfmaddss3_mask3((__v4sf)__W,
                                        -(__v4sf)__X,
                                        (__v4sf)__Y,
                                        (__mmask8)__U,
                                        _MM_FROUND_CUR_DIRECTION);
}

/* mask3 fnmadd with explicit rounding R. */
#define _mm_mask3_fnmadd_round_ss(W, X, Y, U, R) \
  ((__m128)__builtin_ia32_vfmaddss3_mask3((__v4sf)(__m128)(W), \
                                          -(__v4sf)(__m128)(X), \
                                          (__v4sf)(__m128)(Y), (__mmask8)(U), \
                                          (int)(R)))
7979
/* Masked scalar-single fused negate-multiply-subtract (fnmsub): both
 * the multiplicand __A and the addend __B are negated before the fused
 * multiply-add builtin (-A*B - C pattern); current rounding mode. */
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_mask_fnmsub_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
{
  return __builtin_ia32_vfmaddss3_mask((__v4sf)__W,
                                       -(__v4sf)__A,
                                       -(__v4sf)__B,
                                       (__mmask8)__U,
                                       _MM_FROUND_CUR_DIRECTION);
}

/* Unmasked fnmsub with explicit rounding R (all-ones mask). */
#define _mm_fnmsub_round_ss(A, B, C, R) \
  ((__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(A), \
                                         -(__v4sf)(__m128)(B), \
                                         -(__v4sf)(__m128)(C), (__mmask8)-1, \
                                         (int)(R)))

/* Masked fnmsub with explicit rounding R. */
#define _mm_mask_fnmsub_round_ss(W, U, A, B, R) \
  ((__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(W), \
                                         -(__v4sf)(__m128)(A), \
                                         -(__v4sf)(__m128)(B), (__mmask8)(U), \
                                         (int)(R)))

/* Zero-masked fnmsub, current rounding mode. */
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_maskz_fnmsub_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
{
  return __builtin_ia32_vfmaddss3_maskz((__v4sf)__A,
                                        -(__v4sf)__B,
                                        -(__v4sf)__C,
                                        (__mmask8)__U,
                                        _MM_FROUND_CUR_DIRECTION);
}

/* Zero-masked fnmsub with explicit rounding R. */
#define _mm_maskz_fnmsub_round_ss(U, A, B, C, R) \
  ((__m128)__builtin_ia32_vfmaddss3_maskz((__v4sf)(__m128)(A), \
                                          -(__v4sf)(__m128)(B), \
                                          -(__v4sf)(__m128)(C), (__mmask8)(U), \
                                          (int)(R)))

/* mask3 fnmsub: the fmsub builtin already subtracts __Y, so only the
 * multiplicand __X needs negating here (merges into operand __Y). */
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_mask3_fnmsub_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U)
{
  return __builtin_ia32_vfmsubss3_mask3((__v4sf)__W,
                                        -(__v4sf)__X,
                                        (__v4sf)__Y,
                                        (__mmask8)__U,
                                        _MM_FROUND_CUR_DIRECTION);
}

/* mask3 fnmsub with explicit rounding R. */
#define _mm_mask3_fnmsub_round_ss(W, X, Y, U, R) \
  ((__m128)__builtin_ia32_vfmsubss3_mask3((__v4sf)(__m128)(W), \
                                          -(__v4sf)(__m128)(X), \
                                          (__v4sf)(__m128)(Y), (__mmask8)(U), \
                                          (int)(R)))
8033
/* Scalar-double counterparts of the masked FMA family above: identical
 * structure, using the vfmaddsd3 builtins on __v2df operands. */

/* Masked scalar-double fmadd, current rounding mode. */
static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_mask_fmadd_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
{
  return __builtin_ia32_vfmaddsd3_mask((__v2df)__W,
                                       (__v2df)__A,
                                       (__v2df)__B,
                                       (__mmask8)__U,
                                       _MM_FROUND_CUR_DIRECTION);
}

/* Unmasked fmadd with explicit rounding R (all-ones mask). */
#define _mm_fmadd_round_sd(A, B, C, R) \
  ((__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(A), \
                                          (__v2df)(__m128d)(B), \
                                          (__v2df)(__m128d)(C), (__mmask8)-1, \
                                          (int)(R)))

/* Masked fmadd with explicit rounding R. */
#define _mm_mask_fmadd_round_sd(W, U, A, B, R) \
  ((__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(W), \
                                          (__v2df)(__m128d)(A), \
                                          (__v2df)(__m128d)(B), (__mmask8)(U), \
                                          (int)(R)))

/* Zero-masked fmadd, current rounding mode. */
static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_maskz_fmadd_sd (__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
{
  return __builtin_ia32_vfmaddsd3_maskz((__v2df)__A,
                                        (__v2df)__B,
                                        (__v2df)__C,
                                        (__mmask8)__U,
                                        _MM_FROUND_CUR_DIRECTION);
}

/* Zero-masked fmadd with explicit rounding R. */
#define _mm_maskz_fmadd_round_sd(U, A, B, C, R) \
  ((__m128d)__builtin_ia32_vfmaddsd3_maskz((__v2df)(__m128d)(A), \
                                           (__v2df)(__m128d)(B), \
                                           (__v2df)(__m128d)(C), (__mmask8)(U), \
                                           (int)(R)))

/* mask3 fmadd (merges into operand __Y), current rounding mode. */
static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_mask3_fmadd_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U)
{
  return __builtin_ia32_vfmaddsd3_mask3((__v2df)__W,
                                        (__v2df)__X,
                                        (__v2df)__Y,
                                        (__mmask8)__U,
                                        _MM_FROUND_CUR_DIRECTION);
}

/* mask3 fmadd with explicit rounding R. */
#define _mm_mask3_fmadd_round_sd(W, X, Y, U, R) \
  ((__m128d)__builtin_ia32_vfmaddsd3_mask3((__v2df)(__m128d)(W), \
                                           (__v2df)(__m128d)(X), \
                                           (__v2df)(__m128d)(Y), (__mmask8)(U), \
                                           (int)(R)))
8087
/* Masked scalar-double fmsub: fused multiply-add builtin with the
 * addend __B negated; current rounding mode. */
static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_mask_fmsub_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
{
  return __builtin_ia32_vfmaddsd3_mask((__v2df)__W,
                                       (__v2df)__A,
                                       -(__v2df)__B,
                                       (__mmask8)__U,
                                       _MM_FROUND_CUR_DIRECTION);
}

/* Unmasked fmsub with explicit rounding R (all-ones mask). */
#define _mm_fmsub_round_sd(A, B, C, R) \
  ((__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(A), \
                                          (__v2df)(__m128d)(B), \
                                          -(__v2df)(__m128d)(C), (__mmask8)-1, \
                                          (int)(R)))

/* Masked fmsub with explicit rounding R. */
#define _mm_mask_fmsub_round_sd(W, U, A, B, R) \
  ((__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(W), \
                                          (__v2df)(__m128d)(A), \
                                          -(__v2df)(__m128d)(B), (__mmask8)(U), \
                                          (int)(R)))

/* Zero-masked fmsub, current rounding mode. */
static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_maskz_fmsub_sd (__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
{
  return __builtin_ia32_vfmaddsd3_maskz((__v2df)__A,
                                        (__v2df)__B,
                                        -(__v2df)__C,
                                        (__mmask8)__U,
                                        _MM_FROUND_CUR_DIRECTION);
}

/* Zero-masked fmsub with explicit rounding R. */
#define _mm_maskz_fmsub_round_sd(U, A, B, C, R) \
  ((__m128d)__builtin_ia32_vfmaddsd3_maskz((__v2df)(__m128d)(A), \
                                           (__v2df)(__m128d)(B), \
                                           -(__v2df)(__m128d)(C), \
                                           (__mmask8)(U), (int)(R)))

/* mask3 fmsub: dedicated vfmsubsd3_mask3 builtin, no operand negation
 * needed (merges into operand __Y). */
static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_mask3_fmsub_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U)
{
  return __builtin_ia32_vfmsubsd3_mask3((__v2df)__W,
                                        (__v2df)__X,
                                        (__v2df)__Y,
                                        (__mmask8)__U,
                                        _MM_FROUND_CUR_DIRECTION);
}

/* mask3 fmsub with explicit rounding R. */
#define _mm_mask3_fmsub_round_sd(W, X, Y, U, R) \
  ((__m128d)__builtin_ia32_vfmsubsd3_mask3((__v2df)(__m128d)(W), \
                                           (__v2df)(__m128d)(X), \
                                           (__v2df)(__m128d)(Y), \
                                           (__mmask8)(U), (int)(R)))
8141
/* Masked scalar-double fnmadd: fused multiply-add builtin with the
 * multiplicand __A negated (-A*B + C pattern); current rounding mode. */
static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_mask_fnmadd_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
{
  return __builtin_ia32_vfmaddsd3_mask((__v2df)__W,
                                       -(__v2df)__A,
                                       (__v2df)__B,
                                       (__mmask8)__U,
                                       _MM_FROUND_CUR_DIRECTION);
}

/* Unmasked fnmadd with explicit rounding R (all-ones mask). */
#define _mm_fnmadd_round_sd(A, B, C, R) \
  ((__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(A), \
                                          -(__v2df)(__m128d)(B), \
                                          (__v2df)(__m128d)(C), (__mmask8)-1, \
                                          (int)(R)))

/* Masked fnmadd with explicit rounding R. */
#define _mm_mask_fnmadd_round_sd(W, U, A, B, R) \
  ((__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(W), \
                                          -(__v2df)(__m128d)(A), \
                                          (__v2df)(__m128d)(B), (__mmask8)(U), \
                                          (int)(R)))

/* Zero-masked fnmadd, current rounding mode. */
static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_maskz_fnmadd_sd (__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
{
  return __builtin_ia32_vfmaddsd3_maskz((__v2df)__A,
                                        -(__v2df)__B,
                                        (__v2df)__C,
                                        (__mmask8)__U,
                                        _MM_FROUND_CUR_DIRECTION);
}

/* Zero-masked fnmadd with explicit rounding R. */
#define _mm_maskz_fnmadd_round_sd(U, A, B, C, R) \
  ((__m128d)__builtin_ia32_vfmaddsd3_maskz((__v2df)(__m128d)(A), \
                                           -(__v2df)(__m128d)(B), \
                                           (__v2df)(__m128d)(C), (__mmask8)(U), \
                                           (int)(R)))

/* mask3 fnmadd (merges into operand __Y), current rounding mode. */
static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_mask3_fnmadd_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U)
{
  return __builtin_ia32_vfmaddsd3_mask3((__v2df)__W,
                                        -(__v2df)__X,
                                        (__v2df)__Y,
                                        (__mmask8)__U,
                                        _MM_FROUND_CUR_DIRECTION);
}

/* mask3 fnmadd with explicit rounding R. */
#define _mm_mask3_fnmadd_round_sd(W, X, Y, U, R) \
  ((__m128d)__builtin_ia32_vfmaddsd3_mask3((__v2df)(__m128d)(W), \
                                           -(__v2df)(__m128d)(X), \
                                           (__v2df)(__m128d)(Y), (__mmask8)(U), \
                                           (int)(R)))
8195
/* Masked scalar-double fnmsub: both multiplicand __A and addend __B
 * negated before the fused multiply-add builtin (-A*B - C pattern);
 * current rounding mode. */
static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_mask_fnmsub_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
{
  return __builtin_ia32_vfmaddsd3_mask((__v2df)__W,
                                       -(__v2df)__A,
                                       -(__v2df)__B,
                                       (__mmask8)__U,
                                       _MM_FROUND_CUR_DIRECTION);
}

/* Unmasked fnmsub with explicit rounding R (all-ones mask). */
#define _mm_fnmsub_round_sd(A, B, C, R) \
  ((__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(A), \
                                          -(__v2df)(__m128d)(B), \
                                          -(__v2df)(__m128d)(C), (__mmask8)-1, \
                                          (int)(R)))

/* Masked fnmsub with explicit rounding R. */
#define _mm_mask_fnmsub_round_sd(W, U, A, B, R) \
  ((__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(W), \
                                          -(__v2df)(__m128d)(A), \
                                          -(__v2df)(__m128d)(B), (__mmask8)(U), \
                                          (int)(R)))

/* Zero-masked fnmsub, current rounding mode. */
static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_maskz_fnmsub_sd (__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
{
  return __builtin_ia32_vfmaddsd3_maskz((__v2df)__A,
                                        -(__v2df)__B,
                                        -(__v2df)__C,
                                        (__mmask8)__U,
                                        _MM_FROUND_CUR_DIRECTION);
}

/* Zero-masked fnmsub with explicit rounding R. */
#define _mm_maskz_fnmsub_round_sd(U, A, B, C, R) \
  ((__m128d)__builtin_ia32_vfmaddsd3_maskz((__v2df)(__m128d)(A), \
                                           -(__v2df)(__m128d)(B), \
                                           -(__v2df)(__m128d)(C), \
                                           (__mmask8)(U), \
                                           (int)(R)))

/* mask3 fnmsub: the fmsub builtin already subtracts __Y, so only the
 * multiplicand __X is negated (merges into operand __Y). */
static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_mask3_fnmsub_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U)
{
  return __builtin_ia32_vfmsubsd3_mask3((__v2df)__W,
                                        -(__v2df)__X,
                                        (__v2df)__Y,
                                        (__mmask8)__U,
                                        _MM_FROUND_CUR_DIRECTION);
}

/* mask3 fnmsub with explicit rounding R. */
#define _mm_mask3_fnmsub_round_sd(W, X, Y, U, R) \
  ((__m128d)__builtin_ia32_vfmsubsd3_mask3((__v2df)(__m128d)(W), \
                                           -(__v2df)(__m128d)(X), \
                                           (__v2df)(__m128d)(Y), \
                                           (__mmask8)(U), (int)(R)))
8250
/* Immediate-controlled permute of the 64-bit elements of a 512-bit
 * vector (permdf512/permdi512 builtins).  The mask/maskz variants blend
 * the permuted result against W or zero via the select builtins. */
#define _mm512_permutex_pd(X, C) \
  ((__m512d)__builtin_ia32_permdf512((__v8df)(__m512d)(X), (int)(C)))

/* Merge-masked permutex: unselected lanes come from W. */
#define _mm512_mask_permutex_pd(W, U, X, C) \
  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
                                        (__v8df)_mm512_permutex_pd((X), (C)), \
                                        (__v8df)(__m512d)(W)))

/* Zero-masked permutex: unselected lanes are zeroed. */
#define _mm512_maskz_permutex_pd(U, X, C) \
  ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
                                        (__v8df)_mm512_permutex_pd((X), (C)), \
                                        (__v8df)_mm512_setzero_pd()))

/* Integer (64-bit element) counterpart of _mm512_permutex_pd. */
#define _mm512_permutex_epi64(X, C) \
  ((__m512i)__builtin_ia32_permdi512((__v8di)(__m512i)(X), (int)(C)))

/* Merge-masked permutex_epi64. */
#define _mm512_mask_permutex_epi64(W, U, X, C) \
  ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
                                       (__v8di)_mm512_permutex_epi64((X), (C)), \
                                       (__v8di)(__m512i)(W)))

/* Zero-masked permutex_epi64. */
#define _mm512_maskz_permutex_epi64(U, X, C) \
  ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
                                       (__v8di)_mm512_permutex_epi64((X), (C)), \
                                       (__v8di)_mm512_setzero_si512()))
8276
/* Variable permute of doubles: the intrinsic takes (index __X, data __Y)
 * but the permvardf512 builtin takes (data, index) — note the swapped
 * argument order in the call. */
static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_permutexvar_pd (__m512i __X, __m512d __Y)
{
  return (__m512d)__builtin_ia32_permvardf512((__v8df) __Y, (__v8di) __X);
}

/* Merge-masked variant: lanes not selected by __U come from __W. */
static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_permutexvar_pd (__m512d __W, __mmask8 __U, __m512i __X, __m512d __Y)
{
  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
                                        (__v8df)_mm512_permutexvar_pd(__X, __Y),
                                        (__v8df)__W);
}

/* Zero-masked variant: lanes not selected by __U are zeroed. */
static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_maskz_permutexvar_pd (__mmask8 __U, __m512i __X, __m512d __Y)
{
  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
                                        (__v8df)_mm512_permutexvar_pd(__X, __Y),
                                        (__v8df)_mm512_setzero_pd());
}
8298
/* Variable permute of 64-bit integers; builtin operand order is
 * (data __Y, index __X), swapped relative to the intrinsic. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_permutexvar_epi64 (__m512i __X, __m512i __Y)
{
  return (__m512i)__builtin_ia32_permvardi512((__v8di)__Y, (__v8di)__X);
}

/* Zero-masked variant (unselected lanes zeroed). */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_permutexvar_epi64 (__mmask8 __M, __m512i __X, __m512i __Y)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
                                     (__v8di)_mm512_permutexvar_epi64(__X, __Y),
                                     (__v8di)_mm512_setzero_si512());
}

/* Merge-masked variant (unselected lanes taken from __W). */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_permutexvar_epi64 (__m512i __W, __mmask8 __M, __m512i __X,
             __m512i __Y)
{
  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
                                     (__v8di)_mm512_permutexvar_epi64(__X, __Y),
                                     (__v8di)__W);
}
8321
/* Variable permute of floats; builtin operand order is (data __Y,
 * index __X), swapped relative to the intrinsic signature. */
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_permutexvar_ps (__m512i __X, __m512 __Y)
{
  return (__m512)__builtin_ia32_permvarsf512((__v16sf)__Y, (__v16si)__X);
}

/* Merge-masked variant (unselected lanes taken from __W). */
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_permutexvar_ps (__m512 __W, __mmask16 __U, __m512i __X, __m512 __Y)
{
  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
                                       (__v16sf)_mm512_permutexvar_ps(__X, __Y),
                                       (__v16sf)__W);
}

/* Zero-masked variant (unselected lanes zeroed). */
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_permutexvar_ps (__mmask16 __U, __m512i __X, __m512 __Y)
{
  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
                                       (__v16sf)_mm512_permutexvar_ps(__X, __Y),
                                       (__v16sf)_mm512_setzero_ps());
}
8343
/* Variable permute of 32-bit integers; builtin operand order is
 * (data __Y, index __X), swapped relative to the intrinsic. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_permutexvar_epi32 (__m512i __X, __m512i __Y)
{
  return (__m512i)__builtin_ia32_permvarsi512((__v16si)__Y, (__v16si)__X);
}

/* Legacy alias kept for source compatibility. */
#define _mm512_permutevar_epi32 _mm512_permutexvar_epi32

/* Zero-masked variant (unselected lanes zeroed). */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_permutexvar_epi32 (__mmask16 __M, __m512i __X, __m512i __Y)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
                                    (__v16si)_mm512_permutexvar_epi32(__X, __Y),
                                    (__v16si)_mm512_setzero_si512());
}

/* Merge-masked variant (unselected lanes taken from __W). */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_permutexvar_epi32 (__m512i __W, __mmask16 __M, __m512i __X,
             __m512i __Y)
{
  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
                                    (__v16si)_mm512_permutexvar_epi32(__X, __Y),
                                    (__v16si)__W);
}

/* Legacy alias kept for source compatibility. */
#define _mm512_mask_permutevar_epi32 _mm512_mask_permutexvar_epi32
8370
/* 16-bit opmask-register logic, each a thin wrapper over the
 * corresponding k-instruction builtin (*hi = 16-bit variants). */

/* Bitwise AND of two 16-bit masks (kandhi). */
static __inline__ __mmask16 __DEFAULT_FN_ATTRS
_mm512_kand (__mmask16 __A, __mmask16 __B)
{
  return (__mmask16) __builtin_ia32_kandhi ((__mmask16) __A, (__mmask16) __B);
}

/* Bitwise AND-NOT of two 16-bit masks (kandnhi). */
static __inline__ __mmask16 __DEFAULT_FN_ATTRS
_mm512_kandn (__mmask16 __A, __mmask16 __B)
{
  return (__mmask16) __builtin_ia32_kandnhi ((__mmask16) __A, (__mmask16) __B);
}

/* Bitwise OR of two 16-bit masks (korhi). */
static __inline__ __mmask16 __DEFAULT_FN_ATTRS
_mm512_kor (__mmask16 __A, __mmask16 __B)
{
  return (__mmask16) __builtin_ia32_korhi ((__mmask16) __A, (__mmask16) __B);
}

/* KORTEST carry result (kortestchi builtin) as an int. */
static __inline__ int __DEFAULT_FN_ATTRS
_mm512_kortestc (__mmask16 __A, __mmask16 __B)
{
  return __builtin_ia32_kortestchi ((__mmask16) __A, (__mmask16) __B);
}

/* KORTEST zero result (kortestzhi builtin) as an int. */
static __inline__ int __DEFAULT_FN_ATTRS
_mm512_kortestz (__mmask16 __A, __mmask16 __B)
{
  return __builtin_ia32_kortestzhi ((__mmask16) __A, (__mmask16) __B);
}
8399}
8400
8401static __inline__ unsigned char __DEFAULT_FN_ATTRS
8402_kortestc_mask16_u8(__mmask16 __A, __mmask16 __B)
8403{
8404  return (unsigned char)__builtin_ia32_kortestchi(__A, __B);
8405}
8406
8407static __inline__ unsigned char __DEFAULT_FN_ATTRS
8408_kortestz_mask16_u8(__mmask16 __A, __mmask16 __B)
8409{
8410  return (unsigned char)__builtin_ia32_kortestzhi(__A, __B);
8411}
8412
8413static __inline__ unsigned char __DEFAULT_FN_ATTRS
8414_kortest_mask16_u8(__mmask16 __A, __mmask16 __B, unsigned char *__C) {
8415  *__C = (unsigned char)__builtin_ia32_kortestchi(__A, __B);
8416  return (unsigned char)__builtin_ia32_kortestzhi(__A, __B);
8417}
8418
/* Unpack/interleave the low bytes of two 16-bit masks (kunpckhi
 * builtin). */
static __inline__ __mmask16 __DEFAULT_FN_ATTRS
_mm512_kunpackb (__mmask16 __A, __mmask16 __B)
{
  return (__mmask16) __builtin_ia32_kunpckhi ((__mmask16) __A, (__mmask16) __B);
}

/* Bitwise XNOR of two 16-bit masks (kxnorhi). */
static __inline__ __mmask16 __DEFAULT_FN_ATTRS
_mm512_kxnor (__mmask16 __A, __mmask16 __B)
{
  return (__mmask16) __builtin_ia32_kxnorhi ((__mmask16) __A, (__mmask16) __B);
}

/* Bitwise XOR of two 16-bit masks (kxorhi). */
static __inline__ __mmask16 __DEFAULT_FN_ATTRS
_mm512_kxor (__mmask16 __A, __mmask16 __B)
{
  return (__mmask16) __builtin_ia32_kxorhi ((__mmask16) __A, (__mmask16) __B);
}
8436
/* Newer _k*_mask16 spellings are simple aliases of the _mm512_k*
 * functions defined in this file. */
#define _kand_mask16 _mm512_kand
#define _kandn_mask16 _mm512_kandn
#define _knot_mask16 _mm512_knot
#define _kor_mask16 _mm512_kor
#define _kxnor_mask16 _mm512_kxnor
#define _kxor_mask16 _mm512_kxor

/* Shift a 16-bit mask left by immediate I (kshiftlihi builtin). */
#define _kshiftli_mask16(A, I) \
  ((__mmask16)__builtin_ia32_kshiftlihi((__mmask16)(A), (unsigned int)(I)))

/* Shift a 16-bit mask right by immediate I (kshiftrihi builtin). */
#define _kshiftri_mask16(A, I) \
  ((__mmask16)__builtin_ia32_kshiftrihi((__mmask16)(A), (unsigned int)(I)))
8449
/* Mask <-> integer and mask <-> memory transfers, all routed through
 * the kmovw builtin so the compiler can materialize a KMOVW. */

/* Convert a 16-bit mask to an unsigned int. */
static __inline__ unsigned int __DEFAULT_FN_ATTRS
_cvtmask16_u32(__mmask16 __A) {
  return (unsigned int)__builtin_ia32_kmovw((__mmask16)__A);
}

/* Convert an unsigned int to a 16-bit mask. */
static __inline__ __mmask16 __DEFAULT_FN_ATTRS
_cvtu32_mask16(unsigned int __A) {
  return (__mmask16)__builtin_ia32_kmovw((__mmask16)__A);
}

/* Load a 16-bit mask from memory. */
static __inline__ __mmask16 __DEFAULT_FN_ATTRS
_load_mask16(__mmask16 *__A) {
  return (__mmask16)__builtin_ia32_kmovw(*(__mmask16 *)__A);
}

/* Store a 16-bit mask to memory. */
static __inline__ void __DEFAULT_FN_ATTRS
_store_mask16(__mmask16 *__A, __mmask16 __B) {
  *(__mmask16 *)__A = __builtin_ia32_kmovw((__mmask16)__B);
}
8469
/* Non-temporal (streaming) 64-byte stores/loads.  A local 64-byte
 * aligned typedef is used so __builtin_nontemporal_store/load see the
 * required alignment; __P must therefore be 64-byte aligned. */

/* Non-temporal store of a 512-bit integer vector. */
static __inline__ void __DEFAULT_FN_ATTRS512
_mm512_stream_si512 (void * __P, __m512i __A)
{
  typedef __v8di __v8di_aligned __attribute__((aligned(64)));
  __builtin_nontemporal_store((__v8di_aligned)__A, (__v8di_aligned*)__P);
}

/* Non-temporal load of a 512-bit integer vector. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_stream_load_si512 (void const *__P)
{
  typedef __v8di __v8di_aligned __attribute__((aligned(64)));
  return (__m512i) __builtin_nontemporal_load((const __v8di_aligned *)__P);
}

/* Non-temporal store of 8 doubles. */
static __inline__ void __DEFAULT_FN_ATTRS512
_mm512_stream_pd (void *__P, __m512d __A)
{
  typedef __v8df __v8df_aligned __attribute__((aligned(64)));
  __builtin_nontemporal_store((__v8df_aligned)__A, (__v8df_aligned*)__P);
}

/* Non-temporal store of 16 floats. */
static __inline__ void __DEFAULT_FN_ATTRS512
_mm512_stream_ps (void *__P, __m512 __A)
{
  typedef __v16sf __v16sf_aligned __attribute__((aligned(64)));
  __builtin_nontemporal_store((__v16sf_aligned)__A, (__v16sf_aligned*)__P);
}
8497
/* Compress (pack selected lanes contiguously) for 64-bit elements,
 * via the compressdf512/compressdi512 mask builtins; the second
 * operand supplies the fill for the remaining lanes. */

/* Merge-masked compress of doubles: remaining lanes come from __W. */
static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_compress_pd (__m512d __W, __mmask8 __U, __m512d __A)
{
  return (__m512d) __builtin_ia32_compressdf512_mask ((__v8df) __A,
                  (__v8df) __W,
                  (__mmask8) __U);
}

/* Zero-masked compress of doubles: remaining lanes are zeroed. */
static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_maskz_compress_pd (__mmask8 __U, __m512d __A)
{
  return (__m512d) __builtin_ia32_compressdf512_mask ((__v8df) __A,
                  (__v8df)
                  _mm512_setzero_pd (),
                  (__mmask8) __U);
}

/* Merge-masked compress of 64-bit integers. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_compress_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
{
  return (__m512i) __builtin_ia32_compressdi512_mask ((__v8di) __A,
                  (__v8di) __W,
                  (__mmask8) __U);
}

/* Zero-masked compress of 64-bit integers. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_compress_epi64 (__mmask8 __U, __m512i __A)
{
  return (__m512i) __builtin_ia32_compressdi512_mask ((__v8di) __A,
                  (__v8di)
                  _mm512_setzero_si512 (),
                  (__mmask8) __U);
}
8531
/* Compress for 32-bit elements (compresssf512/compresssi512 builtins);
 * second operand supplies the fill for the remaining lanes. */

/* Merge-masked compress of floats: remaining lanes come from __W. */
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_compress_ps (__m512 __W, __mmask16 __U, __m512 __A)
{
  return (__m512) __builtin_ia32_compresssf512_mask ((__v16sf) __A,
                 (__v16sf) __W,
                 (__mmask16) __U);
}

/* Zero-masked compress of floats: remaining lanes are zeroed. */
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_compress_ps (__mmask16 __U, __m512 __A)
{
  return (__m512) __builtin_ia32_compresssf512_mask ((__v16sf) __A,
                 (__v16sf)
                 _mm512_setzero_ps (),
                 (__mmask16) __U);
}

/* Merge-masked compress of 32-bit integers. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_compress_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
{
  return (__m512i) __builtin_ia32_compresssi512_mask ((__v16si) __A,
                  (__v16si) __W,
                  (__mmask16) __U);
}

/* Zero-masked compress of 32-bit integers. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_compress_epi32 (__mmask16 __U, __m512i __A)
{
  return (__m512i) __builtin_ia32_compresssi512_mask ((__v16si) __A,
                  (__v16si)
                  _mm512_setzero_si512 (),
                  (__mmask16) __U);
}
8565
8566#define _mm_cmp_round_ss_mask(X, Y, P, R) \
8567  ((__mmask8)__builtin_ia32_cmpss_mask((__v4sf)(__m128)(X), \
8568                                       (__v4sf)(__m128)(Y), (int)(P), \
8569                                       (__mmask8)-1, (int)(R)))
8570
8571#define _mm_mask_cmp_round_ss_mask(M, X, Y, P, R) \
8572  ((__mmask8)__builtin_ia32_cmpss_mask((__v4sf)(__m128)(X), \
8573                                       (__v4sf)(__m128)(Y), (int)(P), \
8574                                       (__mmask8)(M), (int)(R)))
8575
8576#define _mm_cmp_ss_mask(X, Y, P) \
8577  ((__mmask8)__builtin_ia32_cmpss_mask((__v4sf)(__m128)(X), \
8578                                       (__v4sf)(__m128)(Y), (int)(P), \
8579                                       (__mmask8)-1, \
8580                                       _MM_FROUND_CUR_DIRECTION))
8581
8582#define _mm_mask_cmp_ss_mask(M, X, Y, P) \
8583  ((__mmask8)__builtin_ia32_cmpss_mask((__v4sf)(__m128)(X), \
8584                                       (__v4sf)(__m128)(Y), (int)(P), \
8585                                       (__mmask8)(M), \
8586                                       _MM_FROUND_CUR_DIRECTION))
8587
8588#define _mm_cmp_round_sd_mask(X, Y, P, R) \
8589  ((__mmask8)__builtin_ia32_cmpsd_mask((__v2df)(__m128d)(X), \
8590                                       (__v2df)(__m128d)(Y), (int)(P), \
8591                                       (__mmask8)-1, (int)(R)))
8592
8593#define _mm_mask_cmp_round_sd_mask(M, X, Y, P, R) \
8594  ((__mmask8)__builtin_ia32_cmpsd_mask((__v2df)(__m128d)(X), \
8595                                       (__v2df)(__m128d)(Y), (int)(P), \
8596                                       (__mmask8)(M), (int)(R)))
8597
8598#define _mm_cmp_sd_mask(X, Y, P) \
8599  ((__mmask8)__builtin_ia32_cmpsd_mask((__v2df)(__m128d)(X), \
8600                                       (__v2df)(__m128d)(Y), (int)(P), \
8601                                       (__mmask8)-1, \
8602                                       _MM_FROUND_CUR_DIRECTION))
8603
8604#define _mm_mask_cmp_sd_mask(M, X, Y, P) \
8605  ((__mmask8)__builtin_ia32_cmpsd_mask((__v2df)(__m128d)(X), \
8606                                       (__v2df)(__m128d)(Y), (int)(P), \
8607                                       (__mmask8)(M), \
8608                                       _MM_FROUND_CUR_DIRECTION))
8609
/* Bit Test */

/* test: per-element mask bit set iff (A & B) != 0, implemented as
 * AND followed by compare-not-equal against zero. */
static __inline __mmask16 __DEFAULT_FN_ATTRS512
_mm512_test_epi32_mask (__m512i __A, __m512i __B)
{
  return _mm512_cmpneq_epi32_mask (_mm512_and_epi32(__A, __B),
                                   _mm512_setzero_si512());
}

/* Masked test: result additionally ANDed with __U via the masked
 * compare. */
static __inline__ __mmask16 __DEFAULT_FN_ATTRS512
_mm512_mask_test_epi32_mask (__mmask16 __U, __m512i __A, __m512i __B)
{
  return _mm512_mask_cmpneq_epi32_mask (__U, _mm512_and_epi32 (__A, __B),
                                        _mm512_setzero_si512());
}

/* 64-bit element test.  The AND deliberately reuses _mm512_and_epi32:
 * bitwise AND is element-size agnostic, only the compare is 64-bit. */
static __inline __mmask8 __DEFAULT_FN_ATTRS512
_mm512_test_epi64_mask (__m512i __A, __m512i __B)
{
  return _mm512_cmpneq_epi64_mask (_mm512_and_epi32 (__A, __B),
                                   _mm512_setzero_si512());
}

/* Masked 64-bit element test. */
static __inline__ __mmask8 __DEFAULT_FN_ATTRS512
_mm512_mask_test_epi64_mask (__mmask8 __U, __m512i __A, __m512i __B)
{
  return _mm512_mask_cmpneq_epi64_mask (__U, _mm512_and_epi32 (__A, __B),
                                        _mm512_setzero_si512());
}
8639
/* Sets result bit i iff (__A & __B) is all-zero in 32-bit element i
 * (the logical negation of _mm512_test_epi32_mask). */
static __inline__ __mmask16 __DEFAULT_FN_ATTRS512
_mm512_testn_epi32_mask (__m512i __A, __m512i __B)
{
  return _mm512_cmpeq_epi32_mask (_mm512_and_epi32 (__A, __B),
                                  _mm512_setzero_si512());
}

/* As _mm512_testn_epi32_mask, but result bits are zeroed where __U is 0. */
static __inline__ __mmask16 __DEFAULT_FN_ATTRS512
_mm512_mask_testn_epi32_mask (__mmask16 __U, __m512i __A, __m512i __B)
{
  return _mm512_mask_cmpeq_epi32_mask (__U, _mm512_and_epi32 (__A, __B),
                                       _mm512_setzero_si512());
}
8653
8654static __inline__ __mmask8 __DEFAULT_FN_ATTRS512
8655_mm512_testn_epi64_mask (__m512i __A, __m512i __B)
8656{
8657  return _mm512_cmpeq_epi64_mask (_mm512_and_epi32 (__A, __B),
8658                                  _mm512_setzero_si512());
8659}
8660
8661static __inline__ __mmask8 __DEFAULT_FN_ATTRS512
8662_mm512_mask_testn_epi64_mask (__mmask8 __U, __m512i __A, __m512i __B)
8663{
8664  return _mm512_mask_cmpeq_epi64_mask (__U, _mm512_and_epi32 (__A, __B),
8665                                       _mm512_setzero_si512());
8666}
8667
/* Duplicate each odd-indexed single-precision element of __A into the
 * even/odd pair below it: result[2i] = result[2i+1] = __A[2i+1]. */
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_movehdup_ps (__m512 __A)
{
  return (__m512)__builtin_shufflevector((__v16sf)__A, (__v16sf)__A,
                         1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15);
}

/* Merge-masking form: lanes with a 0 bit in __U keep the value from __W. */
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_movehdup_ps (__m512 __W, __mmask16 __U, __m512 __A)
{
  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
                                             (__v16sf)_mm512_movehdup_ps(__A),
                                             (__v16sf)__W);
}

/* Zero-masking form: lanes with a 0 bit in __U are set to zero. */
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_movehdup_ps (__mmask16 __U, __m512 __A)
{
  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
                                             (__v16sf)_mm512_movehdup_ps(__A),
                                             (__v16sf)_mm512_setzero_ps());
}

/* Duplicate each even-indexed single-precision element of __A into the
 * even/odd pair containing it: result[2i] = result[2i+1] = __A[2i]. */
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_moveldup_ps (__m512 __A)
{
  return (__m512)__builtin_shufflevector((__v16sf)__A, (__v16sf)__A,
                         0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14);
}

/* Merge-masking form: lanes with a 0 bit in __U keep the value from __W. */
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_moveldup_ps (__m512 __W, __mmask16 __U, __m512 __A)
{
  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
                                             (__v16sf)_mm512_moveldup_ps(__A),
                                             (__v16sf)__W);
}

/* Zero-masking form: lanes with a 0 bit in __U are set to zero. */
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_moveldup_ps (__mmask16 __U, __m512 __A)
{
  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
                                             (__v16sf)_mm512_moveldup_ps(__A),
                                             (__v16sf)_mm512_setzero_ps());
}
8713
/* Masked scalar move: element 0 is __B[0] when bit 0 of __U is set,
 * otherwise __W[0]; elements 1..3 come from __A. */
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_mask_move_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
{
  return __builtin_ia32_selectss_128(__U, _mm_move_ss(__A, __B), __W);
}

/* Zero-masking form: element 0 is __B[0] when bit 0 of __U is set,
 * otherwise 0.0f; elements 1..3 come from __A. */
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_maskz_move_ss (__mmask8 __U, __m128 __A, __m128 __B)
{
  return __builtin_ia32_selectss_128(__U, _mm_move_ss(__A, __B),
                                     _mm_setzero_ps());
}

/* Masked scalar move: element 0 is __B[0] when bit 0 of __U is set,
 * otherwise __W[0]; element 1 comes from __A. */
static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_mask_move_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
{
  return __builtin_ia32_selectsd_128(__U, _mm_move_sd(__A, __B), __W);
}

/* Zero-masking form: element 0 is __B[0] when bit 0 of __U is set,
 * otherwise 0.0; element 1 comes from __A. */
static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_maskz_move_sd (__mmask8 __U, __m128d __A, __m128d __B)
{
  return __builtin_ia32_selectsd_128(__U, _mm_move_sd(__A, __B),
                                     _mm_setzero_pd());
}
8739
/* Store the low single-precision element of __A to *__W only when bit 0 of
 * __U is set; no memory access occurs when the bit is clear. */
static __inline__ void __DEFAULT_FN_ATTRS128
_mm_mask_store_ss (float * __W, __mmask8 __U, __m128 __A)
{
  __builtin_ia32_storess128_mask ((__v4sf *)__W, __A, __U & 1);
}

/* Store the low double-precision element of __A to *__W only when bit 0 of
 * __U is set; no memory access occurs when the bit is clear. */
static __inline__ void __DEFAULT_FN_ATTRS128
_mm_mask_store_sd (double * __W, __mmask8 __U, __m128d __A)
{
  __builtin_ia32_storesd128_mask ((__v2df *)__W, __A, __U & 1);
}
8751
/* Masked scalar load: element 0 is *__A when bit 0 of __U is set, else
 * __W[0]; elements 1..3 are zeroed.  The shufflevector pre-builds the merge
 * source with __W[0] in lane 0 and zeros (from the second operand) above. */
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_mask_load_ss (__m128 __W, __mmask8 __U, const float* __A)
{
  __m128 src = (__v4sf) __builtin_shufflevector((__v4sf) __W,
                                                (__v4sf)_mm_setzero_ps(),
                                                0, 4, 4, 4);

  return (__m128) __builtin_ia32_loadss128_mask ((const __v4sf *) __A, src, __U & 1);
}

/* Zero-masking scalar load: element 0 is *__A when bit 0 of __U is set,
 * else 0.0f; elements 1..3 are zeroed. */
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_maskz_load_ss (__mmask8 __U, const float* __A)
{
  return (__m128)__builtin_ia32_loadss128_mask ((const __v4sf *) __A,
                                                (__v4sf) _mm_setzero_ps(),
                                                __U & 1);
}

/* Masked scalar load: element 0 is *__A when bit 0 of __U is set, else
 * __W[0]; element 1 is zeroed. */
static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_mask_load_sd (__m128d __W, __mmask8 __U, const double* __A)
{
  __m128d src = (__v2df) __builtin_shufflevector((__v2df) __W,
                                                 (__v2df)_mm_setzero_pd(),
                                                 0, 2);

  return (__m128d) __builtin_ia32_loadsd128_mask ((const __v2df *) __A, src, __U & 1);
}

/* Zero-masking scalar load: element 0 is *__A when bit 0 of __U is set,
 * else 0.0; element 1 is zeroed. */
static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_maskz_load_sd (__mmask8 __U, const double* __A)
{
  return (__m128d) __builtin_ia32_loadsd128_mask ((const __v2df *) __A,
                                                  (__v2df) _mm_setzero_pd(),
                                                  __U & 1);
}
8787
/* Permute 32-bit elements within each 128-bit lane of A using the 8-bit
 * immediate I (same encoding as SSE2 _mm_shuffle_epi32, applied per lane). */
#define _mm512_shuffle_epi32(A, I) \
  ((__m512i)__builtin_ia32_pshufd512((__v16si)(__m512i)(A), (int)(I)))

/* Merge-masking form: lanes with a 0 bit in U keep the value from W. */
#define _mm512_mask_shuffle_epi32(W, U, A, I) \
  ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
                                       (__v16si)_mm512_shuffle_epi32((A), (I)), \
                                       (__v16si)(__m512i)(W)))

/* Zero-masking form: lanes with a 0 bit in U are set to zero. */
#define _mm512_maskz_shuffle_epi32(U, A, I) \
  ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
                                       (__v16si)_mm512_shuffle_epi32((A), (I)), \
                                       (__v16si)_mm512_setzero_si512()))
8800
/* Expand: scatter the low contiguous elements of __A into the result lanes
 * whose bits are set in __U; other lanes keep the value from __W. */
static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_expand_pd (__m512d __W, __mmask8 __U, __m512d __A)
{
  return (__m512d) __builtin_ia32_expanddf512_mask ((__v8df) __A,
                (__v8df) __W,
                (__mmask8) __U);
}

/* Zero-masking expand: unselected lanes are set to zero. */
static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_maskz_expand_pd (__mmask8 __U, __m512d __A)
{
  return (__m512d) __builtin_ia32_expanddf512_mask ((__v8df) __A,
                (__v8df) _mm512_setzero_pd (),
                (__mmask8) __U);
}

/* Expand 64-bit integer elements; unselected lanes keep __W. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_expand_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
{
  return (__m512i) __builtin_ia32_expanddi512_mask ((__v8di) __A,
                (__v8di) __W,
                (__mmask8) __U);
}

/* Expand 64-bit integer elements; unselected lanes are zeroed. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_expand_epi64 ( __mmask8 __U, __m512i __A)
{
  return (__m512i) __builtin_ia32_expanddi512_mask ((__v8di) __A,
                (__v8di) _mm512_setzero_si512 (),
                (__mmask8) __U);
}

/* Expanding load: read one contiguous (unaligned) element from *__P for
 * each set bit of __U, placing it in the corresponding lane; other lanes
 * keep the value from __W. */
static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_expandloadu_pd(__m512d __W, __mmask8 __U, void const *__P)
{
  return (__m512d) __builtin_ia32_expandloaddf512_mask ((const __v8df *)__P,
              (__v8df) __W,
              (__mmask8) __U);
}

/* Zero-masking expanding load of double elements. */
static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_maskz_expandloadu_pd(__mmask8 __U, void const *__P)
{
  return (__m512d) __builtin_ia32_expandloaddf512_mask ((const __v8df *)__P,
              (__v8df) _mm512_setzero_pd(),
              (__mmask8) __U);
}

/* Expanding load of 64-bit integer elements; unselected lanes keep __W. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_expandloadu_epi64(__m512i __W, __mmask8 __U, void const *__P)
{
  return (__m512i) __builtin_ia32_expandloaddi512_mask ((const __v8di *)__P,
              (__v8di) __W,
              (__mmask8) __U);
}

/* Expanding load of 64-bit integer elements; unselected lanes are zeroed. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_expandloadu_epi64(__mmask8 __U, void const *__P)
{
  return (__m512i) __builtin_ia32_expandloaddi512_mask ((const __v8di *)__P,
              (__v8di) _mm512_setzero_si512(),
              (__mmask8) __U);
}

/* Expanding load of float elements; unselected lanes keep __W. */
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_expandloadu_ps(__m512 __W, __mmask16 __U, void const *__P)
{
  return (__m512) __builtin_ia32_expandloadsf512_mask ((const __v16sf *)__P,
                   (__v16sf) __W,
                   (__mmask16) __U);
}

/* Expanding load of float elements; unselected lanes are zeroed. */
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_expandloadu_ps(__mmask16 __U, void const *__P)
{
  return (__m512) __builtin_ia32_expandloadsf512_mask ((const __v16sf *)__P,
                   (__v16sf) _mm512_setzero_ps(),
                   (__mmask16) __U);
}

/* Expanding load of 32-bit integer elements; unselected lanes keep __W. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_expandloadu_epi32(__m512i __W, __mmask16 __U, void const *__P)
{
  return (__m512i) __builtin_ia32_expandloadsi512_mask ((const __v16si *)__P,
              (__v16si) __W,
              (__mmask16) __U);
}

/* Expanding load of 32-bit integer elements; unselected lanes are zeroed. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_expandloadu_epi32(__mmask16 __U, void const *__P)
{
  return (__m512i) __builtin_ia32_expandloadsi512_mask ((const __v16si *)__P,
              (__v16si) _mm512_setzero_si512(),
              (__mmask16) __U);
}

/* Expand float elements of __A; unselected lanes keep __W. */
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_expand_ps (__m512 __W, __mmask16 __U, __m512 __A)
{
  return (__m512) __builtin_ia32_expandsf512_mask ((__v16sf) __A,
               (__v16sf) __W,
               (__mmask16) __U);
}

/* Expand float elements of __A; unselected lanes are zeroed. */
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_expand_ps (__mmask16 __U, __m512 __A)
{
  return (__m512) __builtin_ia32_expandsf512_mask ((__v16sf) __A,
               (__v16sf) _mm512_setzero_ps(),
               (__mmask16) __U);
}

/* Expand 32-bit integer elements of __A; unselected lanes keep __W. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_expand_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
{
  return (__m512i) __builtin_ia32_expandsi512_mask ((__v16si) __A,
                (__v16si) __W,
                (__mmask16) __U);
}

/* Expand 32-bit integer elements of __A; unselected lanes are zeroed. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_expand_epi32 (__mmask16 __U, __m512i __A)
{
  return (__m512i) __builtin_ia32_expandsi512_mask ((__v16si) __A,
                (__v16si) _mm512_setzero_si512(),
                (__mmask16) __U);
}
8928
/* Convert 8 single-precision elements of A to double precision with
 * exception control R (conversion to double is exact; R controls SAE). */
#define _mm512_cvt_roundps_pd(A, R) \
  ((__m512d)__builtin_ia32_cvtps2pd512_mask((__v8sf)(__m256)(A), \
                                            (__v8df)_mm512_undefined_pd(), \
                                            (__mmask8)-1, (int)(R)))

/* Merge-masking form: lanes with a 0 bit in U keep the value from W. */
#define _mm512_mask_cvt_roundps_pd(W, U, A, R) \
  ((__m512d)__builtin_ia32_cvtps2pd512_mask((__v8sf)(__m256)(A), \
                                            (__v8df)(__m512d)(W), \
                                            (__mmask8)(U), (int)(R)))

/* Zero-masking form: lanes with a 0 bit in U are set to zero. */
#define _mm512_maskz_cvt_roundps_pd(U, A, R) \
  ((__m512d)__builtin_ia32_cvtps2pd512_mask((__v8sf)(__m256)(A), \
                                            (__v8df)_mm512_setzero_pd(), \
                                            (__mmask8)(U), (int)(R)))
8943
/* Widen 8 single-precision elements to double precision (exact, no
 * rounding needed). */
static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_cvtps_pd (__m256 __A)
{
  return (__m512d) __builtin_convertvector((__v8sf)__A, __v8df);
}

/* Merge-masking form: lanes with a 0 bit in __U keep the value from __W. */
static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_cvtps_pd (__m512d __W, __mmask8 __U, __m256 __A)
{
  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
                                              (__v8df)_mm512_cvtps_pd(__A),
                                              (__v8df)__W);
}

/* Zero-masking form: lanes with a 0 bit in __U are set to zero. */
static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtps_pd (__mmask8 __U, __m256 __A)
{
  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
                                              (__v8df)_mm512_cvtps_pd(__A),
                                              (__v8df)_mm512_setzero_pd());
}

/* Convert the lower 8 single-precision elements of a 512-bit vector to
 * double precision. */
static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_cvtpslo_pd (__m512 __A)
{
  return (__m512d) _mm512_cvtps_pd(_mm512_castps512_ps256(__A));
}

/* Merge-masking form of _mm512_cvtpslo_pd. */
static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_cvtpslo_pd (__m512d __W, __mmask8 __U, __m512 __A)
{
  return (__m512d) _mm512_mask_cvtps_pd(__W, __U, _mm512_castps512_ps256(__A));
}
8977
/* Per-lane select: result lane i is __A[i] when bit i of __U is set,
 * otherwise __W[i]. */
static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_mov_pd (__m512d __W, __mmask8 __U, __m512d __A)
{
  return (__m512d) __builtin_ia32_selectpd_512 ((__mmask8) __U,
              (__v8df) __A,
              (__v8df) __W);
}

/* Per-lane select against zero: result lane i is __A[i] when bit i of __U
 * is set, otherwise 0.0. */
static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_maskz_mov_pd (__mmask8 __U, __m512d __A)
{
  return (__m512d) __builtin_ia32_selectpd_512 ((__mmask8) __U,
              (__v8df) __A,
              (__v8df) _mm512_setzero_pd ());
}

/* Per-lane select: result lane i is __A[i] when bit i of __U is set,
 * otherwise __W[i]. */
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_mov_ps (__m512 __W, __mmask16 __U, __m512 __A)
{
  return (__m512) __builtin_ia32_selectps_512 ((__mmask16) __U,
             (__v16sf) __A,
             (__v16sf) __W);
}

/* Per-lane select against zero: result lane i is __A[i] when bit i of __U
 * is set, otherwise 0.0f. */
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_mov_ps (__mmask16 __U, __m512 __A)
{
  return (__m512) __builtin_ia32_selectps_512 ((__mmask16) __U,
             (__v16sf) __A,
             (__v16sf) _mm512_setzero_ps ());
}
9009
/* Compressing store: write the lanes of __A whose bits are set in __U
 * contiguously to unaligned memory at __P; nothing is written for clear
 * bits. */
static __inline__ void __DEFAULT_FN_ATTRS512
_mm512_mask_compressstoreu_pd (void *__P, __mmask8 __U, __m512d __A)
{
  __builtin_ia32_compressstoredf512_mask ((__v8df *) __P, (__v8df) __A,
            (__mmask8) __U);
}

/* Compressing store of selected 64-bit integer elements to *__P. */
static __inline__ void __DEFAULT_FN_ATTRS512
_mm512_mask_compressstoreu_epi64 (void *__P, __mmask8 __U, __m512i __A)
{
  __builtin_ia32_compressstoredi512_mask ((__v8di *) __P, (__v8di) __A,
            (__mmask8) __U);
}

/* Compressing store of selected float elements to *__P. */
static __inline__ void __DEFAULT_FN_ATTRS512
_mm512_mask_compressstoreu_ps (void *__P, __mmask16 __U, __m512 __A)
{
  __builtin_ia32_compressstoresf512_mask ((__v16sf *) __P, (__v16sf) __A,
            (__mmask16) __U);
}

/* Compressing store of selected 32-bit integer elements to *__P. */
static __inline__ void __DEFAULT_FN_ATTRS512
_mm512_mask_compressstoreu_epi32 (void *__P, __mmask16 __U, __m512i __A)
{
  __builtin_ia32_compressstoresi512_mask ((__v16si *) __P, (__v16si) __A,
            (__mmask16) __U);
}
9037
/* Convert the low double element of B to single precision with rounding
 * control R; the upper 3 result elements come from A. */
#define _mm_cvt_roundsd_ss(A, B, R) \
  ((__m128)__builtin_ia32_cvtsd2ss_round_mask((__v4sf)(__m128)(A), \
                                              (__v2df)(__m128d)(B), \
                                              (__v4sf)_mm_undefined_ps(), \
                                              (__mmask8)-1, (int)(R)))

/* Merge-masking form: element 0 comes from W when bit 0 of U is clear. */
#define _mm_mask_cvt_roundsd_ss(W, U, A, B, R) \
  ((__m128)__builtin_ia32_cvtsd2ss_round_mask((__v4sf)(__m128)(A), \
                                              (__v2df)(__m128d)(B), \
                                              (__v4sf)(__m128)(W), \
                                              (__mmask8)(U), (int)(R)))

/* Zero-masking form: element 0 is zeroed when bit 0 of U is clear. */
#define _mm_maskz_cvt_roundsd_ss(U, A, B, R) \
  ((__m128)__builtin_ia32_cvtsd2ss_round_mask((__v4sf)(__m128)(A), \
                                              (__v2df)(__m128d)(B), \
                                              (__v4sf)_mm_setzero_ps(), \
                                              (__mmask8)(U), (int)(R)))

/* Function form of _mm_mask_cvt_roundsd_ss under the current rounding
 * mode. */
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_mask_cvtsd_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128d __B)
{
  return __builtin_ia32_cvtsd2ss_round_mask ((__v4sf)__A,
                                             (__v2df)__B,
                                             (__v4sf)__W,
                                             (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
}

/* Function form of _mm_maskz_cvt_roundsd_ss under the current rounding
 * mode. */
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_maskz_cvtsd_ss (__mmask8 __U, __m128 __A, __m128d __B)
{
  return __builtin_ia32_cvtsd2ss_round_mask ((__v4sf)__A,
                                             (__v2df)__B,
                                             (__v4sf)_mm_setzero_ps(),
                                             (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
}
9073
/* Intel-documented _i32/_i64 spellings are aliases of the classic _si32/
 * _si64 scalar conversion intrinsics. */
#define _mm_cvtss_i32 _mm_cvtss_si32
#define _mm_cvtsd_i32 _mm_cvtsd_si32
#define _mm_cvti32_sd _mm_cvtsi32_sd
#define _mm_cvti32_ss _mm_cvtsi32_ss
#ifdef __x86_64__
#define _mm_cvtss_i64 _mm_cvtss_si64
#define _mm_cvtsd_i64 _mm_cvtsd_si64
#define _mm_cvti64_sd _mm_cvtsi64_sd
#define _mm_cvti64_ss _mm_cvtsi64_ss
#endif
9084
#ifdef __x86_64__
/* Convert signed 64-bit integer B to double with rounding control R and
 * insert it into the low element of A (i64/si64 are equivalent spellings). */
#define _mm_cvt_roundi64_sd(A, B, R) \
  ((__m128d)__builtin_ia32_cvtsi2sd64((__v2df)(__m128d)(A), (long long)(B), \
                                      (int)(R)))

#define _mm_cvt_roundsi64_sd(A, B, R) \
  ((__m128d)__builtin_ia32_cvtsi2sd64((__v2df)(__m128d)(A), (long long)(B), \
                                      (int)(R)))
#endif

/* Convert signed 32-bit integer B to float with rounding control R and
 * insert it into the low element of A (i32/si32 are equivalent spellings). */
#define _mm_cvt_roundsi32_ss(A, B, R) \
  ((__m128)__builtin_ia32_cvtsi2ss32((__v4sf)(__m128)(A), (int)(B), (int)(R)))

#define _mm_cvt_roundi32_ss(A, B, R) \
  ((__m128)__builtin_ia32_cvtsi2ss32((__v4sf)(__m128)(A), (int)(B), (int)(R)))

#ifdef __x86_64__
/* Convert signed 64-bit integer B to float with rounding control R and
 * insert it into the low element of A. */
#define _mm_cvt_roundsi64_ss(A, B, R) \
  ((__m128)__builtin_ia32_cvtsi2ss64((__v4sf)(__m128)(A), (long long)(B), \
                                     (int)(R)))

#define _mm_cvt_roundi64_ss(A, B, R) \
  ((__m128)__builtin_ia32_cvtsi2ss64((__v4sf)(__m128)(A), (long long)(B), \
                                     (int)(R)))
#endif
9110
/* Convert the low float element of B to double precision; the upper result
 * element comes from A.  R provides SAE control (widening is exact). */
#define _mm_cvt_roundss_sd(A, B, R) \
  ((__m128d)__builtin_ia32_cvtss2sd_round_mask((__v2df)(__m128d)(A), \
                                               (__v4sf)(__m128)(B), \
                                               (__v2df)_mm_undefined_pd(), \
                                               (__mmask8)-1, (int)(R)))

/* Merge-masking form: element 0 comes from W when bit 0 of U is clear. */
#define _mm_mask_cvt_roundss_sd(W, U, A, B, R) \
  ((__m128d)__builtin_ia32_cvtss2sd_round_mask((__v2df)(__m128d)(A), \
                                               (__v4sf)(__m128)(B), \
                                               (__v2df)(__m128d)(W), \
                                               (__mmask8)(U), (int)(R)))

/* Zero-masking form: element 0 is zeroed when bit 0 of U is clear. */
#define _mm_maskz_cvt_roundss_sd(U, A, B, R) \
  ((__m128d)__builtin_ia32_cvtss2sd_round_mask((__v2df)(__m128d)(A), \
                                               (__v4sf)(__m128)(B), \
                                               (__v2df)_mm_setzero_pd(), \
                                               (__mmask8)(U), (int)(R)))

/* Function form of _mm_mask_cvt_roundss_sd under the current mode. */
static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_mask_cvtss_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128 __B)
{
  return __builtin_ia32_cvtss2sd_round_mask((__v2df)__A,
                                            (__v4sf)__B,
                                            (__v2df)__W,
                                            (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
}

/* Function form of _mm_maskz_cvt_roundss_sd under the current mode. */
static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_maskz_cvtss_sd (__mmask8 __U, __m128d __A, __m128 __B)
{
  return __builtin_ia32_cvtss2sd_round_mask((__v2df)__A,
                                            (__v4sf)__B,
                                            (__v2df)_mm_setzero_pd(),
                                            (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
}
9146
9147static __inline__ __m128d __DEFAULT_FN_ATTRS128
9148_mm_cvtu32_sd (__m128d __A, unsigned __B)
9149{
9150  __A[0] = __B;
9151  return __A;
9152}
9153
#ifdef __x86_64__
/* Convert unsigned 64-bit integer B to double with rounding control R and
 * insert it into the low element of A. */
#define _mm_cvt_roundu64_sd(A, B, R) \
  ((__m128d)__builtin_ia32_cvtusi2sd64((__v2df)(__m128d)(A), \
                                       (unsigned long long)(B), (int)(R)))
9158
9159static __inline__ __m128d __DEFAULT_FN_ATTRS128
9160_mm_cvtu64_sd (__m128d __A, unsigned long long __B)
9161{
9162  __A[0] = __B;
9163  return __A;
9164}
9165#endif
9166
/* Convert unsigned 32-bit integer B to float with rounding control R and
 * insert it into the low element of A. */
#define _mm_cvt_roundu32_ss(A, B, R) \
  ((__m128)__builtin_ia32_cvtusi2ss32((__v4sf)(__m128)(A), (unsigned int)(B), \
                                      (int)(R)))
9170
9171static __inline__ __m128 __DEFAULT_FN_ATTRS128
9172_mm_cvtu32_ss (__m128 __A, unsigned __B)
9173{
9174  __A[0] = __B;
9175  return __A;
9176}
9177
#ifdef __x86_64__
/* Convert unsigned 64-bit integer B to float with rounding control R and
 * insert it into the low element of A. */
#define _mm_cvt_roundu64_ss(A, B, R) \
  ((__m128)__builtin_ia32_cvtusi2ss64((__v4sf)(__m128)(A), \
                                      (unsigned long long)(B), (int)(R)))
9182
9183static __inline__ __m128 __DEFAULT_FN_ATTRS128
9184_mm_cvtu64_ss (__m128 __A, unsigned long long __B)
9185{
9186  __A[0] = __B;
9187  return __A;
9188}
9189#endif
9190
/* Broadcast __A into the lanes selected by __M; unselected lanes keep the
 * value from __O. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_set1_epi32 (__m512i __O, __mmask16 __M, int __A)
{
  return (__m512i) __builtin_ia32_selectd_512(__M,
                                              (__v16si) _mm512_set1_epi32(__A),
                                              (__v16si) __O);
}

/* Broadcast __A into the 64-bit lanes selected by __M; unselected lanes
 * keep the value from __O. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_set1_epi64 (__m512i __O, __mmask8 __M, long long __A)
{
  return (__m512i) __builtin_ia32_selectq_512(__M,
                                              (__v8di) _mm512_set1_epi64(__A),
                                              (__v8di) __O);
}
9206
/* Build a 512-bit vector from 64 bytes.  Arguments are given most-
 * significant element first (__e63 down to __e0), so the initializer lists
 * them in reverse — element 0 of the vector is __e0. */
static  __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_set_epi8 (char __e63, char __e62, char __e61, char __e60, char __e59,
    char __e58, char __e57, char __e56, char __e55, char __e54, char __e53,
    char __e52, char __e51, char __e50, char __e49, char __e48, char __e47,
    char __e46, char __e45, char __e44, char __e43, char __e42, char __e41,
    char __e40, char __e39, char __e38, char __e37, char __e36, char __e35,
    char __e34, char __e33, char __e32, char __e31, char __e30, char __e29,
    char __e28, char __e27, char __e26, char __e25, char __e24, char __e23,
    char __e22, char __e21, char __e20, char __e19, char __e18, char __e17,
    char __e16, char __e15, char __e14, char __e13, char __e12, char __e11,
    char __e10, char __e9, char __e8, char __e7, char __e6, char __e5,
    char __e4, char __e3, char __e2, char __e1, char __e0) {

  return __extension__ (__m512i)(__v64qi)
    {__e0, __e1, __e2, __e3, __e4, __e5, __e6, __e7,
     __e8, __e9, __e10, __e11, __e12, __e13, __e14, __e15,
     __e16, __e17, __e18, __e19, __e20, __e21, __e22, __e23,
     __e24, __e25, __e26, __e27, __e28, __e29, __e30, __e31,
     __e32, __e33, __e34, __e35, __e36, __e37, __e38, __e39,
     __e40, __e41, __e42, __e43, __e44, __e45, __e46, __e47,
     __e48, __e49, __e50, __e51, __e52, __e53, __e54, __e55,
     __e56, __e57, __e58, __e59, __e60, __e61, __e62, __e63};
}

/* Build a 512-bit vector from 32 16-bit values, most-significant element
 * first. */
static  __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_set_epi16(short __e31, short __e30, short __e29, short __e28,
    short __e27, short __e26, short __e25, short __e24, short __e23,
    short __e22, short __e21, short __e20, short __e19, short __e18,
    short __e17, short __e16, short __e15, short __e14, short __e13,
    short __e12, short __e11, short __e10, short __e9, short __e8,
    short __e7, short __e6, short __e5, short __e4, short __e3,
    short __e2, short __e1, short __e0) {
  return __extension__ (__m512i)(__v32hi)
    {__e0, __e1, __e2, __e3, __e4, __e5, __e6, __e7,
     __e8, __e9, __e10, __e11, __e12, __e13, __e14, __e15,
     __e16, __e17, __e18, __e19, __e20, __e21, __e22, __e23,
     __e24, __e25, __e26, __e27, __e28, __e29, __e30, __e31 };
}

/* Build a 512-bit vector from 16 32-bit values; __A is the most-significant
 * element (__P becomes element 0). */
static __inline __m512i __DEFAULT_FN_ATTRS512
_mm512_set_epi32 (int __A, int __B, int __C, int __D,
     int __E, int __F, int __G, int __H,
     int __I, int __J, int __K, int __L,
     int __M, int __N, int __O, int __P)
{
  return __extension__ (__m512i)(__v16si)
  { __P, __O, __N, __M, __L, __K, __J, __I,
    __H, __G, __F, __E, __D, __C, __B, __A };
}

/* Reversed-argument convenience wrapper: e0 is element 0. */
#define _mm512_setr_epi32(e0,e1,e2,e3,e4,e5,e6,e7,           \
       e8,e9,e10,e11,e12,e13,e14,e15)          \
  _mm512_set_epi32((e15),(e14),(e13),(e12),(e11),(e10),(e9),(e8),(e7),(e6), \
                   (e5),(e4),(e3),(e2),(e1),(e0))

/* Build a 512-bit vector from 8 64-bit values; __A is the most-significant
 * element (__H becomes element 0). */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_set_epi64 (long long __A, long long __B, long long __C,
     long long __D, long long __E, long long __F,
     long long __G, long long __H)
{
  return __extension__ (__m512i) (__v8di)
  { __H, __G, __F, __E, __D, __C, __B, __A };
}

/* Reversed-argument convenience wrapper: e0 is element 0. */
#define _mm512_setr_epi64(e0,e1,e2,e3,e4,e5,e6,e7)           \
  _mm512_set_epi64((e7),(e6),(e5),(e4),(e3),(e2),(e1),(e0))

/* Build a 512-bit vector from 8 doubles; __A is the most-significant
 * element (__H becomes element 0). */
static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_set_pd (double __A, double __B, double __C, double __D,
        double __E, double __F, double __G, double __H)
{
  return __extension__ (__m512d)
  { __H, __G, __F, __E, __D, __C, __B, __A };
}

/* Reversed-argument convenience wrapper: e0 is element 0. */
#define _mm512_setr_pd(e0,e1,e2,e3,e4,e5,e6,e7)              \
  _mm512_set_pd((e7),(e6),(e5),(e4),(e3),(e2),(e1),(e0))

/* Build a 512-bit vector from 16 floats; __A is the most-significant
 * element (__P becomes element 0). */
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_set_ps (float __A, float __B, float __C, float __D,
        float __E, float __F, float __G, float __H,
        float __I, float __J, float __K, float __L,
        float __M, float __N, float __O, float __P)
{
  return __extension__ (__m512)
  { __P, __O, __N, __M, __L, __K, __J, __I,
    __H, __G, __F, __E, __D, __C, __B, __A };
}

/* Reversed-argument convenience wrapper: e0 is element 0. */
#define _mm512_setr_ps(e0,e1,e2,e3,e4,e5,e6,e7,e8,e9,e10,e11,e12,e13,e14,e15) \
  _mm512_set_ps((e15),(e14),(e13),(e12),(e11),(e10),(e9),(e8),(e7),(e6),(e5), \
                (e4),(e3),(e2),(e1),(e0))
9299
/* Absolute value of 16 floats, implemented by clearing each element's sign
 * bit with an integer AND against 0x7FFFFFFF. */
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_abs_ps(__m512 __A)
{
  return (__m512)_mm512_and_epi32(_mm512_set1_epi32(0x7FFFFFFF),(__m512i)__A) ;
}

/* Merge-masking form: lanes with a 0 bit in __K keep the value from __W. */
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_abs_ps(__m512 __W, __mmask16 __K, __m512 __A)
{
  return (__m512)_mm512_mask_and_epi32((__m512i)__W, __K, _mm512_set1_epi32(0x7FFFFFFF),(__m512i)__A) ;
}

/* Absolute value of 8 doubles, implemented by clearing each element's sign
 * bit with an integer AND against 0x7FFFFFFFFFFFFFFF. */
static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_abs_pd(__m512d __A)
{
  return (__m512d)_mm512_and_epi64(_mm512_set1_epi64(0x7FFFFFFFFFFFFFFF),(__v8di)__A) ;
}

/* Merge-masking form: lanes with a 0 bit in __K keep the value from __W. */
static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_abs_pd(__m512d __W, __mmask8 __K, __m512d __A)
{
  return (__m512d)_mm512_mask_and_epi64((__v8di)__W, __K, _mm512_set1_epi64(0x7FFFFFFFFFFFFFFF),(__v8di)__A);
}
9323
/* Vector-reduction arithmetic accepts vectors as inputs and produces scalars as
 * outputs. This class of vector operation forms the basis of many scientific
 * computations. In vector-reduction arithmetic, the evaluation order is
 * independent of the order of the input elements of V.
 *
 * For floating-point intrinsics:
 * 1. When using fadd/fmul intrinsics, the order of operations within the
 * vector is unspecified (associative math).
 * 2. When using fmin/fmax intrinsics, NaN or -0.0 elements within the vector
 * produce unspecified results.
 *
 * We use the bisection method: at each step, the vector from the previous
 * step is partitioned in half, and the operation is performed on its two
 * halves. This takes log2(n) steps, where n is the number of elements in the
 * vector.
 */
9339
/* Horizontal sum of the eight 64-bit elements of __W. */
static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_add_epi64(__m512i __W) {
  return __builtin_reduce_add((__v8di)__W);
}

/* Horizontal product of the eight 64-bit elements of __W. */
static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_mul_epi64(__m512i __W) {
  return __builtin_reduce_mul((__v8di)__W);
}

/* Horizontal bitwise AND of the eight 64-bit elements of __W. */
static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_and_epi64(__m512i __W) {
  return __builtin_reduce_and((__v8di)__W);
}

/* Horizontal bitwise OR of the eight 64-bit elements of __W. */
static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_or_epi64(__m512i __W) {
  return __builtin_reduce_or((__v8di)__W);
}

/* Masked reductions replace unselected lanes with the operation's identity
 * value before reducing: 0 for add/or, 1 for mul, all-ones for and. */
static __inline__ long long __DEFAULT_FN_ATTRS512
_mm512_mask_reduce_add_epi64(__mmask8 __M, __m512i __W) {
  __W = _mm512_maskz_mov_epi64(__M, __W);
  return __builtin_reduce_add((__v8di)__W);
}

static __inline__ long long __DEFAULT_FN_ATTRS512
_mm512_mask_reduce_mul_epi64(__mmask8 __M, __m512i __W) {
  __W = _mm512_mask_mov_epi64(_mm512_set1_epi64(1), __M, __W);
  return __builtin_reduce_mul((__v8di)__W);
}

static __inline__ long long __DEFAULT_FN_ATTRS512
_mm512_mask_reduce_and_epi64(__mmask8 __M, __m512i __W) {
  __W = _mm512_mask_mov_epi64(_mm512_set1_epi64(-1LL), __M, __W);
  return __builtin_reduce_and((__v8di)__W);
}

static __inline__ long long __DEFAULT_FN_ATTRS512
_mm512_mask_reduce_or_epi64(__mmask8 __M, __m512i __W) {
  __W = _mm512_maskz_mov_epi64(__M, __W);
  return __builtin_reduce_or((__v8di)__W);
}
9379
// -0.0 is used to ignore the start value since it is the neutral value of
// floating point addition. For more information, please refer to
// https://llvm.org/docs/LangRef.html#llvm-vector-reduce-fadd-intrinsic
/* Horizontal sum of the eight double elements of __W (unordered fadd). */
static __inline__ double __DEFAULT_FN_ATTRS512 _mm512_reduce_add_pd(__m512d __W) {
  return __builtin_ia32_reduce_fadd_pd512(-0.0, __W);
}

/* Horizontal product of the eight double elements of __W; 1.0 is the
 * multiplicative identity start value. */
static __inline__ double __DEFAULT_FN_ATTRS512 _mm512_reduce_mul_pd(__m512d __W) {
  return __builtin_ia32_reduce_fmul_pd512(1.0, __W);
}

/* Masked sum: unselected lanes are zeroed (additive identity) first. */
static __inline__ double __DEFAULT_FN_ATTRS512
_mm512_mask_reduce_add_pd(__mmask8 __M, __m512d __W) {
  __W = _mm512_maskz_mov_pd(__M, __W);
  return __builtin_ia32_reduce_fadd_pd512(-0.0, __W);
}

/* Masked product: unselected lanes are set to 1.0 (multiplicative
 * identity) first. */
static __inline__ double __DEFAULT_FN_ATTRS512
_mm512_mask_reduce_mul_pd(__mmask8 __M, __m512d __W) {
  __W = _mm512_mask_mov_pd(_mm512_set1_pd(1.0), __M, __W);
  return __builtin_ia32_reduce_fmul_pd512(1.0, __W);
}
9402
9403static __inline__ int __DEFAULT_FN_ATTRS512
9404_mm512_reduce_add_epi32(__m512i __W) {
9405  return __builtin_reduce_add((__v16si)__W);
9406}
9407
9408static __inline__ int __DEFAULT_FN_ATTRS512
9409_mm512_reduce_mul_epi32(__m512i __W) {
9410  return __builtin_reduce_mul((__v16si)__W);
9411}
9412
9413static __inline__ int __DEFAULT_FN_ATTRS512
9414_mm512_reduce_and_epi32(__m512i __W) {
9415  return __builtin_reduce_and((__v16si)__W);
9416}
9417
9418static __inline__ int __DEFAULT_FN_ATTRS512
9419_mm512_reduce_or_epi32(__m512i __W) {
9420  return __builtin_reduce_or((__v16si)__W);
9421}
9422
9423static __inline__ int __DEFAULT_FN_ATTRS512
9424_mm512_mask_reduce_add_epi32( __mmask16 __M, __m512i __W) {
9425  __W = _mm512_maskz_mov_epi32(__M, __W);
9426  return __builtin_reduce_add((__v16si)__W);
9427}
9428
9429static __inline__ int __DEFAULT_FN_ATTRS512
9430_mm512_mask_reduce_mul_epi32( __mmask16 __M, __m512i __W) {
9431  __W = _mm512_mask_mov_epi32(_mm512_set1_epi32(1), __M, __W);
9432  return __builtin_reduce_mul((__v16si)__W);
9433}
9434
9435static __inline__ int __DEFAULT_FN_ATTRS512
9436_mm512_mask_reduce_and_epi32( __mmask16 __M, __m512i __W) {
9437  __W = _mm512_mask_mov_epi32(_mm512_set1_epi32(-1), __M, __W);
9438  return __builtin_reduce_and((__v16si)__W);
9439}
9440
9441static __inline__ int __DEFAULT_FN_ATTRS512
9442_mm512_mask_reduce_or_epi32(__mmask16 __M, __m512i __W) {
9443  __W = _mm512_maskz_mov_epi32(__M, __W);
9444  return __builtin_reduce_or((__v16si)__W);
9445}
9446
9447static __inline__ float __DEFAULT_FN_ATTRS512
9448_mm512_reduce_add_ps(__m512 __W) {
9449  return __builtin_ia32_reduce_fadd_ps512(-0.0f, __W);
9450}
9451
9452static __inline__ float __DEFAULT_FN_ATTRS512
9453_mm512_reduce_mul_ps(__m512 __W) {
9454  return __builtin_ia32_reduce_fmul_ps512(1.0f, __W);
9455}
9456
9457static __inline__ float __DEFAULT_FN_ATTRS512
9458_mm512_mask_reduce_add_ps(__mmask16 __M, __m512 __W) {
9459  __W = _mm512_maskz_mov_ps(__M, __W);
9460  return __builtin_ia32_reduce_fadd_ps512(-0.0f, __W);
9461}
9462
9463static __inline__ float __DEFAULT_FN_ATTRS512
9464_mm512_mask_reduce_mul_ps(__mmask16 __M, __m512 __W) {
9465  __W = _mm512_mask_mov_ps(_mm512_set1_ps(1.0f), __M, __W);
9466  return __builtin_ia32_reduce_fmul_ps512(1.0f, __W);
9467}
9468
9469static __inline__ long long __DEFAULT_FN_ATTRS512
9470_mm512_reduce_max_epi64(__m512i __V) {
9471  return __builtin_reduce_max((__v8di)__V);
9472}
9473
9474static __inline__ unsigned long long __DEFAULT_FN_ATTRS512
9475_mm512_reduce_max_epu64(__m512i __V) {
9476  return __builtin_reduce_max((__v8du)__V);
9477}
9478
9479static __inline__ long long __DEFAULT_FN_ATTRS512
9480_mm512_reduce_min_epi64(__m512i __V) {
9481  return __builtin_reduce_min((__v8di)__V);
9482}
9483
9484static __inline__ unsigned long long __DEFAULT_FN_ATTRS512
9485_mm512_reduce_min_epu64(__m512i __V) {
9486  return __builtin_reduce_min((__v8du)__V);
9487}
9488
9489static __inline__ long long __DEFAULT_FN_ATTRS512
9490_mm512_mask_reduce_max_epi64(__mmask8 __M, __m512i __V) {
9491  __V = _mm512_mask_mov_epi64(_mm512_set1_epi64(-__LONG_LONG_MAX__ - 1LL), __M, __V);
9492  return __builtin_reduce_max((__v8di)__V);
9493}
9494
9495static __inline__ unsigned long long __DEFAULT_FN_ATTRS512
9496_mm512_mask_reduce_max_epu64(__mmask8 __M, __m512i __V) {
9497  __V = _mm512_maskz_mov_epi64(__M, __V);
9498  return __builtin_reduce_max((__v8du)__V);
9499}
9500
9501static __inline__ long long __DEFAULT_FN_ATTRS512
9502_mm512_mask_reduce_min_epi64(__mmask8 __M, __m512i __V) {
9503  __V = _mm512_mask_mov_epi64(_mm512_set1_epi64(__LONG_LONG_MAX__), __M, __V);
9504  return __builtin_reduce_min((__v8di)__V);
9505}
9506
9507static __inline__ unsigned long long __DEFAULT_FN_ATTRS512
9508_mm512_mask_reduce_min_epu64(__mmask8 __M, __m512i __V) {
9509  __V = _mm512_mask_mov_epi64(_mm512_set1_epi64(-1LL), __M, __V);
9510  return __builtin_reduce_min((__v8du)__V);
9511}
9512static __inline__ int __DEFAULT_FN_ATTRS512
9513_mm512_reduce_max_epi32(__m512i __V) {
9514  return __builtin_reduce_max((__v16si)__V);
9515}
9516
9517static __inline__ unsigned int __DEFAULT_FN_ATTRS512
9518_mm512_reduce_max_epu32(__m512i __V) {
9519  return __builtin_reduce_max((__v16su)__V);
9520}
9521
9522static __inline__ int __DEFAULT_FN_ATTRS512
9523_mm512_reduce_min_epi32(__m512i __V) {
9524  return __builtin_reduce_min((__v16si)__V);
9525}
9526
9527static __inline__ unsigned int __DEFAULT_FN_ATTRS512
9528_mm512_reduce_min_epu32(__m512i __V) {
9529  return __builtin_reduce_min((__v16su)__V);
9530}
9531
9532static __inline__ int __DEFAULT_FN_ATTRS512
9533_mm512_mask_reduce_max_epi32(__mmask16 __M, __m512i __V) {
9534  __V = _mm512_mask_mov_epi32(_mm512_set1_epi32(-__INT_MAX__ - 1), __M, __V);
9535  return __builtin_reduce_max((__v16si)__V);
9536}
9537
9538static __inline__ unsigned int __DEFAULT_FN_ATTRS512
9539_mm512_mask_reduce_max_epu32(__mmask16 __M, __m512i __V) {
9540  __V = _mm512_maskz_mov_epi32(__M, __V);
9541  return __builtin_reduce_max((__v16su)__V);
9542}
9543
9544static __inline__ int __DEFAULT_FN_ATTRS512
9545_mm512_mask_reduce_min_epi32(__mmask16 __M, __m512i __V) {
9546  __V = _mm512_mask_mov_epi32(_mm512_set1_epi32(__INT_MAX__), __M, __V);
9547  return __builtin_reduce_min((__v16si)__V);
9548}
9549
9550static __inline__ unsigned int __DEFAULT_FN_ATTRS512
9551_mm512_mask_reduce_min_epu32(__mmask16 __M, __m512i __V) {
9552  __V = _mm512_mask_mov_epi32(_mm512_set1_epi32(-1), __M, __V);
9553  return __builtin_reduce_min((__v16su)__V);
9554}
9555
9556static __inline__ double __DEFAULT_FN_ATTRS512
9557_mm512_reduce_max_pd(__m512d __V) {
9558  return __builtin_ia32_reduce_fmax_pd512(__V);
9559}
9560
9561static __inline__ double __DEFAULT_FN_ATTRS512
9562_mm512_reduce_min_pd(__m512d __V) {
9563  return __builtin_ia32_reduce_fmin_pd512(__V);
9564}
9565
9566static __inline__ double __DEFAULT_FN_ATTRS512
9567_mm512_mask_reduce_max_pd(__mmask8 __M, __m512d __V) {
9568  __V = _mm512_mask_mov_pd(_mm512_set1_pd(-__builtin_inf()), __M, __V);
9569  return __builtin_ia32_reduce_fmax_pd512(__V);
9570}
9571
9572static __inline__ double __DEFAULT_FN_ATTRS512
9573_mm512_mask_reduce_min_pd(__mmask8 __M, __m512d __V) {
9574  __V = _mm512_mask_mov_pd(_mm512_set1_pd(__builtin_inf()), __M, __V);
9575  return __builtin_ia32_reduce_fmin_pd512(__V);
9576}
9577
9578static __inline__ float __DEFAULT_FN_ATTRS512
9579_mm512_reduce_max_ps(__m512 __V) {
9580  return __builtin_ia32_reduce_fmax_ps512(__V);
9581}
9582
9583static __inline__ float __DEFAULT_FN_ATTRS512
9584_mm512_reduce_min_ps(__m512 __V) {
9585  return __builtin_ia32_reduce_fmin_ps512(__V);
9586}
9587
9588static __inline__ float __DEFAULT_FN_ATTRS512
9589_mm512_mask_reduce_max_ps(__mmask16 __M, __m512 __V) {
9590  __V = _mm512_mask_mov_ps(_mm512_set1_ps(-__builtin_inff()), __M, __V);
9591  return __builtin_ia32_reduce_fmax_ps512(__V);
9592}
9593
9594static __inline__ float __DEFAULT_FN_ATTRS512
9595_mm512_mask_reduce_min_ps(__mmask16 __M, __m512 __V) {
9596  __V = _mm512_mask_mov_ps(_mm512_set1_ps(__builtin_inff()), __M, __V);
9597  return __builtin_ia32_reduce_fmin_ps512(__V);
9598}
9599
9600/// Moves the least significant 32 bits of a vector of [16 x i32] to a
9601///    32-bit signed integer value.
9602///
9603/// \headerfile <x86intrin.h>
9604///
9605/// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
9606///
9607/// \param __A
9608///    A vector of [16 x i32]. The least significant 32 bits are moved to the
9609///    destination.
9610/// \returns A 32-bit signed integer containing the moved value.
9611static __inline__ int __DEFAULT_FN_ATTRS512
9612_mm512_cvtsi512_si32(__m512i __A) {
9613  __v16si __b = (__v16si)__A;
9614  return __b[0];
9615}
9616
/// Loads 8 double-precision (64-bit) floating-point elements stored at memory
/// locations starting at location \a base_addr at packed 32-bit integer indices
/// stored in the lower half of \a vindex scaled by \a scale and stores them in
/// dst. The upper 256 bits of \a vindex are ignored.
///
/// This intrinsic corresponds to the <c> VGATHERDPD </c> instructions.
///
/// \code{.operation}
/// FOR j := 0 to 7
///   i := j*64
///   m := j*32
///   addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
///   dst[i+63:i] := MEM[addr+63:addr]
/// ENDFOR
/// dst[MAX:512] := 0
/// \endcode
#define _mm512_i32logather_pd(vindex, base_addr, scale)                        \
  _mm512_i32gather_pd(_mm512_castsi512_si256(vindex), (base_addr), (scale))
9634
/// Loads 8 double-precision (64-bit) floating-point elements from memory
/// starting at location \a base_addr at packed 32-bit integer indices stored in
/// the lower half of \a vindex scaled by \a scale into dst using writemask
/// \a mask (elements are copied from \a src when the corresponding mask bit is
/// not set). The upper 256 bits of \a vindex are ignored.
///
/// This intrinsic corresponds to the <c> VGATHERDPD </c> instructions.
///
/// \code{.operation}
/// FOR j := 0 to 7
///   i := j*64
///   m := j*32
///   IF mask[j]
///     addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
///     dst[i+63:i] := MEM[addr+63:addr]
///   ELSE
///     dst[i+63:i] := src[i+63:i]
///   FI
/// ENDFOR
/// dst[MAX:512] := 0
/// \endcode
#define _mm512_mask_i32logather_pd(src, mask, vindex, base_addr, scale)        \
  _mm512_mask_i32gather_pd((src), (mask), _mm512_castsi512_si256(vindex),      \
                           (base_addr), (scale))
9659
/// Loads 8 64-bit integer elements from memory starting at location \a base_addr
/// at packed 32-bit integer indices stored in the lower half of \a vindex
/// scaled by \a scale and stores them in dst. The upper 256 bits of \a vindex
/// are ignored.
///
/// This intrinsic corresponds to the <c> VPGATHERDQ </c> instructions.
///
/// \code{.operation}
/// FOR j := 0 to 7
///   i := j*64
///   m := j*32
///   addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
///   dst[i+63:i] := MEM[addr+63:addr]
/// ENDFOR
/// dst[MAX:512] := 0
/// \endcode
#define _mm512_i32logather_epi64(vindex, base_addr, scale)                     \
  _mm512_i32gather_epi64(_mm512_castsi512_si256(vindex), (base_addr), (scale))
9677
/// Loads 8 64-bit integer elements from memory starting at location \a base_addr
/// at packed 32-bit integer indices stored in the lower half of \a vindex
/// scaled by \a scale and stores them in dst using writemask \a mask (elements
/// are copied from \a src when the corresponding mask bit is not set). The
/// upper 256 bits of \a vindex are ignored.
///
/// This intrinsic corresponds to the <c> VPGATHERDQ </c> instructions.
///
/// \code{.operation}
/// FOR j := 0 to 7
///   i := j*64
///   m := j*32
///   IF mask[j]
///     addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
///     dst[i+63:i] := MEM[addr+63:addr]
///   ELSE
///     dst[i+63:i] := src[i+63:i]
///   FI
/// ENDFOR
/// dst[MAX:512] := 0
/// \endcode
#define _mm512_mask_i32logather_epi64(src, mask, vindex, base_addr, scale)     \
  _mm512_mask_i32gather_epi64((src), (mask), _mm512_castsi512_si256(vindex),   \
                              (base_addr), (scale))
9701
/// Stores 8 packed double-precision (64-bit) floating-point elements in \a v1
/// to memory locations starting at location \a base_addr at packed 32-bit
/// integer indices stored in \a vindex scaled by \a scale. The upper 256 bits
/// of \a vindex are ignored.
///
/// This intrinsic corresponds to the <c> VSCATTERDPD </c> instructions.
///
/// \code{.operation}
/// FOR j := 0 to 7
///   i := j*64
///   m := j*32
///   addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
///   MEM[addr+63:addr] := v1[i+63:i]
/// ENDFOR
/// \endcode
#define _mm512_i32loscatter_pd(base_addr, vindex, v1, scale)                   \
  _mm512_i32scatter_pd((base_addr), _mm512_castsi512_si256(vindex), (v1), (scale))
9718
/// Stores 8 packed double-precision (64-bit) floating-point elements in \a v1
/// to memory locations starting at location \a base_addr at packed 32-bit
/// integer indices stored in \a vindex scaled by \a scale. Only those elements
/// whose corresponding mask bit is set in writemask \a mask are written to
/// memory. The upper 256 bits of \a vindex are ignored.
///
/// This intrinsic corresponds to the <c> VSCATTERDPD </c> instructions.
///
/// \code{.operation}
/// FOR j := 0 to 7
///   i := j*64
///   m := j*32
///   IF mask[j]
///     addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
///     MEM[addr+63:addr] := v1[i+63:i]
///   FI
/// ENDFOR
/// \endcode
#define _mm512_mask_i32loscatter_pd(base_addr, mask, vindex, v1, scale)        \
  _mm512_mask_i32scatter_pd((base_addr), (mask),                               \
                            _mm512_castsi512_si256(vindex), (v1), (scale))
9740
/// Stores the 8 packed 64-bit integer elements in \a v1 to memory locations
/// starting at location \a base_addr at packed 32-bit integer indices stored
/// in \a vindex scaled by \a scale. The upper 256 bits of \a vindex are
/// ignored.
///
/// This intrinsic corresponds to the <c> VPSCATTERDQ </c> instructions.
///
/// \code{.operation}
/// FOR j := 0 to 7
///   i := j*64
///   m := j*32
///   addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
///   MEM[addr+63:addr] := v1[i+63:i]
/// ENDFOR
/// \endcode
#define _mm512_i32loscatter_epi64(base_addr, vindex, v1, scale)                \
  _mm512_i32scatter_epi64((base_addr),                                         \
                          _mm512_castsi512_si256(vindex), (v1), (scale))
9758
/// Stores the 8 packed 64-bit integer elements in \a v1 to memory locations
/// starting at location \a base_addr at packed 32-bit integer indices stored
/// in \a vindex scaled by \a scale using writemask \a mask (elements whose
/// corresponding mask bit is not set are not written to memory). The upper
/// 256 bits of \a vindex are ignored.
///
/// This intrinsic corresponds to the <c> VPSCATTERDQ </c> instructions.
///
/// \code{.operation}
/// FOR j := 0 to 7
///   i := j*64
///   m := j*32
///   IF mask[j]
///     addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
///     MEM[addr+63:addr] := v1[i+63:i]
///   FI
/// ENDFOR
/// \endcode
#define _mm512_mask_i32loscatter_epi64(base_addr, mask, vindex, v1, scale)     \
  _mm512_mask_i32scatter_epi64((base_addr), (mask),                            \
                               _mm512_castsi512_si256(vindex), (v1), (scale))
9779
9780#undef __DEFAULT_FN_ATTRS512
9781#undef __DEFAULT_FN_ATTRS128
9782#undef __DEFAULT_FN_ATTRS
9783#undef __DEFAULT_FN_ATTRS512_CONSTEXPR
9784#undef __DEFAULT_FN_ATTRS128_CONSTEXPR
9785#undef __DEFAULT_FN_ATTRS_CONSTEXPR
9786
9787#endif /* __AVX512FINTRIN_H */