master
   1/*===---------- avx512vlfp16intrin.h - AVX512-FP16 intrinsics --------------===
   2 *
   3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4 * See https://llvm.org/LICENSE.txt for license information.
   5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6 *
   7 *===-----------------------------------------------------------------------===
   8 */
   9#ifndef __IMMINTRIN_H
  10#error                                                                         \
  11    "Never use <avx512vlfp16intrin.h> directly; include <immintrin.h> instead."
  12#endif
  13
  14#ifdef __SSE2__
  15
  16#ifndef __AVX512VLFP16INTRIN_H
  17#define __AVX512VLFP16INTRIN_H
  18
  19/* Define the default attributes for the functions in this file. */
  20#define __DEFAULT_FN_ATTRS256                                                  \
  21  __attribute__((__always_inline__, __nodebug__,                               \
  22                 __target__("avx512fp16,avx512vl,no-evex512"),                 \
  23                 __min_vector_width__(256)))
  24#define __DEFAULT_FN_ATTRS128                                                  \
  25  __attribute__((__always_inline__, __nodebug__,                               \
  26                 __target__("avx512fp16,avx512vl,no-evex512"),                 \
  27                 __min_vector_width__(128)))
  28
  29static __inline__ _Float16 __DEFAULT_FN_ATTRS128 _mm_cvtsh_h(__m128h __a) {
  30  return __a[0];
  31}
  32
  33static __inline__ _Float16 __DEFAULT_FN_ATTRS256 _mm256_cvtsh_h(__m256h __a) {
  34  return __a[0];
  35}
  36
  37static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_set_sh(_Float16 __h) {
  38  return __extension__(__m128h){__h, 0, 0, 0, 0, 0, 0, 0};
  39}
  40
  41static __inline __m128h __DEFAULT_FN_ATTRS128 _mm_set1_ph(_Float16 __h) {
  42  return (__m128h)(__v8hf){__h, __h, __h, __h, __h, __h, __h, __h};
  43}
  44
  45static __inline __m256h __DEFAULT_FN_ATTRS256 _mm256_set1_ph(_Float16 __h) {
  46  return (__m256h)(__v16hf){__h, __h, __h, __h, __h, __h, __h, __h,
  47                            __h, __h, __h, __h, __h, __h, __h, __h};
  48}
  49
  50static __inline __m128h __DEFAULT_FN_ATTRS128
  51_mm_set_ph(_Float16 __h1, _Float16 __h2, _Float16 __h3, _Float16 __h4,
  52           _Float16 __h5, _Float16 __h6, _Float16 __h7, _Float16 __h8) {
  53  return (__m128h)(__v8hf){__h8, __h7, __h6, __h5, __h4, __h3, __h2, __h1};
  54}
  55
  56static __inline __m256h __DEFAULT_FN_ATTRS256
  57_mm256_set1_pch(_Float16 _Complex h) {
  58  return (__m256h)_mm256_set1_ps(__builtin_bit_cast(float, h));
  59}
  60
  61static __inline __m128h __DEFAULT_FN_ATTRS128
  62_mm_set1_pch(_Float16 _Complex h) {
  63  return (__m128h)_mm_set1_ps(__builtin_bit_cast(float, h));
  64}
  65
  66static __inline __m256h __DEFAULT_FN_ATTRS256
  67_mm256_set_ph(_Float16 __h1, _Float16 __h2, _Float16 __h3, _Float16 __h4,
  68              _Float16 __h5, _Float16 __h6, _Float16 __h7, _Float16 __h8,
  69              _Float16 __h9, _Float16 __h10, _Float16 __h11, _Float16 __h12,
  70              _Float16 __h13, _Float16 __h14, _Float16 __h15, _Float16 __h16) {
  71  return (__m256h)(__v16hf){__h16, __h15, __h14, __h13, __h12, __h11,
  72                            __h10, __h9,  __h8,  __h7,  __h6,  __h5,
  73                            __h4,  __h3,  __h2,  __h1};
  74}
  75
  76#define _mm_setr_ph(h1, h2, h3, h4, h5, h6, h7, h8)                            \
  77  _mm_set_ph((h8), (h7), (h6), (h5), (h4), (h3), (h2), (h1))
  78
  79#define _mm256_setr_ph(h1, h2, h3, h4, h5, h6, h7, h8, h9, h10, h11, h12, h13, \
  80                       h14, h15, h16)                                          \
  81  _mm256_set_ph((h16), (h15), (h14), (h13), (h12), (h11), (h10), (h9), (h8),   \
  82                (h7), (h6), (h5), (h4), (h3), (h2), (h1))
  83
  84static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_add_ph(__m256h __A,
  85                                                              __m256h __B) {
  86  return (__m256h)((__v16hf)__A + (__v16hf)__B);
  87}
  88
  89static __inline__ __m256h __DEFAULT_FN_ATTRS256
  90_mm256_mask_add_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) {
  91  return (__m256h)__builtin_ia32_selectph_256(
  92      __U, (__v16hf)_mm256_add_ph(__A, __B), (__v16hf)__W);
  93}
  94
  95static __inline__ __m256h __DEFAULT_FN_ATTRS256
  96_mm256_maskz_add_ph(__mmask16 __U, __m256h __A, __m256h __B) {
  97  return (__m256h)__builtin_ia32_selectph_256(
  98      __U, (__v16hf)_mm256_add_ph(__A, __B), (__v16hf)_mm256_setzero_ph());
  99}
 100
 101static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_add_ph(__m128h __A,
 102                                                           __m128h __B) {
 103  return (__m128h)((__v8hf)__A + (__v8hf)__B);
 104}
 105
 106static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_add_ph(__m128h __W,
 107                                                                __mmask8 __U,
 108                                                                __m128h __A,
 109                                                                __m128h __B) {
 110  return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_add_ph(__A, __B),
 111                                              (__v8hf)__W);
 112}
 113
 114static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_add_ph(__mmask8 __U,
 115                                                                 __m128h __A,
 116                                                                 __m128h __B) {
 117  return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_add_ph(__A, __B),
 118                                              (__v8hf)_mm_setzero_ph());
 119}
 120
 121static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_sub_ph(__m256h __A,
 122                                                              __m256h __B) {
 123  return (__m256h)((__v16hf)__A - (__v16hf)__B);
 124}
 125
 126static __inline__ __m256h __DEFAULT_FN_ATTRS256
 127_mm256_mask_sub_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) {
 128  return (__m256h)__builtin_ia32_selectph_256(
 129      __U, (__v16hf)_mm256_sub_ph(__A, __B), (__v16hf)__W);
 130}
 131
 132static __inline__ __m256h __DEFAULT_FN_ATTRS256
 133_mm256_maskz_sub_ph(__mmask16 __U, __m256h __A, __m256h __B) {
 134  return (__m256h)__builtin_ia32_selectph_256(
 135      __U, (__v16hf)_mm256_sub_ph(__A, __B), (__v16hf)_mm256_setzero_ph());
 136}
 137
 138static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_sub_ph(__m128h __A,
 139                                                           __m128h __B) {
 140  return (__m128h)((__v8hf)__A - (__v8hf)__B);
 141}
 142
 143static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_sub_ph(__m128h __W,
 144                                                                __mmask8 __U,
 145                                                                __m128h __A,
 146                                                                __m128h __B) {
 147  return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_sub_ph(__A, __B),
 148                                              (__v8hf)__W);
 149}
 150
 151static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_sub_ph(__mmask8 __U,
 152                                                                 __m128h __A,
 153                                                                 __m128h __B) {
 154  return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_sub_ph(__A, __B),
 155                                              (__v8hf)_mm_setzero_ph());
 156}
 157
 158static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_mul_ph(__m256h __A,
 159                                                              __m256h __B) {
 160  return (__m256h)((__v16hf)__A * (__v16hf)__B);
 161}
 162
 163static __inline__ __m256h __DEFAULT_FN_ATTRS256
 164_mm256_mask_mul_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) {
 165  return (__m256h)__builtin_ia32_selectph_256(
 166      __U, (__v16hf)_mm256_mul_ph(__A, __B), (__v16hf)__W);
 167}
 168
 169static __inline__ __m256h __DEFAULT_FN_ATTRS256
 170_mm256_maskz_mul_ph(__mmask16 __U, __m256h __A, __m256h __B) {
 171  return (__m256h)__builtin_ia32_selectph_256(
 172      __U, (__v16hf)_mm256_mul_ph(__A, __B), (__v16hf)_mm256_setzero_ph());
 173}
 174
 175static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mul_ph(__m128h __A,
 176                                                           __m128h __B) {
 177  return (__m128h)((__v8hf)__A * (__v8hf)__B);
 178}
 179
 180static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_mul_ph(__m128h __W,
 181                                                                __mmask8 __U,
 182                                                                __m128h __A,
 183                                                                __m128h __B) {
 184  return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_mul_ph(__A, __B),
 185                                              (__v8hf)__W);
 186}
 187
 188static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_mul_ph(__mmask8 __U,
 189                                                                 __m128h __A,
 190                                                                 __m128h __B) {
 191  return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_mul_ph(__A, __B),
 192                                              (__v8hf)_mm_setzero_ph());
 193}
 194
 195static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_div_ph(__m256h __A,
 196                                                              __m256h __B) {
 197  return (__m256h)((__v16hf)__A / (__v16hf)__B);
 198}
 199
 200static __inline__ __m256h __DEFAULT_FN_ATTRS256
 201_mm256_mask_div_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) {
 202  return (__m256h)__builtin_ia32_selectph_256(
 203      __U, (__v16hf)_mm256_div_ph(__A, __B), (__v16hf)__W);
 204}
 205
 206static __inline__ __m256h __DEFAULT_FN_ATTRS256
 207_mm256_maskz_div_ph(__mmask16 __U, __m256h __A, __m256h __B) {
 208  return (__m256h)__builtin_ia32_selectph_256(
 209      __U, (__v16hf)_mm256_div_ph(__A, __B), (__v16hf)_mm256_setzero_ph());
 210}
 211
 212static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_div_ph(__m128h __A,
 213                                                           __m128h __B) {
 214  return (__m128h)((__v8hf)__A / (__v8hf)__B);
 215}
 216
 217static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_div_ph(__m128h __W,
 218                                                                __mmask8 __U,
 219                                                                __m128h __A,
 220                                                                __m128h __B) {
 221  return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_div_ph(__A, __B),
 222                                              (__v8hf)__W);
 223}
 224
 225static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_div_ph(__mmask8 __U,
 226                                                                 __m128h __A,
 227                                                                 __m128h __B) {
 228  return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_div_ph(__A, __B),
 229                                              (__v8hf)_mm_setzero_ph());
 230}
 231
 232static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_min_ph(__m256h __A,
 233                                                              __m256h __B) {
 234  return (__m256h)__builtin_ia32_minph256((__v16hf)__A, (__v16hf)__B);
 235}
 236
 237static __inline__ __m256h __DEFAULT_FN_ATTRS256
 238_mm256_mask_min_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) {
 239  return (__m256h)__builtin_ia32_selectph_256(
 240      (__mmask16)__U,
 241      (__v16hf)__builtin_ia32_minph256((__v16hf)__A, (__v16hf)__B),
 242      (__v16hf)__W);
 243}
 244
 245static __inline__ __m256h __DEFAULT_FN_ATTRS256
 246_mm256_maskz_min_ph(__mmask16 __U, __m256h __A, __m256h __B) {
 247  return (__m256h)__builtin_ia32_selectph_256(
 248      (__mmask16)__U,
 249      (__v16hf)__builtin_ia32_minph256((__v16hf)__A, (__v16hf)__B),
 250      (__v16hf)_mm256_setzero_ph());
 251}
 252
 253static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_min_ph(__m128h __A,
 254                                                           __m128h __B) {
 255  return (__m128h)__builtin_ia32_minph128((__v8hf)__A, (__v8hf)__B);
 256}
 257
 258static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_min_ph(__m128h __W,
 259                                                                __mmask8 __U,
 260                                                                __m128h __A,
 261                                                                __m128h __B) {
 262  return (__m128h)__builtin_ia32_selectph_128(
 263      (__mmask8)__U, (__v8hf)__builtin_ia32_minph128((__v8hf)__A, (__v8hf)__B),
 264      (__v8hf)__W);
 265}
 266
 267static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_min_ph(__mmask8 __U,
 268                                                                 __m128h __A,
 269                                                                 __m128h __B) {
 270  return (__m128h)__builtin_ia32_selectph_128(
 271      (__mmask8)__U, (__v8hf)__builtin_ia32_minph128((__v8hf)__A, (__v8hf)__B),
 272      (__v8hf)_mm_setzero_ph());
 273}
 274
 275static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_max_ph(__m256h __A,
 276                                                              __m256h __B) {
 277  return (__m256h)__builtin_ia32_maxph256((__v16hf)__A, (__v16hf)__B);
 278}
 279
 280static __inline__ __m256h __DEFAULT_FN_ATTRS256
 281_mm256_mask_max_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) {
 282  return (__m256h)__builtin_ia32_selectph_256(
 283      (__mmask16)__U,
 284      (__v16hf)__builtin_ia32_maxph256((__v16hf)__A, (__v16hf)__B),
 285      (__v16hf)__W);
 286}
 287
 288static __inline__ __m256h __DEFAULT_FN_ATTRS256
 289_mm256_maskz_max_ph(__mmask16 __U, __m256h __A, __m256h __B) {
 290  return (__m256h)__builtin_ia32_selectph_256(
 291      (__mmask16)__U,
 292      (__v16hf)__builtin_ia32_maxph256((__v16hf)__A, (__v16hf)__B),
 293      (__v16hf)_mm256_setzero_ph());
 294}
 295
 296static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_max_ph(__m128h __A,
 297                                                           __m128h __B) {
 298  return (__m128h)__builtin_ia32_maxph128((__v8hf)__A, (__v8hf)__B);
 299}
 300
 301static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_max_ph(__m128h __W,
 302                                                                __mmask8 __U,
 303                                                                __m128h __A,
 304                                                                __m128h __B) {
 305  return (__m128h)__builtin_ia32_selectph_128(
 306      (__mmask8)__U, (__v8hf)__builtin_ia32_maxph128((__v8hf)__A, (__v8hf)__B),
 307      (__v8hf)__W);
 308}
 309
 310static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_max_ph(__mmask8 __U,
 311                                                                 __m128h __A,
 312                                                                 __m128h __B) {
 313  return (__m128h)__builtin_ia32_selectph_128(
 314      (__mmask8)__U, (__v8hf)__builtin_ia32_maxph128((__v8hf)__A, (__v8hf)__B),
 315      (__v8hf)_mm_setzero_ph());
 316}
 317
 318static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_abs_ph(__m256h __A) {
 319  return (__m256h)_mm256_and_epi32(_mm256_set1_epi32(0x7FFF7FFF), (__m256i)__A);
 320}
 321
 322static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_abs_ph(__m128h __A) {
 323  return (__m128h)_mm_and_epi32(_mm_set1_epi32(0x7FFF7FFF), (__m128i)__A);
 324}
 325
 326static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_conj_pch(__m256h __A) {
 327  return (__m256h)_mm256_xor_ps((__m256)__A, _mm256_set1_ps(-0.0f));
 328}
 329
 330static __inline__ __m256h __DEFAULT_FN_ATTRS256
 331_mm256_mask_conj_pch(__m256h __W, __mmask8 __U, __m256h __A) {
 332  return (__m256h)__builtin_ia32_selectps_256(
 333      (__mmask8)__U, (__v8sf)_mm256_conj_pch(__A), (__v8sf)__W);
 334}
 335
 336static __inline__ __m256h __DEFAULT_FN_ATTRS256
 337_mm256_maskz_conj_pch(__mmask8 __U, __m256h __A) {
 338  return (__m256h)__builtin_ia32_selectps_256(
 339      (__mmask8)__U, (__v8sf)_mm256_conj_pch(__A), (__v8sf)_mm256_setzero_ps());
 340}
 341
 342static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_conj_pch(__m128h __A) {
 343  return (__m128h)_mm_xor_ps((__m128)__A, _mm_set1_ps(-0.0f));
 344}
 345
 346static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_conj_pch(__m128h __W,
 347                                                                  __mmask8 __U,
 348                                                                  __m128h __A) {
 349  return (__m128h)__builtin_ia32_selectps_128(
 350      (__mmask8)__U, (__v4sf)_mm_conj_pch(__A), (__v4sf)__W);
 351}
 352
 353static __inline__ __m128h __DEFAULT_FN_ATTRS128
 354_mm_maskz_conj_pch(__mmask8 __U, __m128h __A) {
 355  return (__m128h)__builtin_ia32_selectps_128(
 356      (__mmask8)__U, (__v4sf)_mm_conj_pch(__A), (__v4sf)_mm_setzero_ps());
 357}
 358
 359#define _mm256_cmp_ph_mask(a, b, p)                                            \
 360  ((__mmask16)__builtin_ia32_cmpph256_mask(                                    \
 361      (__v16hf)(__m256h)(a), (__v16hf)(__m256h)(b), (int)(p), (__mmask16)-1))
 362
 363#define _mm256_mask_cmp_ph_mask(m, a, b, p)                                    \
 364  ((__mmask16)__builtin_ia32_cmpph256_mask(                                    \
 365      (__v16hf)(__m256h)(a), (__v16hf)(__m256h)(b), (int)(p), (__mmask16)(m)))
 366
 367#define _mm_cmp_ph_mask(a, b, p)                                               \
 368  ((__mmask8)__builtin_ia32_cmpph128_mask(                                     \
 369      (__v8hf)(__m128h)(a), (__v8hf)(__m128h)(b), (int)(p), (__mmask8)-1))
 370
 371#define _mm_mask_cmp_ph_mask(m, a, b, p)                                       \
 372  ((__mmask8)__builtin_ia32_cmpph128_mask(                                     \
 373      (__v8hf)(__m128h)(a), (__v8hf)(__m128h)(b), (int)(p), (__mmask8)(m)))
 374
 375static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_rcp_ph(__m256h __A) {
 376  return (__m256h)__builtin_ia32_rcpph256_mask(
 377      (__v16hf)__A, (__v16hf)_mm256_undefined_ph(), (__mmask16)-1);
 378}
 379
 380static __inline__ __m256h __DEFAULT_FN_ATTRS256
 381_mm256_mask_rcp_ph(__m256h __W, __mmask16 __U, __m256h __A) {
 382  return (__m256h)__builtin_ia32_rcpph256_mask((__v16hf)__A, (__v16hf)__W,
 383                                               (__mmask16)__U);
 384}
 385
 386static __inline__ __m256h __DEFAULT_FN_ATTRS256
 387_mm256_maskz_rcp_ph(__mmask16 __U, __m256h __A) {
 388  return (__m256h)__builtin_ia32_rcpph256_mask(
 389      (__v16hf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U);
 390}
 391
 392static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_rcp_ph(__m128h __A) {
 393  return (__m128h)__builtin_ia32_rcpph128_mask(
 394      (__v8hf)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
 395}
 396
 397static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_rcp_ph(__m128h __W,
 398                                                                __mmask8 __U,
 399                                                                __m128h __A) {
 400  return (__m128h)__builtin_ia32_rcpph128_mask((__v8hf)__A, (__v8hf)__W,
 401                                               (__mmask8)__U);
 402}
 403
 404static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_rcp_ph(__mmask8 __U,
 405                                                                 __m128h __A) {
 406  return (__m128h)__builtin_ia32_rcpph128_mask(
 407      (__v8hf)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
 408}
 409
 410static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_rsqrt_ph(__m256h __A) {
 411  return (__m256h)__builtin_ia32_rsqrtph256_mask(
 412      (__v16hf)__A, (__v16hf)_mm256_undefined_ph(), (__mmask16)-1);
 413}
 414
 415static __inline__ __m256h __DEFAULT_FN_ATTRS256
 416_mm256_mask_rsqrt_ph(__m256h __W, __mmask16 __U, __m256h __A) {
 417  return (__m256h)__builtin_ia32_rsqrtph256_mask((__v16hf)__A, (__v16hf)__W,
 418                                                 (__mmask16)__U);
 419}
 420
 421static __inline__ __m256h __DEFAULT_FN_ATTRS256
 422_mm256_maskz_rsqrt_ph(__mmask16 __U, __m256h __A) {
 423  return (__m256h)__builtin_ia32_rsqrtph256_mask(
 424      (__v16hf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U);
 425}
 426
 427static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_rsqrt_ph(__m128h __A) {
 428  return (__m128h)__builtin_ia32_rsqrtph128_mask(
 429      (__v8hf)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
 430}
 431
 432static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_rsqrt_ph(__m128h __W,
 433                                                                  __mmask8 __U,
 434                                                                  __m128h __A) {
 435  return (__m128h)__builtin_ia32_rsqrtph128_mask((__v8hf)__A, (__v8hf)__W,
 436                                                 (__mmask8)__U);
 437}
 438
 439static __inline__ __m128h __DEFAULT_FN_ATTRS128
 440_mm_maskz_rsqrt_ph(__mmask8 __U, __m128h __A) {
 441  return (__m128h)__builtin_ia32_rsqrtph128_mask(
 442      (__v8hf)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
 443}
 444
 445static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_getexp_ph(__m128h __A) {
 446  return (__m128h)__builtin_ia32_getexpph128_mask(
 447      (__v8hf)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)-1);
 448}
 449
 450static __inline__ __m128h __DEFAULT_FN_ATTRS128
 451_mm_mask_getexp_ph(__m128h __W, __mmask8 __U, __m128h __A) {
 452  return (__m128h)__builtin_ia32_getexpph128_mask((__v8hf)__A, (__v8hf)__W,
 453                                                  (__mmask8)__U);
 454}
 455
 456static __inline__ __m128h __DEFAULT_FN_ATTRS128
 457_mm_maskz_getexp_ph(__mmask8 __U, __m128h __A) {
 458  return (__m128h)__builtin_ia32_getexpph128_mask(
 459      (__v8hf)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
 460}
 461
 462static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_getexp_ph(__m256h __A) {
 463  return (__m256h)__builtin_ia32_getexpph256_mask(
 464      (__v16hf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)-1);
 465}
 466
 467static __inline__ __m256h __DEFAULT_FN_ATTRS256
 468_mm256_mask_getexp_ph(__m256h __W, __mmask16 __U, __m256h __A) {
 469  return (__m256h)__builtin_ia32_getexpph256_mask((__v16hf)__A, (__v16hf)__W,
 470                                                  (__mmask16)__U);
 471}
 472
 473static __inline__ __m256h __DEFAULT_FN_ATTRS256
 474_mm256_maskz_getexp_ph(__mmask16 __U, __m256h __A) {
 475  return (__m256h)__builtin_ia32_getexpph256_mask(
 476      (__v16hf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U);
 477}
 478
 479#define _mm_getmant_ph(A, B, C)                                                \
 480  ((__m128h)__builtin_ia32_getmantph128_mask(                                  \
 481      (__v8hf)(__m128h)(A), (int)(((C) << 2) | (B)), (__v8hf)_mm_setzero_ph(), \
 482      (__mmask8)-1))
 483
 484#define _mm_mask_getmant_ph(W, U, A, B, C)                                     \
 485  ((__m128h)__builtin_ia32_getmantph128_mask(                                  \
 486      (__v8hf)(__m128h)(A), (int)(((C) << 2) | (B)), (__v8hf)(__m128h)(W),     \
 487      (__mmask8)(U)))
 488
 489#define _mm_maskz_getmant_ph(U, A, B, C)                                       \
 490  ((__m128h)__builtin_ia32_getmantph128_mask(                                  \
 491      (__v8hf)(__m128h)(A), (int)(((C) << 2) | (B)), (__v8hf)_mm_setzero_ph(), \
 492      (__mmask8)(U)))
 493
 494#define _mm256_getmant_ph(A, B, C)                                             \
 495  ((__m256h)__builtin_ia32_getmantph256_mask(                                  \
 496      (__v16hf)(__m256h)(A), (int)(((C) << 2) | (B)),                          \
 497      (__v16hf)_mm256_setzero_ph(), (__mmask16)-1))
 498
 499#define _mm256_mask_getmant_ph(W, U, A, B, C)                                  \
 500  ((__m256h)__builtin_ia32_getmantph256_mask(                                  \
 501      (__v16hf)(__m256h)(A), (int)(((C) << 2) | (B)), (__v16hf)(__m256h)(W),   \
 502      (__mmask16)(U)))
 503
 504#define _mm256_maskz_getmant_ph(U, A, B, C)                                    \
 505  ((__m256h)__builtin_ia32_getmantph256_mask(                                  \
 506      (__v16hf)(__m256h)(A), (int)(((C) << 2) | (B)),                          \
 507      (__v16hf)_mm256_setzero_ph(), (__mmask16)(U)))
 508
 509static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_scalef_ph(__m128h __A,
 510                                                              __m128h __B) {
 511  return (__m128h)__builtin_ia32_scalefph128_mask(
 512      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1);
 513}
 514
 515static __inline__ __m128h __DEFAULT_FN_ATTRS128
 516_mm_mask_scalef_ph(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
 517  return (__m128h)__builtin_ia32_scalefph128_mask((__v8hf)__A, (__v8hf)__B,
 518                                                  (__v8hf)__W, (__mmask8)__U);
 519}
 520
 521static __inline__ __m128h __DEFAULT_FN_ATTRS128
 522_mm_maskz_scalef_ph(__mmask8 __U, __m128h __A, __m128h __B) {
 523  return (__m128h)__builtin_ia32_scalefph128_mask(
 524      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
 525}
 526
 527static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_scalef_ph(__m256h __A,
 528                                                                 __m256h __B) {
 529  return (__m256h)__builtin_ia32_scalefph256_mask(
 530      (__v16hf)__A, (__v16hf)__B, (__v16hf)_mm256_setzero_ph(), (__mmask16)-1);
 531}
 532
 533static __inline__ __m256h __DEFAULT_FN_ATTRS256
 534_mm256_mask_scalef_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) {
 535  return (__m256h)__builtin_ia32_scalefph256_mask((__v16hf)__A, (__v16hf)__B,
 536                                                  (__v16hf)__W, (__mmask16)__U);
 537}
 538
 539static __inline__ __m256h __DEFAULT_FN_ATTRS256
 540_mm256_maskz_scalef_ph(__mmask16 __U, __m256h __A, __m256h __B) {
 541  return (__m256h)__builtin_ia32_scalefph256_mask(
 542      (__v16hf)__A, (__v16hf)__B, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U);
 543}
 544
 545#define _mm_roundscale_ph(A, imm)                                              \
 546  ((__m128h)__builtin_ia32_rndscaleph_128_mask(                                \
 547      (__v8hf)(__m128h)(A), (int)(imm), (__v8hf)_mm_setzero_ph(),              \
 548      (__mmask8)-1))
 549
 550#define _mm_mask_roundscale_ph(W, U, A, imm)                                   \
 551  ((__m128h)__builtin_ia32_rndscaleph_128_mask(                                \
 552      (__v8hf)(__m128h)(A), (int)(imm), (__v8hf)(__m128h)(W), (__mmask8)(U)))
 553
 554#define _mm_maskz_roundscale_ph(U, A, imm)                                     \
 555  ((__m128h)__builtin_ia32_rndscaleph_128_mask(                                \
 556      (__v8hf)(__m128h)(A), (int)(imm), (__v8hf)_mm_setzero_ph(),              \
 557      (__mmask8)(U)))
 558
 559#define _mm256_roundscale_ph(A, imm)                                           \
 560  ((__m256h)__builtin_ia32_rndscaleph_256_mask(                                \
 561      (__v16hf)(__m256h)(A), (int)(imm), (__v16hf)_mm256_setzero_ph(),         \
 562      (__mmask16)-1))
 563
 564#define _mm256_mask_roundscale_ph(W, U, A, imm)                                \
 565  ((__m256h)__builtin_ia32_rndscaleph_256_mask(                                \
 566      (__v16hf)(__m256h)(A), (int)(imm), (__v16hf)(__m256h)(W),                \
 567      (__mmask16)(U)))
 568
 569#define _mm256_maskz_roundscale_ph(U, A, imm)                                  \
 570  ((__m256h)__builtin_ia32_rndscaleph_256_mask(                                \
 571      (__v16hf)(__m256h)(A), (int)(imm), (__v16hf)_mm256_setzero_ph(),         \
 572      (__mmask16)(U)))
 573
 574#define _mm_reduce_ph(A, imm)                                                  \
 575  ((__m128h)__builtin_ia32_reduceph128_mask((__v8hf)(__m128h)(A), (int)(imm),  \
 576                                            (__v8hf)_mm_setzero_ph(),          \
 577                                            (__mmask8)-1))
 578
 579#define _mm_mask_reduce_ph(W, U, A, imm)                                       \
 580  ((__m128h)__builtin_ia32_reduceph128_mask(                                   \
 581      (__v8hf)(__m128h)(A), (int)(imm), (__v8hf)(__m128h)(W), (__mmask8)(U)))
 582
 583#define _mm_maskz_reduce_ph(U, A, imm)                                         \
 584  ((__m128h)__builtin_ia32_reduceph128_mask((__v8hf)(__m128h)(A), (int)(imm),  \
 585                                            (__v8hf)_mm_setzero_ph(),          \
 586                                            (__mmask8)(U)))
 587
 588#define _mm256_reduce_ph(A, imm)                                               \
 589  ((__m256h)__builtin_ia32_reduceph256_mask((__v16hf)(__m256h)(A), (int)(imm), \
 590                                            (__v16hf)_mm256_setzero_ph(),      \
 591                                            (__mmask16)-1))
 592
 593#define _mm256_mask_reduce_ph(W, U, A, imm)                                    \
 594  ((__m256h)__builtin_ia32_reduceph256_mask((__v16hf)(__m256h)(A), (int)(imm), \
 595                                            (__v16hf)(__m256h)(W),             \
 596                                            (__mmask16)(U)))
 597
 598#define _mm256_maskz_reduce_ph(U, A, imm)                                      \
 599  ((__m256h)__builtin_ia32_reduceph256_mask((__v16hf)(__m256h)(A), (int)(imm), \
 600                                            (__v16hf)_mm256_setzero_ph(),      \
 601                                            (__mmask16)(U)))
 602
 603static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_sqrt_ph(__m128h __a) {
 604  return __builtin_ia32_sqrtph((__v8hf)__a);
 605}
 606
 607static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_sqrt_ph(__m128h __W,
 608                                                                 __mmask8 __U,
 609                                                                 __m128h __A) {
 610  return (__m128h)__builtin_ia32_selectph_128(
 611      (__mmask8)__U, (__v8hf)_mm_sqrt_ph(__A), (__v8hf)__W);
 612}
 613
 614static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_sqrt_ph(__mmask8 __U,
 615                                                                  __m128h __A) {
 616  return (__m128h)__builtin_ia32_selectph_128(
 617      (__mmask8)__U, (__v8hf)_mm_sqrt_ph(__A), (__v8hf)_mm_setzero_ph());
 618}
 619
 620static __inline __m256h __DEFAULT_FN_ATTRS256 _mm256_sqrt_ph(__m256h __a) {
 621  return (__m256h)__builtin_ia32_sqrtph256((__v16hf)__a);
 622}
 623
 624static __inline__ __m256h __DEFAULT_FN_ATTRS256
 625_mm256_mask_sqrt_ph(__m256h __W, __mmask16 __U, __m256h __A) {
 626  return (__m256h)__builtin_ia32_selectph_256(
 627      (__mmask16)__U, (__v16hf)_mm256_sqrt_ph(__A), (__v16hf)__W);
 628}
 629
 630static __inline__ __m256h __DEFAULT_FN_ATTRS256
 631_mm256_maskz_sqrt_ph(__mmask16 __U, __m256h __A) {
 632  return (__m256h)__builtin_ia32_selectph_256((__mmask16)__U,
 633                                              (__v16hf)_mm256_sqrt_ph(__A),
 634                                              (__v16hf)_mm256_setzero_ph());
 635}
 636
 637#define _mm_mask_fpclass_ph_mask(U, A, imm)                                    \
 638  ((__mmask8)__builtin_ia32_fpclassph128_mask((__v8hf)(__m128h)(A),            \
 639                                              (int)(imm), (__mmask8)(U)))
 640
 641#define _mm_fpclass_ph_mask(A, imm)                                            \
 642  ((__mmask8)__builtin_ia32_fpclassph128_mask((__v8hf)(__m128h)(A),            \
 643                                              (int)(imm), (__mmask8)-1))
 644
 645#define _mm256_mask_fpclass_ph_mask(U, A, imm)                                 \
 646  ((__mmask16)__builtin_ia32_fpclassph256_mask((__v16hf)(__m256h)(A),          \
 647                                               (int)(imm), (__mmask16)(U)))
 648
 649#define _mm256_fpclass_ph_mask(A, imm)                                         \
 650  ((__mmask16)__builtin_ia32_fpclassph256_mask((__v16hf)(__m256h)(A),          \
 651                                               (int)(imm), (__mmask16)-1))
 652
 653static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtpd_ph(__m128d __A) {
 654  return (__m128h)__builtin_ia32_vcvtpd2ph128_mask(
 655      (__v2df)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
 656}
 657
 658static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_cvtpd_ph(__m128h __W,
 659                                                                  __mmask8 __U,
 660                                                                  __m128d __A) {
 661  return (__m128h)__builtin_ia32_vcvtpd2ph128_mask((__v2df)__A, (__v8hf)__W,
 662                                                   (__mmask8)__U);
 663}
 664
 665static __inline__ __m128h __DEFAULT_FN_ATTRS128
 666_mm_maskz_cvtpd_ph(__mmask8 __U, __m128d __A) {
 667  return (__m128h)__builtin_ia32_vcvtpd2ph128_mask(
 668      (__v2df)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
 669}
 670
 671static __inline__ __m128h __DEFAULT_FN_ATTRS256 _mm256_cvtpd_ph(__m256d __A) {
 672  return (__m128h)__builtin_ia32_vcvtpd2ph256_mask(
 673      (__v4df)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
 674}
 675
 676static __inline__ __m128h __DEFAULT_FN_ATTRS256
 677_mm256_mask_cvtpd_ph(__m128h __W, __mmask8 __U, __m256d __A) {
 678  return (__m128h)__builtin_ia32_vcvtpd2ph256_mask((__v4df)__A, (__v8hf)__W,
 679                                                   (__mmask8)__U);
 680}
 681
 682static __inline__ __m128h __DEFAULT_FN_ATTRS256
 683_mm256_maskz_cvtpd_ph(__mmask8 __U, __m256d __A) {
 684  return (__m128h)__builtin_ia32_vcvtpd2ph256_mask(
 685      (__v4df)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
 686}
 687
 688static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_cvtph_pd(__m128h __A) {
 689  return (__m128d)__builtin_ia32_vcvtph2pd128_mask(
 690      (__v8hf)__A, (__v2df)_mm_undefined_pd(), (__mmask8)-1);
 691}
 692
 693static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_cvtph_pd(__m128d __W,
 694                                                                  __mmask8 __U,
 695                                                                  __m128h __A) {
 696  return (__m128d)__builtin_ia32_vcvtph2pd128_mask((__v8hf)__A, (__v2df)__W,
 697                                                   (__mmask8)__U);
 698}
 699
 700static __inline__ __m128d __DEFAULT_FN_ATTRS128
 701_mm_maskz_cvtph_pd(__mmask8 __U, __m128h __A) {
 702  return (__m128d)__builtin_ia32_vcvtph2pd128_mask(
 703      (__v8hf)__A, (__v2df)_mm_setzero_pd(), (__mmask8)__U);
 704}
 705
 706static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_cvtph_pd(__m128h __A) {
 707  return (__m256d)__builtin_ia32_vcvtph2pd256_mask(
 708      (__v8hf)__A, (__v4df)_mm256_undefined_pd(), (__mmask8)-1);
 709}
 710
 711static __inline__ __m256d __DEFAULT_FN_ATTRS256
 712_mm256_mask_cvtph_pd(__m256d __W, __mmask8 __U, __m128h __A) {
 713  return (__m256d)__builtin_ia32_vcvtph2pd256_mask((__v8hf)__A, (__v4df)__W,
 714                                                   (__mmask8)__U);
 715}
 716
 717static __inline__ __m256d __DEFAULT_FN_ATTRS256
 718_mm256_maskz_cvtph_pd(__mmask8 __U, __m128h __A) {
 719  return (__m256d)__builtin_ia32_vcvtph2pd256_mask(
 720      (__v8hf)__A, (__v4df)_mm256_setzero_pd(), (__mmask8)__U);
 721}
 722
 723static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtph_epi16(__m128h __A) {
 724  return (__m128i)__builtin_ia32_vcvtph2w128_mask(
 725      (__v8hf)__A, (__v8hi)_mm_undefined_si128(), (__mmask8)-1);
 726}
 727
 728static __inline__ __m128i __DEFAULT_FN_ATTRS128
 729_mm_mask_cvtph_epi16(__m128i __W, __mmask8 __U, __m128h __A) {
 730  return (__m128i)__builtin_ia32_vcvtph2w128_mask((__v8hf)__A, (__v8hi)__W,
 731                                                  (__mmask8)__U);
 732}
 733
 734static __inline__ __m128i __DEFAULT_FN_ATTRS128
 735_mm_maskz_cvtph_epi16(__mmask8 __U, __m128h __A) {
 736  return (__m128i)__builtin_ia32_vcvtph2w128_mask(
 737      (__v8hf)__A, (__v8hi)_mm_setzero_si128(), (__mmask8)__U);
 738}
 739
 740static __inline__ __m256i __DEFAULT_FN_ATTRS256
 741_mm256_cvtph_epi16(__m256h __A) {
 742  return (__m256i)__builtin_ia32_vcvtph2w256_mask(
 743      (__v16hf)__A, (__v16hi)_mm256_undefined_si256(), (__mmask16)-1);
 744}
 745
 746static __inline__ __m256i __DEFAULT_FN_ATTRS256
 747_mm256_mask_cvtph_epi16(__m256i __W, __mmask16 __U, __m256h __A) {
 748  return (__m256i)__builtin_ia32_vcvtph2w256_mask((__v16hf)__A, (__v16hi)__W,
 749                                                  (__mmask16)__U);
 750}
 751
 752static __inline__ __m256i __DEFAULT_FN_ATTRS256
 753_mm256_maskz_cvtph_epi16(__mmask16 __U, __m256h __A) {
 754  return (__m256i)__builtin_ia32_vcvtph2w256_mask(
 755      (__v16hf)__A, (__v16hi)_mm256_setzero_si256(), (__mmask16)__U);
 756}
 757
 758static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttph_epi16(__m128h __A) {
 759  return (__m128i)__builtin_ia32_vcvttph2w128_mask(
 760      (__v8hf)__A, (__v8hi)_mm_undefined_si128(), (__mmask8)-1);
 761}
 762
 763static __inline__ __m128i __DEFAULT_FN_ATTRS128
 764_mm_mask_cvttph_epi16(__m128i __W, __mmask8 __U, __m128h __A) {
 765  return (__m128i)__builtin_ia32_vcvttph2w128_mask((__v8hf)__A, (__v8hi)__W,
 766                                                   (__mmask8)__U);
 767}
 768
 769static __inline__ __m128i __DEFAULT_FN_ATTRS128
 770_mm_maskz_cvttph_epi16(__mmask8 __U, __m128h __A) {
 771  return (__m128i)__builtin_ia32_vcvttph2w128_mask(
 772      (__v8hf)__A, (__v8hi)_mm_setzero_si128(), (__mmask8)__U);
 773}
 774
 775static __inline__ __m256i __DEFAULT_FN_ATTRS256
 776_mm256_cvttph_epi16(__m256h __A) {
 777  return (__m256i)__builtin_ia32_vcvttph2w256_mask(
 778      (__v16hf)__A, (__v16hi)_mm256_undefined_si256(), (__mmask16)-1);
 779}
 780
 781static __inline__ __m256i __DEFAULT_FN_ATTRS256
 782_mm256_mask_cvttph_epi16(__m256i __W, __mmask16 __U, __m256h __A) {
 783  return (__m256i)__builtin_ia32_vcvttph2w256_mask((__v16hf)__A, (__v16hi)__W,
 784                                                   (__mmask16)__U);
 785}
 786
 787static __inline__ __m256i __DEFAULT_FN_ATTRS256
 788_mm256_maskz_cvttph_epi16(__mmask16 __U, __m256h __A) {
 789  return (__m256i)__builtin_ia32_vcvttph2w256_mask(
 790      (__v16hf)__A, (__v16hi)_mm256_setzero_si256(), (__mmask16)__U);
 791}
 792
 793static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtepi16_ph(__m128i __A) {
 794  return (__m128h) __builtin_convertvector((__v8hi)__A, __v8hf);
 795}
 796
 797static __inline__ __m128h __DEFAULT_FN_ATTRS128
 798_mm_mask_cvtepi16_ph(__m128h __W, __mmask8 __U, __m128i __A) {
 799  return (__m128h)__builtin_ia32_selectph_128(
 800      (__mmask8)__U, (__v8hf)_mm_cvtepi16_ph(__A), (__v8hf)__W);
 801}
 802
 803static __inline__ __m128h __DEFAULT_FN_ATTRS128
 804_mm_maskz_cvtepi16_ph(__mmask8 __U, __m128i __A) {
 805  return (__m128h)__builtin_ia32_selectph_128(
 806      (__mmask8)__U, (__v8hf)_mm_cvtepi16_ph(__A), (__v8hf)_mm_setzero_ph());
 807}
 808
 809static __inline__ __m256h __DEFAULT_FN_ATTRS256
 810_mm256_cvtepi16_ph(__m256i __A) {
 811  return (__m256h) __builtin_convertvector((__v16hi)__A, __v16hf);
 812}
 813
 814static __inline__ __m256h __DEFAULT_FN_ATTRS256
 815_mm256_mask_cvtepi16_ph(__m256h __W, __mmask16 __U, __m256i __A) {
 816  return (__m256h)__builtin_ia32_selectph_256(
 817      (__mmask16)__U, (__v16hf)_mm256_cvtepi16_ph(__A), (__v16hf)__W);
 818}
 819
 820static __inline__ __m256h __DEFAULT_FN_ATTRS256
 821_mm256_maskz_cvtepi16_ph(__mmask16 __U, __m256i __A) {
 822  return (__m256h)__builtin_ia32_selectph_256((__mmask16)__U,
 823                                              (__v16hf)_mm256_cvtepi16_ph(__A),
 824                                              (__v16hf)_mm256_setzero_ph());
 825}
 826
 827static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtph_epu16(__m128h __A) {
 828  return (__m128i)__builtin_ia32_vcvtph2uw128_mask(
 829      (__v8hf)__A, (__v8hu)_mm_undefined_si128(), (__mmask8)-1);
 830}
 831
 832static __inline__ __m128i __DEFAULT_FN_ATTRS128
 833_mm_mask_cvtph_epu16(__m128i __W, __mmask8 __U, __m128h __A) {
 834  return (__m128i)__builtin_ia32_vcvtph2uw128_mask((__v8hf)__A, (__v8hu)__W,
 835                                                   (__mmask8)__U);
 836}
 837
 838static __inline__ __m128i __DEFAULT_FN_ATTRS128
 839_mm_maskz_cvtph_epu16(__mmask8 __U, __m128h __A) {
 840  return (__m128i)__builtin_ia32_vcvtph2uw128_mask(
 841      (__v8hf)__A, (__v8hu)_mm_setzero_si128(), (__mmask8)__U);
 842}
 843
 844static __inline__ __m256i __DEFAULT_FN_ATTRS256
 845_mm256_cvtph_epu16(__m256h __A) {
 846  return (__m256i)__builtin_ia32_vcvtph2uw256_mask(
 847      (__v16hf)__A, (__v16hu)_mm256_undefined_si256(), (__mmask16)-1);
 848}
 849
 850static __inline__ __m256i __DEFAULT_FN_ATTRS256
 851_mm256_mask_cvtph_epu16(__m256i __W, __mmask16 __U, __m256h __A) {
 852  return (__m256i)__builtin_ia32_vcvtph2uw256_mask((__v16hf)__A, (__v16hu)__W,
 853                                                   (__mmask16)__U);
 854}
 855
 856static __inline__ __m256i __DEFAULT_FN_ATTRS256
 857_mm256_maskz_cvtph_epu16(__mmask16 __U, __m256h __A) {
 858  return (__m256i)__builtin_ia32_vcvtph2uw256_mask(
 859      (__v16hf)__A, (__v16hu)_mm256_setzero_si256(), (__mmask16)__U);
 860}
 861
 862static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttph_epu16(__m128h __A) {
 863  return (__m128i)__builtin_ia32_vcvttph2uw128_mask(
 864      (__v8hf)__A, (__v8hu)_mm_undefined_si128(), (__mmask8)-1);
 865}
 866
 867static __inline__ __m128i __DEFAULT_FN_ATTRS128
 868_mm_mask_cvttph_epu16(__m128i __W, __mmask8 __U, __m128h __A) {
 869  return (__m128i)__builtin_ia32_vcvttph2uw128_mask((__v8hf)__A, (__v8hu)__W,
 870                                                    (__mmask8)__U);
 871}
 872
 873static __inline__ __m128i __DEFAULT_FN_ATTRS128
 874_mm_maskz_cvttph_epu16(__mmask8 __U, __m128h __A) {
 875  return (__m128i)__builtin_ia32_vcvttph2uw128_mask(
 876      (__v8hf)__A, (__v8hu)_mm_setzero_si128(), (__mmask8)__U);
 877}
 878
 879static __inline__ __m256i __DEFAULT_FN_ATTRS256
 880_mm256_cvttph_epu16(__m256h __A) {
 881  return (__m256i)__builtin_ia32_vcvttph2uw256_mask(
 882      (__v16hf)__A, (__v16hu)_mm256_undefined_si256(), (__mmask16)-1);
 883}
 884
 885static __inline__ __m256i __DEFAULT_FN_ATTRS256
 886_mm256_mask_cvttph_epu16(__m256i __W, __mmask16 __U, __m256h __A) {
 887  return (__m256i)__builtin_ia32_vcvttph2uw256_mask((__v16hf)__A, (__v16hu)__W,
 888                                                    (__mmask16)__U);
 889}
 890
 891static __inline__ __m256i __DEFAULT_FN_ATTRS256
 892_mm256_maskz_cvttph_epu16(__mmask16 __U, __m256h __A) {
 893  return (__m256i)__builtin_ia32_vcvttph2uw256_mask(
 894      (__v16hf)__A, (__v16hu)_mm256_setzero_si256(), (__mmask16)__U);
 895}
 896
 897static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtepu16_ph(__m128i __A) {
 898  return (__m128h) __builtin_convertvector((__v8hu)__A, __v8hf);
 899}
 900
 901static __inline__ __m128h __DEFAULT_FN_ATTRS128
 902_mm_mask_cvtepu16_ph(__m128h __W, __mmask8 __U, __m128i __A) {
 903  return (__m128h)__builtin_ia32_selectph_128(
 904      (__mmask8)__U, (__v8hf)_mm_cvtepu16_ph(__A), (__v8hf)__W);
 905}
 906
 907static __inline__ __m128h __DEFAULT_FN_ATTRS128
 908_mm_maskz_cvtepu16_ph(__mmask8 __U, __m128i __A) {
 909  return (__m128h)__builtin_ia32_selectph_128(
 910      (__mmask8)__U, (__v8hf)_mm_cvtepu16_ph(__A), (__v8hf)_mm_setzero_ph());
 911}
 912
 913static __inline__ __m256h __DEFAULT_FN_ATTRS256
 914_mm256_cvtepu16_ph(__m256i __A) {
 915  return (__m256h) __builtin_convertvector((__v16hu)__A, __v16hf);
 916}
 917
 918static __inline__ __m256h __DEFAULT_FN_ATTRS256
 919_mm256_mask_cvtepu16_ph(__m256h __W, __mmask16 __U, __m256i __A) {
 920  return (__m256h)__builtin_ia32_selectph_256(
 921      (__mmask16)__U, (__v16hf)_mm256_cvtepu16_ph(__A), (__v16hf)__W);
 922}
 923
 924static __inline__ __m256h __DEFAULT_FN_ATTRS256
 925_mm256_maskz_cvtepu16_ph(__mmask16 __U, __m256i __A) {
 926  return (__m256h)__builtin_ia32_selectph_256((__mmask16)__U,
 927                                              (__v16hf)_mm256_cvtepu16_ph(__A),
 928                                              (__v16hf)_mm256_setzero_ph());
 929}
 930
 931static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtph_epi32(__m128h __A) {
 932  return (__m128i)__builtin_ia32_vcvtph2dq128_mask(
 933      (__v8hf)__A, (__v4si)_mm_undefined_si128(), (__mmask8)-1);
 934}
 935
 936static __inline__ __m128i __DEFAULT_FN_ATTRS128
 937_mm_mask_cvtph_epi32(__m128i __W, __mmask8 __U, __m128h __A) {
 938  return (__m128i)__builtin_ia32_vcvtph2dq128_mask((__v8hf)__A, (__v4si)__W,
 939                                                   (__mmask8)__U);
 940}
 941
 942static __inline__ __m128i __DEFAULT_FN_ATTRS128
 943_mm_maskz_cvtph_epi32(__mmask8 __U, __m128h __A) {
 944  return (__m128i)__builtin_ia32_vcvtph2dq128_mask(
 945      (__v8hf)__A, (__v4si)_mm_setzero_si128(), (__mmask8)__U);
 946}
 947
 948static __inline__ __m256i __DEFAULT_FN_ATTRS256
 949_mm256_cvtph_epi32(__m128h __A) {
 950  return (__m256i)__builtin_ia32_vcvtph2dq256_mask(
 951      (__v8hf)__A, (__v8si)_mm256_undefined_si256(), (__mmask8)-1);
 952}
 953
 954static __inline__ __m256i __DEFAULT_FN_ATTRS256
 955_mm256_mask_cvtph_epi32(__m256i __W, __mmask8 __U, __m128h __A) {
 956  return (__m256i)__builtin_ia32_vcvtph2dq256_mask((__v8hf)__A, (__v8si)__W,
 957                                                   (__mmask8)__U);
 958}
 959
 960static __inline__ __m256i __DEFAULT_FN_ATTRS256
 961_mm256_maskz_cvtph_epi32(__mmask8 __U, __m128h __A) {
 962  return (__m256i)__builtin_ia32_vcvtph2dq256_mask(
 963      (__v8hf)__A, (__v8si)_mm256_setzero_si256(), (__mmask8)__U);
 964}
 965
 966static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtph_epu32(__m128h __A) {
 967  return (__m128i)__builtin_ia32_vcvtph2udq128_mask(
 968      (__v8hf)__A, (__v4su)_mm_undefined_si128(), (__mmask8)-1);
 969}
 970
 971static __inline__ __m128i __DEFAULT_FN_ATTRS128
 972_mm_mask_cvtph_epu32(__m128i __W, __mmask8 __U, __m128h __A) {
 973  return (__m128i)__builtin_ia32_vcvtph2udq128_mask((__v8hf)__A, (__v4su)__W,
 974                                                    (__mmask8)__U);
 975}
 976
 977static __inline__ __m128i __DEFAULT_FN_ATTRS128
 978_mm_maskz_cvtph_epu32(__mmask8 __U, __m128h __A) {
 979  return (__m128i)__builtin_ia32_vcvtph2udq128_mask(
 980      (__v8hf)__A, (__v4su)_mm_setzero_si128(), (__mmask8)__U);
 981}
 982
 983static __inline__ __m256i __DEFAULT_FN_ATTRS256
 984_mm256_cvtph_epu32(__m128h __A) {
 985  return (__m256i)__builtin_ia32_vcvtph2udq256_mask(
 986      (__v8hf)__A, (__v8su)_mm256_undefined_si256(), (__mmask8)-1);
 987}
 988
 989static __inline__ __m256i __DEFAULT_FN_ATTRS256
 990_mm256_mask_cvtph_epu32(__m256i __W, __mmask8 __U, __m128h __A) {
 991  return (__m256i)__builtin_ia32_vcvtph2udq256_mask((__v8hf)__A, (__v8su)__W,
 992                                                    (__mmask8)__U);
 993}
 994
 995static __inline__ __m256i __DEFAULT_FN_ATTRS256
 996_mm256_maskz_cvtph_epu32(__mmask8 __U, __m128h __A) {
 997  return (__m256i)__builtin_ia32_vcvtph2udq256_mask(
 998      (__v8hf)__A, (__v8su)_mm256_setzero_si256(), (__mmask8)__U);
 999}
1000
1001static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtepi32_ph(__m128i __A) {
1002  return (__m128h)__builtin_ia32_vcvtdq2ph128_mask(
1003      (__v4si)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
1004}
1005
1006static __inline__ __m128h __DEFAULT_FN_ATTRS128
1007_mm_mask_cvtepi32_ph(__m128h __W, __mmask8 __U, __m128i __A) {
1008  return (__m128h)__builtin_ia32_vcvtdq2ph128_mask((__v4si)__A, (__v8hf)__W,
1009                                                   (__mmask8)__U);
1010}
1011
1012static __inline__ __m128h __DEFAULT_FN_ATTRS128
1013_mm_maskz_cvtepi32_ph(__mmask8 __U, __m128i __A) {
1014  return (__m128h)__builtin_ia32_vcvtdq2ph128_mask(
1015      (__v4si)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
1016}
1017
1018static __inline__ __m128h __DEFAULT_FN_ATTRS256
1019_mm256_cvtepi32_ph(__m256i __A) {
1020  return (__m128h) __builtin_convertvector((__v8si)__A, __v8hf);
1021}
1022
1023static __inline__ __m128h __DEFAULT_FN_ATTRS256
1024_mm256_mask_cvtepi32_ph(__m128h __W, __mmask8 __U, __m256i __A) {
1025  return (__m128h)__builtin_ia32_selectph_128(
1026      (__mmask8)__U, (__v8hf)_mm256_cvtepi32_ph(__A), (__v8hf)__W);
1027}
1028
1029static __inline__ __m128h __DEFAULT_FN_ATTRS256
1030_mm256_maskz_cvtepi32_ph(__mmask8 __U, __m256i __A) {
1031  return (__m128h)__builtin_ia32_selectph_128(
1032      (__mmask8)__U, (__v8hf)_mm256_cvtepi32_ph(__A), (__v8hf)_mm_setzero_ph());
1033}
1034
1035static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtepu32_ph(__m128i __A) {
1036  return (__m128h)__builtin_ia32_vcvtudq2ph128_mask(
1037      (__v4su)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
1038}
1039
1040static __inline__ __m128h __DEFAULT_FN_ATTRS128
1041_mm_mask_cvtepu32_ph(__m128h __W, __mmask8 __U, __m128i __A) {
1042  return (__m128h)__builtin_ia32_vcvtudq2ph128_mask((__v4su)__A, (__v8hf)__W,
1043                                                    (__mmask8)__U);
1044}
1045
1046static __inline__ __m128h __DEFAULT_FN_ATTRS128
1047_mm_maskz_cvtepu32_ph(__mmask8 __U, __m128i __A) {
1048  return (__m128h)__builtin_ia32_vcvtudq2ph128_mask(
1049      (__v4su)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
1050}
1051
1052static __inline__ __m128h __DEFAULT_FN_ATTRS256
1053_mm256_cvtepu32_ph(__m256i __A) {
1054  return (__m128h) __builtin_convertvector((__v8su)__A, __v8hf);
1055}
1056
1057static __inline__ __m128h __DEFAULT_FN_ATTRS256
1058_mm256_mask_cvtepu32_ph(__m128h __W, __mmask8 __U, __m256i __A) {
1059  return (__m128h)__builtin_ia32_selectph_128(
1060      (__mmask8)__U, (__v8hf)_mm256_cvtepu32_ph(__A), (__v8hf)__W);
1061}
1062
1063static __inline__ __m128h __DEFAULT_FN_ATTRS256
1064_mm256_maskz_cvtepu32_ph(__mmask8 __U, __m256i __A) {
1065  return (__m128h)__builtin_ia32_selectph_128(
1066      (__mmask8)__U, (__v8hf)_mm256_cvtepu32_ph(__A), (__v8hf)_mm_setzero_ph());
1067}
1068
1069static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttph_epi32(__m128h __A) {
1070  return (__m128i)__builtin_ia32_vcvttph2dq128_mask(
1071      (__v8hf)__A, (__v4si)_mm_undefined_si128(), (__mmask8)-1);
1072}
1073
1074static __inline__ __m128i __DEFAULT_FN_ATTRS128
1075_mm_mask_cvttph_epi32(__m128i __W, __mmask8 __U, __m128h __A) {
1076  return (__m128i)__builtin_ia32_vcvttph2dq128_mask((__v8hf)__A, (__v4si)__W,
1077                                                    (__mmask8)__U);
1078}
1079
1080static __inline__ __m128i __DEFAULT_FN_ATTRS128
1081_mm_maskz_cvttph_epi32(__mmask8 __U, __m128h __A) {
1082  return (__m128i)__builtin_ia32_vcvttph2dq128_mask(
1083      (__v8hf)__A, (__v4si)_mm_setzero_si128(), (__mmask8)__U);
1084}
1085
1086static __inline__ __m256i __DEFAULT_FN_ATTRS256
1087_mm256_cvttph_epi32(__m128h __A) {
1088  return (__m256i)__builtin_ia32_vcvttph2dq256_mask(
1089      (__v8hf)__A, (__v8si)_mm256_undefined_si256(), (__mmask8)-1);
1090}
1091
1092static __inline__ __m256i __DEFAULT_FN_ATTRS256
1093_mm256_mask_cvttph_epi32(__m256i __W, __mmask8 __U, __m128h __A) {
1094  return (__m256i)__builtin_ia32_vcvttph2dq256_mask((__v8hf)__A, (__v8si)__W,
1095                                                    (__mmask8)__U);
1096}
1097
1098static __inline__ __m256i __DEFAULT_FN_ATTRS256
1099_mm256_maskz_cvttph_epi32(__mmask8 __U, __m128h __A) {
1100  return (__m256i)__builtin_ia32_vcvttph2dq256_mask(
1101      (__v8hf)__A, (__v8si)_mm256_setzero_si256(), (__mmask8)__U);
1102}
1103
1104static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttph_epu32(__m128h __A) {
1105  return (__m128i)__builtin_ia32_vcvttph2udq128_mask(
1106      (__v8hf)__A, (__v4su)_mm_undefined_si128(), (__mmask8)-1);
1107}
1108
1109static __inline__ __m128i __DEFAULT_FN_ATTRS128
1110_mm_mask_cvttph_epu32(__m128i __W, __mmask8 __U, __m128h __A) {
1111  return (__m128i)__builtin_ia32_vcvttph2udq128_mask((__v8hf)__A, (__v4su)__W,
1112                                                     (__mmask8)__U);
1113}
1114
1115static __inline__ __m128i __DEFAULT_FN_ATTRS128
1116_mm_maskz_cvttph_epu32(__mmask8 __U, __m128h __A) {
1117  return (__m128i)__builtin_ia32_vcvttph2udq128_mask(
1118      (__v8hf)__A, (__v4su)_mm_setzero_si128(), (__mmask8)__U);
1119}
1120
1121static __inline__ __m256i __DEFAULT_FN_ATTRS256
1122_mm256_cvttph_epu32(__m128h __A) {
1123  return (__m256i)__builtin_ia32_vcvttph2udq256_mask(
1124      (__v8hf)__A, (__v8su)_mm256_undefined_si256(), (__mmask8)-1);
1125}
1126
1127static __inline__ __m256i __DEFAULT_FN_ATTRS256
1128_mm256_mask_cvttph_epu32(__m256i __W, __mmask8 __U, __m128h __A) {
1129  return (__m256i)__builtin_ia32_vcvttph2udq256_mask((__v8hf)__A, (__v8su)__W,
1130                                                     (__mmask8)__U);
1131}
1132
1133static __inline__ __m256i __DEFAULT_FN_ATTRS256
1134_mm256_maskz_cvttph_epu32(__mmask8 __U, __m128h __A) {
1135  return (__m256i)__builtin_ia32_vcvttph2udq256_mask(
1136      (__v8hf)__A, (__v8su)_mm256_setzero_si256(), (__mmask8)__U);
1137}
1138
1139static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtepi64_ph(__m128i __A) {
1140  return (__m128h)__builtin_ia32_vcvtqq2ph128_mask(
1141      (__v2di)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
1142}
1143
1144static __inline__ __m128h __DEFAULT_FN_ATTRS128
1145_mm_mask_cvtepi64_ph(__m128h __W, __mmask8 __U, __m128i __A) {
1146  return (__m128h)__builtin_ia32_vcvtqq2ph128_mask((__v2di)__A, (__v8hf)__W,
1147                                                   (__mmask8)__U);
1148}
1149
1150static __inline__ __m128h __DEFAULT_FN_ATTRS128
1151_mm_maskz_cvtepi64_ph(__mmask8 __U, __m128i __A) {
1152  return (__m128h)__builtin_ia32_vcvtqq2ph128_mask(
1153      (__v2di)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
1154}
1155
1156static __inline__ __m128h __DEFAULT_FN_ATTRS256
1157_mm256_cvtepi64_ph(__m256i __A) {
1158  return (__m128h)__builtin_ia32_vcvtqq2ph256_mask(
1159      (__v4di)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
1160}
1161
1162static __inline__ __m128h __DEFAULT_FN_ATTRS256
1163_mm256_mask_cvtepi64_ph(__m128h __W, __mmask8 __U, __m256i __A) {
1164  return (__m128h)__builtin_ia32_vcvtqq2ph256_mask((__v4di)__A, (__v8hf)__W,
1165                                                   (__mmask8)__U);
1166}
1167
1168static __inline__ __m128h __DEFAULT_FN_ATTRS256
1169_mm256_maskz_cvtepi64_ph(__mmask8 __U, __m256i __A) {
1170  return (__m128h)__builtin_ia32_vcvtqq2ph256_mask(
1171      (__v4di)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
1172}
1173
1174static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtph_epi64(__m128h __A) {
1175  return (__m128i)__builtin_ia32_vcvtph2qq128_mask(
1176      (__v8hf)__A, (__v2di)_mm_undefined_si128(), (__mmask8)-1);
1177}
1178
1179static __inline__ __m128i __DEFAULT_FN_ATTRS128
1180_mm_mask_cvtph_epi64(__m128i __W, __mmask8 __U, __m128h __A) {
1181  return (__m128i)__builtin_ia32_vcvtph2qq128_mask((__v8hf)__A, (__v2di)__W,
1182                                                   (__mmask8)__U);
1183}
1184
1185static __inline__ __m128i __DEFAULT_FN_ATTRS128
1186_mm_maskz_cvtph_epi64(__mmask8 __U, __m128h __A) {
1187  return (__m128i)__builtin_ia32_vcvtph2qq128_mask(
1188      (__v8hf)__A, (__v2di)_mm_setzero_si128(), (__mmask8)__U);
1189}
1190
1191static __inline__ __m256i __DEFAULT_FN_ATTRS256
1192_mm256_cvtph_epi64(__m128h __A) {
1193  return (__m256i)__builtin_ia32_vcvtph2qq256_mask(
1194      (__v8hf)__A, (__v4di)_mm256_undefined_si256(), (__mmask8)-1);
1195}
1196
1197static __inline__ __m256i __DEFAULT_FN_ATTRS256
1198_mm256_mask_cvtph_epi64(__m256i __W, __mmask8 __U, __m128h __A) {
1199  return (__m256i)__builtin_ia32_vcvtph2qq256_mask((__v8hf)__A, (__v4di)__W,
1200                                                   (__mmask8)__U);
1201}
1202
1203static __inline__ __m256i __DEFAULT_FN_ATTRS256
1204_mm256_maskz_cvtph_epi64(__mmask8 __U, __m128h __A) {
1205  return (__m256i)__builtin_ia32_vcvtph2qq256_mask(
1206      (__v8hf)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)__U);
1207}
1208
1209static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtepu64_ph(__m128i __A) {
1210  return (__m128h)__builtin_ia32_vcvtuqq2ph128_mask(
1211      (__v2du)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
1212}
1213
1214static __inline__ __m128h __DEFAULT_FN_ATTRS128
1215_mm_mask_cvtepu64_ph(__m128h __W, __mmask8 __U, __m128i __A) {
1216  return (__m128h)__builtin_ia32_vcvtuqq2ph128_mask((__v2du)__A, (__v8hf)__W,
1217                                                    (__mmask8)__U);
1218}
1219
1220static __inline__ __m128h __DEFAULT_FN_ATTRS128
1221_mm_maskz_cvtepu64_ph(__mmask8 __U, __m128i __A) {
1222  return (__m128h)__builtin_ia32_vcvtuqq2ph128_mask(
1223      (__v2du)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
1224}
1225
1226static __inline__ __m128h __DEFAULT_FN_ATTRS256
1227_mm256_cvtepu64_ph(__m256i __A) {
1228  return (__m128h)__builtin_ia32_vcvtuqq2ph256_mask(
1229      (__v4du)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
1230}
1231
1232static __inline__ __m128h __DEFAULT_FN_ATTRS256
1233_mm256_mask_cvtepu64_ph(__m128h __W, __mmask8 __U, __m256i __A) {
1234  return (__m128h)__builtin_ia32_vcvtuqq2ph256_mask((__v4du)__A, (__v8hf)__W,
1235                                                    (__mmask8)__U);
1236}
1237
1238static __inline__ __m128h __DEFAULT_FN_ATTRS256
1239_mm256_maskz_cvtepu64_ph(__mmask8 __U, __m256i __A) {
1240  return (__m128h)__builtin_ia32_vcvtuqq2ph256_mask(
1241      (__v4du)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
1242}
1243
1244static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtph_epu64(__m128h __A) {
1245  return (__m128i)__builtin_ia32_vcvtph2uqq128_mask(
1246      (__v8hf)__A, (__v2du)_mm_undefined_si128(), (__mmask8)-1);
1247}
1248
1249static __inline__ __m128i __DEFAULT_FN_ATTRS128
1250_mm_mask_cvtph_epu64(__m128i __W, __mmask8 __U, __m128h __A) {
1251  return (__m128i)__builtin_ia32_vcvtph2uqq128_mask((__v8hf)__A, (__v2du)__W,
1252                                                    (__mmask8)__U);
1253}
1254
1255static __inline__ __m128i __DEFAULT_FN_ATTRS128
1256_mm_maskz_cvtph_epu64(__mmask8 __U, __m128h __A) {
1257  return (__m128i)__builtin_ia32_vcvtph2uqq128_mask(
1258      (__v8hf)__A, (__v2du)_mm_setzero_si128(), (__mmask8)__U);
1259}
1260
1261static __inline__ __m256i __DEFAULT_FN_ATTRS256
1262_mm256_cvtph_epu64(__m128h __A) {
1263  return (__m256i)__builtin_ia32_vcvtph2uqq256_mask(
1264      (__v8hf)__A, (__v4du)_mm256_undefined_si256(), (__mmask8)-1);
1265}
1266
1267static __inline__ __m256i __DEFAULT_FN_ATTRS256
1268_mm256_mask_cvtph_epu64(__m256i __W, __mmask8 __U, __m128h __A) {
1269  return (__m256i)__builtin_ia32_vcvtph2uqq256_mask((__v8hf)__A, (__v4du)__W,
1270                                                    (__mmask8)__U);
1271}
1272
1273static __inline__ __m256i __DEFAULT_FN_ATTRS256
1274_mm256_maskz_cvtph_epu64(__mmask8 __U, __m128h __A) {
1275  return (__m256i)__builtin_ia32_vcvtph2uqq256_mask(
1276      (__v8hf)__A, (__v4du)_mm256_setzero_si256(), (__mmask8)__U);
1277}
1278
1279static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttph_epi64(__m128h __A) {
1280  return (__m128i)__builtin_ia32_vcvttph2qq128_mask(
1281      (__v8hf)__A, (__v2di)_mm_undefined_si128(), (__mmask8)-1);
1282}
1283
1284static __inline__ __m128i __DEFAULT_FN_ATTRS128
1285_mm_mask_cvttph_epi64(__m128i __W, __mmask8 __U, __m128h __A) {
1286  return (__m128i)__builtin_ia32_vcvttph2qq128_mask((__v8hf)__A, (__v2di)__W,
1287                                                    (__mmask8)__U);
1288}
1289
1290static __inline__ __m128i __DEFAULT_FN_ATTRS128
1291_mm_maskz_cvttph_epi64(__mmask8 __U, __m128h __A) {
1292  return (__m128i)__builtin_ia32_vcvttph2qq128_mask(
1293      (__v8hf)__A, (__v2di)_mm_setzero_si128(), (__mmask8)__U);
1294}
1295
1296static __inline__ __m256i __DEFAULT_FN_ATTRS256
1297_mm256_cvttph_epi64(__m128h __A) {
1298  return (__m256i)__builtin_ia32_vcvttph2qq256_mask(
1299      (__v8hf)__A, (__v4di)_mm256_undefined_si256(), (__mmask8)-1);
1300}
1301
1302static __inline__ __m256i __DEFAULT_FN_ATTRS256
1303_mm256_mask_cvttph_epi64(__m256i __W, __mmask8 __U, __m128h __A) {
1304  return (__m256i)__builtin_ia32_vcvttph2qq256_mask((__v8hf)__A, (__v4di)__W,
1305                                                    (__mmask8)__U);
1306}
1307
1308static __inline__ __m256i __DEFAULT_FN_ATTRS256
1309_mm256_maskz_cvttph_epi64(__mmask8 __U, __m128h __A) {
1310  return (__m256i)__builtin_ia32_vcvttph2qq256_mask(
1311      (__v8hf)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)__U);
1312}
1313
1314static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttph_epu64(__m128h __A) {
1315  return (__m128i)__builtin_ia32_vcvttph2uqq128_mask(
1316      (__v8hf)__A, (__v2du)_mm_undefined_si128(), (__mmask8)-1);
1317}
1318
1319static __inline__ __m128i __DEFAULT_FN_ATTRS128
1320_mm_mask_cvttph_epu64(__m128i __W, __mmask8 __U, __m128h __A) {
1321  return (__m128i)__builtin_ia32_vcvttph2uqq128_mask((__v8hf)__A, (__v2du)__W,
1322                                                     (__mmask8)__U);
1323}
1324
1325static __inline__ __m128i __DEFAULT_FN_ATTRS128
1326_mm_maskz_cvttph_epu64(__mmask8 __U, __m128h __A) {
1327  return (__m128i)__builtin_ia32_vcvttph2uqq128_mask(
1328      (__v8hf)__A, (__v2du)_mm_setzero_si128(), (__mmask8)__U);
1329}
1330
1331static __inline__ __m256i __DEFAULT_FN_ATTRS256
1332_mm256_cvttph_epu64(__m128h __A) {
1333  return (__m256i)__builtin_ia32_vcvttph2uqq256_mask(
1334      (__v8hf)__A, (__v4du)_mm256_undefined_si256(), (__mmask8)-1);
1335}
1336
1337static __inline__ __m256i __DEFAULT_FN_ATTRS256
1338_mm256_mask_cvttph_epu64(__m256i __W, __mmask8 __U, __m128h __A) {
1339  return (__m256i)__builtin_ia32_vcvttph2uqq256_mask((__v8hf)__A, (__v4du)__W,
1340                                                     (__mmask8)__U);
1341}
1342
1343static __inline__ __m256i __DEFAULT_FN_ATTRS256
1344_mm256_maskz_cvttph_epu64(__mmask8 __U, __m128h __A) {
1345  return (__m256i)__builtin_ia32_vcvttph2uqq256_mask(
1346      (__v8hf)__A, (__v4du)_mm256_setzero_si256(), (__mmask8)__U);
1347}
1348
1349static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_cvtxph_ps(__m128h __A) {
1350  return (__m128)__builtin_ia32_vcvtph2psx128_mask(
1351      (__v8hf)__A, (__v4sf)_mm_undefined_ps(), (__mmask8)-1);
1352}
1353
1354static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_cvtxph_ps(__m128 __W,
1355                                                                  __mmask8 __U,
1356                                                                  __m128h __A) {
1357  return (__m128)__builtin_ia32_vcvtph2psx128_mask((__v8hf)__A, (__v4sf)__W,
1358                                                   (__mmask8)__U);
1359}
1360
1361static __inline__ __m128 __DEFAULT_FN_ATTRS128
1362_mm_maskz_cvtxph_ps(__mmask8 __U, __m128h __A) {
1363  return (__m128)__builtin_ia32_vcvtph2psx128_mask(
1364      (__v8hf)__A, (__v4sf)_mm_setzero_ps(), (__mmask8)__U);
1365}
1366
1367static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_cvtxph_ps(__m128h __A) {
1368  return (__m256)__builtin_ia32_vcvtph2psx256_mask(
1369      (__v8hf)__A, (__v8sf)_mm256_undefined_ps(), (__mmask8)-1);
1370}
1371
1372static __inline__ __m256 __DEFAULT_FN_ATTRS256
1373_mm256_mask_cvtxph_ps(__m256 __W, __mmask8 __U, __m128h __A) {
1374  return (__m256)__builtin_ia32_vcvtph2psx256_mask((__v8hf)__A, (__v8sf)__W,
1375                                                   (__mmask8)__U);
1376}
1377
1378static __inline__ __m256 __DEFAULT_FN_ATTRS256
1379_mm256_maskz_cvtxph_ps(__mmask8 __U, __m128h __A) {
1380  return (__m256)__builtin_ia32_vcvtph2psx256_mask(
1381      (__v8hf)__A, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U);
1382}
1383
1384static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtxps_ph(__m128 __A) {
1385  return (__m128h)__builtin_ia32_vcvtps2phx128_mask(
1386      (__v4sf)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
1387}
1388
1389static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_cvtxps_ph(__m128h __W,
1390                                                                   __mmask8 __U,
1391                                                                   __m128 __A) {
1392  return (__m128h)__builtin_ia32_vcvtps2phx128_mask((__v4sf)__A, (__v8hf)__W,
1393                                                    (__mmask8)__U);
1394}
1395
1396static __inline__ __m128h __DEFAULT_FN_ATTRS128
1397_mm_maskz_cvtxps_ph(__mmask8 __U, __m128 __A) {
1398  return (__m128h)__builtin_ia32_vcvtps2phx128_mask(
1399      (__v4sf)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
1400}
1401
1402static __inline__ __m128h __DEFAULT_FN_ATTRS256 _mm256_cvtxps_ph(__m256 __A) {
1403  return (__m128h)__builtin_ia32_vcvtps2phx256_mask(
1404      (__v8sf)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
1405}
1406
1407static __inline__ __m128h __DEFAULT_FN_ATTRS256
1408_mm256_mask_cvtxps_ph(__m128h __W, __mmask8 __U, __m256 __A) {
1409  return (__m128h)__builtin_ia32_vcvtps2phx256_mask((__v8sf)__A, (__v8hf)__W,
1410                                                    (__mmask8)__U);
1411}
1412
1413static __inline__ __m128h __DEFAULT_FN_ATTRS256
1414_mm256_maskz_cvtxps_ph(__mmask8 __U, __m256 __A) {
1415  return (__m128h)__builtin_ia32_vcvtps2phx256_mask(
1416      (__v8sf)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
1417}
1418
1419static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmadd_ph(__m128h __A,
1420                                                             __m128h __B,
1421                                                             __m128h __C) {
1422  return (__m128h)__builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B,
1423                                          (__v8hf)__C);
1424}
1425
1426static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmadd_ph(__m128h __A,
1427                                                                  __mmask8 __U,
1428                                                                  __m128h __B,
1429                                                                  __m128h __C) {
1430  return (__m128h)__builtin_ia32_selectph_128(
1431      (__mmask8)__U,
1432      __builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
1433      (__v8hf)__A);
1434}
1435
1436static __inline__ __m128h __DEFAULT_FN_ATTRS128
1437_mm_mask3_fmadd_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
1438  return (__m128h)__builtin_ia32_selectph_128(
1439      (__mmask8)__U,
1440      __builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
1441      (__v8hf)__C);
1442}
1443
1444static __inline__ __m128h __DEFAULT_FN_ATTRS128
1445_mm_maskz_fmadd_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
1446  return (__m128h)__builtin_ia32_selectph_128(
1447      (__mmask8)__U,
1448      __builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
1449      (__v8hf)_mm_setzero_ph());
1450}
1451
1452static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmsub_ph(__m128h __A,
1453                                                             __m128h __B,
1454                                                             __m128h __C) {
1455  return (__m128h)__builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B,
1456                                          -(__v8hf)__C);
1457}
1458
1459static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmsub_ph(__m128h __A,
1460                                                                  __mmask8 __U,
1461                                                                  __m128h __B,
1462                                                                  __m128h __C) {
1463  return (__m128h)__builtin_ia32_selectph_128(
1464      (__mmask8)__U, _mm_fmsub_ph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
1465      (__v8hf)__A);
1466}
1467
1468static __inline__ __m128h __DEFAULT_FN_ATTRS128
1469_mm_maskz_fmsub_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
1470  return (__m128h)__builtin_ia32_selectph_128(
1471      (__mmask8)__U, _mm_fmsub_ph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
1472      (__v8hf)_mm_setzero_ph());
1473}
1474
1475static __inline__ __m128h __DEFAULT_FN_ATTRS128
1476_mm_mask3_fnmadd_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
1477  return (__m128h)__builtin_ia32_selectph_128(
1478      (__mmask8)__U,
1479      __builtin_ia32_vfmaddph(-(__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
1480      (__v8hf)__C);
1481}
1482
1483static __inline__ __m128h __DEFAULT_FN_ATTRS128
1484_mm_maskz_fnmadd_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
1485  return (__m128h)__builtin_ia32_selectph_128(
1486      (__mmask8)__U,
1487      __builtin_ia32_vfmaddph(-(__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
1488      (__v8hf)_mm_setzero_ph());
1489}
1490
1491static __inline__ __m128h __DEFAULT_FN_ATTRS128
1492_mm_maskz_fnmsub_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
1493  return (__m128h)__builtin_ia32_selectph_128(
1494      (__mmask8)__U,
1495      __builtin_ia32_vfmaddph(-(__v8hf)__A, (__v8hf)__B, -(__v8hf)__C),
1496      (__v8hf)_mm_setzero_ph());
1497}
1498
1499static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fmadd_ph(__m256h __A,
1500                                                                __m256h __B,
1501                                                                __m256h __C) {
1502  return (__m256h)__builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B,
1503                                             (__v16hf)__C);
1504}
1505
1506static __inline__ __m256h __DEFAULT_FN_ATTRS256
1507_mm256_mask_fmadd_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) {
1508  return (__m256h)__builtin_ia32_selectph_256(
1509      (__mmask16)__U,
1510      __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, (__v16hf)__C),
1511      (__v16hf)__A);
1512}
1513
1514static __inline__ __m256h __DEFAULT_FN_ATTRS256
1515_mm256_mask3_fmadd_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) {
1516  return (__m256h)__builtin_ia32_selectph_256(
1517      (__mmask16)__U,
1518      __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, (__v16hf)__C),
1519      (__v16hf)__C);
1520}
1521
1522static __inline__ __m256h __DEFAULT_FN_ATTRS256
1523_mm256_maskz_fmadd_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) {
1524  return (__m256h)__builtin_ia32_selectph_256(
1525      (__mmask16)__U,
1526      __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, (__v16hf)__C),
1527      (__v16hf)_mm256_setzero_ph());
1528}
1529
1530static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fmsub_ph(__m256h __A,
1531                                                                __m256h __B,
1532                                                                __m256h __C) {
1533  return (__m256h)__builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B,
1534                                             -(__v16hf)__C);
1535}
1536
1537static __inline__ __m256h __DEFAULT_FN_ATTRS256
1538_mm256_mask_fmsub_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) {
1539  return (__m256h)__builtin_ia32_selectph_256(
1540      (__mmask16)__U,
1541      __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C),
1542      (__v16hf)__A);
1543}
1544
1545static __inline__ __m256h __DEFAULT_FN_ATTRS256
1546_mm256_maskz_fmsub_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) {
1547  return (__m256h)__builtin_ia32_selectph_256(
1548      (__mmask16)__U,
1549      __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C),
1550      (__v16hf)_mm256_setzero_ph());
1551}
1552
1553static __inline__ __m256h __DEFAULT_FN_ATTRS256
1554_mm256_mask3_fnmadd_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) {
1555  return (__m256h)__builtin_ia32_selectph_256(
1556      (__mmask16)__U,
1557      __builtin_ia32_vfmaddph256(-(__v16hf)__A, (__v16hf)__B, (__v16hf)__C),
1558      (__v16hf)__C);
1559}
1560
1561static __inline__ __m256h __DEFAULT_FN_ATTRS256
1562_mm256_maskz_fnmadd_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) {
1563  return (__m256h)__builtin_ia32_selectph_256(
1564      (__mmask16)__U,
1565      __builtin_ia32_vfmaddph256(-(__v16hf)__A, (__v16hf)__B, (__v16hf)__C),
1566      (__v16hf)_mm256_setzero_ph());
1567}
1568
1569static __inline__ __m256h __DEFAULT_FN_ATTRS256
1570_mm256_maskz_fnmsub_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) {
1571  return (__m256h)__builtin_ia32_selectph_256(
1572      (__mmask16)__U,
1573      __builtin_ia32_vfmaddph256(-(__v16hf)__A, (__v16hf)__B, -(__v16hf)__C),
1574      (__v16hf)_mm256_setzero_ph());
1575}
1576
1577static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmaddsub_ph(__m128h __A,
1578                                                                __m128h __B,
1579                                                                __m128h __C) {
1580  return (__m128h)__builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B,
1581                                             (__v8hf)__C);
1582}
1583
1584static __inline__ __m128h __DEFAULT_FN_ATTRS128
1585_mm_mask_fmaddsub_ph(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
1586  return (__m128h)__builtin_ia32_selectph_128(
1587      (__mmask8)__U,
1588      __builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
1589      (__v8hf)__A);
1590}
1591
1592static __inline__ __m128h __DEFAULT_FN_ATTRS128
1593_mm_mask3_fmaddsub_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
1594  return (__m128h)__builtin_ia32_selectph_128(
1595      (__mmask8)__U,
1596      __builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
1597      (__v8hf)__C);
1598}
1599
1600static __inline__ __m128h __DEFAULT_FN_ATTRS128
1601_mm_maskz_fmaddsub_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
1602  return (__m128h)__builtin_ia32_selectph_128(
1603      (__mmask8)__U,
1604      __builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
1605      (__v8hf)_mm_setzero_ph());
1606}
1607
1608static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmsubadd_ph(__m128h __A,
1609                                                                __m128h __B,
1610                                                                __m128h __C) {
1611  return (__m128h)__builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B,
1612                                             -(__v8hf)__C);
1613}
1614
1615static __inline__ __m128h __DEFAULT_FN_ATTRS128
1616_mm_mask_fmsubadd_ph(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
1617  return (__m128h)__builtin_ia32_selectph_128(
1618      (__mmask8)__U,
1619      __builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, -(__v8hf)__C),
1620      (__v8hf)__A);
1621}
1622
1623static __inline__ __m128h __DEFAULT_FN_ATTRS128
1624_mm_maskz_fmsubadd_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
1625  return (__m128h)__builtin_ia32_selectph_128(
1626      (__mmask8)__U,
1627      __builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, -(__v8hf)__C),
1628      (__v8hf)_mm_setzero_ph());
1629}
1630
1631static __inline__ __m256h __DEFAULT_FN_ATTRS256
1632_mm256_fmaddsub_ph(__m256h __A, __m256h __B, __m256h __C) {
1633  return (__m256h)__builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B,
1634                                                (__v16hf)__C);
1635}
1636
1637static __inline__ __m256h __DEFAULT_FN_ATTRS256
1638_mm256_mask_fmaddsub_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) {
1639  return (__m256h)__builtin_ia32_selectph_256(
1640      (__mmask16)__U,
1641      __builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, (__v16hf)__C),
1642      (__v16hf)__A);
1643}
1644
1645static __inline__ __m256h __DEFAULT_FN_ATTRS256
1646_mm256_mask3_fmaddsub_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) {
1647  return (__m256h)__builtin_ia32_selectph_256(
1648      (__mmask16)__U,
1649      __builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, (__v16hf)__C),
1650      (__v16hf)__C);
1651}
1652
1653static __inline__ __m256h __DEFAULT_FN_ATTRS256
1654_mm256_maskz_fmaddsub_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) {
1655  return (__m256h)__builtin_ia32_selectph_256(
1656      (__mmask16)__U,
1657      __builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, (__v16hf)__C),
1658      (__v16hf)_mm256_setzero_ph());
1659}
1660
1661static __inline__ __m256h __DEFAULT_FN_ATTRS256
1662_mm256_fmsubadd_ph(__m256h __A, __m256h __B, __m256h __C) {
1663  return (__m256h)__builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B,
1664                                                -(__v16hf)__C);
1665}
1666
1667static __inline__ __m256h __DEFAULT_FN_ATTRS256
1668_mm256_mask_fmsubadd_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) {
1669  return (__m256h)__builtin_ia32_selectph_256(
1670      (__mmask16)__U,
1671      __builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C),
1672      (__v16hf)__A);
1673}
1674
1675static __inline__ __m256h __DEFAULT_FN_ATTRS256
1676_mm256_maskz_fmsubadd_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) {
1677  return (__m256h)__builtin_ia32_selectph_256(
1678      (__mmask16)__U,
1679      __builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C),
1680      (__v16hf)_mm256_setzero_ph());
1681}
1682
1683static __inline__ __m128h __DEFAULT_FN_ATTRS128
1684_mm_mask3_fmsub_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
1685  return (__m128h)__builtin_ia32_selectph_128(
1686      (__mmask8)__U,
1687      __builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B, -(__v8hf)__C),
1688      (__v8hf)__C);
1689}
1690
1691static __inline__ __m256h __DEFAULT_FN_ATTRS256
1692_mm256_mask3_fmsub_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) {
1693  return (__m256h)__builtin_ia32_selectph_256(
1694      (__mmask16)__U,
1695      __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C),
1696      (__v16hf)__C);
1697}
1698
1699static __inline__ __m128h __DEFAULT_FN_ATTRS128
1700_mm_mask3_fmsubadd_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
1701  return (__m128h)__builtin_ia32_selectph_128(
1702      (__mmask8)__U,
1703      __builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, -(__v8hf)__C),
1704      (__v8hf)__C);
1705}
1706
1707static __inline__ __m256h __DEFAULT_FN_ATTRS256
1708_mm256_mask3_fmsubadd_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) {
1709  return (__m256h)__builtin_ia32_selectph_256(
1710      (__mmask16)__U,
1711      __builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C),
1712      (__v16hf)__C);
1713}
1714
1715static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fnmadd_ph(__m128h __A,
1716                                                              __m128h __B,
1717                                                              __m128h __C) {
1718  return (__m128h)__builtin_ia32_vfmaddph((__v8hf)__A, -(__v8hf)__B,
1719                                          (__v8hf)__C);
1720}
1721
1722static __inline__ __m128h __DEFAULT_FN_ATTRS128
1723_mm_mask_fnmadd_ph(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
1724  return (__m128h)__builtin_ia32_selectph_128(
1725      (__mmask8)__U,
1726      __builtin_ia32_vfmaddph((__v8hf)__A, -(__v8hf)__B, (__v8hf)__C),
1727      (__v8hf)__A);
1728}
1729
1730static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fnmadd_ph(__m256h __A,
1731                                                                 __m256h __B,
1732                                                                 __m256h __C) {
1733  return (__m256h)__builtin_ia32_vfmaddph256((__v16hf)__A, -(__v16hf)__B,
1734                                             (__v16hf)__C);
1735}
1736
1737static __inline__ __m256h __DEFAULT_FN_ATTRS256
1738_mm256_mask_fnmadd_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) {
1739  return (__m256h)__builtin_ia32_selectph_256(
1740      (__mmask16)__U,
1741      __builtin_ia32_vfmaddph256((__v16hf)__A, -(__v16hf)__B, (__v16hf)__C),
1742      (__v16hf)__A);
1743}
1744
1745static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fnmsub_ph(__m128h __A,
1746                                                              __m128h __B,
1747                                                              __m128h __C) {
1748  return (__m128h)__builtin_ia32_vfmaddph((__v8hf)__A, -(__v8hf)__B,
1749                                          -(__v8hf)__C);
1750}
1751
1752static __inline__ __m128h __DEFAULT_FN_ATTRS128
1753_mm_mask_fnmsub_ph(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
1754  return (__m128h)__builtin_ia32_selectph_128(
1755      (__mmask8)__U,
1756      __builtin_ia32_vfmaddph((__v8hf)__A, -(__v8hf)__B, -(__v8hf)__C),
1757      (__v8hf)__A);
1758}
1759
1760static __inline__ __m128h __DEFAULT_FN_ATTRS128
1761_mm_mask3_fnmsub_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
1762  return (__m128h)__builtin_ia32_selectph_128(
1763      (__mmask8)__U,
1764      __builtin_ia32_vfmaddph((__v8hf)__A, -(__v8hf)__B, -(__v8hf)__C),
1765      (__v8hf)__C);
1766}
1767
1768static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fnmsub_ph(__m256h __A,
1769                                                                 __m256h __B,
1770                                                                 __m256h __C) {
1771  return (__m256h)__builtin_ia32_vfmaddph256((__v16hf)__A, -(__v16hf)__B,
1772                                             -(__v16hf)__C);
1773}
1774
1775static __inline__ __m256h __DEFAULT_FN_ATTRS256
1776_mm256_mask_fnmsub_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) {
1777  return (__m256h)__builtin_ia32_selectph_256(
1778      (__mmask16)__U,
1779      __builtin_ia32_vfmaddph256((__v16hf)__A, -(__v16hf)__B, -(__v16hf)__C),
1780      (__v16hf)__A);
1781}
1782
1783static __inline__ __m256h __DEFAULT_FN_ATTRS256
1784_mm256_mask3_fnmsub_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) {
1785  return (__m256h)__builtin_ia32_selectph_256(
1786      (__mmask16)__U,
1787      __builtin_ia32_vfmaddph256((__v16hf)__A, -(__v16hf)__B, -(__v16hf)__C),
1788      (__v16hf)__C);
1789}
1790
1791static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fcmul_pch(__m128h __A,
1792                                                              __m128h __B) {
1793  return (__m128h)__builtin_ia32_vfcmulcph128_mask(
1794      (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_undefined_ph(), (__mmask8)-1);
1795}
1796
1797static __inline__ __m128h __DEFAULT_FN_ATTRS128
1798_mm_mask_fcmul_pch(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
1799  return (__m128h)__builtin_ia32_vfcmulcph128_mask((__v4sf)__A, (__v4sf)__B,
1800                                                   (__v4sf)__W, (__mmask8)__U);
1801}
1802
1803static __inline__ __m128h __DEFAULT_FN_ATTRS128
1804_mm_maskz_fcmul_pch(__mmask8 __U, __m128h __A, __m128h __B) {
1805  return (__m128h)__builtin_ia32_vfcmulcph128_mask(
1806      (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ph(), (__mmask8)__U);
1807}
1808
1809static __inline__ __m256h __DEFAULT_FN_ATTRS128 _mm256_fcmul_pch(__m256h __A,
1810                                                                 __m256h __B) {
1811  return (__m256h)__builtin_ia32_vfcmulcph256_mask(
1812      (__v8sf)__A, (__v8sf)__B, (__v8sf)_mm256_undefined_ph(), (__mmask8)-1);
1813}
1814
1815static __inline__ __m256h __DEFAULT_FN_ATTRS256
1816_mm256_mask_fcmul_pch(__m256h __W, __mmask8 __U, __m256h __A, __m256h __B) {
1817  return (__m256h)__builtin_ia32_vfcmulcph256_mask((__v8sf)__A, (__v8sf)__B,
1818                                                   (__v8sf)__W, (__mmask8)__U);
1819}
1820
1821static __inline__ __m256h __DEFAULT_FN_ATTRS256
1822_mm256_maskz_fcmul_pch(__mmask8 __U, __m256h __A, __m256h __B) {
1823  return (__m256h)__builtin_ia32_vfcmulcph256_mask(
1824      (__v8sf)__A, (__v8sf)__B, (__v8sf)_mm256_setzero_ph(), (__mmask8)__U);
1825}
1826
1827static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fcmadd_pch(__m128h __A,
1828                                                               __m128h __B,
1829                                                               __m128h __C) {
1830  return (__m128h)__builtin_ia32_vfcmaddcph128_mask((__v4sf)__A, (__v4sf)__B,
1831                                                    (__v4sf)__C, (__mmask8)-1);
1832}
1833
1834static __inline__ __m128h __DEFAULT_FN_ATTRS128
1835_mm_mask_fcmadd_pch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
1836  return (__m128h)__builtin_ia32_selectps_128(
1837      __U,
1838      __builtin_ia32_vfcmaddcph128_mask((__v4sf)__A, (__v4sf)(__m128h)__B,
1839                                        (__v4sf)__C, (__mmask8)__U),
1840      (__v4sf)__A);
1841}
1842
1843static __inline__ __m128h __DEFAULT_FN_ATTRS128
1844_mm_mask3_fcmadd_pch(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
1845  return (__m128h)__builtin_ia32_vfcmaddcph128_mask((__v4sf)__A, (__v4sf)__B,
1846                                                    (__v4sf)__C, (__mmask8)__U);
1847}
1848
1849static __inline__ __m128h __DEFAULT_FN_ATTRS128
1850_mm_maskz_fcmadd_pch(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
1851  return (__m128h)__builtin_ia32_vfcmaddcph128_maskz(
1852      (__v4sf)__A, (__v4sf)__B, (__v4sf)__C, (__mmask8)__U);
1853}
1854
1855static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fcmadd_pch(__m256h __A,
1856                                                                  __m256h __B,
1857                                                                  __m256h __C) {
1858  return (__m256h)__builtin_ia32_vfcmaddcph256_mask((__v8sf)__A, (__v8sf)__B,
1859                                                    (__v8sf)__C, (__mmask8)-1);
1860}
1861
1862static __inline__ __m256h __DEFAULT_FN_ATTRS256
1863_mm256_mask_fcmadd_pch(__m256h __A, __mmask8 __U, __m256h __B, __m256h __C) {
1864  return (__m256h)__builtin_ia32_selectps_256(
1865      __U,
1866      __builtin_ia32_vfcmaddcph256_mask((__v8sf)__A, (__v8sf)__B, (__v8sf)__C,
1867                                        (__mmask8)__U),
1868      (__v8sf)__A);
1869}
1870
1871static __inline__ __m256h __DEFAULT_FN_ATTRS256
1872_mm256_mask3_fcmadd_pch(__m256h __A, __m256h __B, __m256h __C, __mmask8 __U) {
1873  return (__m256h)__builtin_ia32_vfcmaddcph256_mask((__v8sf)__A, (__v8sf)__B,
1874                                                    (__v8sf)__C, (__mmask8)__U);
1875}
1876
1877static __inline__ __m256h __DEFAULT_FN_ATTRS256
1878_mm256_maskz_fcmadd_pch(__mmask8 __U, __m256h __A, __m256h __B, __m256h __C) {
1879  return (__m256h)__builtin_ia32_vfcmaddcph256_maskz(
1880      (__v8sf)__A, (__v8sf)__B, (__v8sf)__C, (__mmask8)__U);
1881}
1882
1883static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmul_pch(__m128h __A,
1884                                                             __m128h __B) {
1885  return (__m128h)__builtin_ia32_vfmulcph128_mask(
1886      (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_undefined_ph(), (__mmask8)-1);
1887}
1888
1889static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmul_pch(__m128h __W,
1890                                                                  __mmask8 __U,
1891                                                                  __m128h __A,
1892                                                                  __m128h __B) {
1893  return (__m128h)__builtin_ia32_vfmulcph128_mask((__v4sf)__A, (__v4sf)__B,
1894                                                  (__v4sf)__W, (__mmask8)__U);
1895}
1896
1897static __inline__ __m128h __DEFAULT_FN_ATTRS128
1898_mm_maskz_fmul_pch(__mmask8 __U, __m128h __A, __m128h __B) {
1899  return (__m128h)__builtin_ia32_vfmulcph128_mask(
1900      (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ph(), (__mmask8)__U);
1901}
1902
1903static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fmul_pch(__m256h __A,
1904                                                                __m256h __B) {
1905  return (__m256h)__builtin_ia32_vfmulcph256_mask(
1906      (__v8sf)__A, (__v8sf)__B, (__v8sf)_mm256_undefined_ph(), (__mmask8)-1);
1907}
1908
1909static __inline__ __m256h __DEFAULT_FN_ATTRS256
1910_mm256_mask_fmul_pch(__m256h __W, __mmask8 __U, __m256h __A, __m256h __B) {
1911  return (__m256h)__builtin_ia32_vfmulcph256_mask((__v8sf)__A, (__v8sf)__B,
1912                                                  (__v8sf)__W, (__mmask8)__U);
1913}
1914
1915static __inline__ __m256h __DEFAULT_FN_ATTRS256
1916_mm256_maskz_fmul_pch(__mmask8 __U, __m256h __A, __m256h __B) {
1917  return (__m256h)__builtin_ia32_vfmulcph256_mask(
1918      (__v8sf)__A, (__v8sf)__B, (__v8sf)_mm256_setzero_ph(), (__mmask8)__U);
1919}
1920
1921static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmadd_pch(__m128h __A,
1922                                                              __m128h __B,
1923                                                              __m128h __C) {
1924  return (__m128h)__builtin_ia32_vfmaddcph128_mask((__v4sf)__A, (__v4sf)__B,
1925                                                   (__v4sf)__C, (__mmask8)-1);
1926}
1927
1928static __inline__ __m128h __DEFAULT_FN_ATTRS128
1929_mm_mask_fmadd_pch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
1930  return (__m128h)__builtin_ia32_selectps_128(
1931      __U,
1932      __builtin_ia32_vfmaddcph128_mask((__v4sf)__A, (__v4sf)__B, (__v4sf)__C,
1933                                       (__mmask8)__U),
1934      (__v4sf)__A);
1935}
1936
1937static __inline__ __m128h __DEFAULT_FN_ATTRS128
1938_mm_mask3_fmadd_pch(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
1939  return (__m128h)__builtin_ia32_vfmaddcph128_mask((__v4sf)__A, (__v4sf)__B,
1940                                                   (__v4sf)__C, (__mmask8)__U);
1941}
1942
1943static __inline__ __m128h __DEFAULT_FN_ATTRS128
1944_mm_maskz_fmadd_pch(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
1945  return (__m128h)__builtin_ia32_vfmaddcph128_maskz((__v4sf)__A, (__v4sf)__B,
1946                                                    (__v4sf)__C, (__mmask8)__U);
1947}
1948
1949static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fmadd_pch(__m256h __A,
1950                                                                 __m256h __B,
1951                                                                 __m256h __C) {
1952  return (__m256h)__builtin_ia32_vfmaddcph256_mask((__v8sf)__A, (__v8sf)__B,
1953                                                   (__v8sf)__C, (__mmask8)-1);
1954}
1955
1956static __inline__ __m256h __DEFAULT_FN_ATTRS256
1957_mm256_mask_fmadd_pch(__m256h __A, __mmask8 __U, __m256h __B, __m256h __C) {
1958  return (__m256h)__builtin_ia32_selectps_256(
1959      __U,
1960      __builtin_ia32_vfmaddcph256_mask((__v8sf)__A, (__v8sf)__B, (__v8sf)__C,
1961                                       (__mmask8)__U),
1962      (__v8sf)__A);
1963}
1964
1965static __inline__ __m256h __DEFAULT_FN_ATTRS256
1966_mm256_mask3_fmadd_pch(__m256h __A, __m256h __B, __m256h __C, __mmask8 __U) {
1967  return (__m256h)__builtin_ia32_vfmaddcph256_mask((__v8sf)__A, (__v8sf)__B,
1968                                                   (__v8sf)__C, (__mmask8)__U);
1969}
1970
1971static __inline__ __m256h __DEFAULT_FN_ATTRS256
1972_mm256_maskz_fmadd_pch(__mmask8 __U, __m256h __A, __m256h __B, __m256h __C) {
1973  return (__m256h)__builtin_ia32_vfmaddcph256_maskz((__v8sf)__A, (__v8sf)__B,
1974                                                    (__v8sf)__C, (__mmask8)__U);
1975}
1976
1977static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_blend_ph(__mmask8 __U,
1978                                                                  __m128h __A,
1979                                                                  __m128h __W) {
1980  return (__m128h)__builtin_ia32_selectph_128((__mmask8)__U, (__v8hf)__W,
1981                                              (__v8hf)__A);
1982}
1983
1984static __inline__ __m256h __DEFAULT_FN_ATTRS256
1985_mm256_mask_blend_ph(__mmask16 __U, __m256h __A, __m256h __W) {
1986  return (__m256h)__builtin_ia32_selectph_256((__mmask16)__U, (__v16hf)__W,
1987                                              (__v16hf)__A);
1988}
1989
1990static __inline__ __m128h __DEFAULT_FN_ATTRS128
1991_mm_permutex2var_ph(__m128h __A, __m128i __I, __m128h __B) {
1992  return (__m128h)__builtin_ia32_vpermi2varhi128((__v8hi)__A, (__v8hi)__I,
1993                                                 (__v8hi)__B);
1994}
1995
1996static __inline__ __m256h __DEFAULT_FN_ATTRS256
1997_mm256_permutex2var_ph(__m256h __A, __m256i __I, __m256h __B) {
1998  return (__m256h)__builtin_ia32_vpermi2varhi256((__v16hi)__A, (__v16hi)__I,
1999                                                 (__v16hi)__B);
2000}
2001
2002static __inline__ __m128h __DEFAULT_FN_ATTRS128
2003_mm_permutexvar_ph(__m128i __A, __m128h __B) {
2004  return (__m128h)__builtin_ia32_permvarhi128((__v8hi)__B, (__v8hi)__A);
2005}
2006
2007static __inline__ __m256h __DEFAULT_FN_ATTRS256
2008_mm256_permutexvar_ph(__m256i __A, __m256h __B) {
2009  return (__m256h)__builtin_ia32_permvarhi256((__v16hi)__B, (__v16hi)__A);
2010}
2011
2012static __inline__ _Float16 __DEFAULT_FN_ATTRS256
2013_mm256_reduce_add_ph(__m256h __W) {
2014  return __builtin_ia32_reduce_fadd_ph256(-0.0f16, __W);
2015}
2016
2017static __inline__ _Float16 __DEFAULT_FN_ATTRS256
2018_mm256_reduce_mul_ph(__m256h __W) {
2019  return __builtin_ia32_reduce_fmul_ph256(1.0f16, __W);
2020}
2021
2022static __inline__ _Float16 __DEFAULT_FN_ATTRS256
2023_mm256_reduce_max_ph(__m256h __V) {
2024  return __builtin_ia32_reduce_fmax_ph256(__V);
2025}
2026
2027static __inline__ _Float16 __DEFAULT_FN_ATTRS256
2028_mm256_reduce_min_ph(__m256h __V) {
2029  return __builtin_ia32_reduce_fmin_ph256(__V);
2030}
2031
2032static __inline__ _Float16 __DEFAULT_FN_ATTRS128
2033_mm_reduce_add_ph(__m128h __W) {
2034  return __builtin_ia32_reduce_fadd_ph128(-0.0f16, __W);
2035}
2036
2037static __inline__ _Float16 __DEFAULT_FN_ATTRS128
2038_mm_reduce_mul_ph(__m128h __W) {
2039  return __builtin_ia32_reduce_fmul_ph128(1.0f16, __W);
2040}
2041
2042static __inline__ _Float16 __DEFAULT_FN_ATTRS128
2043_mm_reduce_max_ph(__m128h __V) {
2044  return __builtin_ia32_reduce_fmax_ph128(__V);
2045}
2046
2047static __inline__ _Float16 __DEFAULT_FN_ATTRS128
2048_mm_reduce_min_ph(__m128h __V) {
2049  return __builtin_ia32_reduce_fmin_ph128(__V);
2050}
2051
// The macros below are aliases: each *_mul_pch name expands to the matching
// *_fmul_pch intrinsic, and each *_cmul_pch name to the matching *_fcmul_pch
// intrinsic, with the arguments forwarded in the same order.
#define _mm_mul_pch(A, B) _mm_fmul_pch((A), (B))
#define _mm_mask_mul_pch(W, U, A, B) _mm_mask_fmul_pch((W), (U), (A), (B))
#define _mm_maskz_mul_pch(U, A, B) _mm_maskz_fmul_pch((U), (A), (B))
#define _mm256_mul_pch(A, B) _mm256_fmul_pch((A), (B))
#define _mm256_mask_mul_pch(W, U, A, B)                                        \
  _mm256_mask_fmul_pch((W), (U), (A), (B))
#define _mm256_maskz_mul_pch(U, A, B) _mm256_maskz_fmul_pch((U), (A), (B))

#define _mm_cmul_pch(A, B) _mm_fcmul_pch((A), (B))
#define _mm_mask_cmul_pch(W, U, A, B) _mm_mask_fcmul_pch((W), (U), (A), (B))
#define _mm_maskz_cmul_pch(U, A, B) _mm_maskz_fcmul_pch((U), (A), (B))
#define _mm256_cmul_pch(A, B) _mm256_fcmul_pch((A), (B))
#define _mm256_mask_cmul_pch(W, U, A, B)                                       \
  _mm256_mask_fcmul_pch((W), (U), (A), (B))
#define _mm256_maskz_cmul_pch(U, A, B) _mm256_maskz_fcmul_pch((U), (A), (B))
2066
/* The attribute helper macros are defined at the top of this header for its
 * own use only; undefine both before the header ends. */
#undef __DEFAULT_FN_ATTRS256
#undef __DEFAULT_FN_ATTRS128
2069
2070#endif
2071#endif