   1/*===----------- avx512fp16intrin.h - AVX512-FP16 intrinsics ---------------===
   2 *
   3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4 * See https://llvm.org/LICENSE.txt for license information.
   5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6 *
   7 *===-----------------------------------------------------------------------===
   8 */
   9#ifndef __IMMINTRIN_H
  10#error "Never use <avx512fp16intrin.h> directly; include <immintrin.h> instead."
  11#endif
  12
  13#ifdef __SSE2__
  14
  15#ifndef __AVX512FP16INTRIN_H
  16#define __AVX512FP16INTRIN_H
  17
/* 512-bit vector types used by the AVX512-FP16 intrinsics: a raw
   32 x _Float16 vector, the public __m512h type, and a 1-byte-aligned
   variant for unaligned loads/stores. */
typedef _Float16 __v32hf __attribute__((__vector_size__(64), __aligned__(64)));
typedef _Float16 __m512h __attribute__((__vector_size__(64), __aligned__(64)));
typedef _Float16 __m512h_u __attribute__((__vector_size__(64), __aligned__(1)));

/* Define the default attributes for the functions in this file.
   128/256-bit forms use "no-evex512" so they can be emitted with
   AVX512VL-style encodings without requiring 512-bit vector support. */
#define __DEFAULT_FN_ATTRS512                                                  \
  __attribute__((__always_inline__, __nodebug__,                               \
                 __target__("avx512fp16,evex512"), __min_vector_width__(512)))
#define __DEFAULT_FN_ATTRS256                                                  \
  __attribute__((__always_inline__, __nodebug__,                               \
                 __target__("avx512fp16,no-evex512"),                          \
                 __min_vector_width__(256)))
#define __DEFAULT_FN_ATTRS128                                                  \
  __attribute__((__always_inline__, __nodebug__,                               \
                 __target__("avx512fp16,no-evex512"),                          \
                 __min_vector_width__(128)))
  35
/* Extract the lowest half-precision element of a 512-bit vector. */
static __inline__ _Float16 __DEFAULT_FN_ATTRS512 _mm512_cvtsh_h(__m512h __a) {
  return __a[0];
}

/* Return a 128-bit vector of [8 x half] with all elements zero. */
static __inline __m128h __DEFAULT_FN_ATTRS128 _mm_setzero_ph(void) {
  return (__m128h){0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
}

/* Return a 256-bit vector of [16 x half] with all elements zero. */
static __inline __m256h __DEFAULT_FN_ATTRS256 _mm256_setzero_ph(void) {
  return (__m256h){0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
                   0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
}

/* Return a 256-bit vector of [16 x half] with undefined contents. */
static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_undefined_ph(void) {
  return (__m256h)__builtin_ia32_undef256();
}

/* Return a 512-bit vector of [32 x half] with all elements zero. */
static __inline __m512h __DEFAULT_FN_ATTRS512 _mm512_setzero_ph(void) {
  return (__m512h){0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
                   0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
                   0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
}

/* Return a 128-bit vector of [8 x half] with undefined contents. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_undefined_ph(void) {
  return (__m128h)__builtin_ia32_undef128();
}

/* Return a 512-bit vector of [32 x half] with undefined contents. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_undefined_ph(void) {
  return (__m512h)__builtin_ia32_undef512();
}
  66
/* Broadcast one half-precision value to all 32 elements. */
static __inline __m512h __DEFAULT_FN_ATTRS512 _mm512_set1_ph(_Float16 __h) {
  return (__m512h)(__v32hf){__h, __h, __h, __h, __h, __h, __h, __h,
                            __h, __h, __h, __h, __h, __h, __h, __h,
                            __h, __h, __h, __h, __h, __h, __h, __h,
                            __h, __h, __h, __h, __h, __h, __h, __h};
}

/* Build a [32 x half] vector from 32 scalars. Per convention, __h1 is the
   HIGHEST element, so the initializer list is reversed: vector element 0
   receives __h32. */
static __inline __m512h __DEFAULT_FN_ATTRS512
_mm512_set_ph(_Float16 __h1, _Float16 __h2, _Float16 __h3, _Float16 __h4,
              _Float16 __h5, _Float16 __h6, _Float16 __h7, _Float16 __h8,
              _Float16 __h9, _Float16 __h10, _Float16 __h11, _Float16 __h12,
              _Float16 __h13, _Float16 __h14, _Float16 __h15, _Float16 __h16,
              _Float16 __h17, _Float16 __h18, _Float16 __h19, _Float16 __h20,
              _Float16 __h21, _Float16 __h22, _Float16 __h23, _Float16 __h24,
              _Float16 __h25, _Float16 __h26, _Float16 __h27, _Float16 __h28,
              _Float16 __h29, _Float16 __h30, _Float16 __h31, _Float16 __h32) {
  return (__m512h)(__v32hf){__h32, __h31, __h30, __h29, __h28, __h27, __h26,
                            __h25, __h24, __h23, __h22, __h21, __h20, __h19,
                            __h18, __h17, __h16, __h15, __h14, __h13, __h12,
                            __h11, __h10, __h9,  __h8,  __h7,  __h6,  __h5,
                            __h4,  __h3,  __h2,  __h1};
}

/* "Reversed" set: arguments are given in memory (low-to-high element)
   order, implemented by forwarding to _mm512_set_ph reversed. */
#define _mm512_setr_ph(h1, h2, h3, h4, h5, h6, h7, h8, h9, h10, h11, h12, h13, \
                       h14, h15, h16, h17, h18, h19, h20, h21, h22, h23, h24,  \
                       h25, h26, h27, h28, h29, h30, h31, h32)                 \
  _mm512_set_ph((h32), (h31), (h30), (h29), (h28), (h27), (h26), (h25), (h24), \
                (h23), (h22), (h21), (h20), (h19), (h18), (h17), (h16), (h15), \
                (h14), (h13), (h12), (h11), (h10), (h9), (h8), (h7), (h6),     \
                (h5), (h4), (h3), (h2), (h1))
  97
/* Broadcast one half-precision complex value (a real/imaginary pair packed
   in 32 bits) to all 16 complex elements: the _Float16 _Complex is
   bit-cast to a 32-bit float so the existing float broadcast is reused. */
static __inline __m512h __DEFAULT_FN_ATTRS512
_mm512_set1_pch(_Float16 _Complex __h) {
  return (__m512h)_mm512_set1_ps(__builtin_bit_cast(float, __h));
}
 102
/* Same-width casts between half-precision vectors and float/double/integer
   vectors. These reinterpret the bit pattern only and generate no
   instructions. */
static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_castph_ps(__m128h __a) {
  return (__m128)__a;
}

static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_castph_ps(__m256h __a) {
  return (__m256)__a;
}

static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_castph_ps(__m512h __a) {
  return (__m512)__a;
}

static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_castph_pd(__m128h __a) {
  return (__m128d)__a;
}

static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_castph_pd(__m256h __a) {
  return (__m256d)__a;
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_castph_pd(__m512h __a) {
  return (__m512d)__a;
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_castph_si128(__m128h __a) {
  return (__m128i)__a;
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_castph_si256(__m256h __a) {
  return (__m256i)__a;
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_castph_si512(__m512h __a) {
  return (__m512i)__a;
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_castps_ph(__m128 __a) {
  return (__m128h)__a;
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_castps_ph(__m256 __a) {
  return (__m256h)__a;
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_castps_ph(__m512 __a) {
  return (__m512h)__a;
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_castpd_ph(__m128d __a) {
  return (__m128h)__a;
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_castpd_ph(__m256d __a) {
  return (__m256h)__a;
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_castpd_ph(__m512d __a) {
  return (__m512h)__a;
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_castsi128_ph(__m128i __a) {
  return (__m128h)__a;
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_castsi256_ph(__m256i __a) {
  return (__m256h)__a;
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_castsi512_ph(__m512i __a) {
  return (__m512h)__a;
}
 178
/* Width-changing casts. Truncating casts keep the low elements; widening
   casts leave the new upper elements undefined
   (__builtin_nondeterministic_value), matching the "cast" contract where
   only the low bits are meaningful. */
static __inline__ __m128h __DEFAULT_FN_ATTRS256
_mm256_castph256_ph128(__m256h __a) {
  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS512
_mm512_castph512_ph128(__m512h __a) {
  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS512
_mm512_castph512_ph256(__m512h __a) {
  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
                                 12, 13, 14, 15);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_castph128_ph256(__m128h __a) {
  return __builtin_shufflevector(__a, __builtin_nondeterministic_value(__a),
                                  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_castph128_ph512(__m128h __a) {
  /* __b is intentionally self-initialized: its 256 bits supply the
     undefined upper half of the 512-bit result. */
  __m256h __b = __builtin_nondeterministic_value(__b);
  return __builtin_shufflevector(
      __builtin_shufflevector(__a, __builtin_nondeterministic_value(__a),
                              0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15),
      __b, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
      20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_castph256_ph512(__m256h __a) {
  return __builtin_shufflevector(__a, __builtin_nondeterministic_value(__a), 0,
                                 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
                                 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
                                 27, 28, 29, 30, 31);
}
 218
/// Constructs a 256-bit floating-point vector of [16 x half] from a
///    128-bit floating-point vector of [8 x half]. The lower 128 bits
///    contain the value of the source vector. The upper 128 bits are set
///    to zero.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic has no corresponding instruction.
///
/// \param __a
///    A 128-bit vector of [8 x half].
/// \returns A 256-bit floating-point vector of [16 x half]. The lower 128 bits
///    contain the value of the parameter. The upper 128 bits are set to zero.
static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_zextph128_ph256(__m128h __a) {
  return __builtin_shufflevector(__a, (__v8hf)_mm_setzero_ph(), 0, 1, 2, 3, 4,
                                 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
}
 237
/// Constructs a 512-bit floating-point vector of [32 x half] from a
///    128-bit floating-point vector of [8 x half]. The lower 128 bits
///    contain the value of the source vector. The upper 384 bits are set
///    to zero.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic has no corresponding instruction.
///
/// \param __a
///    A 128-bit vector of [8 x half].
/// \returns A 512-bit floating-point vector of [32 x half]. The lower 128 bits
///    contain the value of the parameter. The upper 384 bits are set to zero.
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_zextph128_ph512(__m128h __a) {
  /* Indices 0-7 select __a; indices 8-15 (repeated three times) select
     elements of the zero vector, zeroing the upper 24 lanes. */
  return __builtin_shufflevector(
      __a, (__v8hf)_mm_setzero_ph(), 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
      13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15);
}
 257
/// Constructs a 512-bit floating-point vector of [32 x half] from a
///    256-bit floating-point vector of [16 x half]. The lower 256 bits
///    contain the value of the source vector. The upper 256 bits are set
///    to zero.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic has no corresponding instruction.
///
/// \param __a
///    A 256-bit vector of [16 x half].
/// \returns A 512-bit floating-point vector of [32 x half]. The lower 256 bits
///    contain the value of the parameter. The upper 256 bits are set to zero.
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_zextph256_ph512(__m256h __a) {
  /* Indices 16-31 select the zero vector, zeroing the upper 16 lanes. */
  return __builtin_shufflevector(__a, (__v16hf)_mm256_setzero_ph(), 0, 1, 2, 3,
                                 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
                                 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28,
                                 29, 30, 31);
}
 278
/* Compare the lowest half elements of A and B with comparison predicate P
   (a _CMP_* constant), returning 0 or 1; R selects the rounding/SAE mode. */
#define _mm_comi_round_sh(A, B, P, R)                                          \
  __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, (int)(P), (int)(R))

/* Same, using the current rounding direction. */
#define _mm_comi_sh(A, B, pred)                                                \
  _mm_comi_round_sh((A), (B), (pred), _MM_FROUND_CUR_DIRECTION)
 284
/* Scalar half-precision comparisons on element 0, returning 0 or 1.
   The _mm_comi* forms use signaling predicates (*_OS / _NEQ_US: they
   raise an invalid-operation exception on QNaN inputs); the _mm_ucomi*
   forms use quiet predicates (*_OQ / _NEQ_UQ). */
static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comieq_sh(__m128h __A,
                                                          __m128h __B) {
  return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_EQ_OS,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comilt_sh(__m128h __A,
                                                          __m128h __B) {
  return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_LT_OS,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comile_sh(__m128h __A,
                                                          __m128h __B) {
  return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_LE_OS,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comigt_sh(__m128h __A,
                                                          __m128h __B) {
  return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_GT_OS,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comige_sh(__m128h __A,
                                                          __m128h __B) {
  return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_GE_OS,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comineq_sh(__m128h __A,
                                                           __m128h __B) {
  return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_NEQ_US,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomieq_sh(__m128h __A,
                                                           __m128h __B) {
  return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_EQ_OQ,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomilt_sh(__m128h __A,
                                                           __m128h __B) {
  return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_LT_OQ,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomile_sh(__m128h __A,
                                                           __m128h __B) {
  return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_LE_OQ,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomigt_sh(__m128h __A,
                                                           __m128h __B) {
  return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_GT_OQ,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomige_sh(__m128h __A,
                                                           __m128h __B) {
  return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_GE_OQ,
                                _MM_FROUND_CUR_DIRECTION);
}

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomineq_sh(__m128h __A,
                                                            __m128h __B) {
  return __builtin_ia32_vcomish((__v8hf)__A, (__v8hf)__B, _CMP_NEQ_UQ,
                                _MM_FROUND_CUR_DIRECTION);
}
 356
/* Packed half-precision add: element-wise __A + __B, with merge-masked
   (__W provides unselected elements) and zero-masked forms, plus macro
   variants taking an explicit rounding mode R. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_add_ph(__m512h __A,
                                                              __m512h __B) {
  return (__m512h)((__v32hf)__A + (__v32hf)__B);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_add_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512(
      (__mmask32)__U, (__v32hf)_mm512_add_ph(__A, __B), (__v32hf)__W);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_add_ph(__mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
                                              (__v32hf)_mm512_add_ph(__A, __B),
                                              (__v32hf)_mm512_setzero_ph());
}

#define _mm512_add_round_ph(A, B, R)                                           \
  ((__m512h)__builtin_ia32_addph512((__v32hf)(__m512h)(A),                     \
                                    (__v32hf)(__m512h)(B), (int)(R)))

#define _mm512_mask_add_round_ph(W, U, A, B, R)                                \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U), (__v32hf)_mm512_add_round_ph((A), (B), (R)),             \
      (__v32hf)(__m512h)(W)))

#define _mm512_maskz_add_round_ph(U, A, B, R)                                  \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U), (__v32hf)_mm512_add_round_ph((A), (B), (R)),             \
      (__v32hf)_mm512_setzero_ph()))
 388
/* Packed half-precision subtract: element-wise __A - __B, with masked and
   explicit-rounding variants (same structure as the add family). */
static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_sub_ph(__m512h __A,
                                                              __m512h __B) {
  return (__m512h)((__v32hf)__A - (__v32hf)__B);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_sub_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512(
      (__mmask32)__U, (__v32hf)_mm512_sub_ph(__A, __B), (__v32hf)__W);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_sub_ph(__mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
                                              (__v32hf)_mm512_sub_ph(__A, __B),
                                              (__v32hf)_mm512_setzero_ph());
}

#define _mm512_sub_round_ph(A, B, R)                                           \
  ((__m512h)__builtin_ia32_subph512((__v32hf)(__m512h)(A),                     \
                                    (__v32hf)(__m512h)(B), (int)(R)))

#define _mm512_mask_sub_round_ph(W, U, A, B, R)                                \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U), (__v32hf)_mm512_sub_round_ph((A), (B), (R)),             \
      (__v32hf)(__m512h)(W)))

#define _mm512_maskz_sub_round_ph(U, A, B, R)                                  \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U), (__v32hf)_mm512_sub_round_ph((A), (B), (R)),             \
      (__v32hf)_mm512_setzero_ph()))
 420
/* Packed half-precision multiply: element-wise __A * __B, with masked and
   explicit-rounding variants. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_mul_ph(__m512h __A,
                                                              __m512h __B) {
  return (__m512h)((__v32hf)__A * (__v32hf)__B);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_mul_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512(
      (__mmask32)__U, (__v32hf)_mm512_mul_ph(__A, __B), (__v32hf)__W);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_mul_ph(__mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
                                              (__v32hf)_mm512_mul_ph(__A, __B),
                                              (__v32hf)_mm512_setzero_ph());
}

#define _mm512_mul_round_ph(A, B, R)                                           \
  ((__m512h)__builtin_ia32_mulph512((__v32hf)(__m512h)(A),                     \
                                    (__v32hf)(__m512h)(B), (int)(R)))

#define _mm512_mask_mul_round_ph(W, U, A, B, R)                                \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U), (__v32hf)_mm512_mul_round_ph((A), (B), (R)),             \
      (__v32hf)(__m512h)(W)))

#define _mm512_maskz_mul_round_ph(U, A, B, R)                                  \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U), (__v32hf)_mm512_mul_round_ph((A), (B), (R)),             \
      (__v32hf)_mm512_setzero_ph()))
 452
/* Packed half-precision divide: element-wise __A / __B, with masked and
   explicit-rounding variants. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_div_ph(__m512h __A,
                                                              __m512h __B) {
  return (__m512h)((__v32hf)__A / (__v32hf)__B);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_div_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512(
      (__mmask32)__U, (__v32hf)_mm512_div_ph(__A, __B), (__v32hf)__W);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_div_ph(__mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
                                              (__v32hf)_mm512_div_ph(__A, __B),
                                              (__v32hf)_mm512_setzero_ph());
}

#define _mm512_div_round_ph(A, B, R)                                           \
  ((__m512h)__builtin_ia32_divph512((__v32hf)(__m512h)(A),                     \
                                    (__v32hf)(__m512h)(B), (int)(R)))

#define _mm512_mask_div_round_ph(W, U, A, B, R)                                \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U), (__v32hf)_mm512_div_round_ph((A), (B), (R)),             \
      (__v32hf)(__m512h)(W)))

#define _mm512_maskz_div_round_ph(U, A, B, R)                                  \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U), (__v32hf)_mm512_div_round_ph((A), (B), (R)),             \
      (__v32hf)_mm512_setzero_ph()))
 484
/* Packed half-precision minimum via the minph builtin (IEEE min-style
   NaN/zero semantics are those of the VMINPH instruction), with masked
   and explicit-rounding variants. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_min_ph(__m512h __A,
                                                              __m512h __B) {
  return (__m512h)__builtin_ia32_minph512((__v32hf)__A, (__v32hf)__B,
                                          _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_min_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512(
      (__mmask32)__U, (__v32hf)_mm512_min_ph(__A, __B), (__v32hf)__W);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_min_ph(__mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
                                              (__v32hf)_mm512_min_ph(__A, __B),
                                              (__v32hf)_mm512_setzero_ph());
}

#define _mm512_min_round_ph(A, B, R)                                           \
  ((__m512h)__builtin_ia32_minph512((__v32hf)(__m512h)(A),                     \
                                    (__v32hf)(__m512h)(B), (int)(R)))

#define _mm512_mask_min_round_ph(W, U, A, B, R)                                \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U), (__v32hf)_mm512_min_round_ph((A), (B), (R)),             \
      (__v32hf)(__m512h)(W)))

#define _mm512_maskz_min_round_ph(U, A, B, R)                                  \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U), (__v32hf)_mm512_min_round_ph((A), (B), (R)),             \
      (__v32hf)_mm512_setzero_ph()))
 517
/* Packed half-precision maximum via the maxph builtin (VMAXPH semantics),
   with masked and explicit-rounding variants. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_max_ph(__m512h __A,
                                                              __m512h __B) {
  return (__m512h)__builtin_ia32_maxph512((__v32hf)__A, (__v32hf)__B,
                                          _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_max_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512(
      (__mmask32)__U, (__v32hf)_mm512_max_ph(__A, __B), (__v32hf)__W);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_max_ph(__mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
                                              (__v32hf)_mm512_max_ph(__A, __B),
                                              (__v32hf)_mm512_setzero_ph());
}

#define _mm512_max_round_ph(A, B, R)                                           \
  ((__m512h)__builtin_ia32_maxph512((__v32hf)(__m512h)(A),                     \
                                    (__v32hf)(__m512h)(B), (int)(R)))

#define _mm512_mask_max_round_ph(W, U, A, B, R)                                \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U), (__v32hf)_mm512_max_round_ph((A), (B), (R)),             \
      (__v32hf)(__m512h)(W)))

#define _mm512_maskz_max_round_ph(U, A, B, R)                                  \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U), (__v32hf)_mm512_max_round_ph((A), (B), (R)),             \
      (__v32hf)_mm512_setzero_ph()))
 550
/* Absolute value: AND each 32-bit lane with 0x7FFF7FFF, clearing the sign
   bit of both half elements packed in that lane. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_abs_ph(__m512h __A) {
  return (__m512h)_mm512_and_epi32(_mm512_set1_epi32(0x7FFF7FFF), (__m512i)__A);
}

/* Complex conjugate: XOR each 32-bit [real, imag] pair with 0x80000000
   (-2147483648), flipping the sign of the imaginary (upper) half only. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_conj_pch(__m512h __A) {
  return (__m512h)_mm512_xor_epi32((__m512i)__A,
                                   _mm512_set1_epi32(-2147483648));
}
 559
/* Masked conjugate: masking is per 32-bit complex element, hence the
   16-bit mask and the 32-bit (ps) select builtin. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_conj_pch(__m512h __W, __mmask16 __U, __m512h __A) {
  return (__m512h)__builtin_ia32_selectps_512(
      (__mmask16)__U, (__v16sf)_mm512_conj_pch(__A), (__v16sf)__W);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_conj_pch(__mmask16 __U, __m512h __A) {
  return (__m512h)__builtin_ia32_selectps_512((__mmask16)__U,
                                              (__v16sf)_mm512_conj_pch(__A),
                                              (__v16sf)_mm512_setzero_ps());
}
 572
/* Scalar half-precision add: element 0 becomes __A[0] + __B[0]; the upper
   seven elements of __A pass through unchanged. Masked forms select
   between the result and __W (or zero) for element 0 only; the round
   macros take an explicit rounding mode R. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_add_sh(__m128h __A,
                                                           __m128h __B) {
  __A[0] += __B[0];
  return __A;
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_add_sh(__m128h __W,
                                                                __mmask8 __U,
                                                                __m128h __A,
                                                                __m128h __B) {
  __A = _mm_add_sh(__A, __B);
  return __builtin_ia32_selectsh_128(__U, __A, __W);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_add_sh(__mmask8 __U,
                                                                 __m128h __A,
                                                                 __m128h __B) {
  __A = _mm_add_sh(__A, __B);
  return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph());
}

#define _mm_add_round_sh(A, B, R)                                              \
  ((__m128h)__builtin_ia32_addsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)-1, (int)(R)))

#define _mm_mask_add_round_sh(W, U, A, B, R)                                   \
  ((__m128h)__builtin_ia32_addsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W),        \
      (__mmask8)(U), (int)(R)))

#define _mm_maskz_add_round_sh(U, A, B, R)                                     \
  ((__m128h)__builtin_ia32_addsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)(U), (int)(R)))
 608
/* Scalar half-precision subtract: element 0 becomes __A[0] - __B[0];
   upper elements of __A pass through (same structure as the add_sh
   family). */
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_sub_sh(__m128h __A,
                                                           __m128h __B) {
  __A[0] -= __B[0];
  return __A;
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_sub_sh(__m128h __W,
                                                                __mmask8 __U,
                                                                __m128h __A,
                                                                __m128h __B) {
  __A = _mm_sub_sh(__A, __B);
  return __builtin_ia32_selectsh_128(__U, __A, __W);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_sub_sh(__mmask8 __U,
                                                                 __m128h __A,
                                                                 __m128h __B) {
  __A = _mm_sub_sh(__A, __B);
  return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph());
}

#define _mm_sub_round_sh(A, B, R)                                              \
  ((__m128h)__builtin_ia32_subsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)-1, (int)(R)))

#define _mm_mask_sub_round_sh(W, U, A, B, R)                                   \
  ((__m128h)__builtin_ia32_subsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W),        \
      (__mmask8)(U), (int)(R)))

#define _mm_maskz_sub_round_sh(U, A, B, R)                                     \
  ((__m128h)__builtin_ia32_subsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)(U), (int)(R)))
 644
/* Scalar half-precision multiply: element 0 becomes __A[0] * __B[0];
   upper elements of __A pass through. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mul_sh(__m128h __A,
                                                           __m128h __B) {
  __A[0] *= __B[0];
  return __A;
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_mul_sh(__m128h __W,
                                                                __mmask8 __U,
                                                                __m128h __A,
                                                                __m128h __B) {
  __A = _mm_mul_sh(__A, __B);
  return __builtin_ia32_selectsh_128(__U, __A, __W);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_mul_sh(__mmask8 __U,
                                                                 __m128h __A,
                                                                 __m128h __B) {
  __A = _mm_mul_sh(__A, __B);
  return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph());
}

#define _mm_mul_round_sh(A, B, R)                                              \
  ((__m128h)__builtin_ia32_mulsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)-1, (int)(R)))

#define _mm_mask_mul_round_sh(W, U, A, B, R)                                   \
  ((__m128h)__builtin_ia32_mulsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W),        \
      (__mmask8)(U), (int)(R)))

#define _mm_maskz_mul_round_sh(U, A, B, R)                                     \
  ((__m128h)__builtin_ia32_mulsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)(U), (int)(R)))
 680
// Scalar FP16 divide: result[0] = __A[0] / __B[0]; elements 1..7 are
// copied from __A.
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_div_sh(__m128h __A,
                                                           __m128h __B) {
  __A[0] /= __B[0];
  return __A;
}

// Merge-masked scalar divide: blends the quotient with __W under __U.
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_div_sh(__m128h __W,
                                                                __mmask8 __U,
                                                                __m128h __A,
                                                                __m128h __B) {
  __A = _mm_div_sh(__A, __B);
  return __builtin_ia32_selectsh_128(__U, __A, __W);
}

// Zero-masked scalar divide: masked-off value is zero.
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_div_sh(__mmask8 __U,
                                                                 __m128h __A,
                                                                 __m128h __B) {
  __A = _mm_div_sh(__A, __B);
  return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph());
}

// Rounding-control variants of the scalar FP16 divide (vdivsh); same
// unmasked / merge-masked / zero-masked pattern as the other _round_sh ops.
#define _mm_div_round_sh(A, B, R)                                              \
  ((__m128h)__builtin_ia32_divsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)-1, (int)(R)))

#define _mm_mask_div_round_sh(W, U, A, B, R)                                   \
  ((__m128h)__builtin_ia32_divsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W),        \
      (__mmask8)(U), (int)(R)))

#define _mm_maskz_div_round_sh(U, A, B, R)                                     \
  ((__m128h)__builtin_ia32_divsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)(U), (int)(R)))
 716
// Scalar FP16 minimum (vminsh) at the current rounding direction.
// Unlike mul/div above, min goes straight through the masked builtin
// rather than an element-wise expression.
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_min_sh(__m128h __A,
                                                           __m128h __B) {
  return (__m128h)__builtin_ia32_minsh_round_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

// Merge-masked scalar minimum: __W supplies the masked-off element.
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_min_sh(__m128h __W,
                                                                __mmask8 __U,
                                                                __m128h __A,
                                                                __m128h __B) {
  return (__m128h)__builtin_ia32_minsh_round_mask((__v8hf)__A, (__v8hf)__B,
                                                  (__v8hf)__W, (__mmask8)__U,
                                                  _MM_FROUND_CUR_DIRECTION);
}

// Zero-masked scalar minimum.
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_min_sh(__mmask8 __U,
                                                                 __m128h __A,
                                                                 __m128h __B) {
  return (__m128h)__builtin_ia32_minsh_round_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}

// Explicit rounding-mode (R) variants of the scalar FP16 minimum.
#define _mm_min_round_sh(A, B, R)                                              \
  ((__m128h)__builtin_ia32_minsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)-1, (int)(R)))

#define _mm_mask_min_round_sh(W, U, A, B, R)                                   \
  ((__m128h)__builtin_ia32_minsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W),        \
      (__mmask8)(U), (int)(R)))

#define _mm_maskz_min_round_sh(U, A, B, R)                                     \
  ((__m128h)__builtin_ia32_minsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)(U), (int)(R)))
 755
// Scalar FP16 maximum (vmaxsh) at the current rounding direction;
// mirrors the _mm_min_sh family above.
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_max_sh(__m128h __A,
                                                           __m128h __B) {
  return (__m128h)__builtin_ia32_maxsh_round_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

// Merge-masked scalar maximum: __W supplies the masked-off element.
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_max_sh(__m128h __W,
                                                                __mmask8 __U,
                                                                __m128h __A,
                                                                __m128h __B) {
  return (__m128h)__builtin_ia32_maxsh_round_mask((__v8hf)__A, (__v8hf)__B,
                                                  (__v8hf)__W, (__mmask8)__U,
                                                  _MM_FROUND_CUR_DIRECTION);
}

// Zero-masked scalar maximum.
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_max_sh(__mmask8 __U,
                                                                 __m128h __A,
                                                                 __m128h __B) {
  return (__m128h)__builtin_ia32_maxsh_round_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}

// Explicit rounding-mode (R) variants of the scalar FP16 maximum.
#define _mm_max_round_sh(A, B, R)                                              \
  ((__m128h)__builtin_ia32_maxsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)-1, (int)(R)))

#define _mm_mask_max_round_sh(W, U, A, B, R)                                   \
  ((__m128h)__builtin_ia32_maxsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W),        \
      (__mmask8)(U), (int)(R)))

#define _mm_maskz_max_round_sh(U, A, B, R)                                     \
  ((__m128h)__builtin_ia32_maxsh_round_mask(                                   \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)(U), (int)(R)))
 794
// Packed (512-bit, 32-lane) and scalar FP16 compares. P is the comparison
// predicate immediate (e.g. a _CMP_* value); the result is a bitmask of the
// per-lane outcomes. The _round_ forms take an explicit _MM_FROUND_* R;
// the plain forms forward _MM_FROUND_CUR_DIRECTION.
#define _mm512_cmp_round_ph_mask(A, B, P, R)                                   \
  ((__mmask32)__builtin_ia32_cmpph512_mask((__v32hf)(__m512h)(A),              \
                                           (__v32hf)(__m512h)(B), (int)(P),    \
                                           (__mmask32)-1, (int)(R)))

#define _mm512_mask_cmp_round_ph_mask(U, A, B, P, R)                           \
  ((__mmask32)__builtin_ia32_cmpph512_mask((__v32hf)(__m512h)(A),              \
                                           (__v32hf)(__m512h)(B), (int)(P),    \
                                           (__mmask32)(U), (int)(R)))

#define _mm512_cmp_ph_mask(A, B, P)                                            \
  _mm512_cmp_round_ph_mask((A), (B), (P), _MM_FROUND_CUR_DIRECTION)

#define _mm512_mask_cmp_ph_mask(U, A, B, P)                                    \
  _mm512_mask_cmp_round_ph_mask((U), (A), (B), (P), _MM_FROUND_CUR_DIRECTION)

// Scalar compare (vcmpsh): M, when given, zero-masks the single result bit.
#define _mm_cmp_round_sh_mask(X, Y, P, R)                                      \
  ((__mmask8)__builtin_ia32_cmpsh_mask((__v8hf)(__m128h)(X),                   \
                                       (__v8hf)(__m128h)(Y), (int)(P),         \
                                       (__mmask8)-1, (int)(R)))

#define _mm_mask_cmp_round_sh_mask(M, X, Y, P, R)                              \
  ((__mmask8)__builtin_ia32_cmpsh_mask((__v8hf)(__m128h)(X),                   \
                                       (__v8hf)(__m128h)(Y), (int)(P),         \
                                       (__mmask8)(M), (int)(R)))

#define _mm_cmp_sh_mask(X, Y, P)                                               \
  ((__mmask8)__builtin_ia32_cmpsh_mask(                                        \
      (__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), (int)(P), (__mmask8)-1,      \
      _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_cmp_sh_mask(M, X, Y, P)                                       \
  ((__mmask8)__builtin_ia32_cmpsh_mask(                                        \
      (__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), (int)(P), (__mmask8)(M),     \
      _MM_FROUND_CUR_DIRECTION))
// Scalar loads with vmovsh:
// Load a single _Float16 from an arbitrarily-aligned pointer into element 0
// and zero elements 1..7. The packed, may_alias struct permits unaligned
// access without violating strict aliasing.
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_load_sh(void const *__dp) {
  struct __mm_load_sh_struct {
    _Float16 __u;
  } __attribute__((__packed__, __may_alias__));
  _Float16 __u = ((const struct __mm_load_sh_struct *)__dp)->__u;
  return (__m128h){__u, 0, 0, 0, 0, 0, 0, 0};
}

// Merge-masked scalar load: build a source vector holding __W[0] in element 0
// and zeros above (shuffle indices 8.. select lanes of _mm_setzero_ph()),
// then let the masked-load builtin pick between memory and that source.
// Only bit 0 of __U is meaningful (__U & 1).
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_load_sh(__m128h __W, __mmask8 __U, const void *__A) {
  __m128h src = (__v8hf)__builtin_shufflevector(
      (__v8hf)__W, (__v8hf)_mm_setzero_ph(), 0, 8, 8, 8, 8, 8, 8, 8);

  return (__m128h)__builtin_ia32_loadsh128_mask((const __v8hf *)__A, src, __U & 1);
}

// Zero-masked scalar load: element 0 comes from memory when bit 0 of __U is
// set, otherwise zero; elements 1..7 are zero.
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_load_sh(__mmask8 __U, const void *__A) {
  return (__m128h)__builtin_ia32_loadsh128_mask(
      (const __v8hf *)__A, (__v8hf)_mm_setzero_ph(), __U & 1);
}
 852
// Aligned packed FP16 loads: a plain dereference, so __p must meet the
// vector type's alignment (64/32/16 bytes for 512/256/128 bits).
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_load_ph(void const *__p) {
  return *(const __m512h *)__p;
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_load_ph(void const *__p) {
  return *(const __m256h *)__p;
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_load_ph(void const *__p) {
  return *(const __m128h *)__p;
}

// Unaligned packed FP16 loads: the 1-byte-aligned __m*h_u type inside a
// packed, may_alias struct allows any address without UB.
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_loadu_ph(void const *__p) {
  struct __loadu_ph {
    __m512h_u __v;
  } __attribute__((__packed__, __may_alias__));
  return ((const struct __loadu_ph *)__p)->__v;
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_loadu_ph(void const *__p) {
  struct __loadu_ph {
    __m256h_u __v;
  } __attribute__((__packed__, __may_alias__));
  return ((const struct __loadu_ph *)__p)->__v;
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_loadu_ph(void const *__p) {
  struct __loadu_ph {
    __m128h_u __v;
  } __attribute__((__packed__, __may_alias__));
  return ((const struct __loadu_ph *)__p)->__v;
}
 889
// Scalar stores with vmovsh:
// Store element 0 of __a to an arbitrarily-aligned pointer (packed,
// may_alias struct as in _mm_load_sh).
static __inline__ void __DEFAULT_FN_ATTRS128 _mm_store_sh(void *__dp,
                                                          __m128h __a) {
  struct __mm_store_sh_struct {
    _Float16 __u;
  } __attribute__((__packed__, __may_alias__));
  ((struct __mm_store_sh_struct *)__dp)->__u = __a[0];
}

// Masked scalar store: writes element 0 of __A only when bit 0 of __U is
// set (only that bit is consulted: __U & 1).
static __inline__ void __DEFAULT_FN_ATTRS128 _mm_mask_store_sh(void *__W,
                                                               __mmask8 __U,
                                                               __m128h __A) {
  __builtin_ia32_storesh128_mask((__v8hf *)__W, __A, __U & 1);
}
 904
// Aligned packed FP16 stores: a plain assignment, so __P must meet the
// vector type's alignment.
static __inline__ void __DEFAULT_FN_ATTRS512 _mm512_store_ph(void *__P,
                                                             __m512h __A) {
  *(__m512h *)__P = __A;
}

static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_store_ph(void *__P,
                                                             __m256h __A) {
  *(__m256h *)__P = __A;
}

static __inline__ void __DEFAULT_FN_ATTRS128 _mm_store_ph(void *__P,
                                                          __m128h __A) {
  *(__m128h *)__P = __A;
}

// Unaligned packed FP16 stores via the 1-byte-aligned __m*h_u type wrapped
// in a packed, may_alias struct (mirrors the loadu variants).
static __inline__ void __DEFAULT_FN_ATTRS512 _mm512_storeu_ph(void *__P,
                                                              __m512h __A) {
  struct __storeu_ph {
    __m512h_u __v;
  } __attribute__((__packed__, __may_alias__));
  ((struct __storeu_ph *)__P)->__v = __A;
}

static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_storeu_ph(void *__P,
                                                              __m256h __A) {
  struct __storeu_ph {
    __m256h_u __v;
  } __attribute__((__packed__, __may_alias__));
  ((struct __storeu_ph *)__P)->__v = __A;
}

static __inline__ void __DEFAULT_FN_ATTRS128 _mm_storeu_ph(void *__P,
                                                           __m128h __A) {
  struct __storeu_ph {
    __m128h_u __v;
  } __attribute__((__packed__, __may_alias__));
  ((struct __storeu_ph *)__P)->__v = __A;
}
 943
// Scalar moves with vmovsh:
// Copy element 0 of __b into __a; elements 1..7 of the result come from __a.
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_move_sh(__m128h __a,
                                                            __m128h __b) {
  __a[0] = __b[0];
  return __a;
}

// Merge-masked move: the moved result is blended with __W under __U.
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_move_sh(__m128h __W,
                                                                 __mmask8 __U,
                                                                 __m128h __A,
                                                                 __m128h __B) {
  return __builtin_ia32_selectsh_128(__U, _mm_move_sh(__A, __B), __W);
}

// Zero-masked move: masked-off value is zero.
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_move_sh(__mmask8 __U,
                                                                  __m128h __A,
                                                                  __m128h __B) {
  return __builtin_ia32_selectsh_128(__U, _mm_move_sh(__A, __B),
                                     _mm_setzero_ph());
}
 964
// 16-bit integer moves (vmovw):
// Place __a in element 0 of a 128-bit integer vector and zero the rest.
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtsi16_si128(short __a) {
  return (__m128i)(__v8hi){__a, 0, 0, 0, 0, 0, 0, 0};
}

// Extract element 0 of __a as a 16-bit integer.
static __inline__ short __DEFAULT_FN_ATTRS128 _mm_cvtsi128_si16(__m128i __a) {
  __v8hi __b = (__v8hi)__a;
  return __b[0];
}
 974
// Packed FP16 approximate reciprocal (vrcpph). The unmasked form passes an
// undefined vector as the (ignored) passthrough; mask/maskz merge from __W
// or zero masked-off lanes.
static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_rcp_ph(__m512h __A) {
  return (__m512h)__builtin_ia32_rcpph512_mask(
      (__v32hf)__A, (__v32hf)_mm512_undefined_ph(), (__mmask32)-1);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_rcp_ph(__m512h __W, __mmask32 __U, __m512h __A) {
  return (__m512h)__builtin_ia32_rcpph512_mask((__v32hf)__A, (__v32hf)__W,
                                               (__mmask32)__U);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_rcp_ph(__mmask32 __U, __m512h __A) {
  return (__m512h)__builtin_ia32_rcpph512_mask(
      (__v32hf)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U);
}

// Packed FP16 approximate reciprocal square root (vrsqrtph); same
// unmasked / merge-masked / zero-masked pattern as rcp above.
static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_rsqrt_ph(__m512h __A) {
  return (__m512h)__builtin_ia32_rsqrtph512_mask(
      (__v32hf)__A, (__v32hf)_mm512_undefined_ph(), (__mmask32)-1);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_rsqrt_ph(__m512h __W, __mmask32 __U, __m512h __A) {
  return (__m512h)__builtin_ia32_rsqrtph512_mask((__v32hf)__A, (__v32hf)__W,
                                                 (__mmask32)__U);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_rsqrt_ph(__mmask32 __U, __m512h __A) {
  return (__m512h)__builtin_ia32_rsqrtph512_mask(
      (__v32hf)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U);
}
1008
// Packed FP16 getmant (vgetmantph): extract the normalized mantissa of each
// lane. B is the interval-normalization selector and C the sign control;
// they are packed into the builtin's immediate as (C << 2) | B. The plain
// forms use the current rounding direction; _round_ forms take R.
#define _mm512_getmant_ph(A, B, C)                                             \
  ((__m512h)__builtin_ia32_getmantph512_mask(                                  \
      (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)),                          \
      (__v32hf)_mm512_undefined_ph(), (__mmask32)-1,                           \
      _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_getmant_ph(W, U, A, B, C)                                  \
  ((__m512h)__builtin_ia32_getmantph512_mask(                                  \
      (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), (__v32hf)(__m512h)(W),   \
      (__mmask32)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm512_maskz_getmant_ph(U, A, B, C)                                    \
  ((__m512h)__builtin_ia32_getmantph512_mask(                                  \
      (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)),                          \
      (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm512_getmant_round_ph(A, B, C, R)                                    \
  ((__m512h)__builtin_ia32_getmantph512_mask(                                  \
      (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)),                          \
      (__v32hf)_mm512_undefined_ph(), (__mmask32)-1, (int)(R)))

#define _mm512_mask_getmant_round_ph(W, U, A, B, C, R)                         \
  ((__m512h)__builtin_ia32_getmantph512_mask(                                  \
      (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), (__v32hf)(__m512h)(W),   \
      (__mmask32)(U), (int)(R)))

#define _mm512_maskz_getmant_round_ph(U, A, B, C, R)                           \
  ((__m512h)__builtin_ia32_getmantph512_mask(                                  \
      (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)),                          \
      (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R)))
1039
// Packed FP16 getexp (vgetexpph): per-lane exponent extraction as an FP16
// value. Inline functions use the current rounding direction; the macros
// below accept an explicit R.
static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_getexp_ph(__m512h __A) {
  return (__m512h)__builtin_ia32_getexpph512_mask(
      (__v32hf)__A, (__v32hf)_mm512_undefined_ph(), (__mmask32)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_getexp_ph(__m512h __W, __mmask32 __U, __m512h __A) {
  return (__m512h)__builtin_ia32_getexpph512_mask(
      (__v32hf)__A, (__v32hf)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_getexp_ph(__mmask32 __U, __m512h __A) {
  return (__m512h)__builtin_ia32_getexpph512_mask(
      (__v32hf)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
}

#define _mm512_getexp_round_ph(A, R)                                           \
  ((__m512h)__builtin_ia32_getexpph512_mask((__v32hf)(__m512h)(A),             \
                                            (__v32hf)_mm512_undefined_ph(),    \
                                            (__mmask32)-1, (int)(R)))

#define _mm512_mask_getexp_round_ph(W, U, A, R)                                \
  ((__m512h)__builtin_ia32_getexpph512_mask(                                   \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(W), (__mmask32)(U), (int)(R)))

#define _mm512_maskz_getexp_round_ph(U, A, R)                                  \
  ((__m512h)__builtin_ia32_getexpph512_mask((__v32hf)(__m512h)(A),             \
                                            (__v32hf)_mm512_setzero_ph(),      \
                                            (__mmask32)(U), (int)(R)))
1072
// Packed FP16 scalef (vscalefph): per-lane scaling of __A by powers of two
// taken from __B. Inline functions use the current rounding direction; the
// macros below take an explicit R.
static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_scalef_ph(__m512h __A,
                                                                 __m512h __B) {
  return (__m512h)__builtin_ia32_scalefph512_mask(
      (__v32hf)__A, (__v32hf)__B, (__v32hf)_mm512_undefined_ph(), (__mmask32)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_scalef_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_scalefph512_mask((__v32hf)__A, (__v32hf)__B,
                                                  (__v32hf)__W, (__mmask32)__U,
                                                  _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_scalef_ph(__mmask32 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_scalefph512_mask(
      (__v32hf)__A, (__v32hf)__B, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
}

#define _mm512_scalef_round_ph(A, B, R)                                        \
  ((__m512h)__builtin_ia32_scalefph512_mask(                                   \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B),                            \
      (__v32hf)_mm512_undefined_ph(), (__mmask32)-1, (int)(R)))

#define _mm512_mask_scalef_round_ph(W, U, A, B, R)                             \
  ((__m512h)__builtin_ia32_scalefph512_mask(                                   \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(W),     \
      (__mmask32)(U), (int)(R)))

#define _mm512_maskz_scalef_round_ph(U, A, B, R)                               \
  ((__m512h)__builtin_ia32_scalefph512_mask(                                   \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B),                            \
      (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R)))
1108
// Packed FP16 roundscale (vrndscaleph): round each lane per the imm control.
// Note the parameter orders differ between variants (they follow the
// historical AVX-512 roundscale macro signatures): mask form is
// (src, mask, A, imm); maskz form is (mask, A, imm). The unmasked form
// passes A itself as the passthrough — it is ignored under the all-ones
// mask.
#define _mm512_roundscale_ph(A, B)                                             \
  ((__m512h)__builtin_ia32_rndscaleph_mask(                                    \
      (__v32hf)(__m512h)(A), (int)(B), (__v32hf)(__m512h)(A), (__mmask32)-1,   \
      _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_roundscale_ph(A, B, C, imm)                                \
  ((__m512h)__builtin_ia32_rndscaleph_mask(                                    \
      (__v32hf)(__m512h)(C), (int)(imm), (__v32hf)(__m512h)(A),                \
      (__mmask32)(B), _MM_FROUND_CUR_DIRECTION))

#define _mm512_maskz_roundscale_ph(A, B, imm)                                  \
  ((__m512h)__builtin_ia32_rndscaleph_mask(                                    \
      (__v32hf)(__m512h)(B), (int)(imm), (__v32hf)_mm512_setzero_ph(),         \
      (__mmask32)(A), _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_roundscale_round_ph(A, B, C, imm, R)                       \
  ((__m512h)__builtin_ia32_rndscaleph_mask((__v32hf)(__m512h)(C), (int)(imm),  \
                                           (__v32hf)(__m512h)(A),              \
                                           (__mmask32)(B), (int)(R)))

#define _mm512_maskz_roundscale_round_ph(A, B, imm, R)                         \
  ((__m512h)__builtin_ia32_rndscaleph_mask((__v32hf)(__m512h)(B), (int)(imm),  \
                                           (__v32hf)_mm512_setzero_ph(),       \
                                           (__mmask32)(A), (int)(R)))

#define _mm512_roundscale_round_ph(A, imm, R)                                  \
  ((__m512h)__builtin_ia32_rndscaleph_mask((__v32hf)(__m512h)(A), (int)(imm),  \
                                           (__v32hf)_mm512_undefined_ph(),     \
                                           (__mmask32)-1, (int)(R)))
1138
// Packed FP16 reduce (vreduceph): per-lane reduction transformation
// controlled by imm. (Despite the name, this is the vreduceph instruction,
// not a horizontal reduction.) Usual unmasked / merge-masked (W) /
// zero-masked triple, plus _round_ forms taking R.
#define _mm512_reduce_ph(A, imm)                                               \
  ((__m512h)__builtin_ia32_reduceph512_mask(                                   \
      (__v32hf)(__m512h)(A), (int)(imm), (__v32hf)_mm512_undefined_ph(),       \
      (__mmask32)-1, _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_reduce_ph(W, U, A, imm)                                    \
  ((__m512h)__builtin_ia32_reduceph512_mask(                                   \
      (__v32hf)(__m512h)(A), (int)(imm), (__v32hf)(__m512h)(W),                \
      (__mmask32)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm512_maskz_reduce_ph(U, A, imm)                                      \
  ((__m512h)__builtin_ia32_reduceph512_mask(                                   \
      (__v32hf)(__m512h)(A), (int)(imm), (__v32hf)_mm512_setzero_ph(),         \
      (__mmask32)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm512_mask_reduce_round_ph(W, U, A, imm, R)                           \
  ((__m512h)__builtin_ia32_reduceph512_mask((__v32hf)(__m512h)(A), (int)(imm), \
                                            (__v32hf)(__m512h)(W),             \
                                            (__mmask32)(U), (int)(R)))

#define _mm512_maskz_reduce_round_ph(U, A, imm, R)                             \
  ((__m512h)__builtin_ia32_reduceph512_mask((__v32hf)(__m512h)(A), (int)(imm), \
                                            (__v32hf)_mm512_setzero_ph(),      \
                                            (__mmask32)(U), (int)(R)))

#define _mm512_reduce_round_ph(A, imm, R)                                      \
  ((__m512h)__builtin_ia32_reduceph512_mask((__v32hf)(__m512h)(A), (int)(imm), \
                                            (__v32hf)_mm512_undefined_ph(),    \
                                            (__mmask32)-1, (int)(R)))
1168
// Scalar FP16 approximate reciprocal (vrcpsh). Both source vectors are
// passed through to the builtin; unmasked / merge-masked (__W) /
// zero-masked forms as elsewhere in this file.
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_rcp_sh(__m128h __A,
                                                           __m128h __B) {
  return (__m128h)__builtin_ia32_rcpsh_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_rcp_sh(__m128h __W,
                                                                __mmask8 __U,
                                                                __m128h __A,
                                                                __m128h __B) {
  return (__m128h)__builtin_ia32_rcpsh_mask((__v8hf)__A, (__v8hf)__B,
                                            (__v8hf)__W, (__mmask8)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_rcp_sh(__mmask8 __U,
                                                                 __m128h __A,
                                                                 __m128h __B) {
  return (__m128h)__builtin_ia32_rcpsh_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
}

// Scalar FP16 approximate reciprocal square root (vrsqrtsh); mirrors the
// rcp_sh triple above.
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_rsqrt_sh(__m128h __A,
                                                             __m128h __B) {
  return (__m128h)__builtin_ia32_rsqrtsh_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_rsqrt_sh(__m128h __W,
                                                                  __mmask8 __U,
                                                                  __m128h __A,
                                                                  __m128h __B) {
  return (__m128h)__builtin_ia32_rsqrtsh_mask((__v8hf)__A, (__v8hf)__B,
                                              (__v8hf)__W, (__mmask8)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_rsqrt_sh(__mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_rsqrtsh_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
}
1209
// Scalar FP16 getmant (vgetmantsh). C is the interval-normalization
// selector and D the sign control, packed as (D << 2) | C. _round_ forms
// take an explicit R; the others use the current rounding direction.
#define _mm_getmant_round_sh(A, B, C, D, R)                                    \
  ((__m128h)__builtin_ia32_getmantsh_round_mask(                               \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)),     \
      (__v8hf)_mm_setzero_ph(), (__mmask8)-1, (int)(R)))

#define _mm_getmant_sh(A, B, C, D)                                             \
  ((__m128h)__builtin_ia32_getmantsh_round_mask(                               \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)),     \
      (__v8hf)_mm_setzero_ph(), (__mmask8)-1, _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_getmant_sh(W, U, A, B, C, D)                                  \
  ((__m128h)__builtin_ia32_getmantsh_round_mask(                               \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)),     \
      (__v8hf)(__m128h)(W), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_getmant_round_sh(W, U, A, B, C, D, R)                         \
  ((__m128h)__builtin_ia32_getmantsh_round_mask(                               \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)),     \
      (__v8hf)(__m128h)(W), (__mmask8)(U), (int)(R)))

#define _mm_maskz_getmant_sh(U, A, B, C, D)                                    \
  ((__m128h)__builtin_ia32_getmantsh_round_mask(                               \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)),     \
      (__v8hf)_mm_setzero_ph(), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))

#define _mm_maskz_getmant_round_sh(U, A, B, C, D, R)                           \
  ((__m128h)__builtin_ia32_getmantsh_round_mask(                               \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)),     \
      (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
1239
// Scalar FP16 getexp (vgetexpsh): exponent extraction for element 0.
// Macro forms take an explicit rounding immediate R; inline functions use
// the current rounding direction. Unmasked / merge-masked (W) /
// zero-masked variants as elsewhere.
#define _mm_getexp_round_sh(A, B, R)                                           \
  ((__m128h)__builtin_ia32_getexpsh128_round_mask(                             \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)-1, (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_getexp_sh(__m128h __A,
                                                              __m128h __B) {
  return (__m128h)__builtin_ia32_getexpsh128_round_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_getexp_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_getexpsh128_round_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)__W, (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}

#define _mm_mask_getexp_round_sh(W, U, A, B, R)                                \
  ((__m128h)__builtin_ia32_getexpsh128_round_mask(                             \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W),        \
      (__mmask8)(U), (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_getexp_sh(__mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_getexpsh128_round_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}

#define _mm_maskz_getexp_round_sh(U, A, B, R)                                  \
  ((__m128h)__builtin_ia32_getexpsh128_round_mask(                             \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)(U), (int)(R)))
1275
/* Scalar SCALEF (vscalefsh) family: scale the low fp16 element of __A by
   2^floor(__B[0]); upper elements pass through from __A.  mask forms merge
   element 0 from __W when the mask bit is clear; maskz forms zero it.
   Non-_round_ forms use the current MXCSR rounding mode; _round_ macros take
   an explicit control R. */
#define _mm_scalef_round_sh(A, B, R)                                           \
  ((__m128h)__builtin_ia32_scalefsh_round_mask(                                \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)-1, (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_scalef_sh(__m128h __A,
                                                              __m128h __B) {
  return (__m128h)__builtin_ia32_scalefsh_round_mask(
      (__v8hf)__A, (__v8hf)(__B), (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_scalef_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_scalefsh_round_mask((__v8hf)__A, (__v8hf)__B,
                                                     (__v8hf)__W, (__mmask8)__U,
                                                     _MM_FROUND_CUR_DIRECTION);
}

#define _mm_mask_scalef_round_sh(W, U, A, B, R)                                \
  ((__m128h)__builtin_ia32_scalefsh_round_mask(                                \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W),        \
      (__mmask8)(U), (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_scalef_sh(__mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_scalefsh_round_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}

#define _mm_maskz_scalef_round_sh(U, A, B, R)                                  \
  ((__m128h)__builtin_ia32_scalefsh_round_mask(                                \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)(U), (int)(R)))
1311
/* Scalar ROUNDSCALE (vrndscalesh) family: round the low fp16 element of B to
   the number of fraction bits encoded in imm/I; upper elements pass through
   from A.  mask forms merge element 0 from W when the mask bit is clear;
   maskz forms zero it.  _round_ forms take an explicit SAE/rounding control
   R; the others use the current MXCSR rounding mode. */
#define _mm_roundscale_round_sh(A, B, imm, R)                                  \
  ((__m128h)__builtin_ia32_rndscalesh_round_mask(                              \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)-1, (int)(imm), (int)(R)))

#define _mm_roundscale_sh(A, B, imm)                                           \
  ((__m128h)__builtin_ia32_rndscalesh_round_mask(                              \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)-1, (int)(imm), _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_roundscale_sh(W, U, A, B, I)                                  \
  ((__m128h)__builtin_ia32_rndscalesh_round_mask(                              \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W),        \
      (__mmask8)(U), (int)(I), _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_roundscale_round_sh(W, U, A, B, I, R)                         \
  ((__m128h)__builtin_ia32_rndscalesh_round_mask(                              \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W),        \
      (__mmask8)(U), (int)(I), (int)(R)))

#define _mm_maskz_roundscale_sh(U, A, B, I)                                    \
  ((__m128h)__builtin_ia32_rndscalesh_round_mask(                              \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)(U), (int)(I), _MM_FROUND_CUR_DIRECTION))

#define _mm_maskz_roundscale_round_sh(U, A, B, I, R)                           \
  ((__m128h)__builtin_ia32_rndscalesh_round_mask(                              \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)(U), (int)(I), (int)(R)))
1341
/* Scalar REDUCE (vreducesh) family: compute the reduction transformation of
   the low fp16 element of B with immediate control C; upper elements pass
   through from A.  mask forms merge element 0 from W when the mask bit is
   clear; maskz forms zero it.  _round_ forms take an explicit SAE/rounding
   control R; the others use the current MXCSR rounding mode. */
#define _mm_reduce_sh(A, B, C)                                                 \
  ((__m128h)__builtin_ia32_reducesh_mask(                                      \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)-1, (int)(C), _MM_FROUND_CUR_DIRECTION))

#define _mm_mask_reduce_sh(W, U, A, B, C)                                      \
  ((__m128h)__builtin_ia32_reducesh_mask(                                      \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W),        \
      (__mmask8)(U), (int)(C), _MM_FROUND_CUR_DIRECTION))

#define _mm_maskz_reduce_sh(U, A, B, C)                                        \
  ((__m128h)__builtin_ia32_reducesh_mask(                                      \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)(U), (int)(C), _MM_FROUND_CUR_DIRECTION))

#define _mm_reduce_round_sh(A, B, C, R)                                        \
  ((__m128h)__builtin_ia32_reducesh_mask(                                      \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)-1, (int)(C), (int)(R)))

#define _mm_mask_reduce_round_sh(W, U, A, B, C, R)                             \
  ((__m128h)__builtin_ia32_reducesh_mask(                                      \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W),        \
      (__mmask8)(U), (int)(C), (int)(R)))

#define _mm_maskz_reduce_round_sh(U, A, B, C, R)                               \
  ((__m128h)__builtin_ia32_reducesh_mask(                                      \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)(U), (int)(C), (int)(R)))
1371
/* Packed fp16 square root (vsqrtph) with explicit rounding control R.
   mask form merges unselected elements from W; maskz form zeroes them.
   Masking is applied via a select on the full unmasked result. */
#define _mm512_sqrt_round_ph(A, R)                                             \
  ((__m512h)__builtin_ia32_sqrtph512((__v32hf)(__m512h)(A), (int)(R)))

#define _mm512_mask_sqrt_round_ph(W, U, A, R)                                  \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U), (__v32hf)_mm512_sqrt_round_ph((A), (R)),                 \
      (__v32hf)(__m512h)(W)))

#define _mm512_maskz_sqrt_round_ph(U, A, R)                                    \
  ((__m512h)__builtin_ia32_selectph_512(                                       \
      (__mmask32)(U), (__v32hf)_mm512_sqrt_round_ph((A), (R)),                 \
      (__v32hf)_mm512_setzero_ph()))
1384
1385static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_sqrt_ph(__m512h __A) {
1386  return (__m512h)__builtin_ia32_sqrtph512((__v32hf)__A,
1387                                           _MM_FROUND_CUR_DIRECTION);
1388}
1389
/* Masked packed fp16 square root: compute sqrt of all elements of __A, then
   select per lane — result lane when the mask bit is set, __W lane otherwise. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_sqrt_ph(__m512h __W, __mmask32 __U, __m512h __A) {
  return (__m512h)__builtin_ia32_selectph_512(
      (__mmask32)(__U),
      (__v32hf)__builtin_ia32_sqrtph512((__A), (_MM_FROUND_CUR_DIRECTION)),
      (__v32hf)(__m512h)(__W));
}

/* Zero-masked packed fp16 square root: lanes with a clear mask bit are zeroed. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_sqrt_ph(__mmask32 __U, __m512h __A) {
  return (__m512h)__builtin_ia32_selectph_512(
      (__mmask32)(__U),
      (__v32hf)__builtin_ia32_sqrtph512((__A), (_MM_FROUND_CUR_DIRECTION)),
      (__v32hf)_mm512_setzero_ph());
}
1405
/* Scalar fp16 square root (vsqrtsh) with explicit rounding control R: sqrt of
   the low element of B; upper elements pass through from A.  mask form merges
   element 0 from W when the mask bit is clear; maskz form zeroes it. */
#define _mm_sqrt_round_sh(A, B, R)                                             \
  ((__m128h)__builtin_ia32_sqrtsh_round_mask(                                  \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)-1, (int)(R)))

#define _mm_mask_sqrt_round_sh(W, U, A, B, R)                                  \
  ((__m128h)__builtin_ia32_sqrtsh_round_mask(                                  \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W),        \
      (__mmask8)(U), (int)(R)))

#define _mm_maskz_sqrt_round_sh(U, A, B, R)                                    \
  ((__m128h)__builtin_ia32_sqrtsh_round_mask(                                  \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(),    \
      (__mmask8)(U), (int)(R)))
1420
1421static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_sqrt_sh(__m128h __A,
1422                                                            __m128h __B) {
1423  return (__m128h)__builtin_ia32_sqrtsh_round_mask(
1424      (__v8hf)(__m128h)(__A), (__v8hf)(__m128h)(__B), (__v8hf)_mm_setzero_ph(),
1425      (__mmask8)-1, _MM_FROUND_CUR_DIRECTION);
1426}
1427
/* Merge-masked scalar fp16 square root: sqrt(__B[0]) into element 0 when
   bit 0 of __U is set, otherwise __W[0]; upper elements from __A.
   NOTE(review): __U is declared __mmask32 although the builtin consumes it as
   __mmask8 (only bit 0 matters for a scalar op); Intel's intrinsics guide
   declares __mmask8 here — verify against upstream before changing. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_sqrt_sh(__m128h __W,
                                                                 __mmask32 __U,
                                                                 __m128h __A,
                                                                 __m128h __B) {
  return (__m128h)__builtin_ia32_sqrtsh_round_mask(
      (__v8hf)(__m128h)(__A), (__v8hf)(__m128h)(__B), (__v8hf)(__m128h)(__W),
      (__mmask8)(__U), _MM_FROUND_CUR_DIRECTION);
}

/* Zero-masked scalar fp16 square root: element 0 is zeroed when bit 0 of __U
   is clear.  Same __mmask32-vs-__mmask8 note as _mm_mask_sqrt_sh above. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_sqrt_sh(__mmask32 __U,
                                                                  __m128h __A,
                                                                  __m128h __B) {
  return (__m128h)__builtin_ia32_sqrtsh_round_mask(
      (__v8hf)(__m128h)(__A), (__v8hf)(__m128h)(__B), (__v8hf)_mm_setzero_ph(),
      (__mmask8)(__U), _MM_FROUND_CUR_DIRECTION);
}
1444
/* FPCLASS tests (vfpclassph / vfpclasssh): test each fp16 element (packed) or
   the low element (scalar _sh) against the categories selected by imm,
   returning a mask of matches; mask forms AND the result with U. */
#define _mm512_mask_fpclass_ph_mask(U, A, imm)                                 \
  ((__mmask32)__builtin_ia32_fpclassph512_mask((__v32hf)(__m512h)(A),          \
                                               (int)(imm), (__mmask32)(U)))

#define _mm512_fpclass_ph_mask(A, imm)                                         \
  ((__mmask32)__builtin_ia32_fpclassph512_mask((__v32hf)(__m512h)(A),          \
                                               (int)(imm), (__mmask32)-1))

#define _mm_fpclass_sh_mask(A, imm)                                            \
  ((__mmask8)__builtin_ia32_fpclasssh_mask((__v8hf)(__m128h)(A), (int)(imm),   \
                                           (__mmask8)-1))

#define _mm_mask_fpclass_sh_mask(U, A, imm)                                    \
  ((__mmask8)__builtin_ia32_fpclasssh_mask((__v8hf)(__m128h)(A), (int)(imm),   \
                                           (__mmask8)(U)))
1460
/* vcvtpd2ph: convert 8 packed doubles to 8 packed fp16 (128-bit result).
   _round_ macros take an explicit rounding control R; the functions use the
   current MXCSR rounding mode.  mask forms merge from W; maskz forms zero. */
#define _mm512_cvt_roundpd_ph(A, R)                                            \
  ((__m128h)__builtin_ia32_vcvtpd2ph512_mask(                                  \
      (__v8df)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R)))

#define _mm512_mask_cvt_roundpd_ph(W, U, A, R)                                 \
  ((__m128h)__builtin_ia32_vcvtpd2ph512_mask((__v8df)(A), (__v8hf)(W),         \
                                             (__mmask8)(U), (int)(R)))

#define _mm512_maskz_cvt_roundpd_ph(U, A, R)                                   \
  ((__m128h)__builtin_ia32_vcvtpd2ph512_mask(                                  \
      (__v8df)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS512 _mm512_cvtpd_ph(__m512d __A) {
  return (__m128h)__builtin_ia32_vcvtpd2ph512_mask(
      (__v8df)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS512
_mm512_mask_cvtpd_ph(__m128h __W, __mmask8 __U, __m512d __A) {
  return (__m128h)__builtin_ia32_vcvtpd2ph512_mask(
      (__v8df)__A, (__v8hf)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtpd_ph(__mmask8 __U, __m512d __A) {
  return (__m128h)__builtin_ia32_vcvtpd2ph512_mask(
      (__v8df)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}
1491
/* vcvtph2pd: widen 8 packed fp16 (128-bit source) to 8 packed doubles.
   _round_ macros take an explicit SAE/rounding control R; the functions use
   the current MXCSR rounding mode.  mask forms merge from W; maskz zero. */
#define _mm512_cvt_roundph_pd(A, R)                                            \
  ((__m512d)__builtin_ia32_vcvtph2pd512_mask(                                  \
      (__v8hf)(A), (__v8df)_mm512_undefined_pd(), (__mmask8)(-1), (int)(R)))

#define _mm512_mask_cvt_roundph_pd(W, U, A, R)                                 \
  ((__m512d)__builtin_ia32_vcvtph2pd512_mask((__v8hf)(A), (__v8df)(W),         \
                                             (__mmask8)(U), (int)(R)))

#define _mm512_maskz_cvt_roundph_pd(U, A, R)                                   \
  ((__m512d)__builtin_ia32_vcvtph2pd512_mask(                                  \
      (__v8hf)(A), (__v8df)_mm512_setzero_pd(), (__mmask8)(U), (int)(R)))

static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_cvtph_pd(__m128h __A) {
  return (__m512d)__builtin_ia32_vcvtph2pd512_mask(
      (__v8hf)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_mask_cvtph_pd(__m512d __W, __mmask8 __U, __m128h __A) {
  return (__m512d)__builtin_ia32_vcvtph2pd512_mask(
      (__v8hf)__A, (__v8df)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512d __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtph_pd(__mmask8 __U, __m128h __A) {
  return (__m512d)__builtin_ia32_vcvtph2pd512_mask(
      (__v8hf)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}
1522
/* vcvtsh2ss with explicit SAE control R: convert the low fp16 element of B
   to fp32 in element 0; upper fp32 elements pass through from A.  mask form
   merges element 0 from W when the mask bit is clear; maskz zeroes it. */
#define _mm_cvt_roundsh_ss(A, B, R)                                            \
  ((__m128)__builtin_ia32_vcvtsh2ss_round_mask((__v4sf)(A), (__v8hf)(B),       \
                                               (__v4sf)_mm_undefined_ps(),     \
                                               (__mmask8)(-1), (int)(R)))

#define _mm_mask_cvt_roundsh_ss(W, U, A, B, R)                                 \
  ((__m128)__builtin_ia32_vcvtsh2ss_round_mask(                                \
      (__v4sf)(A), (__v8hf)(B), (__v4sf)(W), (__mmask8)(U), (int)(R)))

#define _mm_maskz_cvt_roundsh_ss(U, A, B, R)                                   \
  ((__m128)__builtin_ia32_vcvtsh2ss_round_mask((__v4sf)(A), (__v8hf)(B),       \
                                               (__v4sf)_mm_setzero_ps(),       \
                                               (__mmask8)(U), (int)(R)))
1536
1537static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_cvtsh_ss(__m128 __A,
1538                                                            __m128h __B) {
1539  return (__m128)__builtin_ia32_vcvtsh2ss_round_mask(
1540      (__v4sf)__A, (__v8hf)__B, (__v4sf)_mm_undefined_ps(), (__mmask8)-1,
1541      _MM_FROUND_CUR_DIRECTION);
1542}
1543
/* Merge-masked vcvtsh2ss: element 0 is the converted __B[0] when bit 0 of __U
   is set, otherwise __W[0]; upper elements from __A. */
static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_cvtsh_ss(__m128 __W,
                                                                 __mmask8 __U,
                                                                 __m128 __A,
                                                                 __m128h __B) {
  return (__m128)__builtin_ia32_vcvtsh2ss_round_mask((__v4sf)__A, (__v8hf)__B,
                                                     (__v4sf)__W, (__mmask8)__U,
                                                     _MM_FROUND_CUR_DIRECTION);
}

/* Zero-masked vcvtsh2ss: element 0 is zeroed when bit 0 of __U is clear. */
static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_cvtsh_ss(__mmask8 __U,
                                                                  __m128 __A,
                                                                  __m128h __B) {
  return (__m128)__builtin_ia32_vcvtsh2ss_round_mask(
      (__v4sf)__A, (__v8hf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}
1560
/* vcvtss2sh: convert the low fp32 element of B to fp16 in element 0; upper
   fp16 elements pass through from A.  _round_ macros take an explicit
   rounding control R; the functions use the current MXCSR rounding mode.
   mask forms merge element 0 from W; maskz forms zero it. */
#define _mm_cvt_roundss_sh(A, B, R)                                            \
  ((__m128h)__builtin_ia32_vcvtss2sh_round_mask((__v8hf)(A), (__v4sf)(B),      \
                                                (__v8hf)_mm_undefined_ph(),    \
                                                (__mmask8)(-1), (int)(R)))

#define _mm_mask_cvt_roundss_sh(W, U, A, B, R)                                 \
  ((__m128h)__builtin_ia32_vcvtss2sh_round_mask(                               \
      (__v8hf)(A), (__v4sf)(B), (__v8hf)(W), (__mmask8)(U), (int)(R)))

#define _mm_maskz_cvt_roundss_sh(U, A, B, R)                                   \
  ((__m128h)__builtin_ia32_vcvtss2sh_round_mask((__v8hf)(A), (__v4sf)(B),      \
                                                (__v8hf)_mm_setzero_ph(),      \
                                                (__mmask8)(U), (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtss_sh(__m128h __A,
                                                             __m128 __B) {
  return (__m128h)__builtin_ia32_vcvtss2sh_round_mask(
      (__v8hf)__A, (__v4sf)__B, (__v8hf)_mm_undefined_ph(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_cvtss_sh(__m128h __W,
                                                                  __mmask8 __U,
                                                                  __m128h __A,
                                                                  __m128 __B) {
  return (__m128h)__builtin_ia32_vcvtss2sh_round_mask(
      (__v8hf)__A, (__v4sf)__B, (__v8hf)__W, (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_cvtss_sh(__mmask8 __U,
                                                                   __m128h __A,
                                                                   __m128 __B) {
  return (__m128h)__builtin_ia32_vcvtss2sh_round_mask(
      (__v8hf)__A, (__v4sf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}
1598
/* vcvtsd2sh: convert the low double element of B to fp16 in element 0; upper
   fp16 elements pass through from A.  _round_ macros take an explicit
   rounding control R; the functions use the current MXCSR rounding mode.
   mask forms merge element 0 from W; maskz forms zero it. */
#define _mm_cvt_roundsd_sh(A, B, R)                                            \
  ((__m128h)__builtin_ia32_vcvtsd2sh_round_mask((__v8hf)(A), (__v2df)(B),      \
                                                (__v8hf)_mm_undefined_ph(),    \
                                                (__mmask8)(-1), (int)(R)))

#define _mm_mask_cvt_roundsd_sh(W, U, A, B, R)                                 \
  ((__m128h)__builtin_ia32_vcvtsd2sh_round_mask(                               \
      (__v8hf)(A), (__v2df)(B), (__v8hf)(W), (__mmask8)(U), (int)(R)))

#define _mm_maskz_cvt_roundsd_sh(U, A, B, R)                                   \
  ((__m128h)__builtin_ia32_vcvtsd2sh_round_mask((__v8hf)(A), (__v2df)(B),      \
                                                (__v8hf)_mm_setzero_ph(),      \
                                                (__mmask8)(U), (int)(R)))

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtsd_sh(__m128h __A,
                                                             __m128d __B) {
  return (__m128h)__builtin_ia32_vcvtsd2sh_round_mask(
      (__v8hf)__A, (__v2df)__B, (__v8hf)_mm_undefined_ph(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_cvtsd_sh(__m128h __W,
                                                                  __mmask8 __U,
                                                                  __m128h __A,
                                                                  __m128d __B) {
  return (__m128h)__builtin_ia32_vcvtsd2sh_round_mask(
      (__v8hf)__A, (__v2df)__B, (__v8hf)__W, (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_cvtsd_sh(__mmask8 __U, __m128h __A, __m128d __B) {
  return (__m128h)__builtin_ia32_vcvtsd2sh_round_mask(
      (__v8hf)__A, (__v2df)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}
1635
/* vcvtsh2sd: convert the low fp16 element of B to double in element 0; the
   upper double element passes through from A.  _round_ macros take an
   explicit SAE control R; the functions use the current MXCSR rounding mode.
   mask forms merge element 0 from W; maskz forms zero it. */
#define _mm_cvt_roundsh_sd(A, B, R)                                            \
  ((__m128d)__builtin_ia32_vcvtsh2sd_round_mask((__v2df)(A), (__v8hf)(B),      \
                                                (__v2df)_mm_undefined_pd(),    \
                                                (__mmask8)(-1), (int)(R)))

#define _mm_mask_cvt_roundsh_sd(W, U, A, B, R)                                 \
  ((__m128d)__builtin_ia32_vcvtsh2sd_round_mask(                               \
      (__v2df)(A), (__v8hf)(B), (__v2df)(W), (__mmask8)(U), (int)(R)))

#define _mm_maskz_cvt_roundsh_sd(U, A, B, R)                                   \
  ((__m128d)__builtin_ia32_vcvtsh2sd_round_mask((__v2df)(A), (__v8hf)(B),      \
                                                (__v2df)_mm_setzero_pd(),      \
                                                (__mmask8)(U), (int)(R)))

static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_cvtsh_sd(__m128d __A,
                                                             __m128h __B) {
  return (__m128d)__builtin_ia32_vcvtsh2sd_round_mask(
      (__v2df)__A, (__v8hf)__B, (__v2df)_mm_undefined_pd(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_cvtsh_sd(__m128d __W,
                                                                  __mmask8 __U,
                                                                  __m128d __A,
                                                                  __m128h __B) {
  return (__m128d)__builtin_ia32_vcvtsh2sd_round_mask(
      (__v2df)__A, (__v8hf)__B, (__v2df)__W, (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_maskz_cvtsh_sd(__mmask8 __U, __m128d __A, __m128h __B) {
  return (__m128d)__builtin_ia32_vcvtsh2sd_round_mask(
      (__v2df)__A, (__v8hf)__B, (__v2df)_mm_setzero_pd(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}
1672
/* vcvtph2w: convert 32 packed fp16 to 32 packed signed 16-bit ints.
   _round_ macros take an explicit rounding control R; the functions use the
   current MXCSR rounding mode.  mask forms merge from W; maskz forms zero. */
#define _mm512_cvt_roundph_epi16(A, R)                                         \
  ((__m512i)__builtin_ia32_vcvtph2w512_mask((__v32hf)(A),                      \
                                            (__v32hi)_mm512_undefined_epi32(), \
                                            (__mmask32)(-1), (int)(R)))

#define _mm512_mask_cvt_roundph_epi16(W, U, A, R)                              \
  ((__m512i)__builtin_ia32_vcvtph2w512_mask((__v32hf)(A), (__v32hi)(W),        \
                                            (__mmask32)(U), (int)(R)))

#define _mm512_maskz_cvt_roundph_epi16(U, A, R)                                \
  ((__m512i)__builtin_ia32_vcvtph2w512_mask((__v32hf)(A),                      \
                                            (__v32hi)_mm512_setzero_epi32(),   \
                                            (__mmask32)(U), (int)(R)))

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvtph_epi16(__m512h __A) {
  return (__m512i)__builtin_ia32_vcvtph2w512_mask(
      (__v32hf)__A, (__v32hi)_mm512_setzero_epi32(), (__mmask32)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtph_epi16(__m512i __W, __mmask32 __U, __m512h __A) {
  return (__m512i)__builtin_ia32_vcvtph2w512_mask(
      (__v32hf)__A, (__v32hi)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtph_epi16(__mmask32 __U, __m512h __A) {
  return (__m512i)__builtin_ia32_vcvtph2w512_mask(
      (__v32hf)__A, (__v32hi)_mm512_setzero_epi32(), (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
}
1706
/* vcvttph2w: convert 32 packed fp16 to 32 packed signed 16-bit ints with
   truncation.  _round_ forms take an explicit SAE control R.  mask forms
   merge from W; maskz forms zero. */
#define _mm512_cvtt_roundph_epi16(A, R)                                        \
  ((__m512i)__builtin_ia32_vcvttph2w512_mask(                                  \
      (__v32hf)(A), (__v32hi)_mm512_undefined_epi32(), (__mmask32)(-1),        \
      (int)(R)))

#define _mm512_mask_cvtt_roundph_epi16(W, U, A, R)                             \
  ((__m512i)__builtin_ia32_vcvttph2w512_mask((__v32hf)(A), (__v32hi)(W),       \
                                             (__mmask32)(U), (int)(R)))

#define _mm512_maskz_cvtt_roundph_epi16(U, A, R)                               \
  ((__m512i)__builtin_ia32_vcvttph2w512_mask((__v32hf)(A),                     \
                                             (__v32hi)_mm512_setzero_epi32(),  \
                                             (__mmask32)(U), (int)(R)))

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvttph_epi16(__m512h __A) {
  return (__m512i)__builtin_ia32_vcvttph2w512_mask(
      (__v32hf)__A, (__v32hi)_mm512_setzero_epi32(), (__mmask32)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvttph_epi16(__m512i __W, __mmask32 __U, __m512h __A) {
  return (__m512i)__builtin_ia32_vcvttph2w512_mask(
      (__v32hf)__A, (__v32hi)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvttph_epi16(__mmask32 __U, __m512h __A) {
  return (__m512i)__builtin_ia32_vcvttph2w512_mask(
      (__v32hf)__A, (__v32hi)_mm512_setzero_epi32(), (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
}
1740
/* vcvtw2ph: convert 32 packed signed 16-bit ints to 32 packed fp16.
   _round_ macros take an explicit rounding control R; the functions use the
   current MXCSR rounding mode.  mask forms merge from W; maskz forms zero. */
#define _mm512_cvt_roundepi16_ph(A, R)                                         \
  ((__m512h)__builtin_ia32_vcvtw2ph512_mask((__v32hi)(A),                      \
                                            (__v32hf)_mm512_undefined_ph(),    \
                                            (__mmask32)(-1), (int)(R)))

#define _mm512_mask_cvt_roundepi16_ph(W, U, A, R)                              \
  ((__m512h)__builtin_ia32_vcvtw2ph512_mask((__v32hi)(A), (__v32hf)(W),        \
                                            (__mmask32)(U), (int)(R)))

#define _mm512_maskz_cvt_roundepi16_ph(U, A, R)                                \
  ((__m512h)__builtin_ia32_vcvtw2ph512_mask(                                   \
      (__v32hi)(A), (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R)))

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_cvtepi16_ph(__m512i __A) {
  return (__m512h)__builtin_ia32_vcvtw2ph512_mask(
      (__v32hi)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepi16_ph(__m512h __W, __mmask32 __U, __m512i __A) {
  return (__m512h)__builtin_ia32_vcvtw2ph512_mask(
      (__v32hi)__A, (__v32hf)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtepi16_ph(__mmask32 __U, __m512i __A) {
  return (__m512h)__builtin_ia32_vcvtw2ph512_mask(
      (__v32hi)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
}
1773
/* vcvtph2uw: convert 32 packed fp16 to 32 packed unsigned 16-bit ints.
   _round_ macros take an explicit rounding control R; the functions use the
   current MXCSR rounding mode.  mask forms merge from W; maskz forms zero. */
#define _mm512_cvt_roundph_epu16(A, R)                                         \
  ((__m512i)__builtin_ia32_vcvtph2uw512_mask(                                  \
      (__v32hf)(A), (__v32hu)_mm512_undefined_epi32(), (__mmask32)(-1),        \
      (int)(R)))

#define _mm512_mask_cvt_roundph_epu16(W, U, A, R)                              \
  ((__m512i)__builtin_ia32_vcvtph2uw512_mask((__v32hf)(A), (__v32hu)(W),       \
                                             (__mmask32)(U), (int)(R)))

#define _mm512_maskz_cvt_roundph_epu16(U, A, R)                                \
  ((__m512i)__builtin_ia32_vcvtph2uw512_mask((__v32hf)(A),                     \
                                             (__v32hu)_mm512_setzero_epi32(),  \
                                             (__mmask32)(U), (int)(R)))

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvtph_epu16(__m512h __A) {
  return (__m512i)__builtin_ia32_vcvtph2uw512_mask(
      (__v32hf)__A, (__v32hu)_mm512_setzero_epi32(), (__mmask32)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtph_epu16(__m512i __W, __mmask32 __U, __m512h __A) {
  return (__m512i)__builtin_ia32_vcvtph2uw512_mask(
      (__v32hf)__A, (__v32hu)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtph_epu16(__mmask32 __U, __m512h __A) {
  return (__m512i)__builtin_ia32_vcvtph2uw512_mask(
      (__v32hf)__A, (__v32hu)_mm512_setzero_epi32(), (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
}
1807
/* vcvttph2uw: convert 32 packed fp16 to 32 packed unsigned 16-bit ints with
   truncation.  _round_ forms take an explicit SAE control R.  mask forms
   merge from W; maskz forms zero. */
#define _mm512_cvtt_roundph_epu16(A, R)                                        \
  ((__m512i)__builtin_ia32_vcvttph2uw512_mask(                                 \
      (__v32hf)(A), (__v32hu)_mm512_undefined_epi32(), (__mmask32)(-1),        \
      (int)(R)))

#define _mm512_mask_cvtt_roundph_epu16(W, U, A, R)                             \
  ((__m512i)__builtin_ia32_vcvttph2uw512_mask((__v32hf)(A), (__v32hu)(W),      \
                                              (__mmask32)(U), (int)(R)))

#define _mm512_maskz_cvtt_roundph_epu16(U, A, R)                               \
  ((__m512i)__builtin_ia32_vcvttph2uw512_mask((__v32hf)(A),                    \
                                              (__v32hu)_mm512_setzero_epi32(), \
                                              (__mmask32)(U), (int)(R)))

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvttph_epu16(__m512h __A) {
  return (__m512i)__builtin_ia32_vcvttph2uw512_mask(
      (__v32hf)__A, (__v32hu)_mm512_setzero_epi32(), (__mmask32)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvttph_epu16(__m512i __W, __mmask32 __U, __m512h __A) {
  return (__m512i)__builtin_ia32_vcvttph2uw512_mask(
      (__v32hf)__A, (__v32hu)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvttph_epu16(__mmask32 __U, __m512h __A) {
  return (__m512i)__builtin_ia32_vcvttph2uw512_mask(
      (__v32hf)__A, (__v32hu)_mm512_setzero_epi32(), (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
}
1841
/* vcvtuw2ph: convert 32 packed unsigned 16-bit ints to 32 packed fp16.
   _round_ macros take an explicit rounding control R; the functions use the
   current MXCSR rounding mode.  mask forms merge from W; maskz forms zero. */
#define _mm512_cvt_roundepu16_ph(A, R)                                         \
  ((__m512h)__builtin_ia32_vcvtuw2ph512_mask((__v32hu)(A),                     \
                                             (__v32hf)_mm512_undefined_ph(),   \
                                             (__mmask32)(-1), (int)(R)))

#define _mm512_mask_cvt_roundepu16_ph(W, U, A, R)                              \
  ((__m512h)__builtin_ia32_vcvtuw2ph512_mask((__v32hu)(A), (__v32hf)(W),       \
                                             (__mmask32)(U), (int)(R)))

#define _mm512_maskz_cvt_roundepu16_ph(U, A, R)                                \
  ((__m512h)__builtin_ia32_vcvtuw2ph512_mask(                                  \
      (__v32hu)(A), (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R)))

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_cvtepu16_ph(__m512i __A) {
  return (__m512h)__builtin_ia32_vcvtuw2ph512_mask(
      (__v32hu)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)-1,
      _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepu16_ph(__m512h __W, __mmask32 __U, __m512i __A) {
  return (__m512h)__builtin_ia32_vcvtuw2ph512_mask(
      (__v32hu)__A, (__v32hf)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
}

static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtepu16_ph(__mmask32 __U, __m512i __A) {
  return (__m512h)__builtin_ia32_vcvtuw2ph512_mask(
      (__v32hu)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
}
1874
/* VCVTPH2DQ: convert 16 packed FP16 elements (a __m256h source) to 16
   signed 32-bit integers, widening 256 -> 512 bits.  R selects the
   rounding mode for the _round_ macros. */
#define _mm512_cvt_roundph_epi32(A, R)                                         \
  ((__m512i)__builtin_ia32_vcvtph2dq512_mask(                                  \
      (__v16hf)(A), (__v16si)_mm512_undefined_epi32(), (__mmask16)(-1),        \
      (int)(R)))

/* Merge-masking with rounding control: clear bits in U select from W. */
#define _mm512_mask_cvt_roundph_epi32(W, U, A, R)                              \
  ((__m512i)__builtin_ia32_vcvtph2dq512_mask((__v16hf)(A), (__v16si)(W),       \
                                             (__mmask16)(U), (int)(R)))

/* Zero-masking with rounding control: clear bits in U produce zero. */
#define _mm512_maskz_cvt_roundph_epi32(U, A, R)                                \
  ((__m512i)__builtin_ia32_vcvtph2dq512_mask((__v16hf)(A),                     \
                                             (__v16si)_mm512_setzero_epi32(),  \
                                             (__mmask16)(U), (int)(R)))

/* Convert 16 FP16 elements to signed 32-bit integers using the current
   rounding direction; all lanes enabled. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvtph_epi32(__m256h __A) {
  return (__m512i)__builtin_ia32_vcvtph2dq512_mask(
      (__v16hf)__A, (__v16si)_mm512_setzero_epi32(), (__mmask16)-1,
      _MM_FROUND_CUR_DIRECTION);
}

/* Merge-masking variant: lanes with a clear bit in __U come from __W. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtph_epi32(__m512i __W, __mmask16 __U, __m256h __A) {
  return (__m512i)__builtin_ia32_vcvtph2dq512_mask(
      (__v16hf)__A, (__v16si)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
}

/* Zero-masking variant: lanes with a clear bit in __U are zeroed. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtph_epi32(__mmask16 __U, __m256h __A) {
  return (__m512i)__builtin_ia32_vcvtph2dq512_mask(
      (__v16hf)__A, (__v16si)_mm512_setzero_epi32(), (__mmask16)__U,
      _MM_FROUND_CUR_DIRECTION);
}
1908
/* VCVTPH2UDQ: convert 16 packed FP16 elements (a __m256h source) to 16
   unsigned 32-bit integers.  R selects the rounding mode for the
   _round_ macros. */
#define _mm512_cvt_roundph_epu32(A, R)                                         \
  ((__m512i)__builtin_ia32_vcvtph2udq512_mask(                                 \
      (__v16hf)(A), (__v16su)_mm512_undefined_epi32(), (__mmask16)(-1),        \
      (int)(R)))

/* Merge-masking with rounding control: clear bits in U select from W. */
#define _mm512_mask_cvt_roundph_epu32(W, U, A, R)                              \
  ((__m512i)__builtin_ia32_vcvtph2udq512_mask((__v16hf)(A), (__v16su)(W),      \
                                              (__mmask16)(U), (int)(R)))

/* Zero-masking with rounding control: clear bits in U produce zero. */
#define _mm512_maskz_cvt_roundph_epu32(U, A, R)                                \
  ((__m512i)__builtin_ia32_vcvtph2udq512_mask((__v16hf)(A),                    \
                                              (__v16su)_mm512_setzero_epi32(), \
                                              (__mmask16)(U), (int)(R)))

/* Convert 16 FP16 elements to unsigned 32-bit integers using the
   current rounding direction; all lanes enabled. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvtph_epu32(__m256h __A) {
  return (__m512i)__builtin_ia32_vcvtph2udq512_mask(
      (__v16hf)__A, (__v16su)_mm512_setzero_epi32(), (__mmask16)-1,
      _MM_FROUND_CUR_DIRECTION);
}

/* Merge-masking variant: lanes with a clear bit in __U come from __W. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtph_epu32(__m512i __W, __mmask16 __U, __m256h __A) {
  return (__m512i)__builtin_ia32_vcvtph2udq512_mask(
      (__v16hf)__A, (__v16su)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
}

/* Zero-masking variant: lanes with a clear bit in __U are zeroed. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtph_epu32(__mmask16 __U, __m256h __A) {
  return (__m512i)__builtin_ia32_vcvtph2udq512_mask(
      (__v16hf)__A, (__v16su)_mm512_setzero_epi32(), (__mmask16)__U,
      _MM_FROUND_CUR_DIRECTION);
}
1942
/* VCVTDQ2PH: convert 16 signed 32-bit integers to 16 packed FP16,
   narrowing 512 -> 256 bits (result is __m256h).  R selects the
   rounding mode for the _round_ macros. */
#define _mm512_cvt_roundepi32_ph(A, R)                                         \
  ((__m256h)__builtin_ia32_vcvtdq2ph512_mask((__v16si)(A),                     \
                                             (__v16hf)_mm256_undefined_ph(),   \
                                             (__mmask16)(-1), (int)(R)))

/* Merge-masking with rounding control: clear bits in U select from W. */
#define _mm512_mask_cvt_roundepi32_ph(W, U, A, R)                              \
  ((__m256h)__builtin_ia32_vcvtdq2ph512_mask((__v16si)(A), (__v16hf)(W),       \
                                             (__mmask16)(U), (int)(R)))

/* Zero-masking with rounding control: clear bits in U produce zero. */
#define _mm512_maskz_cvt_roundepi32_ph(U, A, R)                                \
  ((__m256h)__builtin_ia32_vcvtdq2ph512_mask(                                  \
      (__v16si)(A), (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), (int)(R)))

/* Convert 16 signed 32-bit integers to FP16 using the current rounding
   direction; all lanes enabled. */
static __inline__ __m256h __DEFAULT_FN_ATTRS512
_mm512_cvtepi32_ph(__m512i __A) {
  return (__m256h)__builtin_ia32_vcvtdq2ph512_mask(
      (__v16si)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)-1,
      _MM_FROUND_CUR_DIRECTION);
}

/* Merge-masking variant: lanes with a clear bit in __U come from __W. */
static __inline__ __m256h __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepi32_ph(__m256h __W, __mmask16 __U, __m512i __A) {
  return (__m256h)__builtin_ia32_vcvtdq2ph512_mask(
      (__v16si)__A, (__v16hf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
}

/* Zero-masking variant: lanes with a clear bit in __U are zeroed. */
static __inline__ __m256h __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtepi32_ph(__mmask16 __U, __m512i __A) {
  return (__m256h)__builtin_ia32_vcvtdq2ph512_mask(
      (__v16si)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U,
      _MM_FROUND_CUR_DIRECTION);
}
1975
/* VCVTUDQ2PH: convert 16 unsigned 32-bit integers to 16 packed FP16,
   narrowing 512 -> 256 bits (result is __m256h).  R selects the
   rounding mode for the _round_ macros. */
#define _mm512_cvt_roundepu32_ph(A, R)                                         \
  ((__m256h)__builtin_ia32_vcvtudq2ph512_mask((__v16su)(A),                    \
                                              (__v16hf)_mm256_undefined_ph(),  \
                                              (__mmask16)(-1), (int)(R)))

/* Merge-masking with rounding control: clear bits in U select from W. */
#define _mm512_mask_cvt_roundepu32_ph(W, U, A, R)                              \
  ((__m256h)__builtin_ia32_vcvtudq2ph512_mask((__v16su)(A), (__v16hf)(W),      \
                                              (__mmask16)(U), (int)(R)))

/* Zero-masking with rounding control: clear bits in U produce zero. */
#define _mm512_maskz_cvt_roundepu32_ph(U, A, R)                                \
  ((__m256h)__builtin_ia32_vcvtudq2ph512_mask(                                 \
      (__v16su)(A), (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), (int)(R)))

/* Convert 16 unsigned 32-bit integers to FP16 using the current
   rounding direction; all lanes enabled. */
static __inline__ __m256h __DEFAULT_FN_ATTRS512
_mm512_cvtepu32_ph(__m512i __A) {
  return (__m256h)__builtin_ia32_vcvtudq2ph512_mask(
      (__v16su)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)-1,
      _MM_FROUND_CUR_DIRECTION);
}

/* Merge-masking variant: lanes with a clear bit in __U come from __W. */
static __inline__ __m256h __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepu32_ph(__m256h __W, __mmask16 __U, __m512i __A) {
  return (__m256h)__builtin_ia32_vcvtudq2ph512_mask(
      (__v16su)__A, (__v16hf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
}

/* Zero-masking variant: lanes with a clear bit in __U are zeroed. */
static __inline__ __m256h __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtepu32_ph(__mmask16 __U, __m512i __A) {
  return (__m256h)__builtin_ia32_vcvtudq2ph512_mask(
      (__v16su)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U,
      _MM_FROUND_CUR_DIRECTION);
}
2008
/* VCVTTPH2DQ: truncating conversion of 16 packed FP16 elements to 16
   signed 32-bit integers.  For "cvtt" forms R controls SAE only, since
   truncation fixes the rounding direction. */
#define _mm512_cvtt_roundph_epi32(A, R)                                        \
  ((__m512i)__builtin_ia32_vcvttph2dq512_mask(                                 \
      (__v16hf)(A), (__v16si)_mm512_undefined_epi32(), (__mmask16)(-1),        \
      (int)(R)))

/* Merge-masking: clear bits in U select from W. */
#define _mm512_mask_cvtt_roundph_epi32(W, U, A, R)                             \
  ((__m512i)__builtin_ia32_vcvttph2dq512_mask((__v16hf)(A), (__v16si)(W),      \
                                              (__mmask16)(U), (int)(R)))

/* Zero-masking: clear bits in U produce zero. */
#define _mm512_maskz_cvtt_roundph_epi32(U, A, R)                               \
  ((__m512i)__builtin_ia32_vcvttph2dq512_mask((__v16hf)(A),                    \
                                              (__v16si)_mm512_setzero_epi32(), \
                                              (__mmask16)(U), (int)(R)))

/* Convert 16 FP16 elements to signed 32-bit integers with truncation;
   all lanes enabled. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvttph_epi32(__m256h __A) {
  return (__m512i)__builtin_ia32_vcvttph2dq512_mask(
      (__v16hf)__A, (__v16si)_mm512_setzero_epi32(), (__mmask16)-1,
      _MM_FROUND_CUR_DIRECTION);
}

/* Merge-masking variant: lanes with a clear bit in __U come from __W. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvttph_epi32(__m512i __W, __mmask16 __U, __m256h __A) {
  return (__m512i)__builtin_ia32_vcvttph2dq512_mask(
      (__v16hf)__A, (__v16si)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
}

/* Zero-masking variant: lanes with a clear bit in __U are zeroed. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvttph_epi32(__mmask16 __U, __m256h __A) {
  return (__m512i)__builtin_ia32_vcvttph2dq512_mask(
      (__v16hf)__A, (__v16si)_mm512_setzero_epi32(), (__mmask16)__U,
      _MM_FROUND_CUR_DIRECTION);
}
2042
/* VCVTTPH2UDQ: truncating conversion of 16 packed FP16 elements to 16
   unsigned 32-bit integers. */
#define _mm512_cvtt_roundph_epu32(A, R)                                        \
  ((__m512i)__builtin_ia32_vcvttph2udq512_mask(                                \
      (__v16hf)(A), (__v16su)_mm512_undefined_epi32(), (__mmask16)(-1),        \
      (int)(R)))

/* Merge-masking: clear bits in U select from W. */
#define _mm512_mask_cvtt_roundph_epu32(W, U, A, R)                             \
  ((__m512i)__builtin_ia32_vcvttph2udq512_mask((__v16hf)(A), (__v16su)(W),     \
                                               (__mmask16)(U), (int)(R)))

/* Zero-masking: clear bits in U produce zero. */
#define _mm512_maskz_cvtt_roundph_epu32(U, A, R)                               \
  ((__m512i)__builtin_ia32_vcvttph2udq512_mask(                                \
      (__v16hf)(A), (__v16su)_mm512_setzero_epi32(), (__mmask16)(U),           \
      (int)(R)))

/* Convert 16 FP16 elements to unsigned 32-bit integers with truncation;
   all lanes enabled. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvttph_epu32(__m256h __A) {
  return (__m512i)__builtin_ia32_vcvttph2udq512_mask(
      (__v16hf)__A, (__v16su)_mm512_setzero_epi32(), (__mmask16)-1,
      _MM_FROUND_CUR_DIRECTION);
}

/* Merge-masking variant: lanes with a clear bit in __U come from __W. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvttph_epu32(__m512i __W, __mmask16 __U, __m256h __A) {
  return (__m512i)__builtin_ia32_vcvttph2udq512_mask(
      (__v16hf)__A, (__v16su)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
}

/* Zero-masking variant: lanes with a clear bit in __U are zeroed. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvttph_epu32(__mmask16 __U, __m256h __A) {
  return (__m512i)__builtin_ia32_vcvttph2udq512_mask(
      (__v16hf)__A, (__v16su)_mm512_setzero_epi32(), (__mmask16)__U,
      _MM_FROUND_CUR_DIRECTION);
}
2076
/* VCVTQQ2PH: convert 8 signed 64-bit integers to 8 packed FP16,
   narrowing 512 -> 128 bits (result is __m128h).  R selects the
   rounding mode for the _round_ macros. */
#define _mm512_cvt_roundepi64_ph(A, R)                                         \
  ((__m128h)__builtin_ia32_vcvtqq2ph512_mask(                                  \
      (__v8di)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R)))

/* Merge-masking with rounding control: clear bits in U select from W. */
#define _mm512_mask_cvt_roundepi64_ph(W, U, A, R)                              \
  ((__m128h)__builtin_ia32_vcvtqq2ph512_mask((__v8di)(A), (__v8hf)(W),         \
                                             (__mmask8)(U), (int)(R)))

/* Zero-masking with rounding control: clear bits in U produce zero. */
#define _mm512_maskz_cvt_roundepi64_ph(U, A, R)                                \
  ((__m128h)__builtin_ia32_vcvtqq2ph512_mask(                                  \
      (__v8di)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))

/* Convert 8 signed 64-bit integers to FP16 using the current rounding
   direction; all lanes enabled. */
static __inline__ __m128h __DEFAULT_FN_ATTRS512
_mm512_cvtepi64_ph(__m512i __A) {
  return (__m128h)__builtin_ia32_vcvtqq2ph512_mask(
      (__v8di)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

/* Merge-masking variant: lanes with a clear bit in __U come from __W. */
static __inline__ __m128h __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepi64_ph(__m128h __W, __mmask8 __U, __m512i __A) {
  return (__m128h)__builtin_ia32_vcvtqq2ph512_mask(
      (__v8di)__A, (__v8hf)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
}

/* Zero-masking variant: lanes with a clear bit in __U are zeroed. */
static __inline__ __m128h __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtepi64_ph(__mmask8 __U, __m512i __A) {
  return (__m128h)__builtin_ia32_vcvtqq2ph512_mask(
      (__v8di)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}
2108
/* VCVTPH2QQ: convert 8 packed FP16 elements (a __m128h source) to 8
   signed 64-bit integers, widening 128 -> 512 bits.  R selects the
   rounding mode for the _round_ macros. */
#define _mm512_cvt_roundph_epi64(A, R)                                         \
  ((__m512i)__builtin_ia32_vcvtph2qq512_mask((__v8hf)(A),                      \
                                             (__v8di)_mm512_undefined_epi32(), \
                                             (__mmask8)(-1), (int)(R)))

/* Merge-masking with rounding control: clear bits in U select from W. */
#define _mm512_mask_cvt_roundph_epi64(W, U, A, R)                              \
  ((__m512i)__builtin_ia32_vcvtph2qq512_mask((__v8hf)(A), (__v8di)(W),         \
                                             (__mmask8)(U), (int)(R)))

/* Zero-masking with rounding control: clear bits in U produce zero. */
#define _mm512_maskz_cvt_roundph_epi64(U, A, R)                                \
  ((__m512i)__builtin_ia32_vcvtph2qq512_mask(                                  \
      (__v8hf)(A), (__v8di)_mm512_setzero_epi32(), (__mmask8)(U), (int)(R)))

/* Convert 8 FP16 elements to signed 64-bit integers using the current
   rounding direction; all lanes enabled. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvtph_epi64(__m128h __A) {
  return (__m512i)__builtin_ia32_vcvtph2qq512_mask(
      (__v8hf)__A, (__v8di)_mm512_setzero_epi32(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

/* Merge-masking variant: lanes with a clear bit in __U come from __W. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtph_epi64(__m512i __W, __mmask8 __U, __m128h __A) {
  return (__m512i)__builtin_ia32_vcvtph2qq512_mask(
      (__v8hf)__A, (__v8di)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
}

/* Zero-masking variant: lanes with a clear bit in __U are zeroed. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtph_epi64(__mmask8 __U, __m128h __A) {
  return (__m512i)__builtin_ia32_vcvtph2qq512_mask(
      (__v8hf)__A, (__v8di)_mm512_setzero_epi32(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}
2141
/* VCVTUQQ2PH: convert 8 unsigned 64-bit integers to 8 packed FP16,
   narrowing 512 -> 128 bits (result is __m128h). */
#define _mm512_cvt_roundepu64_ph(A, R)                                         \
  ((__m128h)__builtin_ia32_vcvtuqq2ph512_mask(                                 \
      (__v8du)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R)))

/* Merge-masking with rounding control: clear bits in U select from W. */
#define _mm512_mask_cvt_roundepu64_ph(W, U, A, R)                              \
  ((__m128h)__builtin_ia32_vcvtuqq2ph512_mask((__v8du)(A), (__v8hf)(W),        \
                                              (__mmask8)(U), (int)(R)))

/* Zero-masking with rounding control: clear bits in U produce zero. */
#define _mm512_maskz_cvt_roundepu64_ph(U, A, R)                                \
  ((__m128h)__builtin_ia32_vcvtuqq2ph512_mask(                                 \
      (__v8du)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))

/* Convert 8 unsigned 64-bit integers to FP16 using the current rounding
   direction; all lanes enabled. */
static __inline__ __m128h __DEFAULT_FN_ATTRS512
_mm512_cvtepu64_ph(__m512i __A) {
  return (__m128h)__builtin_ia32_vcvtuqq2ph512_mask(
      (__v8du)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

/* Merge-masking variant: lanes with a clear bit in __U come from __W. */
static __inline__ __m128h __DEFAULT_FN_ATTRS512
_mm512_mask_cvtepu64_ph(__m128h __W, __mmask8 __U, __m512i __A) {
  return (__m128h)__builtin_ia32_vcvtuqq2ph512_mask(
      (__v8du)__A, (__v8hf)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
}

/* Zero-masking variant: lanes with a clear bit in __U are zeroed. */
static __inline__ __m128h __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtepu64_ph(__mmask8 __U, __m512i __A) {
  return (__m128h)__builtin_ia32_vcvtuqq2ph512_mask(
      (__v8du)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}
2173
/* VCVTPH2UQQ: convert 8 packed FP16 elements (a __m128h source) to 8
   unsigned 64-bit integers, widening 128 -> 512 bits. */
#define _mm512_cvt_roundph_epu64(A, R)                                         \
  ((__m512i)__builtin_ia32_vcvtph2uqq512_mask(                                 \
      (__v8hf)(A), (__v8du)_mm512_undefined_epi32(), (__mmask8)(-1),           \
      (int)(R)))

/* Merge-masking with rounding control: clear bits in U select from W. */
#define _mm512_mask_cvt_roundph_epu64(W, U, A, R)                              \
  ((__m512i)__builtin_ia32_vcvtph2uqq512_mask((__v8hf)(A), (__v8du)(W),        \
                                              (__mmask8)(U), (int)(R)))

/* Zero-masking with rounding control: clear bits in U produce zero. */
#define _mm512_maskz_cvt_roundph_epu64(U, A, R)                                \
  ((__m512i)__builtin_ia32_vcvtph2uqq512_mask(                                 \
      (__v8hf)(A), (__v8du)_mm512_setzero_epi32(), (__mmask8)(U), (int)(R)))

/* Convert 8 FP16 elements to unsigned 64-bit integers using the current
   rounding direction; all lanes enabled. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvtph_epu64(__m128h __A) {
  return (__m512i)__builtin_ia32_vcvtph2uqq512_mask(
      (__v8hf)__A, (__v8du)_mm512_setzero_epi32(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

/* Merge-masking variant: lanes with a clear bit in __U come from __W. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvtph_epu64(__m512i __W, __mmask8 __U, __m128h __A) {
  return (__m512i)__builtin_ia32_vcvtph2uqq512_mask(
      (__v8hf)__A, (__v8du)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
}

/* Zero-masking variant: lanes with a clear bit in __U are zeroed. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtph_epu64(__mmask8 __U, __m128h __A) {
  return (__m512i)__builtin_ia32_vcvtph2uqq512_mask(
      (__v8hf)__A, (__v8du)_mm512_setzero_epi32(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}
2206
/* VCVTTPH2QQ: truncating conversion of 8 packed FP16 elements to 8
   signed 64-bit integers. */
#define _mm512_cvtt_roundph_epi64(A, R)                                        \
  ((__m512i)__builtin_ia32_vcvttph2qq512_mask(                                 \
      (__v8hf)(A), (__v8di)_mm512_undefined_epi32(), (__mmask8)(-1),           \
      (int)(R)))

/* Merge-masking: clear bits in U select from W. */
#define _mm512_mask_cvtt_roundph_epi64(W, U, A, R)                             \
  ((__m512i)__builtin_ia32_vcvttph2qq512_mask((__v8hf)(A), (__v8di)(W),        \
                                              (__mmask8)(U), (int)(R)))

/* Zero-masking: clear bits in U produce zero. */
#define _mm512_maskz_cvtt_roundph_epi64(U, A, R)                               \
  ((__m512i)__builtin_ia32_vcvttph2qq512_mask(                                 \
      (__v8hf)(A), (__v8di)_mm512_setzero_epi32(), (__mmask8)(U), (int)(R)))

/* Convert 8 FP16 elements to signed 64-bit integers with truncation;
   all lanes enabled. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvttph_epi64(__m128h __A) {
  return (__m512i)__builtin_ia32_vcvttph2qq512_mask(
      (__v8hf)__A, (__v8di)_mm512_setzero_epi32(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

/* Merge-masking variant: lanes with a clear bit in __U come from __W. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvttph_epi64(__m512i __W, __mmask8 __U, __m128h __A) {
  return (__m512i)__builtin_ia32_vcvttph2qq512_mask(
      (__v8hf)__A, (__v8di)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
}

/* Zero-masking variant: lanes with a clear bit in __U are zeroed. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvttph_epi64(__mmask8 __U, __m128h __A) {
  return (__m512i)__builtin_ia32_vcvttph2qq512_mask(
      (__v8hf)__A, (__v8di)_mm512_setzero_epi32(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}
2239
/* VCVTTPH2UQQ: truncating conversion of 8 packed FP16 elements to 8
   unsigned 64-bit integers. */
#define _mm512_cvtt_roundph_epu64(A, R)                                        \
  ((__m512i)__builtin_ia32_vcvttph2uqq512_mask(                                \
      (__v8hf)(A), (__v8du)_mm512_undefined_epi32(), (__mmask8)(-1),           \
      (int)(R)))

/* Merge-masking: clear bits in U select from W. */
#define _mm512_mask_cvtt_roundph_epu64(W, U, A, R)                             \
  ((__m512i)__builtin_ia32_vcvttph2uqq512_mask((__v8hf)(A), (__v8du)(W),       \
                                               (__mmask8)(U), (int)(R)))

/* Zero-masking: clear bits in U produce zero. */
#define _mm512_maskz_cvtt_roundph_epu64(U, A, R)                               \
  ((__m512i)__builtin_ia32_vcvttph2uqq512_mask(                                \
      (__v8hf)(A), (__v8du)_mm512_setzero_epi32(), (__mmask8)(U), (int)(R)))

/* Convert 8 FP16 elements to unsigned 64-bit integers with truncation;
   all lanes enabled. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_cvttph_epu64(__m128h __A) {
  return (__m512i)__builtin_ia32_vcvttph2uqq512_mask(
      (__v8hf)__A, (__v8du)_mm512_setzero_epi32(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

/* Merge-masking variant: lanes with a clear bit in __U come from __W. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_cvttph_epu64(__m512i __W, __mmask8 __U, __m128h __A) {
  return (__m512i)__builtin_ia32_vcvttph2uqq512_mask(
      (__v8hf)__A, (__v8du)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
}

/* Zero-masking variant: lanes with a clear bit in __U are zeroed. */
static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_cvttph_epu64(__mmask8 __U, __m128h __A) {
  return (__m512i)__builtin_ia32_vcvttph2uqq512_mask(
      (__v8hf)__A, (__v8du)_mm512_setzero_epi32(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}
2272
/* Scalar conversions: convert the lowest FP16 element of A to a 32-bit
   integer.  The _round_ macros take an explicit rounding immediate R. */
#define _mm_cvt_roundsh_i32(A, R)                                              \
  ((int)__builtin_ia32_vcvtsh2si32((__v8hf)(A), (int)(R)))

/* Convert the lowest FP16 element of __A to a signed 32-bit integer
   using the current rounding direction. */
static __inline__ int __DEFAULT_FN_ATTRS128 _mm_cvtsh_i32(__m128h __A) {
  return (int)__builtin_ia32_vcvtsh2si32((__v8hf)__A, _MM_FROUND_CUR_DIRECTION);
}

/* Scalar FP16 -> unsigned 32-bit conversion with rounding control. */
#define _mm_cvt_roundsh_u32(A, R)                                              \
  ((unsigned int)__builtin_ia32_vcvtsh2usi32((__v8hf)(A), (int)(R)))

/* Convert the lowest FP16 element of __A to an unsigned 32-bit integer
   using the current rounding direction. */
static __inline__ unsigned int __DEFAULT_FN_ATTRS128
_mm_cvtsh_u32(__m128h __A) {
  return (unsigned int)__builtin_ia32_vcvtsh2usi32((__v8hf)__A,
                                                   _MM_FROUND_CUR_DIRECTION);
}
2288
/* 64-bit scalar conversions are only available in 64-bit mode. */
#ifdef __x86_64__
/* Scalar FP16 -> signed 64-bit conversion with rounding control. */
#define _mm_cvt_roundsh_i64(A, R)                                              \
  ((long long)__builtin_ia32_vcvtsh2si64((__v8hf)(A), (int)(R)))

/* Convert the lowest FP16 element of __A to a signed 64-bit integer
   using the current rounding direction. */
static __inline__ long long __DEFAULT_FN_ATTRS128 _mm_cvtsh_i64(__m128h __A) {
  return (long long)__builtin_ia32_vcvtsh2si64((__v8hf)__A,
                                               _MM_FROUND_CUR_DIRECTION);
}

/* Scalar FP16 -> unsigned 64-bit conversion with rounding control. */
#define _mm_cvt_roundsh_u64(A, R)                                              \
  ((unsigned long long)__builtin_ia32_vcvtsh2usi64((__v8hf)(A), (int)(R)))

/* Convert the lowest FP16 element of __A to an unsigned 64-bit integer
   using the current rounding direction. */
static __inline__ unsigned long long __DEFAULT_FN_ATTRS128
_mm_cvtsh_u64(__m128h __A) {
  return (unsigned long long)__builtin_ia32_vcvtsh2usi64(
      (__v8hf)__A, _MM_FROUND_CUR_DIRECTION);
}
#endif // __x86_64__
2307
/* Insert (unsigned int)B converted to FP16 into element 0 of A, with
   explicit rounding control R; upper elements of A pass through. */
#define _mm_cvt_roundu32_sh(A, B, R)                                           \
  ((__m128h)__builtin_ia32_vcvtusi2sh((__v8hf)(A), (unsigned int)(B), (int)(R)))

/* Same conversion via plain C: the assignment implicitly converts __B
   to _Float16 in the current rounding environment; elements 1..7 of __A
   are preserved. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_cvtu32_sh(__m128h __A, unsigned int __B) {
  __A[0] = __B;
  return __A;
}
2316
/* 64-bit integer -> scalar FP16 conversion; 64-bit mode only. */
#ifdef __x86_64__
/* Insert (unsigned long long)B converted to FP16 into element 0 of A,
   with explicit rounding control R. */
#define _mm_cvt_roundu64_sh(A, B, R)                                           \
  ((__m128h)__builtin_ia32_vcvtusi642sh((__v8hf)(A), (unsigned long long)(B),  \
                                        (int)(R)))

/* Same conversion via plain C: implicit conversion to _Float16 in the
   current rounding environment; elements 1..7 of __A are preserved. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_cvtu64_sh(__m128h __A, unsigned long long __B) {
  __A[0] = __B;
  return __A;
}
#endif
2328
/* Insert (int)B converted to FP16 into element 0 of A, with explicit
   rounding control R; upper elements of A pass through. */
#define _mm_cvt_roundi32_sh(A, B, R)                                           \
  ((__m128h)__builtin_ia32_vcvtsi2sh((__v8hf)(A), (int)(B), (int)(R)))

/* Same conversion via plain C: implicit conversion to _Float16 in the
   current rounding environment; elements 1..7 of __A are preserved. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvti32_sh(__m128h __A,
                                                              int __B) {
  __A[0] = __B;
  return __A;
}
2337
/* Signed 64-bit integer -> scalar FP16 conversion; 64-bit mode only. */
#ifdef __x86_64__
/* Insert (long long)B converted to FP16 into element 0 of A, with
   explicit rounding control R. */
#define _mm_cvt_roundi64_sh(A, B, R)                                           \
  ((__m128h)__builtin_ia32_vcvtsi642sh((__v8hf)(A), (long long)(B), (int)(R)))

/* Same conversion via plain C: implicit conversion to _Float16 in the
   current rounding environment; elements 1..7 of __A are preserved. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvti64_sh(__m128h __A,
                                                              long long __B) {
  __A[0] = __B;
  return __A;
}
#endif
2348
/* Truncating scalar FP16 -> signed 32-bit conversion with SAE control. */
#define _mm_cvtt_roundsh_i32(A, R)                                             \
  ((int)__builtin_ia32_vcvttsh2si32((__v8hf)(A), (int)(R)))

/* Convert the lowest FP16 element of __A to a signed 32-bit integer
   with truncation. */
static __inline__ int __DEFAULT_FN_ATTRS128 _mm_cvttsh_i32(__m128h __A) {
  return (int)__builtin_ia32_vcvttsh2si32((__v8hf)__A,
                                          _MM_FROUND_CUR_DIRECTION);
}
2356
/* Truncating scalar FP16 -> signed 64-bit conversion; 64-bit mode only. */
#ifdef __x86_64__
#define _mm_cvtt_roundsh_i64(A, R)                                             \
  ((long long)__builtin_ia32_vcvttsh2si64((__v8hf)(A), (int)(R)))

/* Convert the lowest FP16 element of __A to a signed 64-bit integer
   with truncation. */
static __inline__ long long __DEFAULT_FN_ATTRS128 _mm_cvttsh_i64(__m128h __A) {
  return (long long)__builtin_ia32_vcvttsh2si64((__v8hf)__A,
                                                _MM_FROUND_CUR_DIRECTION);
}
#endif
2366
/* Truncating scalar FP16 -> unsigned 32-bit conversion with SAE control. */
#define _mm_cvtt_roundsh_u32(A, R)                                             \
  ((unsigned int)__builtin_ia32_vcvttsh2usi32((__v8hf)(A), (int)(R)))

/* Convert the lowest FP16 element of __A to an unsigned 32-bit integer
   with truncation. */
static __inline__ unsigned int __DEFAULT_FN_ATTRS128
_mm_cvttsh_u32(__m128h __A) {
  return (unsigned int)__builtin_ia32_vcvttsh2usi32((__v8hf)__A,
                                                    _MM_FROUND_CUR_DIRECTION);
}
2375
/* Truncating scalar FP16 -> unsigned 64-bit conversion; 64-bit mode only. */
#ifdef __x86_64__
#define _mm_cvtt_roundsh_u64(A, R)                                             \
  ((unsigned long long)__builtin_ia32_vcvttsh2usi64((__v8hf)(A), (int)(R)))

/* Convert the lowest FP16 element of __A to an unsigned 64-bit integer
   with truncation. */
static __inline__ unsigned long long __DEFAULT_FN_ATTRS128
_mm_cvttsh_u64(__m128h __A) {
  return (unsigned long long)__builtin_ia32_vcvttsh2usi64(
      (__v8hf)__A, _MM_FROUND_CUR_DIRECTION);
}
#endif
2386
/* VCVTPH2PSX: convert 16 packed FP16 elements (a __m256h source) to 16
   single-precision floats, widening 256 -> 512 bits. */
#define _mm512_cvtx_roundph_ps(A, R)                                           \
  ((__m512)__builtin_ia32_vcvtph2psx512_mask((__v16hf)(A),                     \
                                             (__v16sf)_mm512_undefined_ps(),   \
                                             (__mmask16)(-1), (int)(R)))

/* Merge-masking with rounding control: clear bits in U select from W. */
#define _mm512_mask_cvtx_roundph_ps(W, U, A, R)                                \
  ((__m512)__builtin_ia32_vcvtph2psx512_mask((__v16hf)(A), (__v16sf)(W),       \
                                             (__mmask16)(U), (int)(R)))

/* Zero-masking with rounding control: clear bits in U produce zero. */
#define _mm512_maskz_cvtx_roundph_ps(U, A, R)                                  \
  ((__m512)__builtin_ia32_vcvtph2psx512_mask(                                  \
      (__v16hf)(A), (__v16sf)_mm512_setzero_ps(), (__mmask16)(U), (int)(R)))

/* Convert 16 FP16 elements to single precision; all lanes enabled. */
static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_cvtxph_ps(__m256h __A) {
  return (__m512)__builtin_ia32_vcvtph2psx512_mask(
      (__v16hf)__A, (__v16sf)_mm512_setzero_ps(), (__mmask16)-1,
      _MM_FROUND_CUR_DIRECTION);
}

/* Merge-masking variant: lanes with a clear bit in __U come from __W. */
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_mask_cvtxph_ps(__m512 __W, __mmask16 __U, __m256h __A) {
  return (__m512)__builtin_ia32_vcvtph2psx512_mask(
      (__v16hf)__A, (__v16sf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
}

/* Zero-masking variant: lanes with a clear bit in __U are zeroed. */
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtxph_ps(__mmask16 __U, __m256h __A) {
  return (__m512)__builtin_ia32_vcvtph2psx512_mask(
      (__v16hf)__A, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U,
      _MM_FROUND_CUR_DIRECTION);
}
2418
/* VCVTPS2PHX: convert 16 single-precision floats to 16 packed FP16,
   narrowing 512 -> 256 bits (result is __m256h). */
#define _mm512_cvtx_roundps_ph(A, R)                                           \
  ((__m256h)__builtin_ia32_vcvtps2phx512_mask((__v16sf)(A),                    \
                                              (__v16hf)_mm256_undefined_ph(),  \
                                              (__mmask16)(-1), (int)(R)))

/* Merge-masking with rounding control: clear bits in U select from W. */
#define _mm512_mask_cvtx_roundps_ph(W, U, A, R)                                \
  ((__m256h)__builtin_ia32_vcvtps2phx512_mask((__v16sf)(A), (__v16hf)(W),      \
                                              (__mmask16)(U), (int)(R)))

/* Zero-masking with rounding control: clear bits in U produce zero. */
#define _mm512_maskz_cvtx_roundps_ph(U, A, R)                                  \
  ((__m256h)__builtin_ia32_vcvtps2phx512_mask(                                 \
      (__v16sf)(A), (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), (int)(R)))

/* Convert 16 single-precision floats to FP16 using the current rounding
   direction; all lanes enabled. */
static __inline__ __m256h __DEFAULT_FN_ATTRS512 _mm512_cvtxps_ph(__m512 __A) {
  return (__m256h)__builtin_ia32_vcvtps2phx512_mask(
      (__v16sf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)-1,
      _MM_FROUND_CUR_DIRECTION);
}

/* Merge-masking variant: lanes with a clear bit in __U come from __W. */
static __inline__ __m256h __DEFAULT_FN_ATTRS512
_mm512_mask_cvtxps_ph(__m256h __W, __mmask16 __U, __m512 __A) {
  return (__m256h)__builtin_ia32_vcvtps2phx512_mask(
      (__v16sf)__A, (__v16hf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
}

/* Zero-masking variant: lanes with a clear bit in __U are zeroed. */
static __inline__ __m256h __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtxps_ph(__mmask16 __U, __m512 __A) {
  return (__m256h)__builtin_ia32_vcvtps2phx512_mask(
      (__v16sf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U,
      _MM_FROUND_CUR_DIRECTION);
}
2450
/* Fused multiply-add family on 32 packed FP16 elements, built on a
   single builtin: vfmaddph512_{mask,mask3,maskz}.  The fmsub/fnmadd/
   fnmsub forms are derived by negating operands (fmsub negates C;
   fnmadd/fnmsub negate a multiplicand — negating either of A or B gives
   -(A*B), which is why different variants negate different operands).
   _mask blends with A on clear mask bits, _mask3 blends with C, and
   _maskz zeroes.  R is the rounding-mode immediate. */
#define _mm512_fmadd_round_ph(A, B, C, R)                                      \
  ((__m512h)__builtin_ia32_vfmaddph512_mask(                                   \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),     \
      (__mmask32)-1, (int)(R)))

#define _mm512_mask_fmadd_round_ph(A, U, B, C, R)                              \
  ((__m512h)__builtin_ia32_vfmaddph512_mask(                                   \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),     \
      (__mmask32)(U), (int)(R)))

#define _mm512_mask3_fmadd_round_ph(A, B, C, U, R)                             \
  ((__m512h)__builtin_ia32_vfmaddph512_mask3(                                  \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),     \
      (__mmask32)(U), (int)(R)))

#define _mm512_maskz_fmadd_round_ph(U, A, B, C, R)                             \
  ((__m512h)__builtin_ia32_vfmaddph512_maskz(                                  \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),     \
      (__mmask32)(U), (int)(R)))

/* fmsub = A*B - C: implemented as fmadd with C negated. */
#define _mm512_fmsub_round_ph(A, B, C, R)                                      \
  ((__m512h)__builtin_ia32_vfmaddph512_mask(                                   \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C),    \
      (__mmask32)-1, (int)(R)))

#define _mm512_mask_fmsub_round_ph(A, U, B, C, R)                              \
  ((__m512h)__builtin_ia32_vfmaddph512_mask(                                   \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C),    \
      (__mmask32)(U), (int)(R)))

#define _mm512_maskz_fmsub_round_ph(U, A, B, C, R)                             \
  ((__m512h)__builtin_ia32_vfmaddph512_maskz(                                  \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C),    \
      (__mmask32)(U), (int)(R)))

/* fnmadd = -(A*B) + C: one multiplicand is negated (B here; A in the
   mask3/maskz forms, where A must stay intact only for _mask blending). */
#define _mm512_fnmadd_round_ph(A, B, C, R)                                     \
  ((__m512h)__builtin_ia32_vfmaddph512_mask(                                   \
      (__v32hf)(__m512h)(A), -(__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),    \
      (__mmask32)-1, (int)(R)))

#define _mm512_mask3_fnmadd_round_ph(A, B, C, U, R)                            \
  ((__m512h)__builtin_ia32_vfmaddph512_mask3(                                  \
      -(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),    \
      (__mmask32)(U), (int)(R)))

#define _mm512_maskz_fnmadd_round_ph(U, A, B, C, R)                            \
  ((__m512h)__builtin_ia32_vfmaddph512_maskz(                                  \
      -(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),    \
      (__mmask32)(U), (int)(R)))

/* fnmsub = -(A*B) - C: a multiplicand and C are both negated. */
#define _mm512_fnmsub_round_ph(A, B, C, R)                                     \
  ((__m512h)__builtin_ia32_vfmaddph512_mask(                                   \
      (__v32hf)(__m512h)(A), -(__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C),   \
      (__mmask32)-1, (int)(R)))

#define _mm512_maskz_fnmsub_round_ph(U, A, B, C, R)                            \
  ((__m512h)__builtin_ia32_vfmaddph512_maskz(                                  \
      -(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C),   \
      (__mmask32)(U), (int)(R)))
2510
/* fp16 fused multiply-add, 32 lanes: __A*__B + __C, current rounding mode. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fmadd_ph(__m512h __A,
                                                                __m512h __B,
                                                                __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B,
                                                  (__v32hf)__C, (__mmask32)-1,
                                                  _MM_FROUND_CUR_DIRECTION);
}

/* Merge-masked fmadd: lanes with clear __U bits keep __A's value. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_fmadd_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B,
                                                  (__v32hf)__C, (__mmask32)__U,
                                                  _MM_FROUND_CUR_DIRECTION);
}

/* mask3 fmadd: lanes with clear __U bits keep __C's value. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask3_fmadd_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
  return (__m512h)__builtin_ia32_vfmaddph512_mask3((__v32hf)__A, (__v32hf)__B,
                                                   (__v32hf)__C, (__mmask32)__U,
                                                   _MM_FROUND_CUR_DIRECTION);
}

/* Zero-masked fmadd: lanes with clear __U bits are zeroed. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_fmadd_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddph512_maskz((__v32hf)__A, (__v32hf)__B,
                                                   (__v32hf)__C, (__mmask32)__U,
                                                   _MM_FROUND_CUR_DIRECTION);
}
2539
/* fp16 fused multiply-subtract: __A*__B - __C (C negated into vfmaddph). */
static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fmsub_ph(__m512h __A,
                                                                __m512h __B,
                                                                __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B,
                                                  -(__v32hf)__C, (__mmask32)-1,
                                                  _MM_FROUND_CUR_DIRECTION);
}

/* Merge-masked fmsub: unselected lanes keep __A. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_fmsub_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B,
                                                  -(__v32hf)__C, (__mmask32)__U,
                                                  _MM_FROUND_CUR_DIRECTION);
}

/* Zero-masked fmsub: unselected lanes are zeroed. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_fmsub_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddph512_maskz(
      (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
}
2561
/* fp16 fused negated multiply-add: -(__A*__B) + __C. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fnmadd_ph(__m512h __A,
                                                                 __m512h __B,
                                                                 __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B,
                                                  (__v32hf)__C, (__mmask32)-1,
                                                  _MM_FROUND_CUR_DIRECTION);
}

/* mask3 fnmadd: unselected lanes keep __C. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask3_fnmadd_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
  return (__m512h)__builtin_ia32_vfmaddph512_mask3(-(__v32hf)__A, (__v32hf)__B,
                                                   (__v32hf)__C, (__mmask32)__U,
                                                   _MM_FROUND_CUR_DIRECTION);
}

/* Zero-masked fnmadd. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_fnmadd_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddph512_maskz(-(__v32hf)__A, (__v32hf)__B,
                                                   (__v32hf)__C, (__mmask32)__U,
                                                   _MM_FROUND_CUR_DIRECTION);
}
2583
/* fp16 fused negated multiply-subtract: -(__A*__B) - __C. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fnmsub_ph(__m512h __A,
                                                                 __m512h __B,
                                                                 __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B,
                                                  -(__v32hf)__C, (__mmask32)-1,
                                                  _MM_FROUND_CUR_DIRECTION);
}

/* Zero-masked fnmsub: unselected lanes are zeroed. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_fnmsub_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddph512_maskz(
      -(__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
}
2598
/* fmaddsub/fmsubadd with explicit rounding R.  fmaddsub alternates
 * subtract/add across lanes (even lanes A*B - C, odd lanes A*B + C);
 * fmsubadd is the same builtin with C negated, flipping the pattern. */

/* fmaddsub, unmasked. */
#define _mm512_fmaddsub_round_ph(A, B, C, R)                                   \
  ((__m512h)__builtin_ia32_vfmaddsubph512_mask(                                \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),     \
      (__mmask32)-1, (int)(R)))

/* fmaddsub, merge-masked (unselected lanes from A). */
#define _mm512_mask_fmaddsub_round_ph(A, U, B, C, R)                           \
  ((__m512h)__builtin_ia32_vfmaddsubph512_mask(                                \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),     \
      (__mmask32)(U), (int)(R)))

/* fmaddsub, mask3 (unselected lanes from C). */
#define _mm512_mask3_fmaddsub_round_ph(A, B, C, U, R)                          \
  ((__m512h)__builtin_ia32_vfmaddsubph512_mask3(                               \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),     \
      (__mmask32)(U), (int)(R)))

/* fmaddsub, zero-masked. */
#define _mm512_maskz_fmaddsub_round_ph(U, A, B, C, R)                          \
  ((__m512h)__builtin_ia32_vfmaddsubph512_maskz(                               \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),     \
      (__mmask32)(U), (int)(R)))

/* fmsubadd, unmasked (fmaddsub with C negated). */
#define _mm512_fmsubadd_round_ph(A, B, C, R)                                   \
  ((__m512h)__builtin_ia32_vfmaddsubph512_mask(                                \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C),    \
      (__mmask32)-1, (int)(R)))

/* fmsubadd, merge-masked. */
#define _mm512_mask_fmsubadd_round_ph(A, U, B, C, R)                           \
  ((__m512h)__builtin_ia32_vfmaddsubph512_mask(                                \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C),    \
      (__mmask32)(U), (int)(R)))

/* fmsubadd, zero-masked. */
#define _mm512_maskz_fmsubadd_round_ph(U, A, B, C, R)                          \
  ((__m512h)__builtin_ia32_vfmaddsubph512_maskz(                               \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C),    \
      (__mmask32)(U), (int)(R)))
2633
/* fmaddsub: alternating subtract/add across lanes, current rounding mode. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_fmaddsub_ph(__m512h __A, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddsubph512_mask(
      (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)-1,
      _MM_FROUND_CUR_DIRECTION);
}

/* Merge-masked fmaddsub: unselected lanes keep __A. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_fmaddsub_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddsubph512_mask(
      (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
}

/* mask3 fmaddsub: unselected lanes keep __C. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask3_fmaddsub_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
  return (__m512h)__builtin_ia32_vfmaddsubph512_mask3(
      (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
}

/* Zero-masked fmaddsub. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_fmaddsub_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddsubph512_maskz(
      (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
}

/* fmsubadd: fmaddsub with __C negated, flipping the add/sub pattern. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_fmsubadd_ph(__m512h __A, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddsubph512_mask(
      (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)-1,
      _MM_FROUND_CUR_DIRECTION);
}

/* Merge-masked fmsubadd: unselected lanes keep __A. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_fmsubadd_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddsubph512_mask(
      (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
}

/* Zero-masked fmsubadd. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_fmsubadd_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddsubph512_maskz(
      (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
}
2682
/* mask3 fmsub needs the dedicated vfmsubph builtin (cannot negate C and
 * merge from it at the same time); unselected lanes keep C. */
#define _mm512_mask3_fmsub_round_ph(A, B, C, U, R)                             \
  ((__m512h)__builtin_ia32_vfmsubph512_mask3(                                  \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),     \
      (__mmask32)(U), (int)(R)))

/* mask3 fmsub, current rounding mode. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask3_fmsub_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
  return (__m512h)__builtin_ia32_vfmsubph512_mask3((__v32hf)__A, (__v32hf)__B,
                                                   (__v32hf)__C, (__mmask32)__U,
                                                   _MM_FROUND_CUR_DIRECTION);
}

/* mask3 fmsubadd, explicit rounding (dedicated builtin for same reason). */
#define _mm512_mask3_fmsubadd_round_ph(A, B, C, U, R)                          \
  ((__m512h)__builtin_ia32_vfmsubaddph512_mask3(                               \
      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),     \
      (__mmask32)(U), (int)(R)))

/* mask3 fmsubadd, current rounding mode. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask3_fmsubadd_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
  return (__m512h)__builtin_ia32_vfmsubaddph512_mask3(
      (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U,
      _MM_FROUND_CUR_DIRECTION);
}
2706
/* Merge-masked fnmadd with explicit rounding: -(A*B) + C; unselected
 * lanes keep A (so A is not negated here — B is). */
#define _mm512_mask_fnmadd_round_ph(A, U, B, C, R)                             \
  ((__m512h)__builtin_ia32_vfmaddph512_mask(                                   \
      (__v32hf)(__m512h)(A), -(__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),    \
      (__mmask32)(U), (int)(R)))

/* Merge-masked fnmadd, current rounding mode. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_fnmadd_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B,
                                                  (__v32hf)__C, (__mmask32)__U,
                                                  _MM_FROUND_CUR_DIRECTION);
}

/* Merge-masked fnmsub with explicit rounding: -(A*B) - C. */
#define _mm512_mask_fnmsub_round_ph(A, U, B, C, R)                             \
  ((__m512h)__builtin_ia32_vfmaddph512_mask(                                   \
      (__v32hf)(__m512h)(A), -(__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C),   \
      (__mmask32)(U), (int)(R)))

/* mask3 fnmsub: vfmsubph builtin with A negated, since C must be merged
 * from and therefore cannot be negated. */
#define _mm512_mask3_fnmsub_round_ph(A, B, C, U, R)                            \
  ((__m512h)__builtin_ia32_vfmsubph512_mask3(                                  \
      -(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C),    \
      (__mmask32)(U), (int)(R)))

/* Merge-masked fnmsub, current rounding mode. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_fnmsub_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B,
                                                  -(__v32hf)__C, (__mmask32)__U,
                                                  _MM_FROUND_CUR_DIRECTION);
}

/* mask3 fnmsub, current rounding mode. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask3_fnmsub_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
  return (__m512h)__builtin_ia32_vfmsubph512_mask3(-(__v32hf)__A, (__v32hf)__B,
                                                   (__v32hf)__C, (__mmask32)__U,
                                                   _MM_FROUND_CUR_DIRECTION);
}
2742
/* Scalar fp16 fmadd: lane 0 = __W*__A + __B; upper lanes pass through
 * from the first operand. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmadd_sh(__m128h __W,
                                                             __m128h __A,
                                                             __m128h __B) {
  return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A, (__v8hf)__B,
                                       (__mmask8)-1, _MM_FROUND_CUR_DIRECTION);
}

/* Merge-masked scalar fmadd: lane 0 kept from __W if bit 0 of __U clear. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmadd_sh(__m128h __W,
                                                                  __mmask8 __U,
                                                                  __m128h __A,
                                                                  __m128h __B) {
  return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A, (__v8hf)__B,
                                       (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
}

/* Scalar fmadd with rounding immediate R. */
#define _mm_fmadd_round_sh(A, B, C, R)                                         \
  ((__m128h)__builtin_ia32_vfmaddsh3_mask(                                     \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(C),        \
      (__mmask8)-1, (int)(R)))

/* Merge-masked scalar fmadd with rounding immediate. */
#define _mm_mask_fmadd_round_sh(W, U, A, B, R)                                 \
  ((__m128h)__builtin_ia32_vfmaddsh3_mask(                                     \
      (__v8hf)(__m128h)(W), (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B),        \
      (__mmask8)(U), (int)(R)))

/* Zero-masked scalar fmadd: lane 0 zeroed if bit 0 of __U clear. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fmadd_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
  return __builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, (__v8hf)__B, (__v8hf)__C,
                                        (__mmask8)__U,
                                        _MM_FROUND_CUR_DIRECTION);
}

/* Zero-masked scalar fmadd with rounding immediate. */
#define _mm_maskz_fmadd_round_sh(U, A, B, C, R)                                \
  ((__m128h)__builtin_ia32_vfmaddsh3_maskz(                                    \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(C),        \
      (__mmask8)(U), (int)(R)))

/* mask3 scalar fmadd: lane 0 kept from __Y if bit 0 of __U clear. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask3_fmadd_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) {
  return __builtin_ia32_vfmaddsh3_mask3((__v8hf)__W, (__v8hf)__X, (__v8hf)__Y,
                                        (__mmask8)__U,
                                        _MM_FROUND_CUR_DIRECTION);
}

/* mask3 scalar fmadd with rounding immediate. */
#define _mm_mask3_fmadd_round_sh(W, X, Y, U, R)                                \
  ((__m128h)__builtin_ia32_vfmaddsh3_mask3(                                    \
      (__v8hf)(__m128h)(W), (__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y),        \
      (__mmask8)(U), (int)(R)))
2791
/* Scalar fp16 fmsub: lane 0 = __W*__A - __B (B negated into vfmaddsh3). */
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmsub_sh(__m128h __W,
                                                             __m128h __A,
                                                             __m128h __B) {
  return (__m128h)__builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A,
                                                -(__v8hf)__B, (__mmask8)-1,
                                                _MM_FROUND_CUR_DIRECTION);
}

/* Merge-masked scalar fmsub. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmsub_sh(__m128h __W,
                                                                  __mmask8 __U,
                                                                  __m128h __A,
                                                                  __m128h __B) {
  return (__m128h)__builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A,
                                                -(__v8hf)__B, (__mmask8)__U,
                                                _MM_FROUND_CUR_DIRECTION);
}

/* Scalar fmsub with rounding immediate. */
#define _mm_fmsub_round_sh(A, B, C, R)                                         \
  ((__m128h)__builtin_ia32_vfmaddsh3_mask(                                     \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), -(__v8hf)(__m128h)(C),       \
      (__mmask8)-1, (int)(R)))

/* Merge-masked scalar fmsub with rounding immediate. */
#define _mm_mask_fmsub_round_sh(W, U, A, B, R)                                 \
  ((__m128h)__builtin_ia32_vfmaddsh3_mask(                                     \
      (__v8hf)(__m128h)(W), (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B),       \
      (__mmask8)(U), (int)(R)))

/* Zero-masked scalar fmsub. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fmsub_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
  return (__m128h)__builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, (__v8hf)__B,
                                                 -(__v8hf)__C, (__mmask8)__U,
                                                 _MM_FROUND_CUR_DIRECTION);
}
2825
/* Zero-masked scalar fmsub with rounding immediate R:
 * lane 0 = (U&1) ? A[0]*B[0] - C[0] : 0; upper lanes pass through from A.
 * Fix: parenthesize R in the cast ((int)(R) rather than (int)R) so that an
 * expression argument such as `x | y` cannot bind outside the cast; this
 * matches every sibling macro in this file. */
#define _mm_maskz_fmsub_round_sh(U, A, B, C, R)                                \
  ((__m128h)__builtin_ia32_vfmaddsh3_maskz(                                    \
      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), -(__v8hf)(__m128h)(C),       \
      (__mmask8)(U), (int)(R)))
2830
/* mask3 scalar fmsub: dedicated vfmsubsh3 builtin because the merged-from
 * operand __Y is also the subtrahend and cannot be negated. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask3_fmsub_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) {
  return __builtin_ia32_vfmsubsh3_mask3((__v8hf)__W, (__v8hf)__X, (__v8hf)__Y,
                                        (__mmask8)__U,
                                        _MM_FROUND_CUR_DIRECTION);
}

/* mask3 scalar fmsub with rounding immediate. */
#define _mm_mask3_fmsub_round_sh(W, X, Y, U, R)                                \
  ((__m128h)__builtin_ia32_vfmsubsh3_mask3(                                    \
      (__v8hf)(__m128h)(W), (__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y),        \
      (__mmask8)(U), (int)(R)))
2842
/* Scalar fp16 fnmadd: lane 0 = -(__W*__A) + __B (middle operand negated). */
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fnmadd_sh(__m128h __W,
                                                              __m128h __A,
                                                              __m128h __B) {
  return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, (__v8hf)__B,
                                       (__mmask8)-1, _MM_FROUND_CUR_DIRECTION);
}

/* Merge-masked scalar fnmadd. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_fnmadd_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
  return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, (__v8hf)__B,
                                       (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
}

/* Scalar fnmadd with rounding immediate. */
#define _mm_fnmadd_round_sh(A, B, C, R)                                        \
  ((__m128h)__builtin_ia32_vfmaddsh3_mask(                                     \
      (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), (__v8hf)(__m128h)(C),       \
      (__mmask8)-1, (int)(R)))

/* Merge-masked scalar fnmadd with rounding immediate. */
#define _mm_mask_fnmadd_round_sh(W, U, A, B, R)                                \
  ((__m128h)__builtin_ia32_vfmaddsh3_mask(                                     \
      (__v8hf)(__m128h)(W), -(__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B),       \
      (__mmask8)(U), (int)(R)))

/* Zero-masked scalar fnmadd. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fnmadd_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
  return __builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, -(__v8hf)__B, (__v8hf)__C,
                                        (__mmask8)__U,
                                        _MM_FROUND_CUR_DIRECTION);
}

/* Zero-masked scalar fnmadd with rounding immediate. */
#define _mm_maskz_fnmadd_round_sh(U, A, B, C, R)                               \
  ((__m128h)__builtin_ia32_vfmaddsh3_maskz(                                    \
      (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), (__v8hf)(__m128h)(C),       \
      (__mmask8)(U), (int)(R)))

/* mask3 scalar fnmadd: lane 0 kept from __Y when unselected. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask3_fnmadd_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) {
  return __builtin_ia32_vfmaddsh3_mask3((__v8hf)__W, -(__v8hf)__X, (__v8hf)__Y,
                                        (__mmask8)__U,
                                        _MM_FROUND_CUR_DIRECTION);
}

/* mask3 scalar fnmadd with rounding immediate. */
#define _mm_mask3_fnmadd_round_sh(W, X, Y, U, R)                               \
  ((__m128h)__builtin_ia32_vfmaddsh3_mask3(                                    \
      (__v8hf)(__m128h)(W), -(__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y),       \
      (__mmask8)(U), (int)(R)))
2889
/* Scalar fp16 fnmsub: lane 0 = -(__W*__A) - __B (both A and B negated). */
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fnmsub_sh(__m128h __W,
                                                              __m128h __A,
                                                              __m128h __B) {
  return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, -(__v8hf)__B,
                                       (__mmask8)-1, _MM_FROUND_CUR_DIRECTION);
}

/* Merge-masked scalar fnmsub. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_fnmsub_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
  return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, -(__v8hf)__B,
                                       (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
}

/* Scalar fnmsub with rounding immediate. */
#define _mm_fnmsub_round_sh(A, B, C, R)                                        \
  ((__m128h)__builtin_ia32_vfmaddsh3_mask(                                     \
      (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), -(__v8hf)(__m128h)(C),      \
      (__mmask8)-1, (int)(R)))

/* Merge-masked scalar fnmsub with rounding immediate. */
#define _mm_mask_fnmsub_round_sh(W, U, A, B, R)                                \
  ((__m128h)__builtin_ia32_vfmaddsh3_mask(                                     \
      (__v8hf)(__m128h)(W), -(__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B),      \
      (__mmask8)(U), (int)(R)))

/* Zero-masked scalar fnmsub. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fnmsub_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
  return __builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, -(__v8hf)__B, -(__v8hf)__C,
                                        (__mmask8)__U,
                                        _MM_FROUND_CUR_DIRECTION);
}

/* Zero-masked scalar fnmsub with rounding immediate. */
#define _mm_maskz_fnmsub_round_sh(U, A, B, C, R)                               \
  ((__m128h)__builtin_ia32_vfmaddsh3_maskz(                                    \
      (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), -(__v8hf)(__m128h)(C),      \
      (__mmask8)(U), (int)(R)))

/* mask3 scalar fnmsub: vfmsubsh3 builtin with the multiplicand negated,
 * because the merged-from subtrahend __Y cannot be negated. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask3_fnmsub_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) {
  return __builtin_ia32_vfmsubsh3_mask3((__v8hf)__W, -(__v8hf)__X, (__v8hf)__Y,
                                        (__mmask8)__U,
                                        _MM_FROUND_CUR_DIRECTION);
}

/* mask3 scalar fnmsub with rounding immediate. */
#define _mm_mask3_fnmsub_round_sh(W, X, Y, U, R)                               \
  ((__m128h)__builtin_ia32_vfmsubsh3_mask3(                                    \
      (__v8hf)(__m128h)(W), -(__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y),       \
      (__mmask8)(U), (int)(R)))
2936
/* Scalar complex FMA with conjugation (fcmadd): one complex fp16 value per
 * operand, viewed as __v4sf because each complex number occupies 32 bits.
 * Note the mask/mask3 variants dispatch to the *_round_mask* builtins. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fcmadd_sch(__m128h __A,
                                                               __m128h __B,
                                                               __m128h __C) {
  return (__m128h)__builtin_ia32_vfcmaddcsh_mask((__v4sf)__A, (__v4sf)__B,
                                                 (__v4sf)__C, (__mmask8)-1,
                                                 _MM_FROUND_CUR_DIRECTION);
}

/* Merge-masked scalar fcmadd. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_fcmadd_sch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
  return (__m128h)__builtin_ia32_vfcmaddcsh_round_mask(
      (__v4sf)__A, (__v4sf)(__B), (__v4sf)(__C), __U, _MM_FROUND_CUR_DIRECTION);
}

/* Zero-masked scalar fcmadd. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fcmadd_sch(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
  return (__m128h)__builtin_ia32_vfcmaddcsh_maskz((__v4sf)__A, (__v4sf)__B,
                                                  (__v4sf)__C, (__mmask8)__U,
                                                  _MM_FROUND_CUR_DIRECTION);
}

/* mask3 scalar fcmadd: unselected result merged from __C. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask3_fcmadd_sch(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
  return (__m128h)__builtin_ia32_vfcmaddcsh_round_mask3(
      (__v4sf)__A, (__v4sf)__B, (__v4sf)__C, __U, _MM_FROUND_CUR_DIRECTION);
}

/* Scalar fcmadd with rounding immediate. */
#define _mm_fcmadd_round_sch(A, B, C, R)                                       \
  ((__m128h)__builtin_ia32_vfcmaddcsh_mask(                                    \
      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C),        \
      (__mmask8)-1, (int)(R)))

/* Merge-masked scalar fcmadd with rounding immediate. */
#define _mm_mask_fcmadd_round_sch(A, U, B, C, R)                               \
  ((__m128h)__builtin_ia32_vfcmaddcsh_round_mask(                              \
      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C),        \
      (__mmask8)(U), (int)(R)))

/* Zero-masked scalar fcmadd with rounding immediate. */
#define _mm_maskz_fcmadd_round_sch(U, A, B, C, R)                              \
  ((__m128h)__builtin_ia32_vfcmaddcsh_maskz(                                   \
      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C),        \
      (__mmask8)(U), (int)(R)))

/* mask3 scalar fcmadd with rounding immediate. */
#define _mm_mask3_fcmadd_round_sch(A, B, C, U, R)                              \
  ((__m128h)__builtin_ia32_vfcmaddcsh_round_mask3(                             \
      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C),        \
      (__mmask8)(U), (int)(R)))
2983
/* Scalar complex FMA without conjugation (fmadd_sch); structure mirrors
 * the fcmadd_sch family above but uses the vfmaddcsh builtins. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmadd_sch(__m128h __A,
                                                              __m128h __B,
                                                              __m128h __C) {
  return (__m128h)__builtin_ia32_vfmaddcsh_mask((__v4sf)__A, (__v4sf)__B,
                                                (__v4sf)__C, (__mmask8)-1,
                                                _MM_FROUND_CUR_DIRECTION);
}

/* Merge-masked scalar complex fmadd. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_fmadd_sch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
  return (__m128h)__builtin_ia32_vfmaddcsh_round_mask(
      (__v4sf)__A, (__v4sf)(__B), (__v4sf)(__C), __U, _MM_FROUND_CUR_DIRECTION);
}

/* Zero-masked scalar complex fmadd. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fmadd_sch(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
  return (__m128h)__builtin_ia32_vfmaddcsh_maskz((__v4sf)__A, (__v4sf)__B,
                                                 (__v4sf)__C, (__mmask8)__U,
                                                 _MM_FROUND_CUR_DIRECTION);
}

/* mask3 scalar complex fmadd: unselected result merged from __C. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask3_fmadd_sch(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
  return (__m128h)__builtin_ia32_vfmaddcsh_round_mask3(
      (__v4sf)__A, (__v4sf)__B, (__v4sf)__C, __U, _MM_FROUND_CUR_DIRECTION);
}

/* Scalar complex fmadd with rounding immediate. */
#define _mm_fmadd_round_sch(A, B, C, R)                                        \
  ((__m128h)__builtin_ia32_vfmaddcsh_mask(                                     \
      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C),        \
      (__mmask8)-1, (int)(R)))

/* Merge-masked scalar complex fmadd with rounding immediate. */
#define _mm_mask_fmadd_round_sch(A, U, B, C, R)                                \
  ((__m128h)__builtin_ia32_vfmaddcsh_round_mask(                               \
      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C),        \
      (__mmask8)(U), (int)(R)))

/* Zero-masked scalar complex fmadd with rounding immediate. */
#define _mm_maskz_fmadd_round_sch(U, A, B, C, R)                               \
  ((__m128h)__builtin_ia32_vfmaddcsh_maskz(                                    \
      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C),        \
      (__mmask8)(U), (int)(R)))

/* mask3 scalar complex fmadd with rounding immediate. */
#define _mm_mask3_fmadd_round_sch(A, B, C, U, R)                               \
  ((__m128h)__builtin_ia32_vfmaddcsh_round_mask3(                              \
      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C),        \
      (__mmask8)(U), (int)(R)))
3030
/* Scalar complex multiply with conjugation (fcmul_sch).  The third builtin
 * operand is the merge source: undefined for the unmasked form, __W for the
 * merge-masked form, zero for the zero-masked form. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fcmul_sch(__m128h __A,
                                                              __m128h __B) {
  return (__m128h)__builtin_ia32_vfcmulcsh_mask(
      (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_undefined_ph(), (__mmask8)-1,
      _MM_FROUND_CUR_DIRECTION);
}

/* Merge-masked scalar fcmul: unselected result taken from __W. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_fcmul_sch(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_vfcmulcsh_mask((__v4sf)__A, (__v4sf)__B,
                                                (__v4sf)__W, (__mmask8)__U,
                                                _MM_FROUND_CUR_DIRECTION);
}

/* Zero-masked scalar fcmul. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fcmul_sch(__mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_vfcmulcsh_mask(
      (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ph(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}

/* Scalar fcmul with rounding immediate. */
#define _mm_fcmul_round_sch(A, B, R)                                           \
  ((__m128h)__builtin_ia32_vfcmulcsh_mask(                                     \
      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B),                              \
      (__v4sf)(__m128h)_mm_undefined_ph(), (__mmask8)-1, (int)(R)))

/* Merge-masked scalar fcmul with rounding immediate. */
#define _mm_mask_fcmul_round_sch(W, U, A, B, R)                                \
  ((__m128h)__builtin_ia32_vfcmulcsh_mask(                                     \
      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(W),        \
      (__mmask8)(U), (int)(R)))

/* Zero-masked scalar fcmul with rounding immediate. */
#define _mm_maskz_fcmul_round_sch(U, A, B, R)                                  \
  ((__m128h)__builtin_ia32_vfcmulcsh_mask(                                     \
      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B),                              \
      (__v4sf)(__m128h)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
3066
3067static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmul_sch(__m128h __A,
3068                                                             __m128h __B) {
3069  return (__m128h)__builtin_ia32_vfmulcsh_mask(
3070      (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_undefined_ph(), (__mmask8)-1,
3071      _MM_FROUND_CUR_DIRECTION);
3072}
3073
3074static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmul_sch(__m128h __W,
3075                                                                  __mmask8 __U,
3076                                                                  __m128h __A,
3077                                                                  __m128h __B) {
3078  return (__m128h)__builtin_ia32_vfmulcsh_mask((__v4sf)__A, (__v4sf)__B,
3079                                               (__v4sf)__W, (__mmask8)__U,
3080                                               _MM_FROUND_CUR_DIRECTION);
3081}
3082
/* Zero-masked scalar complex multiply: where the relevant bit of __U is
   clear, the result element is zeroed (merge source is _mm_setzero_ph()). */
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fmul_sch(__mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_vfmulcsh_mask(
      (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ph(), (__mmask8)__U,
      _MM_FROUND_CUR_DIRECTION);
}
3089
/* Rounding-control variants of the scalar complex multiply (VFMULCSH): R is
   an _MM_FROUND_* constant forwarded to the builtin instead of the current
   rounding mode.  Unmasked / merge-masked / zero-masked forms follow the
   usual AVX-512 pattern (all-ones mask with undefined pass-through, merge
   from W, merge from zero, respectively). */
#define _mm_fmul_round_sch(A, B, R)                                            \
  ((__m128h)__builtin_ia32_vfmulcsh_mask(                                      \
      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B),                              \
      (__v4sf)(__m128h)_mm_undefined_ph(), (__mmask8)-1, (int)(R)))

#define _mm_mask_fmul_round_sch(W, U, A, B, R)                                 \
  ((__m128h)__builtin_ia32_vfmulcsh_mask(                                      \
      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(W),        \
      (__mmask8)(U), (int)(R)))

#define _mm_maskz_fmul_round_sch(U, A, B, R)                                   \
  ((__m128h)__builtin_ia32_vfmulcsh_mask(                                      \
      (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B),                              \
      (__v4sf)(__m128h)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
3104
/* Packed conjugate complex multiply (VFCMULCPH): multiplies each complex
   FP16 pair of __A by the complex conjugate of the corresponding pair of
   __B.  Unmasked; the all-ones mask makes the undefined pass-through
   operand irrelevant. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fcmul_pch(__m512h __A,
                                                                 __m512h __B) {
  return (__m512h)__builtin_ia32_vfcmulcph512_mask(
      (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_undefined_ph(), (__mmask16)-1,
      _MM_FROUND_CUR_DIRECTION);
}
3111
/* Merge-masked packed conjugate complex multiply: result elements whose
   mask bit in __U is clear are taken from __W. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_fcmul_pch(__m512h __W, __mmask16 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_vfcmulcph512_mask((__v16sf)__A, (__v16sf)__B,
                                                   (__v16sf)__W, (__mmask16)__U,
                                                   _MM_FROUND_CUR_DIRECTION);
}
3118
/* Zero-masked packed conjugate complex multiply: result elements whose mask
   bit in __U is clear are zeroed. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_fcmul_pch(__mmask16 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_vfcmulcph512_mask(
      (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_setzero_ph(), (__mmask16)__U,
      _MM_FROUND_CUR_DIRECTION);
}
3125
/* Rounding-control variants of the packed conjugate complex multiply; R is
   an _MM_FROUND_* constant passed through to the builtin. */
#define _mm512_fcmul_round_pch(A, B, R)                                        \
  ((__m512h)__builtin_ia32_vfcmulcph512_mask(                                  \
      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B),                            \
      (__v16sf)(__m512h)_mm512_undefined_ph(), (__mmask16)-1, (int)(R)))

#define _mm512_mask_fcmul_round_pch(W, U, A, B, R)                             \
  ((__m512h)__builtin_ia32_vfcmulcph512_mask(                                  \
      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(W),     \
      (__mmask16)(U), (int)(R)))

#define _mm512_maskz_fcmul_round_pch(U, A, B, R)                               \
  ((__m512h)__builtin_ia32_vfcmulcph512_mask(                                  \
      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B),                            \
      (__v16sf)(__m512h)_mm512_setzero_ph(), (__mmask16)(U), (int)(R)))
3140
/* Packed complex multiply (VFMULCPH, no conjugation): multiplies each of the
   16 complex FP16 pairs of __A by the corresponding pair of __B.  Unmasked;
   the all-ones mask makes the undefined pass-through operand irrelevant. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fmul_pch(__m512h __A,
                                                                __m512h __B) {
  return (__m512h)__builtin_ia32_vfmulcph512_mask(
      (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_undefined_ph(), (__mmask16)-1,
      _MM_FROUND_CUR_DIRECTION);
}
3147
/* Merge-masked packed complex multiply: result elements whose mask bit in
   __U is clear are taken from __W. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_fmul_pch(__m512h __W, __mmask16 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_vfmulcph512_mask((__v16sf)__A, (__v16sf)__B,
                                                  (__v16sf)__W, (__mmask16)__U,
                                                  _MM_FROUND_CUR_DIRECTION);
}
3154
/* Zero-masked packed complex multiply: result elements whose mask bit in
   __U is clear are zeroed. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_fmul_pch(__mmask16 __U, __m512h __A, __m512h __B) {
  return (__m512h)__builtin_ia32_vfmulcph512_mask(
      (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_setzero_ph(), (__mmask16)__U,
      _MM_FROUND_CUR_DIRECTION);
}
3161
/* Rounding-control variants of the packed complex multiply; R is an
   _MM_FROUND_* constant passed through to the builtin. */
#define _mm512_fmul_round_pch(A, B, R)                                         \
  ((__m512h)__builtin_ia32_vfmulcph512_mask(                                   \
      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B),                            \
      (__v16sf)(__m512h)_mm512_undefined_ph(), (__mmask16)-1, (int)(R)))

#define _mm512_mask_fmul_round_pch(W, U, A, B, R)                              \
  ((__m512h)__builtin_ia32_vfmulcph512_mask(                                   \
      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(W),     \
      (__mmask16)(U), (int)(R)))

#define _mm512_maskz_fmul_round_pch(U, A, B, R)                                \
  ((__m512h)__builtin_ia32_vfmulcph512_mask(                                   \
      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B),                            \
      (__v16sf)(__m512h)_mm512_setzero_ph(), (__mmask16)(U), (int)(R)))
3176
/* Packed conjugate complex fused multiply-add (VFCMADDCPH): multiplies each
   complex FP16 pair of __A by the conjugate of the corresponding pair of
   __B and accumulates into __C.  The unmasked case uses the mask3 builtin
   with an all-ones mask, so no merging occurs. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fcmadd_pch(__m512h __A,
                                                                  __m512h __B,
                                                                  __m512h __C) {
  return (__m512h)__builtin_ia32_vfcmaddcph512_mask3(
      (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)-1,
      _MM_FROUND_CUR_DIRECTION);
}
3184
/* Merge-masked conjugate complex FMA: the _mask builtin merges from the
   first operand, so elements whose bit in __U is clear keep __A's value. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_fcmadd_pch(__m512h __A, __mmask16 __U, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfcmaddcph512_mask(
      (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
      _MM_FROUND_CUR_DIRECTION);
}
3191
/* mask3 variant of the conjugate complex FMA: the _mask3 builtin merges
   from the accumulator, so elements whose bit in __U is clear keep __C's
   value. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask3_fcmadd_pch(__m512h __A, __m512h __B, __m512h __C, __mmask16 __U) {
  return (__m512h)__builtin_ia32_vfcmaddcph512_mask3(
      (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
      _MM_FROUND_CUR_DIRECTION);
}
3198
/* Zero-masked conjugate complex FMA: elements whose bit in __U is clear are
   zeroed (the _maskz builtin form). */
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_fcmadd_pch(__mmask16 __U, __m512h __A, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfcmaddcph512_maskz(
      (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
      _MM_FROUND_CUR_DIRECTION);
}
3205
/* Rounding-control variants of the conjugate complex FMA; R is an
   _MM_FROUND_* constant.  Note the unmasked form reuses the mask3 builtin
   with an all-ones mask (no merging), matching the inline functions above. */
#define _mm512_fcmadd_round_pch(A, B, C, R)                                    \
  ((__m512h)__builtin_ia32_vfcmaddcph512_mask3(                                \
      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C),     \
      (__mmask16)-1, (int)(R)))

#define _mm512_mask_fcmadd_round_pch(A, U, B, C, R)                            \
  ((__m512h)__builtin_ia32_vfcmaddcph512_mask(                                 \
      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C),     \
      (__mmask16)(U), (int)(R)))

#define _mm512_mask3_fcmadd_round_pch(A, B, C, U, R)                           \
  ((__m512h)__builtin_ia32_vfcmaddcph512_mask3(                                \
      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C),     \
      (__mmask16)(U), (int)(R)))

#define _mm512_maskz_fcmadd_round_pch(U, A, B, C, R)                           \
  ((__m512h)__builtin_ia32_vfcmaddcph512_maskz(                                \
      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C),     \
      (__mmask16)(U), (int)(R)))
3225
/* Packed complex fused multiply-add (VFMADDCPH, no conjugation):
   __A * __B + __C over 16 complex FP16 pairs.  Unmasked: uses the mask3
   builtin with an all-ones mask so no merging occurs. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fmadd_pch(__m512h __A,
                                                                 __m512h __B,
                                                                 __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddcph512_mask3((__v16sf)__A, (__v16sf)__B,
                                                    (__v16sf)__C, (__mmask16)-1,
                                                    _MM_FROUND_CUR_DIRECTION);
}
3233
/* Merge-masked complex FMA: the _mask builtin merges from the first
   operand, so elements whose bit in __U is clear keep __A's value. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_fmadd_pch(__m512h __A, __mmask16 __U, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddcph512_mask((__v16sf)__A, (__v16sf)__B,
                                                   (__v16sf)__C, (__mmask16)__U,
                                                   _MM_FROUND_CUR_DIRECTION);
}
3240
/* mask3 variant of the complex FMA: the _mask3 builtin merges from the
   accumulator, so elements whose bit in __U is clear keep __C's value. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask3_fmadd_pch(__m512h __A, __m512h __B, __m512h __C, __mmask16 __U) {
  return (__m512h)__builtin_ia32_vfmaddcph512_mask3(
      (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
      _MM_FROUND_CUR_DIRECTION);
}
3247
/* Zero-masked complex FMA: elements whose bit in __U is clear are zeroed
   (the _maskz builtin form). */
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_maskz_fmadd_pch(__mmask16 __U, __m512h __A, __m512h __B, __m512h __C) {
  return (__m512h)__builtin_ia32_vfmaddcph512_maskz(
      (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
      _MM_FROUND_CUR_DIRECTION);
}
3254
/* Rounding-control variants of the complex FMA; R is an _MM_FROUND_*
   constant.  The unmasked form reuses the mask3 builtin with an all-ones
   mask (no merging), matching the inline functions above. */
#define _mm512_fmadd_round_pch(A, B, C, R)                                     \
  ((__m512h)__builtin_ia32_vfmaddcph512_mask3(                                 \
      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C),     \
      (__mmask16)-1, (int)(R)))

#define _mm512_mask_fmadd_round_pch(A, U, B, C, R)                             \
  ((__m512h)__builtin_ia32_vfmaddcph512_mask(                                  \
      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C),     \
      (__mmask16)(U), (int)(R)))

#define _mm512_mask3_fmadd_round_pch(A, B, C, U, R)                            \
  ((__m512h)__builtin_ia32_vfmaddcph512_mask3(                                 \
      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C),     \
      (__mmask16)(U), (int)(R)))

#define _mm512_maskz_fmadd_round_pch(U, A, B, C, R)                            \
  ((__m512h)__builtin_ia32_vfmaddcph512_maskz(                                 \
      (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C),     \
      (__mmask16)(U), (int)(R)))
3274
/* Horizontal sum of all 32 FP16 lanes of __W.  -0.0f16 is the initial
   value because it is the identity element for floating-point addition. */
static __inline__ _Float16 __DEFAULT_FN_ATTRS512
_mm512_reduce_add_ph(__m512h __W) {
  return __builtin_ia32_reduce_fadd_ph512(-0.0f16, __W);
}
3279
/* Horizontal product of all 32 FP16 lanes of __W, starting from the
   multiplicative identity 1.0f16. */
static __inline__ _Float16 __DEFAULT_FN_ATTRS512
_mm512_reduce_mul_ph(__m512h __W) {
  return __builtin_ia32_reduce_fmul_ph512(1.0f16, __W);
}
3284
/* Horizontal maximum over all 32 FP16 lanes of __V. */
static __inline__ _Float16 __DEFAULT_FN_ATTRS512
_mm512_reduce_max_ph(__m512h __V) {
  return __builtin_ia32_reduce_fmax_ph512(__V);
}
3289
/* Horizontal minimum over all 32 FP16 lanes of __V. */
static __inline__ _Float16 __DEFAULT_FN_ATTRS512
_mm512_reduce_min_ph(__m512h __V) {
  return __builtin_ia32_reduce_fmin_ph512(__V);
}
3294
/* Per-lane select: result lane i comes from __W when bit i of __U is set,
   otherwise from __A.  Note the select builtin takes (mask, true-source,
   false-source), hence __W appears before __A. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_mask_blend_ph(__mmask32 __U, __m512h __A, __m512h __W) {
  return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U, (__v32hf)__W,
                                              (__v32hf)__A);
}
3300
/* Two-source permute: each 16-bit index in __I selects an FP16 element from
   the concatenation of __A and __B.  Reuses the 16-bit-integer permute
   builtin, since FP16 lanes have the same layout as short lanes. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_permutex2var_ph(__m512h __A, __m512i __I, __m512h __B) {
  return (__m512h)__builtin_ia32_vpermi2varhi512((__v32hi)__A, (__v32hi)__I,
                                                 (__v32hi)__B);
}
3306
/* Single-source permute: shuffles the FP16 elements of __B using the 16-bit
   indices in __A.  The builtin takes (data, indices), hence the swapped
   argument order relative to the intrinsic's parameters. */
static __inline__ __m512h __DEFAULT_FN_ATTRS512
_mm512_permutexvar_ph(__m512i __A, __m512h __B) {
  return (__m512h)__builtin_ia32_permvarhi512((__v32hi)__B, (__v32hi)__A);
}
3311
// The mul/cmul intrinsics below are plain aliases for the fmul/fcmul
// intrinsics defined above ("cmul" maps to the conjugating fcmul forms);
// Intel documents both spellings.
#define _mm512_mul_pch(A, B) _mm512_fmul_pch(A, B)
#define _mm512_mask_mul_pch(W, U, A, B) _mm512_mask_fmul_pch(W, U, A, B)
#define _mm512_maskz_mul_pch(U, A, B) _mm512_maskz_fmul_pch(U, A, B)
#define _mm512_mul_round_pch(A, B, R) _mm512_fmul_round_pch(A, B, R)
#define _mm512_mask_mul_round_pch(W, U, A, B, R)                               \
  _mm512_mask_fmul_round_pch(W, U, A, B, R)
#define _mm512_maskz_mul_round_pch(U, A, B, R)                                 \
  _mm512_maskz_fmul_round_pch(U, A, B, R)

#define _mm512_cmul_pch(A, B) _mm512_fcmul_pch(A, B)
#define _mm512_mask_cmul_pch(W, U, A, B) _mm512_mask_fcmul_pch(W, U, A, B)
#define _mm512_maskz_cmul_pch(U, A, B) _mm512_maskz_fcmul_pch(U, A, B)
#define _mm512_cmul_round_pch(A, B, R) _mm512_fcmul_round_pch(A, B, R)
#define _mm512_mask_cmul_round_pch(W, U, A, B, R)                              \
  _mm512_mask_fcmul_round_pch(W, U, A, B, R)
#define _mm512_maskz_cmul_round_pch(U, A, B, R)                                \
  _mm512_maskz_fcmul_round_pch(U, A, B, R)

#define _mm_mul_sch(A, B) _mm_fmul_sch(A, B)
#define _mm_mask_mul_sch(W, U, A, B) _mm_mask_fmul_sch(W, U, A, B)
#define _mm_maskz_mul_sch(U, A, B) _mm_maskz_fmul_sch(U, A, B)
#define _mm_mul_round_sch(A, B, R) _mm_fmul_round_sch(A, B, R)
#define _mm_mask_mul_round_sch(W, U, A, B, R)                                  \
  _mm_mask_fmul_round_sch(W, U, A, B, R)
#define _mm_maskz_mul_round_sch(U, A, B, R) _mm_maskz_fmul_round_sch(U, A, B, R)

#define _mm_cmul_sch(A, B) _mm_fcmul_sch(A, B)
#define _mm_mask_cmul_sch(W, U, A, B) _mm_mask_fcmul_sch(W, U, A, B)
#define _mm_maskz_cmul_sch(U, A, B) _mm_maskz_fcmul_sch(U, A, B)
#define _mm_cmul_round_sch(A, B, R) _mm_fcmul_round_sch(A, B, R)
#define _mm_mask_cmul_round_sch(W, U, A, B, R)                                 \
  _mm_mask_fcmul_round_sch(W, U, A, B, R)
#define _mm_maskz_cmul_round_sch(U, A, B, R)                                   \
  _mm_maskz_fcmul_round_sch(U, A, B, R)
3347
3348#undef __DEFAULT_FN_ATTRS128
3349#undef __DEFAULT_FN_ATTRS256
3350#undef __DEFAULT_FN_ATTRS512
3351
3352#endif
3353#endif