   1/*===-------------- avx10_2bf16intrin.h - AVX10-BF16 intrinsics ------------===
   2 *
   3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4 * See https://llvm.org/LICENSE.txt for license information.
   5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6 *
   7 *===-----------------------------------------------------------------------===
   8 */
   9#ifndef __IMMINTRIN_H
  10#error                                                                         \
  11    "Never use <avx10_2bf16intrin.h> directly; include <immintrin.h> instead."
  12#endif
  13
  14#ifdef __SSE2__
  15
  16#ifndef __AVX10_2BF16INTRIN_H
  17#define __AVX10_2BF16INTRIN_H
  18
/* Unaligned 128-/256-bit vectors of __bf16, used by the unaligned
   load/store intrinsics (_mm*_loadu_pbh / _mm*_storeu_pbh) below. */
typedef __bf16 __m128bh_u __attribute__((__vector_size__(16), __aligned__(1)));
typedef __bf16 __m256bh_u __attribute__((__vector_size__(32), __aligned__(1)));

/* Define the default attributes for the functions in this file.
   Every intrinsic here requires the avx10.2-256 target feature. */
#define __DEFAULT_FN_ATTRS256                                                  \
  __attribute__((__always_inline__, __nodebug__, __target__("avx10.2-256"),    \
                 __min_vector_width__(256)))
#define __DEFAULT_FN_ATTRS128                                                  \
  __attribute__((__always_inline__, __nodebug__, __target__("avx10.2-256"),    \
                 __min_vector_width__(128)))
  29
/* Return a 256-bit bf16 vector with all elements zero. */
static __inline __m256bh __DEFAULT_FN_ATTRS256 _mm256_setzero_pbh(void) {
  return __builtin_bit_cast(__m256bh, _mm256_setzero_ps());
}

/* Return a 128-bit bf16 vector with all elements zero. */
static __inline __m128bh __DEFAULT_FN_ATTRS128 _mm_setzero_pbh(void) {
  return __builtin_bit_cast(__m128bh, _mm_setzero_ps());
}

/* Bit-casts between bf16 vectors and float/double/integer vectors of the
   same width.  These only reinterpret the bits; no value conversion. */
static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_castbf16_ps(__m128bh __a) {
  return (__m128)__a;
}

static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_castbf16_ps(__m256bh __a) {
  return (__m256)__a;
}

static __inline__ __m256d __DEFAULT_FN_ATTRS256
_mm256_castbf16_pd(__m256bh __a) {
  return (__m256d)__a;
}

static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_castbf16_pd(__m128bh __a) {
  return (__m128d)__a;
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_castbf16_si128(__m128bh __a) {
  return (__m128i)__a;
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_castbf16_si256(__m256bh __a) {
  return (__m256i)__a;
}

static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_castps_pbh(__m128 __a) {
  return (__m128bh)__a;
}

static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_castps_pbh(__m256 __a) {
  return (__m256bh)__a;
}

/* Extract the lowest bf16 element of the vector. */
static __inline__ __bf16 __DEFAULT_FN_ATTRS128 _mm_cvtsbh_bf16(__m128bh __a) {
  return __a[0];
}

static __inline__ __bf16 __DEFAULT_FN_ATTRS256
_mm256_cvtsbh_bf16(__m256bh __a) {
  return __a[0];
}

static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_castpd_pbh(__m128d __a) {
  return (__m128bh)__a;
}

static __inline__ __m256bh __DEFAULT_FN_ATTRS256
_mm256_castpd_pbh(__m256d __a) {
  return (__m256bh)__a;
}

static __inline__ __m128bh __DEFAULT_FN_ATTRS128
_mm_castsi128_pbh(__m128i __a) {
  return (__m128bh)__a;
}

static __inline__ __m256bh __DEFAULT_FN_ATTRS256
_mm256_castsi256_pbh(__m256i __a) {
  return (__m256bh)__a;
}

/* Return the lower 128 bits (elements 0..7) of a 256-bit bf16 vector. */
static __inline__ __m128bh __DEFAULT_FN_ATTRS256
_mm256_castbf16256_pbh128(__m256bh __a) {
  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7);
}

/* Widen a 128-bit bf16 vector to 256 bits; the upper 8 elements are
   undefined (shuffle index -1 == don't care). */
static __inline__ __m256bh __DEFAULT_FN_ATTRS256
_mm256_castbf16128_pbh256(__m128bh __a) {
  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, -1, -1, -1,
                                 -1, -1, -1, -1, -1);
}

/* Widen a 128-bit bf16 vector to 256 bits; the upper 8 elements are
   zeroed (indices 8..15 select from the zero vector). */
static __inline__ __m256bh __DEFAULT_FN_ATTRS256
_mm256_zextbf16128_pbh256(__m128bh __a) {
  return __builtin_shufflevector(__a, (__v8bf)_mm_setzero_pbh(), 0, 1, 2, 3, 4,
                                 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
}

/* Return a 256-bit bf16 vector with undefined (uninitialized) contents. */
static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_undefined_pbh(void) {
  return (__m256bh)__builtin_ia32_undef256();
}
 122
/* Load a single bf16 from memory into element 0; remaining elements are
   taken from a zero vector (mask is hard-wired to 1, so element 0 always
   loads). */
static __inline__ __m128bh __DEFAULT_FN_ATTRS128
_mm_load_sbh(void const *__dp) {
  __m128bh src = (__v8bf)_mm_setzero_pbh();
  return (__m128bh)__builtin_ia32_loadsbf16128_mask((const __v8bf *)__dp, src,
                                                    1);
}

/* Masked scalar load: if bit 0 of __U is set, element 0 comes from memory,
   otherwise from __W[0]; elements 1..7 are zeroed (the shuffle builds a
   passthrough of {__W[0], 0, 0, ...}). */
static __inline__ __m128bh __DEFAULT_FN_ATTRS128
_mm_mask_load_sbh(__m128bh __W, __mmask8 __U, const void *__A) {
  __m128bh src = (__v8bf)__builtin_shufflevector(
      (__v8bf)__W, (__v8bf)_mm_setzero_pbh(), 0, 8, 8, 8, 8, 8, 8, 8);

  return (__m128bh)__builtin_ia32_loadsbf16128_mask((const __v8bf *)__A, src,
                                                    __U & 1);
}

/* Zero-masked scalar load: element 0 loads from memory if bit 0 of __U is
   set, otherwise it is zero; elements 1..7 are zero. */
static __inline__ __m128bh __DEFAULT_FN_ATTRS128
_mm_maskz_load_sbh(__mmask8 __U, const void *__A) {
  return (__m128bh)__builtin_ia32_loadsbf16128_mask(
      (const __v8bf *)__A, (__v8bf)_mm_setzero_pbh(), __U & 1);
}

/* Aligned 256-bit load of 16 bf16 elements. */
static __inline__ __m256bh __DEFAULT_FN_ATTRS256
_mm256_load_pbh(void const *__p) {
  return *(const __m256bh *)__p;
}

/* Aligned 128-bit load of 8 bf16 elements. */
static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_load_pbh(void const *__p) {
  return *(const __m128bh *)__p;
}

/* Unaligned 256-bit load; the packed/may_alias struct allows any alignment
   and aliasing without UB. */
static __inline__ __m256bh __DEFAULT_FN_ATTRS256
_mm256_loadu_pbh(void const *__p) {
  struct __loadu_pbh {
    __m256bh_u __v;
  } __attribute__((__packed__, __may_alias__));
  return ((const struct __loadu_pbh *)__p)->__v;
}

/* Unaligned 128-bit load. */
static __inline__ __m128bh __DEFAULT_FN_ATTRS128
_mm_loadu_pbh(void const *__p) {
  struct __loadu_pbh {
    __m128bh_u __v;
  } __attribute__((__packed__, __may_alias__));
  return ((const struct __loadu_pbh *)__p)->__v;
}
 169
/* Store the lowest bf16 element of __a to unaligned memory. */
static __inline__ void __DEFAULT_FN_ATTRS128 _mm_store_sbh(void *__dp,
                                                           __m128bh __a) {
  struct __mm_store_sbh_struct {
    __bf16 __u;
  } __attribute__((__packed__, __may_alias__));
  ((struct __mm_store_sbh_struct *)__dp)->__u = __a[0];
}

/* Masked scalar store: element 0 of __A is written only if bit 0 of __U is
   set. */
static __inline__ void __DEFAULT_FN_ATTRS128 _mm_mask_store_sbh(void *__W,
                                                                __mmask8 __U,
                                                                __m128bh __A) {
  __builtin_ia32_storesbf16128_mask((__v8bf *)__W, __A, __U & 1);
}

/* Aligned 256-bit store of 16 bf16 elements. */
static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_store_pbh(void *__P,
                                                              __m256bh __A) {
  *(__m256bh *)__P = __A;
}

/* Aligned 128-bit store of 8 bf16 elements. */
static __inline__ void __DEFAULT_FN_ATTRS128 _mm_store_pbh(void *__P,
                                                           __m128bh __A) {
  *(__m128bh *)__P = __A;
}

/* Unaligned 256-bit store; the packed/may_alias struct allows any alignment
   and aliasing without UB. */
static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_storeu_pbh(void *__P,
                                                               __m256bh __A) {
  struct __storeu_pbh {
    __m256bh_u __v;
  } __attribute__((__packed__, __may_alias__));
  ((struct __storeu_pbh *)__P)->__v = __A;
}

/* Unaligned 128-bit store. */
static __inline__ void __DEFAULT_FN_ATTRS128 _mm_storeu_pbh(void *__P,
                                                            __m128bh __A) {
  struct __storeu_pbh {
    __m128bh_u __v;
  } __attribute__((__packed__, __may_alias__));
  ((struct __storeu_pbh *)__P)->__v = __A;
}

/* Return __a with element 0 replaced by __b[0]; elements 1..7 keep __a's
   values. */
static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_move_sbh(__m128bh __a,
                                                              __m128bh __b) {
  __a[0] = __b[0];
  return __a;
}

/* Masked move: per bit 0 of __U, select between the moved result and __W. */
static __inline__ __m128bh __DEFAULT_FN_ATTRS128
_mm_mask_move_sbh(__m128bh __W, __mmask8 __U, __m128bh __A, __m128bh __B) {
  return __builtin_ia32_selectsbf_128(__U, _mm_move_sbh(__A, __B), __W);
}

/* Zero-masked move: masked-off element 0 becomes zero. */
static __inline__ __m128bh __DEFAULT_FN_ATTRS128
_mm_maskz_move_sbh(__mmask8 __U, __m128bh __A, __m128bh __B) {
  return __builtin_ia32_selectsbf_128(__U, _mm_move_sbh(__A, __B),
                                      _mm_setzero_pbh());
}

/* Return a 128-bit bf16 vector with undefined (uninitialized) contents. */
static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_undefined_pbh(void) {
  return (__m128bh)__builtin_ia32_undef128();
}
 230
/* Build a vector with bf in element 0 and zero in elements 1..7 (the
   shuffle picks index 0 from the splat and index 8 — the zero vector —
   for the rest). */
static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_set_sbh(__bf16 bf) {
  return (__v8bf)__builtin_shufflevector(
      (__v8bf){bf, bf, bf, bf, bf, bf, bf, bf}, (__v8bf)_mm_setzero_pbh(), 0, 8,
      8, 8, 8, 8, 8, 8);
}

/* Broadcast bf to all 8 elements. */
static __inline __m128bh __DEFAULT_FN_ATTRS128 _mm_set1_pbh(__bf16 bf) {
  return (__m128bh)(__v8bf){bf, bf, bf, bf, bf, bf, bf, bf};
}

/* Broadcast bf to all 16 elements. */
static __inline __m256bh __DEFAULT_FN_ATTRS256 _mm256_set1_pbh(__bf16 bf) {
  return (__m256bh)(__v16bf){bf, bf, bf, bf, bf, bf, bf, bf,
                             bf, bf, bf, bf, bf, bf, bf, bf};
}

/* Build a vector from 8 scalars; bf1 lands in element 0 (lowest). */
static __inline __m128bh __DEFAULT_FN_ATTRS128
_mm_set_pbh(__bf16 bf1, __bf16 bf2, __bf16 bf3, __bf16 bf4, __bf16 bf5,
            __bf16 bf6, __bf16 bf7, __bf16 bf8) {
  return (__m128bh)(__v8bf){bf1, bf2, bf3, bf4, bf5, bf6, bf7, bf8};
}

/* Build a vector from 16 scalars; bf1 lands in element 0 (lowest). */
static __inline __m256bh __DEFAULT_FN_ATTRS256 _mm256_set_pbh(
    __bf16 bf1, __bf16 bf2, __bf16 bf3, __bf16 bf4, __bf16 bf5, __bf16 bf6,
    __bf16 bf7, __bf16 bf8, __bf16 bf9, __bf16 bf10, __bf16 bf11, __bf16 bf12,
    __bf16 bf13, __bf16 bf14, __bf16 bf15, __bf16 bf16) {
  return (__m256bh)(__v16bf){bf1, bf2,  bf3,  bf4,  bf5,  bf6,  bf7,  bf8,
                             bf9, bf10, bf11, bf12, bf13, bf14, bf15, bf16};
}

/* "Reversed" set: forwards the arguments to _mm*_set_pbh in reverse order,
   so bf1 lands in the highest element. */
#define _mm_setr_pbh(bf1, bf2, bf3, bf4, bf5, bf6, bf7, bf8)                   \
  _mm_set_pbh((bf8), (bf7), (bf6), (bf5), (bf4), (bf3), (bf2), (bf1))

#define _mm256_setr_pbh(bf1, bf2, bf3, bf4, bf5, bf6, bf7, bf8, bf9, bf10,     \
                        bf11, bf12, bf13, bf14, bf15, bf16)                    \
  _mm256_set_pbh((bf16), (bf15), (bf14), (bf13), (bf12), (bf11), (bf10),       \
                 (bf9), (bf8), (bf7), (bf6), (bf5), (bf4), (bf3), (bf2),       \
                 (bf1))
 268
/* Absolute value: clear the sign bit of every 16-bit bf16 lane by ANDing
   with 0x7FFF (the 32-bit constant 0x7FFF7FFF covers two lanes). */
static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_abs_pbh(__m256bh __A) {
  return (__m256bh)_mm256_and_epi32(_mm256_set1_epi32(0x7FFF7FFF),
                                    (__m256i)__A);
}

static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_abs_pbh(__m128bh __A) {
  return (__m128bh)_mm_and_epi32(_mm_set1_epi32(0x7FFF7FFF), (__m128i)__A);
}

/* Per-element blend: element i is __W[i] if bit i of __U is set, else
   __A[i]. */
static __inline__ __m128bh __DEFAULT_FN_ATTRS128
_mm_mask_blend_pbh(__mmask8 __U, __m128bh __A, __m128bh __W) {
  return (__m128bh)__builtin_ia32_selectpbf_128((__mmask8)__U, (__v8bf)__W,
                                                (__v8bf)__A);
}

static __inline__ __m256bh __DEFAULT_FN_ATTRS256
_mm256_mask_blend_pbh(__mmask16 __U, __m256bh __A, __m256bh __W) {
  return (__m256bh)__builtin_ia32_selectpbf_256((__mmask16)__U, (__v16bf)__W,
                                                (__v16bf)__A);
}

/* Two-source permute: select elements from the concatenation of __A and
   __B using the indices in __I (reuses the 16-bit integer permute
   builtins since bf16 lanes are 16 bits). */
static __inline__ __m128bh __DEFAULT_FN_ATTRS128
_mm_permutex2var_pbh(__m128bh __A, __m128i __I, __m128bh __B) {
  return (__m128bh)__builtin_ia32_vpermi2varhi128((__v8hi)__A, (__v8hi)__I,
                                                  (__v8hi)__B);
}

static __inline__ __m256bh __DEFAULT_FN_ATTRS256
_mm256_permutex2var_pbh(__m256bh __A, __m256i __I, __m256bh __B) {
  return (__m256bh)__builtin_ia32_vpermi2varhi256((__v16hi)__A, (__v16hi)__I,
                                                  (__v16hi)__B);
}

/* Single-source permute of __B by the indices in __A. */
static __inline__ __m128bh __DEFAULT_FN_ATTRS128
_mm_permutexvar_pbh(__m128i __A, __m128bh __B) {
  return (__m128bh)__builtin_ia32_permvarhi128((__v8hi)__B, (__v8hi)__A);
}

static __inline__ __m256bh __DEFAULT_FN_ATTRS256
_mm256_permutexvar_pbh(__m256i __A, __m256bh __B) {
  return (__m256bh)__builtin_ia32_permvarhi256((__v16hi)__B, (__v16hi)__A);
}
 311
/* Element-wise bf16 addition.  The mask_/maskz_ variants select, per mask
   bit, between the computed result and __W (merge) or zero (zeroing). */
static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_add_pbh(__m256bh __A,
                                                                __m256bh __B) {
  return (__m256bh)((__v16bf)__A + (__v16bf)__B);
}

static __inline__ __m256bh __DEFAULT_FN_ATTRS256
_mm256_mask_add_pbh(__m256bh __W, __mmask16 __U, __m256bh __A, __m256bh __B) {
  return (__m256bh)__builtin_ia32_selectpbf_256(
      (__mmask16)__U, (__v16bf)_mm256_add_pbh(__A, __B), (__v16bf)__W);
}

static __inline__ __m256bh __DEFAULT_FN_ATTRS256
_mm256_maskz_add_pbh(__mmask16 __U, __m256bh __A, __m256bh __B) {
  return (__m256bh)__builtin_ia32_selectpbf_256(
      (__mmask16)__U, (__v16bf)_mm256_add_pbh(__A, __B),
      (__v16bf)_mm256_setzero_pbh());
}

static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_add_pbh(__m128bh __A,
                                                             __m128bh __B) {
  return (__m128bh)((__v8bf)__A + (__v8bf)__B);
}

static __inline__ __m128bh __DEFAULT_FN_ATTRS128
_mm_mask_add_pbh(__m128bh __W, __mmask8 __U, __m128bh __A, __m128bh __B) {
  return (__m128bh)__builtin_ia32_selectpbf_128(
      (__mmask8)__U, (__v8bf)_mm_add_pbh(__A, __B), (__v8bf)__W);
}

static __inline__ __m128bh __DEFAULT_FN_ATTRS128
_mm_maskz_add_pbh(__mmask8 __U, __m128bh __A, __m128bh __B) {
  return (__m128bh)__builtin_ia32_selectpbf_128(
      (__mmask8)__U, (__v8bf)_mm_add_pbh(__A, __B), (__v8bf)_mm_setzero_pbh());
}

/* Element-wise bf16 subtraction (__A - __B), plus masked variants. */
static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_sub_pbh(__m256bh __A,
                                                                __m256bh __B) {
  return (__m256bh)((__v16bf)__A - (__v16bf)__B);
}

static __inline__ __m256bh __DEFAULT_FN_ATTRS256
_mm256_mask_sub_pbh(__m256bh __W, __mmask16 __U, __m256bh __A, __m256bh __B) {
  return (__m256bh)__builtin_ia32_selectpbf_256(
      (__mmask16)__U, (__v16bf)_mm256_sub_pbh(__A, __B), (__v16bf)__W);
}

static __inline__ __m256bh __DEFAULT_FN_ATTRS256
_mm256_maskz_sub_pbh(__mmask16 __U, __m256bh __A, __m256bh __B) {
  return (__m256bh)__builtin_ia32_selectpbf_256(
      (__mmask16)__U, (__v16bf)_mm256_sub_pbh(__A, __B),
      (__v16bf)_mm256_setzero_pbh());
}

static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_sub_pbh(__m128bh __A,
                                                             __m128bh __B) {
  return (__m128bh)((__v8bf)__A - (__v8bf)__B);
}

static __inline__ __m128bh __DEFAULT_FN_ATTRS128
_mm_mask_sub_pbh(__m128bh __W, __mmask8 __U, __m128bh __A, __m128bh __B) {
  return (__m128bh)__builtin_ia32_selectpbf_128(
      (__mmask8)__U, (__v8bf)_mm_sub_pbh(__A, __B), (__v8bf)__W);
}

static __inline__ __m128bh __DEFAULT_FN_ATTRS128
_mm_maskz_sub_pbh(__mmask8 __U, __m128bh __A, __m128bh __B) {
  return (__m128bh)__builtin_ia32_selectpbf_128(
      (__mmask8)__U, (__v8bf)_mm_sub_pbh(__A, __B), (__v8bf)_mm_setzero_pbh());
}
 381
/* Element-wise bf16 multiplication, plus merge-/zero-masked variants. */
static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_mul_pbh(__m256bh __A,
                                                                __m256bh __B) {
  return (__m256bh)((__v16bf)__A * (__v16bf)__B);
}

static __inline__ __m256bh __DEFAULT_FN_ATTRS256
_mm256_mask_mul_pbh(__m256bh __W, __mmask16 __U, __m256bh __A, __m256bh __B) {
  return (__m256bh)__builtin_ia32_selectpbf_256(
      (__mmask16)__U, (__v16bf)_mm256_mul_pbh(__A, __B), (__v16bf)__W);
}

static __inline__ __m256bh __DEFAULT_FN_ATTRS256
_mm256_maskz_mul_pbh(__mmask16 __U, __m256bh __A, __m256bh __B) {
  return (__m256bh)__builtin_ia32_selectpbf_256(
      (__mmask16)__U, (__v16bf)_mm256_mul_pbh(__A, __B),
      (__v16bf)_mm256_setzero_pbh());
}

static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_mul_pbh(__m128bh __A,
                                                             __m128bh __B) {
  return (__m128bh)((__v8bf)__A * (__v8bf)__B);
}

static __inline__ __m128bh __DEFAULT_FN_ATTRS128
_mm_mask_mul_pbh(__m128bh __W, __mmask8 __U, __m128bh __A, __m128bh __B) {
  return (__m128bh)__builtin_ia32_selectpbf_128(
      (__mmask8)__U, (__v8bf)_mm_mul_pbh(__A, __B), (__v8bf)__W);
}

static __inline__ __m128bh __DEFAULT_FN_ATTRS128
_mm_maskz_mul_pbh(__mmask8 __U, __m128bh __A, __m128bh __B) {
  return (__m128bh)__builtin_ia32_selectpbf_128(
      (__mmask8)__U, (__v8bf)_mm_mul_pbh(__A, __B), (__v8bf)_mm_setzero_pbh());
}

/* Element-wise bf16 division (__A / __B), plus masked variants. */
static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_div_pbh(__m256bh __A,
                                                                __m256bh __B) {
  return (__m256bh)((__v16bf)__A / (__v16bf)__B);
}

static __inline__ __m256bh __DEFAULT_FN_ATTRS256
_mm256_mask_div_pbh(__m256bh __W, __mmask16 __U, __m256bh __A, __m256bh __B) {
  return (__m256bh)__builtin_ia32_selectpbf_256(
      (__mmask16)__U, (__v16bf)_mm256_div_pbh(__A, __B), (__v16bf)__W);
}

static __inline__ __m256bh __DEFAULT_FN_ATTRS256
_mm256_maskz_div_pbh(__mmask16 __U, __m256bh __A, __m256bh __B) {
  return (__m256bh)__builtin_ia32_selectpbf_256(
      (__mmask16)__U, (__v16bf)_mm256_div_pbh(__A, __B),
      (__v16bf)_mm256_setzero_pbh());
}

static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_div_pbh(__m128bh __A,
                                                             __m128bh __B) {
  return (__m128bh)((__v8bf)__A / (__v8bf)__B);
}

static __inline__ __m128bh __DEFAULT_FN_ATTRS128
_mm_mask_div_pbh(__m128bh __W, __mmask8 __U, __m128bh __A, __m128bh __B) {
  return (__m128bh)__builtin_ia32_selectpbf_128(
      (__mmask8)__U, (__v8bf)_mm_div_pbh(__A, __B), (__v8bf)__W);
}

static __inline__ __m128bh __DEFAULT_FN_ATTRS128
_mm_maskz_div_pbh(__mmask8 __U, __m128bh __A, __m128bh __B) {
  return (__m128bh)__builtin_ia32_selectpbf_128(
      (__mmask8)__U, (__v8bf)_mm_div_pbh(__A, __B), (__v8bf)_mm_setzero_pbh());
}
 451
/* Element-wise bf16 maximum (VMAXBF16), plus merge-/zero-masked variants. */
static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_max_pbh(__m256bh __A,
                                                                __m256bh __B) {
  return (__m256bh)__builtin_ia32_vmaxbf16256((__v16bf)__A, (__v16bf)__B);
}

static __inline__ __m256bh __DEFAULT_FN_ATTRS256
_mm256_mask_max_pbh(__m256bh __W, __mmask16 __U, __m256bh __A, __m256bh __B) {
  return (__m256bh)__builtin_ia32_selectpbf_256(
      (__mmask16)__U, (__v16bf)_mm256_max_pbh(__A, __B), (__v16bf)__W);
}

static __inline__ __m256bh __DEFAULT_FN_ATTRS256
_mm256_maskz_max_pbh(__mmask16 __U, __m256bh __A, __m256bh __B) {
  return (__m256bh)__builtin_ia32_selectpbf_256(
      (__mmask16)__U, (__v16bf)_mm256_max_pbh(__A, __B),
      (__v16bf)_mm256_setzero_pbh());
}

static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_max_pbh(__m128bh __A,
                                                             __m128bh __B) {
  return (__m128bh)__builtin_ia32_vmaxbf16128((__v8bf)__A, (__v8bf)__B);
}

static __inline__ __m128bh __DEFAULT_FN_ATTRS128
_mm_mask_max_pbh(__m128bh __W, __mmask8 __U, __m128bh __A, __m128bh __B) {
  return (__m128bh)__builtin_ia32_selectpbf_128(
      (__mmask8)__U, (__v8bf)_mm_max_pbh(__A, __B), (__v8bf)__W);
}

static __inline__ __m128bh __DEFAULT_FN_ATTRS128
_mm_maskz_max_pbh(__mmask8 __U, __m128bh __A, __m128bh __B) {
  return (__m128bh)__builtin_ia32_selectpbf_128(
      (__mmask8)__U, (__v8bf)_mm_max_pbh(__A, __B), (__v8bf)_mm_setzero_pbh());
}

/* Element-wise bf16 minimum (VMINBF16), plus masked variants. */
static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_min_pbh(__m256bh __A,
                                                                __m256bh __B) {
  return (__m256bh)__builtin_ia32_vminbf16256((__v16bf)__A, (__v16bf)__B);
}

static __inline__ __m256bh __DEFAULT_FN_ATTRS256
_mm256_mask_min_pbh(__m256bh __W, __mmask16 __U, __m256bh __A, __m256bh __B) {
  return (__m256bh)__builtin_ia32_selectpbf_256(
      (__mmask16)__U, (__v16bf)_mm256_min_pbh(__A, __B), (__v16bf)__W);
}

static __inline__ __m256bh __DEFAULT_FN_ATTRS256
_mm256_maskz_min_pbh(__mmask16 __U, __m256bh __A, __m256bh __B) {
  return (__m256bh)__builtin_ia32_selectpbf_256(
      (__mmask16)__U, (__v16bf)_mm256_min_pbh(__A, __B),
      (__v16bf)_mm256_setzero_pbh());
}

static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_min_pbh(__m128bh __A,
                                                             __m128bh __B) {
  return (__m128bh)__builtin_ia32_vminbf16128((__v8bf)__A, (__v8bf)__B);
}

static __inline__ __m128bh __DEFAULT_FN_ATTRS128
_mm_mask_min_pbh(__m128bh __W, __mmask8 __U, __m128bh __A, __m128bh __B) {
  return (__m128bh)__builtin_ia32_selectpbf_128(
      (__mmask8)__U, (__v8bf)_mm_min_pbh(__A, __B), (__v8bf)__W);
}

static __inline__ __m128bh __DEFAULT_FN_ATTRS128
_mm_maskz_min_pbh(__mmask8 __U, __m128bh __A, __m128bh __B) {
  return (__m128bh)__builtin_ia32_selectpbf_128(
      (__mmask8)__U, (__v8bf)_mm_min_pbh(__A, __B), (__v8bf)_mm_setzero_pbh());
}
 521
/* Ordered scalar compares of the lowest bf16 element of A and B (VCOMISBF16).
   Each returns 1 if the relation holds, 0 otherwise. */
static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comieq_sbh(__m128bh A,
                                                           __m128bh B) {
  return __builtin_ia32_vcomisbf16eq((__v8bf)A, (__v8bf)B);
}

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comilt_sbh(__m128bh A,
                                                           __m128bh B) {
  return __builtin_ia32_vcomisbf16lt((__v8bf)A, (__v8bf)B);
}

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comile_sbh(__m128bh A,
                                                           __m128bh B) {
  return __builtin_ia32_vcomisbf16le((__v8bf)A, (__v8bf)B);
}

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comigt_sbh(__m128bh A,
                                                           __m128bh B) {
  return __builtin_ia32_vcomisbf16gt((__v8bf)A, (__v8bf)B);
}

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comige_sbh(__m128bh A,
                                                           __m128bh B) {
  return __builtin_ia32_vcomisbf16ge((__v8bf)A, (__v8bf)B);
}

static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comineq_sbh(__m128bh A,
                                                            __m128bh B) {
  return __builtin_ia32_vcomisbf16neq((__v8bf)A, (__v8bf)B);
}
 551
/* Compare packed bf16 elements using predicate __P (an immediate) and
   return the result as a bitmask.  These are macros because the predicate
   must be a compile-time constant.  The mask_ forms AND the result with
   __U (implemented by passing __U as the builtin's write-mask). */
#define _mm256_cmp_pbh_mask(__A, __B, __P)                                     \
  ((__mmask16)__builtin_ia32_vcmpbf16256_mask((__v16bf)(__m256bh)(__A),        \
                                              (__v16bf)(__m256bh)(__B),        \
                                              (int)(__P), (__mmask16) - 1))

#define _mm256_mask_cmp_pbh_mask(__U, __A, __B, __P)                           \
  ((__mmask16)__builtin_ia32_vcmpbf16256_mask((__v16bf)(__m256bh)(__A),        \
                                              (__v16bf)(__m256bh)(__B),        \
                                              (int)(__P), (__mmask16)(__U)))

#define _mm_cmp_pbh_mask(__A, __B, __P)                                        \
  ((__mmask8)__builtin_ia32_vcmpbf16128_mask((__v8bf)(__m128bh)(__A),          \
                                             (__v8bf)(__m128bh)(__B),          \
                                             (int)(__P), (__mmask8) - 1))

#define _mm_mask_cmp_pbh_mask(__U, __A, __B, __P)                              \
  ((__mmask8)__builtin_ia32_vcmpbf16128_mask((__v8bf)(__m128bh)(__A),          \
                                             (__v8bf)(__m128bh)(__B),          \
                                             (int)(__P), (__mmask8)(__U)))

/* Classify packed bf16 elements by the category bits in imm (VFPCLASSBF16)
   and return a bitmask of elements matching any selected category. */
#define _mm256_mask_fpclass_pbh_mask(__U, __A, imm)                            \
  ((__mmask16)__builtin_ia32_vfpclassbf16256_mask(                             \
      (__v16bf)(__m256bh)(__A), (int)(imm), (__mmask16)(__U)))

#define _mm256_fpclass_pbh_mask(__A, imm)                                      \
  ((__mmask16)__builtin_ia32_vfpclassbf16256_mask(                             \
      (__v16bf)(__m256bh)(__A), (int)(imm), (__mmask16) - 1))

#define _mm_mask_fpclass_pbh_mask(__U, __A, imm)                               \
  ((__mmask8)__builtin_ia32_vfpclassbf16128_mask((__v8bf)(__m128bh)(__A),      \
                                                 (int)(imm), (__mmask8)(__U)))

#define _mm_fpclass_pbh_mask(__A, imm)                                         \
  ((__mmask8)__builtin_ia32_vfpclassbf16128_mask((__v8bf)(__m128bh)(__A),      \
                                                 (int)(imm), (__mmask8) - 1))
 587
/* Scale: per element, __A scaled by 2^floor(__B) (VSCALEFBF16).  The
   unmasked form passes an undefined passthrough with an all-ones mask;
   mask_/maskz_ merge with __W or zero under __U. */
static __inline__ __m256bh __DEFAULT_FN_ATTRS256
_mm256_scalef_pbh(__m256bh __A, __m256bh __B) {
  return (__m256bh)__builtin_ia32_vscalefbf16256_mask(
      (__v16bf)__A, (__v16bf)__B, (__v16bf)_mm256_undefined_pbh(),
      (__mmask16)-1);
}

static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_mask_scalef_pbh(
    __m256bh __W, __mmask16 __U, __m256bh __A, __m256bh __B) {
  return (__m256bh)__builtin_ia32_vscalefbf16256_mask(
      (__v16bf)__A, (__v16bf)__B, (__v16bf)__W, (__mmask16)__U);
}

static __inline__ __m256bh __DEFAULT_FN_ATTRS256
_mm256_maskz_scalef_pbh(__mmask16 __U, __m256bh __A, __m256bh __B) {
  return (__m256bh)__builtin_ia32_vscalefbf16256_mask(
      (__v16bf)__A, (__v16bf)__B, (__v16bf)_mm256_setzero_pbh(),
      (__mmask16)__U);
}

static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_scalef_pbh(__m128bh __A,
                                                                __m128bh __B) {
  return (__m128bh)__builtin_ia32_vscalefbf16128_mask(
      (__v8bf)__A, (__v8bf)__B, (__v8bf)_mm_undefined_pbh(), (__mmask8)-1);
}

static __inline__ __m128bh __DEFAULT_FN_ATTRS128
_mm_mask_scalef_pbh(__m128bh __W, __mmask8 __U, __m128bh __A, __m128bh __B) {
  return (__m128bh)__builtin_ia32_vscalefbf16128_mask(
      (__v8bf)__A, (__v8bf)__B, (__v8bf)__W, (__mmask8)__U);
}

static __inline__ __m128bh __DEFAULT_FN_ATTRS128
_mm_maskz_scalef_pbh(__mmask8 __U, __m128bh __A, __m128bh __B) {
  return (__m128bh)__builtin_ia32_vscalefbf16128_mask(
      (__v8bf)__A, (__v8bf)__B, (__v8bf)_mm_setzero_pbh(), (__mmask8)__U);
}

/* Approximate reciprocal of each bf16 element (VRCPBF16), with the usual
   unmasked / merge-masked / zero-masked triple. */
static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_rcp_pbh(__m256bh __A) {
  return (__m256bh)__builtin_ia32_vrcpbf16256_mask(
      (__v16bf)__A, (__v16bf)_mm256_undefined_pbh(), (__mmask16)-1);
}

static __inline__ __m256bh __DEFAULT_FN_ATTRS256
_mm256_mask_rcp_pbh(__m256bh __W, __mmask16 __U, __m256bh __A) {
  return (__m256bh)__builtin_ia32_vrcpbf16256_mask((__v16bf)__A, (__v16bf)__W,
                                                   (__mmask16)__U);
}

static __inline__ __m256bh __DEFAULT_FN_ATTRS256
_mm256_maskz_rcp_pbh(__mmask16 __U, __m256bh __A) {
  return (__m256bh)__builtin_ia32_vrcpbf16256_mask(
      (__v16bf)__A, (__v16bf)_mm256_setzero_pbh(), (__mmask16)__U);
}

static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_rcp_pbh(__m128bh __A) {
  return (__m128bh)__builtin_ia32_vrcpbf16128_mask(
      (__v8bf)__A, (__v8bf)_mm_undefined_pbh(), (__mmask8)-1);
}

static __inline__ __m128bh __DEFAULT_FN_ATTRS128
_mm_mask_rcp_pbh(__m128bh __W, __mmask8 __U, __m128bh __A) {
  return (__m128bh)__builtin_ia32_vrcpbf16128_mask((__v8bf)__A, (__v8bf)__W,
                                                   (__mmask8)__U);
}

static __inline__ __m128bh __DEFAULT_FN_ATTRS128
_mm_maskz_rcp_pbh(__mmask8 __U, __m128bh __A) {
  return (__m128bh)__builtin_ia32_vrcpbf16128_mask(
      (__v8bf)__A, (__v8bf)_mm_setzero_pbh(), (__mmask8)__U);
}
 659
/* Extract the exponent of each bf16 element (VGETEXPBF16), with the usual
   unmasked / merge-masked / zero-masked triple. */
static __inline__ __m256bh __DEFAULT_FN_ATTRS256
_mm256_getexp_pbh(__m256bh __A) {
  return (__m256bh)__builtin_ia32_vgetexpbf16256_mask(
      (__v16bf)__A, (__v16bf)_mm256_undefined_pbh(), (__mmask16)-1);
}

static __inline__ __m256bh __DEFAULT_FN_ATTRS256
_mm256_mask_getexp_pbh(__m256bh __W, __mmask16 __U, __m256bh __A) {
  return (__m256bh)__builtin_ia32_vgetexpbf16256_mask(
      (__v16bf)__A, (__v16bf)__W, (__mmask16)__U);
}

static __inline__ __m256bh __DEFAULT_FN_ATTRS256
_mm256_maskz_getexp_pbh(__mmask16 __U, __m256bh __A) {
  return (__m256bh)__builtin_ia32_vgetexpbf16256_mask(
      (__v16bf)__A, (__v16bf)_mm256_setzero_pbh(), (__mmask16)__U);
}

static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_getexp_pbh(__m128bh __A) {
  return (__m128bh)__builtin_ia32_vgetexpbf16128_mask(
      (__v8bf)__A, (__v8bf)_mm_undefined_pbh(), (__mmask8)-1);
}

static __inline__ __m128bh __DEFAULT_FN_ATTRS128
_mm_mask_getexp_pbh(__m128bh __W, __mmask8 __U, __m128bh __A) {
  return (__m128bh)__builtin_ia32_vgetexpbf16128_mask((__v8bf)__A, (__v8bf)__W,
                                                      (__mmask8)__U);
}

static __inline__ __m128bh __DEFAULT_FN_ATTRS128
_mm_maskz_getexp_pbh(__mmask8 __U, __m128bh __A) {
  return (__m128bh)__builtin_ia32_vgetexpbf16128_mask(
      (__v8bf)__A, (__v8bf)_mm_setzero_pbh(), (__mmask8)__U);
}

/* Approximate reciprocal square root of each bf16 element (VRSQRTBF16),
   with the usual unmasked / merge-masked / zero-masked triple. */
static __inline__ __m256bh __DEFAULT_FN_ATTRS256
_mm256_rsqrt_pbh(__m256bh __A) {
  return (__m256bh)__builtin_ia32_vrsqrtbf16256_mask(
      (__v16bf)__A, (__v16bf)_mm256_undefined_pbh(), (__mmask16)-1);
}

static __inline__ __m256bh __DEFAULT_FN_ATTRS256
_mm256_mask_rsqrt_pbh(__m256bh __W, __mmask16 __U, __m256bh __A) {
  return (__m256bh)__builtin_ia32_vrsqrtbf16256_mask((__v16bf)__A, (__v16bf)__W,
                                                     (__mmask16)__U);
}

static __inline__ __m256bh __DEFAULT_FN_ATTRS256
_mm256_maskz_rsqrt_pbh(__mmask16 __U, __m256bh __A) {
  return (__m256bh)__builtin_ia32_vrsqrtbf16256_mask(
      (__v16bf)__A, (__v16bf)_mm256_setzero_pbh(), (__mmask16)__U);
}

static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_rsqrt_pbh(__m128bh __A) {
  return (__m128bh)__builtin_ia32_vrsqrtbf16128_mask(
      (__v8bf)__A, (__v8bf)_mm_undefined_pbh(), (__mmask8)-1);
}

static __inline__ __m128bh __DEFAULT_FN_ATTRS128
_mm_mask_rsqrt_pbh(__m128bh __W, __mmask8 __U, __m128bh __A) {
  return (__m128bh)__builtin_ia32_vrsqrtbf16128_mask((__v8bf)__A, (__v8bf)__W,
                                                     (__mmask8)__U);
}

static __inline__ __m128bh __DEFAULT_FN_ATTRS128
_mm_maskz_rsqrt_pbh(__mmask8 __U, __m128bh __A) {
  return (__m128bh)__builtin_ia32_vrsqrtbf16128_mask(
      (__v8bf)__A, (__v8bf)_mm_setzero_pbh(), (__mmask8)__U);
}
 729
// VREDUCEBF16: per-element reduction transformation of the BF16 elements of
// __A. `imm` is the 8-bit reduce-control immediate and must be a compile-time
// constant, hence the macro (not function) form. Unmasked / merge-masked /
// zero-masked variants follow the usual pattern: all-ones mask with an
// undefined merge operand, __W + __U, and zero + __U respectively.
#define _mm256_reduce_pbh(__A, imm)                                            \
  ((__m256bh)__builtin_ia32_vreducebf16256_mask(                               \
      (__v16bf)(__m256bh)(__A), (int)(imm), (__v16bf)_mm256_undefined_pbh(),   \
      (__mmask16) - 1))

#define _mm256_mask_reduce_pbh(__W, __U, __A, imm)                             \
  ((__m256bh)__builtin_ia32_vreducebf16256_mask(                               \
      (__v16bf)(__m256bh)(__A), (int)(imm), (__v16bf)(__m256bh)(__W),          \
      (__mmask16)(__U)))

#define _mm256_maskz_reduce_pbh(__U, __A, imm)                                 \
  ((__m256bh)__builtin_ia32_vreducebf16256_mask(                               \
      (__v16bf)(__m256bh)(__A), (int)(imm), (__v16bf)_mm256_setzero_pbh(),     \
      (__mmask16)(__U)))

// 128-bit (8-element) forms of the reduce macros above.
#define _mm_reduce_pbh(__A, imm)                                               \
  ((__m128bh)__builtin_ia32_vreducebf16128_mask(                               \
      (__v8bf)(__m128bh)(__A), (int)(imm), (__v8bf)_mm_undefined_pbh(),        \
      (__mmask8) - 1))

#define _mm_mask_reduce_pbh(__W, __U, __A, imm)                                \
  ((__m128bh)__builtin_ia32_vreducebf16128_mask(                               \
      (__v8bf)(__m128bh)(__A), (int)(imm), (__v8bf)(__m128bh)(__W),            \
      (__mmask8)(__U)))

#define _mm_maskz_reduce_pbh(__U, __A, imm)                                    \
  ((__m128bh)__builtin_ia32_vreducebf16128_mask(                               \
      (__v8bf)(__m128bh)(__A), (int)(imm), (__v8bf)_mm_setzero_pbh(),          \
      (__mmask8)(__U)))
 759
// VRNDSCALEBF16: round each BF16 element of __A to the precision selected by
// the constant immediate `imm` (rounding mode + scale in one byte). Macro form
// because the immediate must be a compile-time constant. Note the unmasked
// variant passes a zero vector as the merge operand; with an all-ones mask the
// merge operand is never observed.
#define _mm256_roundscale_pbh(__A, imm)                                        \
  ((__m256bh)__builtin_ia32_vrndscalebf16_256_mask(                            \
      (__v16bf)(__m256bh)(__A), (int)(imm), (__v16bf)_mm256_setzero_pbh(),     \
      (__mmask16) - 1))

#define _mm256_mask_roundscale_pbh(__W, __U, __A, imm)                         \
  ((__m256bh)__builtin_ia32_vrndscalebf16_256_mask(                            \
      (__v16bf)(__m256bh)(__A), (int)(imm), (__v16bf)(__m256bh)(__W),          \
      (__mmask16)(__U)))

#define _mm256_maskz_roundscale_pbh(__U, __A, imm)                             \
  ((__m256bh)__builtin_ia32_vrndscalebf16_256_mask(                            \
      (__v16bf)(__m256bh)(__A), (int)(imm), (__v16bf)_mm256_setzero_pbh(),     \
      (__mmask16)(__U)))

// 128-bit (8-element) forms of the roundscale macros above.
#define _mm_roundscale_pbh(__A, imm)                                           \
  ((__m128bh)__builtin_ia32_vrndscalebf16_128_mask(                            \
      (__v8bf)(__m128bh)(__A), (int)(imm), (__v8bf)_mm_setzero_pbh(),          \
      (__mmask8) - 1))

#define _mm_mask_roundscale_pbh(__W, __U, __A, imm)                            \
  ((__m128bh)__builtin_ia32_vrndscalebf16_128_mask(                            \
      (__v8bf)(__m128bh)(__A), (int)(imm), (__v8bf)(__m128bh)(__W),            \
      (__mmask8)(__U)))

#define _mm_maskz_roundscale_pbh(__U, __A, imm)                                \
  ((__m128bh)__builtin_ia32_vrndscalebf16_128_mask(                            \
      (__v8bf)(__m128bh)(__A), (int)(imm), (__v8bf)_mm_setzero_pbh(),          \
      (__mmask8)(__U)))
 789
// VGETMANTBF16: extract the normalized mantissa of each BF16 element of __A.
// The two selector arguments are packed into the instruction's single
// immediate as (__C << 2) | __B; per Intel's encoding __B chooses the
// normalization interval and __C the sign control. Both must be compile-time
// constants, hence the macro form.
#define _mm256_getmant_pbh(__A, __B, __C)                                      \
  ((__m256bh)__builtin_ia32_vgetmantbf16256_mask(                              \
      (__v16bf)(__m256bh)(__A), (int)(((__C) << 2) | (__B)),                   \
      (__v16bf)_mm256_undefined_pbh(), (__mmask16) - 1))

#define _mm256_mask_getmant_pbh(__W, __U, __A, __B, __C)                       \
  ((__m256bh)__builtin_ia32_vgetmantbf16256_mask(                              \
      (__v16bf)(__m256bh)(__A), (int)(((__C) << 2) | (__B)),                   \
      (__v16bf)(__m256bh)(__W), (__mmask16)(__U)))

#define _mm256_maskz_getmant_pbh(__U, __A, __B, __C)                           \
  ((__m256bh)__builtin_ia32_vgetmantbf16256_mask(                              \
      (__v16bf)(__m256bh)(__A), (int)(((__C) << 2) | (__B)),                   \
      (__v16bf)_mm256_setzero_pbh(), (__mmask16)(__U)))

// 128-bit (8-element) forms of the getmant macros above.
#define _mm_getmant_pbh(__A, __B, __C)                                         \
  ((__m128bh)__builtin_ia32_vgetmantbf16128_mask(                              \
      (__v8bf)(__m128bh)(__A), (int)(((__C) << 2) | (__B)),                    \
      (__v8bf)_mm_undefined_pbh(), (__mmask8) - 1))

#define _mm_mask_getmant_pbh(__W, __U, __A, __B, __C)                          \
  ((__m128bh)__builtin_ia32_vgetmantbf16128_mask(                              \
      (__v8bf)(__m128bh)(__A), (int)(((__C) << 2) | (__B)),                    \
      (__v8bf)(__m128bh)(__W), (__mmask8)(__U)))

#define _mm_maskz_getmant_pbh(__U, __A, __B, __C)                              \
  ((__m128bh)__builtin_ia32_vgetmantbf16128_mask(                              \
      (__v8bf)(__m128bh)(__A), (int)(((__C) << 2) | (__B)),                    \
      (__v8bf)_mm_setzero_pbh(), (__mmask8)(__U)))
 819
// Square root of each BF16 element in \a __A (VSQRTBF16, 256-bit). Unlike the
// rsqrt family, the builtin itself is unmasked; masking is layered on with a
// separate lane-select builtin below.
static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_sqrt_pbh(__m256bh __A) {
  return (__m256bh)__builtin_ia32_vsqrtbf16256((__v16bf)__A);
}

// Merge-masked sqrt: lanes with a set bit in __U take sqrt(__A); clear lanes
// keep __W.
static __inline__ __m256bh __DEFAULT_FN_ATTRS256
_mm256_mask_sqrt_pbh(__m256bh __W, __mmask16 __U, __m256bh __A) {
  return (__m256bh)__builtin_ia32_selectpbf_256(
      (__mmask16)__U, (__v16bf)_mm256_sqrt_pbh(__A), (__v16bf)__W);
}

// Zero-masked sqrt: clear lanes in __U become 0.
static __inline__ __m256bh __DEFAULT_FN_ATTRS256
_mm256_maskz_sqrt_pbh(__mmask16 __U, __m256bh __A) {
  return (__m256bh)__builtin_ia32_selectpbf_256((__mmask16)__U,
                                                (__v16bf)_mm256_sqrt_pbh(__A),
                                                (__v16bf)_mm256_setzero_pbh());
}
 836
// 128-bit counterparts of the sqrt intrinsics above: square root of each of
// the 8 BF16 elements in \a __A.
static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_sqrt_pbh(__m128bh __A) {
  return (__m128bh)__builtin_ia32_vsqrtbf16((__v8bf)__A);
}

// Merge-masked: clear lanes in __U keep __W.
static __inline__ __m128bh __DEFAULT_FN_ATTRS128
_mm_mask_sqrt_pbh(__m128bh __W, __mmask8 __U, __m128bh __A) {
  return (__m128bh)__builtin_ia32_selectpbf_128(
      (__mmask8)__U, (__v8bf)_mm_sqrt_pbh(__A), (__v8bf)__W);
}

// Zero-masked: clear lanes in __U become 0.
static __inline__ __m128bh __DEFAULT_FN_ATTRS128
_mm_maskz_sqrt_pbh(__mmask8 __U, __m128bh __A) {
  return (__m128bh)__builtin_ia32_selectpbf_128(
      (__mmask8)__U, (__v8bf)_mm_sqrt_pbh(__A), (__v8bf)_mm_setzero_pbh());
}
 852
// Fused multiply-add on BF16 elements: __A * __B + __C (256-bit VFMADDBF16).
static __inline__ __m256bh __DEFAULT_FN_ATTRS256
_mm256_fmadd_pbh(__m256bh __A, __m256bh __B, __m256bh __C) {
  return (__m256bh)__builtin_ia32_vfmaddbf16256((__v16bf)__A, (__v16bf)__B,
                                                (__v16bf)__C);
}

// Merge-masked FMA: lanes with a clear bit in __U keep __A (the first source).
static __inline__ __m256bh __DEFAULT_FN_ATTRS256
_mm256_mask_fmadd_pbh(__m256bh __A, __mmask16 __U, __m256bh __B, __m256bh __C) {
  return (__m256bh)__builtin_ia32_selectpbf_256(
      (__mmask16)__U,
      _mm256_fmadd_pbh((__v16bf)__A, (__v16bf)__B, (__v16bf)__C), (__v16bf)__A);
}

// mask3 variant: clear lanes keep __C (the addend) instead of __A.
static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_mask3_fmadd_pbh(
    __m256bh __A, __m256bh __B, __m256bh __C, __mmask16 __U) {
  return (__m256bh)__builtin_ia32_selectpbf_256(
      (__mmask16)__U,
      _mm256_fmadd_pbh((__v16bf)__A, (__v16bf)__B, (__v16bf)__C), (__v16bf)__C);
}

// Zero-masked FMA: clear lanes become 0.
static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_maskz_fmadd_pbh(
    __mmask16 __U, __m256bh __A, __m256bh __B, __m256bh __C) {
  return (__m256bh)__builtin_ia32_selectpbf_256(
      (__mmask16)__U,
      _mm256_fmadd_pbh((__v16bf)__A, (__v16bf)__B, (__v16bf)__C),
      (__v16bf)_mm256_setzero_pbh());
}
 880
// Fused multiply-subtract: __A * __B - __C, implemented by negating the
// addend of the FMA builtin.
static __inline__ __m256bh __DEFAULT_FN_ATTRS256
_mm256_fmsub_pbh(__m256bh __A, __m256bh __B, __m256bh __C) {
  return (__m256bh)__builtin_ia32_vfmaddbf16256((__v16bf)__A, (__v16bf)__B,
                                                -(__v16bf)__C);
}

// Merge-masked: lanes with a clear bit in __U keep __A.
static __inline__ __m256bh __DEFAULT_FN_ATTRS256
_mm256_mask_fmsub_pbh(__m256bh __A, __mmask16 __U, __m256bh __B, __m256bh __C) {
  return (__m256bh)__builtin_ia32_selectpbf_256(
      (__mmask16)__U,
      _mm256_fmsub_pbh((__v16bf)__A, (__v16bf)__B, (__v16bf)__C), (__v16bf)__A);
}

// mask3 variant: clear lanes keep __C.
static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_mask3_fmsub_pbh(
    __m256bh __A, __m256bh __B, __m256bh __C, __mmask16 __U) {
  return (__m256bh)__builtin_ia32_selectpbf_256(
      (__mmask16)__U,
      _mm256_fmsub_pbh((__v16bf)__A, (__v16bf)__B, (__v16bf)__C), (__v16bf)__C);
}

// Zero-masked: clear lanes become 0.
static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_maskz_fmsub_pbh(
    __mmask16 __U, __m256bh __A, __m256bh __B, __m256bh __C) {
  return (__m256bh)__builtin_ia32_selectpbf_256(
      (__mmask16)__U,
      _mm256_fmsub_pbh((__v16bf)__A, (__v16bf)__B, (__v16bf)__C),
      (__v16bf)_mm256_setzero_pbh());
}
 908
// Fused negated multiply-add: -(__A * __B) + __C, via negating one multiplier
// of the FMA builtin.
static __inline__ __m256bh __DEFAULT_FN_ATTRS256
_mm256_fnmadd_pbh(__m256bh __A, __m256bh __B, __m256bh __C) {
  return (__m256bh)__builtin_ia32_vfmaddbf16256((__v16bf)__A, -(__v16bf)__B,
                                                (__v16bf)__C);
}

// Merge-masked: lanes with a clear bit in __U keep __A.
static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_mask_fnmadd_pbh(
    __m256bh __A, __mmask16 __U, __m256bh __B, __m256bh __C) {
  return (__m256bh)__builtin_ia32_selectpbf_256(
      (__mmask16)__U,
      _mm256_fnmadd_pbh((__v16bf)__A, (__v16bf)__B, (__v16bf)__C),
      (__v16bf)__A);
}

// mask3 variant: clear lanes keep __C.
static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_mask3_fnmadd_pbh(
    __m256bh __A, __m256bh __B, __m256bh __C, __mmask16 __U) {
  return (__m256bh)__builtin_ia32_selectpbf_256(
      (__mmask16)__U,
      _mm256_fnmadd_pbh((__v16bf)__A, (__v16bf)__B, (__v16bf)__C),
      (__v16bf)__C);
}

// Zero-masked: clear lanes become 0.
static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_maskz_fnmadd_pbh(
    __mmask16 __U, __m256bh __A, __m256bh __B, __m256bh __C) {
  return (__m256bh)__builtin_ia32_selectpbf_256(
      (__mmask16)__U,
      _mm256_fnmadd_pbh((__v16bf)__A, (__v16bf)__B, (__v16bf)__C),
      (__v16bf)_mm256_setzero_pbh());
}
 938
// Fused negated multiply-subtract: -(__A * __B) - __C, via negating both one
// multiplier and the addend of the FMA builtin.
static __inline__ __m256bh __DEFAULT_FN_ATTRS256
_mm256_fnmsub_pbh(__m256bh __A, __m256bh __B, __m256bh __C) {
  return (__m256bh)__builtin_ia32_vfmaddbf16256((__v16bf)__A, -(__v16bf)__B,
                                                -(__v16bf)__C);
}

// Merge-masked: lanes with a clear bit in __U keep __A.
static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_mask_fnmsub_pbh(
    __m256bh __A, __mmask16 __U, __m256bh __B, __m256bh __C) {
  return (__m256bh)__builtin_ia32_selectpbf_256(
      (__mmask16)__U,
      _mm256_fnmsub_pbh((__v16bf)__A, (__v16bf)__B, (__v16bf)__C),
      (__v16bf)__A);
}

// mask3 variant: clear lanes keep __C.
static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_mask3_fnmsub_pbh(
    __m256bh __A, __m256bh __B, __m256bh __C, __mmask16 __U) {
  return (__m256bh)__builtin_ia32_selectpbf_256(
      (__mmask16)__U,
      _mm256_fnmsub_pbh((__v16bf)__A, (__v16bf)__B, (__v16bf)__C),
      (__v16bf)__C);
}

// Zero-masked: clear lanes become 0.
static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_maskz_fnmsub_pbh(
    __mmask16 __U, __m256bh __A, __m256bh __B, __m256bh __C) {
  return (__m256bh)__builtin_ia32_selectpbf_256(
      (__mmask16)__U,
      _mm256_fnmsub_pbh((__v16bf)__A, (__v16bf)__B, (__v16bf)__C),
      (__v16bf)_mm256_setzero_pbh());
}
 968
// 128-bit FMA on BF16 elements: __A * __B + __C.
static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_fmadd_pbh(__m128bh __A,
                                                               __m128bh __B,
                                                               __m128bh __C) {
  return (__m128bh)__builtin_ia32_vfmaddbf16128((__v8bf)__A, (__v8bf)__B,
                                                (__v8bf)__C);
}

// Merge-masked: lanes with a clear bit in __U keep __A.
static __inline__ __m128bh __DEFAULT_FN_ATTRS128
_mm_mask_fmadd_pbh(__m128bh __A, __mmask8 __U, __m128bh __B, __m128bh __C) {
  return (__m128bh)__builtin_ia32_selectpbf_128(
      (__mmask8)__U, _mm_fmadd_pbh((__v8bf)__A, (__v8bf)__B, (__v8bf)__C),
      (__v8bf)__A);
}

// mask3 variant: clear lanes keep __C.
static __inline__ __m128bh __DEFAULT_FN_ATTRS128
_mm_mask3_fmadd_pbh(__m128bh __A, __m128bh __B, __m128bh __C, __mmask8 __U) {
  return (__m128bh)__builtin_ia32_selectpbf_128(
      (__mmask8)__U, _mm_fmadd_pbh((__v8bf)__A, (__v8bf)__B, (__v8bf)__C),
      (__v8bf)__C);
}

// Zero-masked: clear lanes become 0.
static __inline__ __m128bh __DEFAULT_FN_ATTRS128
_mm_maskz_fmadd_pbh(__mmask8 __U, __m128bh __A, __m128bh __B, __m128bh __C) {
  return (__m128bh)__builtin_ia32_selectpbf_128(
      (__mmask8)__U, _mm_fmadd_pbh((__v8bf)__A, (__v8bf)__B, (__v8bf)__C),
      (__v8bf)_mm_setzero_pbh());
}
 996
// 128-bit fused multiply-subtract: __A * __B - __C (addend negated).
static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_fmsub_pbh(__m128bh __A,
                                                               __m128bh __B,
                                                               __m128bh __C) {
  return (__m128bh)__builtin_ia32_vfmaddbf16128((__v8bf)__A, (__v8bf)__B,
                                                -(__v8bf)__C);
}

// Merge-masked: lanes with a clear bit in __U keep __A.
static __inline__ __m128bh __DEFAULT_FN_ATTRS128
_mm_mask_fmsub_pbh(__m128bh __A, __mmask8 __U, __m128bh __B, __m128bh __C) {
  return (__m128bh)__builtin_ia32_selectpbf_128(
      (__mmask8)__U, _mm_fmsub_pbh((__v8bf)__A, (__v8bf)__B, (__v8bf)__C),
      (__v8bf)__A);
}

// mask3 variant: clear lanes keep __C.
static __inline__ __m128bh __DEFAULT_FN_ATTRS128
_mm_mask3_fmsub_pbh(__m128bh __A, __m128bh __B, __m128bh __C, __mmask8 __U) {
  return (__m128bh)__builtin_ia32_selectpbf_128(
      (__mmask8)__U, _mm_fmsub_pbh((__v8bf)__A, (__v8bf)__B, (__v8bf)__C),
      (__v8bf)__C);
}

// Zero-masked: clear lanes become 0.
static __inline__ __m128bh __DEFAULT_FN_ATTRS128
_mm_maskz_fmsub_pbh(__mmask8 __U, __m128bh __A, __m128bh __B, __m128bh __C) {
  return (__m128bh)__builtin_ia32_selectpbf_128(
      (__mmask8)__U, _mm_fmsub_pbh((__v8bf)__A, (__v8bf)__B, (__v8bf)__C),
      (__v8bf)_mm_setzero_pbh());
}
1024
// 128-bit fused negated multiply-add: -(__A * __B) + __C (one multiplier
// negated).
static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_fnmadd_pbh(__m128bh __A,
                                                                __m128bh __B,
                                                                __m128bh __C) {
  return (__m128bh)__builtin_ia32_vfmaddbf16128((__v8bf)__A, -(__v8bf)__B,
                                                (__v8bf)__C);
}

// Merge-masked: lanes with a clear bit in __U keep __A.
static __inline__ __m128bh __DEFAULT_FN_ATTRS128
_mm_mask_fnmadd_pbh(__m128bh __A, __mmask8 __U, __m128bh __B, __m128bh __C) {
  return (__m128bh)__builtin_ia32_selectpbf_128(
      (__mmask8)__U, _mm_fnmadd_pbh((__v8bf)__A, (__v8bf)__B, (__v8bf)__C),
      (__v8bf)__A);
}

// mask3 variant: clear lanes keep __C.
static __inline__ __m128bh __DEFAULT_FN_ATTRS128
_mm_mask3_fnmadd_pbh(__m128bh __A, __m128bh __B, __m128bh __C, __mmask8 __U) {
  return (__m128bh)__builtin_ia32_selectpbf_128(
      (__mmask8)__U, _mm_fnmadd_pbh((__v8bf)__A, (__v8bf)__B, (__v8bf)__C),
      (__v8bf)__C);
}

// Zero-masked: clear lanes become 0.
static __inline__ __m128bh __DEFAULT_FN_ATTRS128
_mm_maskz_fnmadd_pbh(__mmask8 __U, __m128bh __A, __m128bh __B, __m128bh __C) {
  return (__m128bh)__builtin_ia32_selectpbf_128(
      (__mmask8)__U, _mm_fnmadd_pbh((__v8bf)__A, (__v8bf)__B, (__v8bf)__C),
      (__v8bf)_mm_setzero_pbh());
}
1052
// 128-bit fused negated multiply-subtract: -(__A * __B) - __C (one multiplier
// and the addend negated).
static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_fnmsub_pbh(__m128bh __A,
                                                                __m128bh __B,
                                                                __m128bh __C) {
  return (__m128bh)__builtin_ia32_vfmaddbf16128((__v8bf)__A, -(__v8bf)__B,
                                                -(__v8bf)__C);
}

// Merge-masked: lanes with a clear bit in __U keep __A.
static __inline__ __m128bh __DEFAULT_FN_ATTRS128
_mm_mask_fnmsub_pbh(__m128bh __A, __mmask8 __U, __m128bh __B, __m128bh __C) {
  return (__m128bh)__builtin_ia32_selectpbf_128(
      (__mmask8)__U, _mm_fnmsub_pbh((__v8bf)__A, (__v8bf)__B, (__v8bf)__C),
      (__v8bf)__A);
}

// mask3 variant: clear lanes keep __C.
static __inline__ __m128bh __DEFAULT_FN_ATTRS128
_mm_mask3_fnmsub_pbh(__m128bh __A, __m128bh __B, __m128bh __C, __mmask8 __U) {
  return (__m128bh)__builtin_ia32_selectpbf_128(
      (__mmask8)__U, _mm_fnmsub_pbh((__v8bf)__A, (__v8bf)__B, (__v8bf)__C),
      (__v8bf)__C);
}

// Zero-masked: clear lanes become 0.
static __inline__ __m128bh __DEFAULT_FN_ATTRS128
_mm_maskz_fnmsub_pbh(__mmask8 __U, __m128bh __A, __m128bh __B, __m128bh __C) {
  return (__m128bh)__builtin_ia32_selectpbf_128(
      (__mmask8)__U, _mm_fnmsub_pbh((__v8bf)__A, (__v8bf)__B, (__v8bf)__C),
      (__v8bf)_mm_setzero_pbh());
}
1080
1081#undef __DEFAULT_FN_ATTRS128
1082#undef __DEFAULT_FN_ATTRS256
1083
1084#endif
1085#endif