master
  1/*===----------- avx10_2_512bf16intrin.h - AVX10-BF16 intrinsics ---------===
  2 *
  3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  4 * See https://llvm.org/LICENSE.txt for license information.
  5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  6 *
  7 *===-----------------------------------------------------------------------===
  8 */
  9#ifndef __IMMINTRIN_H
 10#error                                                                         \
 11    "Never use <avx10_2_512bf16intrin.h> directly; include <immintrin.h> instead."
 12#endif
 13
 14#ifdef __SSE2__
 15
 16#ifndef __AVX10_2_512BF16INTRIN_H
 17#define __AVX10_2_512BF16INTRIN_H
 18
 19/* Define the default attributes for the functions in this file. */
 20typedef __bf16 __m512bh_u __attribute__((__vector_size__(64), __aligned__(1)));
 21
 22/* Define the default attributes for the functions in this file. */
 23#define __DEFAULT_FN_ATTRS512                                                  \
 24  __attribute__((__always_inline__, __nodebug__, __target__("avx10.2-512"),    \
 25                 __min_vector_width__(512)))
 26
 27static __inline __m512bh __DEFAULT_FN_ATTRS512 _mm512_setzero_pbh(void) {
 28  return __builtin_bit_cast(__m512bh, _mm512_setzero_ps());
 29}
 30
 31static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_undefined_pbh(void) {
 32  return (__m512bh)__builtin_ia32_undef512();
 33}
 34
 35static __inline __m512bh __DEFAULT_FN_ATTRS512 _mm512_set1_pbh(__bf16 bf) {
 36  return (__m512bh)(__v32bf){bf, bf, bf, bf, bf, bf, bf, bf, bf, bf, bf,
 37                             bf, bf, bf, bf, bf, bf, bf, bf, bf, bf, bf,
 38                             bf, bf, bf, bf, bf, bf, bf, bf, bf, bf};
 39}
 40
 41static __inline __m512bh __DEFAULT_FN_ATTRS512 _mm512_set_pbh(
 42    __bf16 bf1, __bf16 bf2, __bf16 bf3, __bf16 bf4, __bf16 bf5, __bf16 bf6,
 43    __bf16 bf7, __bf16 bf8, __bf16 bf9, __bf16 bf10, __bf16 bf11, __bf16 bf12,
 44    __bf16 bf13, __bf16 bf14, __bf16 bf15, __bf16 bf16, __bf16 bf17,
 45    __bf16 bf18, __bf16 bf19, __bf16 bf20, __bf16 bf21, __bf16 bf22,
 46    __bf16 bf23, __bf16 bf24, __bf16 bf25, __bf16 bf26, __bf16 bf27,
 47    __bf16 bf28, __bf16 bf29, __bf16 bf30, __bf16 bf31, __bf16 bf32) {
 48  return (__m512bh)(__v32bf){bf32, bf31, bf30, bf29, bf28, bf27, bf26, bf25,
 49                             bf24, bf23, bf22, bf21, bf20, bf19, bf18, bf17,
 50                             bf16, bf15, bf14, bf13, bf12, bf11, bf10, bf9,
 51                             bf8,  bf7,  bf6,  bf5,  bf4,  bf3,  bf2,  bf1};
 52}
 53
 54#define _mm512_setr_pbh(bf1, bf2, bf3, bf4, bf5, bf6, bf7, bf8, bf9, bf10,     \
 55                        bf11, bf12, bf13, bf14, bf15, bf16, bf17, bf18, bf19,  \
 56                        bf20, bf21, bf22, bf23, bf24, bf25, bf26, bf27, bf28,  \
 57                        bf29, bf30, bf31, bf32)                                \
 58  _mm512_set_pbh((bf32), (bf31), (bf30), (bf29), (bf28), (bf27), (bf26),       \
 59                 (bf25), (bf24), (bf23), (bf22), (bf21), (bf20), (bf19),       \
 60                 (bf18), (bf17), (bf16), (bf15), (bf14), (bf13), (bf12),       \
 61                 (bf11), (bf10), (bf9), (bf8), (bf7), (bf6), (bf5), (bf4),     \
 62                 (bf3), (bf2), (bf1))
 63
 64static __inline__ __m512 __DEFAULT_FN_ATTRS512
 65_mm512_castbf16_ps(__m512bh __a) {
 66  return (__m512)__a;
 67}
 68
 69static __inline__ __m512d __DEFAULT_FN_ATTRS512
 70_mm512_castbf16_pd(__m512bh __a) {
 71  return (__m512d)__a;
 72}
 73
 74static __inline__ __m512i __DEFAULT_FN_ATTRS512
 75_mm512_castbf16_si512(__m512bh __a) {
 76  return (__m512i)__a;
 77}
 78
 79static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_castps_pbh(__m512 __a) {
 80  return (__m512bh)__a;
 81}
 82
 83static __inline__ __m512bh __DEFAULT_FN_ATTRS512
 84_mm512_castpd_pbh(__m512d __a) {
 85  return (__m512bh)__a;
 86}
 87
 88static __inline__ __m512bh __DEFAULT_FN_ATTRS512
 89_mm512_castsi512_pbh(__m512i __a) {
 90  return (__m512bh)__a;
 91}
 92
 93static __inline__ __m128bh __DEFAULT_FN_ATTRS512
 94_mm512_castbf16512_pbh128(__m512bh __a) {
 95  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7);
 96}
 97
 98static __inline__ __m256bh __DEFAULT_FN_ATTRS512
 99_mm512_castbf16512_pbh256(__m512bh __a) {
100  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
101                                 12, 13, 14, 15);
102}
103
104static __inline__ __m512bh __DEFAULT_FN_ATTRS512
105_mm512_castbf16128_pbh512(__m128bh __a) {
106  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, -1, -1, -1,
107                                 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
108                                 -1, -1, -1, -1, -1, -1, -1, -1, -1);
109}
110
111static __inline__ __m512bh __DEFAULT_FN_ATTRS512
112_mm512_castbf16256_pbh512(__m256bh __a) {
113  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
114                                 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1,
115                                 -1, -1, -1, -1, -1, -1, -1, -1);
116}
117
118static __inline__ __m512bh __DEFAULT_FN_ATTRS512
119_mm512_zextbf16128_pbh512(__m128bh __a) {
120  return __builtin_shufflevector(
121      __a, (__v8bf)_mm_setzero_pbh(), 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
122      13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15);
123}
124
125static __inline__ __m512bh __DEFAULT_FN_ATTRS512
126_mm512_zextbf16256_pbh512(__m256bh __a) {
127  return __builtin_shufflevector(__a, (__v16bf)_mm256_setzero_pbh(), 0, 1, 2, 3,
128                                 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
129                                 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28,
130                                 29, 30, 31);
131}
132
133static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_abs_pbh(__m512bh __A) {
134  return (__m512bh)_mm512_and_epi32(_mm512_set1_epi32(0x7FFF7FFF),
135                                    (__m512i)__A);
136}
137
138static __inline__ __m512bh __DEFAULT_FN_ATTRS512
139_mm512_load_pbh(void const *__p) {
140  return *(const __m512bh *)__p;
141}
142
143static __inline__ __m512bh __DEFAULT_FN_ATTRS512
144_mm512_loadu_pbh(void const *__p) {
145  struct __loadu_pbh {
146    __m512bh_u __v;
147  } __attribute__((__packed__, __may_alias__));
148  return ((const struct __loadu_pbh *)__p)->__v;
149}
150
151static __inline__ void __DEFAULT_FN_ATTRS512 _mm512_store_pbh(void *__P,
152                                                              __m512bh __A) {
153  *(__m512bh *)__P = __A;
154}
155
156static __inline__ void __DEFAULT_FN_ATTRS512 _mm512_storeu_pbh(void *__P,
157                                                               __m512bh __A) {
158  struct __storeu_pbh {
159    __m512bh_u __v;
160  } __attribute__((__packed__, __may_alias__));
161  ((struct __storeu_pbh *)__P)->__v = __A;
162}
163
164static __inline__ __m512bh __DEFAULT_FN_ATTRS512
165_mm512_mask_blend_pbh(__mmask32 __U, __m512bh __A, __m512bh __W) {
166  return (__m512bh)__builtin_ia32_selectpbf_512((__mmask32)__U, (__v32bf)__W,
167                                                (__v32bf)__A);
168}
169
170static __inline__ __m512bh __DEFAULT_FN_ATTRS512
171_mm512_permutex2var_pbh(__m512bh __A, __m512i __I, __m512bh __B) {
172  return (__m512bh)__builtin_ia32_vpermi2varhi512((__v32hi)__A, (__v32hi)__I,
173                                                  (__v32hi)__B);
174}
175
176static __inline__ __m512bh __DEFAULT_FN_ATTRS512
177_mm512_permutexvar_pbh(__m512i __A, __m512bh __B) {
178  return (__m512bh)__builtin_ia32_permvarhi512((__v32hi)__B, (__v32hi)__A);
179}
180
181static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_add_pbh(__m512bh __A,
182                                                                __m512bh __B) {
183  return (__m512bh)((__v32bf)__A + (__v32bf)__B);
184}
185
186static __inline__ __m512bh __DEFAULT_FN_ATTRS512
187_mm512_mask_add_pbh(__m512bh __W, __mmask32 __U, __m512bh __A, __m512bh __B) {
188  return (__m512bh)__builtin_ia32_selectpbf_512(
189      (__mmask32)__U, (__v32bf)_mm512_add_pbh(__A, __B), (__v32bf)__W);
190}
191
192static __inline__ __m512bh __DEFAULT_FN_ATTRS512
193_mm512_maskz_add_pbh(__mmask32 __U, __m512bh __A, __m512bh __B) {
194  return (__m512bh)__builtin_ia32_selectpbf_512(
195      (__mmask32)__U, (__v32bf)_mm512_add_pbh(__A, __B),
196      (__v32bf)_mm512_setzero_pbh());
197}
198
199static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_sub_pbh(__m512bh __A,
200                                                                __m512bh __B) {
201  return (__m512bh)((__v32bf)__A - (__v32bf)__B);
202}
203
204static __inline__ __m512bh __DEFAULT_FN_ATTRS512
205_mm512_mask_sub_pbh(__m512bh __W, __mmask32 __U, __m512bh __A, __m512bh __B) {
206  return (__m512bh)__builtin_ia32_selectpbf_512(
207      (__mmask32)__U, (__v32bf)_mm512_sub_pbh(__A, __B), (__v32bf)__W);
208}
209
210static __inline__ __m512bh __DEFAULT_FN_ATTRS512
211_mm512_maskz_sub_pbh(__mmask32 __U, __m512bh __A, __m512bh __B) {
212  return (__m512bh)__builtin_ia32_selectpbf_512(
213      (__mmask32)__U, (__v32bf)_mm512_sub_pbh(__A, __B),
214      (__v32bf)_mm512_setzero_pbh());
215}
216
217static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_mul_pbh(__m512bh __A,
218                                                                __m512bh __B) {
219  return (__m512bh)((__v32bf)__A * (__v32bf)__B);
220}
221
222static __inline__ __m512bh __DEFAULT_FN_ATTRS512
223_mm512_mask_mul_pbh(__m512bh __W, __mmask32 __U, __m512bh __A, __m512bh __B) {
224  return (__m512bh)__builtin_ia32_selectpbf_512(
225      (__mmask32)__U, (__v32bf)_mm512_mul_pbh(__A, __B), (__v32bf)__W);
226}
227
228static __inline__ __m512bh __DEFAULT_FN_ATTRS512
229_mm512_maskz_mul_pbh(__mmask32 __U, __m512bh __A, __m512bh __B) {
230  return (__m512bh)__builtin_ia32_selectpbf_512(
231      (__mmask32)__U, (__v32bf)_mm512_mul_pbh(__A, __B),
232      (__v32bf)_mm512_setzero_pbh());
233}
234
235static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_div_pbh(__m512bh __A,
236                                                                __m512bh __B) {
237  return (__m512bh)((__v32bf)__A / (__v32bf)__B);
238}
239
240static __inline__ __m512bh __DEFAULT_FN_ATTRS512
241_mm512_mask_div_pbh(__m512bh __W, __mmask32 __U, __m512bh __A, __m512bh __B) {
242  return (__m512bh)__builtin_ia32_selectpbf_512(
243      (__mmask32)__U, (__v32bf)_mm512_div_pbh(__A, __B), (__v32bf)__W);
244}
245
246static __inline__ __m512bh __DEFAULT_FN_ATTRS512
247_mm512_maskz_div_pbh(__mmask32 __U, __m512bh __A, __m512bh __B) {
248  return (__m512bh)__builtin_ia32_selectpbf_512(
249      (__mmask32)__U, (__v32bf)_mm512_div_pbh(__A, __B),
250      (__v32bf)_mm512_setzero_pbh());
251}
252
253static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_max_pbh(__m512bh __A,
254                                                                __m512bh __B) {
255  return (__m512bh)__builtin_ia32_vmaxbf16512((__v32bf)__A, (__v32bf)__B);
256}
257
258static __inline__ __m512bh __DEFAULT_FN_ATTRS512
259_mm512_mask_max_pbh(__m512bh __W, __mmask32 __U, __m512bh __A, __m512bh __B) {
260  return (__m512bh)__builtin_ia32_selectpbf_512(
261      (__mmask32)__U, (__v32bf)_mm512_max_pbh(__A, __B), (__v32bf)__W);
262}
263
264static __inline__ __m512bh __DEFAULT_FN_ATTRS512
265_mm512_maskz_max_pbh(__mmask32 __U, __m512bh __A, __m512bh __B) {
266  return (__m512bh)__builtin_ia32_selectpbf_512(
267      (__mmask32)__U, (__v32bf)_mm512_max_pbh(__A, __B),
268      (__v32bf)_mm512_setzero_pbh());
269}
270
271static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_min_pbh(__m512bh __A,
272                                                                __m512bh __B) {
273  return (__m512bh)__builtin_ia32_vminbf16512((__v32bf)__A, (__v32bf)__B);
274}
275
276static __inline__ __m512bh __DEFAULT_FN_ATTRS512
277_mm512_mask_min_pbh(__m512bh __W, __mmask32 __U, __m512bh __A, __m512bh __B) {
278  return (__m512bh)__builtin_ia32_selectpbf_512(
279      (__mmask32)__U, (__v32bf)_mm512_min_pbh(__A, __B), (__v32bf)__W);
280}
281
282static __inline__ __m512bh __DEFAULT_FN_ATTRS512
283_mm512_maskz_min_pbh(__mmask32 __U, __m512bh __A, __m512bh __B) {
284  return (__m512bh)__builtin_ia32_selectpbf_512(
285      (__mmask32)__U, (__v32bf)_mm512_min_pbh(__A, __B),
286      (__v32bf)_mm512_setzero_pbh());
287}
288
/* Compares __A with __B via the vcmpbf16512 mask builtin. __P is forwarded as
   an int operand and the input mask is all ones; the result is a __mmask32. */
#define _mm512_cmp_pbh_mask(__A, __B, __P)                                     \
  ((__mmask32)__builtin_ia32_vcmpbf16512_mask(                                 \
      (__v32bf)(__m512bh)(__A), (__v32bf)(__m512bh)(__B), (int)(__P),          \
      (__mmask32)(-1)))
293
/* Compares __A with __B via the vcmpbf16512 mask builtin. __P is forwarded as
   an int operand and __U as the input mask; the result is a __mmask32. */
#define _mm512_mask_cmp_pbh_mask(__U, __A, __B, __P)                           \
  ((__mmask32)__builtin_ia32_vcmpbf16512_mask(                                 \
      (__v32bf)(__m512bh)(__A), (__v32bf)(__m512bh)(__B), (int)(__P),          \
      (__mmask32)(__U)))
298
/* Classifies the elements of __A via the vfpclassbf16512 mask builtin. The
   class selector is forwarded as an int operand and __U as the input mask. */
#define _mm512_mask_fpclass_pbh_mask(__U, __A, __imm)                          \
  ((__mmask32)__builtin_ia32_vfpclassbf16512_mask((__v32bf)(__m512bh)(__A),    \
                                                  (int)(__imm),                \
                                                  (__mmask32)(__U)))
302
/* Classifies the elements of __A via the vfpclassbf16512 mask builtin. The
   class selector is forwarded as an int operand; the input mask is all ones. */
#define _mm512_fpclass_pbh_mask(__A, __imm)                                    \
  ((__mmask32)__builtin_ia32_vfpclassbf16512_mask((__v32bf)(__m512bh)(__A),    \
                                                  (int)(__imm),                \
                                                  (__mmask32)(-1)))
306
307static __inline__ __m512bh __DEFAULT_FN_ATTRS512
308_mm512_scalef_pbh(__m512bh __A, __m512bh __B) {
309  return (__m512bh)__builtin_ia32_vscalefbf16512_mask(
310      (__v32bf)__A, (__v32bf)__B, (__v32bf)_mm512_undefined_pbh(),
311      (__mmask32)-1);
312}
313
314static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_mask_scalef_pbh(
315    __m512bh __W, __mmask32 __U, __m512bh __A, __m512bh __B) {
316  return (__m512bh)__builtin_ia32_vscalefbf16512_mask(
317      (__v32bf)__A, (__v32bf)__B, (__v32bf)__W, (__mmask32)__U);
318}
319
320static __inline__ __m512bh __DEFAULT_FN_ATTRS512
321_mm512_maskz_scalef_pbh(__mmask32 __U, __m512bh __A, __m512bh __B) {
322  return (__m512bh)__builtin_ia32_vscalefbf16512_mask(
323      (__v32bf)__A, (__v32bf)__B, (__v32bf)_mm512_setzero_pbh(),
324      (__mmask32)__U);
325}
326
327static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_rcp_pbh(__m512bh __A) {
328  return (__m512bh)__builtin_ia32_vrcpbf16512_mask(
329      (__v32bf)__A, (__v32bf)_mm512_undefined_pbh(), (__mmask32)-1);
330}
331
332static __inline__ __m512bh __DEFAULT_FN_ATTRS512
333_mm512_mask_rcp_pbh(__m512bh __W, __mmask32 __U, __m512bh __A) {
334  return (__m512bh)__builtin_ia32_vrcpbf16512_mask((__v32bf)__A, (__v32bf)__W,
335                                                   (__mmask32)__U);
336}
337
338static __inline__ __m512bh __DEFAULT_FN_ATTRS512
339_mm512_maskz_rcp_pbh(__mmask32 __U, __m512bh __A) {
340  return (__m512bh)__builtin_ia32_vrcpbf16512_mask(
341      (__v32bf)__A, (__v32bf)_mm512_setzero_pbh(), (__mmask32)__U);
342}
343
344static __inline__ __m512bh __DEFAULT_FN_ATTRS512
345_mm512_getexp_pbh(__m512bh __A) {
346  return (__m512bh)__builtin_ia32_vgetexpbf16512_mask(
347      (__v32bf)__A, (__v32bf)_mm512_undefined_pbh(), (__mmask32)-1);
348}
349
350static __inline__ __m512bh __DEFAULT_FN_ATTRS512
351_mm512_mask_getexp_pbh(__m512bh __W, __mmask32 __U, __m512bh __A) {
352  return (__m512bh)__builtin_ia32_vgetexpbf16512_mask(
353      (__v32bf)__A, (__v32bf)__W, (__mmask32)__U);
354}
355
356static __inline__ __m512bh __DEFAULT_FN_ATTRS512
357_mm512_maskz_getexp_pbh(__mmask32 __U, __m512bh __A) {
358  return (__m512bh)__builtin_ia32_vgetexpbf16512_mask(
359      (__v32bf)__A, (__v32bf)_mm512_setzero_pbh(), (__mmask32)__U);
360}
361
362static __inline__ __m512bh __DEFAULT_FN_ATTRS512
363_mm512_rsqrt_pbh(__m512bh __A) {
364  return (__m512bh)__builtin_ia32_vrsqrtbf16512_mask(
365      (__v32bf)__A, (__v32bf)_mm512_undefined_pbh(), (__mmask32)-1);
366}
367
368static __inline__ __m512bh __DEFAULT_FN_ATTRS512
369_mm512_mask_rsqrt_pbh(__m512bh __W, __mmask32 __U, __m512bh __A) {
370  return (__m512bh)__builtin_ia32_vrsqrtbf16512_mask((__v32bf)__A, (__v32bf)__W,
371                                                     (__mmask32)__U);
372}
373
374static __inline__ __m512bh __DEFAULT_FN_ATTRS512
375_mm512_maskz_rsqrt_pbh(__mmask32 __U, __m512bh __A) {
376  return (__m512bh)__builtin_ia32_vrsqrtbf16512_mask(
377      (__v32bf)__A, (__v32bf)_mm512_setzero_pbh(), (__mmask32)__U);
378}
379
/* Calls the vreducebf16512 mask builtin on __A. The control value is forwarded
   as an int operand, the pass-through operand is _mm512_undefined_pbh(), and
   the mask is all ones. */
#define _mm512_reduce_pbh(__A, __imm)                                          \
  ((__m512bh)__builtin_ia32_vreducebf16512_mask(                               \
      (__v32bf)(__m512bh)(__A), (int)(__imm),                                  \
      (__v32bf)_mm512_undefined_pbh(), (__mmask32)(-1)))
384
/* Calls the vreducebf16512 mask builtin on __A. The control value is forwarded
   as an int operand, __W is the pass-through operand, and __U is the mask. */
#define _mm512_mask_reduce_pbh(__W, __U, __A, __imm)                           \
  ((__m512bh)__builtin_ia32_vreducebf16512_mask(                               \
      (__v32bf)(__m512bh)(__A), (int)(__imm),                                  \
      (__v32bf)(__m512bh)(__W), (__mmask32)(__U)))
389
/* Calls the vreducebf16512 mask builtin on __A. The control value is forwarded
   as an int operand, a zero vector is the pass-through operand, and __U is
   the mask. */
#define _mm512_maskz_reduce_pbh(__U, __A, __imm)                               \
  ((__m512bh)__builtin_ia32_vreducebf16512_mask(                               \
      (__v32bf)(__m512bh)(__A), (int)(__imm),                                  \
      (__v32bf)_mm512_setzero_pbh(), (__mmask32)(__U)))
394
/* Calls the vrndscalebf16 mask builtin on __A. The control value is forwarded
   as an int operand, a zero vector is the pass-through operand, and the mask
   is all ones. */
#define _mm512_roundscale_pbh(__A, __imm)                                      \
  ((__m512bh)__builtin_ia32_vrndscalebf16_mask(                                \
      (__v32bf)(__m512bh)(__A), (int)(__imm),                                  \
      (__v32bf)_mm512_setzero_pbh(), (__mmask32)(-1)))
399
/* Calls the vrndscalebf16 mask builtin on __A. The control value is forwarded
   as an int operand, __W is the pass-through operand, and __U is the mask. */
#define _mm512_mask_roundscale_pbh(__W, __U, __A, __imm)                       \
  ((__m512bh)__builtin_ia32_vrndscalebf16_mask(                                \
      (__v32bf)(__m512bh)(__A), (int)(__imm),                                  \
      (__v32bf)(__m512bh)(__W), (__mmask32)(__U)))
404
/* Calls the vrndscalebf16 mask builtin on __A. The control value is forwarded
   as an int operand, a zero vector is the pass-through operand, and __U is
   the mask. */
#define _mm512_maskz_roundscale_pbh(__U, __A, __imm)                           \
  ((__m512bh)__builtin_ia32_vrndscalebf16_mask(                                \
      (__v32bf)(__m512bh)(__A), (int)(__imm),                                  \
      (__v32bf)_mm512_setzero_pbh(), (__mmask32)(__U)))
409
/* Calls the vgetmantbf16512 mask builtin on __A. The int control operand is
   built as ((__C) << 2) | (__B); the pass-through operand is
   _mm512_undefined_pbh() and the mask is all ones. */
#define _mm512_getmant_pbh(__A, __B, __C)                                      \
  ((__m512bh)__builtin_ia32_vgetmantbf16512_mask(                              \
      (__v32bf)(__m512bh)(__A),                                                \
      (int)(((__C) << 2) | (__B)),                                             \
      (__v32bf)_mm512_undefined_pbh(),                                         \
      (__mmask32)(-1)))
414
/* Calls the vgetmantbf16512 mask builtin on __A. The int control operand is
   built as ((__C) << 2) | (__B); __W is the pass-through operand and __U is
   the mask. */
#define _mm512_mask_getmant_pbh(__W, __U, __A, __B, __C)                       \
  ((__m512bh)__builtin_ia32_vgetmantbf16512_mask(                              \
      (__v32bf)(__m512bh)(__A),                                                \
      (int)(((__C) << 2) | (__B)),                                             \
      (__v32bf)(__m512bh)(__W),                                                \
      (__mmask32)(__U)))
419
/* Calls the vgetmantbf16512 mask builtin on __A. The int control operand is
   built as ((__C) << 2) | (__B); a zero vector is the pass-through operand
   and __U is the mask. */
#define _mm512_maskz_getmant_pbh(__U, __A, __B, __C)                           \
  ((__m512bh)__builtin_ia32_vgetmantbf16512_mask(                              \
      (__v32bf)(__m512bh)(__A),                                                \
      (int)(((__C) << 2) | (__B)),                                             \
      (__v32bf)_mm512_setzero_pbh(),                                           \
      (__mmask32)(__U)))
424
425static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_sqrt_pbh(__m512bh __A) {
426  return (__m512bh)__builtin_ia32_vsqrtbf16512((__v32bf)__A);
427}
428
429static __inline__ __m512bh __DEFAULT_FN_ATTRS512
430_mm512_mask_sqrt_pbh(__m512bh __W, __mmask32 __U, __m512bh __A) {
431  return (__m512bh)__builtin_ia32_selectpbf_512(
432      (__mmask32)__U, (__v32bf)_mm512_sqrt_pbh(__A), (__v32bf)__W);
433}
434
435static __inline__ __m512bh __DEFAULT_FN_ATTRS512
436_mm512_maskz_sqrt_pbh(__mmask32 __U, __m512bh __A) {
437  return (__m512bh)__builtin_ia32_selectpbf_512((__mmask32)__U,
438                                                (__v32bf)_mm512_sqrt_pbh(__A),
439                                                (__v32bf)_mm512_setzero_pbh());
440}
441
442static __inline__ __m512bh __DEFAULT_FN_ATTRS512
443_mm512_fmadd_pbh(__m512bh __A, __m512bh __B, __m512bh __C) {
444  return (__m512bh)__builtin_ia32_vfmaddbf16512((__v32bf)__A, (__v32bf)__B,
445                                                (__v32bf)__C);
446}
447
448static __inline__ __m512bh __DEFAULT_FN_ATTRS512
449_mm512_mask_fmadd_pbh(__m512bh __A, __mmask32 __U, __m512bh __B, __m512bh __C) {
450  return (__m512bh)__builtin_ia32_selectpbf_512(
451      (__mmask32)__U,
452      _mm512_fmadd_pbh((__v32bf)__A, (__v32bf)__B, (__v32bf)__C), (__v32bf)__A);
453}
454
455static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_mask3_fmadd_pbh(
456    __m512bh __A, __m512bh __B, __m512bh __C, __mmask32 __U) {
457  return (__m512bh)__builtin_ia32_selectpbf_512(
458      (__mmask32)__U,
459      _mm512_fmadd_pbh((__v32bf)__A, (__v32bf)__B, (__v32bf)__C), (__v32bf)__C);
460}
461
462static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_maskz_fmadd_pbh(
463    __mmask32 __U, __m512bh __A, __m512bh __B, __m512bh __C) {
464  return (__m512bh)__builtin_ia32_selectpbf_512(
465      (__mmask32)__U,
466      _mm512_fmadd_pbh((__v32bf)__A, (__v32bf)__B, (__v32bf)__C),
467      (__v32bf)_mm512_setzero_pbh());
468}
469
470static __inline__ __m512bh __DEFAULT_FN_ATTRS512
471_mm512_fmsub_pbh(__m512bh __A, __m512bh __B, __m512bh __C) {
472  return (__m512bh)__builtin_ia32_vfmaddbf16512((__v32bf)__A, (__v32bf)__B,
473                                                -(__v32bf)__C);
474}
475
476static __inline__ __m512bh __DEFAULT_FN_ATTRS512
477_mm512_mask_fmsub_pbh(__m512bh __A, __mmask32 __U, __m512bh __B, __m512bh __C) {
478  return (__m512bh)__builtin_ia32_selectpbf_512(
479      (__mmask32)__U,
480      _mm512_fmsub_pbh((__v32bf)__A, (__v32bf)__B, (__v32bf)__C), (__v32bf)__A);
481}
482
483static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_mask3_fmsub_pbh(
484    __m512bh __A, __m512bh __B, __m512bh __C, __mmask32 __U) {
485  return (__m512bh)__builtin_ia32_selectpbf_512(
486      (__mmask32)__U,
487      _mm512_fmsub_pbh((__v32bf)__A, (__v32bf)__B, (__v32bf)__C), (__v32bf)__C);
488}
489
490static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_maskz_fmsub_pbh(
491    __mmask32 __U, __m512bh __A, __m512bh __B, __m512bh __C) {
492  return (__m512bh)__builtin_ia32_selectpbf_512(
493      (__mmask32)__U,
494      _mm512_fmsub_pbh((__v32bf)__A, (__v32bf)__B, (__v32bf)__C),
495      (__v32bf)_mm512_setzero_pbh());
496}
497
498static __inline__ __m512bh __DEFAULT_FN_ATTRS512
499_mm512_fnmadd_pbh(__m512bh __A, __m512bh __B, __m512bh __C) {
500  return (__m512bh)__builtin_ia32_vfmaddbf16512((__v32bf)__A, -(__v32bf)__B,
501                                                (__v32bf)__C);
502}
503
504static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_mask_fnmadd_pbh(
505    __m512bh __A, __mmask32 __U, __m512bh __B, __m512bh __C) {
506  return (__m512bh)__builtin_ia32_selectpbf_512(
507      (__mmask32)__U,
508      _mm512_fnmadd_pbh((__v32bf)__A, (__v32bf)__B, (__v32bf)__C),
509      (__v32bf)__A);
510}
511
512static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_mask3_fnmadd_pbh(
513    __m512bh __A, __m512bh __B, __m512bh __C, __mmask32 __U) {
514  return (__m512bh)__builtin_ia32_selectpbf_512(
515      (__mmask32)__U,
516      _mm512_fnmadd_pbh((__v32bf)__A, (__v32bf)__B, (__v32bf)__C),
517      (__v32bf)__C);
518}
519
520static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_maskz_fnmadd_pbh(
521    __mmask32 __U, __m512bh __A, __m512bh __B, __m512bh __C) {
522  return (__m512bh)__builtin_ia32_selectpbf_512(
523      (__mmask32)__U,
524      _mm512_fnmadd_pbh((__v32bf)__A, (__v32bf)__B, (__v32bf)__C),
525      (__v32bf)_mm512_setzero_pbh());
526}
527
528static __inline__ __m512bh __DEFAULT_FN_ATTRS512
529_mm512_fnmsub_pbh(__m512bh __A, __m512bh __B, __m512bh __C) {
530  return (__m512bh)__builtin_ia32_vfmaddbf16512((__v32bf)__A, -(__v32bf)__B,
531                                                -(__v32bf)__C);
532}
533
534static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_mask_fnmsub_pbh(
535    __m512bh __A, __mmask32 __U, __m512bh __B, __m512bh __C) {
536  return (__m512bh)__builtin_ia32_selectpbf_512(
537      (__mmask32)__U,
538      _mm512_fnmsub_pbh((__v32bf)__A, (__v32bf)__B, (__v32bf)__C),
539      (__v32bf)__A);
540}
541
542static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_mask3_fnmsub_pbh(
543    __m512bh __A, __m512bh __B, __m512bh __C, __mmask32 __U) {
544  return (__m512bh)__builtin_ia32_selectpbf_512(
545      (__mmask32)__U,
546      _mm512_fnmsub_pbh((__v32bf)__A, (__v32bf)__B, (__v32bf)__C),
547      (__v32bf)__C);
548}
549
550static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_maskz_fnmsub_pbh(
551    __mmask32 __U, __m512bh __A, __m512bh __B, __m512bh __C) {
552  return (__m512bh)__builtin_ia32_selectpbf_512(
553      (__mmask32)__U,
554      _mm512_fnmsub_pbh((__v32bf)__A, (__v32bf)__B, (__v32bf)__C),
555      (__v32bf)_mm512_setzero_pbh());
556}
557
558#undef __DEFAULT_FN_ATTRS512
559
560#endif
561#endif