zig/lib/include/avx10_2convertintrin.h at master

   1/*===--------------- avx10_2convertintrin.h - AVX10_2CONVERT ---------------===
   2 *
   3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4 * See https://llvm.org/LICENSE.txt for license information.
   5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6 *
   7 *===-----------------------------------------------------------------------===
   8 */
   9#ifndef __IMMINTRIN_H
  10#error                                                                         \
  11    "Never use <avx10_2convertintrin.h> directly; include <immintrin.h> instead."
  12#endif // __IMMINTRIN_H
  13
  14#ifdef __SSE2__
  15
  16#ifndef __AVX10_2CONVERTINTRIN_H
  17#define __AVX10_2CONVERTINTRIN_H
  18
/* Default attributes for the intrinsics defined in this file.
 *
 * Both macros expand to the same attribute list -- __always_inline__,
 * __nodebug__ and __target__("avx10.2-256") -- and differ only in the
 * argument given to __min_vector_width__:
 *   __DEFAULT_FN_ATTRS128 uses __min_vector_width__(128)
 *   __DEFAULT_FN_ATTRS256 uses __min_vector_width__(256)
 */
#define __DEFAULT_FN_ATTRS128                                                  \
  __attribute__((__always_inline__, __nodebug__, __target__("avx10.2-256"),    \
                 __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256                                                  \
  __attribute__((__always_inline__, __nodebug__, __target__("avx10.2-256"),    \
                 __min_vector_width__(256)))
  26
  27// clang-format off
  28
  29/// Convert two 128-bit vectors, \a __A and \a __B, containing packed
  30///    single-precision (32-bit) floating-point elements to a 128-bit vector
  31///    containing FP16 elements.
  32///
  33/// \code{.operation}
  34/// FOR i := 0 to 7
  35/// 	IF i < 4
  36/// 		dst.fp16[i] := convert_fp32_to_fp16(__B.fp32[i])
  37/// 	ELSE
  38/// 		dst.fp16[i] := convert_fp32_to_fp16(__A.fp32[i - 4])
  39/// 	FI
  40///
  41/// ENDFOR
  42///
  43/// dst[MAX:128] := 0
  44/// \endcode
  45///
  46/// \headerfile <immintrin.h>
  47///
  48/// This intrinsic corresponds to the \c VCVT2PS2PHX instruction.
  49///
  50/// \param __A
  51///    A 128-bit vector of [4 x float].
  52/// \param __B
  53///    A 128-bit vector of [4 x float].
  54/// \returns
  55///    A 128-bit vector of [8 x fp16]. Lower 4 elements correspond to the
  56///    (converted) elements from \a __B; higher order elements correspond to the
  57///    (converted) elements from \a __A.
  58static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtx2ps_ph(__m128 __A,
  59                                                               __m128 __B) {
  60  return (__m128h)__builtin_ia32_vcvt2ps2phx128_mask(
  61      (__v4sf)__A, (__v4sf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)(-1));
  62}
  63
  64/// Convert two 128-bit vectors, \a __A and \a __B, containing packed
  65///    single-precision (32-bit) floating-point elements to a 128-bit vector
  66///    containing FP16 elements. Merging mask \a __U is used to determine if given
  67///    element should be taken from \a __W instead.
  68///
  69/// \code{.operation}
  70/// FOR i := 0 to 7
  71/// 	IF __U[i]
  72/// 		IF i < 4
  73/// 			dst.fp16[i] := convert_fp32_to_fp16(__B.fp32[i])
  74/// 		ELSE
  75/// 			dst.fp16[i] := convert_fp32_to_fp16(__A.fp32[i - 4])
  76/// 		FI
  77/// 	ELSE
  78/// 		dst.fp16[i] := __W.fp16[i]
  79/// 	FI
  80/// ENDFOR
  81///
  82/// dst[MAX:128] := 0
  83/// \endcode
  84///
  85/// \headerfile <immintrin.h>
  86///
  87/// This intrinsic corresponds to the \c VCVT2PS2PHX instruction.
  88///
  89/// \param __W
  90///    A 128-bit vector of [8 x fp16].
  91/// \param __U
  92///    A 8-bit merging mask.
  93/// \param __A
  94///    A 128-bit vector of [4 x float].
  95/// \param __B
  96///    A 128-bit vector of [4 x float].
  97/// \returns
  98///    A 128-bit vector of [8 x fp16]. Lower elements correspond to the
  99///    (converted) elements from \a __B; higher order elements correspond to the
 100///    (converted) elements from \a __A. If corresponding mask bit is not set, then
 101///    element from \a __W is taken instead.
 102static __inline__ __m128h __DEFAULT_FN_ATTRS128
 103_mm_mask_cvtx2ps_ph(__m128h __W, __mmask8 __U, __m128 __A, __m128 __B) {
 104  return (__m128h)__builtin_ia32_vcvt2ps2phx128_mask(
 105      (__v4sf)__A, (__v4sf)__B, (__v8hf)__W, (__mmask8)__U);
 106}
 107
 108/// Convert two 128-bit vectors, \a __A and \a __B, containing packed
 109///    single-precision (32-bit) floating-point elements to a 128-bit vector
 110///    containing FP16 elements. Zeroing mask \a __U is used to determine if given
 111///    element should be zeroed instead.
 112///
 113/// \code{.operation}
 114/// FOR i := 0 to 7
 115/// 	IF __U[i]
 116/// 		IF i < 4
 117/// 			dst.fp16[i] := convert_fp32_to_fp16(__B.fp32[i])
 118/// 		ELSE
 119/// 			dst.fp16[i] := convert_fp32_to_fp16(__A.fp32[i - 4])
 120/// 		FI
 121/// 	ELSE
 122/// 		dst.fp16[i] := 0
 123/// 	FI
 124/// ENDFOR
 125///
 126/// dst[MAX:128] := 0
 127/// \endcode
 128///
 129/// \headerfile <immintrin.h>
 130///
 131/// This intrinsic corresponds to the \c VCVT2PS2PHX instruction.
 132///
 133/// \param __U
 134///    A 8-bit zeroing mask.
 135/// \param __A
 136///    A 128-bit vector of [4 x float].
 137/// \param __B
 138///    A 128-bit vector of [4 x float].
 139/// \returns
 140///    A 128-bit vector of [8 x fp16]. Lower elements correspond to the
 141///    (converted) elements from \a __B; higher order elements correspond to the
 142///    (converted) elements from \a __A. If corresponding mask bit is not set,
 143///    then zero is taken instead.
 144static __inline__ __m128h __DEFAULT_FN_ATTRS128
 145_mm_maskz_cvtx2ps_ph(__mmask8 __U, __m128 __A, __m128 __B) {
 146  return (__m128h)__builtin_ia32_vcvt2ps2phx128_mask(
 147      (__v4sf)__A, (__v4sf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
 148}
 149
 150/// Convert two 256-bit vectors, \a __A and \a __B, containing packed
 151///    single-precision (32-bit) floating-point elements to a 256-bit vector
 152///    containing FP16 elements.
 153///   
 154/// \code{.operation}
 155/// FOR i := 0 to 15 
 156/// 	IF i < 8
 157/// 		dst.fp16[i] := convert_fp32_to_fp16(__B.fp32[i])
 158/// 	ELSE
 159/// 		dst.fp16[i] := convert_fp32_to_fp16(__A.fp32[i - 8])
 160/// 	FI
 161/// ENDFOR
 162///
 163/// dst[MAX:256] := 0
 164/// \endcode
 165///
 166/// \headerfile <immintrin.h>
 167///
 168/// This intrinsic corresponds to the \c VCVT2PS2PHX instruction.
 169///
 170/// \param __A
 171///    A 256-bit vector of [8 x float].
 172/// \param __B
 173///    A 256-bit vector of [8 x float].
 174/// \returns
 175///    A 256-bit vector of [16 x fp16]. Lower elements correspond to the
 176///    (converted) elements from \a __B; higher order elements correspond to the
 177///    (converted) elements from \a __A.
 178static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_cvtx2ps_ph(__m256 __A,
 179                                                                  __m256 __B) {
 180  return (__m256h)__builtin_ia32_vcvt2ps2phx256_mask(
 181      (__v8sf)__A, (__v8sf)__B, (__v16hf)_mm256_setzero_ph(), (__mmask16)(-1));
 182}
 183
 184/// Convert two 256-bit vectors, \a __A and \a __B, containing packed
 185///    single-precision (32-bit) floating-point elements to a 256-bit vector
 186///    containing FP16 elements. Merging mask \a __U is used to determine if given
 187///    element should be taken from \a __W instead.
 188///
 189/// \code{.operation}
 190/// FOR i := 0 to 15
 191/// 	IF __U[i]
 192/// 		IF i < 8
 193/// 			dst.fp16[i] := convert_fp32_to_fp16(__B.fp32[i])
 194/// 		ELSE
 195/// 			dst.fp16[i] := convert_fp32_to_fp16(__A.fp32[i - 8])
 196/// 		FI
 197/// 	ELSE
 198/// 		dst.fp16[i] := __W.fp16[i]
 199/// 	FI
 200/// ENDFOR
 201///
 202/// dst[MAX:256] := 0
 203/// \endcode
 204///
 205/// \headerfile <immintrin.h>
 206///
 207/// This intrinsic corresponds to the \c VCVT2PS2PHX instruction.
 208///
 209/// \param __W
 210///    A 256-bit vector of [16 x fp16].
 211/// \param __U
 212///    A 16-bit merging mask.
 213/// \param __A
 214///    A 256-bit vector of [8 x float].
 215/// \param __B
 216///    A 256-bit vector of [8 x float].
 217/// \returns
 218///    A 256-bit vector of [16 x fp16]. Lower elements correspond to the
 219///    (converted) elements from \a __B; higher order elements correspond to the
 220///    (converted) elements from \a __A. If corresponding mask bit is not set, then
 221///    element from \a __W is taken instead.
 222static __inline__ __m256h __DEFAULT_FN_ATTRS256
 223_mm256_mask_cvtx2ps_ph(__m256h __W, __mmask16 __U, __m256 __A, __m256 __B) {
 224  return (__m256h)__builtin_ia32_vcvt2ps2phx256_mask(
 225      (__v8sf)__A, (__v8sf)__B, (__v16hf)__W, (__mmask16)__U);
 226}
 227
 228/// Convert two 256-bit vectors, \a __A and \a __B, containing packed
 229///    single-precision (32-bit) floating-point elements to a 256-bit vector
 230///    containing FP16 elements. Zeroing mask \a __U is used to determine if given
 231///    element should be zeroed instead.
 232///
 233/// \code{.operation}
 234/// FOR i := 0 to 15 
 235/// 	IF __U[i]
 236/// 		IF i < 8
 237/// 			dst.fp16[i] := convert_fp32_to_fp16(__B.fp32[i])
 238/// 		ELSE
 239/// 			dst.fp16[i] := convert_fp32_to_fp16(__A.fp32[i - 8])
 240/// 		FI
 241/// 	ELSE
 242/// 		dst.fp16[i] := 0
 243/// 	FI
 244/// ENDFOR
 245///
 246/// dst[MAX:256] := 0
 247/// \endcode
 248///
 249/// \headerfile <immintrin.h>
 250///
 251/// This intrinsic corresponds to the \c VCVT2PS2PHX instruction.
 252///
 253/// \param __U
 254///    A 16-bit zeroing mask.
 255/// \param __A
 256///    A 256-bit vector of [8 x float].
 257/// \param __B
 258///    A 256-bit vector of [8 x float].
 259/// \returns
 260///    A 256-bit vector of [16 x fp16]. Lower elements correspond to the
 261///    (converted) elements from \a __B; higher order elements correspond to the
 262///    (converted) elements from \a __A. If corresponding mask bit is not set,
 263///    then zero is taken instead.
 264static __inline__ __m256h __DEFAULT_FN_ATTRS256
 265_mm256_maskz_cvtx2ps_ph(__mmask16 __U, __m256 __A, __m256 __B) {
 266  return (__m256h)__builtin_ia32_vcvt2ps2phx256_mask(
 267      (__v8sf)__A, (__v8sf)__B, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U);
 268}
 269
/// Convert 128-bit vector \a __B containing packed FP16 floating-point elements
///    to FP8 E5M2 numbers, using conversion biases stored in lower 8 bits of each
///    16-bit integer stored in \a __A.
 273///
 274/// \code{.operation}
 275/// FOR i := 0 to 7
 276/// 	dst.bf8[i] := convert_fp16_to_bf8_with_bias(__A.int8[2 * i], __B.fp16[i])
 277/// ENDFOR
 278///
 279/// dst[MAX:64] := 0
 280/// \endcode
 281///
 282/// \headerfile <immintrin.h>
 283///
 284/// This intrinsic corresponds to the \c VCVTBIASPH2BF8 instruction.
 285///
 286/// \param __A
 287///    A 128-bit vector of [8 x int16].
 288/// \param __B
 289///    A 128-bit vector of [8 x fp16].
 290/// \returns
 291///    A 128-bit vector of [16 x bf8]. Lower elements correspond to the
 292///    converted elements from \a __B using biases from \a __A; higher order
 293///    elements are zeroed.
 294static __inline__ __m128i __DEFAULT_FN_ATTRS128
 295_mm_cvtbiasph_bf8(__m128i __A, __m128h __B) {
 296  return (__m128i)__builtin_ia32_vcvtbiasph2bf8_128_mask(
 297      (__v16qi)__A, (__v8hf)__B, (__v16qi)_mm_undefined_si128(), (__mmask8)-1);
 298}
 299
/// Convert 128-bit vector \a __B containing packed FP16 floating-point elements
///    to FP8 E5M2 numbers, using conversion biases stored in lower 8 bits of each
///    16-bit integer stored in \a __A. Merging mask \a __U is used to determine if
///    given element should be taken from \a __W instead.
 304///
 305/// \code{.operation}
 306/// FOR i := 0 to 7
 307/// 	IF __U[i]
 308/// 		dst.bf8[i] := convert_fp16_to_bf8_with_bias(__A.int8[2 * i], __B.fp16[i])
 309/// 	ELSE
 310/// 		dst.bf8[i] := __W.bf8[i]
 311/// 	FI
 312/// ENDFOR
 313///
 314/// dst[MAX:64] := 0
 315/// \endcode
 316///
 317/// \headerfile <immintrin.h>
 318///
 319/// This intrinsic corresponds to the \c VCVTBIASPH2BF8 instruction.
 320///
 321/// \param __W
 322///    A 128-bit vector of [16 x bf8].
 323/// \param __U
 324///    A 8-bit merging mask.
 325/// \param __A
 326///    A 128-bit vector of [8 x int16].
 327/// \param __B
 328///    A 128-bit vector of [8 x fp16].
 329/// \returns
 330///    A 128-bit vector of [16 x bf8]. Lower elements correspond to the
 331///    converted elements from \a __B, using biases from \a __A; higher order
 332///    elements are zeroed. If corresponding mask bit is not set, then element
 333///    from \a __W is taken instead.
 334static __inline__ __m128i __DEFAULT_FN_ATTRS128
 335_mm_mask_cvtbiasph_bf8(__m128i __W, __mmask8 __U, __m128i __A, __m128h __B) {
 336  return (__m128i)__builtin_ia32_vcvtbiasph2bf8_128_mask(
 337      (__v16qi)__A, (__v8hf)__B, (__v16qi)(__m128i)__W, (__mmask8)__U);
 338}
 339
/// Convert 128-bit vector \a __B containing packed FP16 floating-point elements
///    to FP8 E5M2 numbers, using conversion biases stored in lower 8 bits of each
///    16-bit integer stored in \a __A. Zeroing mask \a __U is used to determine if
///    given element should be zeroed instead.
 344///
 345/// \code{.operation}
 346/// FOR i := 0 to 7
 347/// 	IF __U[i]
 348///	 	dst.bf8[i] := convert_fp16_to_bf8_with_bias(__A.int8[2 * i], __B.fp16[i])
 349///	 ELSE
 350///	 	dst.bf8[i] := 0
 351///	 FI
 352/// ENDFOR
 353///
 354/// dst[MAX:64] := 0
 355/// \endcode
 356///
 357/// \headerfile <immintrin.h>
 358///
 359/// This intrinsic corresponds to the \c VCVTBIASPH2BF8 instruction.
 360///
 361/// \param __U
 362///    A 8-bit zeroing mask.
 363/// \param __A
 364///    A 128-bit vector of [8 x int16].
 365/// \param __B
 366///    A 128-bit vector of [8 x fp16].
 367/// \returns
 368///    A 128-bit vector of [16 x bf8]. Lower elements correspond to the
 369///    converted elements from \a __B, using biases from \a __A; higher order
 370///    elements are zeroed. If corresponding mask bit is not set, then element
 371///    is zeroed.
 372static __inline__ __m128i __DEFAULT_FN_ATTRS128
 373_mm_maskz_cvtbiasph_bf8(__mmask8 __U, __m128i __A, __m128h __B) {
 374  return (__m128i)__builtin_ia32_vcvtbiasph2bf8_128_mask(
 375      (__v16qi)__A, (__v8hf)__B, (__v16qi)(__m128i)_mm_setzero_si128(),
 376      (__mmask8)__U);
 377}
 378
/// Convert 256-bit vector \a __B containing packed FP16 floating-point elements
///    to FP8 E5M2 numbers, using conversion biases stored in lower 8 bits of each
///    16-bit integer stored in \a __A.
 382///
 383/// \code{.operation}
 384/// FOR i := 0 to 15
 385/// 	dst.bf8[i] := convert_fp16_to_bf8_with_bias(__A.int8[2 * i], __B.fp16[i])
 386/// ENDFOR
 387///
 388/// dst[MAX:128] := 0
 389/// \endcode
 390///
 391/// \headerfile <immintrin.h>
 392///
 393/// This intrinsic corresponds to the \c VCVTBIASPH2BF8 instruction.
 394///
 395/// \param __A
 396///    A 256-bit vector of [16 x int16].
 397/// \param __B
 398///    A 256-bit vector of [16 x fp16].
 399/// \returns
 400///    A 128-bit vector of [16 x bf8]. Elements correspond to the
 401///    converted elements from \a __B using biases from \a __A.
 402static __inline__ __m128i __DEFAULT_FN_ATTRS256
 403_mm256_cvtbiasph_bf8(__m256i __A, __m256h __B) {
 404  return (__m128i)__builtin_ia32_vcvtbiasph2bf8_256_mask(
 405      (__v32qi)__A, (__v16hf)__B, (__v16qi)(__m128i)_mm_undefined_si128(),
 406      (__mmask16)-1);
 407}
 408
/// Convert 256-bit vector \a __B containing packed FP16 floating-point elements
///    to FP8 E5M2 numbers, using conversion biases stored in lower 8 bits of each
///    16-bit integer stored in \a __A. Merging mask \a __U is used to determine if
///    given element should be taken from \a __W instead.
 413///
 414/// \code{.operation}
 415/// FOR i := 0 to 15
 416/// 	IF __U[i]
 417/// 		dst.bf8[i] := convert_fp16_to_bf8_with_bias(__A.int8[2 * i], __B.fp16[i])
 418/// 	ELSE
 419/// 		dst.bf8[i] := __W.bf8[i]
 420/// 	FI
 421/// ENDFOR
 422///
 423/// dst[MAX:128] := 0
 424/// \endcode
 425///
 426/// \headerfile <immintrin.h>
 427///
 428/// This intrinsic corresponds to the \c VCVTBIASPH2BF8 instruction.
 429///
 430/// \param __W
 431///    A 128-bit vector of [16 x bf8].
 432/// \param __U
 433///    A 16-bit merging mask.
 434/// \param __A
 435///    A 256-bit vector of [16 x int16].
 436/// \param __B
 437///    A 256-bit vector of [16 x fp16].
 438/// \returns
 439///    A 128-bit vector of [16 x bf8]. Elements correspond to the converted
 440///    elements from \a __B, using biases from \a __A. If corresponding mask bit
 441///    is not set, then element from \a __W is taken instead.
 442static __inline__ __m128i __DEFAULT_FN_ATTRS256 _mm256_mask_cvtbiasph_bf8(
 443    __m128i __W, __mmask16 __U, __m256i __A, __m256h __B) {
 444  return (__m128i)__builtin_ia32_vcvtbiasph2bf8_256_mask(
 445      (__v32qi)__A, (__v16hf)__B, (__v16qi)(__m128i)__W, (__mmask16)__U);
 446}
 447
/// Convert 256-bit vector \a __B containing packed FP16 floating-point elements
///    to FP8 E5M2 numbers, using conversion biases stored in lower 8 bits of each
///    16-bit integer stored in \a __A. Zeroing mask \a __U is used to determine if
///    given element should be zeroed instead.
 452///
 453/// \code{.operation}
 454/// FOR i := 0 to 15
 455/// 	IF __U[i]
 456///	 	dst.bf8[i] := convert_fp16_to_bf8_with_bias(__A.int8[2 * i], __B.fp16[i])
 457///	 ELSE
 458///	 	dst.bf8[i] := 0
 459///	 FI
 460/// ENDFOR
 461///
 462/// dst[MAX:128] := 0
 463/// \endcode
 464///
 465/// \headerfile <immintrin.h>
 466///
 467/// This intrinsic corresponds to the \c VCVTBIASPH2BF8 instruction.
 468///
 469/// \param __U
 470///    A 16-bit zeroing mask.
 471/// \param __A
 472///    A 256-bit vector of [16 x int16].
 473/// \param __B
 474///    A 256-bit vector of [16 x fp16].
 475/// \returns
 476///    A 128-bit vector of [16 x bf8]. Elements correspond to the converted
 477///    elements from \a __B, using biases from \a __A. If corresponding mask bit
 478///    is not set, then element is zeroed.
 479static __inline__ __m128i __DEFAULT_FN_ATTRS256
 480_mm256_maskz_cvtbiasph_bf8(__mmask16 __U, __m256i __A, __m256h __B) {
 481  return (__m128i)__builtin_ia32_vcvtbiasph2bf8_256_mask(
 482      (__v32qi)__A, (__v16hf)__B, (__v16qi)(__m128i)_mm_setzero_si128(),
 483      (__mmask16)__U);
 484}
 485
/// Convert 128-bit vector \a __B containing packed FP16 floating-point elements
///    to FP8 E5M2 numbers, using conversion biases stored in lower 8 bits of each
///    16-bit integer stored in \a __A. Results are saturated.
 489///
 490/// \code{.operation}
 491/// FOR i := 0 to 7
 492/// 	dst.bf8[i] := convert_fp16_to_bf8_with_bias_saturate(__A.int8[2 * i], __B.fp16[i])
 493/// ENDFOR
 494///
 495/// dst[MAX:64] := 0
 496/// \endcode
 497///
 498/// \headerfile <immintrin.h>
 499///
/// This intrinsic corresponds to the \c VCVTBIASPH2BF8S instruction.
 501///
 502/// \param __A
 503///    A 128-bit vector of [8 x int16].
 504/// \param __B
 505///    A 128-bit vector of [8 x fp16].
 506/// \returns
 507///    A 128-bit vector of [16 x bf8]. Lower elements correspond to the
 508///    converted elements from \a __B using biases from \a __A; higher order
 509///    elements are zeroed.
 510static __inline__ __m128i __DEFAULT_FN_ATTRS128
 511_mm_cvts_biasph_bf8(__m128i __A, __m128h __B) {
 512  return (__m128i)__builtin_ia32_vcvtbiasph2bf8s_128_mask(
 513      (__v16qi)__A, (__v8hf)__B, (__v16qi)_mm_undefined_si128(), (__mmask8)-1);
 514}
 515
/// Convert 128-bit vector \a __B containing packed FP16 floating-point elements
///    to FP8 E5M2 numbers, using conversion biases stored in lower 8 bits of each
///    16-bit integer stored in \a __A. Results are saturated. Merging mask \a __U
///    is used to determine if given element should be taken from \a __W instead.
 520///
 521/// \code{.operation}
 522/// FOR i := 0 to 7
 523/// 	IF __U[i]
 524/// 		dst.bf8[i] := convert_fp16_to_bf8_with_bias_saturate(__A.int8[2 * i], __B.fp16[i])
 525/// 	ELSE
 526/// 		dst.bf8[i] := __W.bf8[i]
 527/// 	FI
 528/// ENDFOR
 529///
 530/// dst[MAX:64] := 0
 531/// \endcode
 532///
 533/// \headerfile <immintrin.h>
 534///
 535/// This intrinsic corresponds to the \c VCVTBIASPH2BF8S instruction.
 536///
 537/// \param __W
 538///    A 128-bit vector of [16 x bf8].
 539/// \param __U
 540///    A 8-bit merging mask.
 541/// \param __A
 542///    A 128-bit vector of [8 x int16].
 543/// \param __B
 544///    A 128-bit vector of [8 x fp16].
 545/// \returns
 546///    A 128-bit vector of [16 x bf8]. Lower elements correspond to the
 547///    converted elements from \a __B, using biases from \a __A; higher order
 548///    elements are zeroed. If corresponding mask bit is not set, then element
 549///    from \a __W is taken instead.
 550static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_cvts_biasph_bf8(__m128i
 551		__W, __mmask8 __U, __m128i __A, __m128h __B) { return
 552	(__m128i)__builtin_ia32_vcvtbiasph2bf8s_128_mask( (__v16qi)__A,
 553			(__v8hf)__B, (__v16qi)(__m128i)__W, (__mmask8)__U); }
 554
/// Convert 128-bit vector \a __B containing packed FP16 floating-point elements
///    to FP8 E5M2 numbers, using conversion biases stored in lower 8 bits of each
///    16-bit integer stored in \a __A. Results are saturated. Zeroing mask \a __U
///    is used to determine if given element should be zeroed instead.
 559///
 560/// \code{.operation}
 561/// FOR i := 0 to 7
 562/// 	IF __U[i]
 563///	 	dst.bf8[i] := convert_fp16_to_bf8_with_bias_saturate(__A.int8[2 * i], __B.fp16[i])
 564///	 ELSE
 565///	 	dst.bf8[i] := 0
 566///	 FI
 567/// ENDFOR
 568///
 569/// dst[MAX:64] := 0
 570/// \endcode
 571///
 572/// \headerfile <immintrin.h>
 573///
 574/// This intrinsic corresponds to the \c VCVTBIASPH2BF8S instruction.
 575///
 576/// \param __U
 577///    A 8-bit zeroing mask.
 578/// \param __A
 579///    A 128-bit vector of [8 x int16].
 580/// \param __B
 581///    A 128-bit vector of [8 x fp16].
 582/// \returns
 583///    A 128-bit vector of [16 x bf8]. Lower elements correspond to the
 584///    converted elements from \a __B, using biases from \a __A; higher order
 585///    elements are zeroed. If corresponding mask bit is not set, then element
 586///    is zeroed.
 587static __inline__ __m128i __DEFAULT_FN_ATTRS128
 588_mm_maskz_cvts_biasph_bf8(__mmask8 __U, __m128i __A, __m128h __B) {
 589  return (__m128i)__builtin_ia32_vcvtbiasph2bf8s_128_mask(
 590      (__v16qi)__A, (__v8hf)__B, (__v16qi)(__m128i)_mm_setzero_si128(),
 591      (__mmask8)__U);
 592}
 593
 594
/// Convert 256-bit vector \a __B containing packed FP16 floating-point elements
///    to FP8 E5M2 numbers, using conversion biases stored in lower 8 bits of each
///    16-bit integer stored in \a __A. Results are saturated.
 598///
 599/// \code{.operation}
 600/// FOR i := 0 to 15
 601/// 	dst.bf8[i] := convert_fp16_to_bf8_with_bias_saturate(__A.int8[2 * i], __B.fp16[i])
 602/// ENDFOR
 603///
 604/// dst[MAX:128] := 0
 605/// \endcode
 606///
 607/// \headerfile <immintrin.h>
 608///
 609/// This intrinsic corresponds to the \c VCVTBIASPH2BF8S instruction.
 610///
 611/// \param __A
 612///    A 256-bit vector of [16 x int16].
 613/// \param __B
 614///    A 256-bit vector of [16 x fp16].
 615/// \returns
 616///    A 128-bit vector of [16 x bf8]. Elements correspond to the
 617///    converted elements from \a __B using biases from \a __A.
 618static __inline__ __m128i __DEFAULT_FN_ATTRS256
 619_mm256_cvts_biasph_bf8(__m256i __A, __m256h __B) {
 620  return (__m128i)__builtin_ia32_vcvtbiasph2bf8s_256_mask(
 621      (__v32qi)__A, (__v16hf)__B, (__v16qi)(__m128i)_mm_undefined_si128(),
 622      (__mmask16)-1);
 623}
 624
/// Convert 256-bit vector \a __B containing packed FP16 floating-point elements
///    to FP8 E5M2 numbers, using conversion biases stored in lower 8 bits of each
///    16-bit integer stored in \a __A. Results are saturated. Merging mask \a __U
///    is used to determine if given element should be taken from \a __W instead.
 629///
 630/// \code{.operation}
 631/// FOR i := 0 to 15
 632/// 	IF __U[i]
 633/// 		dst.bf8[i] := convert_fp16_to_bf8_with_bias_saturate(__A.int8[2 * i], __B.fp16[i])
 634/// 	ELSE
 635/// 		dst.bf8[i] := __W.bf8[i]
 636/// 	FI
 637/// ENDFOR
 638///
 639/// dst[MAX:128] := 0
 640/// \endcode
 641///
 642/// \headerfile <immintrin.h>
 643///
 644/// This intrinsic corresponds to the \c VCVTBIASPH2BF8S instruction.
 645///
 646/// \param __W
 647///    A 128-bit vector of [16 x bf8].
 648/// \param __U
 649///    A 16-bit merging mask.
 650/// \param __A
 651///    A 256-bit vector of [16 x int16].
 652/// \param __B
 653///    A 256-bit vector of [16 x fp16].
 654/// \returns
 655///    A 128-bit vector of [16 x bf8]. Elements correspond to the converted
 656///    elements from \a __B, using biases from \a __A. If corresponding mask bit
 657///    is not set, then element from \a __W is taken instead.
 658static __inline__ __m128i __DEFAULT_FN_ATTRS256 _mm256_mask_cvts_biasph_bf8(
 659    __m128i __W, __mmask16 __U, __m256i __A, __m256h __B) {
 660  return (__m128i)__builtin_ia32_vcvtbiasph2bf8s_256_mask(
 661      (__v32qi)__A, (__v16hf)__B, (__v16qi)(__m128i)__W, (__mmask16)__U);
 662}
 663
/// Convert 256-bit vector \a __B containing packed FP16 floating-point elements
///    to FP8 E5M2 numbers, using conversion biases stored in lower 8 bits of each
///    16-bit integer stored in \a __A. Results are saturated. Zeroing mask \a __U
///    is used to determine if given element should be zeroed instead.
 668///
 669/// \code{.operation}
 670/// FOR i := 0 to 15
 671/// 	IF __U[i]
 672///	 	dst.bf8[i] := convert_fp16_to_bf8_with_bias_saturate(__A.int8[2 * i], __B.fp16[i])
 673///	 ELSE
 674///	 	dst.bf8[i] := 0
 675///	 FI
 676/// ENDFOR
 677///
 678/// dst[MAX:128] := 0
 679/// \endcode
 680///
 681/// \headerfile <immintrin.h>
 682///
 683/// This intrinsic corresponds to the \c VCVTBIASPH2BF8S instruction.
 684///
 685/// \param __U
 686///    A 16-bit zeroing mask.
 687/// \param __A
 688///    A 256-bit vector of [16 x int16].
 689/// \param __B
 690///    A 256-bit vector of [16 x fp16].
 691/// \returns
 692///    A 128-bit vector of [16 x bf8]. Elements correspond to the converted
 693///    elements from \a __B, using biases from \a __A. If corresponding mask bit
 694///    is not set, then element is zeroed.
 695static __inline__ __m128i __DEFAULT_FN_ATTRS256
 696_mm256_maskz_cvts_biasph_bf8(__mmask16 __U, __m256i __A, __m256h __B) {
 697  return (__m128i)__builtin_ia32_vcvtbiasph2bf8s_256_mask(
 698      (__v32qi)__A, (__v16hf)__B, (__v16qi)(__m128i)_mm_setzero_si128(),
 699      (__mmask16)__U);
 700}
 701
/// Convert 128-bit vector \a __B containing packed FP16 floating-point elements
///    to FP8 E4M3 numbers, using conversion biases stored in lower 8 bits of each
///    16-bit integer stored in \a __A.
 705///
 706/// \code{.operation}
 707/// FOR i := 0 to 7
 708/// 	dst.hf8[i] := convert_fp16_to_hf8_with_bias(__A.int8[2 * i], __B.fp16[i])
 709/// ENDFOR
 710///
 711/// dst[MAX:64] := 0
 712/// \endcode
 713///
 714/// \headerfile <immintrin.h>
 715///
 716/// This intrinsic corresponds to the \c VCVTBIASPH2HF8 instruction.
 717///
 718/// \param __A
 719///    A 128-bit vector of [8 x int16].
 720/// \param __B
 721///    A 128-bit vector of [8 x fp16].
 722/// \returns
 723///    A 128-bit vector of [16 x hf8]. Lower elements correspond to the
 724///    converted elements from \a __B using biases from \a __A; higher order
 725///    elements are zeroed.
 726static __inline__ __m128i __DEFAULT_FN_ATTRS128
 727_mm_cvtbiasph_hf8(__m128i __A, __m128h __B) {
 728  return (__m128i)__builtin_ia32_vcvtbiasph2hf8_128_mask(
 729      (__v16qi)__A, (__v8hf)__B, (__v16qi)_mm_undefined_si128(), (__mmask8)-1);
 730}
 731
/// Convert 128-bit vector \a __B containing packed FP16 floating-point elements
///    to FP8 E4M3 numbers, using conversion biases stored in lower 8 bits of each
///    16-bit integer stored in \a __A. Merging mask \a __U is used to determine if
///    given element should be taken from \a __W instead.
 736///
 737/// \code{.operation}
 738/// FOR i := 0 to 7
 739/// 	IF __U[i]
 740/// 		dst.hf8[i] := convert_fp16_to_hf8_with_bias(__A.int8[2 * i], __B.fp16[i])
 741/// 	ELSE
 742/// 		dst.hf8[i] := __W.hf8[i]
 743/// 	FI
 744/// ENDFOR
 745///
 746/// dst[MAX:64] := 0
 747/// \endcode
 748///
 749/// \headerfile <immintrin.h>
 750///
 751/// This intrinsic corresponds to the \c VCVTBIASPH2HF8 instruction.
 752///
 753/// \param __W
 754///    A 128-bit vector of [16 x hf8].
 755/// \param __U
 756///    A 8-bit merging mask.
 757/// \param __A
 758///    A 128-bit vector of [8 x int16].
 759/// \param __B
 760///    A 128-bit vector of [8 x fp16].
 761/// \returns
 762///    A 128-bit vector of [16 x hf8]. Lower elements correspond to the
 763///    converted elements from \a __B, using biases from \a __A; higher order
 764///    elements are zeroed. If corresponding mask bit is not set, then element
 765///    from \a __W is taken instead.
 766static __inline__ __m128i __DEFAULT_FN_ATTRS128
 767_mm_mask_cvtbiasph_hf8(__m128i __W, __mmask8 __U, __m128i __A, __m128h __B) {
 768  return (__m128i)__builtin_ia32_vcvtbiasph2hf8_128_mask(
 769      (__v16qi)__A, (__v8hf)__B, (__v16qi)(__m128i)__W, (__mmask8)__U);
 770}
 771
/// Convert 128-bit vector \a __B containing packed FP16 floating-point elements
///    to FP8 E4M3 numbers, using conversion biases stored in lower 8 bits of each
///    16-bit integer stored in \a __A. Zeroing mask \a __U is used to determine if
///    given element should be zeroed instead.
 776///
 777/// \code{.operation}
 778/// FOR i := 0 to 7
 779/// 	IF __U[i]
 780///	 	dst.hf8[i] := convert_fp16_to_hf8_with_bias(__A.int8[2 * i], __B.fp16[i])
 781///	 ELSE
 782///	 	dst.hf8[i] := 0
 783///	 FI
 784/// ENDFOR
 785///
 786/// dst[MAX:64] := 0
 787/// \endcode
 788///
 789/// \headerfile <immintrin.h>
 790///
 791/// This intrinsic corresponds to the \c VCVTBIASPH2HF8 instruction.
 792///
 793/// \param __U
 794///    A 8-bit zeroing mask.
 795/// \param __A
 796///    A 128-bit vector of [8 x int16].
 797/// \param __B
 798///    A 128-bit vector of [8 x fp16].
 799/// \returns
 800///    A 128-bit vector of [16 x hf8]. Lower elements correspond to the
 801///    converted elements from \a __B, using biases from \a __A; higher order
 802///    elements are zeroed. If corresponding mask bit is not set, then element
 803///    is zeroed.
 804static __inline__ __m128i __DEFAULT_FN_ATTRS128
 805_mm_maskz_cvtbiasph_hf8(__mmask8 __U, __m128i __A, __m128h __B) {
 806  return (__m128i)__builtin_ia32_vcvtbiasph2hf8_128_mask(
 807      (__v16qi)__A, (__v8hf)__B, (__v16qi)(__m128i)_mm_setzero_si128(),
 808      (__mmask8)__U);
 809}
 810
/// Convert 256-bit vector \a __B containing packed FP16 floating-point elements
///    to FP8 E4M3 numbers, using conversion biases stored in lower 8 bits of each
///    16-bit integer stored in \a __A.
 814///
 815/// \code{.operation}
 816/// FOR i := 0 to 15
 817/// 	dst.hf8[i] := convert_fp16_to_hf8_with_bias(__A.int8[2 * i], __B.fp16[i])
 818/// ENDFOR
 819///
 820/// dst[MAX:128] := 0
 821/// \endcode
 822///
 823/// \headerfile <immintrin.h>
 824///
 825/// This intrinsic corresponds to the \c VCVTBIASPH2HF8 instruction.
 826///
/// \param __A
///    A 256-bit vector of [16 x int16].
/// \param __B
///    A 256-bit vector of [16 x fp16].
 831/// \returns
 832///    A 128-bit vector of [16 x hf8]. Elements correspond to the
 833///    converted elements from \a __B using biases from \a __A.
 834static __inline__ __m128i __DEFAULT_FN_ATTRS256
 835_mm256_cvtbiasph_hf8(__m256i __A, __m256h __B) {
 836  return (__m128i)__builtin_ia32_vcvtbiasph2hf8_256_mask(
 837      (__v32qi)__A, (__v16hf)__B, (__v16qi)(__m128i)_mm_undefined_si128(),
 838      (__mmask16)-1);
 839}
 840
 841/// Convert 256-bit vector \a __B containing packed FP16 floating-point elements
 842///    to FP8 E4M3 numbers, using conversion biases stored in lower 8 bits of each
 843///    16-bit integer stored in \a __B. Merging mask \a __U is used to determine if
 844///    given element should be taken from \a __W instead.
 845///
 846/// \code{.operation}
 847/// FOR i := 0 to 15
 848/// 	IF __U[i]
 849/// 		dst.hf8[i] := convert_fp16_to_hf8_with_bias(__A.int8[2 * i], __B.fp16[i])
 850/// 	ELSE
 851/// 		dst.hf8[i] := __W.hf8[i]
 852/// 	FI
 853/// ENDFOR
 854///
 855/// dst[MAX:128] := 0
 856/// \endcode
 857///
 858/// \headerfile <immintrin.h>
 859///
 860/// This intrinsic corresponds to the \c VCVTBIASPH2HF8 instruction.
 861///
 862/// \param __W
 863///    A 128-bit vector of [16 x hf8].
 864/// \param __U
 865///    A 16-bit merging mask.
 866/// \param __A
 867///    A 256-bit vector of [16 x int16].
 868/// \param __B
 869///    A 256-bit vector of [16 x fp16].
 870/// \returns
 871///    A 128-bit vector of [16 x hf8]. Elements correspond to the converted
 872///    elements from \a __B, using biases from \a __A. If corresponding mask bit
 873///    is not set, then element from \a __W is taken instead.
 874static __inline__ __m128i __DEFAULT_FN_ATTRS256 _mm256_mask_cvtbiasph_hf8(
 875    __m128i __W, __mmask16 __U, __m256i __A, __m256h __B) {
 876  return (__m128i)__builtin_ia32_vcvtbiasph2hf8_256_mask(
 877      (__v32qi)__A, (__v16hf)__B, (__v16qi)(__m128i)__W, (__mmask16)__U);
 878}
 879
 880/// Convert 256-bit vector \a __B containing packed FP16 floating-point elements
 881///    to FP8 E4M3 numbers, using conversion biases stored in lower 8 bits of each
 882///    16-bit integer stored in \a __B. Zeroing mask \a __U is used to determine if
 883///    given element should be taken zeroed instead.
 884///
 885/// \code{.operation}
 886/// FOR i := 0 to 15
 887/// 	IF __U[i]
 888///	 	dst.hf8[i] := convert_fp16_to_hf8_with_bias(__A.int8[2 * i], __B.fp16[i])
 889///	 ELSE
 890///	 	dst.hf8[i] := 0
 891///	 FI
 892/// ENDFOR
 893///
 894/// dst[MAX:128] := 0
 895/// \endcode
 896///
 897/// \headerfile <immintrin.h>
 898///
 899/// This intrinsic corresponds to the \c VCVTBIASPH2HF8 instruction.
 900///
 901/// \param __U
 902///    A 16-bit zeroing mask.
 903/// \param __A
 904///    A 256-bit vector of [16 x half].
 905/// \param __B
 906///    A 256-bit vector of [16 x i16].
 907/// \returns
 908///    A 128-bit vector of [16 x hf8]. Elements correspond to the converted
 909///    elements from \a __B, using biases from \a __A. If corresponding mask bit
 910///    is not set, then element is zeroed.
 911static __inline__ __m128i __DEFAULT_FN_ATTRS256
 912_mm256_maskz_cvtbiasph_hf8(__mmask16 __U, __m256i __A, __m256h __B) {
 913  return (__m128i)__builtin_ia32_vcvtbiasph2hf8_256_mask(
 914      (__v32qi)__A, (__v16hf)__B, (__v16qi)(__m128i)_mm_setzero_si128(),
 915      (__mmask16)__U);
 916}
 917
 918/// Convert 128-bit vector \a __B containing packed FP16 floating-point elements
 919///    to FP8 E4M3 numbers, using conversion biases stored in lower 8 bits of each
 920///    16-bit integer stored in \a __B. Results are saturated.
 921///
 922/// \code{.operation}
 923/// FOR i := 0 to 7
 924/// 	dst.hf8[i] := convert_fp16_to_hf8_with_bias_saturate(__A.int8[2 * i], __B.fp16[i])
 925/// ENDFOR
 926///
 927/// dst[MAX:64] := 0
 928/// \endcode
 929///
 930/// \headerfile <immintrin.h>
 931///
 932/// This intrinsic corresponds to the \c VCVTBIASPH2HF8S`instruction.
 933///
 934/// \param __A
 935///    A 128-bit vector of [8 x int16].
 936/// \param __B
 937///    A 128-bit vector of [8 x fp16].
 938/// \returns
 939///    A 128-bit vector of [16 x hf8]. Lower elements correspond to the
 940///    converted elements from \a __B using biases from \a __A; higher order
 941///    elements are zeroed.
 942static __inline__ __m128i __DEFAULT_FN_ATTRS128
 943_mm_cvts_biasph_hf8(__m128i __A, __m128h __B) {
 944  return (__m128i)__builtin_ia32_vcvtbiasph2hf8s_128_mask(
 945      (__v16qi)__A, (__v8hf)__B, (__v16qi)_mm_undefined_si128(), (__mmask8)-1);
 946}
 947
 948/// Convert 128-bit vector \a __B containing packed FP16 floating-point elements
 949///    to FP8 E4M3 numbers, using conversion biases stored in lower 8 bits of each
 950///    16-bit integer stored in \a __B. Results are saturated. Merging mask \a __U
 951///    is used to determine if given element should be taken from \a __W instead.
 952///
 953/// \code{.operation}
 954/// FOR i := 0 to 7
 955/// 	IF __U[i]
 956/// 		dst.hf8[i] := convert_fp16_to_hf8_with_bias_saturate(__A.int8[2 * i], __B.fp16[i])
 957/// 	ELSE
 958/// 		dst.hf8[i] := __W.hf8[i]
 959/// 	FI
 960/// ENDFOR
 961///
 962/// dst[MAX:64] := 0
 963/// \endcode
 964///
 965/// \headerfile <immintrin.h>
 966///
 967/// This intrinsic corresponds to the \c VCVTBIASPH2HF8S instruction.
 968///
 969/// \param __W
 970///    A 128-bit vector of [16 x hf8].
 971/// \param __U
 972///    A 8-bit merging mask.
 973/// \param __A
 974///    A 128-bit vector of [8 x int16].
 975/// \param __B
 976///    A 128-bit vector of [8 x fp16].
 977/// \returns
 978///    A 128-bit vector of [16 x hf8]. Lower elements correspond to the
 979///    converted elements from \a __B, using biases from \a __A; higher order
 980///    elements are zeroed. If corresponding mask bit is not set, then element
 981///    from \a __W is taken instead.
 982static __inline__ __m128i __DEFAULT_FN_ATTRS128
 983_mm_mask_cvts_biasph_hf8(__m128i __W, __mmask8 __U, __m128i __A, __m128h __B) {
 984  return (__m128i)__builtin_ia32_vcvtbiasph2hf8s_128_mask(
 985      (__v16qi)__A, (__v8hf)__B, (__v16qi)(__m128i)__W, (__mmask8)__U);
 986}
 987
 988/// Convert 128-bit vector \a __B containing packed FP16 floating-point elements
 989///    to FP8 E4M3 numbers, using conversion biases stored in lower 8 bits of each
 990///    16-bit integer stored in \a __B. Results are saturated. Zeroing mask \a __U
 991///    is used to determine if given element should be zeroed instead.
 992///
 993/// \code{.operation}
 994/// FOR i := 0 to 7
 995/// 	IF __U[i]
 996///	 	dst.hf8[i] := convert_fp16_to_hf8_with_bias_saturate(__A.int8[2 * i], __B.fp16[i])
 997///	 ELSE
 998///	 	dst.hf8[i] := 0
 999///	 FI
1000/// ENDFOR
1001///
1002/// dst[MAX:64] := 0
1003/// \endcode
1004///
1005/// \headerfile <immintrin.h>
1006///
1007/// This intrinsic corresponds to the \c VCVTBIASPH2HF8S instruction.
1008///
1009/// \param __U
1010///    A 8-bit zeroing mask.
1011/// \param __A
1012///    A 128-bit vector of [8 x int16].
1013/// \param __B
1014///    A 128-bit vector of [8 x fp16].
1015/// \returns
1016///    A 128-bit vector of [16 x hf8]. Lower elements correspond to the
1017///    converted elements from \a __B, using biases from \a __A; higher order
1018///    elements are zeroed. If corresponding mask bit is not set, then element
1019///    is zeroed.
1020static __inline__ __m128i __DEFAULT_FN_ATTRS128
1021_mm_maskz_cvts_biasph_hf8(__mmask8 __U, __m128i __A, __m128h __B) {
1022  return (__m128i)__builtin_ia32_vcvtbiasph2hf8s_128_mask(
1023      (__v16qi)__A, (__v8hf)__B, (__v16qi)(__m128i)_mm_setzero_si128(),
1024      (__mmask8)__U);
1025}
1026
1027/// Convert 256-bit vector \a __B containing packed FP16 floating-point elements
1028///    to FP8 E4M3 numbers, using conversion biases stored in lower 8 bits of each
1029///    16-bit integer stored in \a __B. Results are saturated.
1030///
1031/// \code{.operation}
1032/// FOR i := 0 to 15
1033/// 	dst.hf8[i] := convert_fp16_to_hf8_with_bias_saturate(__A.int8[2 * i], __B.fp16[i])
1034/// ENDFOR
1035///
1036/// dst[MAX:128] := 0
1037/// \endcode
1038///
1039/// \headerfile <immintrin.h>
1040///
1041/// This intrinsic corresponds to the \c VCVTBIASPH2HF8S instruction.
1042///
1043/// \param __A
1044///    A 256-bit vector of [16 x int16].
1045/// \param __B
1046///    A 256-bit vector of [16 x fp16].
1047/// \returns
1048///    A 128-bit vector of [16 x hf8]. Elements correspond to the
1049///    converted elements from \a __B using biases from \a __A.
1050static __inline__ __m128i __DEFAULT_FN_ATTRS256
1051_mm256_cvts_biasph_hf8(__m256i __A, __m256h __B) {
1052  return (__m128i)__builtin_ia32_vcvtbiasph2hf8s_256_mask(
1053      (__v32qi)__A, (__v16hf)__B, (__v16qi)(__m128i)_mm_undefined_si128(),
1054      (__mmask16)-1);
1055}
1056
1057/// Convert 256-bit vector \a __B containing packed FP16 floating-point elements
1058///    to FP8 E4M3 numbers, using conversion biases stored in lower 8 bits of each
1059///    16-bit integer stored in \a __B. Results are saturated. Merging mask \a __U
1060///    is used to determine if given element should be taken from \a __W instead.
1061///
1062/// \code{.operation}
1063/// FOR i := 0 to 15
1064/// 	IF __U[i]
1065/// 		dst.hf8[i] := convert_fp16_to_hf8_with_bias_saturate(__A.int8[2 * i], __B.fp16[i])
1066/// 	ELSE
1067/// 		dst.hf8[i] := __W.hf8[i]
1068/// 	FI
1069/// ENDFOR
1070///
1071/// dst[MAX:128] := 0
1072/// \endcode
1073///
1074/// \headerfile <immintrin.h>
1075///
1076/// This intrinsic corresponds to the \c VCVTBIASPH2HF8S instruction.
1077///
1078/// \param __W
1079///    A 128-bit vector of [16 x hf8].
1080/// \param __U
1081///    A 16-bit merging mask.
1082/// \param __A
1083///    A 256-bit vector of [16 x int16].
1084/// \param __B
1085///    A 256-bit vector of [16 x fp16].
1086/// \returns
1087///    A 128-bit vector of [16 x hf8]. Elements correspond to the converted
1088///    elements from \a __B, using biases from \a __A. If corresponding mask bit
1089///    is not set, then element from \a __W is taken instead.
1090static __inline__ __m128i __DEFAULT_FN_ATTRS256 _mm256_mask_cvts_biasph_hf8(
1091    __m128i __W, __mmask16 __U, __m256i __A, __m256h __B) {
1092  return (__m128i)__builtin_ia32_vcvtbiasph2hf8s_256_mask(
1093      (__v32qi)__A, (__v16hf)__B, (__v16qi)(__m128i)__W, (__mmask16)__U);
1094}
1095
1096/// Convert 256-bit vector \a __B containing packed FP16 floating-point elements
1097///    to FP8 E4M3 numbers, using conversion biases stored in lower 8 bits of each
1098///    16-bit integer stored in \a __B. Results are saturated. Zeroing mask \a __U
1099///    is used to determine if given element should be zeroed instead.
1100///
1101/// \code{.operation}
1102/// FOR i := 0 to 15
1103/// 	IF __U[i]
1104///	 	dst.hf8[i] := convert_fp16_to_hf8_with_bias_saturate(__A.int8[2 * i], __B.fp16[i])
1105///	 ELSE
1106///	 	dst.hf8[i] := 0
1107///	 FI
1108/// ENDFOR
1109///
1110/// dst[MAX:128] := 0
1111/// \endcode
1112///
1113/// \headerfile <immintrin.h>
1114///
1115/// This intrinsic corresponds to the \c VCVTBIASPH2HF8S instruction.
1116///
1117/// \param __U
1118///    A 16-bit zeroing mask.
1119/// \param __A
1120///    A 256-bit vector of [16 x int16].
1121/// \param __B
1122///    A 256-bit vector of [16 x fp16].
1123/// \returns
1124///    A 128-bit vector of [16 x hf8]. Elements correspond to the converted
1125///    elements from \a __B, using biases from \a __A. If corresponding mask bit
1126///    is not set, then element is zeroed.
1127static __inline__ __m128i __DEFAULT_FN_ATTRS256
1128_mm256_maskz_cvts_biasph_hf8(__mmask16 __U, __m256i __A, __m256h __B) {
1129  return (__m128i)__builtin_ia32_vcvtbiasph2hf8s_256_mask(
1130      (__v32qi)__A, (__v16hf)__B, (__v16qi)(__m128i)_mm_setzero_si128(),
1131      (__mmask16)__U);
1132}
1133
1134/// Convert two 128-bit vectors, \a __A and \a __B, containing packed FP16
1135///    floating-point elements to a 128-bit vector containing E5M2 FP8 elements.
1136///
1137/// \code{.operation}
1138/// FOR i := 0 to 15 
1139/// 	IF i < 8
1140/// 		dst.bf8[i] := convert_fp16_to_bf8(__B.fp16[i])
1141/// 	ELSE
1142/// 		dst.bf8[i] := convert_fp16_to_bf8(__A.fp16[i - 8])
1143/// 	FI
1144/// ENDFOR
1145///
1146/// dst[MAX:128] := 0
1147/// \endcode
1148///
1149/// \headerfile <immintrin.h>
1150///
1151/// This intrinsic corresponds to the \c VCVT2PH2BF8 instruction.
1152///
1153/// \param __A
1154///    A 128-bit vector of [8 x fp16].
1155/// \param __B
1156///    A 128-bit vector of [8 x fp16].
1157/// \returns
1158///    A 128-bit vector of [16 x bf8]. Lower 8 elements correspond to the
1159///    (converted) elements from \a __B; higher order elements correspond to the
1160///    (converted) elements from \a __A.
1161static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvt2ph_bf8(__m128h __A,
1162                                                                  __m128h __B) {
1163  return (__m128i)__builtin_ia32_vcvt2ph2bf8_128((__v8hf)(__A),
1164                                                   (__v8hf)(__B));
1165}
1166
1167/// Convert two 128-bit vectors, \a __A and \a __B, containing packed FP16
1168///    floating-point elements to a 128-bit vector containing E5M2 FP8 elements.
1169///    Merging mask \a __U is used to determine if given element should be taken
1170///    from \a __W instead.
1171///
1172/// \code{.operation}
1173/// FOR i := 0 to 15
1174/// 	IF __U[i]
1175/// 		IF i < 8
1176/// 			dst.bf8[i] := convert_fp16_to_bf8(__B.fp16[i])
1177/// 		ELSE
1178/// 			dst.bf8[i] := convert_fp16_to_bf8(__A.fp16[i - 8])
1179/// 		FI
1180/// 	ELSE
1181/// 		dst.bf8[i] := __W.bf8[i]
1182/// 	FI
1183/// ENDFOR
1184///
1185/// dst[MAX:128] := 0
1186/// \endcode
1187///
1188/// \headerfile <immintrin.h>
1189///
1190/// This intrinsic corresponds to the \c VCVT2PH2BF8 instruction.
1191///
1192/// \param __W
1193///    A 128-bit vector of [16 x bf8].
1194/// \param __U
1195///    A 16-bit merging mask.
1196/// \param __A
1197///    A 128-bit vector of [8 x fp16].
1198/// \param __B
1199///    A 128-bit vector of [8 x fp16].
1200/// \returns
1201///    A 128-bit vector of [16 x bf8]. Lower 8 elements correspond to the
1202///    (converted) elements from \a __B; higher order elements correspond to the
1203///    (converted) elements from \a __A. If corresponding mask bit is not set, then
1204///    element from \a __W is taken instead.
1205static __inline__ __m128i __DEFAULT_FN_ATTRS128
1206_mm_mask_cvt2ph_bf8(__m128i __W, __mmask16 __U, __m128h __A, __m128h __B) {
1207  return (__m128i)__builtin_ia32_selectb_128(
1208      (__mmask16)__U, (__v16qi)_mm_cvt2ph_bf8(__A, __B), (__v16qi)__W);
1209}
1210
1211/// Convert two 128-bit vectors, \a __A and \a __B, containing packed FP16
1212///    floating-point elements to a 128-bit vector containing E5M2 FP8 elements.
1213///    Zeroing mask \a __U is used to determine if given element should be zeroed
1214///    instead.
1215///
1216/// \code{.operation}
1217/// FOR i := 0 to 15 
1218/// 	IF __U[i]
1219/// 		IF i < 8
1220/// 			dst.bf8[i] := convert_fp16_to_bf8(__B.fp16[i])
1221/// 		ELSE
1222/// 			dst.bf8[i] := convert_fp16_to_bf8(__A.fp16[i - 8])
1223/// 		FI
1224/// 	ELSE
1225/// 		dst.bf8[i] := 0
1226/// 	FI
1227/// ENDFOR
1228///
1229/// dst[MAX:128] := 0
1230/// \endcode
1231///
1232/// \headerfile <immintrin.h>
1233///
1234/// This intrinsic corresponds to the \c VCVT2PH2BF8 instruction.
1235///
1236/// \param __U
1237///    A 16-bit zeroing mask.
1238/// \param __A
1239///    A 128-bit vector of [8 x fp16].
1240/// \param __B
1241///    A 128-bit vector of [8 x fp16].
1242/// \returns
1243///    A 128-bit vector of [16 x bf8]. Lower 8 elements correspond to the
1244///    (converted) elements from \a __B; higher order elements correspond to the
1245///    (converted) elements from \a __A. If corresponding mask bit is not set, then
1246///    zero is taken instead.
1247static __inline__ __m128i __DEFAULT_FN_ATTRS128
1248_mm_maskz_cvt2ph_bf8(__mmask16 __U, __m128h __A, __m128h __B) {
1249  return (__m128i)__builtin_ia32_selectb_128(
1250      (__mmask16)__U, (__v16qi)_mm_cvt2ph_bf8(__A, __B),
1251      (__v16qi)(__m128i)_mm_setzero_si128());
1252}
1253
1254/// Convert two 256-bit vectors, \a __A and \a __B, containing packed FP16
1255///    floating-point elements to a 256-bit vector containing E5M2 FP8 elements.
1256///
1257/// \code{.operation}
1258/// FOR i := 0 to 31 
1259/// 	IF i < 16 
1260/// 		dst.bf8[i] := convert_fp16_to_bf8(__B.fp16[i])
1261/// 	ELSE
1262/// 		dst.bf8[i] := convert_fp16_to_bf8(__A.fp16[i - 16])
1263/// 	FI
1264/// ENDFOR
1265///
1266/// dst[MAX:256] := 0
1267/// \endcode
1268///
1269/// \headerfile <immintrin.h>
1270///
1271/// This intrinsic corresponds to the \c VCVT2PH2BF8 instruction.
1272///
1273/// \param __A
1274///    A 256-bit vector of [16 x fp16].
1275/// \param __B
1276///    A 256-bit vector of [16 x fp16].
1277/// \returns
1278///    A 256-bit vector of [32 x bf8]. Lower 16 elements correspond to the
1279///    (converted) elements from \a __B; higher order elements correspond to the
1280///    (converted) elements from \a __A.
1281static __inline__ __m256i __DEFAULT_FN_ATTRS256
1282_mm256_cvt2ph_bf8(__m256h __A, __m256h __B) {
1283  return (__m256i)__builtin_ia32_vcvt2ph2bf8_256((__v16hf)(__A),
1284                                                   (__v16hf)(__B));
1285}
1286
1287/// Convert two 256-bit vectors, \a __A and \a __B, containing packed FP16
1288///    floating-point elements to a 256-bit vector containing E5M2 FP8 elements.
1289///    Merging mask \a __U is used to determine if given element should be taken
1290///    from \a __W instead.
1291///
1292/// \code{.operation}
1293/// FOR i := 0 to 31 
1294/// 	IF __U[i]
1295/// 		IF i < 16 
1296/// 			dst.bf8[i] := convert_fp16_to_bf8(__B.fp16[i])
1297/// 		ELSE
1298/// 			dst.bf8[i] := convert_fp16_to_bf8(__A.fp16[i - 16])
1299/// 		FI
1300/// 	ELSE
1301/// 		dst.bf8[i] := __W.bf8[i]
1302/// 	FI
1303/// ENDFOR
1304///
1305/// dst[MAX:256] := 0
1306/// \endcode
1307///
1308/// \headerfile <immintrin.h>
1309///
1310/// This intrinsic corresponds to the \c VCVT2PH2BF8 instruction.
1311///
1312/// \param __W
1313///    A 256-bit vector of [32 x bf8].
1314/// \param __U
1315///    A 32-bit merging mask.
1316/// \param __A
1317///    A 256-bit vector of [16 x fp16].
1318/// \param __B
1319///    A 256-bit vector of [16 x fp16].
1320/// \returns
1321///    A 256-bit vector of [32 x bf8]. Lower 16 elements correspond to the
1322///    (converted) elements from \a __B; higher order elements correspond to the
1323///    (converted) elements from \a __A. If corresponding mask bit is not set, then
1324///    element from \a __W is taken instead.
1325static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_cvt2ph_bf8(
1326    __m256i __W, __mmask32 __U, __m256h __A, __m256h __B) {
1327  return (__m256i)__builtin_ia32_selectb_256(
1328      (__mmask32)__U, (__v32qi)_mm256_cvt2ph_bf8(__A, __B), (__v32qi)__W);
1329}
1330
1331/// Convert two 256-bit vectors, \a __A and \a __B, containing packed FP16
1332///    floating-point elements to a 256-bit vector containing E5M2 FP8 elements.
1333///    Zeroing mask \a __U is used to determine if given element should be zeroed
1334///    instead.
1335///
1336/// \code{.operation}
1337/// FOR i := 0 to 31 
1338/// 	IF __U[i]
1339/// 		IF i < 16 
1340/// 			dst.bf8[i] := convert_fp16_to_bf8(__B.fp16[i])
1341/// 		ELSE
1342/// 			dst.bf8[i] := convert_fp16_to_bf8(__A.fp16[i - 16])
1343/// 		FI
1344/// 	ELSE
1345/// 		dst.bf8[i] := 0
1346/// 	FI
1347/// ENDFOR
1348///
1349/// dst[MAX:256] := 0
1350/// \endcode
1351///
1352/// \headerfile <immintrin.h>
1353///
1354/// This intrinsic corresponds to the \c VCVT2PH2BF8 instruction.
1355///
1356/// \param __U
1357///    A 32-bit zeroing mask.
1358/// \param __A
1359///    A 256-bit vector of [16 x fp16].
1360/// \param __B
1361///    A 256-bit vector of [16 x fp16].
1362/// \returns
1363///    A 256-bit vector of [32 x bf8]. Lower 16 elements correspond to the
1364///    (converted) elements from \a __B; higher order elements correspond to the
1365///    (converted) elements from \a __A. If corresponding mask bit is not set,
1366///    zero is taken instead.
1367static __inline__ __m256i __DEFAULT_FN_ATTRS256
1368_mm256_maskz_cvt2ph_bf8(__mmask32 __U, __m256h __A, __m256h __B) {
1369  return (__m256i)__builtin_ia32_selectb_256(
1370      (__mmask32)__U, (__v32qi)_mm256_cvt2ph_bf8(__A, __B),
1371      (__v32qi)(__m256i)_mm256_setzero_si256());
1372}
1373
1374/// Convert two 128-bit vectors, \a __A and \a __B, containing packed FP16
1375///    floating-point elements to a 128-bit vector containing E5M2 FP8 elements.
1376///    Resulting elements are saturated in case of overflow.
1377///
1378/// \code{.operation}
1379/// FOR i := 0 to 15 
1380/// 	IF i < 8
1381/// 		dst.bf8[i] := convert_fp16_to_bf8_saturate(__B.fp16[i])
1382/// 	ELSE
1383/// 		dst.bf8[i] := convert_fp16_to_bf8_saturate(__A.fp16[i - 8])
1384/// 	FI
1385/// ENDFOR
1386///
1387/// dst[MAX:128] := 0
1388/// \endcode
1389///
1390/// \headerfile <immintrin.h>
1391///
1392/// This intrinsic corresponds to the \c VCVT2PH2BF8S instruction.
1393///
1394/// \param __A
1395///    A 128-bit vector of [8 x fp16].
1396/// \param __B
1397///    A 128-bit vector of [8 x fp16].
1398/// \returns
1399///    A 128-bit vector of [16 x bf8]. Lower 8 elements correspond to the
1400///    (converted) elements from \a __B; higher order elements correspond to the
1401///    (converted) elements from \a __A.
1402static __inline__ __m128i __DEFAULT_FN_ATTRS128
1403_mm_cvts_2ph_bf8(__m128h __A, __m128h __B) {
1404  return (__m128i)__builtin_ia32_vcvt2ph2bf8s_128((__v8hf)(__A),
1405                                                    (__v8hf)(__B));
1406}
1407
1408/// Convert two 128-bit vectors, \a __A and \a __B, containing packed FP16
1409///    floating-point elements to a 128-bit vector containing E5M2 FP8 elements.
1410///    Merging mask \a __U is used to determine if given element should be taken
1411///    from \a __W instead. Resulting elements are saturated in case of overflow.
1412///
1413/// \code{.operation}
1414/// FOR i := 0 to 15 
1415/// 	IF __U[i]
1416/// 		IF i < 8
1417/// 			dst.bf8[i] := convert_fp16_to_bf8_saturate(__B.fp16[i])
1418/// 		ELSE
1419/// 			dst.bf8[i] := convert_fp16_to_bf8_saturate(__A.fp16[i - 8])
1420/// 		FI
1421/// 	ELSE
1422/// 		dst.bf8[i] := __W.bf8[i]
1423/// 	FI
1424/// ENDFOR
1425///
1426/// dst[MAX:128] := 0
1427/// \endcode
1428///
1429/// \headerfile <immintrin.h>
1430///
1431/// This intrinsic corresponds to the \c VCVT2PH2BF8S instruction.
1432///
1433/// \param __W
1434///    A 128-bit vector of [16 x bf8].
1435/// \param __U
1436///    A 16-bit merging mask.
1437/// \param __A
1438///    A 128-bit vector of [8 x fp16].
1439/// \param __B
1440///    A 128-bit vector of [8 x fp16].
1441/// \returns
1442///    A 128-bit vector of [16 x bf8]. Lower 8 elements correspond to the
1443///    (converted) elements from \a __B; higher order elements correspond to the
1444///    (converted) elements from \a __A. If corresponding mask bit is not set, then
1445///    element from \a __W is taken instead.
1446static __inline__ __m128i __DEFAULT_FN_ATTRS128
1447_mm_mask_cvts_2ph_bf8(__m128i __W, __mmask16 __U, __m128h __A, __m128h __B) {
1448  return (__m128i)__builtin_ia32_selectb_128(
1449      (__mmask16)__U, (__v16qi)_mm_cvts_2ph_bf8(__A, __B), (__v16qi)__W);
1450}
1451
1452/// Convert two 128-bit vectors, \a __A and \a __B, containing packed FP16
1453///    floating-point elements to a 128-bit vector containing E5M2 FP8 elements.
1454///    Zeroing mask \a __U is used to determine if given element should be zeroed
1455///    instead. Resulting elements are saturated in case of overflow.
1456///
1457/// \code{.operation}
1458/// FOR i := 0 to 15 
1459/// 	IF __U[i]
1460/// 		IF i < 8
1461/// 			dst.bf8[i] := convert_fp16_to_bf8_saturate(__B.fp16[i])
1462/// 		ELSE
1463/// 			dst.bf8[i] := convert_fp16_to_bf8_saturate(__A.fp16[i - 8])
1464/// 		FI
1465/// 	ELSE
1466/// 		dst.bf8[i] := 0
1467/// 	FI
1468/// ENDFOR
1469///
1470/// dst[MAX:128] := 0
1471/// \endcode
1472///
1473/// \headerfile <immintrin.h>
1474///
1475/// This intrinsic corresponds to the \c VCVT2PH2BF8S instruction.
1476///
1477/// \param __U
1478///    A 16-bit zeroing mask.
1479/// \param __A
1480///    A 128-bit vector of [8 x fp16].
1481/// \param __B
1482///    A 128-bit vector of [8 x fp16].
1483/// \returns
1484///    A 128-bit vector of [16 x bf8]. Lower 8 elements correspond to the
1485///    (converted) elements from \a __B; higher order elements correspond to the
1486///    (converted) elements from \a __A. If corresponding mask bit is not set, then
1487///    zero is taken instead.
1488static __inline__ __m128i __DEFAULT_FN_ATTRS128
1489_mm_maskz_cvts_2ph_bf8(__mmask16 __U, __m128h __A, __m128h __B) {
1490  return (__m128i)__builtin_ia32_selectb_128(
1491      (__mmask16)__U, (__v16qi)_mm_cvts_2ph_bf8(__A, __B),
1492      (__v16qi)(__m128i)_mm_setzero_si128());
1493}
1494
1495/// Convert two 256-bit vectors, \a __A and \a __B, containing packed FP16
1496///    floating-point elements to a 256-bit vector containing E5M2 FP8 elements.
1497///    Resulting elements are saturated in case of overflow.
1498///
1499/// \code{.operation}
1500/// FOR i := 0 to 31 
1501/// 	IF i < 16 
1502/// 		dst.bf8[i] := convert_fp16_to_bf8_saturate(__B.fp16[i])
1503/// 	ELSE
1504/// 		dst.bf8[i] := convert_fp16_to_bf8_saturate(__A.fp16[i - 16])
1505/// 	FI
1506/// ENDFOR
1507///
1508/// dst[MAX:256] := 0
1509/// \endcode
1510///
1511/// \headerfile <immintrin.h>
1512///
1513/// This intrinsic corresponds to the \c VCVT2PH2BF8S instruction.
1514///
1515/// \param __A
1516///    A 256-bit vector of [16 x fp16].
1517/// \param __B
1518///    A 256-bit vector of [16 x fp16].
1519/// \returns
1520///    A 256-bit vector of [32 x bf8]. Lower 16 elements correspond to the
1521///    (converted) elements from \a __B; higher order elements correspond to the
1522///    (converted) elements from \a __A.
1523static __inline__ __m256i __DEFAULT_FN_ATTRS256
1524_mm256_cvts_2ph_bf8(__m256h __A, __m256h __B) {
1525  return (__m256i)__builtin_ia32_vcvt2ph2bf8s_256((__v16hf)(__A),
1526                                                    (__v16hf)(__B));
1527}
1528
1529/// Convert two 256-bit vectors, \a __A and \a __B, containing packed FP16
1530///    floating-point elements to a 256-bit vector containing E5M2 FP8 elements.
1531///    Merging mask \a __U is used to determine if given element should be taken
1532///    from \a __W instead. Resulting elements are saturated in case of overflow.
1533///
1534/// \code{.operation}
1535/// FOR i := 0 to 31 
1536/// 	IF __U[i]
1537/// 		IF i < 16 
1538/// 			dst.bf8[i] := convert_fp16_to_bf8_saturate(__B.fp16[i])
1539/// 		ELSE
1540/// 			dst.bf8[i] := convert_fp16_to_bf8_saturate(__A.fp16[i - 16])
1541/// 		FI
1542/// 	ELSE
1543/// 		dst.bf8[i] := __W.bf8[i]
1544/// 	FI
1545/// ENDFOR
1546///
1547/// dst[MAX:256] := 0
1548/// \endcode
1549///
1550/// \headerfile <immintrin.h>
1551///
1552/// This intrinsic corresponds to the \c VCVT2PH2BF8S instruction.
1553///
1554/// \param __W
1555///    A 256-bit vector of [32 x bf8].
1556/// \param __U
1557///    A 32-bit merging mask.
1558/// \param __A
1559///    A 256-bit vector of [16 x fp16].
1560/// \param __B
1561///    A 256-bit vector of [16 x fp16].
1562/// \returns
1563///    A 256-bit vector of [32 x bf8]. Lower 16 elements correspond to the
1564///    (converted) elements from \a __B; higher order elements correspond to the
1565///    (converted) elements from \a __A. If corresponding mask bit is not set, then
1566///    element from \a __W is taken instead.
1567static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_cvts_2ph_bf8(
1568    __m256i __W, __mmask32 __U, __m256h __A, __m256h __B) {
1569  return (__m256i)__builtin_ia32_selectb_256(
1570      (__mmask32)__U, (__v32qi)_mm256_cvts_2ph_bf8(__A, __B), (__v32qi)__W);
1571}
1572
1573/// Convert two 256-bit vectors, \a __A and \a __B, containing packed FP16
1574///    floating-point elements to a 256-bit vector containing E5M2 FP8 elements.
1575///    Zeroing mask \a __U is used to determine if given element should be zeroed
1576///    instead. Resulting elements are saturated in case of overflow.
1577///
1578/// \code{.operation}
1579/// FOR i := 0 to 31 
1580/// 	IF __U[i]
1581/// 		IF i < 16 
1582/// 			dst.bf8[i] := convert_fp16_to_bf8_saturate(__B.fp16[i])
1583/// 		ELSE
1584/// 			dst.bf8[i] := convert_fp16_to_bf8_saturate(__A.fp16[i - 16])
1585/// 		FI
1586/// 	ELSE
1587/// 		dst.bf8[i] := 0
1588/// 	FI
1589/// ENDFOR
1590///
1591/// dst[MAX:256] := 0
1592/// \endcode
1593///
1594/// \headerfile <immintrin.h>
1595///
1596/// This intrinsic corresponds to the \c VCVT2PH2BF8S instruction.
1597///
1598/// \param __U
1599///    A 32-bit zeroing mask.
1600/// \param __A
1601///    A 256-bit vector of [16 x fp16].
1602/// \param __B
1603///    A 256-bit vector of [16 x fp16].
1604/// \returns
1605///    A 256-bit vector of [32 x bf8]. Lower 16 elements correspond to the
1606///    (converted) elements from \a __B; higher order elements correspond to the
1607///    (converted) elements from \a __A. If corresponding mask bit is not set,
1608///    zero is taken instead.
1609static __inline__ __m256i __DEFAULT_FN_ATTRS256
1610_mm256_maskz_cvts_2ph_bf8(__mmask32 __U, __m256h __A, __m256h __B) {
1611  return (__m256i)__builtin_ia32_selectb_256(
1612      (__mmask32)__U, (__v32qi)_mm256_cvts_2ph_bf8(__A, __B),
1613      (__v32qi)(__m256i)_mm256_setzero_si256());
1614}
1615
1616/// Convert two 128-bit vectors, \a __A and \a __B, containing packed FP16
1617///    floating-point elements to a 128-bit vector containing E4M3 FP8 elements.
1618///
1619/// \code{.operation}
1620/// FOR i := 0 to 15 
1621/// 	IF i < 8
1622/// 		dst.hf8[i] := convert_fp16_to_hf8(__B.fp16[i])
1623/// 	ELSE
1624/// 		dst.hf8[i] := convert_fp16_to_hf8(__A.fp16[i - 8])
1625/// 	FI
1626/// ENDFOR
1627///
1628/// dst[MAX:128] := 0
1629/// \endcode
1630///
1631/// \headerfile <immintrin.h>
1632///
1633/// This intrinsic corresponds to the \c VCVT2PH2HF8 instruction.
1634///
1635/// \param __A
1636///    A 128-bit vector of [8 x fp16].
1637/// \param __B
1638///    A 128-bit vector of [8 x fp16].
1639/// \returns
1640///    A 128-bit vector of [16 x hf8]. Lower 8 elements correspond to the
1641///    (converted) elements from \a __B; higher order elements correspond to the
1642///    (converted) elements from \a __A.
1643static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvt2ph_hf8(__m128h __A,
1644                                                                  __m128h __B) {
1645  return (__m128i)__builtin_ia32_vcvt2ph2hf8_128((__v8hf)(__A),
1646                                                   (__v8hf)(__B));
1647}
1648
1649/// Convert two 128-bit vectors, \a __A and \a __B, containing packed FP16
1650///    floating-point elements to a 128-bit vector containing E4M3 FP8 elements.
1651///    Merging mask \a __U is used to determine if given element should be taken
1652///    from \a __W instead.
1653///
1654/// \code{.operation}
1655/// FOR i := 0 to 15 
1656/// 	IF __U[i]
1657/// 		IF i < 8
1658/// 			dst.hf8[i] := convert_fp16_to_hf8(__B.fp16[i])
1659/// 		ELSE
1660/// 			dst.hf8[i] := convert_fp16_to_hf8(__A.fp16[i - 8])
1661/// 		FI
1662/// 	ELSE
1663/// 		dst.hf8[i] := __W.hf8[i]
1664/// 	FI
1665/// ENDFOR
1666///
1667/// dst[MAX:128] := 0
1668/// \endcode
1669///
1670/// \headerfile <immintrin.h>
1671///
1672/// This intrinsic corresponds to the \c VCVT2PH2HF8 instruction.
1673///
1674/// \param __W
1675///    A 128-bit vector of [16 x hf8].
1676/// \param __U
1677///    A 16-bit merging mask.
1678/// \param __A
1679///    A 128-bit vector of [8 x fp16].
1680/// \param __B
1681///    A 128-bit vector of [8 x fp16].
1682/// \returns
1683///    A 128-bit vector of [16 x hf8]. Lower 8 elements correspond to the
1684///    (converted) elements from \a __B; higher order elements correspond to the
1685///    (converted) elements from \a __A. If corresponding mask bit is not set, then
1686///    element from \a __W is taken instead.
1687static __inline__ __m128i __DEFAULT_FN_ATTRS128
1688_mm_mask_cvt2ph_hf8(__m128i __W, __mmask16 __U, __m128h __A, __m128h __B) {
1689  return (__m128i)__builtin_ia32_selectb_128(
1690      (__mmask16)__U, (__v16qi)_mm_cvt2ph_hf8(__A, __B), (__v16qi)__W);
1691}
1692
1693/// Convert two 128-bit vectors, \a __A and \a __B, containing packed FP16
1694///    floating-point elements to a 128-bit vector containing E4M3 FP8 elements.
1695///    Zeroing mask \a __U is used to determine if given element should be zeroed
1696///    instead.
1697///
1698/// \code{.operation}
1699/// FOR i := 0 to 15 
1700/// 	IF __U[i]
1701/// 		IF i < 8
1702/// 			dst.hf8[i] := convert_fp16_to_hf8(__B.fp16[i])
1703/// 		ELSE
1704/// 			dst.hf8[i] := convert_fp16_to_hf8(__A.fp16[i - 8])
1705/// 		FI
1706/// 	ELSE
1707/// 		dst.hf8[i] := 0
1708/// 	FI
1709/// ENDFOR
1710///
1711/// dst[MAX:128] := 0
1712/// \endcode
1713///
1714/// \headerfile <immintrin.h>
1715///
1716/// This intrinsic corresponds to the \c VCVT2PH2HF8 instruction.
1717///
1718/// \param __U
1719///    A 16-bit zeroing mask.
1720/// \param __A
1721///    A 128-bit vector of [8 x fp16].
1722/// \param __B
1723///    A 128-bit vector of [8 x fp16].
1724/// \returns
1725///    A 128-bit vector of [16 x hf8]. Lower 8 elements correspond to the
1726///    (converted) elements from \a __B; higher order elements correspond to the
1727///    (converted) elements from \a __A. If corresponding mask bit is not set, then
1728///    zero is taken instead.
1729static __inline__ __m128i __DEFAULT_FN_ATTRS128
1730_mm_maskz_cvt2ph_hf8(__mmask16 __U, __m128h __A, __m128h __B) {
1731  return (__m128i)__builtin_ia32_selectb_128(
1732      (__mmask16)__U, (__v16qi)_mm_cvt2ph_hf8(__A, __B),
1733      (__v16qi)(__m128i)_mm_setzero_si128());
1734}
1735
1736/// Convert two 256-bit vectors, \a __A and \a __B, containing packed FP16
1737///    floating-point elements to a 256-bit vector containing E4M3 FP8 elements.
1738///
1739/// \code{.operation}
1740/// FOR i := 0 to 31 
1741/// 	IF i < 16 
1742/// 		dst.hf8[i] := convert_fp16_to_hf8(__B.fp16[i])
1743/// 	ELSE
1744/// 		dst.hf8[i] := convert_fp16_to_hf8(__A.fp16[i - 16])
1745/// 	FI
1746/// ENDFOR
1747///
1748/// dst[MAX:256] := 0
1749/// \endcode
1750///
1751/// \headerfile <immintrin.h>
1752///
1753/// This intrinsic corresponds to the \c VCVT2PH2HF8 instruction.
1754///
1755/// \param __A
1756///    A 256-bit vector of [16 x fp16].
1757/// \param __B
1758///    A 256-bit vector of [16 x fp16].
1759/// \returns
1760///    A 256-bit vector of [32 x hf8]. Lower 16 elements correspond to the
1761///    (converted) elements from \a __B; higher order elements correspond to the
1762///    (converted) elements from \a __A.
1763static __inline__ __m256i __DEFAULT_FN_ATTRS256
1764_mm256_cvt2ph_hf8(__m256h __A, __m256h __B) {
1765  return (__m256i)__builtin_ia32_vcvt2ph2hf8_256((__v16hf)(__A),
1766                                                   (__v16hf)(__B));
1767}
1768
1769/// Convert two 256-bit vectors, \a __A and \a __B, containing packed FP16
1770///    floating-point elements to a 256-bit vector containing E4M3 FP8 elements.
1771///    Merging mask \a __U is used to determine if given element should be taken
1772///    from \a __W instead.
1773///
1774/// \code{.operation}
1775/// FOR i := 0 to 31 
1776/// 	IF __U[i]
1777/// 		IF i < 16 
1778/// 			dst.hf8[i] := convert_fp16_to_hf8(__B.fp16[i])
1779/// 		ELSE
1780/// 			dst.hf8[i] := convert_fp16_to_hf8(__A.fp16[i - 16])
1781/// 		FI
1782/// 	ELSE
1783/// 		dst.hf8[i] := __W.hf8[i]
1784/// 	FI
1785/// ENDFOR
1786///
1787/// dst[MAX:256] := 0
1788/// \endcode
1789///
1790/// \headerfile <immintrin.h>
1791///
1792/// This intrinsic corresponds to the \c VCVT2PH2HF8 instruction.
1793///
1794/// \param __W
1795///    A 256-bit vector of [32 x hf8].
1796/// \param __U
1797///    A 32-bit merging mask.
1798/// \param __A
1799///    A 256-bit vector of [16 x fp16].
1800/// \param __B
1801///    A 256-bit vector of [16 x fp16].
1802/// \returns
1803///    A 256-bit vector of [32 x hf8]. Lower 16 elements correspond to the
1804///    (converted) elements from \a __B; higher order elements correspond to the
1805///    (converted) elements from \a __A. If corresponding mask bit is not set, then
1806///    element from \a __W is taken instead.
1807static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_cvt2ph_hf8(
1808    __m256i __W, __mmask32 __U, __m256h __A, __m256h __B) {
1809  return (__m256i)__builtin_ia32_selectb_256(
1810      (__mmask32)__U, (__v32qi)_mm256_cvt2ph_hf8(__A, __B), (__v32qi)__W);
1811}
1812
1813/// Convert two 256-bit vectors, \a __A and \a __B, containing packed FP16
1814///    floating-point elements to a 256-bit vector containing E4M3 FP8 elements.
1815///    Zeroing mask \a __U is used to determine if given element should be zeroed
1816///    instead.
1817///
1818/// \code{.operation}
1819/// FOR i := 0 to 31 
1820/// 	IF __U[i]
1821/// 		IF i < 16 
1822/// 			dst.hf8[i] := convert_fp16_to_hf8(__B.fp16[i])
1823/// 		ELSE
1824/// 			dst.hf8[i] := convert_fp16_to_hf8(__A.fp16[i - 16])
1825/// 		FI
1826/// 	ELSE
1827/// 		dst.hf8[i] := 0
1828/// 	FI
1829/// ENDFOR
1830///
1831/// dst[MAX:256] := 0
1832/// \endcode
1833///
1834/// \headerfile <immintrin.h>
1835///
1836/// This intrinsic corresponds to the \c VCVT2PH2HF8 instruction.
1837///
1838/// \param __U
1839///    A 32-bit zeroing mask.
1840/// \param __A
1841///    A 256-bit vector of [16 x fp16].
1842/// \param __B
1843///    A 256-bit vector of [16 x fp16].
1844/// \returns
1845///    A 256-bit vector of [32 x hf8]. Lower 16 elements correspond to the
1846///    (converted) elements from \a __B; higher order elements correspond to the
1847///    (converted) elements from \a __A. If corresponding mask bit is not set,
1848///    zero is taken instead.
1849static __inline__ __m256i __DEFAULT_FN_ATTRS256
1850_mm256_maskz_cvt2ph_hf8(__mmask32 __U, __m256h __A, __m256h __B) {
1851  return (__m256i)__builtin_ia32_selectb_256(
1852      (__mmask32)__U, (__v32qi)_mm256_cvt2ph_hf8(__A, __B),
1853      (__v32qi)(__m256i)_mm256_setzero_si256());
1854}
1855
1856/// Convert two 128-bit vectors, \a __A and \a __B, containing packed FP16
1857///    floating-point elements to a 128-bit vector containing E4M3 FP8 elements.
1858///    Resulting elements are saturated in case of overflow.
1859///
1860/// \code{.operation}
1861/// FOR i := 0 to 15 
1862/// 	IF i < 8
1863/// 		dst.hf8[i] := convert_fp16_to_hf8_saturate(__B.fp16[i])
1864/// 	ELSE
1865/// 		dst.hf8[i] := convert_fp16_to_hf8_saturate(__A.fp16[i - 8])
1866/// 	FI
1867/// ENDFOR
1868///
1869/// dst[MAX:128] := 0
1870/// \endcode
1871///
1872/// \headerfile <immintrin.h>
1873///
1874/// This intrinsic corresponds to the \c VCVT2PH2HF8S instruction.
1875///
1876/// \param __A
1877///    A 128-bit vector of [8 x fp16].
1878/// \param __B
1879///    A 128-bit vector of [8 x fp16].
1880/// \returns
1881///    A 128-bit vector of [16 x hf8]. Lower 8 elements correspond to the
1882///    (converted) elements from \a __B; higher order elements correspond to the
1883///    (converted) elements from \a __A.
1884static __inline__ __m128i __DEFAULT_FN_ATTRS128
1885_mm_cvts_2ph_hf8(__m128h __A, __m128h __B) {
1886  return (__m128i)__builtin_ia32_vcvt2ph2hf8s_128((__v8hf)(__A),
1887                                                    (__v8hf)(__B));
1888}
1889
1890/// Convert two 128-bit vectors, \a __A and \a __B, containing packed FP16
1891///    floating-point elements to a 128-bit vector containing E4M3 FP8 elements.
1892///    Merging mask \a __U is used to determine if given element should be taken
1893///    from \a __W instead. Resulting elements are saturated in case of overflow.
1894///
1895/// \code{.operation}
1896/// FOR i := 0 to 15 
1897/// 	IF __U[i]
1898/// 		IF i < 8
1899/// 			dst.hf8[i] := convert_fp16_to_hf8_saturate(__B.fp16[i])
1900/// 		ELSE
1901/// 			dst.hf8[i] := convert_fp16_to_hf8_saturate(__A.fp16[i - 8])
1902/// 		FI
1903/// 	ELSE
1904/// 		dst.hf8[i] := __W.hf8[i]
1905/// 	FI
1906/// ENDFOR
1907///
1908/// dst[MAX:128] := 0
1909/// \endcode
1910///
1911/// \headerfile <immintrin.h>
1912///
1913/// This intrinsic corresponds to the \c VCVT2PH2HF8S instruction.
1914///
1915/// \param __W
1916///    A 128-bit vector of [16 x hf8].
1917/// \param __U
1918///    A 16-bit merging mask.
1919/// \param __A
1920///    A 128-bit vector of [8 x fp16].
1921/// \param __B
1922///    A 128-bit vector of [8 x fp16].
1923/// \returns
1924///    A 128-bit vector of [16 x hf8]. Lower 8 elements correspond to the
1925///    (converted) elements from \a __B; higher order elements correspond to the
1926///    (converted) elements from \a __A. If corresponding mask bit is not set, then
1927///    element from \a __W is taken instead.
1928static __inline__ __m128i __DEFAULT_FN_ATTRS128
1929_mm_mask_cvts_2ph_hf8(__m128i __W, __mmask16 __U, __m128h __A, __m128h __B) {
1930  return (__m128i)__builtin_ia32_selectb_128(
1931      (__mmask16)__U, (__v16qi)_mm_cvts_2ph_hf8(__A, __B), (__v16qi)__W);
1932}
1933
1934/// Convert two 128-bit vectors, \a __A and \a __B, containing packed FP16
1935///    floating-point elements to a 128-bit vector containing E4M3 FP8 elements.
1936///    Zeroing mask \a __U is used to determine if given element should be zeroed
1937///    instead. Resulting elements are saturated in case of overflow.
1938///
1939/// \code{.operation}
1940/// FOR i := 0 to 15 
1941/// 	IF __U[i]
1942/// 		IF i < 8
1943/// 			dst.hf8[i] := convert_fp16_to_hf8_saturate(__B.fp16[i])
1944/// 		ELSE
1945/// 			dst.hf8[i] := convert_fp16_to_hf8_saturate(__A.fp16[i - 8])
1946/// 		FI
1947/// 	ELSE
1948/// 		dst.hf8[i] := 0
1949/// 	FI
1950/// ENDFOR
1951///
1952/// dst[MAX:128] := 0
1953/// \endcode
1954///
1955/// \headerfile <immintrin.h>
1956///
1957/// This intrinsic corresponds to the \c VCVT2PH2HF8S instruction.
1958///
1959/// \param __U
1960///    A 16-bit zeroing mask.
1961/// \param __A
1962///    A 128-bit vector of [8 x fp16].
1963/// \param __B
1964///    A 128-bit vector of [8 x fp16].
1965/// \returns
1966///    A 128-bit vector of [16 x hf8]. Lower 8 elements correspond to the
1967///    (converted) elements from \a __B; higher order elements correspond to the
1968///    (converted) elements from \a __A. If corresponding mask bit is not set, then
1969///    zero is taken instead.
1970static __inline__ __m128i __DEFAULT_FN_ATTRS128
1971_mm_maskz_cvts_2ph_hf8(__mmask16 __U, __m128h __A, __m128h __B) {
1972  return (__m128i)__builtin_ia32_selectb_128(
1973      (__mmask16)__U, (__v16qi)_mm_cvts_2ph_hf8(__A, __B),
1974      (__v16qi)(__m128i)_mm_setzero_si128());
1975}
1976
1977/// Convert two 256-bit vectors, \a __A and \a __B, containing packed FP16
1978///    floating-point elements to a 256-bit vector containing E4M3 FP8 elements.
1979///    Resulting elements are saturated in case of overflow.
1980///
1981/// \code{.operation}
1982/// FOR i := 0 to 31 
1983/// 	IF i < 16 
1984/// 		dst.hf8[i] := convert_fp16_to_hf8_saturate(__B.fp16[i])
1985/// 	ELSE
1986/// 		dst.hf8[i] := convert_fp16_to_hf8_saturate(__A.fp16[i - 16])
1987/// 	FI
1988/// ENDFOR
1989///
1990/// dst[MAX:256] := 0
1991/// \endcode
1992///
1993/// \headerfile <immintrin.h>
1994///
1995/// This intrinsic corresponds to the \c VCVT2PH2HF8S instruction.
1996///
1997/// \param __A
1998///    A 256-bit vector of [16 x fp16].
1999/// \param __B
2000///    A 256-bit vector of [16 x fp16].
2001/// \returns
2002///    A 256-bit vector of [32 x hf8]. Lower 16 elements correspond to the
2003///    (converted) elements from \a __B; higher order elements correspond to the
2004///    (converted) elements from \a __A.
2005static __inline__ __m256i __DEFAULT_FN_ATTRS256
2006_mm256_cvts_2ph_hf8(__m256h __A, __m256h __B) {
2007  return (__m256i)__builtin_ia32_vcvt2ph2hf8s_256((__v16hf)(__A),
2008                                                    (__v16hf)(__B));
2009}
2010
2011/// Convert two 256-bit vectors, \a __A and \a __B, containing packed FP16
2012///    floating-point elements to a 256-bit vector containing E4M3 FP8 elements.
2013///    Merging mask \a __U is used to determine if given element should be taken
2014///    from \a __W instead. Resulting elements are saturated in case of overflow.
2015///
2016/// \code{.operation}
2017/// FOR i := 0 to 31 
2018/// 	IF __U[i]
2019/// 		IF i < 16 
2020/// 			dst.hf8[i] := convert_fp16_to_hf8_saturate(__B.fp16[i])
2021/// 		ELSE
2022/// 			dst.hf8[i] := convert_fp16_to_hf8_saturate(__A.fp16[i - 16])
2023/// 		FI
2024/// 	ELSE
2025/// 		dst.hf8[i] := __W.hf8[i]
2026/// 	FI
2027/// ENDFOR
2028///
2029/// dst[MAX:256] := 0
2030/// \endcode
2031///
2032/// \headerfile <immintrin.h>
2033///
2034/// This intrinsic corresponds to the \c VCVT2PH2HF8S instruction.
2035///
2036/// \param __W
2037///    A 256-bit vector of [32 x hf8].
2038/// \param __U
2039///    A 32-bit merging mask.
2040/// \param __A
2041///    A 256-bit vector of [16 x fp16].
2042/// \param __B
2043///    A 256-bit vector of [16 x fp16].
2044/// \returns
2045///    A 256-bit vector of [32 x hf8]. Lower 16 elements correspond to the
2046///    (converted) elements from \a __B; higher order elements correspond to the
2047///    (converted) elements from \a __A. If corresponding mask bit is not set, then
2048///    element from \a __W is taken instead.
2049static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_cvts_2ph_hf8(
2050    __m256i __W, __mmask32 __U, __m256h __A, __m256h __B) {
2051  return (__m256i)__builtin_ia32_selectb_256(
2052      (__mmask32)__U, (__v32qi)_mm256_cvts_2ph_hf8(__A, __B), (__v32qi)__W);
2053}
2054
2055/// Convert two 256-bit vectors, \a __A and \a __B, containing packed FP16
2056///    floating-point elements to a 256-bit vector containing E4M3 FP8 elements.
2057///    Zeroing mask \a __U is used to determine if given element should be zeroed
2058///    instead. Resulting elements are saturated in case of overflow.
2059///
2060/// \code{.operation}
2061/// FOR i := 0 to 31 
2062/// 	IF __U[i]
2063/// 		IF i < 16 
2064/// 			dst.hf8[i] := convert_fp16_to_hf8_saturate(__B.fp16[i])
2065/// 		ELSE
2066/// 			dst.hf8[i] := convert_fp16_to_hf8_saturate(__A.fp16[i - 16])
2067/// 		FI
2068/// 	ELSE
2069/// 		dst.hf8[i] := 0
2070/// 	FI
2071/// ENDFOR
2072///
2073/// dst[MAX:256] := 0
2074/// \endcode
2075///
2076/// \headerfile <immintrin.h>
2077///
2078/// This intrinsic corresponds to the \c VCVT2PH2HF8S instruction.
2079///
2080/// \param __U
2081///    A 32-bit zeroing mask.
2082/// \param __A
2083///    A 256-bit vector of [16 x fp16].
2084/// \param __B
2085///    A 256-bit vector of [16 x fp16].
2086/// \returns
2087///    A 256-bit vector of [32 x hf8]. Lower 16 elements correspond to the
2088///    (converted) elements from \a __B; higher order elements correspond to the
2089///    (converted) elements from \a __A. If corresponding mask bit is not set,
2090///    zero is taken instead.
2091static __inline__ __m256i __DEFAULT_FN_ATTRS256
2092_mm256_maskz_cvts_2ph_hf8(__mmask32 __U, __m256h __A, __m256h __B) {
2093  return (__m256i)__builtin_ia32_selectb_256(
2094      (__mmask32)__U, (__v32qi)_mm256_cvts_2ph_hf8(__A, __B),
2095      (__v32qi)(__m256i)_mm256_setzero_si256());
2096}
2097
2098/// Convert 128-bit vector \a __A, containing packed FP8 E4M3 floating-point
2099///    elements to a 128-bit vector containing FP16 elements. The conversion is exact.
2100///
2101/// \code{.operation}
2102/// FOR i := 0 to 7
2103/// 	dst.fp16[i] := convert_hf8_to_fp16(__A.hf8[i])
2104/// ENDFOR
2105///
2106/// dst[MAX:128] := 0
2107/// \endcode
2108///
2109/// \headerfile <immintrin.h>
2110///
2111/// This intrinsic corresponds to the \c VCVTHF82PH instruction.
2112///
2113/// \param __A
///    A 128-bit vector of [16 x hf8]. Only the lower 8 elements are converted.
2115/// \returns
2116///    A 128-bit vector of [8 x fp16]. Resulting elements correspond to the
2117///    (converted) elements from \a __A.
2118static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvthf8_ph(__m128i __A) {
2119  return (__m128h)__builtin_ia32_vcvthf8_2ph128_mask(
2120      (__v16qi)__A, (__v8hf)(__m128h)_mm_undefined_ph(), (__mmask8)-1);
2121}
2122
2123/// Convert 128-bit vector \a __A, containing packed FP8 E4M3 floating-point
2124///    elements to a 128-bit vector containing FP16 elements. The conversion is
2125///    exact. Merging mask \a __U is used to determine if given element should be
2126///    taken from \a __W instead.
2127///
2128/// \code{.operation}
2129/// FOR i := 0 to 7
2130/// 	IF __U[i]
2131/// 		dst.fp16[i] := convert_hf8_to_fp16(__A.hf8[i])
2132/// 	ELSE
2133/// 		dst.fp16[i] := __W.fp16[i]
2134/// 	FI
2135/// ENDFOR
2136///
2137/// dst[MAX:128] := 0
2138/// \endcode
2139///
2140/// \headerfile <immintrin.h>
2141///
2142/// This intrinsic corresponds to the \c VCVTHF82PH instruction.
2143///
2144/// \param __W
2145///    A 128-bit vector of [8 x fp16].
2146/// \param __U
///    An 8-bit merging mask.
2148/// \param __A
///    A 128-bit vector of [16 x hf8]. Only the lower 8 elements are converted.
2150/// \returns
2151///    A 128-bit vector of [8 x fp16]. Resulting elements correspond to the
2152///    (converted) elements from \a __A. If corresponding mask bit is not set, then
2153///    element from \a __W is taken instead.
2154static __inline__ __m128h __DEFAULT_FN_ATTRS128
2155_mm_mask_cvthf8_ph(__m128h __W, __mmask8 __U, __m128i __A) {
2156  return (__m128h)__builtin_ia32_vcvthf8_2ph128_mask(
2157      (__v16qi)__A, (__v8hf)(__m128h)__W, (__mmask8)__U);
2158}
2159
2160/// Convert 128-bit vector \a __A, containing packed FP8 E4M3 floating-point
2161///    elements to a 128-bit vector containing FP16 elements. The conversion is
2162///    exact. Zeroing mask \a __U is used to determine if given element should be
2163///    zeroed instead.
2164///
2165/// \code{.operation}
2166/// FOR i := 0 to 7
2167/// 	IF __U[i]
2168/// 		dst.fp16[i] := convert_hf8_to_fp16(__A.hf8[i])
2169/// 	ELSE
2170/// 		dst.fp16[i] := 0
2171/// 	FI
2172/// ENDFOR
2173///
2174/// dst[MAX:128] := 0
2175/// \endcode
2176///
2177/// \headerfile <immintrin.h>
2178///
2179/// This intrinsic corresponds to the \c VCVTHF82PH instruction.
2180///
2181/// \param __U
///    An 8-bit zeroing mask.
2183/// \param __A
///    A 128-bit vector of [16 x hf8]. Only the lower 8 elements are converted.
2185/// \returns
2186///    A 128-bit vector of [8 x fp16]. Resulting elements correspond to the
2187///    (converted) elements from \a __A. If corresponding mask bit is not set, then
2188///    zero is taken instead.
2189static __inline__ __m128h __DEFAULT_FN_ATTRS128
2190_mm_maskz_cvthf8_ph(__mmask8 __U, __m128i __A) {
2191  return (__m128h)__builtin_ia32_vcvthf8_2ph128_mask(
2192      (__v16qi)__A, (__v8hf)(__m128h)_mm_setzero_ph(), (__mmask8)__U);
2193}
2194
/// Convert 128-bit vector \a __A, containing packed FP8 E4M3 floating-point
2196///    elements to a 256-bit vector containing FP16 elements. The conversion is exact.
2197///
2198/// \code{.operation}
2199/// FOR i := 0 to 15
2200/// 	dst.fp16[i] := convert_hf8_to_fp16(__A.hf8[i])
2201/// ENDFOR
2202///
2203/// dst[MAX:256] := 0
2204/// \endcode
2205///
2206/// \headerfile <immintrin.h>
2207///
2208/// This intrinsic corresponds to the \c VCVTHF82PH instruction.
2209///
2210/// \param __A
///    A 128-bit vector of [16 x hf8].
2212/// \returns
2213///    A 256-bit vector of [16 x fp16]. Resulting elements correspond to the
2214///    (converted) elements from \a __A.
2215static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_cvthf8_ph(__m128i __A) {
2216  return (__m256h)__builtin_ia32_vcvthf8_2ph256_mask(
2217      (__v16qi)__A, (__v16hf)(__m256h)_mm256_undefined_ph(), (__mmask16)-1);
2218}
2219
/// Convert 128-bit vector \a __A, containing packed FP8 E4M3 floating-point
2221///    elements to a 256-bit vector containing FP16 elements. The conversion is
2222///    exact. Merging mask \a __U is used to determine if given element should be
2223///    taken from \a __W instead.
2224///
2225/// \code{.operation}
2226/// FOR i := 0 to 15 
2227/// 	IF __U[i]
2228/// 		dst.fp16[i] := convert_hf8_to_fp16(__A.hf8[i])
2229/// 	ELSE
2230/// 		dst.fp16[i] := __W.fp16[i]
2231/// 	FI
2232/// ENDFOR
2233///
2234/// dst[MAX:256] := 0
2235/// \endcode
2236///
2237/// \headerfile <immintrin.h>
2238///
2239/// This intrinsic corresponds to the \c VCVTHF82PH instruction.
2240///
2241/// \param __W
2242///    A 256-bit vector of [16 x fp16].
2243/// \param __U
2244///    A 16-bit merging mask.
2245/// \param __A
///    A 128-bit vector of [16 x hf8].
2247/// \returns
2248///    A 256-bit vector of [16 x fp16]. Resulting elements correspond to the
2249///    (converted) elements from \a __A. If corresponding mask bit is not set, then
2250///    element from \a __W is taken instead.
2251static __inline__ __m256h __DEFAULT_FN_ATTRS256
2252_mm256_mask_cvthf8_ph(__m256h __W, __mmask16 __U, __m128i __A) {
2253  return (__m256h)__builtin_ia32_vcvthf8_2ph256_mask(
2254      (__v16qi)__A, (__v16hf)(__m256h)__W, (__mmask16)__U);
2255}
2256
/// Convert 128-bit vector \a __A, containing packed FP8 E4M3 floating-point
2258///    elements to a 256-bit vector containing FP16 elements. The conversion is
2259///    exact. Zeroing mask \a __U is used to determine if given element should be
2260///    zeroed instead.
2261///
2262/// \code{.operation}
2263/// FOR i := 0 to 15 
2264/// 	IF __U[i]
2265/// 		dst.fp16[i] := convert_hf8_to_fp16(__A.hf8[i])
2266/// 	ELSE
2267/// 		dst.fp16[i] := 0
2268/// 	FI
2269/// ENDFOR
2270///
2271/// dst[MAX:256] := 0
2272/// \endcode
2273///
2274/// \headerfile <immintrin.h>
2275///
2276/// This intrinsic corresponds to the \c VCVTHF82PH instruction.
2277///
2278/// \param __U
2279///    A 16-bit zeroing mask.
2280/// \param __A
///    A 128-bit vector of [16 x hf8].
2282/// \returns
2283///    A 256-bit vector of [16 x fp16]. Resulting elements correspond to the
2284///    (converted) elements from \a __A. If corresponding mask bit is not set, then
2285///    zero is taken instead.
2286static __inline__ __m256h __DEFAULT_FN_ATTRS256
2287_mm256_maskz_cvthf8_ph(__mmask16 __U, __m128i __A) {
2288  return (__m256h)__builtin_ia32_vcvthf8_2ph256_mask(
2289      (__v16qi)__A, (__v16hf)(__m256h)_mm256_setzero_ph(), (__mmask16)__U);
2290}
2291
2292/// Convert 128-bit vector \a __A containing packed FP16 floating-point elements
2293///    to a 128-bit vector containing E5M2 FP8 elements. Upper elements of
2294///    resulting vector are zeroed.
2295///
2296/// \code{.operation}
2297/// FOR i := 0 to 7
2298/// 	dst.bf8[i] := convert_fp16_to_bf8(__A.fp16[i])
2299/// ENDFOR
2300///
2301/// dst[MAX:64] := 0
2302/// \endcode
2303///
2304/// \headerfile <immintrin.h>
2305///
2306/// This intrinsic corresponds to the \c VCVTPH2BF8 instruction.
2307///
2308/// \param __A
2309///    A 128-bit vector of [8 x fp16].
2310/// \returns
2311///    A 128-bit vector of [16 x bf8]. Lower elements correspond to the (converted)
2312///    elements from \a __A; upper elements are zeroed. 
2313static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtph_bf8(__m128h __A) {
2314  return (__m128i)__builtin_ia32_vcvtph2bf8_128_mask(
2315      (__v8hf)__A, (__v16qi)(__m128i)_mm_undefined_si128(), (__mmask8)-1);
2316}
2317
2318/// Convert 128-bit vector \a __A containing packed FP16 floating-point elements
2319///    to a 128-bit vector containing E5M2 FP8 elements. Upper elements of
2320///    resulting vector are zeroed. Merging mask \a __U is used to determine if
2321///    given element should be taken from \a __W instead.
2322///
2323/// \code{.operation}
2324/// FOR i := 0 to 7
2325/// 	IF __U[i]
2326/// 		dst.bf8[i] := convert_fp16_to_bf8(__A.fp16[i])
2327/// 	ELSE
2328/// 		dst.bf8[i] := __W.bf8[i]
2329/// 	FI
2330/// ENDFOR
2331///
2332/// dst[MAX:64] := 0
2333/// \endcode
2334///
2335/// \headerfile <immintrin.h>
2336///
2337/// This intrinsic corresponds to the \c VCVTPH2BF8 instruction.
2338///
2339/// \param __W
2340///    A 128-bit vector of [16 x bf8].
2341/// \param __U
///    An 8-bit merging mask.
2343/// \param __A
2344///    A 128-bit vector of [8 x fp16].
2345/// \returns
2346///    A 128-bit vector of [16 x bf8]. Lower elements correspond to the
2347///    (converted) elements from \a __A; upper elements are zeroed. If
2348///    corresponding mask bit is not set, then element from \a __W is taken instead.
2349static __inline__ __m128i __DEFAULT_FN_ATTRS128
2350_mm_mask_cvtph_bf8(__m128i __W, __mmask8 __U, __m128h __A) {
2351  return (__m128i)__builtin_ia32_vcvtph2bf8_128_mask(
2352      (__v8hf)__A, (__v16qi)(__m128i)__W, (__mmask8)__U);
2353}
2354
2355/// Convert 128-bit vector \a __A containing packed FP16 floating-point elements
2356///    to a 128-bit vector containing E5M2 FP8 elements. Upper elements of
2357///    resulting vector are zeroed. Zeroing mask \a __U is used to determine if
2358///    given element should be zeroed instead.
2359///
2360/// \code{.operation}
2361/// FOR i := 0 to 7
2362/// 	IF __U[i]
2363/// 		dst.bf8[i] := convert_fp16_to_bf8(__A.fp16[i])
2364/// 	ELSE
2365/// 		dst.bf8[i] := 0
2366/// 	FI
2367/// ENDFOR
2368///
2369/// dst[MAX:64] := 0
2370/// \endcode
2371///
2372/// \headerfile <immintrin.h>
2373///
2374/// This intrinsic corresponds to the \c VCVTPH2BF8 instruction.
2375///
2376/// \param __U
///    An 8-bit zeroing mask.
2378/// \param __A
2379///    A 128-bit vector of [8 x fp16].
2380/// \returns
2381///    A 128-bit vector of [16 x bf8]. Lower elements correspond to the
2382///    (converted) elements from \a __A; upper elements are zeroed. If
2383///    corresponding mask bit is not set, then element is zeroed.
2384static __inline__ __m128i __DEFAULT_FN_ATTRS128
2385_mm_maskz_cvtph_bf8(__mmask8 __U, __m128h __A) {
2386  return (__m128i)__builtin_ia32_vcvtph2bf8_128_mask(
2387      (__v8hf)__A, (__v16qi)(__m128i)_mm_setzero_si128(), (__mmask8)__U);
2388}
2389
2390/// Convert 256-bit vector \a __A containing packed FP16 floating-point elements
2391///    to a 128-bit vector containing E5M2 FP8 elements.
2392///
2393/// \code{.operation}
2394/// FOR i := 0 to 15
2395/// 	dst.bf8[i] := convert_fp16_to_bf8(__A.fp16[i])
2396/// ENDFOR
2397///
2398/// dst[MAX:128] := 0
2399/// \endcode
2400///
2401/// \headerfile <immintrin.h>
2402///
2403/// This intrinsic corresponds to the \c VCVTPH2BF8 instruction.
2404///
2405/// \param __A
2406///    A 256-bit vector of [16 x fp16].
2407/// \returns
2408///    A 128-bit vector of [16 x bf8]. Resulting elements correspond to the (converted)
2409///    elements from \a __A.
2410static __inline__ __m128i __DEFAULT_FN_ATTRS256
2411_mm256_cvtph_bf8(__m256h __A) {
2412  return (__m128i)__builtin_ia32_vcvtph2bf8_256_mask(
2413      (__v16hf)__A, (__v16qi)(__m128i)_mm_undefined_si128(), (__mmask16)-1);
2414}
2415
2416/// Convert 256-bit vector \a __A containing packed FP16 floating-point elements
2417///    to a 128-bit vector containing E5M2 FP8 elements. Merging mask \a __U is
2418///    used to determine if given element should be taken from \a __W instead.
2419///   
2420/// \code{.operation}
2421/// FOR i := 0 to 15
2422/// 	IF __U[i]
2423/// 		dst.bf8[i] := convert_fp16_to_bf8(__A.fp16[i])
2424/// 	ELSE
2425/// 		dst.bf8[i] := __W.bf8[i]
2426/// 	FI
2427/// ENDFOR
2428///
2429/// dst[MAX:128] := 0
2430/// \endcode
2431///
2432/// \headerfile <immintrin.h>
2433///
2434/// This intrinsic corresponds to the \c VCVTPH2BF8 instruction.
2435///
2436/// \param __W
2437///    A 128-bit vector of [16 x bf8].
2438/// \param __U
2439///    A 16-bit merging mask.
2440/// \param __A
///    A 256-bit vector of [16 x fp16].
2442/// \returns
2443///    A 128-bit vector of [16 x bf8]. Resulting elements correspond to the
2444///    (converted) elements from \a __A. If
2445///    corresponding mask bit is not set, then element from \a __W is taken instead.
2446static __inline__ __m128i __DEFAULT_FN_ATTRS256
2447_mm256_mask_cvtph_bf8(__m128i __W, __mmask16 __U, __m256h __A) {
2448  return (__m128i)__builtin_ia32_vcvtph2bf8_256_mask(
2449      (__v16hf)__A, (__v16qi)(__m128i)__W, (__mmask16)__U);
2450}
2451
2452/// Convert 256-bit vector \a __A containing packed FP16 floating-point elements
2453///    to a 128-bit vector containing E5M2 FP8 elements. Zeroing mask \a __U is
2454///    used to determine if given element should be zeroed instead.
2455///   
2456/// \code{.operation}
2457/// FOR i := 0 to 15
2458/// 	IF __U[i]
2459/// 		dst.bf8[i] := convert_fp16_to_bf8(__A.fp16[i])
2460/// 	ELSE
2461/// 		dst.bf8[i] := 0
2462/// 	FI
2463/// ENDFOR
2464///
2465/// dst[MAX:128] := 0
2466/// \endcode
2467///
2468/// \headerfile <immintrin.h>
2469///
2470/// This intrinsic corresponds to the \c VCVTPH2BF8 instruction.
2471///
2472/// \param __U
2473///    A 16-bit zeroing mask.
2474/// \param __A
2475///    A 256-bit vector of [16 x fp16].
2476/// \returns
2477///    A 128-bit vector of [16 x bf8]. Resulting elements correspond to the
2478///    (converted) elements from \a __A. If corresponding mask bit is not set,
2479///    then element is zeroed instead.
2480static __inline__ __m128i __DEFAULT_FN_ATTRS256
2481_mm256_maskz_cvtph_bf8(__mmask16 __U, __m256h __A) {
2482  return (__m128i)__builtin_ia32_vcvtph2bf8_256_mask(
2483      (__v16hf)__A, (__v16qi)(__m128i)_mm_setzero_si128(), (__mmask16)__U);
2484}
2485
2486/// Convert 128-bit vector \a __A containing packed FP16 floating-point elements
2487///    to a 128-bit vector containing E5M2 FP8 elements. Upper elements of
2488///    resulting vector are zeroed. Results are saturated.
2489///   
2490/// \code{.operation}
2491/// FOR i := 0 to 7
2492/// 	dst.bf8[i] := convert_fp16_to_bf8_saturate(__A.fp16[i])
2493/// ENDFOR
2494///
2495/// dst[MAX:64] := 0
2496/// \endcode
2497///
2498/// \headerfile <immintrin.h>
2499///
2500/// This intrinsic corresponds to the \c VCVTPH2BF8S instruction.
2501///
2502/// \param __A
2503///    A 128-bit vector of [8 x fp16].
2504/// \returns
2505///    A 128-bit vector of [16 x bf8]. Lower elements correspond to the (converted)
2506///    elements from \a __A; upper elements are zeroed. 
2507static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvts_ph_bf8(__m128h __A) {
2508  return (__m128i)__builtin_ia32_vcvtph2bf8s_128_mask(
2509      (__v8hf)__A, (__v16qi)(__m128i)_mm_undefined_si128(), (__mmask8)-1);
2510}
2511
2512/// Convert 128-bit vector \a __A containing packed FP16 floating-point elements
2513///    to a 128-bit vector containing E5M2 FP8 elements. Upper elements of
2514///    resulting vector are zeroed. Results are saturated. Merging mask \a __U is
2515///    used to determine if given element should be taken from \a __W instead.
2516///
2517/// \code{.operation}
2518/// FOR i := 0 to 7
2519/// 	IF __U[i]
2520/// 		dst.bf8[i] := convert_fp16_to_bf8_saturate(__A.fp16[i])
2521/// 	ELSE
2522/// 		dst.bf8[i] := __W.bf8[i]
2523/// 	FI
2524/// ENDFOR
2525///
2526/// dst[MAX:64] := 0
2527/// \endcode
2528///
2529/// \headerfile <immintrin.h>
2530///
2531/// This intrinsic corresponds to the \c VCVTPH2BF8S instruction.
2532///
2533/// \param __W
2534///    A 128-bit vector of [16 x bf8].
2535/// \param __U
///    An 8-bit merging mask.
2537/// \param __A
2538///    A 128-bit vector of [8 x fp16].
2539/// \returns
2540///    A 128-bit vector of [16 x bf8]. Lower elements correspond to the
2541///    (converted) elements from \a __A; upper elements are zeroed. If
2542///    corresponding mask bit is not set, then element from \a __W is taken instead.
2543static __inline__ __m128i __DEFAULT_FN_ATTRS128
2544_mm_mask_cvts_ph_bf8(__m128i __W, __mmask8 __U, __m128h __A) {
2545  return (__m128i)__builtin_ia32_vcvtph2bf8s_128_mask(
2546      (__v8hf)__A, (__v16qi)(__m128i)__W, (__mmask8)__U);
2547}
2548
2549/// Convert 128-bit vector \a __A containing packed FP16 floating-point elements
2550///    to a 128-bit vector containing E5M2 FP8 elements. Upper elements of
2551///    resulting vector are zeroed. Results are saturated. Zeroing mask \a __U is
2552///    used to determine if given element should be zeroed instead.
2553///
2554/// \code{.operation}
2555/// FOR i := 0 to 7
2556/// 	IF __U[i]
2557/// 		dst.bf8[i] := convert_fp16_to_bf8_saturate(__A.fp16[i])
2558/// 	ELSE
2559/// 		dst.bf8[i] := 0
2560/// 	FI
2561/// ENDFOR
2562///
2563/// dst[MAX:64] := 0
2564/// \endcode
2565///
2566/// \headerfile <immintrin.h>
2567///
2568/// This intrinsic corresponds to the \c VCVTPH2BF8S instruction.
2569///
2570/// \param __U
2571///    A 8-bit zeroing mask.
2572/// \param __A
2573///    A 128-bit vector of [8 x fp16].
2574/// \returns
2575///    A 128-bit vector of [16 x bf8]. Lower elements correspond to the
2576///    (converted) elements from \a __A; upper elements are zeroed. If
2577///    corresponding mask bit is not set, then element is zeroed.
2578static __inline__ __m128i __DEFAULT_FN_ATTRS128
2579_mm_maskz_cvts_ph_bf8(__mmask8 __U, __m128h __A) {
2580  return (__m128i)__builtin_ia32_vcvtph2bf8s_128_mask(
2581      (__v8hf)__A, (__v16qi)(__m128i)_mm_setzero_si128(), (__mmask8)__U);
2582}
2583
2584/// Convert 256-bit vector \a __A containing packed FP16 floating-point elements
2585///    to a 128-bit vector containing E5M2 FP8 elements. Results are saturated.
2586///
2587/// \code{.operation}
2588/// FOR i := 0 to 15
2589/// 	dst.bf8[i] := convert_fp16_to_bf8_saturate(__A.fp16[i])
2590/// ENDFOR
2591///
2592/// dst[MAX:128] := 0
2593/// \endcode
2594///
2595/// \headerfile <immintrin.h>
2596///
2597/// This intrinsic corresponds to the \c VCVTPH2BF8S instruction.
2598///
2599/// \param __A
2600///    A 256-bit vector of [16 x fp16].
2601/// \returns
2602///    A 128-bit vector of [16 x bf8]. Resulting elements correspond to the (converted)
2603///    elements from \a __A.
2604static __inline__ __m128i __DEFAULT_FN_ATTRS256
2605_mm256_cvts_ph_bf8(__m256h __A) {
2606  return (__m128i)__builtin_ia32_vcvtph2bf8s_256_mask(
2607      (__v16hf)__A, (__v16qi)(__m128i)_mm_undefined_si128(), (__mmask16)-1);
2608}
2609
2610/// Convert 256-bit vector \a __A containing packed FP16 floating-point elements
2611///    to a 128-bit vector containing E5M2 FP8 elements. Results are saturated.
2612///    Merging mask \a __U is used to determine if given element should be taken
2613///    from \a __W instead.
2614///
2615/// \code{.operation}
2616/// FOR i := 0 to 15
2617/// 	IF __U[i]
2618/// 		dst.bf8[i] := convert_fp16_to_bf8_saturate(__A.fp16[i])
2619/// 	ELSE
2620/// 		dst.bf8[i] := __W.bf8[i]
2621/// 	FI
2622/// ENDFOR
2623///
2624/// dst[MAX:128] := 0
2625/// \endcode
2626///
2627/// \headerfile <immintrin.h>
2628///
2629/// This intrinsic corresponds to the \c VCVTPH2BF8S instruction.
2630///
2631/// \param __W
2632///    A 128-bit vector of [16 x bf8].
2633/// \param __U
2634///    A 16-bit merging mask.
2635/// \param __A
///    A 256-bit vector of [16 x fp16].
2637/// \returns
2638///    A 128-bit vector of [16 x bf8]. Resulting elements correspond to the
2639///    (converted) elements from \a __A. If
2640///    corresponding mask bit is not set, then element from \a __W is taken instead.
2641static __inline__ __m128i __DEFAULT_FN_ATTRS256
2642_mm256_mask_cvts_ph_bf8(__m128i __W, __mmask16 __U, __m256h __A) {
2643  return (__m128i)__builtin_ia32_vcvtph2bf8s_256_mask(
2644      (__v16hf)__A, (__v16qi)(__m128i)__W, (__mmask16)__U);
2645}
2646
2647/// Convert 256-bit vector \a __A containing packed FP16 floating-point elements
2648///    to a 128-bit vector containing E5M2 FP8 elements. Results are saturated.
2649///    Zeroing mask \a __U is used to determine if given element should be zeroed
2650///    instead.
2651///
2652/// \code{.operation}
2653/// FOR i := 0 to 15 
2654/// 	IF __U[i]
2655/// 		dst.bf8[i] := convert_fp16_to_bf8_saturate(__A.fp16[i])
2656/// 	ELSE
2657/// 		dst.bf8[i] := 0
2658/// 	FI
2659/// ENDFOR
2660///
2661/// dst[MAX:128] := 0
2662/// \endcode
2663///
2664/// \headerfile <immintrin.h>
2665///
2666/// This intrinsic corresponds to the \c VCVTPH2BF8S instruction.
2667///
2668/// \param __U
2669///    A 16-bit zeroing mask.
2670/// \param __A
2671///    A 256-bit vector of [16 x fp16].
2672/// \returns
2673///    A 128-bit vector of [16 x bf8]. Resulting elements correspond to the
2674///    (converted) elements from \a __A. If corresponding mask bit is not set,
2675///    then element is zeroed instead.
2676static __inline__ __m128i __DEFAULT_FN_ATTRS256
2677_mm256_maskz_cvts_ph_bf8(__mmask16 __U, __m256h __A) {
2678  return (__m128i)__builtin_ia32_vcvtph2bf8s_256_mask(
2679      (__v16hf)__A, (__v16qi)(__m128i)_mm_setzero_si128(), (__mmask16)__U);
2680}
2681
2682/// Convert 128-bit vector \a __A containing packed FP16 floating-point elements
///    to a 128-bit vector containing E4M3 FP8 elements. Upper elements of
2684///    resulting vector are zeroed.
2685///
2686/// \code{.operation}
2687/// FOR i := 0 to 7
2688/// 	dst.hf8[i] := convert_fp16_to_hf8(__A.fp16[i])
2689/// ENDFOR
2690///
2691/// dst[MAX:64] := 0
2692/// \endcode
2693///
2694/// \headerfile <immintrin.h>
2695///
2696/// This intrinsic corresponds to the \c VCVTPH2HF8 instruction.
2697///
2698/// \param __A
2699///    A 128-bit vector of [8 x fp16].
2700/// \returns
2701///    A 128-bit vector of [16 x hf8]. Lower elements correspond to the (converted)
2702///    elements from \a __A; upper elements are zeroed. 
2703static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtph_hf8(__m128h __A) {
2704  return (__m128i)__builtin_ia32_vcvtph2hf8_128_mask(
2705      (__v8hf)__A, (__v16qi)(__m128i)_mm_undefined_si128(), (__mmask8)-1);
2706}
2707
2708/// Convert 128-bit vector \a __A containing packed FP16 floating-point elements
2709///    to a 128-bit vector containing E4M3 FP8 elements. Upper elements of
2710///    resulting vector are zeroed. Merging mask \a __U is used to determine if
2711///    given element should be taken from \a __W instead.
2712///
2713/// \code{.operation}
2714/// FOR i := 0 to 7
2715/// 	IF __U[i]
2716/// 		dst.hf8[i] := convert_fp16_to_hf8(__A.fp16[i])
2717/// 	ELSE
2718/// 		dst.hf8[i] := __W.hf8[i]
2719/// 	FI
2720/// ENDFOR
2721///
2722/// dst[MAX:64] := 0
2723/// \endcode
2724///
2725/// \headerfile <immintrin.h>
2726///
2727/// This intrinsic corresponds to the \c VCVTPH2HF8 instruction.
2728///
2729/// \param __W
2730///    A 128-bit vector of [16 x hf8].
2731/// \param __U
2732///    A 8-bit merging mask.
2733/// \param __A
2734///    A 128-bit vector of [8 x fp16].
2735/// \returns
2736///    A 128-bit vector of [16 x hf8]. Lower elements correspond to the
2737///    (converted) elements from \a __A; upper elements are zeroed. If
2738///    corresponding mask bit is not set, then element from \a __W is taken instead.
2739static __inline__ __m128i __DEFAULT_FN_ATTRS128
2740_mm_mask_cvtph_hf8(__m128i __W, __mmask8 __U, __m128h __A) {
2741  return (__m128i)__builtin_ia32_vcvtph2hf8_128_mask(
2742      (__v8hf)__A, (__v16qi)(__m128i)__W, (__mmask8)__U);
2743}
2744
2745/// Convert 128-bit vector \a __A containing packed FP16 floating-point elements
2746///    to a 128-bit vector containing E4M3 FP8 elements. Upper elements of
2747///    resulting vector are zeroed. Zeroing mask \a __U is used to determine if
2748///    given element should be zeroed instead.
2749///
2750/// \code{.operation}
2751/// FOR i := 0 to 7
2752/// 	IF __U[i]
2753/// 		dst.hf8[i] := convert_fp16_to_hf8(__A.fp16[i])
2754/// 	ELSE
2755/// 		dst.hf8[i] := 0
2756/// 	FI
2757/// ENDFOR
2758///
2759/// dst[MAX:64] := 0
2760/// \endcode
2761///
2762/// \headerfile <immintrin.h>
2763///
2764/// This intrinsic corresponds to the \c VCVTPH2HF8 instruction.
2765///
2766/// \param __U
2767///    A 8-bit zeroing mask.
2768/// \param __A
2769///    A 128-bit vector of [8 x fp16].
2770/// \returns
2771///    A 128-bit vector of [16 x hf8]. Lower elements correspond to the
2772///    (converted) elements from \a __A; upper elements are zeroed. If
2773///    corresponding mask bit is not set, then element is zeroed.
2774static __inline__ __m128i __DEFAULT_FN_ATTRS128
2775_mm_maskz_cvtph_hf8(__mmask8 __U, __m128h __A) {
2776  return (__m128i)__builtin_ia32_vcvtph2hf8_128_mask(
2777      (__v8hf)__A, (__v16qi)(__m128i)_mm_setzero_si128(), (__mmask8)__U);
2778}
2779
2780/// Convert 256-bit vector \a __A containing packed FP16 floating-point elements
2781///    to a 128-bit vector containing E4M3 FP8 elements.
2782///
2783/// \code{.operation}
2784/// FOR i := 0 to 15
2785/// 	dst.hf8[i] := convert_fp16_to_hf8(__A.fp16[i])
2786/// ENDFOR
2787///
2788/// dst[MAX:128] := 0
2789/// \endcode
2790///
2791/// \headerfile <immintrin.h>
2792///
2793/// This intrinsic corresponds to the \c VCVTPH2HF8 instruction.
2794///
2795/// \param __A
2796///    A 256-bit vector of [16 x fp16].
2797/// \returns
2798///    A 128-bit vector of [16 x hf8]. Resulting elements correspond to the (converted)
2799///    elements from \a __A.
2800static __inline__ __m128i __DEFAULT_FN_ATTRS256
2801_mm256_cvtph_hf8(__m256h __A) {
2802  return (__m128i)__builtin_ia32_vcvtph2hf8_256_mask(
2803      (__v16hf)__A, (__v16qi)(__m128i)_mm_undefined_si128(), (__mmask16)-1);
2804}
2805
2806/// Convert 256-bit vector \a __A containing packed FP16 floating-point elements
2807///    to a 128-bit vector containing E4M3 FP8 elements. Merging mask \a __U is
2808///    used to determine if given element should be taken from \a __W instead.
2809///   
2810/// \code{.operation}
2811/// FOR i := 0 to 15
2812/// 	IF __U[i]
2813/// 		dst.hf8[i] := convert_fp16_to_hf8(__A.fp16[i])
2814/// 	ELSE
2815/// 		dst.hf8[i] := __W.hf8[i]
2816/// 	FI
2817/// ENDFOR
2818///
2819/// dst[MAX:128] := 0
2820/// \endcode
2821///
2822/// \headerfile <immintrin.h>
2823///
2824/// This intrinsic corresponds to the \c VCVTPH2HF8 instruction.
2825///
2826/// \param __W
2827///    A 128-bit vector of [16 x hf8].
2828/// \param __U
2829///    A 16-bit merging mask.
2830/// \param __A
///    A 256-bit vector of [16 x fp16].
2832/// \returns
2833///    A 128-bit vector of [16 x hf8]. Resulting elements correspond to the
2834///    (converted) elements from \a __A. If
2835///    corresponding mask bit is not set, then element from \a __W is taken instead.
2836static __inline__ __m128i __DEFAULT_FN_ATTRS256
2837_mm256_mask_cvtph_hf8(__m128i __W, __mmask16 __U, __m256h __A) {
2838  return (__m128i)__builtin_ia32_vcvtph2hf8_256_mask(
2839      (__v16hf)__A, (__v16qi)(__m128i)__W, (__mmask16)__U);
2840}
2841
2842/// Convert 256-bit vector \a __A containing packed FP16 floating-point elements
2843///    to a 128-bit vector containing E4M3 FP8 elements. Zeroing mask \a __U is
2844///    used to determine if given element should be zeroed instead.
2845///   
2846/// \code{.operation}
2847/// FOR i := 0 to 15
2848/// 	IF __U[i]
2849/// 		dst.hf8[i] := convert_fp16_to_hf8(__A.fp16[i])
2850/// 	ELSE
2851/// 		dst.hf8[i] := 0
2852/// 	FI
2853/// ENDFOR
2854///
2855/// dst[MAX:128] := 0
2856/// \endcode
2857///
2858/// \headerfile <immintrin.h>
2859///
2860/// This intrinsic corresponds to the \c VCVTPH2HF8 instruction.
2861///
2862/// \param __U
2863///    A 16-bit zeroing mask.
2864/// \param __A
2865///    A 256-bit vector of [16 x fp16].
2866/// \returns
2867///    A 128-bit vector of [16 x hf8]. Resulting elements correspond to the
2868///    (converted) elements from \a __A. If corresponding mask bit is not set,
2869///    then element is zeroed instead.
2870static __inline__ __m128i __DEFAULT_FN_ATTRS256
2871_mm256_maskz_cvtph_hf8(__mmask16 __U, __m256h __A) {
2872  return (__m128i)__builtin_ia32_vcvtph2hf8_256_mask(
2873      (__v16hf)__A, (__v16qi)(__m128i)_mm_setzero_si128(), (__mmask16)__U);
2874}
2875
2876/// Convert 128-bit vector \a __A containing packed FP16 floating-point elements
2877///    to a 128-bit vector containing E4M3 FP8 elements. Upper elements of
2878///    resulting vector are zeroed. Results are saturated.
2879///   
2880/// \code{.operation}
2881/// FOR i := 0 to 7
2882/// 	dst.hf8[i] := convert_fp16_to_hf8_saturate(__A.fp16[i])
2883/// ENDFOR
2884///
2885/// dst[MAX:64] := 0
2886/// \endcode
2887///
2888/// \headerfile <immintrin.h>
2889///
2890/// This intrinsic corresponds to the \c VCVTPH2HF8S instruction.
2891///
2892/// \param __A
2893///    A 128-bit vector of [8 x fp16].
2894/// \returns
2895///    A 128-bit vector of [16 x hf8]. Lower elements correspond to the (converted)
2896///    elements from \a __A; upper elements are zeroed. 
2897static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvts_ph_hf8(__m128h __A) {
2898  return (__m128i)__builtin_ia32_vcvtph2hf8s_128_mask(
2899      (__v8hf)__A, (__v16qi)(__m128i)_mm_undefined_si128(), (__mmask8)-1);
2900}
2901
2902/// Convert 128-bit vector \a __A containing packed FP16 floating-point elements
2903///    to a 128-bit vector containing E4M3 FP8 elements. Upper elements of
2904///    resulting vector are zeroed. Results are saturated. Merging mask \a __U is
2905///    used to determine if given element should be taken from \a __W instead.
2906///
2907/// \code{.operation}
2908/// FOR i := 0 to 7
2909/// 	IF __U[i]
2910/// 		dst.hf8[i] := convert_fp16_to_hf8_saturate(__A.fp16[i])
2911/// 	ELSE
2912/// 		dst.hf8[i] := __W.hf8[i]
2913/// 	FI
2914/// ENDFOR
2915///
2916/// dst[MAX:64] := 0
2917/// \endcode
2918///
2919/// \headerfile <immintrin.h>
2920///
2921/// This intrinsic corresponds to the \c VCVTPH2HF8S instruction.
2922///
2923/// \param __W
2924///    A 128-bit vector of [16 x hf8].
2925/// \param __U
2926///    A 8-bit merging mask.
2927/// \param __A
2928///    A 128-bit vector of [8 x fp16].
2929/// \returns
2930///    A 128-bit vector of [16 x hf8]. Lower elements correspond to the
2931///    (converted) elements from \a __A; upper elements are zeroed. If
2932///    corresponding mask bit is not set, then element from \a __W is taken instead.
2933static __inline__ __m128i __DEFAULT_FN_ATTRS128
2934_mm_mask_cvts_ph_hf8(__m128i __W, __mmask8 __U, __m128h __A) {
2935  return (__m128i)__builtin_ia32_vcvtph2hf8s_128_mask(
2936      (__v8hf)__A, (__v16qi)(__m128i)__W, (__mmask8)__U);
2937}
2938
2939/// Convert 128-bit vector \a __A containing packed FP16 floating-point elements
2940///    to a 128-bit vector containing E4M3 FP8 elements. Upper elements of
2941///    resulting vector are zeroed. Results are saturated. Zeroing mask \a __U is
2942///    used to determine if given element should be zeroed instead.
2943///
2944/// \code{.operation}
2945/// FOR i := 0 to 7
2946/// 	IF __U[i]
2947/// 		dst.hf8[i] := convert_fp16_to_hf8_saturate(__A.fp16[i])
2948/// 	ELSE
2949/// 		dst.hf8[i] := 0
2950/// 	FI
2951/// ENDFOR
2952///
2953/// dst[MAX:64] := 0
2954/// \endcode
2955///
2956/// \headerfile <immintrin.h>
2957///
2958/// This intrinsic corresponds to the \c VCVTPH2HF8S instruction.
2959///
2960/// \param __U
2961///    A 8-bit zeroing mask.
2962/// \param __A
2963///    A 128-bit vector of [8 x fp16].
2964/// \returns
2965///    A 128-bit vector of [16 x hf8]. Lower elements correspond to the
2966///    (converted) elements from \a __A; upper elements are zeroed. If
2967///    corresponding mask bit is not set, then element is zeroed.
2968static __inline__ __m128i __DEFAULT_FN_ATTRS128
2969_mm_maskz_cvts_ph_hf8(__mmask8 __U, __m128h __A) {
2970  return (__m128i)__builtin_ia32_vcvtph2hf8s_128_mask(
2971      (__v8hf)__A, (__v16qi)(__m128i)_mm_setzero_si128(), (__mmask8)__U);
2972}
2973
2974/// Convert 256-bit vector \a __A containing packed FP16 floating-point elements
2975///    to a 128-bit vector containing E4M3 FP8 elements. Results are saturated.
2976///
2977/// \code{.operation}
2978/// FOR i := 0 to 15
2979/// 	dst.hf8[i] := convert_fp16_to_hf8_saturate(__A.fp16[i])
2980/// ENDFOR
2981///
2982/// dst[MAX:128] := 0
2983/// \endcode
2984///
2985/// \headerfile <immintrin.h>
2986///
2987/// This intrinsic corresponds to the \c VCVTPH2HF8S instruction.
2988///
2989/// \param __A
2990///    A 256-bit vector of [16 x fp16].
2991/// \returns
2992///    A 128-bit vector of [16 x hf8]. Resulting elements correspond to the (converted)
2993///    elements from \a __A.
2994static __inline__ __m128i __DEFAULT_FN_ATTRS256
2995_mm256_cvts_ph_hf8(__m256h __A) {
2996  return (__m128i)__builtin_ia32_vcvtph2hf8s_256_mask(
2997      (__v16hf)__A, (__v16qi)(__m128i)_mm_undefined_si128(), (__mmask16)-1);
2998}
2999
3000/// Convert 256-bit vector \a __A containing packed FP16 floating-point elements
3001///    to a 128-bit vector containing E4M3 FP8 elements. Results are saturated.
3002///    Merging mask \a __U is used to determine if given element should be taken
3003///    from \a __W instead.
3004///
3005/// \code{.operation}
3006/// FOR i := 0 to 15
3007/// 	IF __U[i]
3008/// 		dst.hf8[i] := convert_fp16_to_hf8_saturate(__A.fp16[i])
3009/// 	ELSE
3010/// 		dst.hf8[i] := __W.hf8[i]
3011/// 	FI
3012/// ENDFOR
3013///
3014/// dst[MAX:128] := 0
3015/// \endcode
3016///
3017/// \headerfile <immintrin.h>
3018///
3019/// This intrinsic corresponds to the \c VCVTPH2HF8S instruction.
3020///
3021/// \param __W
3022///    A 128-bit vector of [16 x hf8].
3023/// \param __U
3024///    A 16-bit merging mask.
3025/// \param __A
///    A 256-bit vector of [16 x fp16].
3027/// \returns
3028///    A 128-bit vector of [16 x hf8]. Resulting elements correspond to the
3029///    (converted) elements from \a __A. If
3030///    corresponding mask bit is not set, then element from \a __W is taken instead.
3031static __inline__ __m128i __DEFAULT_FN_ATTRS256
3032_mm256_mask_cvts_ph_hf8(__m128i __W, __mmask16 __U, __m256h __A) {
3033  return (__m128i)__builtin_ia32_vcvtph2hf8s_256_mask(
3034      (__v16hf)__A, (__v16qi)(__m128i)__W, (__mmask16)__U);
3035}
3036
3037/// Convert 256-bit vector \a __A containing packed FP16 floating-point elements
3038///    to a 128-bit vector containing E4M3 FP8 elements. Results are saturated.
3039///    Zeroing mask \a __U is used to determine if given element should be zeroed
3040///    instead.
3041///
3042/// \code{.operation}
3043/// FOR i := 0 to 15 
3044/// 	IF __U[i]
3045/// 		dst.hf8[i] := convert_fp16_to_hf8_saturate(__A.fp16[i])
3046/// 	ELSE
3047/// 		dst.hf8[i] := 0
3048/// 	FI
3049/// ENDFOR
3050///
3051/// dst[MAX:128] := 0
3052/// \endcode
3053///
3054/// \headerfile <immintrin.h>
3055///
3056/// This intrinsic corresponds to the \c VCVTPH2HF8S instruction.
3057///
3058/// \param __U
3059///    A 16-bit zeroing mask.
3060/// \param __A
3061///    A 256-bit vector of [16 x fp16].
3062/// \returns
3063///    A 128-bit vector of [16 x hf8]. Resulting elements correspond to the
3064///    (converted) elements from \a __A. If corresponding mask bit is not set,
3065///    then element is zeroed instead.
3066static __inline__ __m128i __DEFAULT_FN_ATTRS256
3067_mm256_maskz_cvts_ph_hf8(__mmask16 __U, __m256h __A) {
3068  return (__m128i)__builtin_ia32_vcvtph2hf8s_256_mask(
3069      (__v16hf)__A, (__v16qi)(__m128i)_mm_setzero_si128(), (__mmask16)__U);
3070}
3071
3072/// Convert 128-bit vector \a __A, containing packed FP8 E5M2 floating-point
3073///    elements to a 128-bit vector containing FP16 elements. The conversion is exact.
3074///
3075/// \code{.operation}
3076/// FOR i := 0 to 7
3077/// 	dst.fp16[i] := convert_bf8_to_fp16(__A.bf8[i])
3078/// ENDFOR
3079/// \endcode
3080///
3081/// \headerfile <immintrin.h>
3082///
3083/// This intrinsic does not correspond to a single instruction.
3084///
3085/// \param __A
3086///    A 128-bit vector of [16 x bf8].
3087/// \returns
3088///    A 128-bit vector of [8 x fp16]. Resulting elements correspond to the
3089///    (converted) elements from \a __A.
3090static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtbf8_ph(__m128i __A) {
3091  return _mm_castsi128_ph(_mm_slli_epi16(_mm_cvtepi8_epi16(__A), 8));
3092}
3093
3094/// Convert 128-bit vector \a __A, containing packed FP8 E5M2 floating-point
3095///    elements to a 128-bit vector containing FP16 elements. The conversion is
3096///    exact. Merging mask \a __U is used to determine if given element should be
3097///    taken from \a __W instead.
3098///
3099/// \code{.operation}
3100/// FOR i := 0 to 7
3101/// 	IF __U[i]
3102/// 		dst.fp16[i] := convert_bf8_to_fp16(__A.bf8[i])
3103/// 	ELSE
3104/// 		dst.fp16[i] := __W.fp16[i]
3105/// 	FI
3106/// ENDFOR
3107/// \endcode
3108///
3109/// \headerfile <immintrin.h>
3110///
3111/// This intrinsic does not correspond to a single instruction.
3112///
3113/// \param __W
3114///    A 128-bit vector of [8 x fp16].
3115/// \param __U
3116///    A 8-bit merging mask.
3117/// \param __A
3118///    A 128-bit vector of [16 x bf8].
3119/// \returns
3120///    A 128-bit vector of [8 x fp16]. Resulting elements correspond to the
3121///    (converted) elements from \a __A. If corresponding mask bit is not set, then
3122///    element from \a __W is taken instead.
3123static __inline__ __m128h __DEFAULT_FN_ATTRS128
3124_mm_mask_cvtbf8_ph(__m128h __W, __mmask8 __U, __m128i __A) {
3125  return _mm_castsi128_ph(
3126      _mm_mask_slli_epi16((__m128i)__W, __U, _mm_cvtepi8_epi16(__A), 8));
3127}
3128
3129/// Convert 128-bit vector \a __A, containing packed FP8 E5M2 floating-point
3130///    elements to a 128-bit vector containing FP16 elements. The conversion is
3131///    exact. Zeroing mask \a __U is used to determine if given element should be
3132///    zeroed instead.
3133///
3134/// \code{.operation}
3135/// FOR i := 0 to 7
3136/// 	IF __U[i]
3137/// 		dst.fp16[i] := convert_bf8_to_fp16(__A.bf8[i])
3138/// 	ELSE
3139/// 		dst.fp16[i] := 0
3140/// 	FI
3141/// ENDFOR
3142/// \endcode
3143///
3144/// \headerfile <immintrin.h>
3145///
3146/// This intrinsic does not correspond to a single instruction.
3147///
3148/// \param __U
3149///    A 8-bit zeroing mask.
3150/// \param __A
3151///    A 128-bit vector of [16 x bf8].
3152/// \returns
3153///    A 128-bit vector of [8 x fp16]. Resulting elements correspond to the
3154///    (converted) elements from \a __A. If corresponding mask bit is not set, then
3155///    zero is taken instead.
3156static __inline__ __m128h __DEFAULT_FN_ATTRS128
3157_mm_maskz_cvtbf8_ph(__mmask8 __U, __m128i __A) {
3158  return _mm_castsi128_ph(_mm_slli_epi16(_mm_maskz_cvtepi8_epi16(__U, __A), 8));
3159}
3160
/// Convert 128-bit vector \a __A, containing packed FP8 E5M2 floating-point
3162///    elements to a 256-bit vector containing FP16 elements. The conversion is exact.
3163///
3164/// \code{.operation}
3165/// FOR i := 0 to 15
3166/// 	dst.fp16[i] := convert_bf8_to_fp16(__A.bf8[i])
3167/// ENDFOR
3168/// \endcode
3169///
3170/// \headerfile <immintrin.h>
3171///
3172/// This intrinsic does not correspond to a single instruction.
3173///
3174/// \param __A
///    A 128-bit vector of [16 x bf8].
3176/// \returns
3177///    A 256-bit vector of [16 x fp16]. Resulting elements correspond to the
3178///    (converted) elements from \a __A.
3179static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_cvtbf8_ph(__m128i __A) {
3180  return _mm256_castsi256_ph(_mm256_slli_epi16(_mm256_cvtepi8_epi16(__A), 8));
3181}
3182
/// Convert 128-bit vector \a __A, containing packed FP8 E5M2 floating-point
3184///    elements to a 256-bit vector containing FP16 elements. The conversion is
3185///    exact. Merging mask \a __U is used to determine if given element should be
3186///    taken from \a __W instead.
3187///
3188/// \code{.operation}
3189/// FOR i := 0 to 15 
3190/// 	IF __U[i]
3191/// 		dst.fp16[i] := convert_bf8_to_fp16(__A.bf8[i])
3192/// 	ELSE
3193/// 		dst.fp16[i] := __W.fp16[i]
3194/// 	FI
3195/// ENDFOR
3196/// \endcode
3197///
3198/// \headerfile <immintrin.h>
3199///
3200/// This intrinsic does not correspond to a single instruction.
3201///
3202/// \param __W
3203///    A 256-bit vector of [16 x fp16].
3204/// \param __U
3205///    A 16-bit merging mask.
3206/// \param __A
///    A 128-bit vector of [16 x bf8].
3208/// \returns
3209///    A 256-bit vector of [16 x fp16]. Resulting elements correspond to the
3210///    (converted) elements from \a __A. If corresponding mask bit is not set, then
3211///    element from \a __W is taken instead.
3212static __inline__ __m256h __DEFAULT_FN_ATTRS256
3213_mm256_mask_cvtbf8_ph(__m256h __W, __mmask16 __U, __m128i __A) {
3214  return _mm256_castsi256_ph(
3215      _mm256_mask_slli_epi16((__m256i)__W, __U, _mm256_cvtepi8_epi16(__A), 8));
3216}
3217
/// Convert 128-bit vector \a __A, containing packed FP8 E5M2 floating-point
3219///    elements to a 256-bit vector containing FP16 elements. The conversion is
3220///    exact. Zeroing mask \a __U is used to determine if given element should be
3221///    zeroed instead.
3222///
3223/// \code{.operation}
3224/// FOR i := 0 to 15 
3225/// 	IF __U[i]
3226/// 		dst.fp16[i] := convert_bf8_to_fp16(__A.bf8[i])
3227/// 	ELSE
3228/// 		dst.fp16[i] := 0
3229/// 	FI
3230/// ENDFOR
3231/// \endcode
3232///
3233/// \headerfile <immintrin.h>
3234///
3235/// This intrinsic does not correspond to a single instruction.
3236///
3237/// \param __U
3238///    A 16-bit zeroing mask.
3239/// \param __A
///    A 128-bit vector of [16 x bf8].
3241/// \returns
3242///    A 256-bit vector of [16 x fp16]. Resulting elements correspond to the
3243///    (converted) elements from \a __A. If corresponding mask bit is not set, then
3244///    zero is taken instead.
3245static __inline__ __m256h __DEFAULT_FN_ATTRS256
3246_mm256_maskz_cvtbf8_ph(__mmask16 __U, __m128i __A) {
3247  return _mm256_castsi256_ph(
3248      _mm256_slli_epi16(_mm256_maskz_cvtepi8_epi16(__U, __A), 8));
3249}
3250
3251// clang-format on
3252
// Remove the file-local attribute shorthands defined at the top of this
// header now that all intrinsics using them have been declared.
#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256
3255
3256#endif // __AVX10_2CONVERTINTRIN_H
3257#endif // __SSE2__