zig/lib/include/avx2intrin.h at master

   1/*===---- avx2intrin.h - AVX2 intrinsics -----------------------------------===
   2 *
   3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4 * See https://llvm.org/LICENSE.txt for license information.
   5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6 *
   7 *===-----------------------------------------------------------------------===
   8 */
   9
  10#ifndef __IMMINTRIN_H
  11#error "Never use <avx2intrin.h> directly; include <immintrin.h> instead."
  12#endif
  13
  14#ifndef __AVX2INTRIN_H
  15#define __AVX2INTRIN_H
  16
  17/* Define the default attributes for the functions in this file. */
  18#if defined(__EVEX512__) && !defined(__AVX10_1_512__)
  19#define __DEFAULT_FN_ATTRS256                                                  \
  20  __attribute__((__always_inline__, __nodebug__,                               \
  21                 __target__("avx2,no-evex512"), __min_vector_width__(256)))
  22#define __DEFAULT_FN_ATTRS128                                                  \
  23  __attribute__((__always_inline__, __nodebug__,                               \
  24                 __target__("avx2,no-evex512"), __min_vector_width__(128)))
  25#else
  26#define __DEFAULT_FN_ATTRS256                                                  \
  27  __attribute__((__always_inline__, __nodebug__, __target__("avx2"),           \
  28                 __min_vector_width__(256)))
  29#define __DEFAULT_FN_ATTRS128                                                  \
  30  __attribute__((__always_inline__, __nodebug__, __target__("avx2"),           \
  31                 __min_vector_width__(128)))
  32#endif
  33
  34/* SSE4 Multiple Packed Sums of Absolute Difference.  */
  35/// Computes sixteen sum of absolute difference (SAD) operations on sets of
  36///    four unsigned 8-bit integers from the 256-bit integer vectors \a X and
  37///    \a Y.
  38///
  39///    Eight SAD results are computed using the lower half of the input
  40///    vectors, and another eight using the upper half. These 16-bit values
  41///    are returned in the lower and upper halves of the 256-bit result,
  42///    respectively.
  43///
  44///    A single SAD operation selects four bytes from \a X and four bytes from
  45///    \a Y as input. It computes the differences between each \a X byte and
  46///    the corresponding \a Y byte, takes the absolute value of each
  47///    difference, and sums these four values to form one 16-bit result. The
  48///    intrinsic computes 16 of these results with different sets of input
  49///    bytes.
  50///
///    For each set of eight results, the SAD operations use the same four
///    bytes from \a Y; the starting bit position for these four bytes is
///    specified by \a M[1:0] times 32 for the lower half and by \a M[4:3]
///    times 32 for the upper half. The eight operations use successive sets
///    of four bytes from \a X; the starting bit position for the first set of
///    four bytes is specified by \a M[2] times 32 for the lower half and by
///    \a M[5] times 32 for the upper half. These bit positions are all
///    relative to the 128-bit lane for each set of eight operations.
  57///
  58/// \code{.operation}
  59/// r := 0
  60/// FOR i := 0 TO 1
  61///   j := i*3
  62///   Ybase := M[j+1:j]*32 + i*128
  63///   Xbase := M[j+2]*32 + i*128
///   FOR k := 0 TO 7
  65///     temp0 := ABS(X[Xbase+7:Xbase] - Y[Ybase+7:Ybase])
  66///     temp1 := ABS(X[Xbase+15:Xbase+8] - Y[Ybase+15:Ybase+8])
  67///     temp2 := ABS(X[Xbase+23:Xbase+16] - Y[Ybase+23:Ybase+16])
  68///     temp3 := ABS(X[Xbase+31:Xbase+24] - Y[Ybase+31:Ybase+24])
  69///     result[r+15:r] := temp0 + temp1 + temp2 + temp3
  70///     Xbase := Xbase + 8
  71///     r := r + 16
  72///   ENDFOR
  73/// ENDFOR
  74/// \endcode
  75///
  76/// \headerfile <immintrin.h>
  77///
  78/// \code
  79/// __m256i _mm256_mpsadbw_epu8(__m256i X, __m256i Y, const int M);
  80/// \endcode
  81///
  82/// This intrinsic corresponds to the \c VMPSADBW instruction.
  83///
  84/// \param X
  85///    A 256-bit integer vector containing one of the inputs.
  86/// \param Y
  87///    A 256-bit integer vector containing one of the inputs.
  88/// \param M
  89///     An unsigned immediate value specifying the starting positions of the
  90///     bytes to operate on.
  91/// \returns A 256-bit vector of [16 x i16] containing the result.
  92#define _mm256_mpsadbw_epu8(X, Y, M) \
  93  ((__m256i)__builtin_ia32_mpsadbw256((__v32qi)(__m256i)(X), \
  94                                      (__v32qi)(__m256i)(Y), (int)(M)))
  95
  96/// Computes the absolute value of each signed byte in the 256-bit integer
  97///    vector \a __a and returns each value in the corresponding byte of
  98///    the result.
  99///
 100/// \headerfile <immintrin.h>
 101///
 102/// This intrinsic corresponds to the \c VPABSB instruction.
 103///
 104/// \param __a
 105///    A 256-bit integer vector.
 106/// \returns A 256-bit integer vector containing the result.
 107static __inline__ __m256i __DEFAULT_FN_ATTRS256
 108_mm256_abs_epi8(__m256i __a)
 109{
 110    return (__m256i)__builtin_elementwise_abs((__v32qs)__a);
 111}
 112
 113/// Computes the absolute value of each signed 16-bit element in the 256-bit
 114///    vector of [16 x i16] in \a __a and returns each value in the
 115///    corresponding element of the result.
 116///
 117/// \headerfile <immintrin.h>
 118///
 119/// This intrinsic corresponds to the \c VPABSW instruction.
 120///
 121/// \param __a
 122///    A 256-bit vector of [16 x i16].
 123/// \returns A 256-bit vector of [16 x i16] containing the result.
 124static __inline__ __m256i __DEFAULT_FN_ATTRS256
 125_mm256_abs_epi16(__m256i __a)
 126{
 127    return (__m256i)__builtin_elementwise_abs((__v16hi)__a);
 128}
 129
 130/// Computes the absolute value of each signed 32-bit element in the 256-bit
 131///    vector of [8 x i32] in \a __a and returns each value in the
 132///    corresponding element of the result.
 133///
 134/// \headerfile <immintrin.h>
 135///
 136/// This intrinsic corresponds to the \c VPABSD instruction.
 137///
 138/// \param __a
 139///    A 256-bit vector of [8 x i32].
 140/// \returns A 256-bit vector of [8 x i32] containing the result.
 141static __inline__ __m256i __DEFAULT_FN_ATTRS256
 142_mm256_abs_epi32(__m256i __a)
 143{
 144    return (__m256i)__builtin_elementwise_abs((__v8si)__a);
 145}
 146
 147/// Converts the elements of two 256-bit vectors of [16 x i16] to 8-bit
 148///    integers using signed saturation, and returns the 256-bit result.
 149///
 150/// \code{.operation}
 151/// FOR i := 0 TO 7
 152///   j := i*16
 153///   k := i*8
 154///   result[7+k:k] := SATURATE8(__a[15+j:j])
 155///   result[71+k:64+k] := SATURATE8(__b[15+j:j])
 156///   result[135+k:128+k] := SATURATE8(__a[143+j:128+j])
 157///   result[199+k:192+k] := SATURATE8(__b[143+j:128+j])
 158/// ENDFOR
 159/// \endcode
 160///
 161/// \headerfile <immintrin.h>
 162///
 163/// This intrinsic corresponds to the \c VPACKSSWB instruction.
 164///
 165/// \param __a
 166///    A 256-bit vector of [16 x i16] used to generate result[63:0] and
 167///    result[191:128].
 168/// \param __b
 169///    A 256-bit vector of [16 x i16] used to generate result[127:64] and
 170///    result[255:192].
 171/// \returns A 256-bit integer vector containing the result.
 172static __inline__ __m256i __DEFAULT_FN_ATTRS256
 173_mm256_packs_epi16(__m256i __a, __m256i __b)
 174{
 175  return (__m256i)__builtin_ia32_packsswb256((__v16hi)__a, (__v16hi)__b);
 176}
 177
 178/// Converts the elements of two 256-bit vectors of [8 x i32] to 16-bit
 179///    integers using signed saturation, and returns the resulting 256-bit
 180///    vector of [16 x i16].
 181///
 182/// \code{.operation}
 183/// FOR i := 0 TO 3
 184///   j := i*32
 185///   k := i*16
 186///   result[15+k:k] := SATURATE16(__a[31+j:j])
 187///   result[79+k:64+k] := SATURATE16(__b[31+j:j])
 188///   result[143+k:128+k] := SATURATE16(__a[159+j:128+j])
 189///   result[207+k:192+k] := SATURATE16(__b[159+j:128+j])
 190/// ENDFOR
 191/// \endcode
 192///
 193/// \headerfile <immintrin.h>
 194///
 195/// This intrinsic corresponds to the \c VPACKSSDW instruction.
 196///
 197/// \param __a
 198///    A 256-bit vector of [8 x i32] used to generate result[63:0] and
 199///    result[191:128].
 200/// \param __b
 201///    A 256-bit vector of [8 x i32] used to generate result[127:64] and
 202///    result[255:192].
 203/// \returns A 256-bit vector of [16 x i16] containing the result.
 204static __inline__ __m256i __DEFAULT_FN_ATTRS256
 205_mm256_packs_epi32(__m256i __a, __m256i __b)
 206{
 207  return (__m256i)__builtin_ia32_packssdw256((__v8si)__a, (__v8si)__b);
 208}
 209
 210/// Converts elements from two 256-bit vectors of [16 x i16] to 8-bit integers
 211///    using unsigned saturation, and returns the 256-bit result.
 212///
 213/// \code{.operation}
 214/// FOR i := 0 TO 7
 215///   j := i*16
 216///   k := i*8
 217///   result[7+k:k] := SATURATE8U(__a[15+j:j])
 218///   result[71+k:64+k] := SATURATE8U(__b[15+j:j])
 219///   result[135+k:128+k] := SATURATE8U(__a[143+j:128+j])
 220///   result[199+k:192+k] := SATURATE8U(__b[143+j:128+j])
 221/// ENDFOR
 222/// \endcode
 223///
 224/// \headerfile <immintrin.h>
 225///
 226/// This intrinsic corresponds to the \c VPACKUSWB instruction.
 227///
 228/// \param __a
 229///    A 256-bit vector of [16 x i16] used to generate result[63:0] and
 230///    result[191:128].
 231/// \param __b
 232///    A 256-bit vector of [16 x i16] used to generate result[127:64] and
 233///    result[255:192].
 234/// \returns A 256-bit integer vector containing the result.
 235static __inline__ __m256i __DEFAULT_FN_ATTRS256
 236_mm256_packus_epi16(__m256i __a, __m256i __b)
 237{
 238  return (__m256i)__builtin_ia32_packuswb256((__v16hi)__a, (__v16hi)__b);
 239}
 240
 241/// Converts elements from two 256-bit vectors of [8 x i32] to 16-bit integers
 242///    using unsigned saturation, and returns the resulting 256-bit vector of
 243///    [16 x i16].
 244///
 245/// \code{.operation}
 246/// FOR i := 0 TO 3
 247///   j := i*32
 248///   k := i*16
 249///   result[15+k:k] := SATURATE16U(__V1[31+j:j])
 250///   result[79+k:64+k] := SATURATE16U(__V2[31+j:j])
 251///   result[143+k:128+k] := SATURATE16U(__V1[159+j:128+j])
 252///   result[207+k:192+k] := SATURATE16U(__V2[159+j:128+j])
 253/// ENDFOR
 254/// \endcode
 255///
 256/// \headerfile <immintrin.h>
 257///
 258/// This intrinsic corresponds to the \c VPACKUSDW instruction.
 259///
 260/// \param __V1
 261///    A 256-bit vector of [8 x i32] used to generate result[63:0] and
 262///    result[191:128].
 263/// \param __V2
 264///    A 256-bit vector of [8 x i32] used to generate result[127:64] and
 265///    result[255:192].
 266/// \returns A 256-bit vector of [16 x i16] containing the result.
 267static __inline__ __m256i __DEFAULT_FN_ATTRS256
 268_mm256_packus_epi32(__m256i __V1, __m256i __V2)
 269{
 270  return (__m256i) __builtin_ia32_packusdw256((__v8si)__V1, (__v8si)__V2);
 271}
 272
 273/// Adds 8-bit integers from corresponding bytes of two 256-bit integer
 274///    vectors and returns the lower 8 bits of each sum in the corresponding
 275///    byte of the 256-bit integer vector result (overflow is ignored).
 276///
 277/// \headerfile <immintrin.h>
 278///
 279/// This intrinsic corresponds to the \c VPADDB instruction.
 280///
 281/// \param __a
 282///    A 256-bit integer vector containing one of the source operands.
 283/// \param __b
 284///    A 256-bit integer vector containing one of the source operands.
 285/// \returns A 256-bit integer vector containing the sums.
 286static __inline__ __m256i __DEFAULT_FN_ATTRS256
 287_mm256_add_epi8(__m256i __a, __m256i __b)
 288{
 289  return (__m256i)((__v32qu)__a + (__v32qu)__b);
 290}
 291
 292/// Adds 16-bit integers from corresponding elements of two 256-bit vectors of
 293///    [16 x i16] and returns the lower 16 bits of each sum in the
 294///    corresponding element of the [16 x i16] result (overflow is ignored).
 295///
 296/// \headerfile <immintrin.h>
 297///
 298/// This intrinsic corresponds to the \c VPADDW instruction.
 299///
 300/// \param __a
 301///    A 256-bit vector of [16 x i16] containing one of the source operands.
 302/// \param __b
 303///    A 256-bit vector of [16 x i16] containing one of the source operands.
 304/// \returns A 256-bit vector of [16 x i16] containing the sums.
 305static __inline__ __m256i __DEFAULT_FN_ATTRS256
 306_mm256_add_epi16(__m256i __a, __m256i __b)
 307{
 308  return (__m256i)((__v16hu)__a + (__v16hu)__b);
 309}
 310
 311/// Adds 32-bit integers from corresponding elements of two 256-bit vectors of
 312///    [8 x i32] and returns the lower 32 bits of each sum in the corresponding
 313///    element of the [8 x i32] result (overflow is ignored).
 314///
 315/// \headerfile <immintrin.h>
 316///
 317/// This intrinsic corresponds to the \c VPADDD instruction.
 318///
 319/// \param __a
 320///    A 256-bit vector of [8 x i32] containing one of the source operands.
 321/// \param __b
 322///    A 256-bit vector of [8 x i32] containing one of the source operands.
 323/// \returns A 256-bit vector of [8 x i32] containing the sums.
 324static __inline__ __m256i __DEFAULT_FN_ATTRS256
 325_mm256_add_epi32(__m256i __a, __m256i __b)
 326{
 327  return (__m256i)((__v8su)__a + (__v8su)__b);
 328}
 329
 330/// Adds 64-bit integers from corresponding elements of two 256-bit vectors of
 331///    [4 x i64] and returns the lower 64 bits of each sum in the corresponding
 332///    element of the [4 x i64] result (overflow is ignored).
 333///
 334/// \headerfile <immintrin.h>
 335///
 336/// This intrinsic corresponds to the \c VPADDQ instruction.
 337///
 338/// \param __a
 339///    A 256-bit vector of [4 x i64] containing one of the source operands.
 340/// \param __b
 341///    A 256-bit vector of [4 x i64] containing one of the source operands.
 342/// \returns A 256-bit vector of [4 x i64] containing the sums.
 343static __inline__ __m256i __DEFAULT_FN_ATTRS256
 344_mm256_add_epi64(__m256i __a, __m256i __b)
 345{
 346  return (__m256i)((__v4du)__a + (__v4du)__b);
 347}
 348
 349/// Adds 8-bit integers from corresponding bytes of two 256-bit integer
 350///    vectors using signed saturation, and returns each sum in the
 351///    corresponding byte of the 256-bit integer vector result.
 352///
 353/// \headerfile <immintrin.h>
 354///
 355/// This intrinsic corresponds to the \c VPADDSB instruction.
 356///
 357/// \param __a
 358///    A 256-bit integer vector containing one of the source operands.
 359/// \param __b
 360///    A 256-bit integer vector containing one of the source operands.
 361/// \returns A 256-bit integer vector containing the sums.
 362static __inline__ __m256i __DEFAULT_FN_ATTRS256
 363_mm256_adds_epi8(__m256i __a, __m256i __b)
 364{
 365  return (__m256i)__builtin_elementwise_add_sat((__v32qs)__a, (__v32qs)__b);
 366}
 367
 368/// Adds 16-bit integers from corresponding elements of two 256-bit vectors of
 369///    [16 x i16] using signed saturation, and returns the [16 x i16] result.
 370///
 371/// \headerfile <immintrin.h>
 372///
 373/// This intrinsic corresponds to the \c VPADDSW instruction.
 374///
 375/// \param __a
 376///    A 256-bit vector of [16 x i16] containing one of the source operands.
 377/// \param __b
 378///    A 256-bit vector of [16 x i16] containing one of the source operands.
 379/// \returns A 256-bit vector of [16 x i16] containing the sums.
 380static __inline__ __m256i __DEFAULT_FN_ATTRS256
 381_mm256_adds_epi16(__m256i __a, __m256i __b)
 382{
 383  return (__m256i)__builtin_elementwise_add_sat((__v16hi)__a, (__v16hi)__b);
 384}
 385
 386/// Adds 8-bit integers from corresponding bytes of two 256-bit integer
 387///    vectors using unsigned saturation, and returns each sum in the
 388///    corresponding byte of the 256-bit integer vector result.
 389///
 390/// \headerfile <immintrin.h>
 391///
 392/// This intrinsic corresponds to the \c VPADDUSB instruction.
 393///
 394/// \param __a
 395///    A 256-bit integer vector containing one of the source operands.
 396/// \param __b
 397///    A 256-bit integer vector containing one of the source operands.
 398/// \returns A 256-bit integer vector containing the sums.
 399static __inline__ __m256i __DEFAULT_FN_ATTRS256
 400_mm256_adds_epu8(__m256i __a, __m256i __b)
 401{
 402  return (__m256i)__builtin_elementwise_add_sat((__v32qu)__a, (__v32qu)__b);
 403}
 404
 405/// Adds 16-bit integers from corresponding elements of two 256-bit vectors of
 406///    [16 x i16] using unsigned saturation, and returns the [16 x i16] result.
 407///
 408/// \headerfile <immintrin.h>
 409///
 410/// This intrinsic corresponds to the \c VPADDUSW instruction.
 411///
 412/// \param __a
 413///    A 256-bit vector of [16 x i16] containing one of the source operands.
 414/// \param __b
 415///    A 256-bit vector of [16 x i16] containing one of the source operands.
 416/// \returns A 256-bit vector of [16 x i16] containing the sums.
 417static __inline__ __m256i __DEFAULT_FN_ATTRS256
 418_mm256_adds_epu16(__m256i __a, __m256i __b)
 419{
 420  return (__m256i)__builtin_elementwise_add_sat((__v16hu)__a, (__v16hu)__b);
 421}
 422
 423/// Uses the lower half of the 256-bit vector \a a as the upper half of a
 424///    temporary 256-bit value, and the lower half of the 256-bit vector \a b
 425///    as the lower half of the temporary value. Right-shifts the temporary
 426///    value by \a n bytes, and uses the lower 16 bytes of the shifted value
 427///    as the lower 16 bytes of the result. Uses the upper halves of \a a and
 428///    \a b to make another temporary value, right shifts by \a n, and uses
 429///    the lower 16 bytes of the shifted value as the upper 16 bytes of the
 430///    result.
 431///
 432/// \headerfile <immintrin.h>
 433///
 434/// \code
 435/// __m256i _mm256_alignr_epi8(__m256i a, __m256i b, const int n);
 436/// \endcode
 437///
 438/// This intrinsic corresponds to the \c VPALIGNR instruction.
 439///
 440/// \param a
 441///    A 256-bit integer vector containing source values.
 442/// \param b
 443///    A 256-bit integer vector containing source values.
 444/// \param n
 445///    An immediate value specifying the number of bytes to shift.
 446/// \returns A 256-bit integer vector containing the result.
 447#define _mm256_alignr_epi8(a, b, n) \
 448  ((__m256i)__builtin_ia32_palignr256((__v32qi)(__m256i)(a), \
 449                                      (__v32qi)(__m256i)(b), (n)))
 450
 451/// Computes the bitwise AND of the 256-bit integer vectors in \a __a and
 452///    \a __b.
 453///
 454/// \headerfile <immintrin.h>
 455///
 456/// This intrinsic corresponds to the \c VPAND instruction.
 457///
 458/// \param __a
 459///    A 256-bit integer vector.
 460/// \param __b
 461///    A 256-bit integer vector.
 462/// \returns A 256-bit integer vector containing the result.
 463static __inline__ __m256i __DEFAULT_FN_ATTRS256
 464_mm256_and_si256(__m256i __a, __m256i __b)
 465{
 466  return (__m256i)((__v4du)__a & (__v4du)__b);
 467}
 468
 469/// Computes the bitwise AND of the 256-bit integer vector in \a __b with
 470///    the bitwise NOT of the 256-bit integer vector in \a __a.
 471///
 472/// \headerfile <immintrin.h>
 473///
 474/// This intrinsic corresponds to the \c VPANDN instruction.
 475///
 476/// \param __a
 477///    A 256-bit integer vector.
 478/// \param __b
 479///    A 256-bit integer vector.
 480/// \returns A 256-bit integer vector containing the result.
 481static __inline__ __m256i __DEFAULT_FN_ATTRS256
 482_mm256_andnot_si256(__m256i __a, __m256i __b)
 483{
 484  return (__m256i)(~(__v4du)__a & (__v4du)__b);
 485}
 486
 487/// Computes the averages of the corresponding unsigned bytes in the two
 488///    256-bit integer vectors in \a __a and \a __b and returns each
 489///    average in the corresponding byte of the 256-bit result.
 490///
 491/// \code{.operation}
 492/// FOR i := 0 TO 31
 493///   j := i*8
 494///   result[j+7:j] := (__a[j+7:j] + __b[j+7:j] + 1) >> 1
 495/// ENDFOR
 496/// \endcode
 497///
 498/// \headerfile <immintrin.h>
 499///
 500/// This intrinsic corresponds to the \c VPAVGB instruction.
 501///
 502/// \param __a
 503///    A 256-bit integer vector.
 504/// \param __b
 505///    A 256-bit integer vector.
 506/// \returns A 256-bit integer vector containing the result.
 507static __inline__ __m256i __DEFAULT_FN_ATTRS256
 508_mm256_avg_epu8(__m256i __a, __m256i __b)
 509{
 510  return (__m256i)__builtin_ia32_pavgb256((__v32qi)__a, (__v32qi)__b);
 511}
 512
 513/// Computes the averages of the corresponding unsigned 16-bit integers in
 514///    the two 256-bit vectors of [16 x i16] in \a __a and \a __b and returns
 515///    each average in the corresponding element of the 256-bit result.
 516///
 517/// \code{.operation}
 518/// FOR i := 0 TO 15
 519///   j := i*16
 520///   result[j+15:j] := (__a[j+15:j] + __b[j+15:j] + 1) >> 1
 521/// ENDFOR
 522/// \endcode
 523///
 524/// \headerfile <immintrin.h>
 525///
 526/// This intrinsic corresponds to the \c VPAVGW instruction.
 527///
 528/// \param __a
 529///    A 256-bit vector of [16 x i16].
 530/// \param __b
 531///    A 256-bit vector of [16 x i16].
 532/// \returns A 256-bit vector of [16 x i16] containing the result.
 533static __inline__ __m256i __DEFAULT_FN_ATTRS256
 534_mm256_avg_epu16(__m256i __a, __m256i __b)
 535{
 536  return (__m256i)__builtin_ia32_pavgw256((__v16hi)__a, (__v16hi)__b);
 537}
 538
 539/// Merges 8-bit integer values from either of the two 256-bit vectors
 540///    \a __V1 or \a __V2, as specified by the 256-bit mask \a __M and returns
 541///    the resulting 256-bit integer vector.
 542///
 543/// \code{.operation}
 544/// FOR i := 0 TO 31
 545///   j := i*8
///   IF __M[7+j] == 0
 547///     result[7+j:j] := __V1[7+j:j]
 548///   ELSE
 549///     result[7+j:j] := __V2[7+j:j]
 550///   FI
 551/// ENDFOR
 552/// \endcode
 553///
 554/// \headerfile <immintrin.h>
 555///
 556/// This intrinsic corresponds to the \c VPBLENDVB instruction.
 557///
 558/// \param __V1
 559///    A 256-bit integer vector containing source values.
 560/// \param __V2
 561///    A 256-bit integer vector containing source values.
 562/// \param __M
 563///    A 256-bit integer vector, with bit [7] of each byte specifying the
 564///    source for each corresponding byte of the result. When the mask bit
 565///    is 0, the byte is copied from \a __V1; otherwise, it is copied from
 566///    \a __V2.
 567/// \returns A 256-bit integer vector containing the result.
 568static __inline__ __m256i __DEFAULT_FN_ATTRS256
 569_mm256_blendv_epi8(__m256i __V1, __m256i __V2, __m256i __M)
 570{
 571  return (__m256i)__builtin_ia32_pblendvb256((__v32qi)__V1, (__v32qi)__V2,
 572                                              (__v32qi)__M);
 573}
 574
 575/// Merges 16-bit integer values from either of the two 256-bit vectors
 576///    \a V1 or \a V2, as specified by the immediate integer operand \a M,
 577///    and returns the resulting 256-bit vector of [16 x i16].
 578///
 579/// \code{.operation}
 580/// FOR i := 0 TO 7
 581///   j := i*16
 582///   IF M[i] == 0
///     result[15+j:j] := V1[15+j:j]
///     result[143+j:128+j] := V1[143+j:128+j]
///   ELSE
///     result[15+j:j] := V2[15+j:j]
///     result[143+j:128+j] := V2[143+j:128+j]
 588///   FI
 589/// ENDFOR
 590/// \endcode
 591///
 592/// \headerfile <immintrin.h>
 593///
 594/// \code
 595/// __m256i _mm256_blend_epi16(__m256i V1, __m256i V2, const int M);
 596/// \endcode
 597///
 598/// This intrinsic corresponds to the \c VPBLENDW instruction.
 599///
 600/// \param V1
 601///    A 256-bit vector of [16 x i16] containing source values.
 602/// \param V2
 603///    A 256-bit vector of [16 x i16] containing source values.
 604/// \param M
 605///    An immediate 8-bit integer operand, with bits [7:0] specifying the
 606///    source for each element of the result. The position of the mask bit
 607///    corresponds to the index of a copied value. When a mask bit is 0, the
 608///    element is copied from \a V1; otherwise, it is copied from \a V2.
 609///    \a M[0] determines the source for elements 0 and 8, \a M[1] for
 610///    elements 1 and 9, and so forth.
 611/// \returns A 256-bit vector of [16 x i16] containing the result.
 612#define _mm256_blend_epi16(V1, V2, M) \
 613  ((__m256i)__builtin_ia32_pblendw256((__v16hi)(__m256i)(V1), \
 614                                      (__v16hi)(__m256i)(V2), (int)(M)))
 615
 616/// Compares corresponding bytes in the 256-bit integer vectors in \a __a and
 617///    \a __b for equality and returns the outcomes in the corresponding
 618///    bytes of the 256-bit result.
 619///
 620/// \code{.operation}
 621/// FOR i := 0 TO 31
 622///   j := i*8
 623///   result[j+7:j] := (__a[j+7:j] == __b[j+7:j]) ? 0xFF : 0
 624/// ENDFOR
 625/// \endcode
 626///
 627/// \headerfile <immintrin.h>
 628///
 629/// This intrinsic corresponds to the \c VPCMPEQB instruction.
 630///
 631/// \param __a
 632///    A 256-bit integer vector containing one of the inputs.
 633/// \param __b
 634///    A 256-bit integer vector containing one of the inputs.
 635/// \returns A 256-bit integer vector containing the result.
 636static __inline__ __m256i __DEFAULT_FN_ATTRS256
 637_mm256_cmpeq_epi8(__m256i __a, __m256i __b)
 638{
 639  return (__m256i)((__v32qi)__a == (__v32qi)__b);
 640}
 641
 642/// Compares corresponding elements in the 256-bit vectors of [16 x i16] in
 643///    \a __a and \a __b for equality and returns the outcomes in the
 644///    corresponding elements of the 256-bit result.
 645///
 646/// \code{.operation}
 647/// FOR i := 0 TO 15
 648///   j := i*16
 649///   result[j+15:j] := (__a[j+15:j] == __b[j+15:j]) ? 0xFFFF : 0
 650/// ENDFOR
 651/// \endcode
 652///
 653/// \headerfile <immintrin.h>
 654///
 655/// This intrinsic corresponds to the \c VPCMPEQW instruction.
 656///
 657/// \param __a
 658///    A 256-bit vector of [16 x i16] containing one of the inputs.
 659/// \param __b
 660///    A 256-bit vector of [16 x i16] containing one of the inputs.
 661/// \returns A 256-bit vector of [16 x i16] containing the result.
 662static __inline__ __m256i __DEFAULT_FN_ATTRS256
 663_mm256_cmpeq_epi16(__m256i __a, __m256i __b)
 664{
 665  return (__m256i)((__v16hi)__a == (__v16hi)__b);
 666}
 667
 668/// Compares corresponding elements in the 256-bit vectors of [8 x i32] in
 669///    \a __a and \a __b for equality and returns the outcomes in the
 670///    corresponding elements of the 256-bit result.
 671///
 672/// \code{.operation}
 673/// FOR i := 0 TO 7
 674///   j := i*32
 675///   result[j+31:j] := (__a[j+31:j] == __b[j+31:j]) ? 0xFFFFFFFF : 0
 676/// ENDFOR
 677/// \endcode
 678///
 679/// \headerfile <immintrin.h>
 680///
 681/// This intrinsic corresponds to the \c VPCMPEQD instruction.
 682///
 683/// \param __a
 684///    A 256-bit vector of [8 x i32] containing one of the inputs.
 685/// \param __b
 686///    A 256-bit vector of [8 x i32] containing one of the inputs.
 687/// \returns A 256-bit vector of [8 x i32] containing the result.
 688static __inline__ __m256i __DEFAULT_FN_ATTRS256
 689_mm256_cmpeq_epi32(__m256i __a, __m256i __b)
 690{
 691  return (__m256i)((__v8si)__a == (__v8si)__b);
 692}
 693
 694/// Compares corresponding elements in the 256-bit vectors of [4 x i64] in
 695///    \a __a and \a __b for equality and returns the outcomes in the
 696///    corresponding elements of the 256-bit result.
 697///
 698/// \code{.operation}
 699/// FOR i := 0 TO 3
 700///   j := i*64
 701///   result[j+63:j] := (__a[j+63:j] == __b[j+63:j]) ? 0xFFFFFFFFFFFFFFFF : 0
 702/// ENDFOR
 703/// \endcode
 704///
 705/// \headerfile <immintrin.h>
 706///
 707/// This intrinsic corresponds to the \c VPCMPEQQ instruction.
 708///
 709/// \param __a
 710///    A 256-bit vector of [4 x i64] containing one of the inputs.
 711/// \param __b
 712///    A 256-bit vector of [4 x i64] containing one of the inputs.
 713/// \returns A 256-bit vector of [4 x i64] containing the result.
 714static __inline__ __m256i __DEFAULT_FN_ATTRS256
 715_mm256_cmpeq_epi64(__m256i __a, __m256i __b)
 716{
 717  return (__m256i)((__v4di)__a == (__v4di)__b);
 718}
 719
 720/// Compares corresponding signed bytes in the 256-bit integer vectors in
 721///    \a __a and \a __b for greater-than and returns the outcomes in the
 722///    corresponding bytes of the 256-bit result.
 723///
 724/// \code{.operation}
 725/// FOR i := 0 TO 31
 726///   j := i*8
 727///   result[j+7:j] := (__a[j+7:j] > __b[j+7:j]) ? 0xFF : 0
 728/// ENDFOR
 729/// \endcode
 730///
 731/// \headerfile <immintrin.h>
 732///
 733/// This intrinsic corresponds to the \c VPCMPGTB instruction.
 734///
 735/// \param __a
 736///    A 256-bit integer vector containing one of the inputs.
 737/// \param __b
 738///    A 256-bit integer vector containing one of the inputs.
 739/// \returns A 256-bit integer vector containing the result.
 740static __inline__ __m256i __DEFAULT_FN_ATTRS256
 741_mm256_cmpgt_epi8(__m256i __a, __m256i __b)
 742{
 743  /* This function always performs a signed comparison, but __v32qi is a char
 744     which may be signed or unsigned, so use __v32qs. */
 745  return (__m256i)((__v32qs)__a > (__v32qs)__b);
 746}
 747
 748/// Compares corresponding signed elements in the 256-bit vectors of
 749///    [16 x i16] in \a __a and \a __b for greater-than and returns the
 750///    outcomes in the corresponding elements of the 256-bit result.
 751///
 752/// \code{.operation}
 753/// FOR i := 0 TO 15
 754///   j := i*16
 755///   result[j+15:j] := (__a[j+15:j] > __b[j+15:j]) ? 0xFFFF : 0
 756/// ENDFOR
 757/// \endcode
 758///
 759/// \headerfile <immintrin.h>
 760///
 761/// This intrinsic corresponds to the \c VPCMPGTW instruction.
 762///
 763/// \param __a
 764///    A 256-bit vector of [16 x i16] containing one of the inputs.
 765/// \param __b
 766///    A 256-bit vector of [16 x i16] containing one of the inputs.
 767/// \returns A 256-bit vector of [16 x i16] containing the result.
 768static __inline__ __m256i __DEFAULT_FN_ATTRS256
 769_mm256_cmpgt_epi16(__m256i __a, __m256i __b)
 770{
 771  return (__m256i)((__v16hi)__a > (__v16hi)__b);
 772}
 773
 774/// Compares corresponding signed elements in the 256-bit vectors of
 775///    [8 x i32] in \a __a and \a __b for greater-than and returns the
 776///    outcomes in the corresponding elements of the 256-bit result.
 777///
 778/// \code{.operation}
 779/// FOR i := 0 TO 7
 780///   j := i*32
 781///   result[j+31:j] := (__a[j+31:j] > __b[j+31:j]) ? 0xFFFFFFFF : 0
 782/// ENDFOR
 783/// \endcode
 784///
 785/// \headerfile <immintrin.h>
 786///
 787/// This intrinsic corresponds to the \c VPCMPGTD instruction.
 788///
 789/// \param __a
 790///    A 256-bit vector of [8 x i32] containing one of the inputs.
 791/// \param __b
 792///    A 256-bit vector of [8 x i32] containing one of the inputs.
 793/// \returns A 256-bit vector of [8 x i32] containing the result.
 794static __inline__ __m256i __DEFAULT_FN_ATTRS256
 795_mm256_cmpgt_epi32(__m256i __a, __m256i __b)
 796{
 797  return (__m256i)((__v8si)__a > (__v8si)__b);
 798}
 799
 800/// Compares corresponding signed elements in the 256-bit vectors of
 801///    [4 x i64] in \a __a and \a __b for greater-than and returns the
 802///    outcomes in the corresponding elements of the 256-bit result.
 803///
 804/// \code{.operation}
 805/// FOR i := 0 TO 3
 806///   j := i*64
 807///   result[j+63:j] := (__a[j+63:j] > __b[j+63:j]) ? 0xFFFFFFFFFFFFFFFF : 0
 808/// ENDFOR
 809/// \endcode
 810///
 811/// \headerfile <immintrin.h>
 812///
 813/// This intrinsic corresponds to the \c VPCMPGTQ instruction.
 814///
 815/// \param __a
 816///    A 256-bit vector of [4 x i64] containing one of the inputs.
 817/// \param __b
 818///    A 256-bit vector of [4 x i64] containing one of the inputs.
 819/// \returns A 256-bit vector of [4 x i64] containing the result.
 820static __inline__ __m256i __DEFAULT_FN_ATTRS256
 821_mm256_cmpgt_epi64(__m256i __a, __m256i __b)
 822{
 823  return (__m256i)((__v4di)__a > (__v4di)__b);
 824}
 825
 826/// Horizontally adds the adjacent pairs of 16-bit integers from two 256-bit
 827///    vectors of [16 x i16] and returns the lower 16 bits of each sum in an
 828///    element of the [16 x i16] result (overflow is ignored). Sums from
 829///    \a __a are returned in the lower 64 bits of each 128-bit half of the
 830///    result; sums from \a __b are returned in the upper 64 bits of each
 831///    128-bit half of the result.
 832///
 833/// \code{.operation}
 834/// FOR i := 0 TO 1
 835///   j := i*128
 836///   result[j+15:j] := __a[j+15:j] + __a[j+31:j+16]
 837///   result[j+31:j+16] := __a[j+47:j+32] + __a[j+63:j+48]
 838///   result[j+47:j+32] := __a[j+79:j+64] + __a[j+95:j+80]
 839///   result[j+63:j+48] := __a[j+111:j+96] + __a[j+127:j+112]
 840///   result[j+79:j+64] := __b[j+15:j] + __b[j+31:j+16]
 841///   result[j+95:j+80] := __b[j+47:j+32] + __b[j+63:j+48]
 842///   result[j+111:j+96] := __b[j+79:j+64] + __b[j+95:j+80]
 843///   result[j+127:j+112] := __b[j+111:j+96] + __b[j+127:j+112]
 844/// ENDFOR
 845/// \endcode
 846///
 847/// \headerfile <immintrin.h>
 848///
 849/// This intrinsic corresponds to the \c VPHADDW instruction.
 850///
 851/// \param __a
 852///    A 256-bit vector of [16 x i16] containing one of the source operands.
 853/// \param __b
 854///    A 256-bit vector of [16 x i16] containing one of the source operands.
 855/// \returns A 256-bit vector of [16 x i16] containing the sums.
 856static __inline__ __m256i __DEFAULT_FN_ATTRS256
 857_mm256_hadd_epi16(__m256i __a, __m256i __b)
 858{
 859    return (__m256i)__builtin_ia32_phaddw256((__v16hi)__a, (__v16hi)__b);
 860}
 861
 862/// Horizontally adds the adjacent pairs of 32-bit integers from two 256-bit
 863///    vectors of [8 x i32] and returns the lower 32 bits of each sum in an
 864///    element of the [8 x i32] result (overflow is ignored). Sums from \a __a
 865///    are returned in the lower 64 bits of each 128-bit half of the result;
 866///    sums from \a __b are returned in the upper 64 bits of each 128-bit half
 867///    of the result.
 868///
 869/// \code{.operation}
 870/// FOR i := 0 TO 1
 871///   j := i*128
 872///   result[j+31:j] := __a[j+31:j] + __a[j+63:j+32]
 873///   result[j+63:j+32] := __a[j+95:j+64] + __a[j+127:j+96]
 874///   result[j+95:j+64] := __b[j+31:j] + __b[j+63:j+32]
 875///   result[j+127:j+96] := __b[j+95:j+64] + __b[j+127:j+96]
 876/// ENDFOR
 877/// \endcode
 878///
 879/// \headerfile <immintrin.h>
 880///
 881/// This intrinsic corresponds to the \c VPHADDD instruction.
 882///
 883/// \param __a
 884///    A 256-bit vector of [8 x i32] containing one of the source operands.
 885/// \param __b
 886///    A 256-bit vector of [8 x i32] containing one of the source operands.
 887/// \returns A 256-bit vector of [8 x i32] containing the sums.
 888static __inline__ __m256i __DEFAULT_FN_ATTRS256
 889_mm256_hadd_epi32(__m256i __a, __m256i __b)
 890{
 891    return (__m256i)__builtin_ia32_phaddd256((__v8si)__a, (__v8si)__b);
 892}
 893
 894/// Horizontally adds the adjacent pairs of 16-bit integers from two 256-bit
 895///    vectors of [16 x i16] using signed saturation and returns each sum in
 896///    an element of the [16 x i16] result. Sums from \a __a are returned in
 897///    the lower 64 bits of each 128-bit half of the result; sums from \a __b
 898///    are returned in the upper 64 bits of each 128-bit half of the result.
 899///
 900/// \code{.operation}
 901/// FOR i := 0 TO 1
 902///   j := i*128
 903///   result[j+15:j] := SATURATE16(__a[j+15:j] + __a[j+31:j+16])
 904///   result[j+31:j+16] := SATURATE16(__a[j+47:j+32] + __a[j+63:j+48])
 905///   result[j+47:j+32] := SATURATE16(__a[j+79:j+64] + __a[j+95:j+80])
 906///   result[j+63:j+48] := SATURATE16(__a[j+111:j+96] + __a[j+127:j+112])
 907///   result[j+79:j+64] := SATURATE16(__b[j+15:j] + __b[j+31:j+16])
 908///   result[j+95:j+80] := SATURATE16(__b[j+47:j+32] + __b[j+63:j+48])
 909///   result[j+111:j+96] := SATURATE16(__b[j+79:j+64] + __b[j+95:j+80])
 910///   result[j+127:j+112] := SATURATE16(__b[j+111:j+96] + __b[j+127:j+112])
 911/// ENDFOR
 912/// \endcode
 913///
 914/// \headerfile <immintrin.h>
 915///
 916/// This intrinsic corresponds to the \c VPHADDSW instruction.
 917///
 918/// \param __a
 919///    A 256-bit vector of [16 x i16] containing one of the source operands.
 920/// \param __b
 921///    A 256-bit vector of [16 x i16] containing one of the source operands.
 922/// \returns A 256-bit vector of [16 x i16] containing the sums.
 923static __inline__ __m256i __DEFAULT_FN_ATTRS256
 924_mm256_hadds_epi16(__m256i __a, __m256i __b)
 925{
 926    return (__m256i)__builtin_ia32_phaddsw256((__v16hi)__a, (__v16hi)__b);
 927}
 928
 929/// Horizontally subtracts adjacent pairs of 16-bit integers from two 256-bit
 930///    vectors of [16 x i16] and returns the lower 16 bits of each difference
 931///    in an element of the [16 x i16] result (overflow is ignored).
 932///    Differences from \a __a are returned in the lower 64 bits of each
 933///    128-bit half of the result; differences from \a __b are returned in the
 934///    upper 64 bits of each 128-bit half of the result.
 935///
 936/// \code{.operation}
 937/// FOR i := 0 TO 1
 938///   j := i*128
 939///   result[j+15:j] := __a[j+15:j] - __a[j+31:j+16]
 940///   result[j+31:j+16] := __a[j+47:j+32] - __a[j+63:j+48]
 941///   result[j+47:j+32] := __a[j+79:j+64] - __a[j+95:j+80]
 942///   result[j+63:j+48] := __a[j+111:j+96] - __a[j+127:j+112]
 943///   result[j+79:j+64] := __b[j+15:j] - __b[j+31:j+16]
 944///   result[j+95:j+80] := __b[j+47:j+32] - __b[j+63:j+48]
 945///   result[j+111:j+96] := __b[j+79:j+64] - __b[j+95:j+80]
 946///   result[j+127:j+112] := __b[j+111:j+96] - __b[j+127:j+112]
 947/// ENDFOR
 948/// \endcode
 949///
 950/// \headerfile <immintrin.h>
 951///
 952/// This intrinsic corresponds to the \c VPHSUBW instruction.
 953///
 954/// \param __a
 955///    A 256-bit vector of [16 x i16] containing one of the source operands.
 956/// \param __b
 957///    A 256-bit vector of [16 x i16] containing one of the source operands.
 958/// \returns A 256-bit vector of [16 x i16] containing the differences.
 959static __inline__ __m256i __DEFAULT_FN_ATTRS256
 960_mm256_hsub_epi16(__m256i __a, __m256i __b)
 961{
 962    return (__m256i)__builtin_ia32_phsubw256((__v16hi)__a, (__v16hi)__b);
 963}
 964
 965/// Horizontally subtracts adjacent pairs of 32-bit integers from two 256-bit
 966///    vectors of [8 x i32] and returns the lower 32 bits of each difference in
 967///    an element of the [8 x i32] result (overflow is ignored). Differences
 968///    from \a __a are returned in the lower 64 bits of each 128-bit half of
 969///    the result; differences from \a __b are returned in the upper 64 bits
 970///    of each 128-bit half of the result.
 971///
 972/// \code{.operation}
 973/// FOR i := 0 TO 1
 974///   j := i*128
 975///   result[j+31:j] := __a[j+31:j] - __a[j+63:j+32]
 976///   result[j+63:j+32] := __a[j+95:j+64] - __a[j+127:j+96]
 977///   result[j+95:j+64] := __b[j+31:j] - __b[j+63:j+32]
 978///   result[j+127:j+96] := __b[j+95:j+64] - __b[j+127:j+96]
 979/// ENDFOR
 980/// \endcode
 981///
 982/// \headerfile <immintrin.h>
 983///
 984/// This intrinsic corresponds to the \c VPHSUBD instruction.
 985///
 986/// \param __a
 987///    A 256-bit vector of [8 x i32] containing one of the source operands.
 988/// \param __b
 989///    A 256-bit vector of [8 x i32] containing one of the source operands.
 990/// \returns A 256-bit vector of [8 x i32] containing the differences.
 991static __inline__ __m256i __DEFAULT_FN_ATTRS256
 992_mm256_hsub_epi32(__m256i __a, __m256i __b)
 993{
 994    return (__m256i)__builtin_ia32_phsubd256((__v8si)__a, (__v8si)__b);
 995}
 996
 997/// Horizontally subtracts adjacent pairs of 16-bit integers from two 256-bit
 998///    vectors of [16 x i16] using signed saturation and returns each
 999///    difference in an element of the [16 x i16] result. Differences from
1000///    \a __a are returned in the lower 64 bits of each 128-bit half of the
1001///    result; differences from \a __b are returned in the upper 64 bits of
1002///    each 128-bit half of the result.
1003///
1004/// \code{.operation}
1005/// FOR i := 0 TO 1
1006///   j := i*128
1007///   result[j+15:j] := SATURATE16(__a[j+15:j] - __a[j+31:j+16])
1008///   result[j+31:j+16] := SATURATE16(__a[j+47:j+32] - __a[j+63:j+48])
1009///   result[j+47:j+32] := SATURATE16(__a[j+79:j+64] - __a[j+95:j+80])
1010///   result[j+63:j+48] := SATURATE16(__a[j+111:j+96] - __a[j+127:j+112])
1011///   result[j+79:j+64] := SATURATE16(__b[j+15:j] - __b[j+31:j+16])
1012///   result[j+95:j+80] := SATURATE16(__b[j+47:j+32] - __b[j+63:j+48])
1013///   result[j+111:j+96] := SATURATE16(__b[j+79:j+64] - __b[j+95:j+80])
1014///   result[j+127:j+112] := SATURATE16(__b[j+111:j+96] - __b[j+127:j+112])
1015/// ENDFOR
1016/// \endcode
1017///
1018/// \headerfile <immintrin.h>
1019///
1020/// This intrinsic corresponds to the \c VPHSUBSW instruction.
1021///
1022/// \param __a
1023///    A 256-bit vector of [16 x i16] containing one of the source operands.
1024/// \param __b
1025///    A 256-bit vector of [16 x i16] containing one of the source operands.
1026/// \returns A 256-bit vector of [16 x i16] containing the differences.
1027static __inline__ __m256i __DEFAULT_FN_ATTRS256
1028_mm256_hsubs_epi16(__m256i __a, __m256i __b)
1029{
1030    return (__m256i)__builtin_ia32_phsubsw256((__v16hi)__a, (__v16hi)__b);
1031}
1032
1033/// Multiplies each unsigned byte from the 256-bit integer vector in \a __a
1034///    with the corresponding signed byte from the 256-bit integer vector in
1035///    \a __b, forming signed 16-bit intermediate products. Adds adjacent
1036///    pairs of those products using signed saturation to form 16-bit sums
1037///    returned as elements of the [16 x i16] result.
1038///
1039/// \code{.operation}
1040/// FOR i := 0 TO 15
1041///   j := i*16
1042///   temp1 := __a[j+7:j] * __b[j+7:j]
1043///   temp2 := __a[j+15:j+8] * __b[j+15:j+8]
1044///   result[j+15:j] := SATURATE16(temp1 + temp2)
1045/// ENDFOR
1046/// \endcode
1047///
1048/// \headerfile <immintrin.h>
1049///
1050/// This intrinsic corresponds to the \c VPMADDUBSW instruction.
1051///
1052/// \param __a
1053///    A 256-bit vector containing one of the source operands.
1054/// \param __b
1055///    A 256-bit vector containing one of the source operands.
1056/// \returns A 256-bit vector of [16 x i16] containing the result.
1057static __inline__ __m256i __DEFAULT_FN_ATTRS256
1058_mm256_maddubs_epi16(__m256i __a, __m256i __b)
1059{
1060    return (__m256i)__builtin_ia32_pmaddubsw256((__v32qi)__a, (__v32qi)__b);
1061}
1062
1063/// Multiplies corresponding 16-bit elements of two 256-bit vectors of
1064///    [16 x i16], forming 32-bit intermediate products, and adds pairs of
1065///    those products to form 32-bit sums returned as elements of the
1066///    [8 x i32] result.
1067///
1068///    There is only one wraparound case: when all four of the 16-bit sources
1069///    are \c 0x8000, the result will be \c 0x80000000.
1070///
1071/// \code{.operation}
1072/// FOR i := 0 TO 7
1073///   j := i*32
1074///   temp1 := __a[j+15:j] * __b[j+15:j]
1075///   temp2 := __a[j+31:j+16] * __b[j+31:j+16]
1076///   result[j+31:j] := temp1 + temp2
1077/// ENDFOR
1078/// \endcode
1079///
1080/// \headerfile <immintrin.h>
1081///
1082/// This intrinsic corresponds to the \c VPMADDWD instruction.
1083///
1084/// \param __a
1085///    A 256-bit vector of [16 x i16] containing one of the source operands.
1086/// \param __b
1087///    A 256-bit vector of [16 x i16] containing one of the source operands.
1088/// \returns A 256-bit vector of [8 x i32] containing the result.
1089static __inline__ __m256i __DEFAULT_FN_ATTRS256
1090_mm256_madd_epi16(__m256i __a, __m256i __b)
1091{
1092  return (__m256i)__builtin_ia32_pmaddwd256((__v16hi)__a, (__v16hi)__b);
1093}
1094
1095/// Compares the corresponding signed bytes in the two 256-bit integer vectors
1096///     in \a __a and \a __b and returns the larger of each pair in the
1097///     corresponding byte of the 256-bit result.
1098///
1099/// \headerfile <immintrin.h>
1100///
1101/// This intrinsic corresponds to the \c VPMAXSB instruction.
1102///
1103/// \param __a
1104///    A 256-bit integer vector.
1105/// \param __b
1106///    A 256-bit integer vector.
1107/// \returns A 256-bit integer vector containing the result.
1108static __inline__ __m256i __DEFAULT_FN_ATTRS256
1109_mm256_max_epi8(__m256i __a, __m256i __b)
1110{
1111  return (__m256i)__builtin_elementwise_max((__v32qs)__a, (__v32qs)__b);
1112}
1113
1114/// Compares the corresponding signed 16-bit integers in the two 256-bit
1115///    vectors of [16 x i16] in \a __a and \a __b and returns the larger of
1116///    each pair in the corresponding element of the 256-bit result.
1117///
1118/// \headerfile <immintrin.h>
1119///
1120/// This intrinsic corresponds to the \c VPMAXSW instruction.
1121///
1122/// \param __a
1123///    A 256-bit vector of [16 x i16].
1124/// \param __b
1125///    A 256-bit vector of [16 x i16].
1126/// \returns A 256-bit vector of [16 x i16] containing the result.
1127static __inline__ __m256i __DEFAULT_FN_ATTRS256
1128_mm256_max_epi16(__m256i __a, __m256i __b)
1129{
1130  return (__m256i)__builtin_elementwise_max((__v16hi)__a, (__v16hi)__b);
1131}
1132
1133/// Compares the corresponding signed 32-bit integers in the two 256-bit
1134///    vectors of [8 x i32] in \a __a and \a __b and returns the larger of
1135///    each pair in the corresponding element of the 256-bit result.
1136///
1137/// \headerfile <immintrin.h>
1138///
1139/// This intrinsic corresponds to the \c VPMAXSD instruction.
1140///
1141/// \param __a
1142///    A 256-bit vector of [8 x i32].
1143/// \param __b
1144///    A 256-bit vector of [8 x i32].
1145/// \returns A 256-bit vector of [8 x i32] containing the result.
1146static __inline__ __m256i __DEFAULT_FN_ATTRS256
1147_mm256_max_epi32(__m256i __a, __m256i __b)
1148{
1149  return (__m256i)__builtin_elementwise_max((__v8si)__a, (__v8si)__b);
1150}
1151
1152/// Compares the corresponding unsigned bytes in the two 256-bit integer
1153///     vectors in \a __a and \a __b and returns the larger of each pair in
1154///     the corresponding byte of the 256-bit result.
1155///
1156/// \headerfile <immintrin.h>
1157///
1158/// This intrinsic corresponds to the \c VPMAXUB instruction.
1159///
1160/// \param __a
1161///    A 256-bit integer vector.
1162/// \param __b
1163///    A 256-bit integer vector.
1164/// \returns A 256-bit integer vector containing the result.
1165static __inline__ __m256i __DEFAULT_FN_ATTRS256
1166_mm256_max_epu8(__m256i __a, __m256i __b)
1167{
1168  return (__m256i)__builtin_elementwise_max((__v32qu)__a, (__v32qu)__b);
1169}
1170
1171/// Compares the corresponding unsigned 16-bit integers in the two 256-bit
1172///    vectors of [16 x i16] in \a __a and \a __b and returns the larger of
1173///    each pair in the corresponding element of the 256-bit result.
1174///
1175/// \headerfile <immintrin.h>
1176///
1177/// This intrinsic corresponds to the \c VPMAXUW instruction.
1178///
1179/// \param __a
1180///    A 256-bit vector of [16 x i16].
1181/// \param __b
1182///    A 256-bit vector of [16 x i16].
1183/// \returns A 256-bit vector of [16 x i16] containing the result.
1184static __inline__ __m256i __DEFAULT_FN_ATTRS256
1185_mm256_max_epu16(__m256i __a, __m256i __b)
1186{
1187  return (__m256i)__builtin_elementwise_max((__v16hu)__a, (__v16hu)__b);
1188}
1189
1190/// Compares the corresponding unsigned 32-bit integers in the two 256-bit
1191///    vectors of [8 x i32] in \a __a and \a __b and returns the larger of
1192///    each pair in the corresponding element of the 256-bit result.
1193///
1194/// \headerfile <immintrin.h>
1195///
1196/// This intrinsic corresponds to the \c VPMAXUD instruction.
1197///
1198/// \param __a
1199///    A 256-bit vector of [8 x i32].
1200/// \param __b
1201///    A 256-bit vector of [8 x i32].
1202/// \returns A 256-bit vector of [8 x i32] containing the result.
1203static __inline__ __m256i __DEFAULT_FN_ATTRS256
1204_mm256_max_epu32(__m256i __a, __m256i __b)
1205{
1206  return (__m256i)__builtin_elementwise_max((__v8su)__a, (__v8su)__b);
1207}
1208
1209/// Compares the corresponding signed bytes in the two 256-bit integer vectors
1210///     in \a __a and \a __b and returns the smaller of each pair in the
1211///     corresponding byte of the 256-bit result.
1212///
1213/// \headerfile <immintrin.h>
1214///
1215/// This intrinsic corresponds to the \c VPMINSB instruction.
1216///
1217/// \param __a
1218///    A 256-bit integer vector.
1219/// \param __b
1220///    A 256-bit integer vector.
1221/// \returns A 256-bit integer vector containing the result.
1222static __inline__ __m256i __DEFAULT_FN_ATTRS256
1223_mm256_min_epi8(__m256i __a, __m256i __b)
1224{
1225  return (__m256i)__builtin_elementwise_min((__v32qs)__a, (__v32qs)__b);
1226}
1227
1228/// Compares the corresponding signed 16-bit integers in the two 256-bit
1229///    vectors of [16 x i16] in \a __a and \a __b and returns the smaller of
1230///    each pair in the corresponding element of the 256-bit result.
1231///
1232/// \headerfile <immintrin.h>
1233///
1234/// This intrinsic corresponds to the \c VPMINSW instruction.
1235///
1236/// \param __a
1237///    A 256-bit vector of [16 x i16].
1238/// \param __b
1239///    A 256-bit vector of [16 x i16].
1240/// \returns A 256-bit vector of [16 x i16] containing the result.
1241static __inline__ __m256i __DEFAULT_FN_ATTRS256
1242_mm256_min_epi16(__m256i __a, __m256i __b)
1243{
1244  return (__m256i)__builtin_elementwise_min((__v16hi)__a, (__v16hi)__b);
1245}
1246
1247/// Compares the corresponding signed 32-bit integers in the two 256-bit
1248///    vectors of [8 x i32] in \a __a and \a __b and returns the smaller of
1249///    each pair in the corresponding element of the 256-bit result.
1250///
1251/// \headerfile <immintrin.h>
1252///
1253/// This intrinsic corresponds to the \c VPMINSD instruction.
1254///
1255/// \param __a
1256///    A 256-bit vector of [8 x i32].
1257/// \param __b
1258///    A 256-bit vector of [8 x i32].
1259/// \returns A 256-bit vector of [8 x i32] containing the result.
1260static __inline__ __m256i __DEFAULT_FN_ATTRS256
1261_mm256_min_epi32(__m256i __a, __m256i __b)
1262{
1263  return (__m256i)__builtin_elementwise_min((__v8si)__a, (__v8si)__b);
1264}
1265
1266/// Compares the corresponding unsigned bytes in the two 256-bit integer
1267///     vectors in \a __a and \a __b and returns the smaller of each pair in
1268///     the corresponding byte of the 256-bit result.
1269///
1270/// \headerfile <immintrin.h>
1271///
1272/// This intrinsic corresponds to the \c VPMINUB instruction.
1273///
1274/// \param __a
1275///    A 256-bit integer vector.
1276/// \param __b
1277///    A 256-bit integer vector.
1278/// \returns A 256-bit integer vector containing the result.
1279static __inline__ __m256i __DEFAULT_FN_ATTRS256
1280_mm256_min_epu8(__m256i __a, __m256i __b)
1281{
1282  return (__m256i)__builtin_elementwise_min((__v32qu)__a, (__v32qu)__b);
1283}
1284
1285/// Compares the corresponding unsigned 16-bit integers in the two 256-bit
1286///    vectors of [16 x i16] in \a __a and \a __b and returns the smaller of
1287///    each pair in the corresponding element of the 256-bit result.
1288///
1289/// \headerfile <immintrin.h>
1290///
1291/// This intrinsic corresponds to the \c VPMINUW instruction.
1292///
1293/// \param __a
1294///    A 256-bit vector of [16 x i16].
1295/// \param __b
1296///    A 256-bit vector of [16 x i16].
1297/// \returns A 256-bit vector of [16 x i16] containing the result.
1298static __inline__ __m256i __DEFAULT_FN_ATTRS256
1299_mm256_min_epu16(__m256i __a, __m256i __b)
1300{
1301  return (__m256i)__builtin_elementwise_min((__v16hu)__a, (__v16hu)__b);
1302}
1303
1304/// Compares the corresponding unsigned 32-bit integers in the two 256-bit
1305///    vectors of [8 x i32] in \a __a and \a __b and returns the smaller of
1306///    each pair in the corresponding element of the 256-bit result.
1307///
1308/// \headerfile <immintrin.h>
1309///
1310/// This intrinsic corresponds to the \c VPMINUD instruction.
1311///
1312/// \param __a
1313///    A 256-bit vector of [8 x i32].
1314/// \param __b
1315///    A 256-bit vector of [8 x i32].
1316/// \returns A 256-bit vector of [8 x i32] containing the result.
1317static __inline__ __m256i __DEFAULT_FN_ATTRS256
1318_mm256_min_epu32(__m256i __a, __m256i __b)
1319{
1320  return (__m256i)__builtin_elementwise_min((__v8su)__a, (__v8su)__b);
1321}
1322
1323/// Creates a 32-bit integer mask from the most significant bit of each byte
1324///    in the 256-bit integer vector in \a __a and returns the result.
1325///
1326/// \code{.operation}
1327/// FOR i := 0 TO 31
1328///   j := i*8
1329///   result[i] := __a[j+7]
1330/// ENDFOR
1331/// \endcode
1332///
1333/// \headerfile <immintrin.h>
1334///
1335/// This intrinsic corresponds to the \c VPMOVMSKB instruction.
1336///
1337/// \param __a
1338///    A 256-bit integer vector containing the source bytes.
1339/// \returns The 32-bit integer mask.
1340static __inline__ int __DEFAULT_FN_ATTRS256
1341_mm256_movemask_epi8(__m256i __a)
1342{
1343  return __builtin_ia32_pmovmskb256((__v32qi)__a);
1344}
1345
1346/// Sign-extends bytes from the 128-bit integer vector in \a __V and returns
1347///    the 16-bit values in the corresponding elements of a 256-bit vector
1348///    of [16 x i16].
1349///
1350/// \code{.operation}
1351/// FOR i := 0 TO 15
1352///   j := i*8
1353///   k := i*16
1354///   result[k+15:k] := SignExtend(__V[j+7:j])
1355/// ENDFOR
1356/// \endcode
1357///
1358/// \headerfile <immintrin.h>
1359///
1360/// This intrinsic corresponds to the \c VPMOVSXBW instruction.
1361///
1362/// \param __V
1363///    A 128-bit integer vector containing the source bytes.
1364/// \returns A 256-bit vector of [16 x i16] containing the sign-extended
1365///    values.
1366static __inline__ __m256i __DEFAULT_FN_ATTRS256
1367_mm256_cvtepi8_epi16(__m128i __V)
1368{
1369  /* This function always performs a signed extension, but __v16qi is a char
1370     which may be signed or unsigned, so use __v16qs. */
1371  return (__m256i)__builtin_convertvector((__v16qs)__V, __v16hi);
1372}
1373
1374/// Sign-extends bytes from the lower half of the 128-bit integer vector in
1375///    \a __V and returns the 32-bit values in the corresponding elements of a
1376///    256-bit vector of [8 x i32].
1377///
1378/// \code{.operation}
1379/// FOR i := 0 TO 7
1380///   j := i*8
1381///   k := i*32
1382///   result[k+31:k] := SignExtend(__V[j+7:j])
1383/// ENDFOR
1384/// \endcode
1385///
1386/// \headerfile <immintrin.h>
1387///
1388/// This intrinsic corresponds to the \c VPMOVSXBD instruction.
1389///
1390/// \param __V
1391///    A 128-bit integer vector containing the source bytes.
1392/// \returns A 256-bit vector of [8 x i32] containing the sign-extended
1393///    values.
1394static __inline__ __m256i __DEFAULT_FN_ATTRS256
1395_mm256_cvtepi8_epi32(__m128i __V)
1396{
1397  /* This function always performs a signed extension, but __v16qi is a char
1398     which may be signed or unsigned, so use __v16qs. */
1399  return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8si);
1400}
1401
1402/// Sign-extends the first four bytes from the 128-bit integer vector in
1403///    \a __V and returns the 64-bit values in the corresponding elements of a
1404///    256-bit vector of [4 x i64].
1405///
1406/// \code{.operation}
1407/// result[63:0] := SignExtend(__V[7:0])
1408/// result[127:64] := SignExtend(__V[15:8])
1409/// result[191:128] := SignExtend(__V[23:16])
1410/// result[255:192] := SignExtend(__V[31:24])
1411/// \endcode
1412///
1413/// \headerfile <immintrin.h>
1414///
1415/// This intrinsic corresponds to the \c VPMOVSXBQ instruction.
1416///
1417/// \param __V
1418///    A 128-bit integer vector containing the source bytes.
1419/// \returns A 256-bit vector of [4 x i64] containing the sign-extended
1420///    values.
1421static __inline__ __m256i __DEFAULT_FN_ATTRS256
1422_mm256_cvtepi8_epi64(__m128i __V)
1423{
1424  /* This function always performs a signed extension, but __v16qi is a char
1425     which may be signed or unsigned, so use __v16qs. */
1426  return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3), __v4di);
1427}
1428
1429/// Sign-extends 16-bit elements from the 128-bit vector of [8 x i16] in
1430///    \a __V and returns the 32-bit values in the corresponding elements of a
1431///    256-bit vector of [8 x i32].
1432///
1433/// \code{.operation}
1434/// FOR i := 0 TO 7
1435///   j := i*16
1436///   k := i*32
1437///   result[k+31:k] := SignExtend(__V[j+15:j])
1438/// ENDFOR
1439/// \endcode
1440///
1441/// \headerfile <immintrin.h>
1442///
1443/// This intrinsic corresponds to the \c VPMOVSXWD instruction.
1444///
1445/// \param __V
1446///    A 128-bit vector of [8 x i16] containing the source values.
1447/// \returns A 256-bit vector of [8 x i32] containing the sign-extended
1448///    values.
1449static __inline__ __m256i __DEFAULT_FN_ATTRS256
1450_mm256_cvtepi16_epi32(__m128i __V)
1451{
1452  return (__m256i)__builtin_convertvector((__v8hi)__V, __v8si);
1453}
1454
1455/// Sign-extends 16-bit elements from the lower half of the 128-bit vector of
1456///    [8 x i16] in \a __V and returns the 64-bit values in the corresponding
1457///    elements of a 256-bit vector of [4 x i64].
1458///
1459/// \code{.operation}
1460/// result[63:0] := SignExtend(__V[15:0])
1461/// result[127:64] := SignExtend(__V[31:16])
1462/// result[191:128] := SignExtend(__V[47:32])
1463/// result[255:192] := SignExtend(__V[63:48])
1464/// \endcode
1465///
1466/// \headerfile <immintrin.h>
1467///
1468/// This intrinsic corresponds to the \c VPMOVSXWQ instruction.
1469///
1470/// \param __V
1471///    A 128-bit vector of [8 x i16] containing the source values.
1472/// \returns A 256-bit vector of [4 x i64] containing the sign-extended
1473///    values.
1474static __inline__ __m256i __DEFAULT_FN_ATTRS256
1475_mm256_cvtepi16_epi64(__m128i __V)
1476{
1477  return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1, 2, 3), __v4di);
1478}
1479
1480/// Sign-extends 32-bit elements from the 128-bit vector of [4 x i32] in
1481///    \a __V and returns the 64-bit values in the corresponding elements of a
1482///    256-bit vector of [4 x i64].
1483///
1484/// \code{.operation}
1485/// result[63:0] := SignExtend(__V[31:0])
1486/// result[127:64] := SignExtend(__V[63:32])
1487/// result[191:128] := SignExtend(__V[95:64])
1488/// result[255:192] := SignExtend(__V[127:96])
1489/// \endcode
1490///
1491/// \headerfile <immintrin.h>
1492///
1493/// This intrinsic corresponds to the \c VPMOVSXDQ instruction.
1494///
1495/// \param __V
1496///    A 128-bit vector of [4 x i32] containing the source values.
1497/// \returns A 256-bit vector of [4 x i64] containing the sign-extended
1498///    values.
1499static __inline__ __m256i __DEFAULT_FN_ATTRS256
1500_mm256_cvtepi32_epi64(__m128i __V)
1501{
1502  return (__m256i)__builtin_convertvector((__v4si)__V, __v4di);
1503}
1504
1505/// Zero-extends bytes from the 128-bit integer vector in \a __V and returns
1506///    the 16-bit values in the corresponding elements of a 256-bit vector
1507///    of [16 x i16].
1508///
1509/// \code{.operation}
1510/// FOR i := 0 TO 15
1511///   j := i*8
1512///   k := i*16
1513///   result[k+15:k] := ZeroExtend(__V[j+7:j])
1514/// ENDFOR
1515/// \endcode
1516///
1517/// \headerfile <immintrin.h>
1518///
1519/// This intrinsic corresponds to the \c VPMOVZXBW instruction.
1520///
1521/// \param __V
1522///    A 128-bit integer vector containing the source bytes.
1523/// \returns A 256-bit vector of [16 x i16] containing the zero-extended
1524///    values.
1525static __inline__ __m256i __DEFAULT_FN_ATTRS256
1526_mm256_cvtepu8_epi16(__m128i __V)
1527{
1528  return (__m256i)__builtin_convertvector((__v16qu)__V, __v16hi);
1529}
1530
1531/// Zero-extends bytes from the lower half of the 128-bit integer vector in
1532///    \a __V and returns the 32-bit values in the corresponding elements of a
1533///    256-bit vector of [8 x i32].
1534///
1535/// \code{.operation}
1536/// FOR i := 0 TO 7
1537///   j := i*8
1538///   k := i*32
1539///   result[k+31:k] := ZeroExtend(__V[j+7:j])
1540/// ENDFOR
1541/// \endcode
1542///
1543/// \headerfile <immintrin.h>
1544///
1545/// This intrinsic corresponds to the \c VPMOVZXBD instruction.
1546///
1547/// \param __V
1548///    A 128-bit integer vector containing the source bytes.
1549/// \returns A 256-bit vector of [8 x i32] containing the zero-extended
1550///    values.
1551static __inline__ __m256i __DEFAULT_FN_ATTRS256
1552_mm256_cvtepu8_epi32(__m128i __V)
1553{
1554  return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8si);
1555}
1556
1557/// Zero-extends the first four bytes from the 128-bit integer vector in
1558///    \a __V and returns the 64-bit values in the corresponding elements of a
1559///    256-bit vector of [4 x i64].
1560///
1561/// \code{.operation}
1562/// result[63:0] := ZeroExtend(__V[7:0])
1563/// result[127:64] := ZeroExtend(__V[15:8])
1564/// result[191:128] := ZeroExtend(__V[23:16])
1565/// result[255:192] := ZeroExtend(__V[31:24])
1566/// \endcode
1567///
1568/// \headerfile <immintrin.h>
1569///
1570/// This intrinsic corresponds to the \c VPMOVZXBQ instruction.
1571///
1572/// \param __V
1573///    A 128-bit integer vector containing the source bytes.
1574/// \returns A 256-bit vector of [4 x i64] containing the zero-extended
1575///    values.
1576static __inline__ __m256i __DEFAULT_FN_ATTRS256
1577_mm256_cvtepu8_epi64(__m128i __V)
1578{
1579  return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3), __v4di);
1580}
1581
1582/// Zero-extends 16-bit elements from the 128-bit vector of [8 x i16] in
1583///    \a __V and returns the 32-bit values in the corresponding elements of a
1584///    256-bit vector of [8 x i32].
1585///
1586/// \code{.operation}
1587/// FOR i := 0 TO 7
1588///   j := i*16
1589///   k := i*32
1590///   result[k+31:k] := ZeroExtend(__V[j+15:j])
1591/// ENDFOR
1592/// \endcode
1593///
1594/// \headerfile <immintrin.h>
1595///
1596/// This intrinsic corresponds to the \c VPMOVZXWD instruction.
1597///
1598/// \param __V
1599///    A 128-bit vector of [8 x i16] containing the source values.
1600/// \returns A 256-bit vector of [8 x i32] containing the zero-extended
1601///    values.
1602static __inline__ __m256i __DEFAULT_FN_ATTRS256
1603_mm256_cvtepu16_epi32(__m128i __V)
1604{
1605  return (__m256i)__builtin_convertvector((__v8hu)__V, __v8si);
1606}
1607
1608/// Zero-extends 16-bit elements from the lower half of the 128-bit vector of
1609///    [8 x i16] in \a __V and returns the 64-bit values in the corresponding
1610///    elements of a 256-bit vector of [4 x i64].
1611///
1612/// \code{.operation}
1613/// result[63:0] := ZeroExtend(__V[15:0])
1614/// result[127:64] := ZeroExtend(__V[31:16])
1615/// result[191:128] := ZeroExtend(__V[47:32])
1616/// result[255:192] := ZeroExtend(__V[63:48])
1617/// \endcode
1618///
1619/// \headerfile <immintrin.h>
1620///
1621/// This intrinsic corresponds to the \c VPMOVZXWQ instruction.
1622///
1623/// \param __V
1624///    A 128-bit vector of [8 x i16] containing the source values.
1625/// \returns A 256-bit vector of [4 x i64] containing the zero-extended
1626///    values.
1627static __inline__ __m256i __DEFAULT_FN_ATTRS256
1628_mm256_cvtepu16_epi64(__m128i __V)
1629{
1630  return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1, 2, 3), __v4di);
1631}
1632
1633/// Zero-extends 32-bit elements from the 128-bit vector of [4 x i32] in
1634///    \a __V and returns the 64-bit values in the corresponding elements of a
1635///    256-bit vector of [4 x i64].
1636///
1637/// \code{.operation}
1638/// result[63:0] := ZeroExtend(__V[31:0])
1639/// result[127:64] := ZeroExtend(__V[63:32])
1640/// result[191:128] := ZeroExtend(__V[95:64])
1641/// result[255:192] := ZeroExtend(__V[127:96])
1642/// \endcode
1643///
1644/// \headerfile <immintrin.h>
1645///
1646/// This intrinsic corresponds to the \c VPMOVZXDQ instruction.
1647///
1648/// \param __V
1649///    A 128-bit vector of [4 x i32] containing the source values.
1650/// \returns A 256-bit vector of [4 x i64] containing the zero-extended
1651///    values.
1652static __inline__ __m256i __DEFAULT_FN_ATTRS256
1653_mm256_cvtepu32_epi64(__m128i __V)
1654{
1655  return (__m256i)__builtin_convertvector((__v4su)__V, __v4di);
1656}
1657
1658/// Multiplies signed 32-bit integers from even-numbered elements of two
1659///    256-bit vectors of [8 x i32] and returns the 64-bit products in the
1660///    [4 x i64] result.
1661///
1662/// \code{.operation}
1663/// result[63:0] := __a[31:0] * __b[31:0]
1664/// result[127:64] := __a[95:64] * __b[95:64]
1665/// result[191:128] := __a[159:128] * __b[159:128]
1666/// result[255:192] := __a[223:192] * __b[223:192]
1667/// \endcode
1668///
1669/// \headerfile <immintrin.h>
1670///
1671/// This intrinsic corresponds to the \c VPMULDQ instruction.
1672///
1673/// \param __a
1674///    A 256-bit vector of [8 x i32] containing one of the source operands.
1675/// \param __b
1676///    A 256-bit vector of [8 x i32] containing one of the source operands.
1677/// \returns A 256-bit vector of [4 x i64] containing the products.
1678static __inline__  __m256i __DEFAULT_FN_ATTRS256
1679_mm256_mul_epi32(__m256i __a, __m256i __b)
1680{
1681  return (__m256i)__builtin_ia32_pmuldq256((__v8si)__a, (__v8si)__b);
1682}
1683
1684/// Multiplies signed 16-bit integer elements of two 256-bit vectors of
1685///    [16 x i16], truncates the 32-bit results to the most significant 18
1686///    bits, rounds by adding 1, and returns bits [16:1] of each rounded
1687///    product in the [16 x i16] result.
1688///
1689/// \code{.operation}
1690/// FOR i := 0 TO 15
1691///   j := i*16
1692///   temp := ((__a[j+15:j] * __b[j+15:j]) >> 14) + 1
1693///   result[j+15:j] := temp[16:1]
1694/// \endcode
1695///
1696/// \headerfile <immintrin.h>
1697///
1698/// This intrinsic corresponds to the \c VPMULHRSW instruction.
1699///
1700/// \param __a
1701///    A 256-bit vector of [16 x i16] containing one of the source operands.
1702/// \param __b
1703///    A 256-bit vector of [16 x i16] containing one of the source operands.
1704/// \returns A 256-bit vector of [16 x i16] containing the rounded products.
1705static __inline__ __m256i __DEFAULT_FN_ATTRS256
1706_mm256_mulhrs_epi16(__m256i __a, __m256i __b)
1707{
1708  return (__m256i)__builtin_ia32_pmulhrsw256((__v16hi)__a, (__v16hi)__b);
1709}
1710
1711/// Multiplies unsigned 16-bit integer elements of two 256-bit vectors of
1712///    [16 x i16], and returns the upper 16 bits of each 32-bit product in the
1713///    [16 x i16] result.
1714///
1715/// \headerfile <immintrin.h>
1716///
1717/// This intrinsic corresponds to the \c VPMULHUW instruction.
1718///
1719/// \param __a
1720///    A 256-bit vector of [16 x i16] containing one of the source operands.
1721/// \param __b
1722///    A 256-bit vector of [16 x i16] containing one of the source operands.
1723/// \returns A 256-bit vector of [16 x i16] containing the products.
1724static __inline__ __m256i __DEFAULT_FN_ATTRS256
1725_mm256_mulhi_epu16(__m256i __a, __m256i __b)
1726{
1727  return (__m256i)__builtin_ia32_pmulhuw256((__v16hi)__a, (__v16hi)__b);
1728}
1729
1730/// Multiplies signed 16-bit integer elements of two 256-bit vectors of
1731///    [16 x i16], and returns the upper 16 bits of each 32-bit product in the
1732///    [16 x i16] result.
1733///
1734/// \headerfile <immintrin.h>
1735///
1736/// This intrinsic corresponds to the \c VPMULHW instruction.
1737///
1738/// \param __a
1739///    A 256-bit vector of [16 x i16] containing one of the source operands.
1740/// \param __b
1741///    A 256-bit vector of [16 x i16] containing one of the source operands.
1742/// \returns A 256-bit vector of [16 x i16] containing the products.
1743static __inline__ __m256i __DEFAULT_FN_ATTRS256
1744_mm256_mulhi_epi16(__m256i __a, __m256i __b)
1745{
1746  return (__m256i)__builtin_ia32_pmulhw256((__v16hi)__a, (__v16hi)__b);
1747}
1748
1749/// Multiplies signed 16-bit integer elements of two 256-bit vectors of
1750///    [16 x i16], and returns the lower 16 bits of each 32-bit product in the
1751///    [16 x i16] result.
1752///
1753/// \headerfile <immintrin.h>
1754///
1755/// This intrinsic corresponds to the \c VPMULLW instruction.
1756///
1757/// \param __a
1758///    A 256-bit vector of [16 x i16] containing one of the source operands.
1759/// \param __b
1760///    A 256-bit vector of [16 x i16] containing one of the source operands.
1761/// \returns A 256-bit vector of [16 x i16] containing the products.
1762static __inline__ __m256i __DEFAULT_FN_ATTRS256
1763_mm256_mullo_epi16(__m256i __a, __m256i __b)
1764{
1765  return (__m256i)((__v16hu)__a * (__v16hu)__b);
1766}
1767
1768/// Multiplies signed 32-bit integer elements of two 256-bit vectors of
1769///    [8 x i32], and returns the lower 32 bits of each 64-bit product in the
1770///    [8 x i32] result.
1771///
1772/// \headerfile <immintrin.h>
1773///
1774/// This intrinsic corresponds to the \c VPMULLD instruction.
1775///
1776/// \param __a
1777///    A 256-bit vector of [8 x i32] containing one of the source operands.
1778/// \param __b
1779///    A 256-bit vector of [8 x i32] containing one of the source operands.
1780/// \returns A 256-bit vector of [8 x i32] containing the products.
1781static __inline__  __m256i __DEFAULT_FN_ATTRS256
1782_mm256_mullo_epi32 (__m256i __a, __m256i __b)
1783{
1784  return (__m256i)((__v8su)__a * (__v8su)__b);
1785}
1786
1787/// Multiplies unsigned 32-bit integers from even-numered elements of two
1788///    256-bit vectors of [8 x i32] and returns the 64-bit products in the
1789///    [4 x i64] result.
1790///
1791/// \code{.operation}
1792/// result[63:0] := __a[31:0] * __b[31:0]
1793/// result[127:64] := __a[95:64] * __b[95:64]
1794/// result[191:128] := __a[159:128] * __b[159:128]
1795/// result[255:192] := __a[223:192] * __b[223:192]
1796/// \endcode
1797///
1798/// \headerfile <immintrin.h>
1799///
1800/// This intrinsic corresponds to the \c VPMULUDQ instruction.
1801///
1802/// \param __a
1803///    A 256-bit vector of [8 x i32] containing one of the source operands.
1804/// \param __b
1805///    A 256-bit vector of [8 x i32] containing one of the source operands.
1806/// \returns A 256-bit vector of [4 x i64] containing the products.
1807static __inline__ __m256i __DEFAULT_FN_ATTRS256
1808_mm256_mul_epu32(__m256i __a, __m256i __b)
1809{
1810  return __builtin_ia32_pmuludq256((__v8si)__a, (__v8si)__b);
1811}
1812
1813/// Computes the bitwise OR of the 256-bit integer vectors in \a __a and
1814///    \a __b.
1815///
1816/// \headerfile <immintrin.h>
1817///
1818/// This intrinsic corresponds to the \c VPOR instruction.
1819///
1820/// \param __a
1821///    A 256-bit integer vector.
1822/// \param __b
1823///    A 256-bit integer vector.
1824/// \returns A 256-bit integer vector containing the result.
1825static __inline__ __m256i __DEFAULT_FN_ATTRS256
1826_mm256_or_si256(__m256i __a, __m256i __b)
1827{
1828  return (__m256i)((__v4du)__a | (__v4du)__b);
1829}
1830
1831/// Computes four sum of absolute difference (SAD) operations on sets of eight
1832///    unsigned 8-bit integers from the 256-bit integer vectors \a __a and
1833///    \a __b.
1834///
1835///    One SAD result is computed for each set of eight bytes from \a __a and
1836///    eight bytes from \a __b. The zero-extended SAD value is returned in the
1837///    corresponding 64-bit element of the result.
1838///
1839///    A single SAD operation takes the differences between the corresponding
1840///    bytes of \a __a and \a __b, takes the absolute value of each difference,
1841///    and sums these eight values to form one 16-bit result. This operation
1842///    is repeated four times with successive sets of eight bytes.
1843///
1844/// \code{.operation}
1845/// FOR i := 0 TO 3
1846///   j := i*64
1847///   temp0 := ABS(__a[j+7:j] - __b[j+7:j])
1848///   temp1 := ABS(__a[j+15:j+8] - __b[j+15:j+8])
1849///   temp2 := ABS(__a[j+23:j+16] - __b[j+23:j+16])
1850///   temp3 := ABS(__a[j+31:j+24] - __b[j+31:j+24])
1851///   temp4 := ABS(__a[j+39:j+32] - __b[j+39:j+32])
1852///   temp5 := ABS(__a[j+47:j+40] - __b[j+47:j+40])
1853///   temp6 := ABS(__a[j+55:j+48] - __b[j+55:j+48])
1854///   temp7 := ABS(__a[j+63:j+56] - __b[j+63:j+56])
1855///   result[j+15:j] := temp0 + temp1 + temp2 + temp3 +
1856///                     temp4 + temp5 + temp6 + temp7
1857///   result[j+63:j+16] := 0
1858/// ENDFOR
1859/// \endcode
1860///
1861/// \headerfile <immintrin.h>
1862///
1863/// This intrinsic corresponds to the \c VPSADBW instruction.
1864///
1865/// \param __a
1866///    A 256-bit integer vector.
1867/// \param __b
1868///    A 256-bit integer vector.
1869/// \returns A 256-bit integer vector containing the result.
1870static __inline__ __m256i __DEFAULT_FN_ATTRS256
1871_mm256_sad_epu8(__m256i __a, __m256i __b)
1872{
1873  return __builtin_ia32_psadbw256((__v32qi)__a, (__v32qi)__b);
1874}
1875
1876/// Shuffles 8-bit integers in the 256-bit integer vector \a __a according
1877///    to control information in the 256-bit integer vector \a __b, and
1878///    returns the 256-bit result. In effect there are two separate 128-bit
1879///    shuffles in the lower and upper halves.
1880///
1881/// \code{.operation}
1882/// FOR i := 0 TO 31
1883///   j := i*8
1884///   IF __b[j+7] == 1
1885///     result[j+7:j] := 0
1886///   ELSE
1887///     k := __b[j+3:j] * 8
1888///     IF i > 15
1889///       k := k + 128
1890///     FI
1891///     result[j+7:j] := __a[k+7:k]
1892///   FI
1893/// ENDFOR
1894/// \endcode
1895///
1896/// \headerfile <immintrin.h>
1897///
1898/// This intrinsic corresponds to the \c VPSHUFB instruction.
1899///
1900/// \param __a
1901///    A 256-bit integer vector containing source values.
1902/// \param __b
1903///    A 256-bit integer vector containing control information to determine
1904///    what goes into the corresponding byte of the result. If bit 7 of the
1905///    control byte is 1, the result byte is 0; otherwise, bits 3:0 of the
1906///    control byte specify the index (within the same 128-bit half) of \a __a
1907///    to copy to the result byte.
1908/// \returns A 256-bit integer vector containing the result.
1909static __inline__ __m256i __DEFAULT_FN_ATTRS256
1910_mm256_shuffle_epi8(__m256i __a, __m256i __b)
1911{
1912  return (__m256i)__builtin_ia32_pshufb256((__v32qi)__a, (__v32qi)__b);
1913}
1914
/// Shuffles 32-bit integers from the 256-bit vector of [8 x i32] in \a a
///    according to control information in the integer literal \a imm, and
///    returns the 256-bit result. In effect there are two parallel 128-bit
///    shuffles in the lower and upper halves.
///
/// \code{.operation}
/// FOR i := 0 TO 3
///   j := i*32
///   k := (imm >> i*2)[1:0] * 32
///   result[j+31:j] := a[k+31:k]
///   result[128+j+31:128+j] := a[128+k+31:128+k]
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_shuffle_epi32(__m256i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the \c VPSHUFD instruction.
///
/// \param a
///    A 256-bit vector of [8 x i32] containing source values.
/// \param imm
///    An immediate 8-bit value specifying which elements to copy from \a a.
///    \a imm[1:0] specifies the index in \a a for elements 0 and 4 of the
///    result, \a imm[3:2] specifies the index for elements 1 and 5, and so
///    forth.
/// \returns A 256-bit vector of [8 x i32] containing the result.
#define _mm256_shuffle_epi32(a, imm) \
  ((__m256i)__builtin_ia32_pshufd256((__v8si)(__m256i)(a), (int)(imm)))
1947
/// Shuffles 16-bit integers from the 256-bit vector of [16 x i16] in \a a
///    according to control information in the integer literal \a imm, and
///    returns the 256-bit result. The upper 64 bits of each 128-bit half
///    are shuffled in parallel; the lower 64 bits of each 128-bit half are
///    copied from \a a unchanged.
///
/// \code{.operation}
/// result[63:0] := a[63:0]
/// result[191:128] := a[191:128]
/// FOR i := 0 TO 3
///   j := i * 16 + 64
///   k := (imm >> i*2)[1:0] * 16 + 64
///   result[j+15:j] := a[k+15:k]
///   result[128+j+15:128+j] := a[128+k+15:128+k]
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_shufflehi_epi16(__m256i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the \c VPSHUFHW instruction.
///
/// \param a
///    A 256-bit vector of [16 x i16] containing source values.
/// \param imm
///    An immediate 8-bit value specifying which elements to copy from \a a.
///    \a imm[1:0] specifies the index in \a a for elements 4 and 12 of the
///    result, \a imm[3:2] specifies the index for elements 5 and 13, and so
///    forth. Indexes are offset by 4 within each 128-bit half (so 0 means
///    index 4 of that half, and so forth).
/// \returns A 256-bit vector of [16 x i16] containing the result.
#define _mm256_shufflehi_epi16(a, imm) \
  ((__m256i)__builtin_ia32_pshufhw256((__v16hi)(__m256i)(a), (int)(imm)))
1983
/// Rearranges 16-bit integers of the 256-bit vector of [16 x i16] \a a as
///    directed by the integer literal \a imm and returns the 256-bit
///    [16 x i16] result. Only the lower 64 bits of each 128-bit half are
///    shuffled; the upper 64 bits of each 128-bit half are passed through
///    from \a a unchanged.
///
/// \code{.operation}
/// result[127:64] := a[127:64]
/// result[255:192] := a[255:192]
/// FOR i := 0 TO 3
///   j := i * 16
///   k := (imm >> i*2)[1:0] * 16
///   result[j+15:j] := a[k+15:k]
///   result[128+j+15:128+j] := a[128+k+15:128+k]
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_shufflelo_epi16(__m256i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the \c VPSHUFLW instruction.
///
/// \param a
///    A 256-bit vector of [16 x i16] supplying the data for the result.
/// \param imm
///    An immediate 8-bit value selecting the elements to copy from \a a.
///    \a imm[1:0] gives the index in \a a for elements 0 and 8 of the
///    result, \a imm[3:2] gives the index for elements 1 and 9, and so
///    forth.
/// \returns A 256-bit vector of [16 x i16] containing the result.
#define _mm256_shufflelo_epi16(a, imm)                                         \
  ((__m256i)__builtin_ia32_pshuflw256((__v16hi)(__m256i)(a),                   \
                                      (int)(imm)))
2020
2021/// Sets each byte of the result to the corresponding byte of the 256-bit
2022///    integer vector in \a __a, the negative of that byte, or zero, depending
2023///    on whether the corresponding byte of the 256-bit integer vector in
2024///    \a __b is greater than zero, less than zero, or equal to zero,
2025///    respectively.
2026///
2027/// \headerfile <immintrin.h>
2028///
2029/// This intrinsic corresponds to the \c VPSIGNB instruction.
2030///
2031/// \param __a
2032///    A 256-bit integer vector.
2033/// \param __b
2034///    A 256-bit integer vector].
2035/// \returns A 256-bit integer vector containing the result.
2036static __inline__ __m256i __DEFAULT_FN_ATTRS256
2037_mm256_sign_epi8(__m256i __a, __m256i __b)
2038{
2039    return (__m256i)__builtin_ia32_psignb256((__v32qi)__a, (__v32qi)__b);
2040}
2041
2042/// Sets each element of the result to the corresponding element of the
2043///    256-bit vector of [16 x i16] in \a __a, the negative of that element,
2044///    or zero, depending on whether the corresponding element of the 256-bit
2045///    vector of [16 x i16] in \a __b is greater than zero, less than zero, or
2046///    equal to zero, respectively.
2047///
2048/// \headerfile <immintrin.h>
2049///
2050/// This intrinsic corresponds to the \c VPSIGNW instruction.
2051///
2052/// \param __a
2053///    A 256-bit vector of [16 x i16].
2054/// \param __b
2055///    A 256-bit vector of [16 x i16].
2056/// \returns A 256-bit vector of [16 x i16] containing the result.
2057static __inline__ __m256i __DEFAULT_FN_ATTRS256
2058_mm256_sign_epi16(__m256i __a, __m256i __b)
2059{
2060    return (__m256i)__builtin_ia32_psignw256((__v16hi)__a, (__v16hi)__b);
2061}
2062
2063/// Sets each element of the result to the corresponding element of the
2064///    256-bit vector of [8 x i32] in \a __a, the negative of that element, or
2065///    zero, depending on whether the corresponding element of the 256-bit
2066///    vector of [8 x i32] in \a __b is greater than zero, less than zero, or
2067///    equal to zero, respectively.
2068///
2069/// \headerfile <immintrin.h>
2070///
2071/// This intrinsic corresponds to the \c VPSIGND instruction.
2072///
2073/// \param __a
2074///    A 256-bit vector of [8 x i32].
2075/// \param __b
2076///    A 256-bit vector of [8 x i32].
2077/// \returns A 256-bit vector of [8 x i32] containing the result.
2078static __inline__ __m256i __DEFAULT_FN_ATTRS256
2079_mm256_sign_epi32(__m256i __a, __m256i __b)
2080{
2081    return (__m256i)__builtin_ia32_psignd256((__v8si)__a, (__v8si)__b);
2082}
2083
/// Byte-shifts each 128-bit half of the 256-bit integer vector \a a to the
///    left by \a imm bytes, filling the vacated positions with zero bytes,
///    and returns the result. When \a imm is greater than 15 the returned
///    result is all zeroes.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_slli_si256(__m256i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the \c VPSLLDQ instruction.
///
/// \param a
///    A 256-bit integer vector to be shifted.
/// \param imm
///    An unsigned immediate value giving the shift count (in bytes).
/// \returns A 256-bit integer vector containing the shifted halves.
#define _mm256_slli_si256(a, imm)                                              \
  ((__m256i)__builtin_ia32_pslldqi256_byteshift((__v4di)(__m256i)(a),          \
                                                (int)(imm)))
2103
/// Byte-shifts each 128-bit half of the 256-bit integer vector \a a to the
///    left by \a imm bytes, filling the vacated positions with zero bytes,
///    and returns the result. When \a imm is greater than 15 the returned
///    result is all zeroes.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_bslli_epi128(__m256i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the \c VPSLLDQ instruction.
///
/// \param a
///    A 256-bit integer vector to be shifted.
/// \param imm
///    An unsigned immediate value giving the shift count (in bytes).
/// \returns A 256-bit integer vector containing the shifted halves.
#define _mm256_bslli_epi128(a, imm)                                            \
  ((__m256i)__builtin_ia32_pslldqi256_byteshift((__v4di)(__m256i)(a),          \
                                                (int)(imm)))
2123
2124/// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2125///    left by \a __count bits, shifting in zero bits, and returns the result.
2126///    If \a __count is greater than 15, the returned result is all zeroes.
2127///
2128/// \headerfile <immintrin.h>
2129///
2130/// This intrinsic corresponds to the \c VPSLLW instruction.
2131///
2132/// \param __a
2133///    A 256-bit vector of [16 x i16] to be shifted.
2134/// \param __count
2135///    An unsigned integer value specifying the shift count (in bits).
2136/// \returns A 256-bit vector of [16 x i16] containing the result.
2137static __inline__ __m256i __DEFAULT_FN_ATTRS256
2138_mm256_slli_epi16(__m256i __a, int __count)
2139{
2140  return (__m256i)__builtin_ia32_psllwi256((__v16hi)__a, __count);
2141}
2142
2143/// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2144///    left by the number of bits specified by the lower 64 bits of \a __count,
2145///    shifting in zero bits, and returns the result. If \a __count is greater
2146///    than 15, the returned result is all zeroes.
2147///
2148/// \headerfile <immintrin.h>
2149///
2150/// This intrinsic corresponds to the \c VPSLLW instruction.
2151///
2152/// \param __a
2153///    A 256-bit vector of [16 x i16] to be shifted.
2154/// \param __count
2155///    A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2156///    shift count (in bits). The upper element is ignored.
2157/// \returns A 256-bit vector of [16 x i16] containing the result.
2158static __inline__ __m256i __DEFAULT_FN_ATTRS256
2159_mm256_sll_epi16(__m256i __a, __m128i __count)
2160{
2161  return (__m256i)__builtin_ia32_psllw256((__v16hi)__a, (__v8hi)__count);
2162}
2163
2164/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2165///    left by \a __count bits, shifting in zero bits, and returns the result.
2166///    If \a __count is greater than 31, the returned result is all zeroes.
2167///
2168/// \headerfile <immintrin.h>
2169///
2170/// This intrinsic corresponds to the \c VPSLLD instruction.
2171///
2172/// \param __a
2173///    A 256-bit vector of [8 x i32] to be shifted.
2174/// \param __count
2175///    An unsigned integer value specifying the shift count (in bits).
2176/// \returns A 256-bit vector of [8 x i32] containing the result.
2177static __inline__ __m256i __DEFAULT_FN_ATTRS256
2178_mm256_slli_epi32(__m256i __a, int __count)
2179{
2180  return (__m256i)__builtin_ia32_pslldi256((__v8si)__a, __count);
2181}
2182
2183/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2184///    left by the number of bits given in the lower 64 bits of \a __count,
2185///    shifting in zero bits, and returns the result. If \a __count is greater
2186///    than 31, the returned result is all zeroes.
2187///
2188/// \headerfile <immintrin.h>
2189///
2190/// This intrinsic corresponds to the \c VPSLLD instruction.
2191///
2192/// \param __a
2193///    A 256-bit vector of [8 x i32] to be shifted.
2194/// \param __count
2195///    A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2196///    shift count (in bits). The upper element is ignored.
2197/// \returns A 256-bit vector of [8 x i32] containing the result.
2198static __inline__ __m256i __DEFAULT_FN_ATTRS256
2199_mm256_sll_epi32(__m256i __a, __m128i __count)
2200{
2201  return (__m256i)__builtin_ia32_pslld256((__v8si)__a, (__v4si)__count);
2202}
2203
2204/// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a
2205///    left by \a __count bits, shifting in zero bits, and returns the result.
2206///    If \a __count is greater than 63, the returned result is all zeroes.
2207///
2208/// \headerfile <immintrin.h>
2209///
2210/// This intrinsic corresponds to the \c VPSLLQ instruction.
2211///
2212/// \param __a
2213///    A 256-bit vector of [4 x i64] to be shifted.
2214/// \param __count
2215///    An unsigned integer value specifying the shift count (in bits).
2216/// \returns A 256-bit vector of [4 x i64] containing the result.
2217static __inline__ __m256i __DEFAULT_FN_ATTRS256
2218_mm256_slli_epi64(__m256i __a, int __count)
2219{
2220  return __builtin_ia32_psllqi256((__v4di)__a, __count);
2221}
2222
2223/// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a
2224///    left by the number of bits given in the lower 64 bits of \a __count,
2225///    shifting in zero bits, and returns the result. If \a __count is greater
2226///    than 63, the returned result is all zeroes.
2227///
2228/// \headerfile <immintrin.h>
2229///
2230/// This intrinsic corresponds to the \c VPSLLQ instruction.
2231///
2232/// \param __a
2233///    A 256-bit vector of [4 x i64] to be shifted.
2234/// \param __count
2235///    A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2236///    shift count (in bits). The upper element is ignored.
2237/// \returns A 256-bit vector of [4 x i64] containing the result.
2238static __inline__ __m256i __DEFAULT_FN_ATTRS256
2239_mm256_sll_epi64(__m256i __a, __m128i __count)
2240{
2241  return __builtin_ia32_psllq256((__v4di)__a, __count);
2242}
2243
2244/// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2245///    right by \a __count bits, shifting in sign bits, and returns the result.
2246///    If \a __count is greater than 15, each element of the result is either
2247///    0 or -1 according to the corresponding input sign bit.
2248///
2249/// \headerfile <immintrin.h>
2250///
2251/// This intrinsic corresponds to the \c VPSRAW instruction.
2252///
2253/// \param __a
2254///    A 256-bit vector of [16 x i16] to be shifted.
2255/// \param __count
2256///    An unsigned integer value specifying the shift count (in bits).
2257/// \returns A 256-bit vector of [16 x i16] containing the result.
2258static __inline__ __m256i __DEFAULT_FN_ATTRS256
2259_mm256_srai_epi16(__m256i __a, int __count)
2260{
2261  return (__m256i)__builtin_ia32_psrawi256((__v16hi)__a, __count);
2262}
2263
2264/// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2265///    right by the number of bits given in the lower 64 bits of \a __count,
2266///    shifting in sign bits, and returns the result. If \a __count is greater
2267///    than 15, each element of the result is either 0 or -1 according to the
2268///    corresponding input sign bit.
2269///
2270/// \headerfile <immintrin.h>
2271///
2272/// This intrinsic corresponds to the \c VPSRAW instruction.
2273///
2274/// \param __a
2275///    A 256-bit vector of [16 x i16] to be shifted.
2276/// \param __count
2277///    A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2278///    shift count (in bits). The upper element is ignored.
2279/// \returns A 256-bit vector of [16 x i16] containing the result.
2280static __inline__ __m256i __DEFAULT_FN_ATTRS256
2281_mm256_sra_epi16(__m256i __a, __m128i __count)
2282{
2283  return (__m256i)__builtin_ia32_psraw256((__v16hi)__a, (__v8hi)__count);
2284}
2285
2286/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2287///    right by \a __count bits, shifting in sign bits, and returns the result.
2288///    If \a __count is greater than 31, each element of the result is either
2289///    0 or -1 according to the corresponding input sign bit.
2290///
2291/// \headerfile <immintrin.h>
2292///
2293/// This intrinsic corresponds to the \c VPSRAD instruction.
2294///
2295/// \param __a
2296///    A 256-bit vector of [8 x i32] to be shifted.
2297/// \param __count
2298///    An unsigned integer value specifying the shift count (in bits).
2299/// \returns A 256-bit vector of [8 x i32] containing the result.
2300static __inline__ __m256i __DEFAULT_FN_ATTRS256
2301_mm256_srai_epi32(__m256i __a, int __count)
2302{
2303  return (__m256i)__builtin_ia32_psradi256((__v8si)__a, __count);
2304}
2305
2306/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2307///    right by the number of bits given in the lower 64 bits of \a __count,
2308///    shifting in sign bits, and returns the result. If \a __count is greater
2309///    than 31, each element of the result is either 0 or -1 according to the
2310///    corresponding input sign bit.
2311///
2312/// \headerfile <immintrin.h>
2313///
2314/// This intrinsic corresponds to the \c VPSRAD instruction.
2315///
2316/// \param __a
2317///    A 256-bit vector of [8 x i32] to be shifted.
2318/// \param __count
2319///    A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2320///    shift count (in bits). The upper element is ignored.
2321/// \returns A 256-bit vector of [8 x i32] containing the result.
2322static __inline__ __m256i __DEFAULT_FN_ATTRS256
2323_mm256_sra_epi32(__m256i __a, __m128i __count)
2324{
2325  return (__m256i)__builtin_ia32_psrad256((__v8si)__a, (__v4si)__count);
2326}
2327
/// Shifts each 128-bit half of the 256-bit integer vector in \a a right by
///    \a imm bytes, shifting in zero bytes, and returns the result. If
///    \a imm is greater than 15, the returned result is all zeroes.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_srli_si256(__m256i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the \c VPSRLDQ instruction.
///
/// \param a
///    A 256-bit integer vector to be shifted.
/// \param imm
///    An unsigned immediate value specifying the shift count (in bytes).
/// \returns A 256-bit integer vector containing the result.
#define _mm256_srli_si256(a, imm)                                              \
  ((__m256i)__builtin_ia32_psrldqi256_byteshift((__v4di)(__m256i)(a),          \
                                                (int)(imm)))
2347
/// Shifts each 128-bit half of the 256-bit integer vector in \a a right by
///    \a imm bytes, shifting in zero bytes, and returns the result. If
///    \a imm is greater than 15, the returned result is all zeroes.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_bsrli_epi128(__m256i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the \c VPSRLDQ instruction.
///
/// \param a
///    A 256-bit integer vector to be shifted.
/// \param imm
///    An unsigned immediate value specifying the shift count (in bytes).
/// \returns A 256-bit integer vector containing the result.
#define _mm256_bsrli_epi128(a, imm)                                            \
  ((__m256i)__builtin_ia32_psrldqi256_byteshift((__v4di)(__m256i)(a),          \
                                                (int)(imm)))
2367
2368/// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2369///    right by \a __count bits, shifting in zero bits, and returns the result.
2370///    If \a __count is greater than 15, the returned result is all zeroes.
2371///
2372/// \headerfile <immintrin.h>
2373///
2374/// This intrinsic corresponds to the \c VPSRLW instruction.
2375///
2376/// \param __a
2377///    A 256-bit vector of [16 x i16] to be shifted.
2378/// \param __count
2379///    An unsigned integer value specifying the shift count (in bits).
2380/// \returns A 256-bit vector of [16 x i16] containing the result.
2381static __inline__ __m256i __DEFAULT_FN_ATTRS256
2382_mm256_srli_epi16(__m256i __a, int __count)
2383{
2384  return (__m256i)__builtin_ia32_psrlwi256((__v16hi)__a, __count);
2385}
2386
2387/// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
2388///    right by the number of bits given in the lower 64 bits of \a __count,
2389///    shifting in zero bits, and returns the result. If \a __count is greater
2390///    than 15, the returned result is all zeroes.
2391///
2392/// \headerfile <immintrin.h>
2393///
2394/// This intrinsic corresponds to the \c VPSRLW instruction.
2395///
2396/// \param __a
2397///    A 256-bit vector of [16 x i16] to be shifted.
2398/// \param __count
2399///    A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2400///    shift count (in bits). The upper element is ignored.
2401/// \returns A 256-bit vector of [16 x i16] containing the result.
2402static __inline__ __m256i __DEFAULT_FN_ATTRS256
2403_mm256_srl_epi16(__m256i __a, __m128i __count)
2404{
2405  return (__m256i)__builtin_ia32_psrlw256((__v16hi)__a, (__v8hi)__count);
2406}
2407
2408/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2409///    right by \a __count bits, shifting in zero bits, and returns the result.
2410///    If \a __count is greater than 31, the returned result is all zeroes.
2411///
2412/// \headerfile <immintrin.h>
2413///
2414/// This intrinsic corresponds to the \c VPSRLD instruction.
2415///
2416/// \param __a
2417///    A 256-bit vector of [8 x i32] to be shifted.
2418/// \param __count
2419///    An unsigned integer value specifying the shift count (in bits).
2420/// \returns A 256-bit vector of [8 x i32] containing the result.
2421static __inline__ __m256i __DEFAULT_FN_ATTRS256
2422_mm256_srli_epi32(__m256i __a, int __count)
2423{
2424  return (__m256i)__builtin_ia32_psrldi256((__v8si)__a, __count);
2425}
2426
2427/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __a
2428///    right by the number of bits given in the lower 64 bits of \a __count,
2429///    shifting in zero bits, and returns the result. If \a __count is greater
2430///    than 31, the returned result is all zeroes.
2431///
2432/// \headerfile <immintrin.h>
2433///
2434/// This intrinsic corresponds to the \c VPSRLD instruction.
2435///
2436/// \param __a
2437///    A 256-bit vector of [8 x i32] to be shifted.
2438/// \param __count
2439///    A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2440///    shift count (in bits). The upper element is ignored.
2441/// \returns A 256-bit vector of [8 x i32] containing the result.
2442static __inline__ __m256i __DEFAULT_FN_ATTRS256
2443_mm256_srl_epi32(__m256i __a, __m128i __count)
2444{
2445  return (__m256i)__builtin_ia32_psrld256((__v8si)__a, (__v4si)__count);
2446}
2447
2448/// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a
2449///    right by \a __count bits, shifting in zero bits, and returns the result.
2450///    If \a __count is greater than 63, the returned result is all zeroes.
2451///
2452/// \headerfile <immintrin.h>
2453///
2454/// This intrinsic corresponds to the \c VPSRLQ instruction.
2455///
2456/// \param __a
2457///    A 256-bit vector of [4 x i64] to be shifted.
2458/// \param __count
2459///    An unsigned integer value specifying the shift count (in bits).
2460/// \returns A 256-bit vector of [4 x i64] containing the result.
2461static __inline__ __m256i __DEFAULT_FN_ATTRS256
2462_mm256_srli_epi64(__m256i __a, int __count)
2463{
2464  return __builtin_ia32_psrlqi256((__v4di)__a, __count);
2465}
2466
2467/// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __a
2468///    right by the number of bits given in the lower 64 bits of \a __count,
2469///    shifting in zero bits, and returns the result. If \a __count is greater
2470///    than 63, the returned result is all zeroes.
2471///
2472/// \headerfile <immintrin.h>
2473///
2474/// This intrinsic corresponds to the \c VPSRLQ instruction.
2475///
2476/// \param __a
2477///    A 256-bit vector of [4 x i64] to be shifted.
2478/// \param __count
2479///    A 128-bit vector of [2 x i64] whose lower element gives the unsigned
2480///    shift count (in bits). The upper element is ignored.
2481/// \returns A 256-bit vector of [4 x i64] containing the result.
2482static __inline__ __m256i __DEFAULT_FN_ATTRS256
2483_mm256_srl_epi64(__m256i __a, __m128i __count)
2484{
2485  return __builtin_ia32_psrlq256((__v4di)__a, __count);
2486}
2487
2488/// Subtracts 8-bit integers from corresponding bytes of two 256-bit integer
2489///    vectors. Returns the lower 8 bits of each difference in the
2490///    corresponding byte of the 256-bit integer vector result (overflow is
2491///    ignored).
2492///
2493/// \code{.operation}
2494/// FOR i := 0 TO 31
2495///   j := i*8
2496///   result[j+7:j] := __a[j+7:j] - __b[j+7:j]
2497/// ENDFOR
2498/// \endcode
2499///
2500/// \headerfile <immintrin.h>
2501///
2502/// This intrinsic corresponds to the \c VPSUBB instruction.
2503///
2504/// \param __a
2505///    A 256-bit integer vector containing the minuends.
2506/// \param __b
2507///    A 256-bit integer vector containing the subtrahends.
2508/// \returns A 256-bit integer vector containing the differences.
2509static __inline__ __m256i __DEFAULT_FN_ATTRS256
2510_mm256_sub_epi8(__m256i __a, __m256i __b)
2511{
2512  return (__m256i)((__v32qu)__a - (__v32qu)__b);
2513}
2514
2515/// Subtracts 16-bit integers from corresponding elements of two 256-bit
2516///    vectors of [16 x i16]. Returns the lower 16 bits of each difference in
2517///    the corresponding element of the [16 x i16] result (overflow is
2518///    ignored).
2519///
2520/// \code{.operation}
2521/// FOR i := 0 TO 15
2522///   j := i*16
2523///   result[j+15:j] := __a[j+15:j] - __b[j+15:j]
2524/// ENDFOR
2525/// \endcode
2526///
2527/// \headerfile <immintrin.h>
2528///
2529/// This intrinsic corresponds to the \c VPSUBW instruction.
2530///
2531/// \param __a
2532///    A 256-bit vector of [16 x i16] containing the minuends.
2533/// \param __b
2534///    A 256-bit vector of [16 x i16] containing the subtrahends.
2535/// \returns A 256-bit vector of [16 x i16] containing the differences.
2536static __inline__ __m256i __DEFAULT_FN_ATTRS256
2537_mm256_sub_epi16(__m256i __a, __m256i __b)
2538{
2539  return (__m256i)((__v16hu)__a - (__v16hu)__b);
2540}
2541
2542/// Subtracts 32-bit integers from corresponding elements of two 256-bit
2543///    vectors of [8 x i32]. Returns the lower 32 bits of each difference in
2544///    the corresponding element of the [8 x i32] result (overflow is ignored).
2545///
2546/// \code{.operation}
2547/// FOR i := 0 TO 7
2548///   j := i*32
2549///   result[j+31:j] := __a[j+31:j] - __b[j+31:j]
2550/// ENDFOR
2551/// \endcode
2552///
2553/// \headerfile <immintrin.h>
2554///
2555/// This intrinsic corresponds to the \c VPSUBD instruction.
2556///
2557/// \param __a
2558///    A 256-bit vector of [8 x i32] containing the minuends.
2559/// \param __b
2560///    A 256-bit vector of [8 x i32] containing the subtrahends.
2561/// \returns A 256-bit vector of [8 x i32] containing the differences.
2562static __inline__ __m256i __DEFAULT_FN_ATTRS256
2563_mm256_sub_epi32(__m256i __a, __m256i __b)
2564{
2565  return (__m256i)((__v8su)__a - (__v8su)__b);
2566}
2567
2568/// Subtracts 64-bit integers from corresponding elements of two 256-bit
2569///    vectors of [4 x i64]. Returns the lower 64 bits of each difference in
2570///    the corresponding element of the [4 x i64] result (overflow is ignored).
2571///
2572/// \code{.operation}
2573/// FOR i := 0 TO 3
2574///   j := i*64
2575///   result[j+63:j] := __a[j+63:j] - __b[j+63:j]
2576/// ENDFOR
2577/// \endcode
2578///
2579/// \headerfile <immintrin.h>
2580///
2581/// This intrinsic corresponds to the \c VPSUBQ instruction.
2582///
2583/// \param __a
2584///    A 256-bit vector of [4 x i64] containing the minuends.
2585/// \param __b
2586///    A 256-bit vector of [4 x i64] containing the subtrahends.
2587/// \returns A 256-bit vector of [4 x i64] containing the differences.
2588static __inline__ __m256i __DEFAULT_FN_ATTRS256
2589_mm256_sub_epi64(__m256i __a, __m256i __b)
2590{
2591  return (__m256i)((__v4du)__a - (__v4du)__b);
2592}
2593
2594/// Subtracts 8-bit integers from corresponding bytes of two 256-bit integer
2595///    vectors using signed saturation, and returns each differences in the
2596///    corresponding byte of the 256-bit integer vector result.
2597///
2598/// \code{.operation}
2599/// FOR i := 0 TO 31
2600///   j := i*8
2601///   result[j+7:j] := SATURATE8(__a[j+7:j] - __b[j+7:j])
2602/// ENDFOR
2603/// \endcode
2604///
2605/// \headerfile <immintrin.h>
2606///
2607/// This intrinsic corresponds to the \c VPSUBSB instruction.
2608///
2609/// \param __a
2610///    A 256-bit integer vector containing the minuends.
2611/// \param __b
2612///    A 256-bit integer vector containing the subtrahends.
2613/// \returns A 256-bit integer vector containing the differences.
2614static __inline__ __m256i __DEFAULT_FN_ATTRS256
2615_mm256_subs_epi8(__m256i __a, __m256i __b)
2616{
2617  return (__m256i)__builtin_elementwise_sub_sat((__v32qs)__a, (__v32qs)__b);
2618}
2619
2620/// Subtracts 16-bit integers from corresponding elements of two 256-bit
2621///    vectors of [16 x i16] using signed saturation, and returns each
2622///    difference in the corresponding element of the [16 x i16] result.
2623///
2624/// \code{.operation}
2625/// FOR i := 0 TO 15
2626///   j := i*16
2627///   result[j+7:j] := SATURATE16(__a[j+7:j] - __b[j+7:j])
2628/// ENDFOR
2629/// \endcode
2630///
2631/// \headerfile <immintrin.h>
2632///
2633/// This intrinsic corresponds to the \c VPSUBSW instruction.
2634///
2635/// \param __a
2636///    A 256-bit vector of [16 x i16] containing the minuends.
2637/// \param __b
2638///    A 256-bit vector of [16 x i16] containing the subtrahends.
2639/// \returns A 256-bit vector of [16 x i16] containing the differences.
2640static __inline__ __m256i __DEFAULT_FN_ATTRS256
2641_mm256_subs_epi16(__m256i __a, __m256i __b)
2642{
2643  return (__m256i)__builtin_elementwise_sub_sat((__v16hi)__a, (__v16hi)__b);
2644}
2645
2646/// Subtracts 8-bit integers from corresponding bytes of two 256-bit integer
2647///    vectors using unsigned saturation, and returns each difference in the
2648///    corresponding byte of the 256-bit integer vector result. For each byte,
2649///    computes <c> result = __a - __b </c>.
2650///
2651/// \code{.operation}
2652/// FOR i := 0 TO 31
2653///   j := i*8
2654///   result[j+7:j] := SATURATE8U(__a[j+7:j] - __b[j+7:j])
2655/// ENDFOR
2656/// \endcode
2657///
2658/// \headerfile <immintrin.h>
2659///
2660/// This intrinsic corresponds to the \c VPSUBUSB instruction.
2661///
2662/// \param __a
2663///    A 256-bit integer vector containing the minuends.
2664/// \param __b
2665///    A 256-bit integer vector containing the subtrahends.
2666/// \returns A 256-bit integer vector containing the differences.
2667static __inline__ __m256i __DEFAULT_FN_ATTRS256
2668_mm256_subs_epu8(__m256i __a, __m256i __b)
2669{
2670  return (__m256i)__builtin_elementwise_sub_sat((__v32qu)__a, (__v32qu)__b);
2671}
2672
2673/// Subtracts 16-bit integers from corresponding elements of two 256-bit
2674///    vectors of [16 x i16] using unsigned saturation, and returns each
2675///    difference in the corresponding element of the [16 x i16] result.
2676///
2677/// \code{.operation}
2678/// FOR i := 0 TO 15
2679///   j := i*16
2680///   result[j+15:j] := SATURATE16U(__a[j+15:j] - __b[j+15:j])
2681/// ENDFOR
2682/// \endcode
2683///
2684/// \headerfile <immintrin.h>
2685///
2686/// This intrinsic corresponds to the \c VPSUBUSW instruction.
2687///
2688/// \param __a
2689///    A 256-bit vector of [16 x i16] containing the minuends.
2690/// \param __b
2691///    A 256-bit vector of [16 x i16] containing the subtrahends.
2692/// \returns A 256-bit vector of [16 x i16] containing the differences.
2693static __inline__ __m256i __DEFAULT_FN_ATTRS256
2694_mm256_subs_epu16(__m256i __a, __m256i __b)
2695{
2696  return (__m256i)__builtin_elementwise_sub_sat((__v16hu)__a, (__v16hu)__b);
2697}
2698
2699/// Unpacks and interleaves 8-bit integers from parts of the 256-bit integer
2700///    vectors in \a __a and \a __b to form the 256-bit result. Specifically,
2701///    uses the upper 64 bits of each 128-bit half of \a __a and \a __b as
2702///    input; other bits in these parameters are ignored.
2703///
2704/// \code{.operation}
2705/// result[7:0] := __a[71:64]
2706/// result[15:8] := __b[71:64]
2707/// result[23:16] := __a[79:72]
2708/// result[31:24] := __b[79:72]
2709/// . . .
2710/// result[127:120] := __b[127:120]
2711/// result[135:128] := __a[199:192]
2712/// . . .
2713/// result[255:248] := __b[255:248]
2714/// \endcode
2715///
2716/// \headerfile <immintrin.h>
2717///
2718/// This intrinsic corresponds to the \c VPUNPCKHBW instruction.
2719///
2720/// \param __a
2721///    A 256-bit integer vector used as the source for the even-numbered bytes
2722///    of the result.
2723/// \param __b
2724///    A 256-bit integer vector used as the source for the odd-numbered bytes
2725///    of the result.
2726/// \returns A 256-bit integer vector containing the result.
2727static __inline__ __m256i __DEFAULT_FN_ATTRS256
2728_mm256_unpackhi_epi8(__m256i __a, __m256i __b)
2729{
2730  return (__m256i)__builtin_shufflevector((__v32qi)__a, (__v32qi)__b, 8, 32+8, 9, 32+9, 10, 32+10, 11, 32+11, 12, 32+12, 13, 32+13, 14, 32+14, 15, 32+15, 24, 32+24, 25, 32+25, 26, 32+26, 27, 32+27, 28, 32+28, 29, 32+29, 30, 32+30, 31, 32+31);
2731}
2732
2733/// Unpacks and interleaves 16-bit integers from parts of the 256-bit vectors
2734///    of [16 x i16] in \a __a and \a __b to return the resulting 256-bit
2735///    vector of [16 x i16]. Specifically, uses the upper 64 bits of each
2736///    128-bit half of \a __a and \a __b as input; other bits in these
2737///    parameters are ignored.
2738///
2739/// \code{.operation}
2740/// result[15:0] := __a[79:64]
2741/// result[31:16] := __b[79:64]
2742/// result[47:32] := __a[95:80]
2743/// result[63:48] := __b[95:80]
2744/// . . .
2745/// result[127:112] := __b[127:112]
2746/// result[143:128] := __a[211:196]
2747/// . . .
2748/// result[255:240] := __b[255:240]
2749/// \endcode
2750///
2751/// \headerfile <immintrin.h>
2752///
2753/// This intrinsic corresponds to the \c VPUNPCKHWD instruction.
2754///
2755/// \param __a
2756///    A 256-bit vector of [16 x i16] used as the source for the even-numbered
2757///    elements of the result.
2758/// \param __b
2759///    A 256-bit vector of [16 x i16] used as the source for the odd-numbered
2760///    elements of the result.
2761/// \returns A 256-bit vector of [16 x i16] containing the result.
2762static __inline__ __m256i __DEFAULT_FN_ATTRS256
2763_mm256_unpackhi_epi16(__m256i __a, __m256i __b)
2764{
2765  return (__m256i)__builtin_shufflevector((__v16hi)__a, (__v16hi)__b, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15);
2766}
2767
2768/// Unpacks and interleaves 32-bit integers from parts of the 256-bit vectors
2769///    of [8 x i32] in \a __a and \a __b to return the resulting 256-bit vector
2770///    of [8 x i32]. Specifically, uses the upper 64 bits of each 128-bit half
2771///    of \a __a and \a __b as input; other bits in these parameters are
2772///    ignored.
2773///
2774/// \code{.operation}
2775/// result[31:0] := __a[95:64]
2776/// result[63:32] := __b[95:64]
2777/// result[95:64] := __a[127:96]
2778/// result[127:96] := __b[127:96]
2779/// result[159:128] := __a[223:192]
2780/// result[191:160] := __b[223:192]
2781/// result[223:192] := __a[255:224]
2782/// result[255:224] := __b[255:224]
2783/// \endcode
2784///
2785/// \headerfile <immintrin.h>
2786///
2787/// This intrinsic corresponds to the \c VPUNPCKHDQ instruction.
2788///
2789/// \param __a
2790///    A 256-bit vector of [8 x i32] used as the source for the even-numbered
2791///    elements of the result.
2792/// \param __b
2793///    A 256-bit vector of [8 x i32] used as the source for the odd-numbered
2794///    elements of the result.
2795/// \returns A 256-bit vector of [8 x i32] containing the result.
2796static __inline__ __m256i __DEFAULT_FN_ATTRS256
2797_mm256_unpackhi_epi32(__m256i __a, __m256i __b)
2798{
2799  return (__m256i)__builtin_shufflevector((__v8si)__a, (__v8si)__b, 2, 8+2, 3, 8+3, 6, 8+6, 7, 8+7);
2800}
2801
2802/// Unpacks and interleaves 64-bit integers from parts of the 256-bit vectors
2803///    of [4 x i64] in \a __a and \a __b to return the resulting 256-bit vector
2804///    of [4 x i64]. Specifically, uses the upper 64 bits of each 128-bit half
2805///    of \a __a and \a __b as input; other bits in these parameters are
2806///    ignored.
2807///
2808/// \code{.operation}
2809/// result[63:0] := __a[127:64]
2810/// result[127:64] := __b[127:64]
2811/// result[191:128] := __a[255:192]
2812/// result[255:192] := __b[255:192]
2813/// \endcode
2814///
2815/// \headerfile <immintrin.h>
2816///
2817/// This intrinsic corresponds to the \c VPUNPCKHQDQ instruction.
2818///
2819/// \param __a
2820///    A 256-bit vector of [4 x i64] used as the source for the even-numbered
2821///    elements of the result.
2822/// \param __b
2823///    A 256-bit vector of [4 x i64] used as the source for the odd-numbered
2824///    elements of the result.
2825/// \returns A 256-bit vector of [4 x i64] containing the result.
2826static __inline__ __m256i __DEFAULT_FN_ATTRS256
2827_mm256_unpackhi_epi64(__m256i __a, __m256i __b)
2828{
2829  return (__m256i)__builtin_shufflevector((__v4di)__a, (__v4di)__b, 1, 4+1, 3, 4+3);
2830}
2831
2832/// Unpacks and interleaves 8-bit integers from parts of the 256-bit integer
2833///    vectors in \a __a and \a __b to form the 256-bit result. Specifically,
2834///    uses the lower 64 bits of each 128-bit half of \a __a and \a __b as
2835///    input; other bits in these parameters are ignored.
2836///
2837/// \code{.operation}
2838/// result[7:0] := __a[7:0]
2839/// result[15:8] := __b[7:0]
2840/// result[23:16] := __a[15:8]
2841/// result[31:24] := __b[15:8]
2842/// . . .
2843/// result[127:120] := __b[63:56]
2844/// result[135:128] := __a[135:128]
2845/// . . .
2846/// result[255:248] := __b[191:184]
2847/// \endcode
2848///
2849/// \headerfile <immintrin.h>
2850///
2851/// This intrinsic corresponds to the \c VPUNPCKLBW instruction.
2852///
2853/// \param __a
2854///    A 256-bit integer vector used as the source for the even-numbered bytes
2855///    of the result.
2856/// \param __b
2857///    A 256-bit integer vector used as the source for the odd-numbered bytes
2858///    of the result.
2859/// \returns A 256-bit integer vector containing the result.
2860static __inline__ __m256i __DEFAULT_FN_ATTRS256
2861_mm256_unpacklo_epi8(__m256i __a, __m256i __b)
2862{
2863  return (__m256i)__builtin_shufflevector((__v32qi)__a, (__v32qi)__b, 0, 32+0, 1, 32+1, 2, 32+2, 3, 32+3, 4, 32+4, 5, 32+5, 6, 32+6, 7, 32+7, 16, 32+16, 17, 32+17, 18, 32+18, 19, 32+19, 20, 32+20, 21, 32+21, 22, 32+22, 23, 32+23);
2864}
2865
2866/// Unpacks and interleaves 16-bit integers from parts of the 256-bit vectors
2867///    of [16 x i16] in \a __a and \a __b to return the resulting 256-bit
2868///    vector of [16 x i16]. Specifically, uses the lower 64 bits of each
2869///    128-bit half of \a __a and \a __b as input; other bits in these
2870///    parameters are ignored.
2871///
2872/// \code{.operation}
2873/// result[15:0] := __a[15:0]
2874/// result[31:16] := __b[15:0]
2875/// result[47:32] := __a[31:16]
2876/// result[63:48] := __b[31:16]
2877/// . . .
2878/// result[127:112] := __b[63:48]
2879/// result[143:128] := __a[143:128]
2880/// . . .
2881/// result[255:239] := __b[191:176]
2882/// \endcode
2883///
2884/// \headerfile <immintrin.h>
2885///
2886/// This intrinsic corresponds to the \c VPUNPCKLWD instruction.
2887///
2888/// \param __a
2889///    A 256-bit vector of [16 x i16] used as the source for the even-numbered
2890///    elements of the result.
2891/// \param __b
2892///    A 256-bit vector of [16 x i16] used as the source for the odd-numbered
2893///    elements of the result.
2894/// \returns A 256-bit vector of [16 x i16] containing the result.
2895static __inline__ __m256i __DEFAULT_FN_ATTRS256
2896_mm256_unpacklo_epi16(__m256i __a, __m256i __b)
2897{
2898  return (__m256i)__builtin_shufflevector((__v16hi)__a, (__v16hi)__b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11);
2899}
2900
2901/// Unpacks and interleaves 32-bit integers from parts of the 256-bit vectors
2902///    of [8 x i32] in \a __a and \a __b to return the resulting 256-bit vector
2903///    of [8 x i32]. Specifically, uses the lower 64 bits of each 128-bit half
2904///    of \a __a and \a __b as input; other bits in these parameters are
2905///    ignored.
2906///
2907/// \code{.operation}
2908/// result[31:0] := __a[31:0]
2909/// result[63:32] := __b[31:0]
2910/// result[95:64] := __a[63:32]
2911/// result[127:96] := __b[63:32]
2912/// result[159:128] := __a[159:128]
2913/// result[191:160] := __b[159:128]
2914/// result[223:192] := __a[191:160]
2915/// result[255:224] := __b[191:190]
2916/// \endcode
2917///
2918/// \headerfile <immintrin.h>
2919///
2920/// This intrinsic corresponds to the \c VPUNPCKLDQ instruction.
2921///
2922/// \param __a
2923///    A 256-bit vector of [8 x i32] used as the source for the even-numbered
2924///    elements of the result.
2925/// \param __b
2926///    A 256-bit vector of [8 x i32] used as the source for the odd-numbered
2927///    elements of the result.
2928/// \returns A 256-bit vector of [8 x i32] containing the result.
2929static __inline__ __m256i __DEFAULT_FN_ATTRS256
2930_mm256_unpacklo_epi32(__m256i __a, __m256i __b)
2931{
2932  return (__m256i)__builtin_shufflevector((__v8si)__a, (__v8si)__b, 0, 8+0, 1, 8+1, 4, 8+4, 5, 8+5);
2933}
2934
2935/// Unpacks and interleaves 64-bit integers from parts of the 256-bit vectors
2936///    of [4 x i64] in \a __a and \a __b to return the resulting 256-bit vector
2937///    of [4 x i64]. Specifically, uses the lower 64 bits of each 128-bit half
2938///    of \a __a and \a __b as input; other bits in these parameters are
2939///    ignored.
2940///
2941/// \code{.operation}
2942/// result[63:0] := __a[63:0]
2943/// result[127:64] := __b[63:0]
2944/// result[191:128] := __a[191:128]
2945/// result[255:192] := __b[191:128]
2946/// \endcode
2947///
2948/// \headerfile <immintrin.h>
2949///
2950/// This intrinsic corresponds to the \c VPUNPCKLQDQ instruction.
2951///
2952/// \param __a
2953///    A 256-bit vector of [4 x i64] used as the source for the even-numbered
2954///    elements of the result.
2955/// \param __b
2956///    A 256-bit vector of [4 x i64] used as the source for the odd-numbered
2957///    elements of the result.
2958/// \returns A 256-bit vector of [4 x i64] containing the result.
2959static __inline__ __m256i __DEFAULT_FN_ATTRS256
2960_mm256_unpacklo_epi64(__m256i __a, __m256i __b)
2961{
2962  return (__m256i)__builtin_shufflevector((__v4di)__a, (__v4di)__b, 0, 4+0, 2, 4+2);
2963}
2964
2965/// Computes the bitwise XOR of the 256-bit integer vectors in \a __a and
2966///    \a __b.
2967///
2968/// \headerfile <immintrin.h>
2969///
2970/// This intrinsic corresponds to the \c VPXOR instruction.
2971///
2972/// \param __a
2973///    A 256-bit integer vector.
2974/// \param __b
2975///    A 256-bit integer vector.
2976/// \returns A 256-bit integer vector containing the result.
2977static __inline__ __m256i __DEFAULT_FN_ATTRS256
2978_mm256_xor_si256(__m256i __a, __m256i __b)
2979{
2980  return (__m256i)((__v4du)__a ^ (__v4du)__b);
2981}
2982
2983/// Loads the 256-bit integer vector from memory \a __V using a non-temporal
2984///   memory hint and returns the vector. \a __V must be aligned on a 32-byte
2985///   boundary.
2986///
2987/// \headerfile <immintrin.h>
2988///
2989/// This intrinsic corresponds to the \c VMOVNTDQA instruction.
2990///
2991/// \param __V
2992///    A pointer to the 32-byte aligned memory containing the vector to load.
2993/// \returns A 256-bit integer vector loaded from memory.
2994static __inline__ __m256i __DEFAULT_FN_ATTRS256
2995_mm256_stream_load_si256(const void *__V)
2996{
2997  typedef __v4di __v4di_aligned __attribute__((aligned(32)));
2998  return (__m256i)__builtin_nontemporal_load((const __v4di_aligned *)__V);
2999}
3000
3001/// Broadcasts the 32-bit floating-point value from the low element of the
3002///    128-bit vector of [4 x float] in \a __X to all elements of the result's
3003///    128-bit vector of [4 x float].
3004///
3005/// \headerfile <immintrin.h>
3006///
3007/// This intrinsic corresponds to the \c VBROADCASTSS instruction.
3008///
3009/// \param __X
3010///    A 128-bit vector of [4 x float] whose low element will be broadcast.
3011/// \returns A 128-bit vector of [4 x float] containing the result.
3012static __inline__ __m128 __DEFAULT_FN_ATTRS128
3013_mm_broadcastss_ps(__m128 __X)
3014{
3015  return (__m128)__builtin_shufflevector((__v4sf)__X, (__v4sf)__X, 0, 0, 0, 0);
3016}
3017
3018/// Broadcasts the 64-bit floating-point value from the low element of the
3019///    128-bit vector of [2 x double] in \a __a to both elements of the
3020///    result's 128-bit vector of [2 x double].
3021///
3022/// \headerfile <immintrin.h>
3023///
3024/// This intrinsic corresponds to the \c MOVDDUP instruction.
3025///
3026/// \param __a
3027///    A 128-bit vector of [2 x double] whose low element will be broadcast.
3028/// \returns A 128-bit vector of [2 x double] containing the result.
3029static __inline__ __m128d __DEFAULT_FN_ATTRS128
3030_mm_broadcastsd_pd(__m128d __a)
3031{
3032  return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0);
3033}
3034
3035/// Broadcasts the 32-bit floating-point value from the low element of the
3036///    128-bit vector of [4 x float] in \a __X to all elements of the
3037///    result's 256-bit vector of [8 x float].
3038///
3039/// \headerfile <immintrin.h>
3040///
3041/// This intrinsic corresponds to the \c VBROADCASTSS instruction.
3042///
3043/// \param __X
3044///    A 128-bit vector of [4 x float] whose low element will be broadcast.
3045/// \returns A 256-bit vector of [8 x float] containing the result.
3046static __inline__ __m256 __DEFAULT_FN_ATTRS256
3047_mm256_broadcastss_ps(__m128 __X)
3048{
3049  return (__m256)__builtin_shufflevector((__v4sf)__X, (__v4sf)__X, 0, 0, 0, 0, 0, 0, 0, 0);
3050}
3051
3052/// Broadcasts the 64-bit floating-point value from the low element of the
3053///    128-bit vector of [2 x double] in \a __X to all elements of the
3054///    result's 256-bit vector of [4 x double].
3055///
3056/// \headerfile <immintrin.h>
3057///
3058/// This intrinsic corresponds to the \c VBROADCASTSD instruction.
3059///
3060/// \param __X
3061///    A 128-bit vector of [2 x double] whose low element will be broadcast.
3062/// \returns A 256-bit vector of [4 x double] containing the result.
3063static __inline__ __m256d __DEFAULT_FN_ATTRS256
3064_mm256_broadcastsd_pd(__m128d __X)
3065{
3066  return (__m256d)__builtin_shufflevector((__v2df)__X, (__v2df)__X, 0, 0, 0, 0);
3067}
3068
3069/// Broadcasts the 128-bit integer data from \a __X to both the lower and
3070///    upper halves of the 256-bit result.
3071///
3072/// \headerfile <immintrin.h>
3073///
3074/// This intrinsic corresponds to the \c VBROADCASTI128 instruction.
3075///
3076/// \param __X
3077///    A 128-bit integer vector to be broadcast.
3078/// \returns A 256-bit integer vector containing the result.
3079static __inline__ __m256i __DEFAULT_FN_ATTRS256
3080_mm256_broadcastsi128_si256(__m128i __X)
3081{
3082  return (__m256i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 1, 0, 1);
3083}
3084
/* Alternate spelling: expands to _mm256_broadcastsi128_si256(X), passing the
   argument X through unchanged. */
#define _mm_broadcastsi128_si256(X) _mm256_broadcastsi128_si256(X)
3086
/// Merges 32-bit integer elements from either of the two 128-bit vectors of
///    [4 x i32] in \a V1 or \a V2 to the result's 128-bit vector of [4 x i32],
///    as specified by the immediate integer operand \a M.
///
/// \code{.operation}
/// FOR i := 0 TO 3
///   j := i*32
///   IF M[i] == 0
///     result[31+j:j] := V1[31+j:j]
///   ELSE
///     result[31+j:j] := V2[31+j:j]
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_blend_epi32(__m128i V1, __m128i V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VPBLENDD instruction.
///
/// \param V1
///    A 128-bit vector of [4 x i32] containing source values.
/// \param V2
///    A 128-bit vector of [4 x i32] containing source values.
/// \param M
///    An immediate 8-bit integer operand, with bits [3:0] specifying the
///    source for each element of the result. The position of the mask bit
///    corresponds to the index of a copied value. When a mask bit is 0, the
///    element is copied from \a V1; otherwise, it is copied from \a V2.
/// \returns A 128-bit vector of [4 x i32] containing the result.
#define _mm_blend_epi32(V1, V2, M) \
  ((__m128i)__builtin_ia32_pblendd128((__v4si)(__m128i)(V1), \
                                      (__v4si)(__m128i)(V2), (int)(M)))
3123
/// Merges 32-bit integer elements from either of the two 256-bit vectors of
///    [8 x i32] in \a V1 or \a V2 to return a 256-bit vector of [8 x i32],
///    as specified by the immediate integer operand \a M.
///
/// \code{.operation}
/// FOR i := 0 TO 7
///   j := i*32
///   IF M[i] == 0
///     result[31+j:j] := V1[31+j:j]
///   ELSE
///     result[31+j:j] := V2[31+j:j]
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_blend_epi32(__m256i V1, __m256i V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VPBLENDD instruction.
///
/// \param V1
///    A 256-bit vector of [8 x i32] containing source values.
/// \param V2
///    A 256-bit vector of [8 x i32] containing source values.
/// \param M
///    An immediate 8-bit integer operand, with bits [7:0] specifying the
///    source for each element of the result. The position of the mask bit
///    corresponds to the index of a copied value. When a mask bit is 0, the
///    element is copied from \a V1; otherwise, it is copied from \a V2.
/// \returns A 256-bit vector of [8 x i32] containing the result.
#define _mm256_blend_epi32(V1, V2, M) \
  ((__m256i)__builtin_ia32_pblendd256((__v8si)(__m256i)(V1), \
                                      (__v8si)(__m256i)(V2), (int)(M)))
3160
3161/// Broadcasts the low byte from the 128-bit integer vector in \a __X to all
3162///    bytes of the 256-bit result.
3163///
3164/// \headerfile <immintrin.h>
3165///
3166/// This intrinsic corresponds to the \c VPBROADCASTB instruction.
3167///
3168/// \param __X
3169///    A 128-bit integer vector whose low byte will be broadcast.
3170/// \returns A 256-bit integer vector containing the result.
3171static __inline__ __m256i __DEFAULT_FN_ATTRS256
3172_mm256_broadcastb_epi8(__m128i __X)
3173{
3174  return (__m256i)__builtin_shufflevector((__v16qi)__X, (__v16qi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
3175}
3176
3177/// Broadcasts the low element from the 128-bit vector of [8 x i16] in \a __X
3178///    to all elements of the result's 256-bit vector of [16 x i16].
3179///
3180/// \headerfile <immintrin.h>
3181///
3182/// This intrinsic corresponds to the \c VPBROADCASTW instruction.
3183///
3184/// \param __X
3185///    A 128-bit vector of [8 x i16] whose low element will be broadcast.
3186/// \returns A 256-bit vector of [16 x i16] containing the result.
3187static __inline__ __m256i __DEFAULT_FN_ATTRS256
3188_mm256_broadcastw_epi16(__m128i __X)
3189{
3190  return (__m256i)__builtin_shufflevector((__v8hi)__X, (__v8hi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
3191}
3192
3193/// Broadcasts the low element from the 128-bit vector of [4 x i32] in \a __X
3194///    to all elements of the result's 256-bit vector of [8 x i32].
3195///
3196/// \headerfile <immintrin.h>
3197///
3198/// This intrinsic corresponds to the \c VPBROADCASTD instruction.
3199///
3200/// \param __X
3201///    A 128-bit vector of [4 x i32] whose low element will be broadcast.
3202/// \returns A 256-bit vector of [8 x i32] containing the result.
3203static __inline__ __m256i __DEFAULT_FN_ATTRS256
3204_mm256_broadcastd_epi32(__m128i __X)
3205{
3206  return (__m256i)__builtin_shufflevector((__v4si)__X, (__v4si)__X, 0, 0, 0, 0, 0, 0, 0, 0);
3207}
3208
3209/// Broadcasts the low element from the 128-bit vector of [2 x i64] in \a __X
3210///    to all elements of the result's 256-bit vector of [4 x i64].
3211///
3212/// \headerfile <immintrin.h>
3213///
3214/// This intrinsic corresponds to the \c VPBROADCASTQ instruction.
3215///
3216/// \param __X
3217///    A 128-bit vector of [2 x i64] whose low element will be broadcast.
3218/// \returns A 256-bit vector of [4 x i64] containing the result.
3219static __inline__ __m256i __DEFAULT_FN_ATTRS256
3220_mm256_broadcastq_epi64(__m128i __X)
3221{
3222  return (__m256i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 0, 0, 0);
3223}
3224
3225/// Broadcasts the low byte from the 128-bit integer vector in \a __X to all
3226///    bytes of the 128-bit result.
3227///
3228/// \headerfile <immintrin.h>
3229///
3230/// This intrinsic corresponds to the \c VPBROADCASTB instruction.
3231///
3232/// \param __X
3233///    A 128-bit integer vector whose low byte will be broadcast.
3234/// \returns A 128-bit integer vector containing the result.
3235static __inline__ __m128i __DEFAULT_FN_ATTRS128
3236_mm_broadcastb_epi8(__m128i __X)
3237{
3238  return (__m128i)__builtin_shufflevector((__v16qi)__X, (__v16qi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
3239}
3240
3241/// Broadcasts the low element from the 128-bit vector of [8 x i16] in
3242///    \a __X to all elements of the result's 128-bit vector of [8 x i16].
3243///
3244/// \headerfile <immintrin.h>
3245///
3246/// This intrinsic corresponds to the \c VPBROADCASTW instruction.
3247///
3248/// \param __X
3249///    A 128-bit vector of [8 x i16] whose low element will be broadcast.
3250/// \returns A 128-bit vector of [8 x i16] containing the result.
3251static __inline__ __m128i __DEFAULT_FN_ATTRS128
3252_mm_broadcastw_epi16(__m128i __X)
3253{
3254  return (__m128i)__builtin_shufflevector((__v8hi)__X, (__v8hi)__X, 0, 0, 0, 0, 0, 0, 0, 0);
3255}
3256
3257/// Broadcasts the low element from the 128-bit vector of [4 x i32] in \a __X
3258///    to all elements of the result's vector of [4 x i32].
3259///
3260/// \headerfile <immintrin.h>
3261///
3262/// This intrinsic corresponds to the \c VPBROADCASTD instruction.
3263///
3264/// \param __X
3265///    A 128-bit vector of [4 x i32] whose low element will be broadcast.
3266/// \returns A 128-bit vector of [4 x i32] containing the result.
3267static __inline__ __m128i __DEFAULT_FN_ATTRS128
3268_mm_broadcastd_epi32(__m128i __X)
3269{
3270  return (__m128i)__builtin_shufflevector((__v4si)__X, (__v4si)__X, 0, 0, 0, 0);
3271}
3272
3273/// Broadcasts the low element from the 128-bit vector of [2 x i64] in \a __X
3274///    to both elements of the result's 128-bit vector of [2 x i64].
3275///
3276/// \headerfile <immintrin.h>
3277///
3278/// This intrinsic corresponds to the \c VPBROADCASTQ instruction.
3279///
3280/// \param __X
3281///    A 128-bit vector of [2 x i64] whose low element will be broadcast.
3282/// \returns A 128-bit vector of [2 x i64] containing the result.
3283static __inline__ __m128i __DEFAULT_FN_ATTRS128
3284_mm_broadcastq_epi64(__m128i __X)
3285{
3286  return (__m128i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 0);
3287}
3288
3289/// Sets the result's 256-bit vector of [8 x i32] to copies of elements of the
3290///    256-bit vector of [8 x i32] in \a __a as specified by indexes in the
3291///    elements of the 256-bit vector of [8 x i32] in \a __b.
3292///
3293/// \code{.operation}
3294/// FOR i := 0 TO 7
3295///   j := i*32
3296///   k := __b[j+2:j] * 32
3297///   result[j+31:j] := __a[k+31:k]
3298/// ENDFOR
3299/// \endcode
3300///
3301/// \headerfile <immintrin.h>
3302///
3303/// This intrinsic corresponds to the \c VPERMD instruction.
3304///
3305/// \param __a
3306///    A 256-bit vector of [8 x i32] containing the source values.
3307/// \param __b
3308///    A 256-bit vector of [8 x i32] containing indexes of values to use from
3309///    \a __a.
3310/// \returns A 256-bit vector of [8 x i32] containing the result.
3311static __inline__ __m256i __DEFAULT_FN_ATTRS256
3312_mm256_permutevar8x32_epi32(__m256i __a, __m256i __b)
3313{
3314  return (__m256i)__builtin_ia32_permvarsi256((__v8si)__a, (__v8si)__b);
3315}
3316
/// Permutes the elements of the 256-bit vector of [4 x double] in \a V. Each
///    element of the result is a copy of the element of \a V selected by a
///    2-bit field of the immediate value \a M.
///
/// \code{.operation}
/// FOR i := 0 TO 3
///   j := i*64
///   k := (M >> i*2)[1:0] * 64
///   result[j+63:j] := V[k+63:k]
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256d _mm256_permute4x64_pd(__m256d V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VPERMPD instruction.
///
/// \param V
///    A 256-bit vector of [4 x double] supplying the source values.
/// \param M
///    An immediate 8-bit value selecting the elements to copy from \a V.
///    \a M[1:0] is the index in \a V for element 0 of the result,
///    \a M[3:2] is the index for element 1, and so forth.
/// \returns A 256-bit vector of [4 x double] containing the permuted values.
#define _mm256_permute4x64_pd(V, M)                                            \
  ((__m256d)__builtin_ia32_permdf256(                                          \
      (__v4df)(__m256d)(V), (int)(M)))
3346
3347/// Sets the result's 256-bit vector of [8 x float] to copies of elements of
3348///    the 256-bit vector of [8 x float] in \a __a as specified by indexes in
3349///    the elements of the 256-bit vector of [8 x i32] in \a __b.
3350///
3351/// \code{.operation}
3352/// FOR i := 0 TO 7
3353///   j := i*32
3354///   k := __b[j+2:j] * 32
3355///   result[j+31:j] := __a[k+31:k]
3356/// ENDFOR
3357/// \endcode
3358///
3359/// \headerfile <immintrin.h>
3360///
3361/// This intrinsic corresponds to the \c VPERMPS instruction.
3362///
3363/// \param __a
3364///    A 256-bit vector of [8 x float] containing the source values.
3365/// \param __b
3366///    A 256-bit vector of [8 x i32] containing indexes of values to use from
3367///    \a __a.
3368/// \returns A 256-bit vector of [8 x float] containing the result.
3369static __inline__ __m256 __DEFAULT_FN_ATTRS256
3370_mm256_permutevar8x32_ps(__m256 __a, __m256i __b)
3371{
3372  return (__m256)__builtin_ia32_permvarsf256((__v8sf)__a, (__v8si)__b);
3373}
3374
/// Permutes the elements of the 256-bit vector of [4 x i64] in \a V. Each
///    element of the result is a copy of the element of \a V selected by a
///    2-bit field of the immediate value \a M.
///
/// \code{.operation}
/// FOR i := 0 TO 3
///   j := i*64
///   k := (M >> i*2)[1:0] * 64
///   result[j+63:j] := V[k+63:k]
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_permute4x64_epi64(__m256i V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VPERMQ instruction.
///
/// \param V
///    A 256-bit vector of [4 x i64] supplying the source values.
/// \param M
///    An immediate 8-bit value selecting the elements to copy from \a V.
///    \a M[1:0] is the index in \a V for element 0 of the result,
///    \a M[3:2] is the index for element 1, and so forth.
/// \returns A 256-bit vector of [4 x i64] containing the permuted values.
#define _mm256_permute4x64_epi64(V, M)                                         \
  ((__m256i)__builtin_ia32_permdi256(                                          \
      (__v4di)(__m256i)(V), (int)(M)))
3404
/// Builds a 256-bit result from two 128-bit halves. Each half is either
///    zero or one of the four 128-bit halves of the 256-bit vectors \a V1
///    and \a V2, chosen by the immediate value \a M.
///
/// \code{.operation}
/// FOR i := 0 TO 1
///   j := i*128
///   k := M >> (i*4)
///   IF k[3] == 0
///     CASE (k[1:0]) OF
///     0: result[127+j:j] := V1[127:0]
///     1: result[127+j:j] := V1[255:128]
///     2: result[127+j:j] := V2[127:0]
///     3: result[127+j:j] := V2[255:128]
///     ESAC
///   ELSE
///     result[127+j:j] := 0
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_permute2x128_si256(__m256i V1, __m256i V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VPERM2I128 instruction.
///
/// \param V1
///    A 256-bit integer vector supplying source values.
/// \param V2
///    A 256-bit integer vector supplying source values.
/// \param M
///    An immediate value describing how to form the result. Bits [3:0]
///    control the lower half of the result and bits [7:4] control the upper
///    half. Within each 4-bit control value, if bit 3 is 1 that half of the
///    result is zero; otherwise bits [1:0] choose the source as follows. \n
///    0: the lower half of \a V1 \n
///    1: the upper half of \a V1 \n
///    2: the lower half of \a V2 \n
///    3: the upper half of \a V2
/// \returns A 256-bit integer vector containing the result.
#define _mm256_permute2x128_si256(V1, V2, M)                                   \
  ((__m256i)__builtin_ia32_permti256((__m256i)(V1), (__m256i)(V2),             \
                                     (int)(M)))
3450
/// Returns one 128-bit half of the 256-bit vector \a V. If bit 0 of the
///     immediate \a M is zero, the lower half of \a V is extracted;
///     otherwise, the upper half is extracted.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm256_extracti128_si256(__m256i V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VEXTRACTI128 instruction.
///
/// \param V
///    A 256-bit integer vector supplying the source values.
/// \param M
///    An immediate value selecting which half of \a V to extract.
/// \returns A 128-bit integer vector containing the extracted half.
#define _mm256_extracti128_si256(V, M)                                         \
  ((__m128i)__builtin_ia32_extract128i256((__v4di)(__m256i)(V),                \
                                          (int)(M)))
3470
/// Produces a copy of the 256-bit vector \a V1 in which one 128-bit half has
///     been replaced by the 128-bit vector \a V2. If bit 0 of the immediate
///     \a M is zero, the lower half is replaced; otherwise, the upper half
///     is replaced.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_inserti128_si256(__m256i V1, __m128i V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the \c VINSERTI128 instruction.
///
/// \param V1
///    A 256-bit integer vector supplying the half that is kept.
/// \param V2
///    A 128-bit integer vector supplying the half that is inserted.
/// \param M
///    An immediate value selecting where \a V2 is placed in the result.
/// \returns A 256-bit integer vector containing the result.
#define _mm256_inserti128_si256(V1, V2, M)                                     \
  ((__m256i)__builtin_ia32_insert128i256(                                      \
      (__v4di)(__m256i)(V1), (__v2di)(__m128i)(V2), (int)(M)))
3494
3495/// Conditionally loads eight 32-bit integer elements from memory \a __X, if
3496///    the most significant bit of the corresponding element in the mask
3497///    \a __M is set; otherwise, sets that element of the result to zero.
3498///    Returns the 256-bit [8 x i32] result.
3499///
3500/// \code{.operation}
3501/// FOR i := 0 TO 7
3502///   j := i*32
3503///   IF __M[j+31] == 1
3504///     result[j+31:j] := Load32(__X+(i*4))
3505///   ELSE
3506///     result[j+31:j] := 0
3507///   FI
3508/// ENDFOR
3509/// \endcode
3510///
3511/// \headerfile <immintrin.h>
3512///
3513/// This intrinsic corresponds to the \c VPMASKMOVD instruction.
3514///
3515/// \param __X
3516///    A pointer to the memory used for loading values.
3517/// \param __M
3518///    A 256-bit vector of [8 x i32] containing the mask bits.
3519/// \returns A 256-bit vector of [8 x i32] containing the loaded or zeroed
3520///    elements.
3521static __inline__ __m256i __DEFAULT_FN_ATTRS256
3522_mm256_maskload_epi32(int const *__X, __m256i __M)
3523{
3524  return (__m256i)__builtin_ia32_maskloadd256((const __v8si *)__X, (__v8si)__M);
3525}
3526
3527/// Conditionally loads four 64-bit integer elements from memory \a __X, if
3528///    the most significant bit of the corresponding element in the mask
3529///    \a __M is set; otherwise, sets that element of the result to zero.
3530///    Returns the 256-bit [4 x i64] result.
3531///
3532/// \code{.operation}
3533/// FOR i := 0 TO 3
3534///   j := i*64
3535///   IF __M[j+63] == 1
3536///     result[j+63:j] := Load64(__X+(i*8))
3537///   ELSE
3538///     result[j+63:j] := 0
3539///   FI
3540/// ENDFOR
3541/// \endcode
3542///
3543/// \headerfile <immintrin.h>
3544///
3545/// This intrinsic corresponds to the \c VPMASKMOVQ instruction.
3546///
3547/// \param __X
3548///    A pointer to the memory used for loading values.
3549/// \param __M
3550///    A 256-bit vector of [4 x i64] containing the mask bits.
3551/// \returns A 256-bit vector of [4 x i64] containing the loaded or zeroed
3552///    elements.
3553static __inline__ __m256i __DEFAULT_FN_ATTRS256
3554_mm256_maskload_epi64(long long const *__X, __m256i __M)
3555{
3556  return (__m256i)__builtin_ia32_maskloadq256((const __v4di *)__X, (__v4di)__M);
3557}
3558
3559/// Conditionally loads four 32-bit integer elements from memory \a __X, if
3560///    the most significant bit of the corresponding element in the mask
3561///    \a __M is set; otherwise, sets that element of the result to zero.
3562///    Returns the 128-bit [4 x i32] result.
3563///
3564/// \code{.operation}
3565/// FOR i := 0 TO 3
3566///   j := i*32
3567///   IF __M[j+31] == 1
3568///     result[j+31:j] := Load32(__X+(i*4))
3569///   ELSE
3570///     result[j+31:j] := 0
3571///   FI
3572/// ENDFOR
3573/// \endcode
3574///
3575/// \headerfile <immintrin.h>
3576///
3577/// This intrinsic corresponds to the \c VPMASKMOVD instruction.
3578///
3579/// \param __X
3580///    A pointer to the memory used for loading values.
3581/// \param __M
3582///    A 128-bit vector of [4 x i32] containing the mask bits.
3583/// \returns A 128-bit vector of [4 x i32] containing the loaded or zeroed
3584///    elements.
3585static __inline__ __m128i __DEFAULT_FN_ATTRS128
3586_mm_maskload_epi32(int const *__X, __m128i __M)
3587{
3588  return (__m128i)__builtin_ia32_maskloadd((const __v4si *)__X, (__v4si)__M);
3589}
3590
3591/// Conditionally loads two 64-bit integer elements from memory \a __X, if
3592///    the most significant bit of the corresponding element in the mask
3593///    \a __M is set; otherwise, sets that element of the result to zero.
3594///    Returns the 128-bit [2 x i64] result.
3595///
3596/// \code{.operation}
3597/// FOR i := 0 TO 1
3598///   j := i*64
3599///   IF __M[j+63] == 1
3600///     result[j+63:j] := Load64(__X+(i*8))
3601///   ELSE
3602///     result[j+63:j] := 0
3603///   FI
3604/// ENDFOR
3605/// \endcode
3606///
3607/// \headerfile <immintrin.h>
3608///
3609/// This intrinsic corresponds to the \c VPMASKMOVQ instruction.
3610///
3611/// \param __X
3612///    A pointer to the memory used for loading values.
3613/// \param __M
3614///    A 128-bit vector of [2 x i64] containing the mask bits.
3615/// \returns A 128-bit vector of [2 x i64] containing the loaded or zeroed
3616///    elements.
3617static __inline__ __m128i __DEFAULT_FN_ATTRS128
3618_mm_maskload_epi64(long long const *__X, __m128i __M)
3619{
3620  return (__m128i)__builtin_ia32_maskloadq((const __v2di *)__X, (__v2di)__M);
3621}
3622
3623/// Conditionally stores eight 32-bit integer elements from the 256-bit vector
3624///    of [8 x i32] in \a __Y to memory \a __X, if the most significant bit of
3625///    the corresponding element in the mask \a __M is set; otherwise, the
3626///    memory element is unchanged.
3627///
3628/// \code{.operation}
3629/// FOR i := 0 TO 7
3630///   j := i*32
3631///   IF __M[j+31] == 1
3632///     Store32(__X+(i*4), __Y[j+31:j])
3633///   FI
3634/// ENDFOR
3635/// \endcode
3636///
3637/// \headerfile <immintrin.h>
3638///
3639/// This intrinsic corresponds to the \c VPMASKMOVD instruction.
3640///
3641/// \param __X
3642///    A pointer to the memory used for storing values.
3643/// \param __M
3644///    A 256-bit vector of [8 x i32] containing the mask bits.
3645/// \param __Y
3646///    A 256-bit vector of [8 x i32] containing the values to store.
3647static __inline__ void __DEFAULT_FN_ATTRS256
3648_mm256_maskstore_epi32(int *__X, __m256i __M, __m256i __Y)
3649{
3650  __builtin_ia32_maskstored256((__v8si *)__X, (__v8si)__M, (__v8si)__Y);
3651}
3652
3653/// Conditionally stores four 64-bit integer elements from the 256-bit vector
3654///    of [4 x i64] in \a __Y to memory \a __X, if the most significant bit of
3655///    the corresponding element in the mask \a __M is set; otherwise, the
3656///    memory element is unchanged.
3657///
3658/// \code{.operation}
3659/// FOR i := 0 TO 3
3660///   j := i*64
3661///   IF __M[j+63] == 1
3662///     Store64(__X+(i*8), __Y[j+63:j])
3663///   FI
3664/// ENDFOR
3665/// \endcode
3666///
3667/// \headerfile <immintrin.h>
3668///
3669/// This intrinsic corresponds to the \c VPMASKMOVQ instruction.
3670///
3671/// \param __X
3672///    A pointer to the memory used for storing values.
3673/// \param __M
3674///    A 256-bit vector of [4 x i64] containing the mask bits.
3675/// \param __Y
3676///    A 256-bit vector of [4 x i64] containing the values to store.
3677static __inline__ void __DEFAULT_FN_ATTRS256
3678_mm256_maskstore_epi64(long long *__X, __m256i __M, __m256i __Y)
3679{
3680  __builtin_ia32_maskstoreq256((__v4di *)__X, (__v4di)__M, (__v4di)__Y);
3681}
3682
3683/// Conditionally stores four 32-bit integer elements from the 128-bit vector
3684///    of [4 x i32] in \a __Y to memory \a __X, if the most significant bit of
3685///    the corresponding element in the mask \a __M is set; otherwise, the
3686///    memory element is unchanged.
3687///
3688/// \code{.operation}
3689/// FOR i := 0 TO 3
3690///   j := i*32
3691///   IF __M[j+31] == 1
3692///     Store32(__X+(i*4), __Y[j+31:j])
3693///   FI
3694/// ENDFOR
3695/// \endcode
3696///
3697/// \headerfile <immintrin.h>
3698///
3699/// This intrinsic corresponds to the \c VPMASKMOVD instruction.
3700///
3701/// \param __X
3702///    A pointer to the memory used for storing values.
3703/// \param __M
3704///    A 128-bit vector of [4 x i32] containing the mask bits.
3705/// \param __Y
3706///    A 128-bit vector of [4 x i32] containing the values to store.
3707static __inline__ void __DEFAULT_FN_ATTRS128
3708_mm_maskstore_epi32(int *__X, __m128i __M, __m128i __Y)
3709{
3710  __builtin_ia32_maskstored((__v4si *)__X, (__v4si)__M, (__v4si)__Y);
3711}
3712
3713/// Conditionally stores two 64-bit integer elements from the 128-bit vector
3714///    of [2 x i64] in \a __Y to memory \a __X, if the most significant bit of
3715///    the corresponding element in the mask \a __M is set; otherwise, the
3716///    memory element is unchanged.
3717///
3718/// \code{.operation}
3719/// FOR i := 0 TO 1
3720///   j := i*64
3721///   IF __M[j+63] == 1
3722///     Store64(__X+(i*8), __Y[j+63:j])
3723///   FI
3724/// ENDFOR
3725/// \endcode
3726///
3727/// \headerfile <immintrin.h>
3728///
3729/// This intrinsic corresponds to the \c VPMASKMOVQ instruction.
3730///
3731/// \param __X
3732///    A pointer to the memory used for storing values.
3733/// \param __M
3734///    A 128-bit vector of [2 x i64] containing the mask bits.
3735/// \param __Y
3736///    A 128-bit vector of [2 x i64] containing the values to store.
3737static __inline__ void __DEFAULT_FN_ATTRS128
3738_mm_maskstore_epi64(long long *__X, __m128i __M, __m128i __Y)
3739{
3740  __builtin_ia32_maskstoreq(( __v2di *)__X, (__v2di)__M, (__v2di)__Y);
3741}
3742
3743/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __X
3744///    left by the number of bits given in the corresponding element of the
3745///    256-bit vector of [8 x i32] in \a __Y, shifting in zero bits, and
3746///    returns the result. If the shift count for any element is greater than
3747///    31, the result for that element is zero.
3748///
3749/// \headerfile <immintrin.h>
3750///
3751/// This intrinsic corresponds to the \c VPSLLVD instruction.
3752///
3753/// \param __X
3754///    A 256-bit vector of [8 x i32] to be shifted.
3755/// \param __Y
3756///    A 256-bit vector of [8 x i32] containing the unsigned shift counts (in
3757///    bits).
3758/// \returns A 256-bit vector of [8 x i32] containing the result.
3759static __inline__ __m256i __DEFAULT_FN_ATTRS256
3760_mm256_sllv_epi32(__m256i __X, __m256i __Y)
3761{
3762  return (__m256i)__builtin_ia32_psllv8si((__v8si)__X, (__v8si)__Y);
3763}
3764
3765/// Shifts each 32-bit element of the 128-bit vector of [4 x i32] in \a __X
3766///    left by the number of bits given in the corresponding element of the
3767///    128-bit vector of [4 x i32] in \a __Y, shifting in zero bits, and
3768///    returns the result. If the shift count for any element is greater than
3769///    31, the result for that element is zero.
3770///
3771/// \headerfile <immintrin.h>
3772///
3773/// This intrinsic corresponds to the \c VPSLLVD instruction.
3774///
3775/// \param __X
3776///    A 128-bit vector of [4 x i32] to be shifted.
3777/// \param __Y
3778///    A 128-bit vector of [4 x i32] containing the unsigned shift counts (in
3779///    bits).
3780/// \returns A 128-bit vector of [4 x i32] containing the result.
3781static __inline__ __m128i __DEFAULT_FN_ATTRS128
3782_mm_sllv_epi32(__m128i __X, __m128i __Y)
3783{
3784  return (__m128i)__builtin_ia32_psllv4si((__v4si)__X, (__v4si)__Y);
3785}
3786
3787/// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __X
3788///    left by the number of bits given in the corresponding element of the
3789///    128-bit vector of [4 x i64] in \a __Y, shifting in zero bits, and
3790///    returns the result. If the shift count for any element is greater than
3791///    63, the result for that element is zero.
3792///
3793/// \headerfile <immintrin.h>
3794///
3795/// This intrinsic corresponds to the \c VPSLLVQ instruction.
3796///
3797/// \param __X
3798///    A 256-bit vector of [4 x i64] to be shifted.
3799/// \param __Y
3800///    A 256-bit vector of [4 x i64] containing the unsigned shift counts (in
3801///    bits).
3802/// \returns A 256-bit vector of [4 x i64] containing the result.
3803static __inline__ __m256i __DEFAULT_FN_ATTRS256
3804_mm256_sllv_epi64(__m256i __X, __m256i __Y)
3805{
3806  return (__m256i)__builtin_ia32_psllv4di((__v4di)__X, (__v4di)__Y);
3807}
3808
3809/// Shifts each 64-bit element of the 128-bit vector of [2 x i64] in \a __X
3810///    left by the number of bits given in the corresponding element of the
3811///    128-bit vector of [2 x i64] in \a __Y, shifting in zero bits, and
3812///    returns the result. If the shift count for any element is greater than
3813///    63, the result for that element is zero.
3814///
3815/// \headerfile <immintrin.h>
3816///
3817/// This intrinsic corresponds to the \c VPSLLVQ instruction.
3818///
3819/// \param __X
3820///    A 128-bit vector of [2 x i64] to be shifted.
3821/// \param __Y
3822///    A 128-bit vector of [2 x i64] containing the unsigned shift counts (in
3823///    bits).
3824/// \returns A 128-bit vector of [2 x i64] containing the result.
3825static __inline__ __m128i __DEFAULT_FN_ATTRS128
3826_mm_sllv_epi64(__m128i __X, __m128i __Y)
3827{
3828  return (__m128i)__builtin_ia32_psllv2di((__v2di)__X, (__v2di)__Y);
3829}
3830
3831/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __X
3832///    right by the number of bits given in the corresponding element of the
3833///    256-bit vector of [8 x i32] in \a __Y, shifting in sign bits, and
3834///    returns the result. If the shift count for any element is greater than
3835///    31, the result for that element is 0 or -1 according to the sign bit
3836///    for that element.
3837///
3838/// \headerfile <immintrin.h>
3839///
3840/// This intrinsic corresponds to the \c VPSRAVD instruction.
3841///
3842/// \param __X
3843///    A 256-bit vector of [8 x i32] to be shifted.
3844/// \param __Y
3845///    A 256-bit vector of [8 x i32] containing the unsigned shift counts (in
3846///    bits).
3847/// \returns A 256-bit vector of [8 x i32] containing the result.
3848static __inline__ __m256i __DEFAULT_FN_ATTRS256
3849_mm256_srav_epi32(__m256i __X, __m256i __Y)
3850{
3851  return (__m256i)__builtin_ia32_psrav8si((__v8si)__X, (__v8si)__Y);
3852}
3853
3854/// Shifts each 32-bit element of the 128-bit vector of [4 x i32] in \a __X
3855///    right by the number of bits given in the corresponding element of the
3856///    128-bit vector of [4 x i32] in \a __Y, shifting in sign bits, and
3857///    returns the result. If the shift count for any element is greater than
3858///    31, the result for that element is 0 or -1 according to the sign bit
3859///    for that element.
3860///
3861/// \headerfile <immintrin.h>
3862///
3863/// This intrinsic corresponds to the \c VPSRAVD instruction.
3864///
3865/// \param __X
3866///    A 128-bit vector of [4 x i32] to be shifted.
3867/// \param __Y
3868///    A 128-bit vector of [4 x i32] containing the unsigned shift counts (in
3869///    bits).
3870/// \returns A 128-bit vector of [4 x i32] containing the result.
3871static __inline__ __m128i __DEFAULT_FN_ATTRS128
3872_mm_srav_epi32(__m128i __X, __m128i __Y)
3873{
3874  return (__m128i)__builtin_ia32_psrav4si((__v4si)__X, (__v4si)__Y);
3875}
3876
3877/// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __X
3878///    right by the number of bits given in the corresponding element of the
3879///    256-bit vector of [8 x i32] in \a __Y, shifting in zero bits, and
3880///    returns the result. If the shift count for any element is greater than
3881///    31, the result for that element is zero.
3882///
3883/// \headerfile <immintrin.h>
3884///
3885/// This intrinsic corresponds to the \c VPSRLVD instruction.
3886///
3887/// \param __X
3888///    A 256-bit vector of [8 x i32] to be shifted.
3889/// \param __Y
3890///    A 256-bit vector of [8 x i32] containing the unsigned shift counts (in
3891///    bits).
3892/// \returns A 256-bit vector of [8 x i32] containing the result.
3893static __inline__ __m256i __DEFAULT_FN_ATTRS256
3894_mm256_srlv_epi32(__m256i __X, __m256i __Y)
3895{
3896  return (__m256i)__builtin_ia32_psrlv8si((__v8si)__X, (__v8si)__Y);
3897}
3898
3899/// Shifts each 32-bit element of the 128-bit vector of [4 x i32] in \a __X
3900///    right by the number of bits given in the corresponding element of the
3901///    128-bit vector of [4 x i32] in \a __Y, shifting in zero bits, and
3902///    returns the result. If the shift count for any element is greater than
3903///    31, the result for that element is zero.
3904///
3905/// \headerfile <immintrin.h>
3906///
3907/// This intrinsic corresponds to the \c VPSRLVD instruction.
3908///
3909/// \param __X
3910///    A 128-bit vector of [4 x i32] to be shifted.
3911/// \param __Y
3912///    A 128-bit vector of [4 x i32] containing the unsigned shift counts (in
3913///    bits).
3914/// \returns A 128-bit vector of [4 x i32] containing the result.
3915static __inline__ __m128i __DEFAULT_FN_ATTRS128
3916_mm_srlv_epi32(__m128i __X, __m128i __Y)
3917{
3918  return (__m128i)__builtin_ia32_psrlv4si((__v4si)__X, (__v4si)__Y);
3919}
3920
3921/// Shifts each 64-bit element of the 256-bit vector of [4 x i64] in \a __X
3922///    right by the number of bits given in the corresponding element of the
3923///    128-bit vector of [4 x i64] in \a __Y, shifting in zero bits, and
3924///    returns the result. If the shift count for any element is greater than
3925///    63, the result for that element is zero.
3926///
3927/// \headerfile <immintrin.h>
3928///
3929/// This intrinsic corresponds to the \c VPSRLVQ instruction.
3930///
3931/// \param __X
3932///    A 256-bit vector of [4 x i64] to be shifted.
3933/// \param __Y
3934///    A 256-bit vector of [4 x i64] containing the unsigned shift counts (in
3935///    bits).
3936/// \returns A 256-bit vector of [4 x i64] containing the result.
3937static __inline__ __m256i __DEFAULT_FN_ATTRS256
3938_mm256_srlv_epi64(__m256i __X, __m256i __Y)
3939{
3940  return (__m256i)__builtin_ia32_psrlv4di((__v4di)__X, (__v4di)__Y);
3941}
3942
3943/// Shifts each 64-bit element of the 128-bit vector of [2 x i64] in \a __X
3944///    right by the number of bits given in the corresponding element of the
3945///    128-bit vector of [2 x i64] in \a __Y, shifting in zero bits, and
3946///    returns the result. If the shift count for any element is greater than
3947///    63, the result for that element is zero.
3948///
3949/// \headerfile <immintrin.h>
3950///
3951/// This intrinsic corresponds to the \c VPSRLVQ instruction.
3952///
3953/// \param __X
3954///    A 128-bit vector of [2 x i64] to be shifted.
3955/// \param __Y
3956///    A 128-bit vector of [2 x i64] containing the unsigned shift counts (in
3957///    bits).
3958/// \returns A 128-bit vector of [2 x i64] containing the result.
3959static __inline__ __m128i __DEFAULT_FN_ATTRS128
3960_mm_srlv_epi64(__m128i __X, __m128i __Y)
3961{
3962  return (__m128i)__builtin_ia32_psrlv2di((__v2di)__X, (__v2di)__Y);
3963}
3964
/// Conditionally gathers two 64-bit floating-point values, either from the
///    128-bit vector of [2 x double] in \a a, or from memory \a m using scaled
///    indexes from the 128-bit vector of [4 x i32] in \a i. The 128-bit vector
///    of [2 x double] in \a mask determines the source for each element.
///
/// \code{.operation}
/// FOR element := 0 to 1
///   j := element*64
///   k := element*32
///   IF mask[j+63] == 0
///     result[j+63:j] := a[j+63:j]
///   ELSE
///     result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
///   FI
/// ENDFOR
/// \endcode
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128d _mm_mask_i32gather_pd(__m128d a, const double *m, __m128i i,
///                               __m128d mask, const int s);
/// \endcode
///
/// This intrinsic corresponds to the \c VGATHERDPD instruction.
///
/// \param a
///    A 128-bit vector of [2 x double] used as the source when a mask bit is
///    zero.
/// \param m
///    A pointer to the memory used for loading values.
/// \param i
///    A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only
///    the first two elements are used.
/// \param mask
///    A 128-bit vector of [2 x double] containing the mask. The most
///    significant bit of each element in the mask vector represents the mask
///    bits. If a mask bit is zero, the corresponding value from vector \a a
///    is used; otherwise the value is loaded from memory.
/// \param s
///    A literal constant scale factor for the indexes in \a i. Must be
///    1, 2, 4, or 8.
/// \returns A 128-bit vector of [2 x double] containing the gathered values.
/* The pass-through operand \a a is cast through __m128d, its documented
   type, the same way \a mask is. */
#define _mm_mask_i32gather_pd(a, m, i, mask, s) \
  ((__m128d)__builtin_ia32_gatherd_pd((__v2df)(__m128d)(a), \
                                      (double const *)(m), \
                                      (__v4si)(__m128i)(i), \
                                      (__v2df)(__m128d)(mask), (s)))
4013
4014/// Conditionally gathers four 64-bit floating-point values, either from the
4015///    256-bit vector of [4 x double] in \a a, or from memory \a m using scaled
4016///    indexes from the 128-bit vector of [4 x i32] in \a i. The 256-bit vector
4017///    of [4 x double] in \a mask determines the source for each element.
4018///
4019/// \code{.operation}
4020/// FOR element := 0 to 3
4021///   j := element*64
4022///   k := element*32
4023///   IF mask[j+63] == 0
4024///     result[j+63:j] := a[j+63:j]
4025///   ELSE
4026///     result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
4027///   FI
4028/// ENDFOR
4029/// \endcode
4030///
4031/// \headerfile <immintrin.h>
4032///
4033/// \code
4034/// __m256d _mm256_mask_i32gather_pd(__m256d a, const double *m, __m128i i,
4035///                                  __m256d mask, const int s);
4036/// \endcode
4037///
4038/// This intrinsic corresponds to the \c VGATHERDPD instruction.
4039///
4040/// \param a
4041///    A 256-bit vector of [4 x double] used as the source when a mask bit is
4042///    zero.
4043/// \param m
4044///    A pointer to the memory used for loading values.
4045/// \param i
4046///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
4047/// \param mask
4048///    A 256-bit vector of [4 x double] containing the mask. The most
4049///    significant bit of each element in the mask vector represents the mask
4050///    bits. If a mask bit is zero, the corresponding value from vector \a a
4051///    is gathered; otherwise the value is loaded from memory.
4052/// \param s
4053///    A literal constant scale factor for the indexes in \a i. Must be
4054///    1, 2, 4, or 8.
4055/// \returns A 256-bit vector of [4 x double] containing the gathered values.
/* Operand casts: a and mask go through __m256d to __v4df, and i goes through
   __m128i to __v4si, before reaching the builtin. */
#define _mm256_mask_i32gather_pd(a, m, i, mask, s)                             \
  ((__m256d)__builtin_ia32_gatherd_pd256(                                      \
      (__v4df)(__m256d)(a), (double const *)(m), (__v4si)(__m128i)(i),         \
      (__v4df)(__m256d)(mask), (s)))
4061
4062/// Conditionally gathers two 64-bit floating-point values, either from the
4063///    128-bit vector of [2 x double] in \a a, or from memory \a m using scaled
4064///    indexes from the 128-bit vector of [2 x i64] in \a i. The 128-bit vector
4065///    of [2 x double] in \a mask determines the source for each element.
4066///
4067/// \code{.operation}
4068/// FOR element := 0 to 1
4069///   j := element*64
4070///   k := element*64
4071///   IF mask[j+63] == 0
4072///     result[j+63:j] := a[j+63:j]
4073///   ELSE
4074///     result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
4075///   FI
4076/// ENDFOR
4077/// \endcode
4078///
4079/// \headerfile <immintrin.h>
4080///
4081/// \code
4082/// __m128d _mm_mask_i64gather_pd(__m128d a, const double *m, __m128i i,
4083///                               __m128d mask, const int s);
4084/// \endcode
4085///
4086/// This intrinsic corresponds to the \c VGATHERQPD instruction.
4087///
4088/// \param a
4089///    A 128-bit vector of [2 x double] used as the source when a mask bit is
4090///    zero.
4091/// \param m
4092///    A pointer to the memory used for loading values.
4093/// \param i
4094///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
4095/// \param mask
4096///    A 128-bit vector of [2 x double] containing the mask. The most
4097///    significant bit of each element in the mask vector represents the mask
4098///    bits. If a mask bit is zero, the corresponding value from vector \a a
4099///    is gathered; otherwise the value is loaded from memory.
4100/// \param s
4101///    A literal constant scale factor for the indexes in \a i. Must be
4102///    1, 2, 4, or 8.
4103/// \returns A 128-bit vector of [2 x double] containing the gathered values.
/* Operand casts: a and mask go through __m128d to __v2df, and i goes through
   __m128i to __v2di, before reaching the builtin. */
#define _mm_mask_i64gather_pd(a, m, i, mask, s)                                \
  ((__m128d)__builtin_ia32_gatherq_pd(                                         \
      (__v2df)(__m128d)(a), (double const *)(m), (__v2di)(__m128i)(i),         \
      (__v2df)(__m128d)(mask), (s)))
4109
4110/// Conditionally gathers four 64-bit floating-point values, either from the
4111///    256-bit vector of [4 x double] in \a a, or from memory \a m using scaled
4112///    indexes from the 256-bit vector of [4 x i64] in \a i. The 256-bit vector
4113///    of [4 x double] in \a mask determines the source for each element.
4114///
4115/// \code{.operation}
4116/// FOR element := 0 to 3
4117///   j := element*64
4118///   k := element*64
4119///   IF mask[j+63] == 0
4120///     result[j+63:j] := a[j+63:j]
4121///   ELSE
4122///     result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
4123///   FI
4124/// ENDFOR
4125/// \endcode
4126///
4127/// \headerfile <immintrin.h>
4128///
4129/// \code
4130/// __m256d _mm256_mask_i64gather_pd(__m256d a, const double *m, __m256i i,
4131///                                  __m256d mask, const int s);
4132/// \endcode
4133///
4134/// This intrinsic corresponds to the \c VGATHERQPD instruction.
4135///
4136/// \param a
4137///    A 256-bit vector of [4 x double] used as the source when a mask bit is
4138///    zero.
4139/// \param m
4140///    A pointer to the memory used for loading values.
4141/// \param i
4142///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
4143/// \param mask
4144///    A 256-bit vector of [4 x double] containing the mask. The most
4145///    significant bit of each element in the mask vector represents the mask
4146///    bits. If a mask bit is zero, the corresponding value from vector \a a
4147///    is gathered; otherwise the value is loaded from memory.
4148/// \param s
4149///    A literal constant scale factor for the indexes in \a i. Must be
4150///    1, 2, 4, or 8.
4151/// \returns A 256-bit vector of [4 x double] containing the gathered values.
/* Operand casts: a and mask go through __m256d to __v4df, and i goes through
   __m256i to __v4di, before reaching the builtin. */
#define _mm256_mask_i64gather_pd(a, m, i, mask, s)                             \
  ((__m256d)__builtin_ia32_gatherq_pd256(                                      \
      (__v4df)(__m256d)(a), (double const *)(m), (__v4di)(__m256i)(i),         \
      (__v4df)(__m256d)(mask), (s)))
4157
4158/// Conditionally gathers four 32-bit floating-point values, either from the
4159///    128-bit vector of [4 x float] in \a a, or from memory \a m using scaled
4160///    indexes from the 128-bit vector of [4 x i32] in \a i. The 128-bit vector
4161///    of [4 x float] in \a mask determines the source for each element.
4162///
4163/// \code{.operation}
4164/// FOR element := 0 to 3
4165///   j := element*32
4166///   k := element*32
4167///   IF mask[j+31] == 0
4168///     result[j+31:j] := a[j+31:j]
4169///   ELSE
4170///     result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
4171///   FI
4172/// ENDFOR
4173/// \endcode
4174///
4175/// \headerfile <immintrin.h>
4176///
4177/// \code
4178/// __m128 _mm_mask_i32gather_ps(__m128 a, const float *m, __m128i i,
4179///                              __m128 mask, const int s);
4180/// \endcode
4181///
4182/// This intrinsic corresponds to the \c VGATHERDPS instruction.
4183///
4184/// \param a
4185///    A 128-bit vector of [4 x float] used as the source when a mask bit is
4186///    zero.
4187/// \param m
4188///    A pointer to the memory used for loading values.
4189/// \param i
4190///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
4191/// \param mask
4192///    A 128-bit vector of [4 x float] containing the mask. The most
4193///    significant bit of each element in the mask vector represents the mask
4194///    bits. If a mask bit is zero, the corresponding value from vector \a a
4195///    is gathered; otherwise the value is loaded from memory.
4196/// \param s
4197///    A literal constant scale factor for the indexes in \a i. Must be
4198///    1, 2, 4, or 8.
4199/// \returns A 128-bit vector of [4 x float] containing the gathered values.
/* Operand casts: a and mask go through __m128 to __v4sf, and i goes through
   __m128i to __v4si, before reaching the builtin. */
#define _mm_mask_i32gather_ps(a, m, i, mask, s)                                \
  ((__m128)__builtin_ia32_gatherd_ps(                                          \
      (__v4sf)(__m128)(a), (float const *)(m), (__v4si)(__m128i)(i),           \
      (__v4sf)(__m128)(mask), (s)))
4205
4206/// Conditionally gathers eight 32-bit floating-point values, either from the
4207///    256-bit vector of [8 x float] in \a a, or from memory \a m using scaled
4208///    indexes from the 256-bit vector of [8 x i32] in \a i. The 256-bit vector
4209///    of [8 x float] in \a mask determines the source for each element.
4210///
4211/// \code{.operation}
4212/// FOR element := 0 to 7
4213///   j := element*32
4214///   k := element*32
4215///   IF mask[j+31] == 0
4216///     result[j+31:j] := a[j+31:j]
4217///   ELSE
4218///     result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
4219///   FI
4220/// ENDFOR
4221/// \endcode
4222///
4223/// \headerfile <immintrin.h>
4224///
4225/// \code
4226/// __m256 _mm256_mask_i32gather_ps(__m256 a, const float *m, __m256i i,
4227///                                 __m256 mask, const int s);
4228/// \endcode
4229///
4230/// This intrinsic corresponds to the \c VGATHERDPS instruction.
4231///
4232/// \param a
4233///    A 256-bit vector of [8 x float] used as the source when a mask bit is
4234///    zero.
4235/// \param m
4236///    A pointer to the memory used for loading values.
4237/// \param i
4238///    A 256-bit vector of [8 x i32] containing signed indexes into \a m.
4239/// \param mask
4240///    A 256-bit vector of [8 x float] containing the mask. The most
4241///    significant bit of each element in the mask vector represents the mask
4242///    bits. If a mask bit is zero, the corresponding value from vector \a a
4243///    is gathered; otherwise the value is loaded from memory.
4244/// \param s
4245///    A literal constant scale factor for the indexes in \a i. Must be
4246///    1, 2, 4, or 8.
4247/// \returns A 256-bit vector of [8 x float] containing the gathered values.
/* Operand casts: a and mask go through __m256 to __v8sf, and i goes through
   __m256i to __v8si, before reaching the builtin. */
#define _mm256_mask_i32gather_ps(a, m, i, mask, s)                             \
  ((__m256)__builtin_ia32_gatherd_ps256(                                       \
      (__v8sf)(__m256)(a), (float const *)(m), (__v8si)(__m256i)(i),           \
      (__v8sf)(__m256)(mask), (s)))
4253
4254/// Conditionally gathers two 32-bit floating-point values, either from the
4255///    128-bit vector of [4 x float] in \a a, or from memory \a m using scaled
4256///    indexes from the 128-bit vector of [2 x i64] in \a i. The 128-bit vector
4257///    of [4 x float] in \a mask determines the source for the lower two
4258///    elements. The upper two elements of the result are zeroed.
4259///
4260/// \code{.operation}
4261/// FOR element := 0 to 1
4262///   j := element*32
4263///   k := element*64
4264///   IF mask[j+31] == 0
4265///     result[j+31:j] := a[j+31:j]
4266///   ELSE
4267///     result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
4268///   FI
4269/// ENDFOR
4270/// result[127:64] := 0
4271/// \endcode
4272///
4273/// \headerfile <immintrin.h>
4274///
4275/// \code
4276/// __m128 _mm_mask_i64gather_ps(__m128 a, const float *m, __m128i i,
4277///                              __m128 mask, const int s);
4278/// \endcode
4279///
4280/// This intrinsic corresponds to the \c VGATHERQPS instruction.
4281///
4282/// \param a
4283///    A 128-bit vector of [4 x float] used as the source when a mask bit is
4284///    zero. Only the first two elements are used.
4285/// \param m
4286///    A pointer to the memory used for loading values.
4287/// \param i
4288///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
4289/// \param mask
4290///    A 128-bit vector of [4 x float] containing the mask. The most
4291///    significant bit of each element in the mask vector represents the mask
4292///    bits. If a mask bit is zero, the corresponding value from vector \a a
4293///    is gathered; otherwise the value is loaded from memory. Only the first
4294///    two elements are used.
4295/// \param s
4296///    A literal constant scale factor for the indexes in \a i. Must be
4297///    1, 2, 4, or 8.
4298/// \returns A 128-bit vector of [4 x float] containing the gathered values.
/* Operand casts: a and mask go through __m128 to __v4sf, and i goes through
   __m128i to __v2di, before reaching the builtin. */
#define _mm_mask_i64gather_ps(a, m, i, mask, s)                                \
  ((__m128)__builtin_ia32_gatherq_ps(                                          \
      (__v4sf)(__m128)(a), (float const *)(m), (__v2di)(__m128i)(i),           \
      (__v4sf)(__m128)(mask), (s)))
4304
4305/// Conditionally gathers four 32-bit floating-point values, either from the
4306///    128-bit vector of [4 x float] in \a a, or from memory \a m using scaled
4307///    indexes from the 256-bit vector of [4 x i64] in \a i. The 128-bit vector
4308///    of [4 x float] in \a mask determines the source for each element.
4309///
4310/// \code{.operation}
4311/// FOR element := 0 to 3
4312///   j := element*32
4313///   k := element*64
4314///   IF mask[j+31] == 0
4315///     result[j+31:j] := a[j+31:j]
4316///   ELSE
4317///     result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
4318///   FI
4319/// ENDFOR
4320/// \endcode
4321///
4322/// \headerfile <immintrin.h>
4323///
4324/// \code
4325/// __m128 _mm256_mask_i64gather_ps(__m128 a, const float *m, __m256i i,
4326///                                 __m128 mask, const int s);
4327/// \endcode
4328///
4329/// This intrinsic corresponds to the \c VGATHERQPS instruction.
4330///
4331/// \param a
///    A 128-bit vector of [4 x float] used as the source when a mask bit is
///    zero.
4334/// \param m
4335///    A pointer to the memory used for loading values.
4336/// \param i
4337///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
4338/// \param mask
4339///    A 128-bit vector of [4 x float] containing the mask. The most
4340///    significant bit of each element in the mask vector represents the mask
4341///    bits. If a mask bit is zero, the corresponding value from vector \a a
4342///    is gathered; otherwise the value is loaded from memory.
4343/// \param s
4344///    A literal constant scale factor for the indexes in \a i. Must be
4345///    1, 2, 4, or 8.
4346/// \returns A 128-bit vector of [4 x float] containing the gathered values.
/* Operand casts: a and mask go through __m128 to __v4sf, and the 256-bit index
   vector i goes through __m256i to __v4di, before reaching the builtin. */
#define _mm256_mask_i64gather_ps(a, m, i, mask, s)                             \
  ((__m128)__builtin_ia32_gatherq_ps256(                                       \
      (__v4sf)(__m128)(a), (float const *)(m), (__v4di)(__m256i)(i),           \
      (__v4sf)(__m128)(mask), (s)))
4352
4353/// Conditionally gathers four 32-bit integer values, either from the
4354///    128-bit vector of [4 x i32] in \a a, or from memory \a m using scaled
4355///    indexes from the 128-bit vector of [4 x i32] in \a i. The 128-bit vector
4356///    of [4 x i32] in \a mask determines the source for each element.
4357///
4358/// \code{.operation}
4359/// FOR element := 0 to 3
4360///   j := element*32
4361///   k := element*32
4362///   IF mask[j+31] == 0
4363///     result[j+31:j] := a[j+31:j]
4364///   ELSE
4365///     result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
4366///   FI
4367/// ENDFOR
4368/// \endcode
4369///
4370/// \headerfile <immintrin.h>
4371///
4372/// \code
4373/// __m128i _mm_mask_i32gather_epi32(__m128i a, const int *m, __m128i i,
4374///                                  __m128i mask, const int s);
4375/// \endcode
4376///
4377/// This intrinsic corresponds to the \c VPGATHERDD instruction.
4378///
4379/// \param a
4380///    A 128-bit vector of [4 x i32] used as the source when a mask bit is
4381///    zero.
4382/// \param m
4383///    A pointer to the memory used for loading values.
4384/// \param i
4385///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
4386/// \param mask
4387///    A 128-bit vector of [4 x i32] containing the mask. The most significant
4388///    bit of each element in the mask vector represents the mask bits. If a
4389///    mask bit is zero, the corresponding value from vector \a a is gathered;
4390///    otherwise the value is loaded from memory.
4391/// \param s
4392///    A literal constant scale factor for the indexes in \a i. Must be
4393///    1, 2, 4, or 8.
4394/// \returns A 128-bit vector of [4 x i32] containing the gathered values.
/* Operand casts: a, i and mask each go through __m128i to __v4si before
   reaching the builtin. */
#define _mm_mask_i32gather_epi32(a, m, i, mask, s)                             \
  ((__m128i)__builtin_ia32_gatherd_d(                                          \
      (__v4si)(__m128i)(a), (int const *)(m), (__v4si)(__m128i)(i),            \
      (__v4si)(__m128i)(mask), (s)))
4400
4401/// Conditionally gathers eight 32-bit integer values, either from the
4402///    256-bit vector of [8 x i32] in \a a, or from memory \a m using scaled
4403///    indexes from the 256-bit vector of [8 x i32] in \a i. The 256-bit vector
4404///    of [8 x i32] in \a mask determines the source for each element.
4405///
4406/// \code{.operation}
4407/// FOR element := 0 to 7
4408///   j := element*32
4409///   k := element*32
4410///   IF mask[j+31] == 0
4411///     result[j+31:j] := a[j+31:j]
4412///   ELSE
4413///     result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
4414///   FI
4415/// ENDFOR
4416/// \endcode
4417///
4418/// \headerfile <immintrin.h>
4419///
4420/// \code
4421/// __m256i _mm256_mask_i32gather_epi32(__m256i a, const int *m, __m256i i,
4422///                                     __m256i mask, const int s);
4423/// \endcode
4424///
4425/// This intrinsic corresponds to the \c VPGATHERDD instruction.
4426///
4427/// \param a
4428///    A 256-bit vector of [8 x i32] used as the source when a mask bit is
4429///    zero.
4430/// \param m
4431///    A pointer to the memory used for loading values.
4432/// \param i
4433///    A 256-bit vector of [8 x i32] containing signed indexes into \a m.
4434/// \param mask
4435///    A 256-bit vector of [8 x i32] containing the mask. The most significant
4436///    bit of each element in the mask vector represents the mask bits. If a
4437///    mask bit is zero, the corresponding value from vector \a a is gathered;
4438///    otherwise the value is loaded from memory.
4439/// \param s
4440///    A literal constant scale factor for the indexes in \a i. Must be
4441///    1, 2, 4, or 8.
4442/// \returns A 256-bit vector of [8 x i32] containing the gathered values.
/* Operand casts: a, i and mask each go through __m256i to __v8si before
   reaching the builtin. */
#define _mm256_mask_i32gather_epi32(a, m, i, mask, s)                          \
  ((__m256i)__builtin_ia32_gatherd_d256(                                       \
      (__v8si)(__m256i)(a), (int const *)(m), (__v8si)(__m256i)(i),            \
      (__v8si)(__m256i)(mask), (s)))
4448
4449/// Conditionally gathers two 32-bit integer values, either from the
4450///    128-bit vector of [4 x i32] in \a a, or from memory \a m using scaled
4451///    indexes from the 128-bit vector of [2 x i64] in \a i. The 128-bit vector
4452///    of [4 x i32] in \a mask determines the source for the lower two
4453///    elements. The upper two elements of the result are zeroed.
4454///
4455/// \code{.operation}
4456/// FOR element := 0 to 1
4457///   j := element*32
4458///   k := element*64
4459///   IF mask[j+31] == 0
4460///     result[j+31:j] := a[j+31:j]
4461///   ELSE
4462///     result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
4463///   FI
4464/// ENDFOR
4465/// result[127:64] := 0
4466/// \endcode
4467///
4468/// \headerfile <immintrin.h>
4469///
4470/// \code
4471/// __m128i _mm_mask_i64gather_epi32(__m128i a, const int *m, __m128i i,
4472///                                  __m128i mask, const int s);
4473/// \endcode
4474///
4475/// This intrinsic corresponds to the \c VPGATHERQD instruction.
4476///
4477/// \param a
///    A 128-bit vector of [4 x i32] used as the source when a mask bit is
///    zero. Only the first two elements are used.
4480/// \param m
4481///    A pointer to the memory used for loading values.
4482/// \param i
///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
4484/// \param mask
4485///    A 128-bit vector of [4 x i32] containing the mask. The most significant
4486///    bit of each element in the mask vector represents the mask bits. If a
4487///    mask bit is zero, the corresponding value from vector \a a is gathered;
4488///    otherwise the value is loaded from memory. Only the first two elements
4489///    are used.
4490/// \param s
4491///    A literal constant scale factor for the indexes in \a i. Must be
4492///    1, 2, 4, or 8.
4493/// \returns A 128-bit vector of [4 x i32] containing the gathered values.
/* Operand casts: a and mask go through __m128i to __v4si, and i goes through
   __m128i to __v2di, before reaching the builtin. */
#define _mm_mask_i64gather_epi32(a, m, i, mask, s)                             \
  ((__m128i)__builtin_ia32_gatherq_d(                                          \
      (__v4si)(__m128i)(a), (int const *)(m), (__v2di)(__m128i)(i),            \
      (__v4si)(__m128i)(mask), (s)))
4499
4500/// Conditionally gathers four 32-bit integer values, either from the
4501///    128-bit vector of [4 x i32] in \a a, or from memory \a m using scaled
4502///    indexes from the 256-bit vector of [4 x i64] in \a i. The 128-bit vector
4503///    of [4 x i32] in \a mask determines the source for each element.
4504///
4505/// \code{.operation}
4506/// FOR element := 0 to 3
4507///   j := element*32
4508///   k := element*64
4509///   IF mask[j+31] == 0
4510///     result[j+31:j] := a[j+31:j]
4511///   ELSE
4512///     result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
4513///   FI
4514/// ENDFOR
4515/// \endcode
4516///
4517/// \headerfile <immintrin.h>
4518///
4519/// \code
4520/// __m128i _mm256_mask_i64gather_epi32(__m128i a, const int *m, __m256i i,
4521///                                     __m128i mask, const int s);
4522/// \endcode
4523///
4524/// This intrinsic corresponds to the \c VPGATHERQD instruction.
4525///
4526/// \param a
4527///    A 128-bit vector of [4 x i32] used as the source when a mask bit is
4528///    zero.
4529/// \param m
4530///    A pointer to the memory used for loading values.
4531/// \param i
4532///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
4533/// \param mask
4534///    A 128-bit vector of [4 x i32] containing the mask. The most significant
4535///    bit of each element in the mask vector represents the mask bits. If a
4536///    mask bit is zero, the corresponding value from vector \a a is gathered;
4537///    otherwise the value is loaded from memory.
4538/// \param s
4539///    A literal constant scale factor for the indexes in \a i. Must be
4540///    1, 2, 4, or 8.
4541/// \returns A 128-bit vector of [4 x i32] containing the gathered values.
/* Operand casts: a and mask go through __m128i to __v4si, and the 256-bit
   index vector i goes through __m256i to __v4di, before reaching the builtin. */
#define _mm256_mask_i64gather_epi32(a, m, i, mask, s)                          \
  ((__m128i)__builtin_ia32_gatherq_d256(                                       \
      (__v4si)(__m128i)(a), (int const *)(m), (__v4di)(__m256i)(i),            \
      (__v4si)(__m128i)(mask), (s)))
4547
4548/// Conditionally gathers two 64-bit integer values, either from the
4549///    128-bit vector of [2 x i64] in \a a, or from memory \a m using scaled
4550///    indexes from the 128-bit vector of [4 x i32] in \a i. The 128-bit vector
4551///    of [2 x i64] in \a mask determines the source for each element.
4552///
4553/// \code{.operation}
4554/// FOR element := 0 to 1
4555///   j := element*64
4556///   k := element*32
4557///   IF mask[j+63] == 0
4558///     result[j+63:j] := a[j+63:j]
4559///   ELSE
4560///     result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
4561///   FI
4562/// ENDFOR
4563/// \endcode
4564///
4565/// \headerfile <immintrin.h>
4566///
4567/// \code
4568/// __m128i _mm_mask_i32gather_epi64(__m128i a, const long long *m, __m128i i,
4569///                                  __m128i mask, const int s);
4570/// \endcode
4571///
4572/// This intrinsic corresponds to the \c VPGATHERDQ instruction.
4573///
4574/// \param a
4575///    A 128-bit vector of [2 x i64] used as the source when a mask bit is
4576///    zero.
4577/// \param m
4578///    A pointer to the memory used for loading values.
4579/// \param i
4580///    A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only
4581///    the first two elements are used.
4582/// \param mask
4583///    A 128-bit vector of [2 x i64] containing the mask. The most significant
4584///    bit of each element in the mask vector represents the mask bits. If a
4585///    mask bit is zero, the corresponding value from vector \a a is gathered;
4586///    otherwise the value is loaded from memory.
4587/// \param s
4588///    A literal constant scale factor for the indexes in \a i. Must be
4589///    1, 2, 4, or 8.
4590/// \returns A 128-bit vector of [2 x i64] containing the gathered values.
/* Operand casts: a and mask go through __m128i to __v2di, and i goes through
   __m128i to __v4si, before reaching the builtin. */
#define _mm_mask_i32gather_epi64(a, m, i, mask, s)                             \
  ((__m128i)__builtin_ia32_gatherd_q(                                          \
      (__v2di)(__m128i)(a), (long long const *)(m), (__v4si)(__m128i)(i),      \
      (__v2di)(__m128i)(mask), (s)))
4596
4597/// Conditionally gathers four 64-bit integer values, either from the
4598///    256-bit vector of [4 x i64] in \a a, or from memory \a m using scaled
4599///    indexes from the 128-bit vector of [4 x i32] in \a i. The 256-bit vector
4600///    of [4 x i64] in \a mask determines the source for each element.
4601///
4602/// \code{.operation}
4603/// FOR element := 0 to 3
4604///   j := element*64
4605///   k := element*32
4606///   IF mask[j+63] == 0
4607///     result[j+63:j] := a[j+63:j]
4608///   ELSE
4609///     result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
4610///   FI
4611/// ENDFOR
4612/// \endcode
4613///
4614/// \headerfile <immintrin.h>
4615///
4616/// \code
4617/// __m256i _mm256_mask_i32gather_epi64(__m256i a, const long long *m,
4618///                                     __m128i i, __m256i mask, const int s);
4619/// \endcode
4620///
4621/// This intrinsic corresponds to the \c VPGATHERDQ instruction.
4622///
4623/// \param a
4624///    A 256-bit vector of [4 x i64] used as the source when a mask bit is
4625///    zero.
4626/// \param m
4627///    A pointer to the memory used for loading values.
4628/// \param i
4629///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
4630/// \param mask
4631///    A 256-bit vector of [4 x i64] containing the mask. The most significant
4632///    bit of each element in the mask vector represents the mask bits. If a
4633///    mask bit is zero, the corresponding value from vector \a a is gathered;
4634///    otherwise the value is loaded from memory.
4635/// \param s
4636///    A literal constant scale factor for the indexes in \a i. Must be
4637///    1, 2, 4, or 8.
4638/// \returns A 256-bit vector of [4 x i64] containing the gathered values.
/* Operand casts: a and mask go through __m256i to __v4di, and the 128-bit
   index vector i goes through __m128i to __v4si, before reaching the builtin. */
#define _mm256_mask_i32gather_epi64(a, m, i, mask, s)                          \
  ((__m256i)__builtin_ia32_gatherd_q256(                                       \
      (__v4di)(__m256i)(a), (long long const *)(m), (__v4si)(__m128i)(i),      \
      (__v4di)(__m256i)(mask), (s)))
4644
4645/// Conditionally gathers two 64-bit integer values, either from the
4646///    128-bit vector of [2 x i64] in \a a, or from memory \a m using scaled
4647///    indexes from the 128-bit vector of [2 x i64] in \a i. The 128-bit vector
4648///    of [2 x i64] in \a mask determines the source for each element.
4649///
4650/// \code{.operation}
4651/// FOR element := 0 to 1
4652///   j := element*64
4653///   k := element*64
4654///   IF mask[j+63] == 0
4655///     result[j+63:j] := a[j+63:j]
4656///   ELSE
4657///     result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
4658///   FI
4659/// ENDFOR
4660/// \endcode
4661///
4662/// \headerfile <immintrin.h>
4663///
4664/// \code
4665/// __m128i _mm_mask_i64gather_epi64(__m128i a, const long long *m, __m128i i,
4666///                                  __m128i mask, const int s);
4667/// \endcode
4668///
4669/// This intrinsic corresponds to the \c VPGATHERQQ instruction.
4670///
4671/// \param a
4672///    A 128-bit vector of [2 x i64] used as the source when a mask bit is
4673///    zero.
4674/// \param m
4675///    A pointer to the memory used for loading values.
4676/// \param i
4677///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
4678/// \param mask
4679///    A 128-bit vector of [2 x i64] containing the mask. The most significant
4680///    bit of each element in the mask vector represents the mask bits. If a
4681///    mask bit is zero, the corresponding value from vector \a a is gathered;
4682///    otherwise the value is loaded from memory.
4683/// \param s
4684///    A literal constant scale factor for the indexes in \a i. Must be
4685///    1, 2, 4, or 8.
4686/// \returns A 128-bit vector of [2 x i64] containing the gathered values.
/* Operand casts: a, i and mask each go through __m128i to __v2di before
   reaching the builtin. */
#define _mm_mask_i64gather_epi64(a, m, i, mask, s)                             \
  ((__m128i)__builtin_ia32_gatherq_q(                                          \
      (__v2di)(__m128i)(a), (long long const *)(m), (__v2di)(__m128i)(i),      \
      (__v2di)(__m128i)(mask), (s)))
4692
4693/// Conditionally gathers four 64-bit integer values, either from the
4694///    256-bit vector of [4 x i64] in \a a, or from memory \a m using scaled
4695///    indexes from the 256-bit vector of [4 x i64] in \a i. The 256-bit vector
4696///    of [4 x i64] in \a mask determines the source for each element.
4697///
4698/// \code{.operation}
4699/// FOR element := 0 to 3
4700///   j := element*64
4701///   k := element*64
4702///   IF mask[j+63] == 0
4703///     result[j+63:j] := a[j+63:j]
4704///   ELSE
4705///     result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
4706///   FI
4707/// ENDFOR
4708/// \endcode
4709///
4710/// \headerfile <immintrin.h>
4711///
4712/// \code
4713/// __m256i _mm256_mask_i64gather_epi64(__m256i a, const long long *m,
4714///                                     __m256i i, __m256i mask, const int s);
4715/// \endcode
4716///
4717/// This intrinsic corresponds to the \c VPGATHERQQ instruction.
4718///
4719/// \param a
4720///    A 256-bit vector of [4 x i64] used as the source when a mask bit is
4721///    zero.
4722/// \param m
4723///    A pointer to the memory used for loading values.
4724/// \param i
4725///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
4726/// \param mask
4727///    A 256-bit vector of [4 x i64] containing the mask. The most significant
4728///    bit of each element in the mask vector represents the mask bits. If a
4729///    mask bit is zero, the corresponding value from vector \a a is gathered;
4730///    otherwise the value is loaded from memory.
4731/// \param s
4732///    A literal constant scale factor for the indexes in \a i. Must be
4733///    1, 2, 4, or 8.
4734/// \returns A 256-bit vector of [4 x i64] containing the gathered values.
/* Operand casts: a, i and mask each go through __m256i to __v4di before
   reaching the builtin. */
#define _mm256_mask_i64gather_epi64(a, m, i, mask, s)                          \
  ((__m256i)__builtin_ia32_gatherq_q256(                                       \
      (__v4di)(__m256i)(a), (long long const *)(m), (__v4di)(__m256i)(i),      \
      (__v4di)(__m256i)(mask), (s)))
4740
4741/// Gathers two 64-bit floating-point values from memory \a m using scaled
4742///    indexes from the 128-bit vector of [4 x i32] in \a i.
///
///    All elements are loaded unconditionally: the mask operand passed to the
///    builtin is a zero vector compared for equality with itself, and the
///    pass-through source operand is left undefined.
4743///
4744/// \code{.operation}
4745/// FOR element := 0 to 1
4746///   j := element*64
4747///   k := element*32
4748///   result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
4749/// ENDFOR
4750/// \endcode
4751///
4752/// \headerfile <immintrin.h>
4753///
4754/// \code
4755/// __m128d _mm_i32gather_pd(const double *m, __m128i i, const int s);
4756/// \endcode
4757///
4758/// This intrinsic corresponds to the \c VGATHERDPD instruction.
4759///
4760/// \param m
4761///    A pointer to the memory used for loading values.
4762/// \param i
4763///    A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only
4764///    the first two elements are used.
4765/// \param s
4766///    A literal constant scale factor for the indexes in \a i. Must be
4767///    1, 2, 4, or 8.
4768/// \returns A 128-bit vector of [2 x double] containing the gathered values.
4769#define _mm_i32gather_pd(m, i, s) \
4770  ((__m128d)__builtin_ia32_gatherd_pd((__v2df)_mm_undefined_pd(), \
4771                                      (double const *)(m), \
4772                                      (__v4si)(__m128i)(i), \
4773                                      (__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), \
4774                                                           _mm_setzero_pd()), \
4775                                      (s)))
4776
4777/// Gathers four 64-bit floating-point values from memory \a m using scaled
4778///    indexes from the 128-bit vector of [4 x i32] in \a i.
///
///    All elements are loaded unconditionally: the mask operand passed to the
///    builtin is a zero vector compared with itself using \c _CMP_EQ_OQ, and
///    the pass-through source operand is left undefined.
4779///
4780/// \code{.operation}
4781/// FOR element := 0 to 3
4782///   j := element*64
4783///   k := element*32
4784///   result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
4785/// ENDFOR
4786/// \endcode
4787///
4788/// \headerfile <immintrin.h>
4789///
4790/// \code
4791/// __m256d _mm256_i32gather_pd(const double *m, __m128i i, const int s);
4792/// \endcode
4793///
4794/// This intrinsic corresponds to the \c VGATHERDPD instruction.
4795///
4796/// \param m
4797///    A pointer to the memory used for loading values.
4798/// \param i
4799///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
4800/// \param s
4801///    A literal constant scale factor for the indexes in \a i. Must be
4802///    1, 2, 4, or 8.
4803/// \returns A 256-bit vector of [4 x double] containing the gathered values.
4804#define _mm256_i32gather_pd(m, i, s) \
4805  ((__m256d)__builtin_ia32_gatherd_pd256((__v4df)_mm256_undefined_pd(), \
4806                                         (double const *)(m), \
4807                                         (__v4si)(__m128i)(i), \
4808                                         (__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), \
4809                                                               _mm256_setzero_pd(), \
4810                                                               _CMP_EQ_OQ), \
4811                                         (s)))
4812
4813/// Gathers two 64-bit floating-point values from memory \a m using scaled
4814///    indexes from the 128-bit vector of [2 x i64] in \a i.
///
///    All elements are loaded unconditionally: the mask operand passed to the
///    builtin is a zero vector compared for equality with itself, and the
///    pass-through source operand is left undefined.
4815///
4816/// \code{.operation}
4817/// FOR element := 0 to 1
4818///   j := element*64
4819///   k := element*64
4820///   result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
4821/// ENDFOR
4822/// \endcode
4823///
4824/// \headerfile <immintrin.h>
4825///
4826/// \code
4827/// __m128d _mm_i64gather_pd(const double *m, __m128i i, const int s);
4828/// \endcode
4829///
4830/// This intrinsic corresponds to the \c VGATHERQPD instruction.
4831///
4832/// \param m
4833///    A pointer to the memory used for loading values.
4834/// \param i
4835///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
4836/// \param s
4837///    A literal constant scale factor for the indexes in \a i. Must be
4838///    1, 2, 4, or 8.
4839/// \returns A 128-bit vector of [2 x double] containing the gathered values.
4840#define _mm_i64gather_pd(m, i, s) \
4841  ((__m128d)__builtin_ia32_gatherq_pd((__v2df)_mm_undefined_pd(), \
4842                                      (double const *)(m), \
4843                                      (__v2di)(__m128i)(i), \
4844                                      (__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), \
4845                                                           _mm_setzero_pd()), \
4846                                      (s)))
4847
4848/// Gathers four 64-bit floating-point values from memory \a m using scaled
4849///    indexes from the 256-bit vector of [4 x i64] in \a i.
///
///    All elements are loaded unconditionally: the mask operand passed to the
///    builtin is a zero vector compared with itself using \c _CMP_EQ_OQ, and
///    the pass-through source operand is left undefined.
4850///
4851/// \code{.operation}
4852/// FOR element := 0 to 3
4853///   j := element*64
4854///   k := element*64
4855///   result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
4856/// ENDFOR
4857/// \endcode
4858///
4859/// \headerfile <immintrin.h>
4860///
4861/// \code
4862/// __m256d _mm256_i64gather_pd(const double *m, __m256i i, const int s);
4863/// \endcode
4864///
4865/// This intrinsic corresponds to the \c VGATHERQPD instruction.
4866///
4867/// \param m
4868///    A pointer to the memory used for loading values.
4869/// \param i
4870///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
4871/// \param s
4872///    A literal constant scale factor for the indexes in \a i. Must be
4873///    1, 2, 4, or 8.
4874/// \returns A 256-bit vector of [4 x double] containing the gathered values.
4875#define _mm256_i64gather_pd(m, i, s) \
4876  ((__m256d)__builtin_ia32_gatherq_pd256((__v4df)_mm256_undefined_pd(), \
4877                                         (double const *)(m), \
4878                                         (__v4di)(__m256i)(i), \
4879                                         (__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), \
4880                                                               _mm256_setzero_pd(), \
4881                                                               _CMP_EQ_OQ), \
4882                                         (s)))
4883
4884/// Gathers four 32-bit floating-point values from memory \a m using scaled
4885///    indexes from the 128-bit vector of [4 x i32] in \a i.
///
///    All elements are loaded unconditionally: the mask operand passed to the
///    builtin is a zero vector compared for equality with itself, and the
///    pass-through source operand is left undefined.
4886///
4887/// \code{.operation}
4888/// FOR element := 0 to 3
4889///   j := element*32
4890///   k := element*32
4891///   result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
4892/// ENDFOR
4893/// \endcode
4894///
4895/// \headerfile <immintrin.h>
4896///
4897/// \code
4898/// __m128 _mm_i32gather_ps(const float *m, __m128i i, const int s);
4899/// \endcode
4900///
4901/// This intrinsic corresponds to the \c VGATHERDPS instruction.
4902///
4903/// \param m
4904///    A pointer to the memory used for loading values.
4905/// \param i
4906///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
4907/// \param s
4908///    A literal constant scale factor for the indexes in \a i. Must be
4909///    1, 2, 4, or 8.
4910/// \returns A 128-bit vector of [4 x float] containing the gathered values.
4911#define _mm_i32gather_ps(m, i, s) \
4912  ((__m128)__builtin_ia32_gatherd_ps((__v4sf)_mm_undefined_ps(), \
4913                                     (float const *)(m), \
4914                                     (__v4si)(__m128i)(i), \
4915                                     (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \
4916                                                          _mm_setzero_ps()), \
4917                                     (s)))
4918
4919/// Gathers eight 32-bit floating-point values from memory \a m using scaled
4920///    indexes from the 256-bit vector of [8 x i32] in \a i.
///
///    All elements are loaded unconditionally: the mask operand passed to the
///    builtin is a zero vector compared with itself using \c _CMP_EQ_OQ, and
///    the pass-through source operand is left undefined.
4921///
4922/// \code{.operation}
4923/// FOR element := 0 to 7
4924///   j := element*32
4925///   k := element*32
4926///   result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
4927/// ENDFOR
4928/// \endcode
4929///
4930/// \headerfile <immintrin.h>
4931///
4932/// \code
4933/// __m256 _mm256_i32gather_ps(const float *m, __m256i i, const int s);
4934/// \endcode
4935///
4936/// This intrinsic corresponds to the \c VGATHERDPS instruction.
4937///
4938/// \param m
4939///    A pointer to the memory used for loading values.
4940/// \param i
4941///    A 256-bit vector of [8 x i32] containing signed indexes into \a m.
4942/// \param s
4943///    A literal constant scale factor for the indexes in \a i. Must be
4944///    1, 2, 4, or 8.
4945/// \returns A 256-bit vector of [8 x float] containing the gathered values.
4946#define _mm256_i32gather_ps(m, i, s) \
4947  ((__m256)__builtin_ia32_gatherd_ps256((__v8sf)_mm256_undefined_ps(), \
4948                                        (float const *)(m), \
4949                                        (__v8si)(__m256i)(i), \
4950                                        (__v8sf)_mm256_cmp_ps(_mm256_setzero_ps(), \
4951                                                              _mm256_setzero_ps(), \
4952                                                              _CMP_EQ_OQ), \
4953                                        (s)))
4954
4955/// Gathers two 32-bit floating-point values from memory \a m using scaled
4956///    indexes from the 128-bit vector of [2 x i64] in \a i. The upper two
4957///    elements of the result are zeroed.
///
///    No caller-supplied mask is involved: the mask operand passed to the
///    builtin is a zero vector compared for equality with itself, and the
///    pass-through source operand is left undefined.
4958///
4959/// \code{.operation}
4960/// FOR element := 0 to 1
4961///   j := element*32
4962///   k := element*64
4963///   result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
4964/// ENDFOR
4965/// result[127:64] := 0
4966/// \endcode
4967///
4968/// \headerfile <immintrin.h>
4969///
4970/// \code
4971/// __m128 _mm_i64gather_ps(const float *m, __m128i i, const int s);
4972/// \endcode
4973///
4974/// This intrinsic corresponds to the \c VGATHERQPS instruction.
4975///
4976/// \param m
4977///    A pointer to the memory used for loading values.
4978/// \param i
4979///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
4980/// \param s
4981///    A literal constant scale factor for the indexes in \a i. Must be
4982///    1, 2, 4, or 8.
4983/// \returns A 128-bit vector of [4 x float] containing the gathered values.
4984#define _mm_i64gather_ps(m, i, s) \
4985  ((__m128)__builtin_ia32_gatherq_ps((__v4sf)_mm_undefined_ps(), \
4986                                     (float const *)(m), \
4987                                     (__v2di)(__m128i)(i), \
4988                                     (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \
4989                                                          _mm_setzero_ps()), \
4990                                     (s)))
4991
4992/// Gathers four 32-bit floating-point values from memory \a m using scaled
4993///    indexes from the 256-bit vector of [4 x i64] in \a i.
///
///    All elements are loaded unconditionally: the mask operand passed to the
///    builtin is a zero vector compared for equality with itself, and the
///    pass-through source operand is left undefined.
4994///
4995/// \code{.operation}
4996/// FOR element := 0 to 3
4997///   j := element*32
4998///   k := element*64
4999///   result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
5000/// ENDFOR
5001/// \endcode
5002///
5003/// \headerfile <immintrin.h>
5004///
5005/// \code
5006/// __m128 _mm256_i64gather_ps(const float *m, __m256i i, const int s);
5007/// \endcode
5008///
5009/// This intrinsic corresponds to the \c VGATHERQPS instruction.
5010///
5011/// \param m
5012///    A pointer to the memory used for loading values.
5013/// \param i
5014///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
5015/// \param s
5016///    A literal constant scale factor for the indexes in \a i. Must be
5017///    1, 2, 4, or 8.
5018/// \returns A 128-bit vector of [4 x float] containing the gathered values.
5019#define _mm256_i64gather_ps(m, i, s) \
5020  ((__m128)__builtin_ia32_gatherq_ps256((__v4sf)_mm_undefined_ps(), \
5021                                        (float const *)(m), \
5022                                        (__v4di)(__m256i)(i), \
5023                                        (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \
5024                                                             _mm_setzero_ps()), \
5025                                        (s)))
5026
5027/// Gathers four 32-bit integer values from memory \a m using scaled
5028///    indexes from the 128-bit vector of [4 x i32] in \a i.
///
///    All elements are loaded unconditionally: the mask operand passed to the
///    builtin has every bit set (\c _mm_set1_epi32(-1)), and the pass-through
///    source operand is left undefined.
5029///
5030/// \code{.operation}
5031/// FOR element := 0 to 3
5032///   j := element*32
5033///   k := element*32
5034///   result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
5035/// ENDFOR
5036/// \endcode
5037///
5038/// \headerfile <immintrin.h>
5039///
5040/// \code
5041/// __m128i _mm_i32gather_epi32(const int *m, __m128i i, const int s);
5042/// \endcode
5043///
5044/// This intrinsic corresponds to the \c VPGATHERDD instruction.
5045///
5046/// \param m
5047///    A pointer to the memory used for loading values.
5048/// \param i
5049///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
5050/// \param s
5051///    A literal constant scale factor for the indexes in \a i. Must be
5052///    1, 2, 4, or 8.
5053/// \returns A 128-bit vector of [4 x i32] containing the gathered values.
5054#define _mm_i32gather_epi32(m, i, s) \
5055  ((__m128i)__builtin_ia32_gatherd_d((__v4si)_mm_undefined_si128(), \
5056                                     (int const *)(m), (__v4si)(__m128i)(i), \
5057                                     (__v4si)_mm_set1_epi32(-1), (s)))
5058
5059/// Gathers eight 32-bit integer values from memory \a m using scaled
5060///    indexes from the 256-bit vector of [8 x i32] in \a i.
///
///    All elements are loaded unconditionally: the mask operand passed to the
///    builtin has every bit set (\c _mm256_set1_epi32(-1)), and the
///    pass-through source operand is left undefined.
5061///
5062/// \code{.operation}
5063/// FOR element := 0 to 7
5064///   j := element*32
5065///   k := element*32
5066///   result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
5067/// ENDFOR
5068/// \endcode
5069///
5070/// \headerfile <immintrin.h>
5071///
5072/// \code
5073/// __m256i _mm256_i32gather_epi32(const int *m, __m256i i, const int s);
5074/// \endcode
5075///
5076/// This intrinsic corresponds to the \c VPGATHERDD instruction.
5077///
5078/// \param m
5079///    A pointer to the memory used for loading values.
5080/// \param i
5081///    A 256-bit vector of [8 x i32] containing signed indexes into \a m.
5082/// \param s
5083///    A literal constant scale factor for the indexes in \a i. Must be
5084///    1, 2, 4, or 8.
5085/// \returns A 256-bit vector of [8 x i32] containing the gathered values.
5086#define _mm256_i32gather_epi32(m, i, s) \
5087  ((__m256i)__builtin_ia32_gatherd_d256((__v8si)_mm256_undefined_si256(), \
5088                                        (int const *)(m), (__v8si)(__m256i)(i), \
5089                                        (__v8si)_mm256_set1_epi32(-1), (s)))
5090
5091/// Gathers two 32-bit integer values from memory \a m using scaled indexes
5092///    from the 128-bit vector of [2 x i64] in \a i. The upper two elements
5093///    of the result are zeroed.
///
///    No caller-supplied mask is involved: the mask operand passed to the
///    builtin has every bit set (\c _mm_set1_epi32(-1)), and the pass-through
///    source operand is left undefined.
5094///
5095/// \code{.operation}
5096/// FOR element := 0 to 1
5097///   j := element*32
5098///   k := element*64
5099///   result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
5100/// ENDFOR
5101/// result[127:64] := 0
5102/// \endcode
5103///
5104/// \headerfile <immintrin.h>
5105///
5106/// \code
5107/// __m128i _mm_i64gather_epi32(const int *m, __m128i i, const int s);
5108/// \endcode
5109///
5110/// This intrinsic corresponds to the \c VPGATHERQD instruction.
5111///
5112/// \param m
5113///    A pointer to the memory used for loading values.
5114/// \param i
5115///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
5116/// \param s
5117///    A literal constant scale factor for the indexes in \a i. Must be
5118///    1, 2, 4, or 8.
5119/// \returns A 128-bit vector of [4 x i32] containing the gathered values.
5120#define _mm_i64gather_epi32(m, i, s) \
5121  ((__m128i)__builtin_ia32_gatherq_d((__v4si)_mm_undefined_si128(), \
5122                                     (int const *)(m), (__v2di)(__m128i)(i), \
5123                                     (__v4si)_mm_set1_epi32(-1), (s)))
5124
5125/// Gathers four 32-bit integer values from memory \a m using scaled indexes
5126///    from the 256-bit vector of [4 x i64] in \a i.
///
///    All elements are loaded unconditionally: the mask operand passed to the
///    builtin has every bit set (\c _mm_set1_epi32(-1)), and the pass-through
///    source operand is left undefined.
5127///
5128/// \code{.operation}
5129/// FOR element := 0 to 3
5130///   j := element*32
5131///   k := element*64
5132///   result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
5133/// ENDFOR
5134/// \endcode
5135///
5136/// \headerfile <immintrin.h>
5137///
5138/// \code
5139/// __m128i _mm256_i64gather_epi32(const int *m, __m256i i, const int s);
5140/// \endcode
5141///
5142/// This intrinsic corresponds to the \c VPGATHERQD instruction.
5143///
5144/// \param m
5145///    A pointer to the memory used for loading values.
5146/// \param i
5147///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
5148/// \param s
5149///    A literal constant scale factor for the indexes in \a i. Must be
5150///    1, 2, 4, or 8.
5151/// \returns A 128-bit vector of [4 x i32] containing the gathered values.
5152#define _mm256_i64gather_epi32(m, i, s) \
5153  ((__m128i)__builtin_ia32_gatherq_d256((__v4si)_mm_undefined_si128(), \
5154                                        (int const *)(m), (__v4di)(__m256i)(i), \
5155                                        (__v4si)_mm_set1_epi32(-1), (s)))
5156
5157/// Gathers two 64-bit integer values from memory \a m using scaled indexes
5158///    from the 128-bit vector of [4 x i32] in \a i.
///
///    All elements are loaded unconditionally: the mask operand passed to the
///    builtin has every bit set (\c _mm_set1_epi64x(-1)), and the pass-through
///    source operand is left undefined.
5159///
5160/// \code{.operation}
5161/// FOR element := 0 to 1
5162///   j := element*64
5163///   k := element*32
5164///   result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
5165/// ENDFOR
5166/// \endcode
5167///
5168/// \headerfile <immintrin.h>
5169///
5170/// \code
5171/// __m128i _mm_i32gather_epi64(const long long *m, __m128i i, const int s);
5172/// \endcode
5173///
5174/// This intrinsic corresponds to the \c VPGATHERDQ instruction.
5175///
5176/// \param m
5177///    A pointer to the memory used for loading values.
5178/// \param i
5179///    A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only
5180///    the first two elements are used.
5181/// \param s
5182///    A literal constant scale factor for the indexes in \a i. Must be
5183///    1, 2, 4, or 8.
5184/// \returns A 128-bit vector of [2 x i64] containing the gathered values.
5185#define _mm_i32gather_epi64(m, i, s) \
5186  ((__m128i)__builtin_ia32_gatherd_q((__v2di)_mm_undefined_si128(), \
5187                                     (long long const *)(m), \
5188                                     (__v4si)(__m128i)(i), \
5189                                     (__v2di)_mm_set1_epi64x(-1), (s)))
5190
5191/// Gathers four 64-bit integer values from memory \a m using scaled indexes
5192///    from the 128-bit vector of [4 x i32] in \a i.
///
///    All elements are loaded unconditionally: the mask operand passed to the
///    builtin has every bit set (\c _mm256_set1_epi64x(-1)), and the
///    pass-through source operand is left undefined.
5193///
5194/// \code{.operation}
5195/// FOR element := 0 to 3
5196///   j := element*64
5197///   k := element*32
5198///   result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
5199/// ENDFOR
5200/// \endcode
5201///
5202/// \headerfile <immintrin.h>
5203///
5204/// \code
5205/// __m256i _mm256_i32gather_epi64(const long long *m, __m128i i, const int s);
5206/// \endcode
5207///
5208/// This intrinsic corresponds to the \c VPGATHERDQ instruction.
5209///
5210/// \param m
5211///    A pointer to the memory used for loading values.
5212/// \param i
5213///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
5214/// \param s
5215///    A literal constant scale factor for the indexes in \a i. Must be
5216///    1, 2, 4, or 8.
5217/// \returns A 256-bit vector of [4 x i64] containing the gathered values.
5218#define _mm256_i32gather_epi64(m, i, s) \
5219  ((__m256i)__builtin_ia32_gatherd_q256((__v4di)_mm256_undefined_si256(), \
5220                                        (long long const *)(m), \
5221                                        (__v4si)(__m128i)(i), \
5222                                        (__v4di)_mm256_set1_epi64x(-1), (s)))
5223
5224/// Gathers two 64-bit integer values from memory \a m using scaled indexes
5225///    from the 128-bit vector of [2 x i64] in \a i.
///
///    All elements are loaded unconditionally: the mask operand passed to the
///    builtin has every bit set (\c _mm_set1_epi64x(-1)), and the pass-through
///    source operand is left undefined.
5226///
5227/// \code{.operation}
5228/// FOR element := 0 to 1
5229///   j := element*64
5230///   k := element*64
5231///   result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
5232/// ENDFOR
5233/// \endcode
5234///
5235/// \headerfile <immintrin.h>
5236///
5237/// \code
5238/// __m128i _mm_i64gather_epi64(const long long *m, __m128i i, const int s);
5239/// \endcode
5240///
5241/// This intrinsic corresponds to the \c VPGATHERQQ instruction.
5242///
5243/// \param m
5244///    A pointer to the memory used for loading values.
5245/// \param i
5246///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
5247/// \param s
5248///    A literal constant scale factor for the indexes in \a i. Must be
5249///    1, 2, 4, or 8.
5250/// \returns A 128-bit vector of [2 x i64] containing the gathered values.
5251#define _mm_i64gather_epi64(m, i, s) \
5252  ((__m128i)__builtin_ia32_gatherq_q((__v2di)_mm_undefined_si128(), \
5253                                     (long long const *)(m), \
5254                                     (__v2di)(__m128i)(i), \
5255                                     (__v2di)_mm_set1_epi64x(-1), (s)))
5256
5257/// Gathers four 64-bit integer values from memory \a m using scaled indexes
5258///    from the 256-bit vector of [4 x i64] in \a i.
///
///    All elements are loaded unconditionally: the mask operand passed to the
///    builtin has every bit set (\c _mm256_set1_epi64x(-1)), and the
///    pass-through source operand is left undefined.
5259///
5260/// \code{.operation}
5261/// FOR element := 0 to 3
5262///   j := element*64
5263///   k := element*64
5264///   result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
5265/// ENDFOR
5266/// \endcode
5267///
5268/// \headerfile <immintrin.h>
5269///
5270/// \code
5271/// __m256i _mm256_i64gather_epi64(const long long *m, __m256i i, const int s);
5272/// \endcode
5273///
5274/// This intrinsic corresponds to the \c VPGATHERQQ instruction.
5275///
5276/// \param m
5277///    A pointer to the memory used for loading values.
5278/// \param i
5279///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
5280/// \param s
5281///    A literal constant scale factor for the indexes in \a i. Must be
5282///    1, 2, 4, or 8.
5283/// \returns A 256-bit vector of [4 x i64] containing the gathered values.
5284#define _mm256_i64gather_epi64(m, i, s) \
5285  ((__m256i)__builtin_ia32_gatherq_q256((__v4di)_mm256_undefined_si256(), \
5286                                        (long long const *)(m), \
5287                                        (__v4di)(__m256i)(i), \
5288                                        (__v4di)_mm256_set1_epi64x(-1), (s)))
5289
/* Drop the file-local default-attribute macros so they do not leak past this
 * header. */
5290#undef __DEFAULT_FN_ATTRS256
5291#undef __DEFAULT_FN_ATTRS128
5292
5293#endif /* __AVX2INTRIN_H */