master
   1/*===---- avxintrin.h - AVX intrinsics -------------------------------------===
   2 *
   3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4 * See https://llvm.org/LICENSE.txt for license information.
   5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6 *
   7 *===-----------------------------------------------------------------------===
   8 */
   9
  10#ifndef __IMMINTRIN_H
  11#error "Never use <avxintrin.h> directly; include <immintrin.h> instead."
  12#endif
  13
  14#ifndef __AVXINTRIN_H
  15#define __AVXINTRIN_H
  16
  17typedef double __v4df __attribute__ ((__vector_size__ (32)));
  18typedef float __v8sf __attribute__ ((__vector_size__ (32)));
  19typedef long long __v4di __attribute__ ((__vector_size__ (32)));
  20typedef int __v8si __attribute__ ((__vector_size__ (32)));
  21typedef short __v16hi __attribute__ ((__vector_size__ (32)));
  22typedef char __v32qi __attribute__ ((__vector_size__ (32)));
  23
  24/* Unsigned types */
  25typedef unsigned long long __v4du __attribute__ ((__vector_size__ (32)));
  26typedef unsigned int __v8su __attribute__ ((__vector_size__ (32)));
  27typedef unsigned short __v16hu __attribute__ ((__vector_size__ (32)));
  28typedef unsigned char __v32qu __attribute__ ((__vector_size__ (32)));
  29
  30/* We need an explicitly signed variant for char. Note that this shouldn't
  31 * appear in the interface though. */
  32typedef signed char __v32qs __attribute__((__vector_size__(32)));
  33
  34typedef float __m256 __attribute__ ((__vector_size__ (32), __aligned__(32)));
  35typedef double __m256d __attribute__((__vector_size__(32), __aligned__(32)));
  36typedef long long __m256i __attribute__((__vector_size__(32), __aligned__(32)));
  37
  38typedef float __m256_u __attribute__ ((__vector_size__ (32), __aligned__(1)));
  39typedef double __m256d_u __attribute__((__vector_size__(32), __aligned__(1)));
  40typedef long long __m256i_u __attribute__((__vector_size__(32), __aligned__(1)));
  41
  42#ifdef __SSE2__
  43/* Both _Float16 and __bf16 require SSE2 being enabled. */
  44typedef _Float16 __v16hf __attribute__((__vector_size__(32), __aligned__(32)));
  45typedef _Float16 __m256h __attribute__((__vector_size__(32), __aligned__(32)));
  46typedef _Float16 __m256h_u __attribute__((__vector_size__(32), __aligned__(1)));
  47
  48typedef __bf16 __v16bf __attribute__((__vector_size__(32), __aligned__(32)));
  49typedef __bf16 __m256bh __attribute__((__vector_size__(32), __aligned__(32)));
  50#endif
  51
  52/* Define the default attributes for the functions in this file. */
  53#if defined(__EVEX512__) && !defined(__AVX10_1_512__)
  54#define __DEFAULT_FN_ATTRS                                                     \
  55  __attribute__((__always_inline__, __nodebug__, __target__("avx,no-evex512"), \
  56                 __min_vector_width__(256)))
  57#define __DEFAULT_FN_ATTRS128                                                  \
  58  __attribute__((__always_inline__, __nodebug__, __target__("avx,no-evex512"), \
  59                 __min_vector_width__(128)))
  60#else
  61#define __DEFAULT_FN_ATTRS                                                     \
  62  __attribute__((__always_inline__, __nodebug__, __target__("avx"),            \
  63                 __min_vector_width__(256)))
  64#define __DEFAULT_FN_ATTRS128                                                  \
  65  __attribute__((__always_inline__, __nodebug__, __target__("avx"),            \
  66                 __min_vector_width__(128)))
  67#endif
  68
  69#if defined(__cplusplus) && (__cplusplus >= 201103L)
  70#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS constexpr
  71#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 constexpr
  72#else
  73#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS128
  74#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS
  75#endif
  76
  77/* Arithmetic */
  78/// Adds two 256-bit vectors of [4 x double].
  79///
  80/// \headerfile <x86intrin.h>
  81///
  82/// This intrinsic corresponds to the <c> VADDPD </c> instruction.
  83///
  84/// \param __a
  85///    A 256-bit vector of [4 x double] containing one of the source operands.
  86/// \param __b
  87///    A 256-bit vector of [4 x double] containing one of the source operands.
  88/// \returns A 256-bit vector of [4 x double] containing the sums of both
  89///    operands.
  90static __inline __m256d __DEFAULT_FN_ATTRS
  91_mm256_add_pd(__m256d __a, __m256d __b)
  92{
  93  return (__m256d)((__v4df)__a+(__v4df)__b);
  94}
  95
  96/// Adds two 256-bit vectors of [8 x float].
  97///
  98/// \headerfile <x86intrin.h>
  99///
 100/// This intrinsic corresponds to the <c> VADDPS </c> instruction.
 101///
 102/// \param __a
 103///    A 256-bit vector of [8 x float] containing one of the source operands.
 104/// \param __b
 105///    A 256-bit vector of [8 x float] containing one of the source operands.
 106/// \returns A 256-bit vector of [8 x float] containing the sums of both
 107///    operands.
 108static __inline __m256 __DEFAULT_FN_ATTRS
 109_mm256_add_ps(__m256 __a, __m256 __b)
 110{
 111  return (__m256)((__v8sf)__a+(__v8sf)__b);
 112}
 113
 114/// Subtracts two 256-bit vectors of [4 x double].
 115///
 116/// \headerfile <x86intrin.h>
 117///
 118/// This intrinsic corresponds to the <c> VSUBPD </c> instruction.
 119///
 120/// \param __a
 121///    A 256-bit vector of [4 x double] containing the minuend.
 122/// \param __b
 123///    A 256-bit vector of [4 x double] containing the subtrahend.
 124/// \returns A 256-bit vector of [4 x double] containing the differences between
 125///    both operands.
 126static __inline __m256d __DEFAULT_FN_ATTRS
 127_mm256_sub_pd(__m256d __a, __m256d __b)
 128{
 129  return (__m256d)((__v4df)__a-(__v4df)__b);
 130}
 131
 132/// Subtracts two 256-bit vectors of [8 x float].
 133///
 134/// \headerfile <x86intrin.h>
 135///
 136/// This intrinsic corresponds to the <c> VSUBPS </c> instruction.
 137///
 138/// \param __a
 139///    A 256-bit vector of [8 x float] containing the minuend.
 140/// \param __b
 141///    A 256-bit vector of [8 x float] containing the subtrahend.
 142/// \returns A 256-bit vector of [8 x float] containing the differences between
 143///    both operands.
 144static __inline __m256 __DEFAULT_FN_ATTRS
 145_mm256_sub_ps(__m256 __a, __m256 __b)
 146{
 147  return (__m256)((__v8sf)__a-(__v8sf)__b);
 148}
 149
 150/// Adds the even-indexed values and subtracts the odd-indexed values of
 151///    two 256-bit vectors of [4 x double].
 152///
 153/// \headerfile <x86intrin.h>
 154///
 155/// This intrinsic corresponds to the <c> VADDSUBPD </c> instruction.
 156///
 157/// \param __a
 158///    A 256-bit vector of [4 x double] containing the left source operand.
 159/// \param __b
 160///    A 256-bit vector of [4 x double] containing the right source operand.
 161/// \returns A 256-bit vector of [4 x double] containing the alternating sums
 162///    and differences between both operands.
 163static __inline __m256d __DEFAULT_FN_ATTRS
 164_mm256_addsub_pd(__m256d __a, __m256d __b)
 165{
 166  return (__m256d)__builtin_ia32_addsubpd256((__v4df)__a, (__v4df)__b);
 167}
 168
 169/// Adds the even-indexed values and subtracts the odd-indexed values of
 170///    two 256-bit vectors of [8 x float].
 171///
 172/// \headerfile <x86intrin.h>
 173///
 174/// This intrinsic corresponds to the <c> VADDSUBPS </c> instruction.
 175///
 176/// \param __a
 177///    A 256-bit vector of [8 x float] containing the left source operand.
 178/// \param __b
 179///    A 256-bit vector of [8 x float] containing the right source operand.
 180/// \returns A 256-bit vector of [8 x float] containing the alternating sums and
 181///    differences between both operands.
 182static __inline __m256 __DEFAULT_FN_ATTRS
 183_mm256_addsub_ps(__m256 __a, __m256 __b)
 184{
 185  return (__m256)__builtin_ia32_addsubps256((__v8sf)__a, (__v8sf)__b);
 186}
 187
 188/// Divides two 256-bit vectors of [4 x double].
 189///
 190/// \headerfile <x86intrin.h>
 191///
 192/// This intrinsic corresponds to the <c> VDIVPD </c> instruction.
 193///
 194/// \param __a
 195///    A 256-bit vector of [4 x double] containing the dividend.
 196/// \param __b
 197///    A 256-bit vector of [4 x double] containing the divisor.
 198/// \returns A 256-bit vector of [4 x double] containing the quotients of both
 199///    operands.
 200static __inline __m256d __DEFAULT_FN_ATTRS
 201_mm256_div_pd(__m256d __a, __m256d __b)
 202{
 203  return (__m256d)((__v4df)__a/(__v4df)__b);
 204}
 205
 206/// Divides two 256-bit vectors of [8 x float].
 207///
 208/// \headerfile <x86intrin.h>
 209///
 210/// This intrinsic corresponds to the <c> VDIVPS </c> instruction.
 211///
 212/// \param __a
 213///    A 256-bit vector of [8 x float] containing the dividend.
 214/// \param __b
 215///    A 256-bit vector of [8 x float] containing the divisor.
 216/// \returns A 256-bit vector of [8 x float] containing the quotients of both
 217///    operands.
 218static __inline __m256 __DEFAULT_FN_ATTRS
 219_mm256_div_ps(__m256 __a, __m256 __b)
 220{
 221  return (__m256)((__v8sf)__a/(__v8sf)__b);
 222}
 223
 224/// Compares two 256-bit vectors of [4 x double] and returns the greater
 225///    of each pair of values.
 226///
 227///    If either value in a comparison is NaN, returns the value from \a __b.
 228///
 229/// \headerfile <x86intrin.h>
 230///
 231/// This intrinsic corresponds to the <c> VMAXPD </c> instruction.
 232///
 233/// \param __a
 234///    A 256-bit vector of [4 x double] containing one of the operands.
 235/// \param __b
 236///    A 256-bit vector of [4 x double] containing one of the operands.
 237/// \returns A 256-bit vector of [4 x double] containing the maximum values
 238///    between both operands.
 239static __inline __m256d __DEFAULT_FN_ATTRS
 240_mm256_max_pd(__m256d __a, __m256d __b)
 241{
 242  return (__m256d)__builtin_ia32_maxpd256((__v4df)__a, (__v4df)__b);
 243}
 244
 245/// Compares two 256-bit vectors of [8 x float] and returns the greater
 246///    of each pair of values.
 247///
 248///    If either value in a comparison is NaN, returns the value from \a __b.
 249///
 250/// \headerfile <x86intrin.h>
 251///
 252/// This intrinsic corresponds to the <c> VMAXPS </c> instruction.
 253///
 254/// \param __a
 255///    A 256-bit vector of [8 x float] containing one of the operands.
 256/// \param __b
 257///    A 256-bit vector of [8 x float] containing one of the operands.
 258/// \returns A 256-bit vector of [8 x float] containing the maximum values
 259///    between both operands.
 260static __inline __m256 __DEFAULT_FN_ATTRS
 261_mm256_max_ps(__m256 __a, __m256 __b)
 262{
 263  return (__m256)__builtin_ia32_maxps256((__v8sf)__a, (__v8sf)__b);
 264}
 265
 266/// Compares two 256-bit vectors of [4 x double] and returns the lesser
 267///    of each pair of values.
 268///
 269///    If either value in a comparison is NaN, returns the value from \a __b.
 270///
 271/// \headerfile <x86intrin.h>
 272///
 273/// This intrinsic corresponds to the <c> VMINPD </c> instruction.
 274///
 275/// \param __a
 276///    A 256-bit vector of [4 x double] containing one of the operands.
 277/// \param __b
 278///    A 256-bit vector of [4 x double] containing one of the operands.
 279/// \returns A 256-bit vector of [4 x double] containing the minimum values
 280///    between both operands.
 281static __inline __m256d __DEFAULT_FN_ATTRS
 282_mm256_min_pd(__m256d __a, __m256d __b)
 283{
 284  return (__m256d)__builtin_ia32_minpd256((__v4df)__a, (__v4df)__b);
 285}
 286
 287/// Compares two 256-bit vectors of [8 x float] and returns the lesser
 288///    of each pair of values.
 289///
 290///    If either value in a comparison is NaN, returns the value from \a __b.
 291///
 292/// \headerfile <x86intrin.h>
 293///
 294/// This intrinsic corresponds to the <c> VMINPS </c> instruction.
 295///
 296/// \param __a
 297///    A 256-bit vector of [8 x float] containing one of the operands.
 298/// \param __b
 299///    A 256-bit vector of [8 x float] containing one of the operands.
 300/// \returns A 256-bit vector of [8 x float] containing the minimum values
 301///    between both operands.
 302static __inline __m256 __DEFAULT_FN_ATTRS
 303_mm256_min_ps(__m256 __a, __m256 __b)
 304{
 305  return (__m256)__builtin_ia32_minps256((__v8sf)__a, (__v8sf)__b);
 306}
 307
 308/// Multiplies two 256-bit vectors of [4 x double].
 309///
 310/// \headerfile <x86intrin.h>
 311///
 312/// This intrinsic corresponds to the <c> VMULPD </c> instruction.
 313///
 314/// \param __a
 315///    A 256-bit vector of [4 x double] containing one of the operands.
 316/// \param __b
 317///    A 256-bit vector of [4 x double] containing one of the operands.
 318/// \returns A 256-bit vector of [4 x double] containing the products of both
 319///    operands.
 320static __inline __m256d __DEFAULT_FN_ATTRS
 321_mm256_mul_pd(__m256d __a, __m256d __b)
 322{
 323  return (__m256d)((__v4df)__a * (__v4df)__b);
 324}
 325
 326/// Multiplies two 256-bit vectors of [8 x float].
 327///
 328/// \headerfile <x86intrin.h>
 329///
 330/// This intrinsic corresponds to the <c> VMULPS </c> instruction.
 331///
 332/// \param __a
 333///    A 256-bit vector of [8 x float] containing one of the operands.
 334/// \param __b
 335///    A 256-bit vector of [8 x float] containing one of the operands.
 336/// \returns A 256-bit vector of [8 x float] containing the products of both
 337///    operands.
 338static __inline __m256 __DEFAULT_FN_ATTRS
 339_mm256_mul_ps(__m256 __a, __m256 __b)
 340{
 341  return (__m256)((__v8sf)__a * (__v8sf)__b);
 342}
 343
 344/// Calculates the square roots of the values in a 256-bit vector of
 345///    [4 x double].
 346///
 347/// \headerfile <x86intrin.h>
 348///
 349/// This intrinsic corresponds to the <c> VSQRTPD </c> instruction.
 350///
 351/// \param __a
 352///    A 256-bit vector of [4 x double].
 353/// \returns A 256-bit vector of [4 x double] containing the square roots of the
 354///    values in the operand.
 355static __inline __m256d __DEFAULT_FN_ATTRS
 356_mm256_sqrt_pd(__m256d __a)
 357{
 358  return (__m256d)__builtin_ia32_sqrtpd256((__v4df)__a);
 359}
 360
 361/// Calculates the square roots of the values in a 256-bit vector of
 362///    [8 x float].
 363///
 364/// \headerfile <x86intrin.h>
 365///
 366/// This intrinsic corresponds to the <c> VSQRTPS </c> instruction.
 367///
 368/// \param __a
 369///    A 256-bit vector of [8 x float].
 370/// \returns A 256-bit vector of [8 x float] containing the square roots of the
 371///    values in the operand.
 372static __inline __m256 __DEFAULT_FN_ATTRS
 373_mm256_sqrt_ps(__m256 __a)
 374{
 375  return (__m256)__builtin_ia32_sqrtps256((__v8sf)__a);
 376}
 377
 378/// Calculates the reciprocal square roots of the values in a 256-bit
 379///    vector of [8 x float].
 380///
 381/// \headerfile <x86intrin.h>
 382///
 383/// This intrinsic corresponds to the <c> VRSQRTPS </c> instruction.
 384///
 385/// \param __a
 386///    A 256-bit vector of [8 x float].
 387/// \returns A 256-bit vector of [8 x float] containing the reciprocal square
 388///    roots of the values in the operand.
 389static __inline __m256 __DEFAULT_FN_ATTRS
 390_mm256_rsqrt_ps(__m256 __a)
 391{
 392  return (__m256)__builtin_ia32_rsqrtps256((__v8sf)__a);
 393}
 394
 395/// Calculates the reciprocals of the values in a 256-bit vector of
 396///    [8 x float].
 397///
 398/// \headerfile <x86intrin.h>
 399///
 400/// This intrinsic corresponds to the <c> VRCPPS </c> instruction.
 401///
 402/// \param __a
 403///    A 256-bit vector of [8 x float].
 404/// \returns A 256-bit vector of [8 x float] containing the reciprocals of the
 405///    values in the operand.
 406static __inline __m256 __DEFAULT_FN_ATTRS
 407_mm256_rcp_ps(__m256 __a)
 408{
 409  return (__m256)__builtin_ia32_rcpps256((__v8sf)__a);
 410}
 411
 412/// Rounds the values in a 256-bit vector of [4 x double] as specified
 413///    by the byte operand. The source values are rounded to integer values and
 414///    returned as 64-bit double-precision floating-point values.
 415///
 416/// \headerfile <x86intrin.h>
 417///
 418/// \code
 419/// __m256d _mm256_round_pd(__m256d V, const int M);
 420/// \endcode
 421///
 422/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
 423///
 424/// \param V
 425///    A 256-bit vector of [4 x double].
 426/// \param M
 427///    An integer value that specifies the rounding operation. \n
 428///    Bits [7:4] are reserved. \n
 429///    Bit [3] is a precision exception value: \n
 430///      0: A normal PE exception is used. \n
 431///      1: The PE field is not updated. \n
 432///    Bit [2] is the rounding control source: \n
 433///      0: Use bits [1:0] of \a M. \n
 434///      1: Use the current MXCSR setting. \n
 435///    Bits [1:0] contain the rounding control definition: \n
 436///      00: Nearest. \n
 437///      01: Downward (toward negative infinity). \n
 438///      10: Upward (toward positive infinity). \n
 439///      11: Truncated.
 440/// \returns A 256-bit vector of [4 x double] containing the rounded values.
 441#define _mm256_round_pd(V, M) \
 442  ((__m256d)__builtin_ia32_roundpd256((__v4df)(__m256d)(V), (M)))
 443
 444/// Rounds the values stored in a 256-bit vector of [8 x float] as
 445///    specified by the byte operand. The source values are rounded to integer
 446///    values and returned as floating-point values.
 447///
 448/// \headerfile <x86intrin.h>
 449///
 450/// \code
 451/// __m256 _mm256_round_ps(__m256 V, const int M);
 452/// \endcode
 453///
 454/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
 455///
 456/// \param V
 457///    A 256-bit vector of [8 x float].
 458/// \param M
 459///    An integer value that specifies the rounding operation. \n
 460///    Bits [7:4] are reserved. \n
 461///    Bit [3] is a precision exception value: \n
 462///      0: A normal PE exception is used. \n
 463///      1: The PE field is not updated. \n
 464///    Bit [2] is the rounding control source: \n
 465///      0: Use bits [1:0] of \a M. \n
 466///      1: Use the current MXCSR setting. \n
 467///    Bits [1:0] contain the rounding control definition: \n
 468///      00: Nearest. \n
 469///      01: Downward (toward negative infinity). \n
 470///      10: Upward (toward positive infinity). \n
 471///      11: Truncated.
 472/// \returns A 256-bit vector of [8 x float] containing the rounded values.
 473#define _mm256_round_ps(V, M) \
 474  ((__m256)__builtin_ia32_roundps256((__v8sf)(__m256)(V), (M)))
 475
 476/// Rounds up the values stored in a 256-bit vector of [4 x double]. The
 477///    source values are rounded up to integer values and returned as 64-bit
 478///    double-precision floating-point values.
 479///
 480/// \headerfile <x86intrin.h>
 481///
 482/// \code
 483/// __m256d _mm256_ceil_pd(__m256d V);
 484/// \endcode
 485///
 486/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
 487///
 488/// \param V
 489///    A 256-bit vector of [4 x double].
 490/// \returns A 256-bit vector of [4 x double] containing the rounded up values.
 491#define _mm256_ceil_pd(V)  _mm256_round_pd((V), _MM_FROUND_CEIL)
 492
 493/// Rounds down the values stored in a 256-bit vector of [4 x double].
 494///    The source values are rounded down to integer values and returned as
 495///    64-bit double-precision floating-point values.
 496///
 497/// \headerfile <x86intrin.h>
 498///
 499/// \code
 500/// __m256d _mm256_floor_pd(__m256d V);
 501/// \endcode
 502///
 503/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
 504///
 505/// \param V
 506///    A 256-bit vector of [4 x double].
 507/// \returns A 256-bit vector of [4 x double] containing the rounded down
 508///    values.
 509#define _mm256_floor_pd(V) _mm256_round_pd((V), _MM_FROUND_FLOOR)
 510
 511/// Rounds up the values stored in a 256-bit vector of [8 x float]. The
 512///    source values are rounded up to integer values and returned as
 513///    floating-point values.
 514///
 515/// \headerfile <x86intrin.h>
 516///
 517/// \code
 518/// __m256 _mm256_ceil_ps(__m256 V);
 519/// \endcode
 520///
 521/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
 522///
 523/// \param V
 524///    A 256-bit vector of [8 x float].
 525/// \returns A 256-bit vector of [8 x float] containing the rounded up values.
 526#define _mm256_ceil_ps(V)  _mm256_round_ps((V), _MM_FROUND_CEIL)
 527
 528/// Rounds down the values stored in a 256-bit vector of [8 x float]. The
 529///    source values are rounded down to integer values and returned as
 530///    floating-point values.
 531///
 532/// \headerfile <x86intrin.h>
 533///
 534/// \code
 535/// __m256 _mm256_floor_ps(__m256 V);
 536/// \endcode
 537///
 538/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
 539///
 540/// \param V
 541///    A 256-bit vector of [8 x float].
 542/// \returns A 256-bit vector of [8 x float] containing the rounded down values.
 543#define _mm256_floor_ps(V) _mm256_round_ps((V), _MM_FROUND_FLOOR)
 544
 545/* Logical */
 546/// Performs a bitwise AND of two 256-bit vectors of [4 x double].
 547///
 548/// \headerfile <x86intrin.h>
 549///
 550/// This intrinsic corresponds to the <c> VANDPD </c> instruction.
 551///
 552/// \param __a
 553///    A 256-bit vector of [4 x double] containing one of the source operands.
 554/// \param __b
 555///    A 256-bit vector of [4 x double] containing one of the source operands.
 556/// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the
 557///    values between both operands.
 558static __inline __m256d __DEFAULT_FN_ATTRS
 559_mm256_and_pd(__m256d __a, __m256d __b)
 560{
 561  return (__m256d)((__v4du)__a & (__v4du)__b);
 562}
 563
 564/// Performs a bitwise AND of two 256-bit vectors of [8 x float].
 565///
 566/// \headerfile <x86intrin.h>
 567///
 568/// This intrinsic corresponds to the <c> VANDPS </c> instruction.
 569///
 570/// \param __a
 571///    A 256-bit vector of [8 x float] containing one of the source operands.
 572/// \param __b
 573///    A 256-bit vector of [8 x float] containing one of the source operands.
 574/// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the
 575///    values between both operands.
 576static __inline __m256 __DEFAULT_FN_ATTRS
 577_mm256_and_ps(__m256 __a, __m256 __b)
 578{
 579  return (__m256)((__v8su)__a & (__v8su)__b);
 580}
 581
 582/// Performs a bitwise AND of two 256-bit vectors of [4 x double], using
 583///    the one's complement of the values contained in the first source operand.
 584///
 585/// \headerfile <x86intrin.h>
 586///
 587/// This intrinsic corresponds to the <c> VANDNPD </c> instruction.
 588///
 589/// \param __a
 590///    A 256-bit vector of [4 x double] containing the left source operand. The
 591///    one's complement of this value is used in the bitwise AND.
 592/// \param __b
 593///    A 256-bit vector of [4 x double] containing the right source operand.
 594/// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the
 595///    values of the second operand and the one's complement of the first
 596///    operand.
 597static __inline __m256d __DEFAULT_FN_ATTRS
 598_mm256_andnot_pd(__m256d __a, __m256d __b)
 599{
 600  return (__m256d)(~(__v4du)__a & (__v4du)__b);
 601}
 602
 603/// Performs a bitwise AND of two 256-bit vectors of [8 x float], using
 604///    the one's complement of the values contained in the first source operand.
 605///
 606/// \headerfile <x86intrin.h>
 607///
 608/// This intrinsic corresponds to the <c> VANDNPS </c> instruction.
 609///
 610/// \param __a
 611///    A 256-bit vector of [8 x float] containing the left source operand. The
 612///    one's complement of this value is used in the bitwise AND.
 613/// \param __b
 614///    A 256-bit vector of [8 x float] containing the right source operand.
 615/// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the
 616///    values of the second operand and the one's complement of the first
 617///    operand.
 618static __inline __m256 __DEFAULT_FN_ATTRS
 619_mm256_andnot_ps(__m256 __a, __m256 __b)
 620{
 621  return (__m256)(~(__v8su)__a & (__v8su)__b);
 622}
 623
 624/// Performs a bitwise OR of two 256-bit vectors of [4 x double].
 625///
 626/// \headerfile <x86intrin.h>
 627///
 628/// This intrinsic corresponds to the <c> VORPD </c> instruction.
 629///
 630/// \param __a
 631///    A 256-bit vector of [4 x double] containing one of the source operands.
 632/// \param __b
 633///    A 256-bit vector of [4 x double] containing one of the source operands.
 634/// \returns A 256-bit vector of [4 x double] containing the bitwise OR of the
 635///    values between both operands.
 636static __inline __m256d __DEFAULT_FN_ATTRS
 637_mm256_or_pd(__m256d __a, __m256d __b)
 638{
 639  return (__m256d)((__v4du)__a | (__v4du)__b);
 640}
 641
 642/// Performs a bitwise OR of two 256-bit vectors of [8 x float].
 643///
 644/// \headerfile <x86intrin.h>
 645///
 646/// This intrinsic corresponds to the <c> VORPS </c> instruction.
 647///
 648/// \param __a
 649///    A 256-bit vector of [8 x float] containing one of the source operands.
 650/// \param __b
 651///    A 256-bit vector of [8 x float] containing one of the source operands.
 652/// \returns A 256-bit vector of [8 x float] containing the bitwise OR of the
 653///    values between both operands.
 654static __inline __m256 __DEFAULT_FN_ATTRS
 655_mm256_or_ps(__m256 __a, __m256 __b)
 656{
 657  return (__m256)((__v8su)__a | (__v8su)__b);
 658}
 659
 660/// Performs a bitwise XOR of two 256-bit vectors of [4 x double].
 661///
 662/// \headerfile <x86intrin.h>
 663///
 664/// This intrinsic corresponds to the <c> VXORPD </c> instruction.
 665///
 666/// \param __a
 667///    A 256-bit vector of [4 x double] containing one of the source operands.
 668/// \param __b
 669///    A 256-bit vector of [4 x double] containing one of the source operands.
 670/// \returns A 256-bit vector of [4 x double] containing the bitwise XOR of the
 671///    values between both operands.
 672static __inline __m256d __DEFAULT_FN_ATTRS
 673_mm256_xor_pd(__m256d __a, __m256d __b)
 674{
 675  return (__m256d)((__v4du)__a ^ (__v4du)__b);
 676}
 677
 678/// Performs a bitwise XOR of two 256-bit vectors of [8 x float].
 679///
 680/// \headerfile <x86intrin.h>
 681///
 682/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
 683///
 684/// \param __a
 685///    A 256-bit vector of [8 x float] containing one of the source operands.
 686/// \param __b
 687///    A 256-bit vector of [8 x float] containing one of the source operands.
 688/// \returns A 256-bit vector of [8 x float] containing the bitwise XOR of the
 689///    values between both operands.
 690static __inline __m256 __DEFAULT_FN_ATTRS
 691_mm256_xor_ps(__m256 __a, __m256 __b)
 692{
 693  return (__m256)((__v8su)__a ^ (__v8su)__b);
 694}
 695
 696/* Horizontal arithmetic */
 697/// Horizontally adds the adjacent pairs of values contained in two
 698///    256-bit vectors of [4 x double].
 699///
 700/// \headerfile <x86intrin.h>
 701///
 702/// This intrinsic corresponds to the <c> VHADDPD </c> instruction.
 703///
 704/// \param __a
 705///    A 256-bit vector of [4 x double] containing one of the source operands.
 706///    The horizontal sums of the values are returned in the even-indexed
 707///    elements of a vector of [4 x double].
 708/// \param __b
 709///    A 256-bit vector of [4 x double] containing one of the source operands.
 710///    The horizontal sums of the values are returned in the odd-indexed
 711///    elements of a vector of [4 x double].
 712/// \returns A 256-bit vector of [4 x double] containing the horizontal sums of
 713///    both operands.
 714static __inline __m256d __DEFAULT_FN_ATTRS
 715_mm256_hadd_pd(__m256d __a, __m256d __b)
 716{
 717  return (__m256d)__builtin_ia32_haddpd256((__v4df)__a, (__v4df)__b);
 718}
 719
 720/// Horizontally adds the adjacent pairs of values contained in two
 721///    256-bit vectors of [8 x float].
 722///
 723/// \headerfile <x86intrin.h>
 724///
 725/// This intrinsic corresponds to the <c> VHADDPS </c> instruction.
 726///
 727/// \param __a
 728///    A 256-bit vector of [8 x float] containing one of the source operands.
 729///    The horizontal sums of the values are returned in the elements with
 730///    index 0, 1, 4, 5 of a vector of [8 x float].
 731/// \param __b
 732///    A 256-bit vector of [8 x float] containing one of the source operands.
 733///    The horizontal sums of the values are returned in the elements with
 734///    index 2, 3, 6, 7 of a vector of [8 x float].
 735/// \returns A 256-bit vector of [8 x float] containing the horizontal sums of
 736///    both operands.
 737static __inline __m256 __DEFAULT_FN_ATTRS
 738_mm256_hadd_ps(__m256 __a, __m256 __b)
 739{
 740  return (__m256)__builtin_ia32_haddps256((__v8sf)__a, (__v8sf)__b);
 741}
 742
 743/// Horizontally subtracts the adjacent pairs of values contained in two
 744///    256-bit vectors of [4 x double].
 745///
 746/// \headerfile <x86intrin.h>
 747///
 748/// This intrinsic corresponds to the <c> VHSUBPD </c> instruction.
 749///
 750/// \param __a
 751///    A 256-bit vector of [4 x double] containing one of the source operands.
 752///    The horizontal differences between the values are returned in the
 753///    even-indexed elements of a vector of [4 x double].
 754/// \param __b
 755///    A 256-bit vector of [4 x double] containing one of the source operands.
 756///    The horizontal differences between the values are returned in the
 757///    odd-indexed elements of a vector of [4 x double].
 758/// \returns A 256-bit vector of [4 x double] containing the horizontal
 759///    differences of both operands.
 760static __inline __m256d __DEFAULT_FN_ATTRS
 761_mm256_hsub_pd(__m256d __a, __m256d __b)
 762{
 763  return (__m256d)__builtin_ia32_hsubpd256((__v4df)__a, (__v4df)__b);
 764}
 765
 766/// Horizontally subtracts the adjacent pairs of values contained in two
 767///    256-bit vectors of [8 x float].
 768///
 769/// \headerfile <x86intrin.h>
 770///
 771/// This intrinsic corresponds to the <c> VHSUBPS </c> instruction.
 772///
 773/// \param __a
 774///    A 256-bit vector of [8 x float] containing one of the source operands.
 775///    The horizontal differences between the values are returned in the
 776///    elements with index 0, 1, 4, 5 of a vector of [8 x float].
 777/// \param __b
 778///    A 256-bit vector of [8 x float] containing one of the source operands.
 779///    The horizontal differences between the values are returned in the
 780///    elements with index 2, 3, 6, 7 of a vector of [8 x float].
 781/// \returns A 256-bit vector of [8 x float] containing the horizontal
 782///    differences of both operands.
 783static __inline __m256 __DEFAULT_FN_ATTRS
 784_mm256_hsub_ps(__m256 __a, __m256 __b)
 785{
 786  return (__m256)__builtin_ia32_hsubps256((__v8sf)__a, (__v8sf)__b);
 787}
 788
 789/* Vector permutations */
 790/// Copies the values in a 128-bit vector of [2 x double] as specified
 791///    by the 128-bit integer vector operand.
 792///
 793/// \headerfile <x86intrin.h>
 794///
 795/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
 796///
 797/// \param __a
 798///    A 128-bit vector of [2 x double].
 799/// \param __c
 800///    A 128-bit integer vector operand specifying how the values are to be
 801///    copied. \n
 802///    Bit [1]: \n
 803///      0: Bits [63:0] of the source are copied to bits [63:0] of the returned
 804///         vector. \n
 805///      1: Bits [127:64] of the source are copied to bits [63:0] of the
 806///         returned vector. \n
 807///    Bit [65]: \n
 808///      0: Bits [63:0] of the source are copied to bits [127:64] of the
 809///         returned vector. \n
 810///      1: Bits [127:64] of the source are copied to bits [127:64] of the
 811///         returned vector.
 812/// \returns A 128-bit vector of [2 x double] containing the copied values.
 813static __inline __m128d __DEFAULT_FN_ATTRS128
 814_mm_permutevar_pd(__m128d __a, __m128i __c)
 815{
 816  return (__m128d)__builtin_ia32_vpermilvarpd((__v2df)__a, (__v2di)__c);
 817}
 818
 819/// Copies the values in a 256-bit vector of [4 x double] as specified
 820///    by the 256-bit integer vector operand.
 821///
 822/// \headerfile <x86intrin.h>
 823///
 824/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
 825///
 826/// \param __a
 827///    A 256-bit vector of [4 x double].
 828/// \param __c
 829///    A 256-bit integer vector operand specifying how the values are to be
 830///    copied. \n
 831///    Bit [1]: \n
 832///      0: Bits [63:0] of the source are copied to bits [63:0] of the returned
 833///         vector. \n
 834///      1: Bits [127:64] of the source are copied to bits [63:0] of the
 835///         returned vector. \n
 836///    Bit [65]: \n
 837///      0: Bits [63:0] of the source are copied to bits [127:64] of the
 838///         returned vector. \n
 839///      1: Bits [127:64] of the source are copied to bits [127:64] of the
 840///         returned vector. \n
 841///    Bit [129]: \n
 842///      0: Bits [191:128] of the source are copied to bits [191:128] of the
 843///         returned vector. \n
 844///      1: Bits [255:192] of the source are copied to bits [191:128] of the
 845///         returned vector. \n
 846///    Bit [193]: \n
 847///      0: Bits [191:128] of the source are copied to bits [255:192] of the
 848///         returned vector. \n
 849///      1: Bits [255:192] of the source are copied to bits [255:192] of the
 850///    returned vector.
 851/// \returns A 256-bit vector of [4 x double] containing the copied values.
 852static __inline __m256d __DEFAULT_FN_ATTRS
 853_mm256_permutevar_pd(__m256d __a, __m256i __c)
 854{
 855  return (__m256d)__builtin_ia32_vpermilvarpd256((__v4df)__a, (__v4di)__c);
 856}
 857
 858/// Copies the values stored in a 128-bit vector of [4 x float] as
 859///    specified by the 128-bit integer vector operand.
 860///
 861/// \headerfile <x86intrin.h>
 862///
 863/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
 864///
 865/// \param __a
 866///    A 128-bit vector of [4 x float].
 867/// \param __c
 868///    A 128-bit integer vector operand specifying how the values are to be
 869///    copied. \n
 870///    Bits [1:0]: \n
 871///      00: Bits [31:0] of the source are copied to bits [31:0] of the
 872///          returned vector. \n
 873///      01: Bits [63:32] of the source are copied to bits [31:0] of the
 874///          returned vector. \n
 875///      10: Bits [95:64] of the source are copied to bits [31:0] of the
 876///          returned vector. \n
 877///      11: Bits [127:96] of the source are copied to bits [31:0] of the
 878///          returned vector. \n
 879///    Bits [33:32]: \n
 880///      00: Bits [31:0] of the source are copied to bits [63:32] of the
 881///          returned vector. \n
 882///      01: Bits [63:32] of the source are copied to bits [63:32] of the
 883///          returned vector. \n
 884///      10: Bits [95:64] of the source are copied to bits [63:32] of the
 885///          returned vector. \n
 886///      11: Bits [127:96] of the source are copied to bits [63:32] of the
 887///          returned vector. \n
 888///    Bits [65:64]: \n
 889///      00: Bits [31:0] of the source are copied to bits [95:64] of the
 890///          returned vector. \n
 891///      01: Bits [63:32] of the source are copied to bits [95:64] of the
 892///          returned vector. \n
 893///      10: Bits [95:64] of the source are copied to bits [95:64] of the
 894///          returned vector. \n
 895///      11: Bits [127:96] of the source are copied to bits [95:64] of the
 896///          returned vector. \n
 897///    Bits [97:96]: \n
 898///      00: Bits [31:0] of the source are copied to bits [127:96] of the
 899///          returned vector. \n
 900///      01: Bits [63:32] of the source are copied to bits [127:96] of the
 901///          returned vector. \n
 902///      10: Bits [95:64] of the source are copied to bits [127:96] of the
 903///          returned vector. \n
 904///      11: Bits [127:96] of the source are copied to bits [127:96] of the
 905///          returned vector.
 906/// \returns A 128-bit vector of [4 x float] containing the copied values.
 907static __inline __m128 __DEFAULT_FN_ATTRS128
 908_mm_permutevar_ps(__m128 __a, __m128i __c)
 909{
 910  return (__m128)__builtin_ia32_vpermilvarps((__v4sf)__a, (__v4si)__c);
 911}
 912
 913/// Copies the values stored in a 256-bit vector of [8 x float] as
 914///    specified by the 256-bit integer vector operand.
 915///
 916/// \headerfile <x86intrin.h>
 917///
 918/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
 919///
 920/// \param __a
 921///    A 256-bit vector of [8 x float].
 922/// \param __c
 923///    A 256-bit integer vector operand specifying how the values are to be
 924///    copied. \n
 925///    Bits [1:0]: \n
 926///      00: Bits [31:0] of the source are copied to bits [31:0] of the
 927///          returned vector. \n
 928///      01: Bits [63:32] of the source are copied to bits [31:0] of the
 929///          returned vector. \n
 930///      10: Bits [95:64] of the source are copied to bits [31:0] of the
 931///          returned vector. \n
 932///      11: Bits [127:96] of the source are copied to bits [31:0] of the
 933///          returned vector. \n
 934///    Bits [33:32]: \n
 935///      00: Bits [31:0] of the source are copied to bits [63:32] of the
 936///          returned vector. \n
 937///      01: Bits [63:32] of the source are copied to bits [63:32] of the
 938///          returned vector. \n
 939///      10: Bits [95:64] of the source are copied to bits [63:32] of the
 940///          returned vector. \n
 941///      11: Bits [127:96] of the source are copied to bits [63:32] of the
 942///          returned vector. \n
 943///    Bits [65:64]: \n
 944///      00: Bits [31:0] of the source are copied to bits [95:64] of the
 945///          returned vector. \n
 946///      01: Bits [63:32] of the source are copied to bits [95:64] of the
 947///          returned vector. \n
 948///      10: Bits [95:64] of the source are copied to bits [95:64] of the
 949///          returned vector. \n
 950///      11: Bits [127:96] of the source are copied to bits [95:64] of the
 951///          returned vector. \n
 952///    Bits [97:96]: \n
 953///      00: Bits [31:0] of the source are copied to bits [127:96] of the
 954///          returned vector. \n
 955///      01: Bits [63:32] of the source are copied to bits [127:96] of the
 956///          returned vector. \n
 957///      10: Bits [95:64] of the source are copied to bits [127:96] of the
 958///          returned vector. \n
 959///      11: Bits [127:96] of the source are copied to bits [127:96] of the
 960///          returned vector. \n
 961///    Bits [129:128]: \n
 962///      00: Bits [159:128] of the source are copied to bits [159:128] of the
 963///          returned vector. \n
 964///      01: Bits [191:160] of the source are copied to bits [159:128] of the
 965///          returned vector. \n
 966///      10: Bits [223:192] of the source are copied to bits [159:128] of the
 967///          returned vector. \n
 968///      11: Bits [255:224] of the source are copied to bits [159:128] of the
 969///          returned vector. \n
 970///    Bits [161:160]: \n
 971///      00: Bits [159:128] of the source are copied to bits [191:160] of the
 972///          returned vector. \n
 973///      01: Bits [191:160] of the source are copied to bits [191:160] of the
 974///          returned vector. \n
 975///      10: Bits [223:192] of the source are copied to bits [191:160] of the
 976///          returned vector. \n
 977///      11: Bits [255:224] of the source are copied to bits [191:160] of the
 978///          returned vector. \n
 979///    Bits [193:192]: \n
 980///      00: Bits [159:128] of the source are copied to bits [223:192] of the
 981///          returned vector. \n
 982///      01: Bits [191:160] of the source are copied to bits [223:192] of the
 983///          returned vector. \n
 984///      10: Bits [223:192] of the source are copied to bits [223:192] of the
 985///          returned vector. \n
 986///      11: Bits [255:224] of the source are copied to bits [223:192] of the
 987///          returned vector. \n
 988///    Bits [225:224]: \n
 989///      00: Bits [159:128] of the source are copied to bits [255:224] of the
 990///          returned vector. \n
 991///      01: Bits [191:160] of the source are copied to bits [255:224] of the
 992///          returned vector. \n
 993///      10: Bits [223:192] of the source are copied to bits [255:224] of the
 994///          returned vector. \n
 995///      11: Bits [255:224] of the source are copied to bits [255:224] of the
 996///          returned vector.
 997/// \returns A 256-bit vector of [8 x float] containing the copied values.
 998static __inline __m256 __DEFAULT_FN_ATTRS
 999_mm256_permutevar_ps(__m256 __a, __m256i __c)
1000{
1001  return (__m256)__builtin_ia32_vpermilvarps256((__v8sf)__a, (__v8si)__c);
1002}
1003
/// Rearranges the two values of a 128-bit vector of [2 x double] under the
///    control of an immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_permute_pd(__m128d A, const int C);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
///
/// \param A
///    A 128-bit vector of [2 x double] providing the source values.
/// \param C
///    An immediate integer operand holding one selector bit per returned
///    value. \n
///    Bit [0] selects the value copied to bits [63:0] of the returned
///    vector: \n
///      0: Bits [63:0] of the source. \n
///      1: Bits [127:64] of the source. \n
///    Bit [1] selects the value copied to bits [127:64] of the returned
///    vector: \n
///      0: Bits [63:0] of the source. \n
///      1: Bits [127:64] of the source.
/// \returns A 128-bit vector of [2 x double] containing the copied values.
#define _mm_permute_pd(A, C)                                                   \
  ((__m128d)__builtin_ia32_vpermilpd((__v2df)(__m128d)(A), (int)(C)))
1033
/// Rearranges the values of a 256-bit vector of [4 x double] under the
///    control of an immediate integer operand. Each returned value is taken
///    from the 128-bit half of the source that it occupies itself.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_permute_pd(__m256d A, const int C);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
///
/// \param A
///    A 256-bit vector of [4 x double] providing the source values.
/// \param C
///    An immediate integer operand holding one selector bit per returned
///    value. \n
///    Bit [0] selects the value copied to bits [63:0] of the returned
///    vector: \n
///      0: Bits [63:0] of the source. \n
///      1: Bits [127:64] of the source. \n
///    Bit [1] selects the value copied to bits [127:64] of the returned
///    vector: \n
///      0: Bits [63:0] of the source. \n
///      1: Bits [127:64] of the source. \n
///    Bit [2] selects the value copied to bits [191:128] of the returned
///    vector: \n
///      0: Bits [191:128] of the source. \n
///      1: Bits [255:192] of the source. \n
///    Bit [3] selects the value copied to bits [255:192] of the returned
///    vector: \n
///      0: Bits [191:128] of the source. \n
///      1: Bits [255:192] of the source.
/// \returns A 256-bit vector of [4 x double] containing the copied values.
#define _mm256_permute_pd(A, C)                                                \
  ((__m256d)__builtin_ia32_vpermilpd256((__v4df)(__m256d)(A), (int)(C)))
1073
/// Rearranges the values of a 128-bit vector of [4 x float] under the
///    control of an immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_permute_ps(__m128 A, const int C);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
///
/// \param A
///    A 128-bit vector of [4 x float] providing the source values.
/// \param C
///    An immediate integer operand holding one 2-bit selector per returned
///    value. \n
///    Bits [1:0] select the value copied to bits [31:0] of the returned
///    vector. \n
///    Bits [3:2] select the value copied to bits [63:32] of the returned
///    vector. \n
///    Bits [5:4] select the value copied to bits [95:64] of the returned
///    vector. \n
///    Bits [7:6] select the value copied to bits [127:96] of the returned
///    vector. \n
///    Every selector is decoded in the same way: \n
///      00: Bits [31:0] of the source are copied. \n
///      01: Bits [63:32] of the source are copied. \n
///      10: Bits [95:64] of the source are copied. \n
///      11: Bits [127:96] of the source are copied.
/// \returns A 128-bit vector of [4 x float] containing the copied values.
#define _mm_permute_ps(A, C)                                                   \
  ((__m128)__builtin_ia32_vpermilps((__v4sf)(__m128)(A), (int)(C)))
1129
/// Copies the values in a 256-bit vector of [8 x float] as specified by
///    the immediate integer operand. The same 8-bit immediate controls both
///    128-bit halves of the vector: each returned value is taken from the
///    128-bit half of the source that it occupies itself.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_permute_ps(__m256 A, const int C);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
///
/// \param A
///    A 256-bit vector of [8 x float].
/// \param C
///    An immediate integer operand holding four 2-bit selectors. Each
///    selector chooses one value for the lower 128 bits and, at the same
///    time, one value for the upper 128 bits of the returned vector. \n
///    Bits [1:0] select the values copied to bits [31:0] and bits [159:128]
///    of the returned vector. \n
///    Bits [3:2] select the values copied to bits [63:32] and bits [191:160]
///    of the returned vector. \n
///    Bits [5:4] select the values copied to bits [95:64] and bits [223:192]
///    of the returned vector. \n
///    Bits [7:6] select the values copied to bits [127:96] and bits [255:224]
///    of the returned vector. \n
///    Every selector is decoded in the same way: \n
///      00: Bits [31:0] of the source are copied to the lower-half position
///          and bits [159:128] of the source to the upper-half position. \n
///      01: Bits [63:32] of the source are copied to the lower-half position
///          and bits [191:160] of the source to the upper-half position. \n
///      10: Bits [95:64] of the source are copied to the lower-half position
///          and bits [223:192] of the source to the upper-half position. \n
///      11: Bits [127:96] of the source are copied to the lower-half position
///          and bits [255:224] of the source to the upper-half position.
/// \returns A 256-bit vector of [8 x float] containing the copied values.
#define _mm256_permute_ps(A, C)                                                \
  ((__m256)__builtin_ia32_vpermilps256((__v8sf)(__m256)(A), (int)(C)))
1221
/// Permutes 128-bit data values stored in two 256-bit vectors of
///    [4 x double], as specified by the immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_permute2f128_pd(__m256d V1, __m256d V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
///
/// \param V1
///    A 256-bit vector of [4 x double].
/// \param V2
///    A 256-bit vector of [4 x double].
/// \param M
///    An immediate integer operand specifying how the values are to be
///    permuted. \n
///    Bits [1:0]: \n
///      00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
///          destination. \n
///      01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
///          destination. \n
///      10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
///          destination. \n
///      11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
///          destination. \n
///    Bit [3]: \n
///      1: Bits [127:0] of the destination are set to zero, regardless of
///         bits [1:0]. \n
///    Bits [5:4]: \n
///      00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
///          destination. \n
///      01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
///          destination. \n
///      10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
///          destination. \n
///      11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
///          destination. \n
///    Bit [7]: \n
///      1: Bits [255:128] of the destination are set to zero, regardless of
///         bits [5:4].
/// \returns A 256-bit vector of [4 x double] containing the copied values.
#define _mm256_permute2f128_pd(V1, V2, M)                                      \
  ((__m256d)__builtin_ia32_vperm2f128_pd256(                                   \
      (__v4df)(__m256d)(V1), (__v4df)(__m256d)(V2), (int)(M)))
1262
/// Permutes 128-bit data values stored in two 256-bit vectors of
///    [8 x float], as specified by the immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_permute2f128_ps(__m256 V1, __m256 V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
///
/// \param V1
///    A 256-bit vector of [8 x float].
/// \param V2
///    A 256-bit vector of [8 x float].
/// \param M
///    An immediate integer operand specifying how the values are to be
///    permuted. \n
///    Bits [1:0]: \n
///      00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
///          destination. \n
///      01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
///          destination. \n
///      10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
///          destination. \n
///      11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
///          destination. \n
///    Bit [3]: \n
///      1: Bits [127:0] of the destination are set to zero, regardless of
///         bits [1:0]. \n
///    Bits [5:4]: \n
///      00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
///          destination. \n
///      01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
///          destination. \n
///      10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
///          destination. \n
///      11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
///          destination. \n
///    Bit [7]: \n
///      1: Bits [255:128] of the destination are set to zero, regardless of
///         bits [5:4].
/// \returns A 256-bit vector of [8 x float] containing the copied values.
#define _mm256_permute2f128_ps(V1, V2, M)                                      \
  ((__m256)__builtin_ia32_vperm2f128_ps256(                                    \
      (__v8sf)(__m256)(V1), (__v8sf)(__m256)(V2), (int)(M)))
1303
/// Permutes 128-bit data values stored in two 256-bit integer vectors,
///    as specified by the immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_permute2f128_si256(__m256i V1, __m256i V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
///
/// \param V1
///    A 256-bit integer vector.
/// \param V2
///    A 256-bit integer vector.
/// \param M
///    An immediate integer operand specifying how the values are to be
///    copied. \n
///    Bits [1:0]: \n
///      00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
///          destination. \n
///      01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
///          destination. \n
///      10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
///          destination. \n
///      11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
///          destination. \n
///    Bit [3]: \n
///      1: Bits [127:0] of the destination are set to zero, regardless of
///         bits [1:0]. \n
///    Bits [5:4]: \n
///      00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
///          destination. \n
///      01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
///          destination. \n
///      10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
///          destination. \n
///      11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
///          destination. \n
///    Bit [7]: \n
///      1: Bits [255:128] of the destination are set to zero, regardless of
///         bits [5:4].
/// \returns A 256-bit integer vector containing the copied values.
#define _mm256_permute2f128_si256(V1, V2, M)                                   \
  ((__m256i)__builtin_ia32_vperm2f128_si256(                                   \
      (__v8si)(__m256i)(V1), (__v8si)(__m256i)(V2), (int)(M)))
1343
1344/* Vector Blend */
/// Builds a 256-bit vector of [4 x double] by choosing each 64-bit
///    double-precision value from one of two 256-bit vectors of [4 x double],
///    under the control of an immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_blend_pd(__m256d V1, __m256d V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VBLENDPD </c> instruction.
///
/// \param V1
///    A 256-bit vector of [4 x double].
/// \param V2
///    A 256-bit vector of [4 x double].
/// \param M
///    An immediate integer operand whose mask bits [3:0] control the
///    selection. The position of a mask bit is the index of the value it
///    controls. A mask bit of 0 copies the corresponding 64-bit element of
///    operand \a V1 to the same position in the destination; a mask bit of 1
///    copies the corresponding 64-bit element of operand \a V2 instead.
/// \returns A 256-bit vector of [4 x double] containing the copied values.
#define _mm256_blend_pd(V1, V2, M)                                             \
  ((__m256d)__builtin_ia32_blendpd256(                                         \
      (__v4df)(__m256d)(V1), (__v4df)(__m256d)(V2), (int)(M)))
1372
/// Builds a 256-bit vector of [8 x float] by choosing each 32-bit
///    single-precision value from one of two 256-bit vectors of [8 x float],
///    under the control of an immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_blend_ps(__m256 V1, __m256 V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VBLENDPS </c> instruction.
///
/// \param V1
///    A 256-bit vector of [8 x float].
/// \param V2
///    A 256-bit vector of [8 x float].
/// \param M
///    An immediate integer operand whose mask bits [7:0] control the
///    selection. The position of a mask bit is the index of the value it
///    controls. A mask bit of 0 copies the corresponding 32-bit element of
///    operand \a V1 to the same position in the destination; a mask bit of 1
///    copies the corresponding 32-bit element of operand \a V2 instead.
/// \returns A 256-bit vector of [8 x float] containing the copied values.
#define _mm256_blend_ps(V1, V2, M)                                             \
  ((__m256)__builtin_ia32_blendps256(                                          \
      (__v8sf)(__m256)(V1), (__v8sf)(__m256)(V2), (int)(M)))
1400
1401/// Merges 64-bit double-precision data values stored in either of the
1402///    two 256-bit vectors of [4 x double], as specified by the 256-bit vector
1403///    operand.
1404///
1405/// \headerfile <x86intrin.h>
1406///
1407/// This intrinsic corresponds to the <c> VBLENDVPD </c> instruction.
1408///
1409/// \param __a
1410///    A 256-bit vector of [4 x double].
1411/// \param __b
1412///    A 256-bit vector of [4 x double].
1413/// \param __c
1414///    A 256-bit vector operand, with mask bits 255, 191, 127, and 63 specifying
1415///    how the values are to be copied. The position of the mask bit corresponds
1416///    to the most significant bit of a copied value. When a mask bit is 0, the
1417///    corresponding 64-bit element in operand \a __a is copied to the same
1418///    position in the destination. When a mask bit is 1, the corresponding
1419///    64-bit element in operand \a __b is copied to the same position in the
1420///    destination.
1421/// \returns A 256-bit vector of [4 x double] containing the copied values.
1422static __inline __m256d __DEFAULT_FN_ATTRS
1423_mm256_blendv_pd(__m256d __a, __m256d __b, __m256d __c)
1424{
1425  return (__m256d)__builtin_ia32_blendvpd256(
1426    (__v4df)__a, (__v4df)__b, (__v4df)__c);
1427}
1428
1429/// Merges 32-bit single-precision data values stored in either of the
1430///    two 256-bit vectors of [8 x float], as specified by the 256-bit vector
1431///    operand.
1432///
1433/// \headerfile <x86intrin.h>
1434///
1435/// This intrinsic corresponds to the <c> VBLENDVPS </c> instruction.
1436///
1437/// \param __a
1438///    A 256-bit vector of [8 x float].
1439/// \param __b
1440///    A 256-bit vector of [8 x float].
1441/// \param __c
1442///    A 256-bit vector operand, with mask bits 255, 223, 191, 159, 127, 95, 63,
1443///    and 31 specifying how the values are to be copied. The position of the
1444///    mask bit corresponds to the most significant bit of a copied value. When
1445///    a mask bit is 0, the corresponding 32-bit element in operand \a __a is
1446///    copied to the same position in the destination. When a mask bit is 1, the
1447///    corresponding 32-bit element in operand \a __b is copied to the same
1448///    position in the destination.
1449/// \returns A 256-bit vector of [8 x float] containing the copied values.
1450static __inline __m256 __DEFAULT_FN_ATTRS
1451_mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
1452{
1453  return (__m256)__builtin_ia32_blendvps256(
1454    (__v8sf)__a, (__v8sf)__b, (__v8sf)__c);
1455}
1456
1457/* Vector Dot Product */
/// Computes two independent dot products, one from the lower halves and
///    one from the upper halves of two [8 x float] vectors, and returns each
///    dot product in the matching half of the [8 x float] result.
///
///    The immediate integer operand controls which input elements take part
///    in the dot product and which result elements receive it. In general,
///    for each dot product, the four pairs of corresponding input elements
///    are multiplied; the first two products and the second two products are
///    summed separately, and those two sums are then added to form the final
///    result.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_dp_ps(__m256 V1, __m256 V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VDPPS </c> instruction.
///
/// \param V1
///    A vector of [8 x float] values, treated as two [4 x float] vectors.
/// \param V2
///    A vector of [8 x float] values, treated as two [4 x float] vectors.
/// \param M
///    An immediate integer argument. \n
///    Bits [7:4] choose the input elements: bit [4] corresponds to the lowest
///    element and bit [7] to the highest element of each [4 x float]
///    subvector. When a bit is set, the corresponding elements of the two
///    input vectors are used as an input to the dot product; otherwise that
///    input is treated as zero. \n
///    Bits [3:0] choose the result elements: bit [0] corresponds to the
///    lowest element and bit [3] to the highest element of each [4 x float]
///    subvector. When a bit is set, the dot product is returned in the
///    corresponding element; otherwise that element is set to zero. \n
///    The bitmask is applied in the same way to each of the two parallel dot
///    product computations.
/// \returns A 256-bit vector of [8 x float] containing the two dot products.
#define _mm256_dp_ps(V1, V2, M)                                                \
  ((__m256)__builtin_ia32_dpps256(                                             \
      (__v8sf)(__m256)(V1), (__v8sf)(__m256)(V2), (M)))
1498
1499/* Vector shuffle */
1500/// Selects 8 float values from the two 256-bit operands of [8 x float], as
1501///    specified by the immediate value operand.
1502///
1503///    The four selected elements in each operand are copied to the destination
1504///    according to the bits specified in the immediate operand. The selected
1505///    elements from the first 256-bit operand are copied to bits [63:0] and
1506///    bits [191:128] of the destination, and the selected elements from the
1507///    second 256-bit operand are copied to bits [127:64] and bits [255:192] of
1508///    the destination. For example, if bits [7:0] of the immediate operand
1509///    contain a value of 0xFF, the 256-bit destination vector would contain the
1510///    following values: b[7], b[7], a[7], a[7], b[3], b[3], a[3], a[3].
1511///
1512/// \headerfile <x86intrin.h>
1513///
1514/// \code
1515/// __m256 _mm256_shuffle_ps(__m256 a, __m256 b, const int mask);
1516/// \endcode
1517///
1518/// This intrinsic corresponds to the <c> VSHUFPS </c> instruction.
1519///
1520/// \param a
1521///    A 256-bit vector of [8 x float]. The four selected elements in this
1522///    operand are copied to bits [63:0] and bits [191:128] in the destination,
1523///    according to the bits specified in the immediate operand.
1524/// \param b
1525///    A 256-bit vector of [8 x float]. The four selected elements in this
1526///    operand are copied to bits [127:64] and bits [255:192] in the
1527///    destination, according to the bits specified in the immediate operand.
1528/// \param mask
1529///    An immediate value containing an 8-bit value specifying which elements to
1530///    copy from \a a and \a b. \n
1531///    Bits [3:0] specify the values copied from operand \a a. \n
1532///    Bits [7:4] specify the values copied from operand \a b. \n
1533///    The destinations within the 256-bit destination are assigned values as
1534///    follows, according to the bit value assignments described below: \n
1535///    Bits [1:0] are used to assign values to bits [31:0] and [159:128] in the
1536///    destination. \n
1537///    Bits [3:2] are used to assign values to bits [63:32] and [191:160] in the
1538///    destination. \n
1539///    Bits [5:4] are used to assign values to bits [95:64] and [223:192] in the
1540///    destination. \n
1541///    Bits [7:6] are used to assign values to bits [127:96] and [255:224] in
1542///    the destination. \n
1543///    Bit value assignments: \n
1544///    00: Bits [31:0] and [159:128] are copied from the selected operand. \n
1545///    01: Bits [63:32] and [191:160] are copied from the selected operand. \n
1546///    10: Bits [95:64] and [223:192] are copied from the selected operand. \n
1547///    11: Bits [127:96] and [255:224] are copied from the selected operand. \n
1548///    Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
1549///    <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
1550///    <c>[b6, b4, b2, b0]</c>.
1551/// \returns A 256-bit vector of [8 x float] containing the shuffled values.
1552#define _mm256_shuffle_ps(a, b, mask) \
1553  ((__m256)__builtin_ia32_shufps256((__v8sf)(__m256)(a), \
1554                                    (__v8sf)(__m256)(b), (int)(mask)))
1555
1556/// Selects four double-precision values from the two 256-bit operands of
1557///    [4 x double], as specified by the immediate value operand.
1558///
1559///    The selected elements from the first 256-bit operand are copied to bits
1560///    [63:0] and bits [191:128] in the destination, and the selected elements
1561///    from the second 256-bit operand are copied to bits [127:64] and bits
1562///    [255:192] in the destination. For example, if bits [3:0] of the immediate
1563///    operand contain a value of 0xF, the 256-bit destination vector would
1564///    contain the following values: b[3], a[3], b[1], a[1].
1565///
1566/// \headerfile <x86intrin.h>
1567///
1568/// \code
1569/// __m256d _mm256_shuffle_pd(__m256d a, __m256d b, const int mask);
1570/// \endcode
1571///
1572/// This intrinsic corresponds to the <c> VSHUFPD </c> instruction.
1573///
1574/// \param a
1575///    A 256-bit vector of [4 x double].
1576/// \param b
1577///    A 256-bit vector of [4 x double].
1578/// \param mask
1579///    An immediate value whose bits [3:0] specify which elements to
1580///    copy from \a a and \a b: \n
1581///    Bit [0]=0: Bits [63:0] are copied from \a a to bits [63:0] of the
1582///    destination. \n
1583///    Bit [0]=1: Bits [127:64] are copied from \a a to bits [63:0] of the
1584///    destination. \n
1585///    Bit [1]=0: Bits [63:0] are copied from \a b to bits [127:64] of the
1586///    destination. \n
1587///    Bit [1]=1: Bits [127:64] are copied from \a b to bits [127:64] of the
1588///    destination. \n
1589///    Bit [2]=0: Bits [191:128] are copied from \a a to bits [191:128] of the
1590///    destination. \n
1591///    Bit [2]=1: Bits [255:192] are copied from \a a to bits [191:128] of the
1592///    destination. \n
1593///    Bit [3]=0: Bits [191:128] are copied from \a b to bits [255:192] of the
1594///    destination. \n
1595///    Bit [3]=1: Bits [255:192] are copied from \a b to bits [255:192] of the
1596///    destination.
1597/// \returns A 256-bit vector of [4 x double] containing the shuffled values.
1598#define _mm256_shuffle_pd(a, b, mask) \
1599  ((__m256d)__builtin_ia32_shufpd256((__v4df)(__m256d)(a), \
1600                                     (__v4df)(__m256d)(b), (int)(mask)))
1601
1602/* Compare: named predicates 0x08-0x1F, matching the immediate-operand tables below */
1603#define _CMP_EQ_UQ    0x08 /* Equal (unordered, non-signaling)  */
1604#define _CMP_NGE_US   0x09 /* Not-greater-than-or-equal (unordered, signaling)  */
1605#define _CMP_NGT_US   0x0a /* Not-greater-than (unordered, signaling)  */
1606#define _CMP_FALSE_OQ 0x0b /* False (ordered, non-signaling)  */
1607#define _CMP_NEQ_OQ   0x0c /* Not-equal (ordered, non-signaling)  */
1608#define _CMP_GE_OS    0x0d /* Greater-than-or-equal (ordered, signaling)  */
1609#define _CMP_GT_OS    0x0e /* Greater-than (ordered, signaling)  */
1610#define _CMP_TRUE_UQ  0x0f /* True (unordered, non-signaling)  */
1611#define _CMP_EQ_OS    0x10 /* Equal (ordered, signaling)  */
1612#define _CMP_LT_OQ    0x11 /* Less-than (ordered, non-signaling)  */
1613#define _CMP_LE_OQ    0x12 /* Less-than-or-equal (ordered, non-signaling)  */
1614#define _CMP_UNORD_S  0x13 /* Unordered (signaling)  */
1615#define _CMP_NEQ_US   0x14 /* Not-equal (unordered, signaling)  */
1616#define _CMP_NLT_UQ   0x15 /* Not-less-than (unordered, non-signaling)  */
1617#define _CMP_NLE_UQ   0x16 /* Not-less-than-or-equal (unordered, non-signaling)  */
1618#define _CMP_ORD_S    0x17 /* Ordered (signaling)  */
1619#define _CMP_EQ_US    0x18 /* Equal (unordered, signaling)  */
1620#define _CMP_NGE_UQ   0x19 /* Not-greater-than-or-equal (unordered, non-signaling)  */
1621#define _CMP_NGT_UQ   0x1a /* Not-greater-than (unordered, non-signaling)  */
1622#define _CMP_FALSE_OS 0x1b /* False (ordered, signaling)  */
1623#define _CMP_NEQ_OS   0x1c /* Not-equal (ordered, signaling)  */
1624#define _CMP_GE_OQ    0x1d /* Greater-than-or-equal (ordered, non-signaling)  */
1625#define _CMP_GT_OQ    0x1e /* Greater-than (ordered, non-signaling)  */
1626#define _CMP_TRUE_US  0x1f /* True (unordered, signaling)  */
1627
1628/* The intrinsic below is defined in emmintrin.h and can be used for AVX */
1629/// Compares each of the corresponding double-precision values of two
1630///    128-bit vectors of [2 x double], using the operation specified by the
1631///    immediate integer operand.
1632///
1633///    Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
1634///    If either value in a comparison is NaN, comparisons that are ordered
1635///    return false, and comparisons that are unordered return true.
1636///
1637/// \headerfile <x86intrin.h>
1638///
1639/// \code
1640/// __m128d _mm_cmp_pd(__m128d a, __m128d b, const int c);
1641/// \endcode
1642///
1643/// This intrinsic corresponds to the <c> VCMPPD </c> instruction.
1644///
1645/// \param a
1646///    A 128-bit vector of [2 x double].
1647/// \param b
1648///    A 128-bit vector of [2 x double].
1649/// \param c
1650///    An immediate integer operand, with bits [4:0] specifying which comparison
1651///    operation to use: \n
1652///    0x00: Equal (ordered, non-signaling) \n
1653///    0x01: Less-than (ordered, signaling) \n
1654///    0x02: Less-than-or-equal (ordered, signaling) \n
1655///    0x03: Unordered (non-signaling) \n
1656///    0x04: Not-equal (unordered, non-signaling) \n
1657///    0x05: Not-less-than (unordered, signaling) \n
1658///    0x06: Not-less-than-or-equal (unordered, signaling) \n
1659///    0x07: Ordered (non-signaling) \n
1660///    0x08: Equal (unordered, non-signaling) \n
1661///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
1662///    0x0A: Not-greater-than (unordered, signaling) \n
1663///    0x0B: False (ordered, non-signaling) \n
1664///    0x0C: Not-equal (ordered, non-signaling) \n
1665///    0x0D: Greater-than-or-equal (ordered, signaling) \n
1666///    0x0E: Greater-than (ordered, signaling) \n
1667///    0x0F: True (unordered, non-signaling) \n
1668///    0x10: Equal (ordered, signaling) \n
1669///    0x11: Less-than (ordered, non-signaling) \n
1670///    0x12: Less-than-or-equal (ordered, non-signaling) \n
1671///    0x13: Unordered (signaling) \n
1672///    0x14: Not-equal (unordered, signaling) \n
1673///    0x15: Not-less-than (unordered, non-signaling) \n
1674///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
1675///    0x17: Ordered (signaling) \n
1676///    0x18: Equal (unordered, signaling) \n
1677///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
1678///    0x1A: Not-greater-than (unordered, non-signaling) \n
1679///    0x1B: False (ordered, signaling) \n
1680///    0x1C: Not-equal (ordered, signaling) \n
1681///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
1682///    0x1E: Greater-than (ordered, non-signaling) \n
1683///    0x1F: True (unordered, signaling)
1684/// \returns A 128-bit vector of [2 x double] containing the comparison results.
1685/// \fn __m128d _mm_cmp_pd(__m128d a, __m128d b, const int c)
1686
1687/* The intrinsic below is defined in xmmintrin.h and can be used for AVX */
1688/// Compares each of the corresponding values of two 128-bit vectors of
1689///    [4 x float], using the operation specified by the immediate integer
1690///    operand.
1691///
1692///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
1693///    If either value in a comparison is NaN, comparisons that are ordered
1694///    return false, and comparisons that are unordered return true.
1695///
1696/// \headerfile <x86intrin.h>
1697///
1698/// \code
1699/// __m128 _mm_cmp_ps(__m128 a, __m128 b, const int c);
1700/// \endcode
1701///
1702/// This intrinsic corresponds to the <c> VCMPPS </c> instruction.
1703///
1704/// \param a
1705///    A 128-bit vector of [4 x float].
1706/// \param b
1707///    A 128-bit vector of [4 x float].
1708/// \param c
1709///    An immediate integer operand, with bits [4:0] specifying which comparison
1710///    operation to use: \n
1711///    0x00: Equal (ordered, non-signaling) \n
1712///    0x01: Less-than (ordered, signaling) \n
1713///    0x02: Less-than-or-equal (ordered, signaling) \n
1714///    0x03: Unordered (non-signaling) \n
1715///    0x04: Not-equal (unordered, non-signaling) \n
1716///    0x05: Not-less-than (unordered, signaling) \n
1717///    0x06: Not-less-than-or-equal (unordered, signaling) \n
1718///    0x07: Ordered (non-signaling) \n
1719///    0x08: Equal (unordered, non-signaling) \n
1720///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
1721///    0x0A: Not-greater-than (unordered, signaling) \n
1722///    0x0B: False (ordered, non-signaling) \n
1723///    0x0C: Not-equal (ordered, non-signaling) \n
1724///    0x0D: Greater-than-or-equal (ordered, signaling) \n
1725///    0x0E: Greater-than (ordered, signaling) \n
1726///    0x0F: True (unordered, non-signaling) \n
1727///    0x10: Equal (ordered, signaling) \n
1728///    0x11: Less-than (ordered, non-signaling) \n
1729///    0x12: Less-than-or-equal (ordered, non-signaling) \n
1730///    0x13: Unordered (signaling) \n
1731///    0x14: Not-equal (unordered, signaling) \n
1732///    0x15: Not-less-than (unordered, non-signaling) \n
1733///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
1734///    0x17: Ordered (signaling) \n
1735///    0x18: Equal (unordered, signaling) \n
1736///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
1737///    0x1A: Not-greater-than (unordered, non-signaling) \n
1738///    0x1B: False (ordered, signaling) \n
1739///    0x1C: Not-equal (ordered, signaling) \n
1740///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
1741///    0x1E: Greater-than (ordered, non-signaling) \n
1742///    0x1F: True (unordered, signaling)
1743/// \returns A 128-bit vector of [4 x float] containing the comparison results.
1744/// \fn __m128 _mm_cmp_ps(__m128 a, __m128 b, const int c)
1745
1746/// Compares each of the corresponding double-precision values of two
1747///    256-bit vectors of [4 x double], using the operation specified by the
1748///    immediate integer operand.
1749///
1750///    Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
1751///    If either value in a comparison is NaN, comparisons that are ordered
1752///    return false, and comparisons that are unordered return true.
1753///
1754/// \headerfile <x86intrin.h>
1755///
1756/// \code
1757/// __m256d _mm256_cmp_pd(__m256d a, __m256d b, const int c);
1758/// \endcode
1759///
1760/// This intrinsic corresponds to the <c> VCMPPD </c> instruction.
1761///
1762/// \param a
1763///    A 256-bit vector of [4 x double].
1764/// \param b
1765///    A 256-bit vector of [4 x double].
1766/// \param c
1767///    An immediate integer operand, with bits [4:0] specifying which comparison
1768///    operation to use (0x08-0x1F are also named by the \c _CMP_* macros): \n
1769///    0x00: Equal (ordered, non-signaling) \n
1770///    0x01: Less-than (ordered, signaling) \n
1771///    0x02: Less-than-or-equal (ordered, signaling) \n
1772///    0x03: Unordered (non-signaling) \n
1773///    0x04: Not-equal (unordered, non-signaling) \n
1774///    0x05: Not-less-than (unordered, signaling) \n
1775///    0x06: Not-less-than-or-equal (unordered, signaling) \n
1776///    0x07: Ordered (non-signaling) \n
1777///    0x08: Equal (unordered, non-signaling) \n
1778///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
1779///    0x0A: Not-greater-than (unordered, signaling) \n
1780///    0x0B: False (ordered, non-signaling) \n
1781///    0x0C: Not-equal (ordered, non-signaling) \n
1782///    0x0D: Greater-than-or-equal (ordered, signaling) \n
1783///    0x0E: Greater-than (ordered, signaling) \n
1784///    0x0F: True (unordered, non-signaling) \n
1785///    0x10: Equal (ordered, signaling) \n
1786///    0x11: Less-than (ordered, non-signaling) \n
1787///    0x12: Less-than-or-equal (ordered, non-signaling) \n
1788///    0x13: Unordered (signaling) \n
1789///    0x14: Not-equal (unordered, signaling) \n
1790///    0x15: Not-less-than (unordered, non-signaling) \n
1791///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
1792///    0x17: Ordered (signaling) \n
1793///    0x18: Equal (unordered, signaling) \n
1794///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
1795///    0x1A: Not-greater-than (unordered, non-signaling) \n
1796///    0x1B: False (ordered, signaling) \n
1797///    0x1C: Not-equal (ordered, signaling) \n
1798///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
1799///    0x1E: Greater-than (ordered, non-signaling) \n
1800///    0x1F: True (unordered, signaling)
1801/// \returns A 256-bit vector of [4 x double] containing the comparison results.
1802#define _mm256_cmp_pd(a, b, c) \
1803  ((__m256d)__builtin_ia32_cmppd256((__v4df)(__m256d)(a), \
1804                                    (__v4df)(__m256d)(b), (c)))
1805
1806/// Compares each of the corresponding values of two 256-bit vectors of
1807///    [8 x float], using the operation specified by the immediate integer
1808///    operand.
1809///
1810///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
1811///    If either value in a comparison is NaN, comparisons that are ordered
1812///    return false, and comparisons that are unordered return true.
1813///
1814/// \headerfile <x86intrin.h>
1815///
1816/// \code
1817/// __m256 _mm256_cmp_ps(__m256 a, __m256 b, const int c);
1818/// \endcode
1819///
1820/// This intrinsic corresponds to the <c> VCMPPS </c> instruction.
1821///
1822/// \param a
1823///    A 256-bit vector of [8 x float].
1824/// \param b
1825///    A 256-bit vector of [8 x float].
1826/// \param c
1827///    An immediate integer operand, with bits [4:0] specifying which comparison
1828///    operation to use (0x08-0x1F are also named by the \c _CMP_* macros): \n
1829///    0x00: Equal (ordered, non-signaling) \n
1830///    0x01: Less-than (ordered, signaling) \n
1831///    0x02: Less-than-or-equal (ordered, signaling) \n
1832///    0x03: Unordered (non-signaling) \n
1833///    0x04: Not-equal (unordered, non-signaling) \n
1834///    0x05: Not-less-than (unordered, signaling) \n
1835///    0x06: Not-less-than-or-equal (unordered, signaling) \n
1836///    0x07: Ordered (non-signaling) \n
1837///    0x08: Equal (unordered, non-signaling) \n
1838///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
1839///    0x0A: Not-greater-than (unordered, signaling) \n
1840///    0x0B: False (ordered, non-signaling) \n
1841///    0x0C: Not-equal (ordered, non-signaling) \n
1842///    0x0D: Greater-than-or-equal (ordered, signaling) \n
1843///    0x0E: Greater-than (ordered, signaling) \n
1844///    0x0F: True (unordered, non-signaling) \n
1845///    0x10: Equal (ordered, signaling) \n
1846///    0x11: Less-than (ordered, non-signaling) \n
1847///    0x12: Less-than-or-equal (ordered, non-signaling) \n
1848///    0x13: Unordered (signaling) \n
1849///    0x14: Not-equal (unordered, signaling) \n
1850///    0x15: Not-less-than (unordered, non-signaling) \n
1851///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
1852///    0x17: Ordered (signaling) \n
1853///    0x18: Equal (unordered, signaling) \n
1854///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
1855///    0x1A: Not-greater-than (unordered, non-signaling) \n
1856///    0x1B: False (ordered, signaling) \n
1857///    0x1C: Not-equal (ordered, signaling) \n
1858///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
1859///    0x1E: Greater-than (ordered, non-signaling) \n
1860///    0x1F: True (unordered, signaling)
1861/// \returns A 256-bit vector of [8 x float] containing the comparison results.
1862#define _mm256_cmp_ps(a, b, c) \
1863  ((__m256)__builtin_ia32_cmpps256((__v8sf)(__m256)(a), \
1864                                   (__v8sf)(__m256)(b), (c)))
1865
1866/* The intrinsic below is defined in emmintrin.h and can be used for AVX */
1867/// Compares each of the corresponding scalar double-precision values of
1868///    two 128-bit vectors of [2 x double], using the operation specified by the
1869///    immediate integer operand.
1870///
1871///    Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
1872///    If either value in a comparison is NaN, comparisons that are ordered
1873///    return false, and comparisons that are unordered return true.
1874///
1875/// \headerfile <x86intrin.h>
1876///
1877/// \code
1878/// __m128d _mm_cmp_sd(__m128d a, __m128d b, const int c);
1879/// \endcode
1880///
1881/// This intrinsic corresponds to the <c> VCMPSD </c> instruction.
1882///
1883/// \param a
1884///    A 128-bit vector of [2 x double].
1885/// \param b
1886///    A 128-bit vector of [2 x double].
1887/// \param c
1888///    An immediate integer operand, with bits [4:0] specifying which comparison
1889///    operation to use: \n
1890///    0x00: Equal (ordered, non-signaling) \n
1891///    0x01: Less-than (ordered, signaling) \n
1892///    0x02: Less-than-or-equal (ordered, signaling) \n
1893///    0x03: Unordered (non-signaling) \n
1894///    0x04: Not-equal (unordered, non-signaling) \n
1895///    0x05: Not-less-than (unordered, signaling) \n
1896///    0x06: Not-less-than-or-equal (unordered, signaling) \n
1897///    0x07: Ordered (non-signaling) \n
1898///    0x08: Equal (unordered, non-signaling) \n
1899///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
1900///    0x0A: Not-greater-than (unordered, signaling) \n
1901///    0x0B: False (ordered, non-signaling) \n
1902///    0x0C: Not-equal (ordered, non-signaling) \n
1903///    0x0D: Greater-than-or-equal (ordered, signaling) \n
1904///    0x0E: Greater-than (ordered, signaling) \n
1905///    0x0F: True (unordered, non-signaling) \n
1906///    0x10: Equal (ordered, signaling) \n
1907///    0x11: Less-than (ordered, non-signaling) \n
1908///    0x12: Less-than-or-equal (ordered, non-signaling) \n
1909///    0x13: Unordered (signaling) \n
1910///    0x14: Not-equal (unordered, signaling) \n
1911///    0x15: Not-less-than (unordered, non-signaling) \n
1912///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
1913///    0x17: Ordered (signaling) \n
1914///    0x18: Equal (unordered, signaling) \n
1915///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
1916///    0x1A: Not-greater-than (unordered, non-signaling) \n
1917///    0x1B: False (ordered, signaling) \n
1918///    0x1C: Not-equal (ordered, signaling) \n
1919///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
1920///    0x1E: Greater-than (ordered, non-signaling) \n
1921///    0x1F: True (unordered, signaling)
1922/// \returns A 128-bit vector of [2 x double] containing the comparison results.
1923/// \fn __m128d _mm_cmp_sd(__m128d a, __m128d b, const int c)
1924
1925/* The intrinsic below is defined in xmmintrin.h and can be used for AVX */
1926/// Compares each of the corresponding scalar values of two 128-bit
1927///    vectors of [4 x float], using the operation specified by the immediate
1928///    integer operand.
1929///
1930///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
1931///    If either value in a comparison is NaN, comparisons that are ordered
1932///    return false, and comparisons that are unordered return true.
1933///
1934/// \headerfile <x86intrin.h>
1935///
1936/// \code
1937/// __m128 _mm_cmp_ss(__m128 a, __m128 b, const int c);
1938/// \endcode
1939///
1940/// This intrinsic corresponds to the <c> VCMPSS </c> instruction.
1941///
1942/// \param a
1943///    A 128-bit vector of [4 x float].
1944/// \param b
1945///    A 128-bit vector of [4 x float].
1946/// \param c
1947///    An immediate integer operand, with bits [4:0] specifying which comparison
1948///    operation to use: \n
1949///    0x00: Equal (ordered, non-signaling) \n
1950///    0x01: Less-than (ordered, signaling) \n
1951///    0x02: Less-than-or-equal (ordered, signaling) \n
1952///    0x03: Unordered (non-signaling) \n
1953///    0x04: Not-equal (unordered, non-signaling) \n
1954///    0x05: Not-less-than (unordered, signaling) \n
1955///    0x06: Not-less-than-or-equal (unordered, signaling) \n
1956///    0x07: Ordered (non-signaling) \n
1957///    0x08: Equal (unordered, non-signaling) \n
1958///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
1959///    0x0A: Not-greater-than (unordered, signaling) \n
1960///    0x0B: False (ordered, non-signaling) \n
1961///    0x0C: Not-equal (ordered, non-signaling) \n
1962///    0x0D: Greater-than-or-equal (ordered, signaling) \n
1963///    0x0E: Greater-than (ordered, signaling) \n
1964///    0x0F: True (unordered, non-signaling) \n
1965///    0x10: Equal (ordered, signaling) \n
1966///    0x11: Less-than (ordered, non-signaling) \n
1967///    0x12: Less-than-or-equal (ordered, non-signaling) \n
1968///    0x13: Unordered (signaling) \n
1969///    0x14: Not-equal (unordered, signaling) \n
1970///    0x15: Not-less-than (unordered, non-signaling) \n
1971///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
1972///    0x17: Ordered (signaling) \n
1973///    0x18: Equal (unordered, signaling) \n
1974///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
1975///    0x1A: Not-greater-than (unordered, non-signaling) \n
1976///    0x1B: False (ordered, signaling) \n
1977///    0x1C: Not-equal (ordered, signaling) \n
1978///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
1979///    0x1E: Greater-than (ordered, non-signaling) \n
1980///    0x1F: True (unordered, signaling)
1981/// \returns A 128-bit vector of [4 x float] containing the comparison results.
1982/// \fn __m128 _mm_cmp_ss(__m128 a, __m128 b, const int c)
1983
1984/// Takes a 256-bit vector of [8 x i32] and returns the vector element value
1985///    indexed by the immediate constant operand.
1986///
1987/// \headerfile <x86intrin.h>
1988///
1989/// \code
1990/// int _mm256_extract_epi32(__m256i X, const int N);
1991/// \endcode
1992///
1993/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
1994///   instruction.
1995///
1996/// \param X
1997///    A 256-bit vector of [8 x i32].
1998/// \param N
1999///    An immediate integer operand with bits [2:0] determining which vector
2000///    element is extracted and returned.
2001/// \returns A 32-bit integer containing the value of the selected 32-bit
2002///    element.
2003#define _mm256_extract_epi32(X, N) \
2004  ((int)__builtin_ia32_vec_ext_v8si((__v8si)(__m256i)(X), (int)(N)))
2005
2006/// Takes a 256-bit vector of [16 x i16] and returns the vector element value
2007///    indexed by the immediate constant operand.
2008///
2009/// \headerfile <x86intrin.h>
2010///
2011/// \code
2012/// int _mm256_extract_epi16(__m256i X, const int N);
2013/// \endcode
2014///
2015/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
2016///   instruction.
2017///
2018/// \param X
2019///    A 256-bit integer vector of [16 x i16].
2020/// \param N
2021///    An immediate integer operand with bits [3:0] determining which vector
2022///    element is extracted and returned.
2023/// \returns A 32-bit integer containing the selected 16-bit element, zero
2024///    extended to 32 bits.
2025#define _mm256_extract_epi16(X, N) \
2026  ((int)(unsigned short)__builtin_ia32_vec_ext_v16hi((__v16hi)(__m256i)(X), \
2027                                                     (int)(N)))
2028
2029/// Takes a 256-bit vector of [32 x i8] and returns the vector element value
2030///    indexed by the immediate constant operand.
2031///
2032/// \headerfile <x86intrin.h>
2033///
2034/// \code
2035/// int _mm256_extract_epi8(__m256i X, const int N);
2036/// \endcode
2037///
2038/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
2039///   instruction.
2040///
2041/// \param X
2042///    A 256-bit integer vector of [32 x i8].
2043/// \param N
2044///    An immediate integer operand with bits [4:0] determining which vector
2045///    element is extracted and returned.
2046/// \returns A 32-bit integer containing the selected 8-bit element, zero
2047///    extended to 32 bits.
2048#define _mm256_extract_epi8(X, N) \
2049  ((int)(unsigned char)__builtin_ia32_vec_ext_v32qi((__v32qi)(__m256i)(X), \
2050                                                    (int)(N)))
2051
2052#ifdef __x86_64__
2053/// Takes a 256-bit vector of [4 x i64] and returns the vector element value
2054///    indexed by the immediate constant operand.
2055///
2056/// \headerfile <x86intrin.h>
2057///
2058/// \code
2059/// long long _mm256_extract_epi64(__m256i X, const int N);
2060/// \endcode
2061///
2062/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
2063///   instruction.
2064///
2065/// \param X
2066///    A 256-bit integer vector of [4 x i64].
2067/// \param N
2068///    An immediate integer operand with bits [1:0] determining which vector
2069///    element is extracted and returned.
2070/// \returns A 64-bit integer containing the value of the selected 64-bit
2071///    element.
2072#define _mm256_extract_epi64(X, N) \
2073  ((long long)__builtin_ia32_vec_ext_v4di((__v4di)(__m256i)(X), (int)(N)))
2074#endif /* __x86_64__ */
2075
2076/// Takes a 256-bit vector of [8 x i32] and replaces the vector element value
2077///    indexed by the immediate constant operand with a new value. Returns the
2078///    modified vector.
2079///
2080/// \headerfile <x86intrin.h>
2081///
2082/// \code
2083/// __m256i _mm256_insert_epi32(__m256i X, int I, const int N);
2084/// \endcode
2085///
2086/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
2087///   instruction.
2088///
2089/// \param X
2090///    A 256-bit integer vector of [8 x i32] to be used by the insert operation.
2091/// \param I
2092///    An integer value. The replacement value for the insert operation.
2093/// \param N
2094///    An immediate integer specifying the index of the vector element to be
2095///    replaced.
2096/// \returns A copy of vector \a X, after replacing its element indexed by
2097///    \a N with \a I.
2098#define _mm256_insert_epi32(X, I, N) \
2099  ((__m256i)__builtin_ia32_vec_set_v8si((__v8si)(__m256i)(X), \
2100                                        (int)(I), (int)(N)))
2101
2102
2103/// Takes a 256-bit vector of [16 x i16] and replaces the vector element value
2104///    indexed by the immediate constant operand with a new value. Returns the
2105///    modified vector.
2106///
2107/// \headerfile <x86intrin.h>
2108///
2109/// \code
2110/// __m256i _mm256_insert_epi16(__m256i X, int I, const int N);
2111/// \endcode
2112///
2113/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
2114///   instruction.
2115///
2116/// \param X
2117///    A 256-bit integer vector of [16 x i16] to be used by the insert operation.
2118/// \param I
2119///    An i16 integer value. The replacement value for the insert operation.
2120/// \param N
2121///    An immediate integer specifying the index of the vector element to be
2122///    replaced.
2123/// \returns A copy of vector \a X, after replacing its element indexed by
2124///    \a N with \a I.
2125#define _mm256_insert_epi16(X, I, N) \
2126  ((__m256i)__builtin_ia32_vec_set_v16hi((__v16hi)(__m256i)(X), \
2127                                         (int)(I), (int)(N)))
2128
2129/// Takes a 256-bit vector of [32 x i8] and replaces the vector element value
2130///    indexed by the immediate constant operand with a new value. Returns the
2131///    modified vector.
2132///
2133/// \headerfile <x86intrin.h>
2134///
2135/// \code
2136/// __m256i _mm256_insert_epi8(__m256i X, int I, const int N);
2137/// \endcode
2138///
2139/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
2140///   instruction.
2141///
2142/// \param X
2143///    A 256-bit integer vector of [32 x i8] to be used by the insert operation.
2144/// \param I
2145///    An i8 integer value. The replacement value for the insert operation.
2146/// \param N
2147///    An immediate integer specifying the index of the vector element to be
2148///    replaced.
2149/// \returns A copy of vector \a X, after replacing its element indexed by
2150///    \a N with \a I.
2151#define _mm256_insert_epi8(X, I, N) \
2152  ((__m256i)__builtin_ia32_vec_set_v32qi((__v32qi)(__m256i)(X), \
2153                                         (int)(I), (int)(N)))
2154
#ifdef __x86_64__
/// Takes a [4 x i64] vector and replaces the vector element value
///    indexed by the immediate constant operand with a new value. Returns the
///    modified vector.
///
///    This intrinsic is only defined when compiling for x86-64.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_insert_epi64(__m256i X, long long I, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
///   instruction.
///
/// \param X
///    A vector of [4 x i64] to be used by the insert operation.
/// \param I
///    A 64-bit integer value. The replacement value for the insert operation.
/// \param N
///    An immediate integer specifying the index of the vector element to be
///    replaced.
/// \returns A copy of vector \a X, after replacing its element indexed by
///     \a N with \a I.
#define _mm256_insert_epi64(X, I, N) \
  ((__m256i)__builtin_ia32_vec_set_v4di((__v4di)(__m256i)(X), \
                                        (long long)(I), (int)(N)))
#endif
2182
2183/* Conversion */
2184/// Converts a vector of [4 x i32] into a vector of [4 x double].
2185///
2186/// \headerfile <x86intrin.h>
2187///
2188/// This intrinsic corresponds to the <c> VCVTDQ2PD </c> instruction.
2189///
2190/// \param __a
2191///    A 128-bit integer vector of [4 x i32].
2192/// \returns A 256-bit vector of [4 x double] containing the converted values.
2193static __inline __m256d __DEFAULT_FN_ATTRS
2194_mm256_cvtepi32_pd(__m128i __a)
2195{
2196  return (__m256d)__builtin_convertvector((__v4si)__a, __v4df);
2197}
2198
2199/// Converts a vector of [8 x i32] into a vector of [8 x float].
2200///
2201/// \headerfile <x86intrin.h>
2202///
2203/// This intrinsic corresponds to the <c> VCVTDQ2PS </c> instruction.
2204///
2205/// \param __a
2206///    A 256-bit integer vector.
2207/// \returns A 256-bit vector of [8 x float] containing the converted values.
2208static __inline __m256 __DEFAULT_FN_ATTRS
2209_mm256_cvtepi32_ps(__m256i __a)
2210{
2211  return (__m256)__builtin_convertvector((__v8si)__a, __v8sf);
2212}
2213
2214/// Converts a 256-bit vector of [4 x double] into a 128-bit vector of
2215///    [4 x float].
2216///
2217/// \headerfile <x86intrin.h>
2218///
2219/// This intrinsic corresponds to the <c> VCVTPD2PS </c> instruction.
2220///
2221/// \param __a
2222///    A 256-bit vector of [4 x double].
2223/// \returns A 128-bit vector of [4 x float] containing the converted values.
2224static __inline __m128 __DEFAULT_FN_ATTRS
2225_mm256_cvtpd_ps(__m256d __a)
2226{
2227  return (__m128)__builtin_ia32_cvtpd2ps256((__v4df) __a);
2228}
2229
2230/// Converts a vector of [8 x float] into a vector of [8 x i32].
2231///
2232///    If a converted value does not fit in a 32-bit integer, raises a
2233///    floating-point invalid exception. If the exception is masked, returns
2234///    the most negative integer.
2235///
2236/// \headerfile <x86intrin.h>
2237///
2238/// This intrinsic corresponds to the <c> VCVTPS2DQ </c> instruction.
2239///
2240/// \param __a
2241///    A 256-bit vector of [8 x float].
2242/// \returns A 256-bit integer vector containing the converted values.
2243static __inline __m256i __DEFAULT_FN_ATTRS
2244_mm256_cvtps_epi32(__m256 __a)
2245{
2246  return (__m256i)__builtin_ia32_cvtps2dq256((__v8sf) __a);
2247}
2248
2249/// Converts a 128-bit vector of [4 x float] into a 256-bit vector of [4
2250///    x double].
2251///
2252/// \headerfile <x86intrin.h>
2253///
2254/// This intrinsic corresponds to the <c> VCVTPS2PD </c> instruction.
2255///
2256/// \param __a
2257///    A 128-bit vector of [4 x float].
2258/// \returns A 256-bit vector of [4 x double] containing the converted values.
2259static __inline __m256d __DEFAULT_FN_ATTRS
2260_mm256_cvtps_pd(__m128 __a)
2261{
2262  return (__m256d)__builtin_convertvector((__v4sf)__a, __v4df);
2263}
2264
2265/// Converts a 256-bit vector of [4 x double] into four signed truncated
2266///    (rounded toward zero) 32-bit integers returned in a 128-bit vector of
2267///    [4 x i32].
2268///
2269///    If a converted value does not fit in a 32-bit integer, raises a
2270///    floating-point invalid exception. If the exception is masked, returns
2271///    the most negative integer.
2272///
2273/// \headerfile <x86intrin.h>
2274///
2275/// This intrinsic corresponds to the <c> VCVTTPD2DQ </c> instruction.
2276///
2277/// \param __a
2278///    A 256-bit vector of [4 x double].
2279/// \returns A 128-bit integer vector containing the converted values.
2280static __inline __m128i __DEFAULT_FN_ATTRS
2281_mm256_cvttpd_epi32(__m256d __a)
2282{
2283  return (__m128i)__builtin_ia32_cvttpd2dq256((__v4df) __a);
2284}
2285
2286/// Converts a 256-bit vector of [4 x double] into a 128-bit vector of
2287///    [4 x i32].
2288///
2289///    If a converted value does not fit in a 32-bit integer, raises a
2290///    floating-point invalid exception. If the exception is masked, returns
2291///    the most negative integer.
2292///
2293/// \headerfile <x86intrin.h>
2294///
2295/// This intrinsic corresponds to the <c> VCVTPD2DQ </c> instruction.
2296///
2297/// \param __a
2298///    A 256-bit vector of [4 x double].
2299/// \returns A 128-bit integer vector containing the converted values.
2300static __inline __m128i __DEFAULT_FN_ATTRS
2301_mm256_cvtpd_epi32(__m256d __a)
2302{
2303  return (__m128i)__builtin_ia32_cvtpd2dq256((__v4df) __a);
2304}
2305
2306/// Converts a vector of [8 x float] into eight signed truncated (rounded
2307///    toward zero) 32-bit integers returned in a vector of [8 x i32].
2308///
2309///    If a converted value does not fit in a 32-bit integer, raises a
2310///    floating-point invalid exception. If the exception is masked, returns
2311///    the most negative integer.
2312///
2313/// \headerfile <x86intrin.h>
2314///
2315/// This intrinsic corresponds to the <c> VCVTTPS2DQ </c> instruction.
2316///
2317/// \param __a
2318///    A 256-bit vector of [8 x float].
2319/// \returns A 256-bit integer vector containing the converted values.
2320static __inline __m256i __DEFAULT_FN_ATTRS
2321_mm256_cvttps_epi32(__m256 __a)
2322{
2323  return (__m256i)__builtin_ia32_cvttps2dq256((__v8sf) __a);
2324}
2325
2326/// Returns the first element of the input vector of [4 x double].
2327///
2328/// \headerfile <x86intrin.h>
2329///
2330/// This intrinsic is a utility function and does not correspond to a specific
2331///    instruction.
2332///
2333/// \param __a
2334///    A 256-bit vector of [4 x double].
2335/// \returns A 64 bit double containing the first element of the input vector.
2336static __inline double __DEFAULT_FN_ATTRS
2337_mm256_cvtsd_f64(__m256d __a)
2338{
2339 return __a[0];
2340}
2341
2342/// Returns the first element of the input vector of [8 x i32].
2343///
2344/// \headerfile <x86intrin.h>
2345///
2346/// This intrinsic is a utility function and does not correspond to a specific
2347///    instruction.
2348///
2349/// \param __a
2350///    A 256-bit vector of [8 x i32].
2351/// \returns A 32 bit integer containing the first element of the input vector.
2352static __inline int __DEFAULT_FN_ATTRS
2353_mm256_cvtsi256_si32(__m256i __a)
2354{
2355 __v8si __b = (__v8si)__a;
2356 return __b[0];
2357}
2358
2359/// Returns the first element of the input vector of [8 x float].
2360///
2361/// \headerfile <x86intrin.h>
2362///
2363/// This intrinsic is a utility function and does not correspond to a specific
2364///    instruction.
2365///
2366/// \param __a
2367///    A 256-bit vector of [8 x float].
2368/// \returns A 32 bit float containing the first element of the input vector.
2369static __inline float __DEFAULT_FN_ATTRS
2370_mm256_cvtss_f32(__m256 __a)
2371{
2372 return __a[0];
2373}
2374
2375/* Vector replicate */
2376/// Moves and duplicates odd-indexed values from a 256-bit vector of
2377///    [8 x float] to float values in a 256-bit vector of [8 x float].
2378///
2379/// \headerfile <x86intrin.h>
2380///
2381/// This intrinsic corresponds to the <c> VMOVSHDUP </c> instruction.
2382///
2383/// \param __a
2384///    A 256-bit vector of [8 x float]. \n
2385///    Bits [255:224] of \a __a are written to bits [255:224] and [223:192] of
2386///    the return value. \n
2387///    Bits [191:160] of \a __a are written to bits [191:160] and [159:128] of
2388///    the return value. \n
2389///    Bits [127:96] of \a __a are written to bits [127:96] and [95:64] of the
2390///    return value. \n
2391///    Bits [63:32] of \a __a are written to bits [63:32] and [31:0] of the
2392///    return value.
2393/// \returns A 256-bit vector of [8 x float] containing the moved and duplicated
2394///    values.
2395static __inline __m256 __DEFAULT_FN_ATTRS
2396_mm256_movehdup_ps(__m256 __a)
2397{
2398  return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 1, 1, 3, 3, 5, 5, 7, 7);
2399}
2400
2401/// Moves and duplicates even-indexed values from a 256-bit vector of
2402///    [8 x float] to float values in a 256-bit vector of [8 x float].
2403///
2404/// \headerfile <x86intrin.h>
2405///
2406/// This intrinsic corresponds to the <c> VMOVSLDUP </c> instruction.
2407///
2408/// \param __a
2409///    A 256-bit vector of [8 x float]. \n
2410///    Bits [223:192] of \a __a are written to bits [255:224] and [223:192] of
2411///    the return value. \n
2412///    Bits [159:128] of \a __a are written to bits [191:160] and [159:128] of
2413///    the return value. \n
2414///    Bits [95:64] of \a __a are written to bits [127:96] and [95:64] of the
2415///    return value. \n
2416///    Bits [31:0] of \a __a are written to bits [63:32] and [31:0] of the
2417///    return value.
2418/// \returns A 256-bit vector of [8 x float] containing the moved and duplicated
2419///    values.
2420static __inline __m256 __DEFAULT_FN_ATTRS
2421_mm256_moveldup_ps(__m256 __a)
2422{
2423  return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 0, 2, 2, 4, 4, 6, 6);
2424}
2425
2426/// Moves and duplicates double-precision floating point values from a
2427///    256-bit vector of [4 x double] to double-precision values in a 256-bit
2428///    vector of [4 x double].
2429///
2430/// \headerfile <x86intrin.h>
2431///
2432/// This intrinsic corresponds to the <c> VMOVDDUP </c> instruction.
2433///
2434/// \param __a
2435///    A 256-bit vector of [4 x double]. \n
2436///    Bits [63:0] of \a __a are written to bits [127:64] and [63:0] of the
2437///    return value. \n
2438///    Bits [191:128] of \a __a are written to bits [255:192] and [191:128] of
2439///    the return value.
2440/// \returns A 256-bit vector of [4 x double] containing the moved and
2441///    duplicated values.
2442static __inline __m256d __DEFAULT_FN_ATTRS
2443_mm256_movedup_pd(__m256d __a)
2444{
2445  return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 0, 2, 2);
2446}
2447
2448/* Unpack and Interleave */
2449/// Unpacks the odd-indexed vector elements from two 256-bit vectors of
2450///    [4 x double] and interleaves them into a 256-bit vector of [4 x double].
2451///
2452/// \headerfile <x86intrin.h>
2453///
2454/// This intrinsic corresponds to the <c> VUNPCKHPD </c> instruction.
2455///
2456/// \param __a
2457///    A 256-bit floating-point vector of [4 x double]. \n
2458///    Bits [127:64] are written to bits [63:0] of the return value. \n
2459///    Bits [255:192] are written to bits [191:128] of the return value. \n
2460/// \param __b
2461///    A 256-bit floating-point vector of [4 x double]. \n
2462///    Bits [127:64] are written to bits [127:64] of the return value. \n
2463///    Bits [255:192] are written to bits [255:192] of the return value. \n
2464/// \returns A 256-bit vector of [4 x double] containing the interleaved values.
2465static __inline __m256d __DEFAULT_FN_ATTRS
2466_mm256_unpackhi_pd(__m256d __a, __m256d __b)
2467{
2468  return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 1, 5, 1+2, 5+2);
2469}
2470
2471/// Unpacks the even-indexed vector elements from two 256-bit vectors of
2472///    [4 x double] and interleaves them into a 256-bit vector of [4 x double].
2473///
2474/// \headerfile <x86intrin.h>
2475///
2476/// This intrinsic corresponds to the <c> VUNPCKLPD </c> instruction.
2477///
2478/// \param __a
2479///    A 256-bit floating-point vector of [4 x double]. \n
2480///    Bits [63:0] are written to bits [63:0] of the return value. \n
2481///    Bits [191:128] are written to bits [191:128] of the return value.
2482/// \param __b
2483///    A 256-bit floating-point vector of [4 x double]. \n
2484///    Bits [63:0] are written to bits [127:64] of the return value. \n
2485///    Bits [191:128] are written to bits [255:192] of the return value. \n
2486/// \returns A 256-bit vector of [4 x double] containing the interleaved values.
2487static __inline __m256d __DEFAULT_FN_ATTRS
2488_mm256_unpacklo_pd(__m256d __a, __m256d __b)
2489{
2490  return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 0, 4, 0+2, 4+2);
2491}
2492
2493/// Unpacks the 32-bit vector elements 2, 3, 6 and 7 from each of the
2494///    two 256-bit vectors of [8 x float] and interleaves them into a 256-bit
2495///    vector of [8 x float].
2496///
2497/// \headerfile <x86intrin.h>
2498///
2499/// This intrinsic corresponds to the <c> VUNPCKHPS </c> instruction.
2500///
2501/// \param __a
2502///    A 256-bit vector of [8 x float]. \n
2503///    Bits [95:64] are written to bits [31:0] of the return value. \n
2504///    Bits [127:96] are written to bits [95:64] of the return value. \n
2505///    Bits [223:192] are written to bits [159:128] of the return value. \n
2506///    Bits [255:224] are written to bits [223:192] of the return value.
2507/// \param __b
2508///    A 256-bit vector of [8 x float]. \n
2509///    Bits [95:64] are written to bits [63:32] of the return value. \n
2510///    Bits [127:96] are written to bits [127:96] of the return value. \n
2511///    Bits [223:192] are written to bits [191:160] of the return value. \n
2512///    Bits [255:224] are written to bits [255:224] of the return value.
2513/// \returns A 256-bit vector of [8 x float] containing the interleaved values.
2514static __inline __m256 __DEFAULT_FN_ATTRS
2515_mm256_unpackhi_ps(__m256 __a, __m256 __b)
2516{
2517  return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 2, 10, 2+1, 10+1, 6, 14, 6+1, 14+1);
2518}
2519
2520/// Unpacks the 32-bit vector elements 0, 1, 4 and 5 from each of the
2521///    two 256-bit vectors of [8 x float] and interleaves them into a 256-bit
2522///    vector of [8 x float].
2523///
2524/// \headerfile <x86intrin.h>
2525///
2526/// This intrinsic corresponds to the <c> VUNPCKLPS </c> instruction.
2527///
2528/// \param __a
2529///    A 256-bit vector of [8 x float]. \n
2530///    Bits [31:0] are written to bits [31:0] of the return value. \n
2531///    Bits [63:32] are written to bits [95:64] of the return value. \n
2532///    Bits [159:128] are written to bits [159:128] of the return value. \n
2533///    Bits [191:160] are written to bits [223:192] of the return value.
2534/// \param __b
2535///    A 256-bit vector of [8 x float]. \n
2536///    Bits [31:0] are written to bits [63:32] of the return value. \n
2537///    Bits [63:32] are written to bits [127:96] of the return value. \n
2538///    Bits [159:128] are written to bits [191:160] of the return value. \n
2539///    Bits [191:160] are written to bits [255:224] of the return value.
2540/// \returns A 256-bit vector of [8 x float] containing the interleaved values.
2541static __inline __m256 __DEFAULT_FN_ATTRS
2542_mm256_unpacklo_ps(__m256 __a, __m256 __b)
2543{
2544  return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 0, 8, 0+1, 8+1, 4, 12, 4+1, 12+1);
2545}
2546
2547/* Bit Test */
2548/// Given two 128-bit floating-point vectors of [2 x double], perform an
2549///    element-by-element comparison of the double-precision element in the
2550///    first source vector and the corresponding element in the second source
2551///    vector.
2552///
2553///    The EFLAGS register is updated as follows: \n
2554///    If there is at least one pair of double-precision elements where the
2555///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2556///    ZF flag is set to 1. \n
2557///    If there is at least one pair of double-precision elements where the
2558///    sign-bit of the first element is 0 and the sign-bit of the second element
2559///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2560///    This intrinsic returns the value of the ZF flag.
2561///
2562/// \headerfile <x86intrin.h>
2563///
2564/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2565///
2566/// \param __a
2567///    A 128-bit vector of [2 x double].
2568/// \param __b
2569///    A 128-bit vector of [2 x double].
2570/// \returns the ZF flag in the EFLAGS register.
2571static __inline int __DEFAULT_FN_ATTRS128
2572_mm_testz_pd(__m128d __a, __m128d __b)
2573{
2574  return __builtin_ia32_vtestzpd((__v2df)__a, (__v2df)__b);
2575}
2576
2577/// Given two 128-bit floating-point vectors of [2 x double], perform an
2578///    element-by-element comparison of the double-precision element in the
2579///    first source vector and the corresponding element in the second source
2580///    vector.
2581///
2582///    The EFLAGS register is updated as follows: \n
2583///    If there is at least one pair of double-precision elements where the
2584///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2585///    ZF flag is set to 1. \n
2586///    If there is at least one pair of double-precision elements where the
2587///    sign-bit of the first element is 0 and the sign-bit of the second element
2588///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2589///    This intrinsic returns the value of the CF flag.
2590///
2591/// \headerfile <x86intrin.h>
2592///
2593/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2594///
2595/// \param __a
2596///    A 128-bit vector of [2 x double].
2597/// \param __b
2598///    A 128-bit vector of [2 x double].
2599/// \returns the CF flag in the EFLAGS register.
2600static __inline int __DEFAULT_FN_ATTRS128
2601_mm_testc_pd(__m128d __a, __m128d __b)
2602{
2603  return __builtin_ia32_vtestcpd((__v2df)__a, (__v2df)__b);
2604}
2605
2606/// Given two 128-bit floating-point vectors of [2 x double], perform an
2607///    element-by-element comparison of the double-precision element in the
2608///    first source vector and the corresponding element in the second source
2609///    vector.
2610///
2611///    The EFLAGS register is updated as follows: \n
2612///    If there is at least one pair of double-precision elements where the
2613///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2614///    ZF flag is set to 1. \n
2615///    If there is at least one pair of double-precision elements where the
2616///    sign-bit of the first element is 0 and the sign-bit of the second element
2617///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2618///    This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2619///    otherwise it returns 0.
2620///
2621/// \headerfile <x86intrin.h>
2622///
2623/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2624///
2625/// \param __a
2626///    A 128-bit vector of [2 x double].
2627/// \param __b
2628///    A 128-bit vector of [2 x double].
2629/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
2630static __inline int __DEFAULT_FN_ATTRS128
2631_mm_testnzc_pd(__m128d __a, __m128d __b)
2632{
2633  return __builtin_ia32_vtestnzcpd((__v2df)__a, (__v2df)__b);
2634}
2635
2636/// Given two 128-bit floating-point vectors of [4 x float], perform an
2637///    element-by-element comparison of the single-precision element in the
2638///    first source vector and the corresponding element in the second source
2639///    vector.
2640///
2641///    The EFLAGS register is updated as follows: \n
2642///    If there is at least one pair of single-precision elements where the
2643///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2644///    ZF flag is set to 1. \n
2645///    If there is at least one pair of single-precision elements where the
2646///    sign-bit of the first element is 0 and the sign-bit of the second element
2647///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2648///    This intrinsic returns the value of the ZF flag.
2649///
2650/// \headerfile <x86intrin.h>
2651///
2652/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2653///
2654/// \param __a
2655///    A 128-bit vector of [4 x float].
2656/// \param __b
2657///    A 128-bit vector of [4 x float].
2658/// \returns the ZF flag.
2659static __inline int __DEFAULT_FN_ATTRS128
2660_mm_testz_ps(__m128 __a, __m128 __b)
2661{
2662  return __builtin_ia32_vtestzps((__v4sf)__a, (__v4sf)__b);
2663}
2664
2665/// Given two 128-bit floating-point vectors of [4 x float], perform an
2666///    element-by-element comparison of the single-precision element in the
2667///    first source vector and the corresponding element in the second source
2668///    vector.
2669///
2670///    The EFLAGS register is updated as follows: \n
2671///    If there is at least one pair of single-precision elements where the
2672///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2673///    ZF flag is set to 1. \n
2674///    If there is at least one pair of single-precision elements where the
2675///    sign-bit of the first element is 0 and the sign-bit of the second element
2676///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2677///    This intrinsic returns the value of the CF flag.
2678///
2679/// \headerfile <x86intrin.h>
2680///
2681/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2682///
2683/// \param __a
2684///    A 128-bit vector of [4 x float].
2685/// \param __b
2686///    A 128-bit vector of [4 x float].
2687/// \returns the CF flag.
2688static __inline int __DEFAULT_FN_ATTRS128
2689_mm_testc_ps(__m128 __a, __m128 __b)
2690{
2691  return __builtin_ia32_vtestcps((__v4sf)__a, (__v4sf)__b);
2692}
2693
2694/// Given two 128-bit floating-point vectors of [4 x float], perform an
2695///    element-by-element comparison of the single-precision element in the
2696///    first source vector and the corresponding element in the second source
2697///    vector.
2698///
2699///    The EFLAGS register is updated as follows: \n
2700///    If there is at least one pair of single-precision elements where the
2701///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2702///    ZF flag is set to 1. \n
2703///    If there is at least one pair of single-precision elements where the
2704///    sign-bit of the first element is 0 and the sign-bit of the second element
2705///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2706///    This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2707///    otherwise it returns 0.
2708///
2709/// \headerfile <x86intrin.h>
2710///
2711/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2712///
2713/// \param __a
2714///    A 128-bit vector of [4 x float].
2715/// \param __b
2716///    A 128-bit vector of [4 x float].
2717/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
2718static __inline int __DEFAULT_FN_ATTRS128
2719_mm_testnzc_ps(__m128 __a, __m128 __b)
2720{
2721  return __builtin_ia32_vtestnzcps((__v4sf)__a, (__v4sf)__b);
2722}
2723
2724/// Given two 256-bit floating-point vectors of [4 x double], perform an
2725///    element-by-element comparison of the double-precision elements in the
2726///    first source vector and the corresponding elements in the second source
2727///    vector.
2728///
2729///    The EFLAGS register is updated as follows: \n
2730///    If there is at least one pair of double-precision elements where the
2731///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2732///    ZF flag is set to 1. \n
2733///    If there is at least one pair of double-precision elements where the
2734///    sign-bit of the first element is 0 and the sign-bit of the second element
2735///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2736///    This intrinsic returns the value of the ZF flag.
2737///
2738/// \headerfile <x86intrin.h>
2739///
2740/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2741///
2742/// \param __a
2743///    A 256-bit vector of [4 x double].
2744/// \param __b
2745///    A 256-bit vector of [4 x double].
2746/// \returns the ZF flag.
2747static __inline int __DEFAULT_FN_ATTRS
2748_mm256_testz_pd(__m256d __a, __m256d __b)
2749{
2750  return __builtin_ia32_vtestzpd256((__v4df)__a, (__v4df)__b);
2751}
2752
2753/// Given two 256-bit floating-point vectors of [4 x double], perform an
2754///    element-by-element comparison of the double-precision elements in the
2755///    first source vector and the corresponding elements in the second source
2756///    vector.
2757///
2758///    The EFLAGS register is updated as follows: \n
2759///    If there is at least one pair of double-precision elements where the
2760///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2761///    ZF flag is set to 1. \n
2762///    If there is at least one pair of double-precision elements where the
2763///    sign-bit of the first element is 0 and the sign-bit of the second element
2764///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2765///    This intrinsic returns the value of the CF flag.
2766///
2767/// \headerfile <x86intrin.h>
2768///
2769/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2770///
2771/// \param __a
2772///    A 256-bit vector of [4 x double].
2773/// \param __b
2774///    A 256-bit vector of [4 x double].
2775/// \returns the CF flag.
2776static __inline int __DEFAULT_FN_ATTRS
2777_mm256_testc_pd(__m256d __a, __m256d __b)
2778{
2779  return __builtin_ia32_vtestcpd256((__v4df)__a, (__v4df)__b);
2780}
2781
2782/// Given two 256-bit floating-point vectors of [4 x double], perform an
2783///    element-by-element comparison of the double-precision elements in the
2784///    first source vector and the corresponding elements in the second source
2785///    vector.
2786///
2787///    The EFLAGS register is updated as follows: \n
2788///    If there is at least one pair of double-precision elements where the
2789///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2790///    ZF flag is set to 1. \n
2791///    If there is at least one pair of double-precision elements where the
2792///    sign-bit of the first element is 0 and the sign-bit of the second element
2793///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2794///    This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2795///    otherwise it returns 0.
2796///
2797/// \headerfile <x86intrin.h>
2798///
2799/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
2800///
2801/// \param __a
2802///    A 256-bit vector of [4 x double].
2803/// \param __b
2804///    A 256-bit vector of [4 x double].
2805/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
2806static __inline int __DEFAULT_FN_ATTRS
2807_mm256_testnzc_pd(__m256d __a, __m256d __b)
2808{
2809  return __builtin_ia32_vtestnzcpd256((__v4df)__a, (__v4df)__b);
2810}
2811
2812/// Given two 256-bit floating-point vectors of [8 x float], perform an
2813///    element-by-element comparison of the single-precision element in the
2814///    first source vector and the corresponding element in the second source
2815///    vector.
2816///
2817///    The EFLAGS register is updated as follows: \n
2818///    If there is at least one pair of single-precision elements where the
2819///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2820///    ZF flag is set to 1. \n
2821///    If there is at least one pair of single-precision elements where the
2822///    sign-bit of the first element is 0 and the sign-bit of the second element
2823///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2824///    This intrinsic returns the value of the ZF flag.
2825///
2826/// \headerfile <x86intrin.h>
2827///
2828/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2829///
2830/// \param __a
2831///    A 256-bit vector of [8 x float].
2832/// \param __b
2833///    A 256-bit vector of [8 x float].
2834/// \returns the ZF flag.
2835static __inline int __DEFAULT_FN_ATTRS
2836_mm256_testz_ps(__m256 __a, __m256 __b)
2837{
2838  return __builtin_ia32_vtestzps256((__v8sf)__a, (__v8sf)__b);
2839}
2840
2841/// Given two 256-bit floating-point vectors of [8 x float], perform an
2842///    element-by-element comparison of the single-precision element in the
2843///    first source vector and the corresponding element in the second source
2844///    vector.
2845///
2846///    The EFLAGS register is updated as follows: \n
2847///    If there is at least one pair of single-precision elements where the
2848///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2849///    ZF flag is set to 1. \n
2850///    If there is at least one pair of single-precision elements where the
2851///    sign-bit of the first element is 0 and the sign-bit of the second element
2852///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2853///    This intrinsic returns the value of the CF flag.
2854///
2855/// \headerfile <x86intrin.h>
2856///
2857/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2858///
2859/// \param __a
2860///    A 256-bit vector of [8 x float].
2861/// \param __b
2862///    A 256-bit vector of [8 x float].
2863/// \returns the CF flag.
2864static __inline int __DEFAULT_FN_ATTRS
2865_mm256_testc_ps(__m256 __a, __m256 __b)
2866{
2867  return __builtin_ia32_vtestcps256((__v8sf)__a, (__v8sf)__b);
2868}
2869
2870/// Given two 256-bit floating-point vectors of [8 x float], perform an
2871///    element-by-element comparison of the single-precision elements in the
2872///    first source vector and the corresponding elements in the second source
2873///    vector.
2874///
2875///    The EFLAGS register is updated as follows: \n
2876///    If there is at least one pair of single-precision elements where the
2877///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
2878///    ZF flag is set to 1. \n
2879///    If there is at least one pair of single-precision elements where the
2880///    sign-bit of the first element is 0 and the sign-bit of the second element
2881///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
2882///    This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2883///    otherwise it returns 0.
2884///
2885/// \headerfile <x86intrin.h>
2886///
2887/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
2888///
2889/// \param __a
2890///    A 256-bit vector of [8 x float].
2891/// \param __b
2892///    A 256-bit vector of [8 x float].
2893/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
2894static __inline int __DEFAULT_FN_ATTRS
2895_mm256_testnzc_ps(__m256 __a, __m256 __b)
2896{
2897  return __builtin_ia32_vtestnzcps256((__v8sf)__a, (__v8sf)__b);
2898}
2899
2900/// Given two 256-bit integer vectors, perform a bit-by-bit comparison
2901///    of the two source vectors.
2902///
2903///    The EFLAGS register is updated as follows: \n
2904///    If there is at least one pair of bits where both bits are 1, the ZF flag
2905///    is set to 0. Otherwise the ZF flag is set to 1. \n
2906///    If there is at least one pair of bits where the bit from the first source
2907///    vector is 0 and the bit from the second source vector is 1, the CF flag
2908///    is set to 0. Otherwise the CF flag is set to 1. \n
2909///    This intrinsic returns the value of the ZF flag.
2910///
2911/// \headerfile <x86intrin.h>
2912///
2913/// This intrinsic corresponds to the <c> VPTEST </c> instruction.
2914///
2915/// \param __a
2916///    A 256-bit integer vector.
2917/// \param __b
2918///    A 256-bit integer vector.
2919/// \returns the ZF flag.
2920static __inline int __DEFAULT_FN_ATTRS
2921_mm256_testz_si256(__m256i __a, __m256i __b)
2922{
2923  return __builtin_ia32_ptestz256((__v4di)__a, (__v4di)__b);
2924}
2925
2926/// Given two 256-bit integer vectors, perform a bit-by-bit comparison
2927///    of the two source vectors.
2928///
2929///    The EFLAGS register is updated as follows: \n
2930///    If there is at least one pair of bits where both bits are 1, the ZF flag
2931///    is set to 0. Otherwise the ZF flag is set to 1. \n
2932///    If there is at least one pair of bits where the bit from the first source
2933///    vector is 0 and the bit from the second source vector is 1, the CF flag
2934///    is set to 0. Otherwise the CF flag is set to 1. \n
2935///    This intrinsic returns the value of the CF flag.
2936///
2937/// \headerfile <x86intrin.h>
2938///
2939/// This intrinsic corresponds to the <c> VPTEST </c> instruction.
2940///
2941/// \param __a
2942///    A 256-bit integer vector.
2943/// \param __b
2944///    A 256-bit integer vector.
2945/// \returns the CF flag.
2946static __inline int __DEFAULT_FN_ATTRS
2947_mm256_testc_si256(__m256i __a, __m256i __b)
2948{
2949  return __builtin_ia32_ptestc256((__v4di)__a, (__v4di)__b);
2950}
2951
2952/// Given two 256-bit integer vectors, perform a bit-by-bit comparison
2953///    of the two source vectors.
2954///
2955///    The EFLAGS register is updated as follows: \n
2956///    If there is at least one pair of bits where both bits are 1, the ZF flag
2957///    is set to 0. Otherwise the ZF flag is set to 1. \n
2958///    If there is at least one pair of bits where the bit from the first source
2959///    vector is 0 and the bit from the second source vector is 1, the CF flag
2960///    is set to 0. Otherwise the CF flag is set to 1. \n
2961///    This intrinsic returns 1 if both the ZF and CF flags are set to 0,
2962///    otherwise it returns 0.
2963///
2964/// \headerfile <x86intrin.h>
2965///
2966/// This intrinsic corresponds to the <c> VPTEST </c> instruction.
2967///
2968/// \param __a
2969///    A 256-bit integer vector.
2970/// \param __b
2971///    A 256-bit integer vector.
2972/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
2973static __inline int __DEFAULT_FN_ATTRS
2974_mm256_testnzc_si256(__m256i __a, __m256i __b)
2975{
2976  return __builtin_ia32_ptestnzc256((__v4di)__a, (__v4di)__b);
2977}
2978
2979/* Vector extract sign mask */
2980/// Extracts the sign bits of double-precision floating point elements
2981///    in a 256-bit vector of [4 x double] and writes them to the lower order
2982///    bits of the return value.
2983///
2984/// \headerfile <x86intrin.h>
2985///
2986/// This intrinsic corresponds to the <c> VMOVMSKPD </c> instruction.
2987///
2988/// \param __a
2989///    A 256-bit vector of [4 x double] containing the double-precision
2990///    floating point values with sign bits to be extracted.
2991/// \returns The sign bits from the operand, written to bits [3:0].
2992static __inline int __DEFAULT_FN_ATTRS
2993_mm256_movemask_pd(__m256d __a)
2994{
2995  return __builtin_ia32_movmskpd256((__v4df)__a);
2996}
2997
2998/// Extracts the sign bits of single-precision floating point elements
2999///    in a 256-bit vector of [8 x float] and writes them to the lower order
3000///    bits of the return value.
3001///
3002/// \headerfile <x86intrin.h>
3003///
3004/// This intrinsic corresponds to the <c> VMOVMSKPS </c> instruction.
3005///
3006/// \param __a
3007///    A 256-bit vector of [8 x float] containing the single-precision floating
3008///    point values with sign bits to be extracted.
3009/// \returns The sign bits from the operand, written to bits [7:0].
3010static __inline int __DEFAULT_FN_ATTRS
3011_mm256_movemask_ps(__m256 __a)
3012{
3013  return __builtin_ia32_movmskps256((__v8sf)__a);
3014}
3015
/* Vector zero */
3017/// Zeroes the contents of all XMM or YMM registers.
3018///
3019/// \headerfile <x86intrin.h>
3020///
3021/// This intrinsic corresponds to the <c> VZEROALL </c> instruction.
static __inline void
__attribute__((__always_inline__, __nodebug__, __target__("avx")))
_mm256_zeroall(void)
{
  /* Takes no operands and returns nothing; only the builtin is invoked. */
  __builtin_ia32_vzeroall();
}
3027
3028/// Zeroes the upper 128 bits (bits 255:128) of all YMM registers.
3029///
3030/// \headerfile <x86intrin.h>
3031///
3032/// This intrinsic corresponds to the <c> VZEROUPPER </c> instruction.
static __inline void
__attribute__((__always_inline__, __nodebug__, __target__("avx")))
_mm256_zeroupper(void)
{
  /* Takes no operands and returns nothing; only the builtin is invoked. */
  __builtin_ia32_vzeroupper();
}
3038
3039/* Vector load with broadcast */
3040/// Loads a scalar single-precision floating point value from the
3041///    specified address pointed to by \a __a and broadcasts it to the elements
3042///    of a [4 x float] vector.
3043///
3044/// \headerfile <x86intrin.h>
3045///
3046/// This intrinsic corresponds to the <c> VBROADCASTSS </c> instruction.
3047///
3048/// \param __a
3049///    The single-precision floating point value to be broadcast.
3050/// \returns A 128-bit vector of [4 x float] whose 32-bit elements are set
3051///    equal to the broadcast value.
3052static __inline __m128 __DEFAULT_FN_ATTRS128
3053_mm_broadcast_ss(float const *__a)
3054{
3055  struct __mm_broadcast_ss_struct {
3056    float __f;
3057  } __attribute__((__packed__, __may_alias__));
3058  float __f = ((const struct __mm_broadcast_ss_struct*)__a)->__f;
3059  return __extension__ (__m128){ __f, __f, __f, __f };
3060}
3061
3062/// Loads a scalar double-precision floating point value from the
3063///    specified address pointed to by \a __a and broadcasts it to the elements
3064///    of a [4 x double] vector.
3065///
3066/// \headerfile <x86intrin.h>
3067///
3068/// This intrinsic corresponds to the <c> VBROADCASTSD </c> instruction.
3069///
3070/// \param __a
3071///    The double-precision floating point value to be broadcast.
3072/// \returns A 256-bit vector of [4 x double] whose 64-bit elements are set
3073///    equal to the broadcast value.
3074static __inline __m256d __DEFAULT_FN_ATTRS
3075_mm256_broadcast_sd(double const *__a)
3076{
3077  struct __mm256_broadcast_sd_struct {
3078    double __d;
3079  } __attribute__((__packed__, __may_alias__));
3080  double __d = ((const struct __mm256_broadcast_sd_struct*)__a)->__d;
3081  return __extension__ (__m256d)(__v4df){ __d, __d, __d, __d };
3082}
3083
3084/// Loads a scalar single-precision floating point value from the
3085///    specified address pointed to by \a __a and broadcasts it to the elements
3086///    of a [8 x float] vector.
3087///
3088/// \headerfile <x86intrin.h>
3089///
3090/// This intrinsic corresponds to the <c> VBROADCASTSS </c> instruction.
3091///
3092/// \param __a
3093///    The single-precision floating point value to be broadcast.
3094/// \returns A 256-bit vector of [8 x float] whose 32-bit elements are set
3095///    equal to the broadcast value.
3096static __inline __m256 __DEFAULT_FN_ATTRS
3097_mm256_broadcast_ss(float const *__a)
3098{
3099  struct __mm256_broadcast_ss_struct {
3100    float __f;
3101  } __attribute__((__packed__, __may_alias__));
3102  float __f = ((const struct __mm256_broadcast_ss_struct*)__a)->__f;
3103  return __extension__ (__m256)(__v8sf){ __f, __f, __f, __f, __f, __f, __f, __f };
3104}
3105
3106/// Loads the data from a 128-bit vector of [2 x double] from the
3107///    specified address pointed to by \a __a and broadcasts it to 128-bit
3108///    elements in a 256-bit vector of [4 x double].
3109///
3110/// \headerfile <x86intrin.h>
3111///
3112/// This intrinsic corresponds to the <c> VBROADCASTF128 </c> instruction.
3113///
3114/// \param __a
3115///    The 128-bit vector of [2 x double] to be broadcast.
3116/// \returns A 256-bit vector of [4 x double] whose 128-bit elements are set
3117///    equal to the broadcast value.
3118static __inline __m256d __DEFAULT_FN_ATTRS
3119_mm256_broadcast_pd(__m128d const *__a)
3120{
3121  __m128d __b = _mm_loadu_pd((const double *)__a);
3122  return (__m256d)__builtin_shufflevector((__v2df)__b, (__v2df)__b,
3123                                          0, 1, 0, 1);
3124}
3125
3126/// Loads the data from a 128-bit vector of [4 x float] from the
3127///    specified address pointed to by \a __a and broadcasts it to 128-bit
3128///    elements in a 256-bit vector of [8 x float].
3129///
3130/// \headerfile <x86intrin.h>
3131///
3132/// This intrinsic corresponds to the <c> VBROADCASTF128 </c> instruction.
3133///
3134/// \param __a
3135///    The 128-bit vector of [4 x float] to be broadcast.
3136/// \returns A 256-bit vector of [8 x float] whose 128-bit elements are set
3137///    equal to the broadcast value.
3138static __inline __m256 __DEFAULT_FN_ATTRS
3139_mm256_broadcast_ps(__m128 const *__a)
3140{
3141  __m128 __b = _mm_loadu_ps((const float *)__a);
3142  return (__m256)__builtin_shufflevector((__v4sf)__b, (__v4sf)__b,
3143                                         0, 1, 2, 3, 0, 1, 2, 3);
3144}
3145
3146/* SIMD load ops */
3147/// Loads 4 double-precision floating point values from a 32-byte aligned
3148///    memory location pointed to by \a __p into a vector of [4 x double].
3149///
3150/// \headerfile <x86intrin.h>
3151///
3152/// This intrinsic corresponds to the <c> VMOVAPD </c> instruction.
3153///
3154/// \param __p
3155///    A 32-byte aligned pointer to a memory location containing
3156///    double-precision floating point values.
3157/// \returns A 256-bit vector of [4 x double] containing the moved values.
3158static __inline __m256d __DEFAULT_FN_ATTRS
3159_mm256_load_pd(double const *__p)
3160{
3161  return *(const __m256d *)__p;
3162}
3163
3164/// Loads 8 single-precision floating point values from a 32-byte aligned
3165///    memory location pointed to by \a __p into a vector of [8 x float].
3166///
3167/// \headerfile <x86intrin.h>
3168///
3169/// This intrinsic corresponds to the <c> VMOVAPS </c> instruction.
3170///
3171/// \param __p
3172///    A 32-byte aligned pointer to a memory location containing float values.
3173/// \returns A 256-bit vector of [8 x float] containing the moved values.
3174static __inline __m256 __DEFAULT_FN_ATTRS
3175_mm256_load_ps(float const *__p)
3176{
3177  return *(const __m256 *)__p;
3178}
3179
3180/// Loads 4 double-precision floating point values from an unaligned
3181///    memory location pointed to by \a __p into a vector of [4 x double].
3182///
3183/// \headerfile <x86intrin.h>
3184///
3185/// This intrinsic corresponds to the <c> VMOVUPD </c> instruction.
3186///
3187/// \param __p
3188///    A pointer to a memory location containing double-precision floating
3189///    point values.
3190/// \returns A 256-bit vector of [4 x double] containing the moved values.
3191static __inline __m256d __DEFAULT_FN_ATTRS
3192_mm256_loadu_pd(double const *__p)
3193{
3194  struct __loadu_pd {
3195    __m256d_u __v;
3196  } __attribute__((__packed__, __may_alias__));
3197  return ((const struct __loadu_pd*)__p)->__v;
3198}
3199
3200/// Loads 8 single-precision floating point values from an unaligned
3201///    memory location pointed to by \a __p into a vector of [8 x float].
3202///
3203/// \headerfile <x86intrin.h>
3204///
3205/// This intrinsic corresponds to the <c> VMOVUPS </c> instruction.
3206///
3207/// \param __p
3208///    A pointer to a memory location containing single-precision floating
3209///    point values.
3210/// \returns A 256-bit vector of [8 x float] containing the moved values.
3211static __inline __m256 __DEFAULT_FN_ATTRS
3212_mm256_loadu_ps(float const *__p)
3213{
3214  struct __loadu_ps {
3215    __m256_u __v;
3216  } __attribute__((__packed__, __may_alias__));
3217  return ((const struct __loadu_ps*)__p)->__v;
3218}
3219
3220/// Loads 256 bits of integer data from a 32-byte aligned memory
3221///    location pointed to by \a __p into elements of a 256-bit integer vector.
3222///
3223/// \headerfile <x86intrin.h>
3224///
3225/// This intrinsic corresponds to the <c> VMOVDQA </c> instruction.
3226///
3227/// \param __p
3228///    A 32-byte aligned pointer to a 256-bit integer vector containing integer
3229///    values.
3230/// \returns A 256-bit integer vector containing the moved values.
3231static __inline __m256i __DEFAULT_FN_ATTRS
3232_mm256_load_si256(__m256i const *__p)
3233{
3234  return *__p;
3235}
3236
3237/// Loads 256 bits of integer data from an unaligned memory location
3238///    pointed to by \a __p into a 256-bit integer vector.
3239///
3240/// \headerfile <x86intrin.h>
3241///
3242/// This intrinsic corresponds to the <c> VMOVDQU </c> instruction.
3243///
3244/// \param __p
3245///    A pointer to a 256-bit integer vector containing integer values.
3246/// \returns A 256-bit integer vector containing the moved values.
3247static __inline __m256i __DEFAULT_FN_ATTRS
3248_mm256_loadu_si256(__m256i_u const *__p)
3249{
3250  struct __loadu_si256 {
3251    __m256i_u __v;
3252  } __attribute__((__packed__, __may_alias__));
3253  return ((const struct __loadu_si256*)__p)->__v;
3254}
3255
3256/// Loads 256 bits of integer data from an unaligned memory location
3257///    pointed to by \a __p into a 256-bit integer vector. This intrinsic may
3258///    perform better than \c _mm256_loadu_si256 when the data crosses a cache
3259///    line boundary.
3260///
3261/// \headerfile <x86intrin.h>
3262///
3263/// This intrinsic corresponds to the <c> VLDDQU </c> instruction.
3264///
3265/// \param __p
3266///    A pointer to a 256-bit integer vector containing integer values.
3267/// \returns A 256-bit integer vector containing the moved values.
3268static __inline __m256i __DEFAULT_FN_ATTRS
3269_mm256_lddqu_si256(__m256i_u const *__p)
3270{
3271  return (__m256i)__builtin_ia32_lddqu256((char const *)__p);
3272}
3273
3274/* SIMD store ops */
3275/// Stores double-precision floating point values from a 256-bit vector
3276///    of [4 x double] to a 32-byte aligned memory location pointed to by
3277///    \a __p.
3278///
3279/// \headerfile <x86intrin.h>
3280///
3281/// This intrinsic corresponds to the <c> VMOVAPD </c> instruction.
3282///
3283/// \param __p
3284///    A 32-byte aligned pointer to a memory location that will receive the
///    double-precision floating point values.
3286/// \param __a
3287///    A 256-bit vector of [4 x double] containing the values to be moved.
3288static __inline void __DEFAULT_FN_ATTRS
3289_mm256_store_pd(double *__p, __m256d __a)
3290{
3291  *(__m256d *)__p = __a;
3292}
3293
3294/// Stores single-precision floating point values from a 256-bit vector
3295///    of [8 x float] to a 32-byte aligned memory location pointed to by \a __p.
3296///
3297/// \headerfile <x86intrin.h>
3298///
3299/// This intrinsic corresponds to the <c> VMOVAPS </c> instruction.
3300///
3301/// \param __p
3302///    A 32-byte aligned pointer to a memory location that will receive the
3303///    float values.
3304/// \param __a
3305///    A 256-bit vector of [8 x float] containing the values to be moved.
3306static __inline void __DEFAULT_FN_ATTRS
3307_mm256_store_ps(float *__p, __m256 __a)
3308{
3309  *(__m256 *)__p = __a;
3310}
3311
3312/// Stores double-precision floating point values from a 256-bit vector
3313///    of [4 x double] to an unaligned memory location pointed to by \a __p.
3314///
3315/// \headerfile <x86intrin.h>
3316///
3317/// This intrinsic corresponds to the <c> VMOVUPD </c> instruction.
3318///
3319/// \param __p
3320///    A pointer to a memory location that will receive the double-precision
3321///    floating point values.
3322/// \param __a
3323///    A 256-bit vector of [4 x double] containing the values to be moved.
3324static __inline void __DEFAULT_FN_ATTRS
3325_mm256_storeu_pd(double *__p, __m256d __a)
3326{
3327  struct __storeu_pd {
3328    __m256d_u __v;
3329  } __attribute__((__packed__, __may_alias__));
3330  ((struct __storeu_pd*)__p)->__v = __a;
3331}
3332
3333/// Stores single-precision floating point values from a 256-bit vector
3334///    of [8 x float] to an unaligned memory location pointed to by \a __p.
3335///
3336/// \headerfile <x86intrin.h>
3337///
3338/// This intrinsic corresponds to the <c> VMOVUPS </c> instruction.
3339///
3340/// \param __p
3341///    A pointer to a memory location that will receive the float values.
3342/// \param __a
3343///    A 256-bit vector of [8 x float] containing the values to be moved.
3344static __inline void __DEFAULT_FN_ATTRS
3345_mm256_storeu_ps(float *__p, __m256 __a)
3346{
3347  struct __storeu_ps {
3348    __m256_u __v;
3349  } __attribute__((__packed__, __may_alias__));
3350  ((struct __storeu_ps*)__p)->__v = __a;
3351}
3352
3353/// Stores integer values from a 256-bit integer vector to a 32-byte
3354///    aligned memory location pointed to by \a __p.
3355///
3356/// \headerfile <x86intrin.h>
3357///
3358/// This intrinsic corresponds to the <c> VMOVDQA </c> instruction.
3359///
3360/// \param __p
3361///    A 32-byte aligned pointer to a memory location that will receive the
3362///    integer values.
3363/// \param __a
3364///    A 256-bit integer vector containing the values to be moved.
3365static __inline void __DEFAULT_FN_ATTRS
3366_mm256_store_si256(__m256i *__p, __m256i __a)
3367{
3368  *__p = __a;
3369}
3370
3371/// Stores integer values from a 256-bit integer vector to an unaligned
3372///    memory location pointed to by \a __p.
3373///
3374/// \headerfile <x86intrin.h>
3375///
3376/// This intrinsic corresponds to the <c> VMOVDQU </c> instruction.
3377///
3378/// \param __p
3379///    A pointer to a memory location that will receive the integer values.
3380/// \param __a
3381///    A 256-bit integer vector containing the values to be moved.
3382static __inline void __DEFAULT_FN_ATTRS
3383_mm256_storeu_si256(__m256i_u *__p, __m256i __a)
3384{
3385  struct __storeu_si256 {
3386    __m256i_u __v;
3387  } __attribute__((__packed__, __may_alias__));
3388  ((struct __storeu_si256*)__p)->__v = __a;
3389}
3390
3391/* Conditional load ops */
3392/// Conditionally loads double-precision floating point elements from a
3393///    memory location pointed to by \a __p into a 128-bit vector of
3394///    [2 x double], depending on the mask bits associated with each data
3395///    element.
3396///
3397/// \headerfile <x86intrin.h>
3398///
3399/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
3400///
3401/// \param __p
3402///    A pointer to a memory location that contains the double-precision
3403///    floating point values.
3404/// \param __m
3405///    A 128-bit integer vector containing the mask. The most significant bit of
3406///    each data element represents the mask bits. If a mask bit is zero, the
3407///    corresponding value in the memory location is not loaded and the
3408///    corresponding field in the return value is set to zero.
3409/// \returns A 128-bit vector of [2 x double] containing the loaded values.
3410static __inline __m128d __DEFAULT_FN_ATTRS128
3411_mm_maskload_pd(double const *__p, __m128i __m)
3412{
3413  return (__m128d)__builtin_ia32_maskloadpd((const __v2df *)__p, (__v2di)__m);
3414}
3415
3416/// Conditionally loads double-precision floating point elements from a
3417///    memory location pointed to by \a __p into a 256-bit vector of
3418///    [4 x double], depending on the mask bits associated with each data
3419///    element.
3420///
3421/// \headerfile <x86intrin.h>
3422///
3423/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
3424///
3425/// \param __p
3426///    A pointer to a memory location that contains the double-precision
3427///    floating point values.
3428/// \param __m
3429///    A 256-bit integer vector of [4 x quadword] containing the mask. The most
3430///    significant bit of each quadword element represents the mask bits. If a
3431///    mask bit is zero, the corresponding value in the memory location is not
3432///    loaded and the corresponding field in the return value is set to zero.
3433/// \returns A 256-bit vector of [4 x double] containing the loaded values.
3434static __inline __m256d __DEFAULT_FN_ATTRS
3435_mm256_maskload_pd(double const *__p, __m256i __m)
3436{
3437  return (__m256d)__builtin_ia32_maskloadpd256((const __v4df *)__p,
3438                                               (__v4di)__m);
3439}
3440
3441/// Conditionally loads single-precision floating point elements from a
3442///    memory location pointed to by \a __p into a 128-bit vector of
3443///    [4 x float], depending on the mask bits associated with each data
3444///    element.
3445///
3446/// \headerfile <x86intrin.h>
3447///
3448/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
3449///
3450/// \param __p
3451///    A pointer to a memory location that contains the single-precision
3452///    floating point values.
3453/// \param __m
3454///    A 128-bit integer vector containing the mask. The most significant bit of
3455///    each data element represents the mask bits. If a mask bit is zero, the
3456///    corresponding value in the memory location is not loaded and the
3457///    corresponding field in the return value is set to zero.
3458/// \returns A 128-bit vector of [4 x float] containing the loaded values.
3459static __inline __m128 __DEFAULT_FN_ATTRS128
3460_mm_maskload_ps(float const *__p, __m128i __m)
3461{
3462  return (__m128)__builtin_ia32_maskloadps((const __v4sf *)__p, (__v4si)__m);
3463}
3464
3465/// Conditionally loads single-precision floating point elements from a
3466///    memory location pointed to by \a __p into a 256-bit vector of
3467///    [8 x float], depending on the mask bits associated with each data
3468///    element.
3469///
3470/// \headerfile <x86intrin.h>
3471///
3472/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
3473///
3474/// \param __p
3475///    A pointer to a memory location that contains the single-precision
3476///    floating point values.
3477/// \param __m
3478///    A 256-bit integer vector of [8 x dword] containing the mask. The most
3479///    significant bit of each dword element represents the mask bits. If a mask
3480///    bit is zero, the corresponding value in the memory location is not loaded
3481///    and the corresponding field in the return value is set to zero.
3482/// \returns A 256-bit vector of [8 x float] containing the loaded values.
3483static __inline __m256 __DEFAULT_FN_ATTRS
3484_mm256_maskload_ps(float const *__p, __m256i __m)
3485{
3486  return (__m256)__builtin_ia32_maskloadps256((const __v8sf *)__p, (__v8si)__m);
3487}
3488
3489/* Conditional store ops */
3490/// Moves single-precision floating point values from a 256-bit vector
3491///    of [8 x float] to a memory location pointed to by \a __p, according to
3492///    the specified mask.
3493///
3494/// \headerfile <x86intrin.h>
3495///
3496/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
3497///
3498/// \param __p
3499///    A pointer to a memory location that will receive the float values.
3500/// \param __m
3501///    A 256-bit integer vector of [8 x dword] containing the mask. The most
3502///    significant bit of each dword element in the mask vector represents the
3503///    mask bits. If a mask bit is zero, the corresponding value from vector
3504///    \a __a is not stored and the corresponding field in the memory location
3505///    pointed to by \a __p is not changed.
3506/// \param __a
3507///    A 256-bit vector of [8 x float] containing the values to be stored.
3508static __inline void __DEFAULT_FN_ATTRS
3509_mm256_maskstore_ps(float *__p, __m256i __m, __m256 __a)
3510{
3511  __builtin_ia32_maskstoreps256((__v8sf *)__p, (__v8si)__m, (__v8sf)__a);
3512}
3513
3514/// Moves double-precision values from a 128-bit vector of [2 x double]
3515///    to a memory location pointed to by \a __p, according to the specified
3516///    mask.
3517///
3518/// \headerfile <x86intrin.h>
3519///
3520/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
3521///
3522/// \param __p
///    A pointer to a memory location that will receive the double-precision
///    floating point values.
3524/// \param __m
3525///    A 128-bit integer vector containing the mask. The most significant bit of
3526///    each field in the mask vector represents the mask bits. If a mask bit is
3527///    zero, the corresponding value from vector \a __a is not stored and the
3528///    corresponding field in the memory location pointed to by \a __p is not
3529///    changed.
3530/// \param __a
3531///    A 128-bit vector of [2 x double] containing the values to be stored.
3532static __inline void __DEFAULT_FN_ATTRS128
3533_mm_maskstore_pd(double *__p, __m128i __m, __m128d __a)
3534{
3535  __builtin_ia32_maskstorepd((__v2df *)__p, (__v2di)__m, (__v2df)__a);
3536}
3537
3538/// Moves double-precision values from a 256-bit vector of [4 x double]
3539///    to a memory location pointed to by \a __p, according to the specified
3540///    mask.
3541///
3542/// \headerfile <x86intrin.h>
3543///
3544/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
3545///
3546/// \param __p
///    A pointer to a memory location that will receive the double-precision
///    floating point values.
3548/// \param __m
3549///    A 256-bit integer vector of [4 x quadword] containing the mask. The most
3550///    significant bit of each quadword element in the mask vector represents
3551///    the mask bits. If a mask bit is zero, the corresponding value from vector
///    \a __a is not stored and the corresponding field in the memory location
3553///    pointed to by \a __p is not changed.
3554/// \param __a
3555///    A 256-bit vector of [4 x double] containing the values to be stored.
3556static __inline void __DEFAULT_FN_ATTRS
3557_mm256_maskstore_pd(double *__p, __m256i __m, __m256d __a)
3558{
3559  __builtin_ia32_maskstorepd256((__v4df *)__p, (__v4di)__m, (__v4df)__a);
3560}
3561
3562/// Moves single-precision floating point values from a 128-bit vector
3563///    of [4 x float] to a memory location pointed to by \a __p, according to
3564///    the specified mask.
3565///
3566/// \headerfile <x86intrin.h>
3567///
3568/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
3569///
3570/// \param __p
3571///    A pointer to a memory location that will receive the float values.
3572/// \param __m
3573///    A 128-bit integer vector containing the mask. The most significant bit of
3574///    each field in the mask vector represents the mask bits. If a mask bit is
///    zero, the corresponding value from vector \a __a is not stored and the
3576///    corresponding field in the memory location pointed to by \a __p is not
3577///    changed.
3578/// \param __a
3579///    A 128-bit vector of [4 x float] containing the values to be stored.
3580static __inline void __DEFAULT_FN_ATTRS128
3581_mm_maskstore_ps(float *__p, __m128i __m, __m128 __a)
3582{
3583  __builtin_ia32_maskstoreps((__v4sf *)__p, (__v4si)__m, (__v4sf)__a);
3584}
3585
3586/* Cacheability support ops */
3587/// Moves integer data from a 256-bit integer vector to a 32-byte
3588///    aligned memory location. To minimize caching, the data is flagged as
3589///    non-temporal (unlikely to be used again soon).
3590///
3591/// \headerfile <x86intrin.h>
3592///
3593/// This intrinsic corresponds to the <c> VMOVNTDQ </c> instruction.
3594///
3595/// \param __a
3596///    A pointer to a 32-byte aligned memory location that will receive the
3597///    integer values.
3598/// \param __b
3599///    A 256-bit integer vector containing the values to be moved.
3600static __inline void __DEFAULT_FN_ATTRS
3601_mm256_stream_si256(void *__a, __m256i __b)
3602{
3603  typedef __v4di __v4di_aligned __attribute__((aligned(32)));
3604  __builtin_nontemporal_store((__v4di_aligned)__b, (__v4di_aligned*)__a);
3605}
3606
3607/// Moves double-precision values from a 256-bit vector of [4 x double]
3608///    to a 32-byte aligned memory location. To minimize caching, the data is
3609///    flagged as non-temporal (unlikely to be used again soon).
3610///
3611/// \headerfile <x86intrin.h>
3612///
3613/// This intrinsic corresponds to the <c> VMOVNTPD </c> instruction.
3614///
3615/// \param __a
3616///    A pointer to a 32-byte aligned memory location that will receive the
3617///    double-precision floating-point values.
3618/// \param __b
3619///    A 256-bit vector of [4 x double] containing the values to be moved.
3620static __inline void __DEFAULT_FN_ATTRS
3621_mm256_stream_pd(void *__a, __m256d __b)
3622{
3623  typedef __v4df __v4df_aligned __attribute__((aligned(32)));
3624  __builtin_nontemporal_store((__v4df_aligned)__b, (__v4df_aligned*)__a);
3625}
3626
3627/// Moves single-precision floating point values from a 256-bit vector
3628///    of [8 x float] to a 32-byte aligned memory location. To minimize
3629///    caching, the data is flagged as non-temporal (unlikely to be used again
3630///    soon).
3631///
3632/// \headerfile <x86intrin.h>
3633///
3634/// This intrinsic corresponds to the <c> VMOVNTPS </c> instruction.
3635///
3636/// \param __p
3637///    A pointer to a 32-byte aligned memory location that will receive the
3638///    single-precision floating point values.
3639/// \param __a
3640///    A 256-bit vector of [8 x float] containing the values to be moved.
3641static __inline void __DEFAULT_FN_ATTRS
3642_mm256_stream_ps(void *__p, __m256 __a)
3643{
3644  typedef __v8sf __v8sf_aligned __attribute__((aligned(32)));
3645  __builtin_nontemporal_store((__v8sf_aligned)__a, (__v8sf_aligned*)__p);
3646}
3647
3648/* Create vectors */
3649/// Create a 256-bit vector of [4 x double] with undefined values.
3650///
3651/// \headerfile <x86intrin.h>
3652///
3653/// This intrinsic has no corresponding instruction.
3654///
3655/// \returns A 256-bit vector of [4 x double] containing undefined values.
3656static __inline__ __m256d __DEFAULT_FN_ATTRS
3657_mm256_undefined_pd(void)
3658{
3659  return (__m256d)__builtin_ia32_undef256();
3660}
3661
3662/// Create a 256-bit vector of [8 x float] with undefined values.
3663///
3664/// \headerfile <x86intrin.h>
3665///
3666/// This intrinsic has no corresponding instruction.
3667///
3668/// \returns A 256-bit vector of [8 x float] containing undefined values.
3669static __inline__ __m256 __DEFAULT_FN_ATTRS
3670_mm256_undefined_ps(void)
3671{
3672  return (__m256)__builtin_ia32_undef256();
3673}
3674
3675/// Create a 256-bit integer vector with undefined values.
3676///
3677/// \headerfile <x86intrin.h>
3678///
3679/// This intrinsic has no corresponding instruction.
3680///
3681/// \returns A 256-bit integer vector containing undefined values.
3682static __inline__ __m256i __DEFAULT_FN_ATTRS
3683_mm256_undefined_si256(void)
3684{
3685  return (__m256i)__builtin_ia32_undef256();
3686}
3687
3688/// Constructs a 256-bit floating-point vector of [4 x double]
3689///    initialized with the specified double-precision floating-point values.
3690///
3691/// \headerfile <x86intrin.h>
3692///
3693/// This intrinsic corresponds to the <c> VUNPCKLPD+VINSERTF128 </c>
3694///   instruction.
3695///
3696/// \param __a
3697///    A double-precision floating-point value used to initialize bits [255:192]
3698///    of the result.
3699/// \param __b
3700///    A double-precision floating-point value used to initialize bits [191:128]
3701///    of the result.
3702/// \param __c
3703///    A double-precision floating-point value used to initialize bits [127:64]
3704///    of the result.
3705/// \param __d
3706///    A double-precision floating-point value used to initialize bits [63:0]
3707///    of the result.
3708/// \returns An initialized 256-bit floating-point vector of [4 x double].
3709static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
3710_mm256_set_pd(double __a, double __b, double __c, double __d)
3711{
3712  return __extension__ (__m256d){ __d, __c, __b, __a };
3713}
3714
3715/// Constructs a 256-bit floating-point vector of [8 x float] initialized
3716///    with the specified single-precision floating-point values.
3717///
3718/// \headerfile <x86intrin.h>
3719///
3720/// This intrinsic is a utility function and does not correspond to a specific
3721///   instruction.
3722///
3723/// \param __a
3724///    A single-precision floating-point value used to initialize bits [255:224]
3725///    of the result.
3726/// \param __b
3727///    A single-precision floating-point value used to initialize bits [223:192]
3728///    of the result.
3729/// \param __c
3730///    A single-precision floating-point value used to initialize bits [191:160]
3731///    of the result.
3732/// \param __d
3733///    A single-precision floating-point value used to initialize bits [159:128]
3734///    of the result.
3735/// \param __e
3736///    A single-precision floating-point value used to initialize bits [127:96]
3737///    of the result.
3738/// \param __f
3739///    A single-precision floating-point value used to initialize bits [95:64]
3740///    of the result.
3741/// \param __g
3742///    A single-precision floating-point value used to initialize bits [63:32]
3743///    of the result.
3744/// \param __h
3745///    A single-precision floating-point value used to initialize bits [31:0]
3746///    of the result.
3747/// \returns An initialized 256-bit floating-point vector of [8 x float].
3748static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
3749_mm256_set_ps(float __a, float __b, float __c, float __d,
3750              float __e, float __f, float __g, float __h)
3751{
3752  return __extension__ (__m256){ __h, __g, __f, __e, __d, __c, __b, __a };
3753}
3754
3755/// Constructs a 256-bit integer vector initialized with the specified
3756///    32-bit integral values.
3757///
3758/// \headerfile <x86intrin.h>
3759///
3760/// This intrinsic is a utility function and does not correspond to a specific
3761///   instruction.
3762///
3763/// \param __i0
3764///    A 32-bit integral value used to initialize bits [255:224] of the result.
3765/// \param __i1
3766///    A 32-bit integral value used to initialize bits [223:192] of the result.
3767/// \param __i2
3768///    A 32-bit integral value used to initialize bits [191:160] of the result.
3769/// \param __i3
3770///    A 32-bit integral value used to initialize bits [159:128] of the result.
3771/// \param __i4
3772///    A 32-bit integral value used to initialize bits [127:96] of the result.
3773/// \param __i5
3774///    A 32-bit integral value used to initialize bits [95:64] of the result.
3775/// \param __i6
3776///    A 32-bit integral value used to initialize bits [63:32] of the result.
3777/// \param __i7
3778///    A 32-bit integral value used to initialize bits [31:0] of the result.
3779/// \returns An initialized 256-bit integer vector.
3780static __inline __m256i __DEFAULT_FN_ATTRS
3781_mm256_set_epi32(int __i0, int __i1, int __i2, int __i3,
3782                 int __i4, int __i5, int __i6, int __i7)
3783{
3784  return __extension__ (__m256i)(__v8si){ __i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0 };
3785}
3786
3787/// Constructs a 256-bit integer vector initialized with the specified
3788///    16-bit integral values.
3789///
3790/// \headerfile <x86intrin.h>
3791///
3792/// This intrinsic is a utility function and does not correspond to a specific
3793///   instruction.
3794///
3795/// \param __w15
3796///    A 16-bit integral value used to initialize bits [255:240] of the result.
3797/// \param __w14
3798///    A 16-bit integral value used to initialize bits [239:224] of the result.
3799/// \param __w13
3800///    A 16-bit integral value used to initialize bits [223:208] of the result.
3801/// \param __w12
3802///    A 16-bit integral value used to initialize bits [207:192] of the result.
3803/// \param __w11
3804///    A 16-bit integral value used to initialize bits [191:176] of the result.
3805/// \param __w10
3806///    A 16-bit integral value used to initialize bits [175:160] of the result.
3807/// \param __w09
3808///    A 16-bit integral value used to initialize bits [159:144] of the result.
3809/// \param __w08
3810///    A 16-bit integral value used to initialize bits [143:128] of the result.
3811/// \param __w07
3812///    A 16-bit integral value used to initialize bits [127:112] of the result.
3813/// \param __w06
3814///    A 16-bit integral value used to initialize bits [111:96] of the result.
3815/// \param __w05
3816///    A 16-bit integral value used to initialize bits [95:80] of the result.
3817/// \param __w04
3818///    A 16-bit integral value used to initialize bits [79:64] of the result.
3819/// \param __w03
3820///    A 16-bit integral value used to initialize bits [63:48] of the result.
3821/// \param __w02
3822///    A 16-bit integral value used to initialize bits [47:32] of the result.
3823/// \param __w01
3824///    A 16-bit integral value used to initialize bits [31:16] of the result.
3825/// \param __w00
3826///    A 16-bit integral value used to initialize bits [15:0] of the result.
3827/// \returns An initialized 256-bit integer vector.
3828static __inline __m256i __DEFAULT_FN_ATTRS
3829_mm256_set_epi16(short __w15, short __w14, short __w13, short __w12,
3830                 short __w11, short __w10, short __w09, short __w08,
3831                 short __w07, short __w06, short __w05, short __w04,
3832                 short __w03, short __w02, short __w01, short __w00)
3833{
3834  return __extension__ (__m256i)(__v16hi){ __w00, __w01, __w02, __w03, __w04, __w05, __w06,
3835    __w07, __w08, __w09, __w10, __w11, __w12, __w13, __w14, __w15 };
3836}
3837
3838/// Constructs a 256-bit integer vector initialized with the specified
3839///    8-bit integral values.
3840///
3841/// \headerfile <x86intrin.h>
3842///
3843/// This intrinsic is a utility function and does not correspond to a specific
3844///   instruction.
3845///
3846/// \param __b31
3847///    An 8-bit integral value used to initialize bits [255:248] of the result.
3848/// \param __b30
3849///    An 8-bit integral value used to initialize bits [247:240] of the result.
3850/// \param __b29
3851///    An 8-bit integral value used to initialize bits [239:232] of the result.
3852/// \param __b28
3853///    An 8-bit integral value used to initialize bits [231:224] of the result.
3854/// \param __b27
3855///    An 8-bit integral value used to initialize bits [223:216] of the result.
3856/// \param __b26
3857///    An 8-bit integral value used to initialize bits [215:208] of the result.
3858/// \param __b25
3859///    An 8-bit integral value used to initialize bits [207:200] of the result.
3860/// \param __b24
3861///    An 8-bit integral value used to initialize bits [199:192] of the result.
3862/// \param __b23
3863///    An 8-bit integral value used to initialize bits [191:184] of the result.
3864/// \param __b22
3865///    An 8-bit integral value used to initialize bits [183:176] of the result.
3866/// \param __b21
3867///    An 8-bit integral value used to initialize bits [175:168] of the result.
3868/// \param __b20
3869///    An 8-bit integral value used to initialize bits [167:160] of the result.
3870/// \param __b19
3871///    An 8-bit integral value used to initialize bits [159:152] of the result.
3872/// \param __b18
3873///    An 8-bit integral value used to initialize bits [151:144] of the result.
3874/// \param __b17
3875///    An 8-bit integral value used to initialize bits [143:136] of the result.
3876/// \param __b16
3877///    An 8-bit integral value used to initialize bits [135:128] of the result.
3878/// \param __b15
3879///    An 8-bit integral value used to initialize bits [127:120] of the result.
3880/// \param __b14
3881///    An 8-bit integral value used to initialize bits [119:112] of the result.
3882/// \param __b13
3883///    An 8-bit integral value used to initialize bits [111:104] of the result.
3884/// \param __b12
3885///    An 8-bit integral value used to initialize bits [103:96] of the result.
3886/// \param __b11
3887///    An 8-bit integral value used to initialize bits [95:88] of the result.
3888/// \param __b10
3889///    An 8-bit integral value used to initialize bits [87:80] of the result.
3890/// \param __b09
3891///    An 8-bit integral value used to initialize bits [79:72] of the result.
3892/// \param __b08
3893///    An 8-bit integral value used to initialize bits [71:64] of the result.
3894/// \param __b07
3895///    An 8-bit integral value used to initialize bits [63:56] of the result.
3896/// \param __b06
3897///    An 8-bit integral value used to initialize bits [55:48] of the result.
3898/// \param __b05
3899///    An 8-bit integral value used to initialize bits [47:40] of the result.
3900/// \param __b04
3901///    An 8-bit integral value used to initialize bits [39:32] of the result.
3902/// \param __b03
3903///    An 8-bit integral value used to initialize bits [31:24] of the result.
3904/// \param __b02
3905///    An 8-bit integral value used to initialize bits [23:16] of the result.
3906/// \param __b01
3907///    An 8-bit integral value used to initialize bits [15:8] of the result.
3908/// \param __b00
3909///    An 8-bit integral value used to initialize bits [7:0] of the result.
3910/// \returns An initialized 256-bit integer vector.
3911static __inline __m256i __DEFAULT_FN_ATTRS
3912_mm256_set_epi8(char __b31, char __b30, char __b29, char __b28,
3913                char __b27, char __b26, char __b25, char __b24,
3914                char __b23, char __b22, char __b21, char __b20,
3915                char __b19, char __b18, char __b17, char __b16,
3916                char __b15, char __b14, char __b13, char __b12,
3917                char __b11, char __b10, char __b09, char __b08,
3918                char __b07, char __b06, char __b05, char __b04,
3919                char __b03, char __b02, char __b01, char __b00)
3920{
3921  return __extension__ (__m256i)(__v32qi){
3922    __b00, __b01, __b02, __b03, __b04, __b05, __b06, __b07,
3923    __b08, __b09, __b10, __b11, __b12, __b13, __b14, __b15,
3924    __b16, __b17, __b18, __b19, __b20, __b21, __b22, __b23,
3925    __b24, __b25, __b26, __b27, __b28, __b29, __b30, __b31
3926  };
3927}
3928
3929/// Constructs a 256-bit integer vector initialized with the specified
3930///    64-bit integral values.
3931///
3932/// \headerfile <x86intrin.h>
3933///
3934/// This intrinsic corresponds to the <c> VPUNPCKLQDQ+VINSERTF128 </c>
3935///   instruction.
3936///
3937/// \param __a
3938///    A 64-bit integral value used to initialize bits [255:192] of the result.
3939/// \param __b
3940///    A 64-bit integral value used to initialize bits [191:128] of the result.
3941/// \param __c
3942///    A 64-bit integral value used to initialize bits [127:64] of the result.
3943/// \param __d
3944///    A 64-bit integral value used to initialize bits [63:0] of the result.
3945/// \returns An initialized 256-bit integer vector.
3946static __inline __m256i __DEFAULT_FN_ATTRS
3947_mm256_set_epi64x(long long __a, long long __b, long long __c, long long __d)
3948{
3949  return __extension__ (__m256i)(__v4di){ __d, __c, __b, __a };
3950}
3951
3952/* Create vectors with elements in reverse order */
3953/// Constructs a 256-bit floating-point vector of [4 x double],
3954///    initialized in reverse order with the specified double-precision
3955///    floating-point values.
3956///
3957/// \headerfile <x86intrin.h>
3958///
3959/// This intrinsic corresponds to the <c> VUNPCKLPD+VINSERTF128 </c>
3960///   instruction.
3961///
3962/// \param __a
3963///    A double-precision floating-point value used to initialize bits [63:0]
3964///    of the result.
3965/// \param __b
3966///    A double-precision floating-point value used to initialize bits [127:64]
3967///    of the result.
3968/// \param __c
3969///    A double-precision floating-point value used to initialize bits [191:128]
3970///    of the result.
3971/// \param __d
3972///    A double-precision floating-point value used to initialize bits [255:192]
3973///    of the result.
3974/// \returns An initialized 256-bit floating-point vector of [4 x double].
3975static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
3976_mm256_setr_pd(double __a, double __b, double __c, double __d)
3977{
3978  return _mm256_set_pd(__d, __c, __b, __a);
3979}
3980
3981/// Constructs a 256-bit floating-point vector of [8 x float],
3982///    initialized in reverse order with the specified single-precision
///    floating-point values.
3984///
3985/// \headerfile <x86intrin.h>
3986///
3987/// This intrinsic is a utility function and does not correspond to a specific
3988///   instruction.
3989///
3990/// \param __a
3991///    A single-precision floating-point value used to initialize bits [31:0]
3992///    of the result.
3993/// \param __b
3994///    A single-precision floating-point value used to initialize bits [63:32]
3995///    of the result.
3996/// \param __c
3997///    A single-precision floating-point value used to initialize bits [95:64]
3998///    of the result.
3999/// \param __d
4000///    A single-precision floating-point value used to initialize bits [127:96]
4001///    of the result.
4002/// \param __e
4003///    A single-precision floating-point value used to initialize bits [159:128]
4004///    of the result.
4005/// \param __f
4006///    A single-precision floating-point value used to initialize bits [191:160]
4007///    of the result.
4008/// \param __g
4009///    A single-precision floating-point value used to initialize bits [223:192]
4010///    of the result.
4011/// \param __h
4012///    A single-precision floating-point value used to initialize bits [255:224]
4013///    of the result.
4014/// \returns An initialized 256-bit floating-point vector of [8 x float].
4015static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
4016_mm256_setr_ps(float __a, float __b, float __c, float __d,
4017               float __e, float __f, float __g, float __h)
4018{
4019  return _mm256_set_ps(__h, __g, __f, __e, __d, __c, __b, __a);
4020}
4021
4022/// Constructs a 256-bit integer vector, initialized in reverse order
4023///    with the specified 32-bit integral values.
4024///
4025/// \headerfile <x86intrin.h>
4026///
4027/// This intrinsic is a utility function and does not correspond to a specific
4028///   instruction.
4029///
4030/// \param __i0
4031///    A 32-bit integral value used to initialize bits [31:0] of the result.
4032/// \param __i1
4033///    A 32-bit integral value used to initialize bits [63:32] of the result.
4034/// \param __i2
4035///    A 32-bit integral value used to initialize bits [95:64] of the result.
4036/// \param __i3
4037///    A 32-bit integral value used to initialize bits [127:96] of the result.
4038/// \param __i4
4039///    A 32-bit integral value used to initialize bits [159:128] of the result.
4040/// \param __i5
4041///    A 32-bit integral value used to initialize bits [191:160] of the result.
4042/// \param __i6
4043///    A 32-bit integral value used to initialize bits [223:192] of the result.
4044/// \param __i7
4045///    A 32-bit integral value used to initialize bits [255:224] of the result.
4046/// \returns An initialized 256-bit integer vector.
4047static __inline __m256i __DEFAULT_FN_ATTRS
4048_mm256_setr_epi32(int __i0, int __i1, int __i2, int __i3,
4049                  int __i4, int __i5, int __i6, int __i7)
4050{
4051  return _mm256_set_epi32(__i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0);
4052}
4053
4054/// Constructs a 256-bit integer vector, initialized in reverse order
4055///    with the specified 16-bit integral values.
4056///
4057/// \headerfile <x86intrin.h>
4058///
4059/// This intrinsic is a utility function and does not correspond to a specific
4060///   instruction.
4061///
4062/// \param __w15
4063///    A 16-bit integral value used to initialize bits [15:0] of the result.
4064/// \param __w14
4065///    A 16-bit integral value used to initialize bits [31:16] of the result.
4066/// \param __w13
4067///    A 16-bit integral value used to initialize bits [47:32] of the result.
4068/// \param __w12
4069///    A 16-bit integral value used to initialize bits [63:48] of the result.
4070/// \param __w11
4071///    A 16-bit integral value used to initialize bits [79:64] of the result.
4072/// \param __w10
4073///    A 16-bit integral value used to initialize bits [95:80] of the result.
4074/// \param __w09
4075///    A 16-bit integral value used to initialize bits [111:96] of the result.
4076/// \param __w08
4077///    A 16-bit integral value used to initialize bits [127:112] of the result.
4078/// \param __w07
4079///    A 16-bit integral value used to initialize bits [143:128] of the result.
4080/// \param __w06
4081///    A 16-bit integral value used to initialize bits [159:144] of the result.
4082/// \param __w05
4083///    A 16-bit integral value used to initialize bits [175:160] of the result.
4084/// \param __w04
4085///    A 16-bit integral value used to initialize bits [191:176] of the result.
4086/// \param __w03
4087///    A 16-bit integral value used to initialize bits [207:192] of the result.
4088/// \param __w02
4089///    A 16-bit integral value used to initialize bits [223:208] of the result.
4090/// \param __w01
4091///    A 16-bit integral value used to initialize bits [239:224] of the result.
4092/// \param __w00
4093///    A 16-bit integral value used to initialize bits [255:240] of the result.
4094/// \returns An initialized 256-bit integer vector.
4095static __inline __m256i __DEFAULT_FN_ATTRS
4096_mm256_setr_epi16(short __w15, short __w14, short __w13, short __w12,
4097       short __w11, short __w10, short __w09, short __w08,
4098       short __w07, short __w06, short __w05, short __w04,
4099       short __w03, short __w02, short __w01, short __w00)
4100{
4101  return _mm256_set_epi16(__w00, __w01, __w02, __w03,
4102                          __w04, __w05, __w06, __w07,
4103                          __w08, __w09, __w10, __w11,
4104                          __w12, __w13, __w14, __w15);
4105}
4106
4107/// Constructs a 256-bit integer vector, initialized in reverse order
4108///    with the specified 8-bit integral values.
4109///
4110/// \headerfile <x86intrin.h>
4111///
4112/// This intrinsic is a utility function and does not correspond to a specific
4113///   instruction.
4114///
4115/// \param __b31
4116///    An 8-bit integral value used to initialize bits [7:0] of the result.
4117/// \param __b30
4118///    An 8-bit integral value used to initialize bits [15:8] of the result.
4119/// \param __b29
4120///    An 8-bit integral value used to initialize bits [23:16] of the result.
4121/// \param __b28
4122///    An 8-bit integral value used to initialize bits [31:24] of the result.
4123/// \param __b27
4124///    An 8-bit integral value used to initialize bits [39:32] of the result.
4125/// \param __b26
4126///    An 8-bit integral value used to initialize bits [47:40] of the result.
4127/// \param __b25
4128///    An 8-bit integral value used to initialize bits [55:48] of the result.
4129/// \param __b24
4130///    An 8-bit integral value used to initialize bits [63:56] of the result.
4131/// \param __b23
4132///    An 8-bit integral value used to initialize bits [71:64] of the result.
4133/// \param __b22
4134///    An 8-bit integral value used to initialize bits [79:72] of the result.
4135/// \param __b21
4136///    An 8-bit integral value used to initialize bits [87:80] of the result.
4137/// \param __b20
4138///    An 8-bit integral value used to initialize bits [95:88] of the result.
4139/// \param __b19
4140///    An 8-bit integral value used to initialize bits [103:96] of the result.
4141/// \param __b18
4142///    An 8-bit integral value used to initialize bits [111:104] of the result.
4143/// \param __b17
4144///    An 8-bit integral value used to initialize bits [119:112] of the result.
4145/// \param __b16
4146///    An 8-bit integral value used to initialize bits [127:120] of the result.
4147/// \param __b15
4148///    An 8-bit integral value used to initialize bits [135:128] of the result.
4149/// \param __b14
4150///    An 8-bit integral value used to initialize bits [143:136] of the result.
4151/// \param __b13
4152///    An 8-bit integral value used to initialize bits [151:144] of the result.
4153/// \param __b12
4154///    An 8-bit integral value used to initialize bits [159:152] of the result.
4155/// \param __b11
4156///    An 8-bit integral value used to initialize bits [167:160] of the result.
4157/// \param __b10
4158///    An 8-bit integral value used to initialize bits [175:168] of the result.
4159/// \param __b09
4160///    An 8-bit integral value used to initialize bits [183:176] of the result.
4161/// \param __b08
4162///    An 8-bit integral value used to initialize bits [191:184] of the result.
4163/// \param __b07
4164///    An 8-bit integral value used to initialize bits [199:192] of the result.
4165/// \param __b06
4166///    An 8-bit integral value used to initialize bits [207:200] of the result.
4167/// \param __b05
4168///    An 8-bit integral value used to initialize bits [215:208] of the result.
4169/// \param __b04
4170///    An 8-bit integral value used to initialize bits [223:216] of the result.
4171/// \param __b03
4172///    An 8-bit integral value used to initialize bits [231:224] of the result.
4173/// \param __b02
4174///    An 8-bit integral value used to initialize bits [239:232] of the result.
4175/// \param __b01
4176///    An 8-bit integral value used to initialize bits [247:240] of the result.
4177/// \param __b00
4178///    An 8-bit integral value used to initialize bits [255:248] of the result.
4179/// \returns An initialized 256-bit integer vector.
4180static __inline __m256i __DEFAULT_FN_ATTRS
4181_mm256_setr_epi8(char __b31, char __b30, char __b29, char __b28,
4182                 char __b27, char __b26, char __b25, char __b24,
4183                 char __b23, char __b22, char __b21, char __b20,
4184                 char __b19, char __b18, char __b17, char __b16,
4185                 char __b15, char __b14, char __b13, char __b12,
4186                 char __b11, char __b10, char __b09, char __b08,
4187                 char __b07, char __b06, char __b05, char __b04,
4188                 char __b03, char __b02, char __b01, char __b00)
4189{
4190  return _mm256_set_epi8(__b00, __b01, __b02, __b03, __b04, __b05, __b06, __b07,
4191                         __b08, __b09, __b10, __b11, __b12, __b13, __b14, __b15,
4192                         __b16, __b17, __b18, __b19, __b20, __b21, __b22, __b23,
4193                         __b24, __b25, __b26, __b27, __b28, __b29, __b30, __b31);
4194}
4195
4196/// Constructs a 256-bit integer vector, initialized in reverse order
4197///    with the specified 64-bit integral values.
4198///
4199/// \headerfile <x86intrin.h>
4200///
4201/// This intrinsic corresponds to the <c> VPUNPCKLQDQ+VINSERTF128 </c>
4202///   instruction.
4203///
4204/// \param __a
4205///    A 64-bit integral value used to initialize bits [63:0] of the result.
4206/// \param __b
4207///    A 64-bit integral value used to initialize bits [127:64] of the result.
4208/// \param __c
4209///    A 64-bit integral value used to initialize bits [191:128] of the result.
4210/// \param __d
4211///    A 64-bit integral value used to initialize bits [255:192] of the result.
4212/// \returns An initialized 256-bit integer vector.
4213static __inline __m256i __DEFAULT_FN_ATTRS
4214_mm256_setr_epi64x(long long __a, long long __b, long long __c, long long __d)
4215{
4216  return _mm256_set_epi64x(__d, __c, __b, __a);
4217}
4218
4219/* Create vectors with repeated elements */
4220/// Constructs a 256-bit floating-point vector of [4 x double], with each
4221///    of the four double-precision floating-point vector elements set to the
4222///    specified double-precision floating-point value.
4223///
4224/// \headerfile <x86intrin.h>
4225///
4226/// This intrinsic corresponds to the <c> VMOVDDUP+VINSERTF128 </c> instruction.
4227///
4228/// \param __w
4229///    A double-precision floating-point value used to initialize each vector
4230///    element of the result.
4231/// \returns An initialized 256-bit floating-point vector of [4 x double].
4232static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR
4233_mm256_set1_pd(double __w)
4234{
4235  return _mm256_set_pd(__w, __w, __w, __w);
4236}
4237
4238/// Constructs a 256-bit floating-point vector of [8 x float], with each
4239///    of the eight single-precision floating-point vector elements set to the
4240///    specified single-precision floating-point value.
4241///
4242/// \headerfile <x86intrin.h>
4243///
4244/// This intrinsic corresponds to the <c> VPERMILPS+VINSERTF128 </c>
4245///   instruction.
4246///
4247/// \param __w
4248///    A single-precision floating-point value used to initialize each vector
4249///    element of the result.
4250/// \returns An initialized 256-bit floating-point vector of [8 x float].
4251static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR
4252_mm256_set1_ps(float __w)
4253{
4254  return _mm256_set_ps(__w, __w, __w, __w, __w, __w, __w, __w);
4255}
4256
4257/// Constructs a 256-bit integer vector of [8 x i32], with each of the
4258///    32-bit integral vector elements set to the specified 32-bit integral
4259///    value.
4260///
4261/// \headerfile <x86intrin.h>
4262///
4263/// This intrinsic corresponds to the <c> VPERMILPS+VINSERTF128 </c>
4264///   instruction.
4265///
4266/// \param __i
4267///    A 32-bit integral value used to initialize each vector element of the
4268///    result.
4269/// \returns An initialized 256-bit integer vector of [8 x i32].
4270static __inline __m256i __DEFAULT_FN_ATTRS
4271_mm256_set1_epi32(int __i)
4272{
4273  return _mm256_set_epi32(__i, __i, __i, __i, __i, __i, __i, __i);
4274}
4275
4276/// Constructs a 256-bit integer vector of [16 x i16], with each of the
4277///    16-bit integral vector elements set to the specified 16-bit integral
4278///    value.
4279///
4280/// \headerfile <x86intrin.h>
4281///
4282/// This intrinsic corresponds to the <c> VPSHUFB+VINSERTF128 </c> instruction.
4283///
4284/// \param __w
4285///    A 16-bit integral value used to initialize each vector element of the
4286///    result.
4287/// \returns An initialized 256-bit integer vector of [16 x i16].
4288static __inline __m256i __DEFAULT_FN_ATTRS
4289_mm256_set1_epi16(short __w)
4290{
4291  return _mm256_set_epi16(__w, __w, __w, __w, __w, __w, __w, __w,
4292                          __w, __w, __w, __w, __w, __w, __w, __w);
4293}
4294
4295/// Constructs a 256-bit integer vector of [32 x i8], with each of the
4296///    8-bit integral vector elements set to the specified 8-bit integral value.
4297///
4298/// \headerfile <x86intrin.h>
4299///
4300/// This intrinsic corresponds to the <c> VPSHUFB+VINSERTF128 </c> instruction.
4301///
4302/// \param __b
4303///    An 8-bit integral value used to initialize each vector element of the
4304///    result.
4305/// \returns An initialized 256-bit integer vector of [32 x i8].
4306static __inline __m256i __DEFAULT_FN_ATTRS
4307_mm256_set1_epi8(char __b)
4308{
4309  return _mm256_set_epi8(__b, __b, __b, __b, __b, __b, __b, __b,
4310                         __b, __b, __b, __b, __b, __b, __b, __b,
4311                         __b, __b, __b, __b, __b, __b, __b, __b,
4312                         __b, __b, __b, __b, __b, __b, __b, __b);
4313}
4314
4315/// Constructs a 256-bit integer vector of [4 x i64], with each of the
4316///    64-bit integral vector elements set to the specified 64-bit integral
4317///    value.
4318///
4319/// \headerfile <x86intrin.h>
4320///
4321/// This intrinsic corresponds to the <c> VMOVDDUP+VINSERTF128 </c> instruction.
4322///
4323/// \param __q
4324///    A 64-bit integral value used to initialize each vector element of the
4325///    result.
4326/// \returns An initialized 256-bit integer vector of [4 x i64].
4327static __inline __m256i __DEFAULT_FN_ATTRS
4328_mm256_set1_epi64x(long long __q)
4329{
4330  return _mm256_set_epi64x(__q, __q, __q, __q);
4331}
4332
/* Create zeroed vectors */
4334/// Constructs a 256-bit floating-point vector of [4 x double] with all
4335///    vector elements initialized to zero.
4336///
4337/// \headerfile <x86intrin.h>
4338///
4339/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
4340///
4341/// \returns A 256-bit vector of [4 x double] with all elements set to zero.
4342static __inline __m256d __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_setzero_pd(void) {
4343  return __extension__(__m256d){0.0, 0.0, 0.0, 0.0};
4344}
4345
4346/// Constructs a 256-bit floating-point vector of [8 x float] with all
4347///    vector elements initialized to zero.
4348///
4349/// \headerfile <x86intrin.h>
4350///
4351/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
4352///
4353/// \returns A 256-bit vector of [8 x float] with all elements set to zero.
4354static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_setzero_ps(void) {
4355  return __extension__ (__m256){ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
4356}
4357
4358/// Constructs a 256-bit integer vector initialized to zero.
4359///
4360/// \headerfile <x86intrin.h>
4361///
4362/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
4363///
4364/// \returns A 256-bit integer vector initialized to zero.
4365static __inline __m256i __DEFAULT_FN_ATTRS_CONSTEXPR
4366_mm256_setzero_si256(void) {
4367  return __extension__ (__m256i)(__v4di){ 0, 0, 0, 0 };
4368}
4369
4370/* Cast between vector types */
4371/// Casts a 256-bit floating-point vector of [4 x double] into a 256-bit
4372///    floating-point vector of [8 x float].
4373///
4374/// \headerfile <x86intrin.h>
4375///
4376/// This intrinsic has no corresponding instruction.
4377///
4378/// \param __a
4379///    A 256-bit floating-point vector of [4 x double].
4380/// \returns A 256-bit floating-point vector of [8 x float] containing the same
4381///    bitwise pattern as the parameter.
4382static __inline __m256 __DEFAULT_FN_ATTRS
4383_mm256_castpd_ps(__m256d __a)
4384{
4385  return (__m256)__a;
4386}
4387
4388/// Casts a 256-bit floating-point vector of [4 x double] into a 256-bit
4389///    integer vector.
4390///
4391/// \headerfile <x86intrin.h>
4392///
4393/// This intrinsic has no corresponding instruction.
4394///
4395/// \param __a
4396///    A 256-bit floating-point vector of [4 x double].
4397/// \returns A 256-bit integer vector containing the same bitwise pattern as the
4398///    parameter.
4399static __inline __m256i __DEFAULT_FN_ATTRS
4400_mm256_castpd_si256(__m256d __a)
4401{
4402  return (__m256i)__a;
4403}
4404
4405/// Casts a 256-bit floating-point vector of [8 x float] into a 256-bit
4406///    floating-point vector of [4 x double].
4407///
4408/// \headerfile <x86intrin.h>
4409///
4410/// This intrinsic has no corresponding instruction.
4411///
4412/// \param __a
4413///    A 256-bit floating-point vector of [8 x float].
4414/// \returns A 256-bit floating-point vector of [4 x double] containing the same
4415///    bitwise pattern as the parameter.
4416static __inline __m256d __DEFAULT_FN_ATTRS
4417_mm256_castps_pd(__m256 __a)
4418{
4419  return (__m256d)__a;
4420}
4421
4422/// Casts a 256-bit floating-point vector of [8 x float] into a 256-bit
4423///    integer vector.
4424///
4425/// \headerfile <x86intrin.h>
4426///
4427/// This intrinsic has no corresponding instruction.
4428///
4429/// \param __a
4430///    A 256-bit floating-point vector of [8 x float].
4431/// \returns A 256-bit integer vector containing the same bitwise pattern as the
4432///    parameter.
4433static __inline __m256i __DEFAULT_FN_ATTRS
4434_mm256_castps_si256(__m256 __a)
4435{
4436  return (__m256i)__a;
4437}
4438
4439/// Casts a 256-bit integer vector into a 256-bit floating-point vector
4440///    of [8 x float].
4441///
4442/// \headerfile <x86intrin.h>
4443///
4444/// This intrinsic has no corresponding instruction.
4445///
4446/// \param __a
4447///    A 256-bit integer vector.
4448/// \returns A 256-bit floating-point vector of [8 x float] containing the same
4449///    bitwise pattern as the parameter.
4450static __inline __m256 __DEFAULT_FN_ATTRS
4451_mm256_castsi256_ps(__m256i __a)
4452{
4453  return (__m256)__a;
4454}
4455
4456/// Casts a 256-bit integer vector into a 256-bit floating-point vector
4457///    of [4 x double].
4458///
4459/// \headerfile <x86intrin.h>
4460///
4461/// This intrinsic has no corresponding instruction.
4462///
4463/// \param __a
4464///    A 256-bit integer vector.
4465/// \returns A 256-bit floating-point vector of [4 x double] containing the same
4466///    bitwise pattern as the parameter.
4467static __inline __m256d __DEFAULT_FN_ATTRS
4468_mm256_castsi256_pd(__m256i __a)
4469{
4470  return (__m256d)__a;
4471}
4472
4473/// Returns the lower 128 bits of a 256-bit floating-point vector of
4474///    [4 x double] as a 128-bit floating-point vector of [2 x double].
4475///
4476/// \headerfile <x86intrin.h>
4477///
4478/// This intrinsic has no corresponding instruction.
4479///
4480/// \param __a
4481///    A 256-bit floating-point vector of [4 x double].
4482/// \returns A 128-bit floating-point vector of [2 x double] containing the
4483///    lower 128 bits of the parameter.
4484static __inline __m128d __DEFAULT_FN_ATTRS
4485_mm256_castpd256_pd128(__m256d __a)
4486{
4487  return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 1);
4488}
4489
4490/// Returns the lower 128 bits of a 256-bit floating-point vector of
4491///    [8 x float] as a 128-bit floating-point vector of [4 x float].
4492///
4493/// \headerfile <x86intrin.h>
4494///
4495/// This intrinsic has no corresponding instruction.
4496///
4497/// \param __a
4498///    A 256-bit floating-point vector of [8 x float].
4499/// \returns A 128-bit floating-point vector of [4 x float] containing the
4500///    lower 128 bits of the parameter.
4501static __inline __m128 __DEFAULT_FN_ATTRS
4502_mm256_castps256_ps128(__m256 __a)
4503{
4504  return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 1, 2, 3);
4505}
4506
4507/// Truncates a 256-bit integer vector into a 128-bit integer vector.
4508///
4509/// \headerfile <x86intrin.h>
4510///
4511/// This intrinsic has no corresponding instruction.
4512///
4513/// \param __a
4514///    A 256-bit integer vector.
4515/// \returns A 128-bit integer vector containing the lower 128 bits of the
4516///    parameter.
4517static __inline __m128i __DEFAULT_FN_ATTRS
4518_mm256_castsi256_si128(__m256i __a)
4519{
4520  return __builtin_shufflevector((__v4di)__a, (__v4di)__a, 0, 1);
4521}
4522
4523/// Constructs a 256-bit floating-point vector of [4 x double] from a
4524///    128-bit floating-point vector of [2 x double].
4525///
4526///    The lower 128 bits contain the value of the source vector. The contents
4527///    of the upper 128 bits are undefined.
4528///
4529/// \headerfile <x86intrin.h>
4530///
4531/// This intrinsic has no corresponding instruction.
4532///
4533/// \param __a
4534///    A 128-bit vector of [2 x double].
4535/// \returns A 256-bit floating-point vector of [4 x double]. The lower 128 bits
4536///    contain the value of the parameter. The contents of the upper 128 bits
4537///    are undefined.
4538static __inline __m256d __DEFAULT_FN_ATTRS
4539_mm256_castpd128_pd256(__m128d __a)
4540{
4541  return __builtin_shufflevector(
4542      (__v2df)__a, (__v2df)__builtin_nondeterministic_value(__a), 0, 1, 2, 3);
4543}
4544
4545/// Constructs a 256-bit floating-point vector of [8 x float] from a
4546///    128-bit floating-point vector of [4 x float].
4547///
4548///    The lower 128 bits contain the value of the source vector. The contents
4549///    of the upper 128 bits are undefined.
4550///
4551/// \headerfile <x86intrin.h>
4552///
4553/// This intrinsic has no corresponding instruction.
4554///
4555/// \param __a
4556///    A 128-bit vector of [4 x float].
4557/// \returns A 256-bit floating-point vector of [8 x float]. The lower 128 bits
4558///    contain the value of the parameter. The contents of the upper 128 bits
4559///    are undefined.
4560static __inline __m256 __DEFAULT_FN_ATTRS
4561_mm256_castps128_ps256(__m128 __a)
4562{
4563  return __builtin_shufflevector((__v4sf)__a,
4564                                 (__v4sf)__builtin_nondeterministic_value(__a),
4565                                 0, 1, 2, 3, 4, 5, 6, 7);
4566}
4567
4568/// Constructs a 256-bit integer vector from a 128-bit integer vector.
4569///
4570///    The lower 128 bits contain the value of the source vector. The contents
4571///    of the upper 128 bits are undefined.
4572///
4573/// \headerfile <x86intrin.h>
4574///
4575/// This intrinsic has no corresponding instruction.
4576///
4577/// \param __a
4578///    A 128-bit integer vector.
4579/// \returns A 256-bit integer vector. The lower 128 bits contain the value of
4580///    the parameter. The contents of the upper 128 bits are undefined.
4581static __inline __m256i __DEFAULT_FN_ATTRS
4582_mm256_castsi128_si256(__m128i __a)
4583{
4584  return __builtin_shufflevector(
4585      (__v2di)__a, (__v2di)__builtin_nondeterministic_value(__a), 0, 1, 2, 3);
4586}
4587
4588/// Constructs a 256-bit floating-point vector of [4 x double] from a
4589///    128-bit floating-point vector of [2 x double]. The lower 128 bits
4590///    contain the value of the source vector. The upper 128 bits are set
4591///    to zero.
4592///
4593/// \headerfile <x86intrin.h>
4594///
4595/// This intrinsic has no corresponding instruction.
4596///
4597/// \param __a
4598///    A 128-bit vector of [2 x double].
4599/// \returns A 256-bit floating-point vector of [4 x double]. The lower 128 bits
4600///    contain the value of the parameter. The upper 128 bits are set to zero.
4601static __inline __m256d __DEFAULT_FN_ATTRS
4602_mm256_zextpd128_pd256(__m128d __a)
4603{
4604  return __builtin_shufflevector((__v2df)__a, (__v2df)_mm_setzero_pd(), 0, 1, 2, 3);
4605}
4606
4607/// Constructs a 256-bit floating-point vector of [8 x float] from a
4608///    128-bit floating-point vector of [4 x float]. The lower 128 bits contain
4609///    the value of the source vector. The upper 128 bits are set to zero.
4610///
4611/// \headerfile <x86intrin.h>
4612///
4613/// This intrinsic has no corresponding instruction.
4614///
4615/// \param __a
4616///    A 128-bit vector of [4 x float].
4617/// \returns A 256-bit floating-point vector of [8 x float]. The lower 128 bits
4618///    contain the value of the parameter. The upper 128 bits are set to zero.
4619static __inline __m256 __DEFAULT_FN_ATTRS
4620_mm256_zextps128_ps256(__m128 __a)
4621{
4622  return __builtin_shufflevector((__v4sf)__a, (__v4sf)_mm_setzero_ps(), 0, 1, 2, 3, 4, 5, 6, 7);
4623}
4624
4625/// Constructs a 256-bit integer vector from a 128-bit integer vector.
4626///    The lower 128 bits contain the value of the source vector. The upper
4627///    128 bits are set to zero.
4628///
4629/// \headerfile <x86intrin.h>
4630///
4631/// This intrinsic has no corresponding instruction.
4632///
4633/// \param __a
4634///    A 128-bit integer vector.
4635/// \returns A 256-bit integer vector. The lower 128 bits contain the value of
4636///    the parameter. The upper 128 bits are set to zero.
4637static __inline __m256i __DEFAULT_FN_ATTRS
4638_mm256_zextsi128_si256(__m128i __a)
4639{
4640  return __builtin_shufflevector((__v2di)__a, (__v2di)_mm_setzero_si128(), 0, 1, 2, 3);
4641}
4642
4643/*
4644   Vector insert.
4645   We use macros rather than inlines because we only want to accept
4646   invocations where the immediate M is a constant expression.
4647*/
/// Builds a 256-bit vector of [8 x float] from a copy of the 256-bit vector
///    of [8 x float] given in the first parameter, in which either the upper
///    or the lower 128 bits are replaced with the contents of the 128-bit
///    vector of [4 x float] given in the second parameter.
///
///    The immediate integer parameter selects whether the upper or the lower
///    128 bits are replaced.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256 _mm256_insertf128_ps(__m256 V1, __m128 V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
///
/// \param V1
///    A 256-bit vector of [8 x float]. The 128-bit half of this vector that
///    is not selected by \a M is copied to the result unchanged.
/// \param V2
///    A 128-bit vector of [4 x float]. Its contents are written to either the
///    upper or the lower 128 bits of the result, depending on the value of
///    parameter \a M.
/// \param M
///    An immediate integer. The least significant bit selects the half of the
///    result that receives \a V2: \n
///    If bit [0] of \a M is 0, \a V2 is copied to bits [127:0] of the result,
///    and bits [255:128] of \a V1 are copied to bits [255:128] of the
///    result. \n
///    If bit [0] of \a M is 1, \a V2 is copied to bits [255:128] of the
///    result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
///    result.
/// \returns A 256-bit vector of [8 x float] containing the combined values.
#define _mm256_insertf128_ps(V1, V2, M) \
  ((__m256)__builtin_ia32_vinsertf128_ps256( \
      (__v8sf)(__m256)(V1), (__v4sf)(__m128)(V2), (int)(M)))
4685
/// Builds a 256-bit vector of [4 x double] from a copy of the 256-bit vector
///    of [4 x double] given in the first parameter, in which either the upper
///    or the lower 128 bits are replaced with the contents of the 128-bit
///    vector of [2 x double] given in the second parameter.
///
///    The immediate integer parameter selects whether the upper or the lower
///    128 bits are replaced.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256d _mm256_insertf128_pd(__m256d V1, __m128d V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
///
/// \param V1
///    A 256-bit vector of [4 x double]. The 128-bit half of this vector that
///    is not selected by \a M is copied to the result unchanged.
/// \param V2
///    A 128-bit vector of [2 x double]. Its contents are written to either the
///    upper or the lower 128 bits of the result, depending on the value of
///    parameter \a M.
/// \param M
///    An immediate integer. The least significant bit selects the half of the
///    result that receives \a V2: \n
///    If bit [0] of \a M is 0, \a V2 is copied to bits [127:0] of the result,
///    and bits [255:128] of \a V1 are copied to bits [255:128] of the
///    result. \n
///    If bit [0] of \a M is 1, \a V2 is copied to bits [255:128] of the
///    result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
///    result.
/// \returns A 256-bit vector of [4 x double] containing the combined values.
#define _mm256_insertf128_pd(V1, V2, M) \
  ((__m256d)__builtin_ia32_vinsertf128_pd256( \
      (__v4df)(__m256d)(V1), (__v2df)(__m128d)(V2), (int)(M)))
4723
/// Builds a 256-bit integer vector from a copy of the 256-bit integer vector
///    given in the first parameter, in which either the upper or the lower
///    128 bits are replaced with the contents of the 128-bit integer vector
///    given in the second parameter.
///
///    The immediate integer parameter selects whether the upper or the lower
///    128 bits are replaced.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m256i _mm256_insertf128_si256(__m256i V1, __m128i V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
///
/// \param V1
///    A 256-bit integer vector. The 128-bit half of this vector that is not
///    selected by \a M is copied to the result unchanged.
/// \param V2
///    A 128-bit integer vector. Its contents are written to either the upper
///    or the lower 128 bits of the result, depending on the value of
///    parameter \a M.
/// \param M
///    An immediate integer. The least significant bit selects the half of the
///    result that receives \a V2: \n
///    If bit [0] of \a M is 0, \a V2 is copied to bits [127:0] of the result,
///    and bits [255:128] of \a V1 are copied to bits [255:128] of the
///    result. \n
///    If bit [0] of \a M is 1, \a V2 is copied to bits [255:128] of the
///    result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
///    result.
/// \returns A 256-bit integer vector containing the combined values.
#define _mm256_insertf128_si256(V1, V2, M) \
  ((__m256i)__builtin_ia32_vinsertf128_si256( \
      (__v8si)(__m256i)(V1), (__v4si)(__m128i)(V2), (int)(M)))
4761
4762/*
4763   Vector extract.
4764   We use macros rather than inlines because we only want to accept
4765   invocations where the immediate M is a constant expression.
4766*/
/// Selects either the upper or the lower 128 bits of a 256-bit vector of
///    [8 x float], according to the immediate integer parameter, and returns
///    them as a 128-bit vector of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm256_extractf128_ps(__m256 V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
///
/// \param V
///    A 256-bit vector of [8 x float].
/// \param M
///    An immediate integer. The least significant bit selects the half of
///    \a V that is extracted: \n
///    If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
///    result. \n
///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
/// \returns A 128-bit vector of [4 x float] containing the extracted bits.
#define _mm256_extractf128_ps(V, M) \
  ((__m128)__builtin_ia32_vextractf128_ps256( \
      (__v8sf)(__m256)(V), (int)(M)))
4790
/// Selects either the upper or the lower 128 bits of a 256-bit vector of
///    [4 x double], according to the immediate integer parameter, and returns
///    them as a 128-bit vector of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm256_extractf128_pd(__m256d V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
///
/// \param V
///    A 256-bit vector of [4 x double].
/// \param M
///    An immediate integer. The least significant bit selects the half of
///    \a V that is extracted: \n
///    If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
///    result. \n
///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
/// \returns A 128-bit vector of [2 x double] containing the extracted bits.
#define _mm256_extractf128_pd(V, M) \
  ((__m128d)__builtin_ia32_vextractf128_pd256( \
      (__v4df)(__m256d)(V), (int)(M)))
4814
/// Selects either the upper or the lower 128 bits of a 256-bit integer
///    vector, according to the immediate integer parameter, and returns them
///    as a 128-bit integer vector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm256_extractf128_si256(__m256i V, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
///
/// \param V
///    A 256-bit integer vector.
/// \param M
///    An immediate integer. The least significant bit selects the half of
///    \a V that is extracted: \n
///    If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
///    result. \n
///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
/// \returns A 128-bit integer vector containing the extracted bits.
#define _mm256_extractf128_si256(V, M) \
  ((__m128i)__builtin_ia32_vextractf128_si256( \
      (__v8si)(__m256i)(V), (int)(M)))
4838
4839/// Constructs a 256-bit floating-point vector of [8 x float] by
4840///    concatenating two 128-bit floating-point vectors of [4 x float].
4841///
4842/// \headerfile <x86intrin.h>
4843///
4844/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4845///
4846/// \param __hi
4847///    A 128-bit floating-point vector of [4 x float] to be copied to the upper
4848///    128 bits of the result.
4849/// \param __lo
4850///    A 128-bit floating-point vector of [4 x float] to be copied to the lower
4851///    128 bits of the result.
4852/// \returns A 256-bit floating-point vector of [8 x float] containing the
4853///    concatenated result.
4854static __inline __m256 __DEFAULT_FN_ATTRS
4855_mm256_set_m128 (__m128 __hi, __m128 __lo)
4856{
4857  return (__m256) __builtin_shufflevector((__v4sf)__lo, (__v4sf)__hi, 0, 1, 2, 3, 4, 5, 6, 7);
4858}
4859
4860/// Constructs a 256-bit floating-point vector of [4 x double] by
4861///    concatenating two 128-bit floating-point vectors of [2 x double].
4862///
4863/// \headerfile <x86intrin.h>
4864///
4865/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4866///
4867/// \param __hi
4868///    A 128-bit floating-point vector of [2 x double] to be copied to the upper
4869///    128 bits of the result.
4870/// \param __lo
4871///    A 128-bit floating-point vector of [2 x double] to be copied to the lower
4872///    128 bits of the result.
4873/// \returns A 256-bit floating-point vector of [4 x double] containing the
4874///    concatenated result.
4875static __inline __m256d __DEFAULT_FN_ATTRS
4876_mm256_set_m128d (__m128d __hi, __m128d __lo)
4877{
4878  return (__m256d) __builtin_shufflevector((__v2df)__lo, (__v2df)__hi, 0, 1, 2, 3);
4879}
4880
4881/// Constructs a 256-bit integer vector by concatenating two 128-bit
4882///    integer vectors.
4883///
4884/// \headerfile <x86intrin.h>
4885///
4886/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4887///
4888/// \param __hi
4889///    A 128-bit integer vector to be copied to the upper 128 bits of the
4890///    result.
4891/// \param __lo
4892///    A 128-bit integer vector to be copied to the lower 128 bits of the
4893///    result.
4894/// \returns A 256-bit integer vector containing the concatenated result.
4895static __inline __m256i __DEFAULT_FN_ATTRS
4896_mm256_set_m128i (__m128i __hi, __m128i __lo)
4897{
4898  return (__m256i) __builtin_shufflevector((__v2di)__lo, (__v2di)__hi, 0, 1, 2, 3);
4899}
4900
4901/// Constructs a 256-bit floating-point vector of [8 x float] by
4902///    concatenating two 128-bit floating-point vectors of [4 x float]. This is
4903///    similar to _mm256_set_m128, but the order of the input parameters is
4904///    swapped.
4905///
4906/// \headerfile <x86intrin.h>
4907///
4908/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4909///
4910/// \param __lo
4911///    A 128-bit floating-point vector of [4 x float] to be copied to the lower
4912///    128 bits of the result.
4913/// \param __hi
4914///    A 128-bit floating-point vector of [4 x float] to be copied to the upper
4915///    128 bits of the result.
4916/// \returns A 256-bit floating-point vector of [8 x float] containing the
4917///    concatenated result.
4918static __inline __m256 __DEFAULT_FN_ATTRS
4919_mm256_setr_m128 (__m128 __lo, __m128 __hi)
4920{
4921  return _mm256_set_m128(__hi, __lo);
4922}
4923
4924/// Constructs a 256-bit floating-point vector of [4 x double] by
4925///    concatenating two 128-bit floating-point vectors of [2 x double]. This is
4926///    similar to _mm256_set_m128d, but the order of the input parameters is
4927///    swapped.
4928///
4929/// \headerfile <x86intrin.h>
4930///
4931/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4932///
4933/// \param __lo
4934///    A 128-bit floating-point vector of [2 x double] to be copied to the lower
4935///    128 bits of the result.
4936/// \param __hi
4937///    A 128-bit floating-point vector of [2 x double] to be copied to the upper
4938///    128 bits of the result.
4939/// \returns A 256-bit floating-point vector of [4 x double] containing the
4940///    concatenated result.
4941static __inline __m256d __DEFAULT_FN_ATTRS
4942_mm256_setr_m128d (__m128d __lo, __m128d __hi)
4943{
4944  return (__m256d)_mm256_set_m128d(__hi, __lo);
4945}
4946
4947/// Constructs a 256-bit integer vector by concatenating two 128-bit
4948///    integer vectors. This is similar to _mm256_set_m128i, but the order of
4949///    the input parameters is swapped.
4950///
4951/// \headerfile <x86intrin.h>
4952///
4953/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
4954///
4955/// \param __lo
4956///    A 128-bit integer vector to be copied to the lower 128 bits of the
4957///    result.
4958/// \param __hi
4959///    A 128-bit integer vector to be copied to the upper 128 bits of the
4960///    result.
4961/// \returns A 256-bit integer vector containing the concatenated result.
4962static __inline __m256i __DEFAULT_FN_ATTRS
4963_mm256_setr_m128i (__m128i __lo, __m128i __hi)
4964{
4965  return (__m256i)_mm256_set_m128i(__hi, __lo);
4966}
4967
4968/* SIMD load ops (unaligned) */
4969/// Loads two 128-bit floating-point vectors of [4 x float] from
4970///    unaligned memory locations and constructs a 256-bit floating-point vector
4971///    of [8 x float] by concatenating the two 128-bit vectors.
4972///
4973/// \headerfile <x86intrin.h>
4974///
4975/// This intrinsic corresponds to load instructions followed by the
4976///   <c> VINSERTF128 </c> instruction.
4977///
4978/// \param __addr_hi
4979///    A pointer to a 128-bit memory location containing 4 consecutive
4980///    single-precision floating-point values. These values are to be copied to
4981///    bits[255:128] of the result. The address of the memory location does not
4982///    have to be aligned.
4983/// \param __addr_lo
4984///    A pointer to a 128-bit memory location containing 4 consecutive
4985///    single-precision floating-point values. These values are to be copied to
4986///    bits[127:0] of the result. The address of the memory location does not
4987///    have to be aligned.
4988/// \returns A 256-bit floating-point vector of [8 x float] containing the
4989///    concatenated result.
4990static __inline __m256 __DEFAULT_FN_ATTRS
4991_mm256_loadu2_m128(float const *__addr_hi, float const *__addr_lo)
4992{
4993  return _mm256_set_m128(_mm_loadu_ps(__addr_hi), _mm_loadu_ps(__addr_lo));
4994}
4995
4996/// Loads two 128-bit floating-point vectors of [2 x double] from
4997///    unaligned memory locations and constructs a 256-bit floating-point vector
4998///    of [4 x double] by concatenating the two 128-bit vectors.
4999///
5000/// \headerfile <x86intrin.h>
5001///
5002/// This intrinsic corresponds to load instructions followed by the
5003///   <c> VINSERTF128 </c> instruction.
5004///
5005/// \param __addr_hi
5006///    A pointer to a 128-bit memory location containing two consecutive
5007///    double-precision floating-point values. These values are to be copied to
5008///    bits[255:128] of the result. The address of the memory location does not
5009///    have to be aligned.
5010/// \param __addr_lo
5011///    A pointer to a 128-bit memory location containing two consecutive
5012///    double-precision floating-point values. These values are to be copied to
5013///    bits[127:0] of the result. The address of the memory location does not
5014///    have to be aligned.
5015/// \returns A 256-bit floating-point vector of [4 x double] containing the
5016///    concatenated result.
5017static __inline __m256d __DEFAULT_FN_ATTRS
5018_mm256_loadu2_m128d(double const *__addr_hi, double const *__addr_lo)
5019{
5020  return _mm256_set_m128d(_mm_loadu_pd(__addr_hi), _mm_loadu_pd(__addr_lo));
5021}
5022
5023/// Loads two 128-bit integer vectors from unaligned memory locations and
5024///    constructs a 256-bit integer vector by concatenating the two 128-bit
5025///    vectors.
5026///
5027/// \headerfile <x86intrin.h>
5028///
5029/// This intrinsic corresponds to load instructions followed by the
5030///   <c> VINSERTF128 </c> instruction.
5031///
5032/// \param __addr_hi
5033///    A pointer to a 128-bit memory location containing a 128-bit integer
5034///    vector. This vector is to be copied to bits[255:128] of the result. The
5035///    address of the memory location does not have to be aligned.
5036/// \param __addr_lo
5037///    A pointer to a 128-bit memory location containing a 128-bit integer
5038///    vector. This vector is to be copied to bits[127:0] of the result. The
5039///    address of the memory location does not have to be aligned.
5040/// \returns A 256-bit integer vector containing the concatenated result.
5041static __inline __m256i __DEFAULT_FN_ATTRS
5042_mm256_loadu2_m128i(__m128i_u const *__addr_hi, __m128i_u const *__addr_lo)
5043{
5044   return _mm256_set_m128i(_mm_loadu_si128(__addr_hi), _mm_loadu_si128(__addr_lo));
5045}
5046
5047/* SIMD store ops (unaligned) */
5048/// Stores the upper and lower 128 bits of a 256-bit floating-point
5049///    vector of [8 x float] into two different unaligned memory locations.
5050///
5051/// \headerfile <x86intrin.h>
5052///
5053/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
5054///   store instructions.
5055///
5056/// \param __addr_hi
5057///    A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
5058///    copied to this memory location. The address of this memory location does
5059///    not have to be aligned.
5060/// \param __addr_lo
5061///    A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
5062///    copied to this memory location. The address of this memory location does
5063///    not have to be aligned.
5064/// \param __a
5065///    A 256-bit floating-point vector of [8 x float].
5066static __inline void __DEFAULT_FN_ATTRS
5067_mm256_storeu2_m128(float *__addr_hi, float *__addr_lo, __m256 __a)
5068{
5069  __m128 __v128;
5070
5071  __v128 = _mm256_castps256_ps128(__a);
5072  _mm_storeu_ps(__addr_lo, __v128);
5073  __v128 = _mm256_extractf128_ps(__a, 1);
5074  _mm_storeu_ps(__addr_hi, __v128);
5075}
5076
5077/// Stores the upper and lower 128 bits of a 256-bit floating-point
5078///    vector of [4 x double] into two different unaligned memory locations.
5079///
5080/// \headerfile <x86intrin.h>
5081///
5082/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
5083///   store instructions.
5084///
5085/// \param __addr_hi
5086///    A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
5087///    copied to this memory location. The address of this memory location does
5088///    not have to be aligned.
5089/// \param __addr_lo
5090///    A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
5091///    copied to this memory location. The address of this memory location does
5092///    not have to be aligned.
5093/// \param __a
5094///    A 256-bit floating-point vector of [4 x double].
5095static __inline void __DEFAULT_FN_ATTRS
5096_mm256_storeu2_m128d(double *__addr_hi, double *__addr_lo, __m256d __a)
5097{
5098  __m128d __v128;
5099
5100  __v128 = _mm256_castpd256_pd128(__a);
5101  _mm_storeu_pd(__addr_lo, __v128);
5102  __v128 = _mm256_extractf128_pd(__a, 1);
5103  _mm_storeu_pd(__addr_hi, __v128);
5104}
5105
5106/// Stores the upper and lower 128 bits of a 256-bit integer vector into
5107///    two different unaligned memory locations.
5108///
5109/// \headerfile <x86intrin.h>
5110///
5111/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
5112///   store instructions.
5113///
5114/// \param __addr_hi
5115///    A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
5116///    copied to this memory location. The address of this memory location does
5117///    not have to be aligned.
5118/// \param __addr_lo
5119///    A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
5120///    copied to this memory location. The address of this memory location does
5121///    not have to be aligned.
5122/// \param __a
5123///    A 256-bit integer vector.
5124static __inline void __DEFAULT_FN_ATTRS
5125_mm256_storeu2_m128i(__m128i_u *__addr_hi, __m128i_u *__addr_lo, __m256i __a)
5126{
5127  __m128i __v128;
5128
5129  __v128 = _mm256_castsi256_si128(__a);
5130  _mm_storeu_si128(__addr_lo, __v128);
5131  __v128 = _mm256_extractf128_si256(__a, 1);
5132  _mm_storeu_si128(__addr_hi, __v128);
5133}
5134
/* Remove the function-attribute helper macros now that no further
   definitions follow in this header. */
#undef __DEFAULT_FN_ATTRS128_CONSTEXPR
#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS_CONSTEXPR
#undef __DEFAULT_FN_ATTRS
5139
5140#endif /* __AVXINTRIN_H */