master
   1/*===---- emmintrin.h - SSE2 intrinsics ------------------------------------===
   2 *
   3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4 * See https://llvm.org/LICENSE.txt for license information.
   5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6 *
   7 *===-----------------------------------------------------------------------===
   8 */
   9
  10#ifndef __EMMINTRIN_H
  11#define __EMMINTRIN_H
  12
/* Stop compilation with an error unless the target defines __i386__ or
 * __x86_64__. */
#if !defined(__i386__) && !defined(__x86_64__)
#error "This header is only meant to be used on x86 and x64 architecture"
#endif
  16
  17#include <xmmintrin.h>
  18
/* 16-byte vector types of double and long long, declared with 16-byte
 * alignment. */
typedef double __m128d __attribute__((__vector_size__(16), __aligned__(16)));
typedef long long __m128i __attribute__((__vector_size__(16), __aligned__(16)));

/* Same layouts as __m128d and __m128i, but declared with 1-byte alignment. */
typedef double __m128d_u __attribute__((__vector_size__(16), __aligned__(1)));
typedef long long __m128i_u
    __attribute__((__vector_size__(16), __aligned__(1)));

/* Type defines: 16-byte vectors distinguished by their element type. The
 * functions in this header cast their operands to these before operating on
 * them. */
typedef double __v2df __attribute__((__vector_size__(16)));
typedef long long __v2di __attribute__((__vector_size__(16)));
typedef short __v8hi __attribute__((__vector_size__(16)));
typedef char __v16qi __attribute__((__vector_size__(16)));

/* Unsigned types */
typedef unsigned long long __v2du __attribute__((__vector_size__(16)));
typedef unsigned short __v8hu __attribute__((__vector_size__(16)));
typedef unsigned char __v16qu __attribute__((__vector_size__(16)));

/* We need an explicitly signed variant for char. Note that this shouldn't
 * appear in the interface though. */
typedef signed char __v16qs __attribute__((__vector_size__(16)));

#ifdef __SSE2__
/* Both _Float16 and __bf16 require SSE2 being enabled. */
/* 16-byte vectors of _Float16; the _u variant is declared with 1-byte
 * alignment. */
typedef _Float16 __v8hf __attribute__((__vector_size__(16), __aligned__(16)));
typedef _Float16 __m128h __attribute__((__vector_size__(16), __aligned__(16)));
typedef _Float16 __m128h_u __attribute__((__vector_size__(16), __aligned__(1)));

/* 16-byte vectors of __bf16. */
typedef __bf16 __v8bf __attribute__((__vector_size__(16), __aligned__(16)));
typedef __bf16 __m128bh __attribute__((__vector_size__(16), __aligned__(16)));
#endif
  50
/* Define the default attributes for the functions in this file.
 *
 * Every function is always-inlined, carries no debug info, targets "sse2" and
 * declares a minimum vector width of 128. When __EVEX512__ is defined and
 * __AVX10_1_512__ is not, "no-evex512" is appended to the target string. */
#if defined(__EVEX512__) && !defined(__AVX10_1_512__)
#define __DEFAULT_FN_ATTRS                                                     \
  __attribute__((__always_inline__, __nodebug__,                               \
                 __target__("sse2,no-evex512"), __min_vector_width__(128)))
#else
#define __DEFAULT_FN_ATTRS                                                     \
  __attribute__((__always_inline__, __nodebug__, __target__("sse2"),           \
                 __min_vector_width__(128)))
#endif
  61
/* Same as __DEFAULT_FN_ATTRS, with `constexpr` appended when the header is
 * compiled as C++11 or newer. In every other mode the two macros expand
 * identically. */
#if defined(__cplusplus) && (__cplusplus >= 201103L)
#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS constexpr
#else
#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS
#endif
  67
  68#define __trunc64(x)                                                           \
  69  (__m64) __builtin_shufflevector((__v2di)(x), __extension__(__v2di){}, 0)
  70#define __anyext128(x)                                                         \
  71  (__m128i) __builtin_shufflevector((__v2si)(x), __extension__(__v2si){}, 0,   \
  72                                    1, -1, -1)
  73
  74/// Adds lower double-precision values in both operands and returns the
  75///    sum in the lower 64 bits of the result. The upper 64 bits of the result
  76///    are copied from the upper double-precision value of the first operand.
  77///
  78/// \headerfile <x86intrin.h>
  79///
  80/// This intrinsic corresponds to the <c> VADDSD / ADDSD </c> instruction.
  81///
  82/// \param __a
  83///    A 128-bit vector of [2 x double] containing one of the source operands.
  84/// \param __b
  85///    A 128-bit vector of [2 x double] containing one of the source operands.
  86/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
  87///    sum of the lower 64 bits of both operands. The upper 64 bits are copied
  88///    from the upper 64 bits of the first source operand.
  89static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_add_sd(__m128d __a,
  90                                                                  __m128d __b) {
  91  __a[0] += __b[0];
  92  return __a;
  93}
  94
  95/// Adds two 128-bit vectors of [2 x double].
  96///
  97/// \headerfile <x86intrin.h>
  98///
  99/// This intrinsic corresponds to the <c> VADDPD / ADDPD </c> instruction.
 100///
 101/// \param __a
 102///    A 128-bit vector of [2 x double] containing one of the source operands.
 103/// \param __b
 104///    A 128-bit vector of [2 x double] containing one of the source operands.
 105/// \returns A 128-bit vector of [2 x double] containing the sums of both
 106///    operands.
 107static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_add_pd(__m128d __a,
 108                                                                  __m128d __b) {
 109  return (__m128d)((__v2df)__a + (__v2df)__b);
 110}
 111
 112/// Subtracts the lower double-precision value of the second operand
 113///    from the lower double-precision value of the first operand and returns
 114///    the difference in the lower 64 bits of the result. The upper 64 bits of
 115///    the result are copied from the upper double-precision value of the first
 116///    operand.
 117///
 118/// \headerfile <x86intrin.h>
 119///
 120/// This intrinsic corresponds to the <c> VSUBSD / SUBSD </c> instruction.
 121///
 122/// \param __a
 123///    A 128-bit vector of [2 x double] containing the minuend.
 124/// \param __b
 125///    A 128-bit vector of [2 x double] containing the subtrahend.
 126/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
 127///    difference of the lower 64 bits of both operands. The upper 64 bits are
 128///    copied from the upper 64 bits of the first source operand.
 129static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_sub_sd(__m128d __a,
 130                                                                  __m128d __b) {
 131  __a[0] -= __b[0];
 132  return __a;
 133}
 134
 135/// Subtracts two 128-bit vectors of [2 x double].
 136///
 137/// \headerfile <x86intrin.h>
 138///
 139/// This intrinsic corresponds to the <c> VSUBPD / SUBPD </c> instruction.
 140///
 141/// \param __a
 142///    A 128-bit vector of [2 x double] containing the minuend.
 143/// \param __b
 144///    A 128-bit vector of [2 x double] containing the subtrahend.
 145/// \returns A 128-bit vector of [2 x double] containing the differences between
 146///    both operands.
 147static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_sub_pd(__m128d __a,
 148                                                                  __m128d __b) {
 149  return (__m128d)((__v2df)__a - (__v2df)__b);
 150}
 151
 152/// Multiplies lower double-precision values in both operands and returns
 153///    the product in the lower 64 bits of the result. The upper 64 bits of the
 154///    result are copied from the upper double-precision value of the first
 155///    operand.
 156///
 157/// \headerfile <x86intrin.h>
 158///
 159/// This intrinsic corresponds to the <c> VMULSD / MULSD </c> instruction.
 160///
 161/// \param __a
 162///    A 128-bit vector of [2 x double] containing one of the source operands.
 163/// \param __b
 164///    A 128-bit vector of [2 x double] containing one of the source operands.
 165/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
 166///    product of the lower 64 bits of both operands. The upper 64 bits are
 167///    copied from the upper 64 bits of the first source operand.
 168static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_mul_sd(__m128d __a,
 169                                                                  __m128d __b) {
 170  __a[0] *= __b[0];
 171  return __a;
 172}
 173
 174/// Multiplies two 128-bit vectors of [2 x double].
 175///
 176/// \headerfile <x86intrin.h>
 177///
 178/// This intrinsic corresponds to the <c> VMULPD / MULPD </c> instruction.
 179///
 180/// \param __a
 181///    A 128-bit vector of [2 x double] containing one of the operands.
 182/// \param __b
 183///    A 128-bit vector of [2 x double] containing one of the operands.
 184/// \returns A 128-bit vector of [2 x double] containing the products of both
 185///    operands.
 186static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_mul_pd(__m128d __a,
 187                                                                  __m128d __b) {
 188  return (__m128d)((__v2df)__a * (__v2df)__b);
 189}
 190
 191/// Divides the lower double-precision value of the first operand by the
 192///    lower double-precision value of the second operand and returns the
 193///    quotient in the lower 64 bits of the result. The upper 64 bits of the
 194///    result are copied from the upper double-precision value of the first
 195///    operand.
 196///
 197/// \headerfile <x86intrin.h>
 198///
 199/// This intrinsic corresponds to the <c> VDIVSD / DIVSD </c> instruction.
 200///
 201/// \param __a
 202///    A 128-bit vector of [2 x double] containing the dividend.
 203/// \param __b
 204///    A 128-bit vector of [2 x double] containing divisor.
 205/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
 206///    quotient of the lower 64 bits of both operands. The upper 64 bits are
 207///    copied from the upper 64 bits of the first source operand.
 208static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_div_sd(__m128d __a,
 209                                                                  __m128d __b) {
 210  __a[0] /= __b[0];
 211  return __a;
 212}
 213
 214/// Performs an element-by-element division of two 128-bit vectors of
 215///    [2 x double].
 216///
 217/// \headerfile <x86intrin.h>
 218///
 219/// This intrinsic corresponds to the <c> VDIVPD / DIVPD </c> instruction.
 220///
 221/// \param __a
 222///    A 128-bit vector of [2 x double] containing the dividend.
 223/// \param __b
 224///    A 128-bit vector of [2 x double] containing the divisor.
 225/// \returns A 128-bit vector of [2 x double] containing the quotients of both
 226///    operands.
 227static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_div_pd(__m128d __a,
 228                                                                  __m128d __b) {
 229  return (__m128d)((__v2df)__a / (__v2df)__b);
 230}
 231
 232/// Calculates the square root of the lower double-precision value of
 233///    the second operand and returns it in the lower 64 bits of the result.
 234///    The upper 64 bits of the result are copied from the upper
 235///    double-precision value of the first operand.
 236///
 237/// \headerfile <x86intrin.h>
 238///
 239/// This intrinsic corresponds to the <c> VSQRTSD / SQRTSD </c> instruction.
 240///
 241/// \param __a
 242///    A 128-bit vector of [2 x double] containing one of the operands. The
 243///    upper 64 bits of this operand are copied to the upper 64 bits of the
 244///    result.
 245/// \param __b
 246///    A 128-bit vector of [2 x double] containing one of the operands. The
 247///    square root is calculated using the lower 64 bits of this operand.
 248/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
 249///    square root of the lower 64 bits of operand \a __b, and whose upper 64
 250///    bits are copied from the upper 64 bits of operand \a __a.
 251static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_sd(__m128d __a,
 252                                                         __m128d __b) {
 253  __m128d __c = __builtin_ia32_sqrtsd((__v2df)__b);
 254  return __extension__(__m128d){__c[0], __a[1]};
 255}
 256
 257/// Calculates the square root of the each of two values stored in a
 258///    128-bit vector of [2 x double].
 259///
 260/// \headerfile <x86intrin.h>
 261///
 262/// This intrinsic corresponds to the <c> VSQRTPD / SQRTPD </c> instruction.
 263///
 264/// \param __a
 265///    A 128-bit vector of [2 x double].
 266/// \returns A 128-bit vector of [2 x double] containing the square roots of the
 267///    values in the operand.
 268static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_pd(__m128d __a) {
 269  return __builtin_ia32_sqrtpd((__v2df)__a);
 270}
 271
 272/// Compares lower 64-bit double-precision values of both operands, and
 273///    returns the lesser of the pair of values in the lower 64-bits of the
 274///    result. The upper 64 bits of the result are copied from the upper
 275///    double-precision value of the first operand.
 276///
 277///    If either value in a comparison is NaN, returns the value from \a __b.
 278///
 279/// \headerfile <x86intrin.h>
 280///
 281/// This intrinsic corresponds to the <c> VMINSD / MINSD </c> instruction.
 282///
 283/// \param __a
 284///    A 128-bit vector of [2 x double] containing one of the operands. The
 285///    lower 64 bits of this operand are used in the comparison.
 286/// \param __b
 287///    A 128-bit vector of [2 x double] containing one of the operands. The
 288///    lower 64 bits of this operand are used in the comparison.
 289/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
 290///    minimum value between both operands. The upper 64 bits are copied from
 291///    the upper 64 bits of the first source operand.
 292static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_min_sd(__m128d __a,
 293                                                        __m128d __b) {
 294  return __builtin_ia32_minsd((__v2df)__a, (__v2df)__b);
 295}
 296
 297/// Performs element-by-element comparison of the two 128-bit vectors of
 298///    [2 x double] and returns a vector containing the lesser of each pair of
 299///    values.
 300///
 301///    If either value in a comparison is NaN, returns the value from \a __b.
 302///
 303/// \headerfile <x86intrin.h>
 304///
 305/// This intrinsic corresponds to the <c> VMINPD / MINPD </c> instruction.
 306///
 307/// \param __a
 308///    A 128-bit vector of [2 x double] containing one of the operands.
 309/// \param __b
 310///    A 128-bit vector of [2 x double] containing one of the operands.
 311/// \returns A 128-bit vector of [2 x double] containing the minimum values
 312///    between both operands.
 313static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_min_pd(__m128d __a,
 314                                                        __m128d __b) {
 315  return __builtin_ia32_minpd((__v2df)__a, (__v2df)__b);
 316}
 317
 318/// Compares lower 64-bit double-precision values of both operands, and
 319///    returns the greater of the pair of values in the lower 64-bits of the
 320///    result. The upper 64 bits of the result are copied from the upper
 321///    double-precision value of the first operand.
 322///
 323///    If either value in a comparison is NaN, returns the value from \a __b.
 324///
 325/// \headerfile <x86intrin.h>
 326///
 327/// This intrinsic corresponds to the <c> VMAXSD / MAXSD </c> instruction.
 328///
 329/// \param __a
 330///    A 128-bit vector of [2 x double] containing one of the operands. The
 331///    lower 64 bits of this operand are used in the comparison.
 332/// \param __b
 333///    A 128-bit vector of [2 x double] containing one of the operands. The
 334///    lower 64 bits of this operand are used in the comparison.
 335/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
 336///    maximum value between both operands. The upper 64 bits are copied from
 337///    the upper 64 bits of the first source operand.
 338static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_max_sd(__m128d __a,
 339                                                        __m128d __b) {
 340  return __builtin_ia32_maxsd((__v2df)__a, (__v2df)__b);
 341}
 342
 343/// Performs element-by-element comparison of the two 128-bit vectors of
 344///    [2 x double] and returns a vector containing the greater of each pair
 345///    of values.
 346///
 347///    If either value in a comparison is NaN, returns the value from \a __b.
 348///
 349/// \headerfile <x86intrin.h>
 350///
 351/// This intrinsic corresponds to the <c> VMAXPD / MAXPD </c> instruction.
 352///
 353/// \param __a
 354///    A 128-bit vector of [2 x double] containing one of the operands.
 355/// \param __b
 356///    A 128-bit vector of [2 x double] containing one of the operands.
 357/// \returns A 128-bit vector of [2 x double] containing the maximum values
 358///    between both operands.
 359static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_max_pd(__m128d __a,
 360                                                        __m128d __b) {
 361  return __builtin_ia32_maxpd((__v2df)__a, (__v2df)__b);
 362}
 363
 364/// Performs a bitwise AND of two 128-bit vectors of [2 x double].
 365///
 366/// \headerfile <x86intrin.h>
 367///
 368/// This intrinsic corresponds to the <c> VPAND / PAND </c> instruction.
 369///
 370/// \param __a
 371///    A 128-bit vector of [2 x double] containing one of the source operands.
 372/// \param __b
 373///    A 128-bit vector of [2 x double] containing one of the source operands.
 374/// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the
 375///    values between both operands.
 376static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_and_pd(__m128d __a,
 377                                                                  __m128d __b) {
 378  return (__m128d)((__v2du)__a & (__v2du)__b);
 379}
 380
 381/// Performs a bitwise AND of two 128-bit vectors of [2 x double], using
 382///    the one's complement of the values contained in the first source operand.
 383///
 384/// \headerfile <x86intrin.h>
 385///
 386/// This intrinsic corresponds to the <c> VPANDN / PANDN </c> instruction.
 387///
 388/// \param __a
 389///    A 128-bit vector of [2 x double] containing the left source operand. The
 390///    one's complement of this value is used in the bitwise AND.
 391/// \param __b
 392///    A 128-bit vector of [2 x double] containing the right source operand.
 393/// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the
 394///    values in the second operand and the one's complement of the first
 395///    operand.
 396static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR
 397_mm_andnot_pd(__m128d __a, __m128d __b) {
 398  return (__m128d)(~(__v2du)__a & (__v2du)__b);
 399}
 400
 401/// Performs a bitwise OR of two 128-bit vectors of [2 x double].
 402///
 403/// \headerfile <x86intrin.h>
 404///
 405/// This intrinsic corresponds to the <c> VPOR / POR </c> instruction.
 406///
 407/// \param __a
 408///    A 128-bit vector of [2 x double] containing one of the source operands.
 409/// \param __b
 410///    A 128-bit vector of [2 x double] containing one of the source operands.
 411/// \returns A 128-bit vector of [2 x double] containing the bitwise OR of the
 412///    values between both operands.
 413static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_or_pd(__m128d __a,
 414                                                                 __m128d __b) {
 415  return (__m128d)((__v2du)__a | (__v2du)__b);
 416}
 417
 418/// Performs a bitwise XOR of two 128-bit vectors of [2 x double].
 419///
 420/// \headerfile <x86intrin.h>
 421///
 422/// This intrinsic corresponds to the <c> VPXOR / PXOR </c> instruction.
 423///
 424/// \param __a
 425///    A 128-bit vector of [2 x double] containing one of the source operands.
 426/// \param __b
 427///    A 128-bit vector of [2 x double] containing one of the source operands.
 428/// \returns A 128-bit vector of [2 x double] containing the bitwise XOR of the
 429///    values between both operands.
 430static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_xor_pd(__m128d __a,
 431                                                                  __m128d __b) {
 432  return (__m128d)((__v2du)__a ^ (__v2du)__b);
 433}
 434
 435/// Compares each of the corresponding double-precision values of the
 436///    128-bit vectors of [2 x double] for equality.
 437///
 438///    Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
 439///    If either value in a comparison is NaN, returns false.
 440///
 441/// \headerfile <x86intrin.h>
 442///
 443/// This intrinsic corresponds to the <c> VCMPEQPD / CMPEQPD </c> instruction.
 444///
 445/// \param __a
 446///    A 128-bit vector of [2 x double].
 447/// \param __b
 448///    A 128-bit vector of [2 x double].
 449/// \returns A 128-bit vector containing the comparison results.
 450static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpeq_pd(__m128d __a,
 451                                                          __m128d __b) {
 452  return (__m128d)__builtin_ia32_cmpeqpd((__v2df)__a, (__v2df)__b);
 453}
 454
 455/// Compares each of the corresponding double-precision values of the
 456///    128-bit vectors of [2 x double] to determine if the values in the first
 457///    operand are less than those in the second operand.
 458///
 459///    Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
 460///    If either value in a comparison is NaN, returns false.
 461///
 462/// \headerfile <x86intrin.h>
 463///
 464/// This intrinsic corresponds to the <c> VCMPLTPD / CMPLTPD </c> instruction.
 465///
 466/// \param __a
 467///    A 128-bit vector of [2 x double].
 468/// \param __b
 469///    A 128-bit vector of [2 x double].
 470/// \returns A 128-bit vector containing the comparison results.
 471static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmplt_pd(__m128d __a,
 472                                                          __m128d __b) {
 473  return (__m128d)__builtin_ia32_cmpltpd((__v2df)__a, (__v2df)__b);
 474}
 475
 476/// Compares each of the corresponding double-precision values of the
 477///    128-bit vectors of [2 x double] to determine if the values in the first
 478///    operand are less than or equal to those in the second operand.
 479///
 480///    Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
 481///    If either value in a comparison is NaN, returns false.
 482///
 483/// \headerfile <x86intrin.h>
 484///
 485/// This intrinsic corresponds to the <c> VCMPLEPD / CMPLEPD </c> instruction.
 486///
 487/// \param __a
 488///    A 128-bit vector of [2 x double].
 489/// \param __b
 490///    A 128-bit vector of [2 x double].
 491/// \returns A 128-bit vector containing the comparison results.
 492static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmple_pd(__m128d __a,
 493                                                          __m128d __b) {
 494  return (__m128d)__builtin_ia32_cmplepd((__v2df)__a, (__v2df)__b);
 495}
 496
 497/// Compares each of the corresponding double-precision values of the
 498///    128-bit vectors of [2 x double] to determine if the values in the first
 499///    operand are greater than those in the second operand.
 500///
 501///    Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
 502///    If either value in a comparison is NaN, returns false.
 503///
 504/// \headerfile <x86intrin.h>
 505///
 506/// This intrinsic corresponds to the <c> VCMPLTPD / CMPLTPD </c> instruction.
 507///
 508/// \param __a
 509///    A 128-bit vector of [2 x double].
 510/// \param __b
 511///    A 128-bit vector of [2 x double].
 512/// \returns A 128-bit vector containing the comparison results.
 513static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpgt_pd(__m128d __a,
 514                                                          __m128d __b) {
 515  return (__m128d)__builtin_ia32_cmpltpd((__v2df)__b, (__v2df)__a);
 516}
 517
 518/// Compares each of the corresponding double-precision values of the
 519///    128-bit vectors of [2 x double] to determine if the values in the first
 520///    operand are greater than or equal to those in the second operand.
 521///
 522///    Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
 523///    If either value in a comparison is NaN, returns false.
 524///
 525/// \headerfile <x86intrin.h>
 526///
 527/// This intrinsic corresponds to the <c> VCMPLEPD / CMPLEPD </c> instruction.
 528///
 529/// \param __a
 530///    A 128-bit vector of [2 x double].
 531/// \param __b
 532///    A 128-bit vector of [2 x double].
 533/// \returns A 128-bit vector containing the comparison results.
 534static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpge_pd(__m128d __a,
 535                                                          __m128d __b) {
 536  return (__m128d)__builtin_ia32_cmplepd((__v2df)__b, (__v2df)__a);
 537}
 538
 539/// Compares each of the corresponding double-precision values of the
 540///    128-bit vectors of [2 x double] to determine if the values in the first
 541///    operand are ordered with respect to those in the second operand.
 542///
 543///    A pair of double-precision values are ordered with respect to each
 544///    other if neither value is a NaN. Each comparison returns 0x0 for false,
 545///    0xFFFFFFFFFFFFFFFF for true.
 546///
 547/// \headerfile <x86intrin.h>
 548///
 549/// This intrinsic corresponds to the <c> VCMPORDPD / CMPORDPD </c> instruction.
 550///
 551/// \param __a
 552///    A 128-bit vector of [2 x double].
 553/// \param __b
 554///    A 128-bit vector of [2 x double].
 555/// \returns A 128-bit vector containing the comparison results.
 556static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpord_pd(__m128d __a,
 557                                                           __m128d __b) {
 558  return (__m128d)__builtin_ia32_cmpordpd((__v2df)__a, (__v2df)__b);
 559}
 560
 561/// Compares each of the corresponding double-precision values of the
 562///    128-bit vectors of [2 x double] to determine if the values in the first
 563///    operand are unordered with respect to those in the second operand.
 564///
 565///    A pair of double-precision values are unordered with respect to each
 566///    other if one or both values are NaN. Each comparison returns 0x0 for
 567///    false, 0xFFFFFFFFFFFFFFFF for true.
 568///
 569/// \headerfile <x86intrin.h>
 570///
 571/// This intrinsic corresponds to the <c> VCMPUNORDPD / CMPUNORDPD </c>
 572///   instruction.
 573///
 574/// \param __a
 575///    A 128-bit vector of [2 x double].
 576/// \param __b
 577///    A 128-bit vector of [2 x double].
 578/// \returns A 128-bit vector containing the comparison results.
 579static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpunord_pd(__m128d __a,
 580                                                             __m128d __b) {
 581  return (__m128d)__builtin_ia32_cmpunordpd((__v2df)__a, (__v2df)__b);
 582}
 583
 584/// Compares each of the corresponding double-precision values of the
 585///    128-bit vectors of [2 x double] to determine if the values in the first
 586///    operand are unequal to those in the second operand.
 587///
 588///    Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
 589///    If either value in a comparison is NaN, returns true.
 590///
 591/// \headerfile <x86intrin.h>
 592///
 593/// This intrinsic corresponds to the <c> VCMPNEQPD / CMPNEQPD </c> instruction.
 594///
 595/// \param __a
 596///    A 128-bit vector of [2 x double].
 597/// \param __b
 598///    A 128-bit vector of [2 x double].
 599/// \returns A 128-bit vector containing the comparison results.
 600static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpneq_pd(__m128d __a,
 601                                                           __m128d __b) {
 602  return (__m128d)__builtin_ia32_cmpneqpd((__v2df)__a, (__v2df)__b);
 603}
 604
 605/// Compares each of the corresponding double-precision values of the
 606///    128-bit vectors of [2 x double] to determine if the values in the first
 607///    operand are not less than those in the second operand.
 608///
 609///    Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
 610///    If either value in a comparison is NaN, returns true.
 611///
 612/// \headerfile <x86intrin.h>
 613///
 614/// This intrinsic corresponds to the <c> VCMPNLTPD / CMPNLTPD </c> instruction.
 615///
 616/// \param __a
 617///    A 128-bit vector of [2 x double].
 618/// \param __b
 619///    A 128-bit vector of [2 x double].
 620/// \returns A 128-bit vector containing the comparison results.
 621static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnlt_pd(__m128d __a,
 622                                                           __m128d __b) {
 623  return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__a, (__v2df)__b);
 624}
 625
 626/// Compares each of the corresponding double-precision values of the
 627///    128-bit vectors of [2 x double] to determine if the values in the first
 628///    operand are not less than or equal to those in the second operand.
 629///
 630///    Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
 631///    If either value in a comparison is NaN, returns true.
 632///
 633/// \headerfile <x86intrin.h>
 634///
 635/// This intrinsic corresponds to the <c> VCMPNLEPD / CMPNLEPD </c> instruction.
 636///
 637/// \param __a
 638///    A 128-bit vector of [2 x double].
 639/// \param __b
 640///    A 128-bit vector of [2 x double].
 641/// \returns A 128-bit vector containing the comparison results.
 642static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnle_pd(__m128d __a,
 643                                                           __m128d __b) {
 644  return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__a, (__v2df)__b);
 645}
 646
 647/// Compares each of the corresponding double-precision values of the
 648///    128-bit vectors of [2 x double] to determine if the values in the first
 649///    operand are not greater than those in the second operand.
 650///
 651///    Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
 652///    If either value in a comparison is NaN, returns true.
 653///
 654/// \headerfile <x86intrin.h>
 655///
 656/// This intrinsic corresponds to the <c> VCMPNLTPD / CMPNLTPD </c> instruction.
 657///
 658/// \param __a
 659///    A 128-bit vector of [2 x double].
 660/// \param __b
 661///    A 128-bit vector of [2 x double].
 662/// \returns A 128-bit vector containing the comparison results.
 663static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpngt_pd(__m128d __a,
 664                                                           __m128d __b) {
 665  return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__b, (__v2df)__a);
 666}
 667
 668/// Compares each of the corresponding double-precision values of the
 669///    128-bit vectors of [2 x double] to determine if the values in the first
 670///    operand are not greater than or equal to those in the second operand.
 671///
 672///    Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
 673///    If either value in a comparison is NaN, returns true.
 674///
 675/// \headerfile <x86intrin.h>
 676///
 677/// This intrinsic corresponds to the <c> VCMPNLEPD / CMPNLEPD </c> instruction.
 678///
 679/// \param __a
 680///    A 128-bit vector of [2 x double].
 681/// \param __b
 682///    A 128-bit vector of [2 x double].
 683/// \returns A 128-bit vector containing the comparison results.
 684static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnge_pd(__m128d __a,
 685                                                           __m128d __b) {
 686  return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__b, (__v2df)__a);
 687}
 688
 689/// Compares the lower double-precision floating-point values in each of
 690///    the two 128-bit floating-point vectors of [2 x double] for equality.
 691///
 692///    The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
 693///    If either value in a comparison is NaN, returns false.
 694///
 695/// \headerfile <x86intrin.h>
 696///
 697/// This intrinsic corresponds to the <c> VCMPEQSD / CMPEQSD </c> instruction.
 698///
 699/// \param __a
 700///    A 128-bit vector of [2 x double]. The lower double-precision value is
 701///    compared to the lower double-precision value of \a __b.
 702/// \param __b
 703///    A 128-bit vector of [2 x double]. The lower double-precision value is
 704///    compared to the lower double-precision value of \a __a.
 705/// \returns A 128-bit vector. The lower 64 bits contains the comparison
 706///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
 707static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpeq_sd(__m128d __a,
 708                                                          __m128d __b) {
 709  return (__m128d)__builtin_ia32_cmpeqsd((__v2df)__a, (__v2df)__b);
 710}
 711
 712/// Compares the lower double-precision floating-point values in each of
 713///    the two 128-bit floating-point vectors of [2 x double] to determine if
 714///    the value in the first parameter is less than the corresponding value in
 715///    the second parameter.
 716///
 717///    The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
 718///    If either value in a comparison is NaN, returns false.
 719///
 720/// \headerfile <x86intrin.h>
 721///
 722/// This intrinsic corresponds to the <c> VCMPLTSD / CMPLTSD </c> instruction.
 723///
 724/// \param __a
 725///    A 128-bit vector of [2 x double]. The lower double-precision value is
 726///    compared to the lower double-precision value of \a __b.
 727/// \param __b
 728///    A 128-bit vector of [2 x double]. The lower double-precision value is
 729///    compared to the lower double-precision value of \a __a.
 730/// \returns A 128-bit vector. The lower 64 bits contains the comparison
 731///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
 732static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmplt_sd(__m128d __a,
 733                                                          __m128d __b) {
 734  return (__m128d)__builtin_ia32_cmpltsd((__v2df)__a, (__v2df)__b);
 735}
 736
 737/// Compares the lower double-precision floating-point values in each of
 738///    the two 128-bit floating-point vectors of [2 x double] to determine if
 739///    the value in the first parameter is less than or equal to the
 740///    corresponding value in the second parameter.
 741///
 742///    The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
 743///    If either value in a comparison is NaN, returns false.
 744///
 745/// \headerfile <x86intrin.h>
 746///
 747/// This intrinsic corresponds to the <c> VCMPLESD / CMPLESD </c> instruction.
 748///
 749/// \param __a
 750///    A 128-bit vector of [2 x double]. The lower double-precision value is
 751///    compared to the lower double-precision value of \a __b.
 752/// \param __b
 753///    A 128-bit vector of [2 x double]. The lower double-precision value is
 754///    compared to the lower double-precision value of \a __a.
 755/// \returns A 128-bit vector. The lower 64 bits contains the comparison
 756///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
 757static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmple_sd(__m128d __a,
 758                                                          __m128d __b) {
 759  return (__m128d)__builtin_ia32_cmplesd((__v2df)__a, (__v2df)__b);
 760}
 761
 762/// Compares the lower double-precision floating-point values in each of
 763///    the two 128-bit floating-point vectors of [2 x double] to determine if
 764///    the value in the first parameter is greater than the corresponding value
 765///    in the second parameter.
 766///
 767///    The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
 768///    If either value in a comparison is NaN, returns false.
 769///
 770/// \headerfile <x86intrin.h>
 771///
 772/// This intrinsic corresponds to the <c> VCMPLTSD / CMPLTSD </c> instruction.
 773///
 774/// \param __a
 775///     A 128-bit vector of [2 x double]. The lower double-precision value is
 776///     compared to the lower double-precision value of \a __b.
 777/// \param __b
 778///     A 128-bit vector of [2 x double]. The lower double-precision value is
 779///     compared to the lower double-precision value of \a __a.
 780/// \returns A 128-bit vector. The lower 64 bits contains the comparison
 781///     results. The upper 64 bits are copied from the upper 64 bits of \a __a.
 782static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpgt_sd(__m128d __a,
 783                                                          __m128d __b) {
 784  __m128d __c = __builtin_ia32_cmpltsd((__v2df)__b, (__v2df)__a);
 785  return __extension__(__m128d){__c[0], __a[1]};
 786}
 787
 788/// Compares the lower double-precision floating-point values in each of
 789///    the two 128-bit floating-point vectors of [2 x double] to determine if
 790///    the value in the first parameter is greater than or equal to the
 791///    corresponding value in the second parameter.
 792///
 793///    The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
 794///    If either value in a comparison is NaN, returns false.
 795///
 796/// \headerfile <x86intrin.h>
 797///
 798/// This intrinsic corresponds to the <c> VCMPLESD / CMPLESD </c> instruction.
 799///
 800/// \param __a
 801///    A 128-bit vector of [2 x double]. The lower double-precision value is
 802///    compared to the lower double-precision value of \a __b.
 803/// \param __b
 804///    A 128-bit vector of [2 x double]. The lower double-precision value is
 805///    compared to the lower double-precision value of \a __a.
 806/// \returns A 128-bit vector. The lower 64 bits contains the comparison
 807///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
 808static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpge_sd(__m128d __a,
 809                                                          __m128d __b) {
 810  __m128d __c = __builtin_ia32_cmplesd((__v2df)__b, (__v2df)__a);
 811  return __extension__(__m128d){__c[0], __a[1]};
 812}
 813
 814/// Compares the lower double-precision floating-point values in each of
 815///    the two 128-bit floating-point vectors of [2 x double] to determine if
 816///    the value in the first parameter is ordered with respect to the
 817///    corresponding value in the second parameter.
 818///
 819///    The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. A pair
 820///    of double-precision values are ordered with respect to each other if
 821///    neither value is a NaN.
 822///
 823/// \headerfile <x86intrin.h>
 824///
 825/// This intrinsic corresponds to the <c> VCMPORDSD / CMPORDSD </c> instruction.
 826///
 827/// \param __a
 828///    A 128-bit vector of [2 x double]. The lower double-precision value is
 829///    compared to the lower double-precision value of \a __b.
 830/// \param __b
 831///    A 128-bit vector of [2 x double]. The lower double-precision value is
 832///    compared to the lower double-precision value of \a __a.
 833/// \returns A 128-bit vector. The lower 64 bits contains the comparison
 834///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
 835static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpord_sd(__m128d __a,
 836                                                           __m128d __b) {
 837  return (__m128d)__builtin_ia32_cmpordsd((__v2df)__a, (__v2df)__b);
 838}
 839
 840/// Compares the lower double-precision floating-point values in each of
 841///    the two 128-bit floating-point vectors of [2 x double] to determine if
 842///    the value in the first parameter is unordered with respect to the
 843///    corresponding value in the second parameter.
 844///
 845///    The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. A pair
 846///    of double-precision values are unordered with respect to each other if
 847///    one or both values are NaN.
 848///
 849/// \headerfile <x86intrin.h>
 850///
 851/// This intrinsic corresponds to the <c> VCMPUNORDSD / CMPUNORDSD </c>
 852///   instruction.
 853///
 854/// \param __a
 855///    A 128-bit vector of [2 x double]. The lower double-precision value is
 856///    compared to the lower double-precision value of \a __b.
 857/// \param __b
 858///    A 128-bit vector of [2 x double]. The lower double-precision value is
 859///    compared to the lower double-precision value of \a __a.
 860/// \returns A 128-bit vector. The lower 64 bits contains the comparison
 861///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
 862static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpunord_sd(__m128d __a,
 863                                                             __m128d __b) {
 864  return (__m128d)__builtin_ia32_cmpunordsd((__v2df)__a, (__v2df)__b);
 865}
 866
 867/// Compares the lower double-precision floating-point values in each of
 868///    the two 128-bit floating-point vectors of [2 x double] to determine if
 869///    the value in the first parameter is unequal to the corresponding value in
 870///    the second parameter.
 871///
 872///    The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
 873///    If either value in a comparison is NaN, returns true.
 874///
 875/// \headerfile <x86intrin.h>
 876///
 877/// This intrinsic corresponds to the <c> VCMPNEQSD / CMPNEQSD </c> instruction.
 878///
 879/// \param __a
 880///    A 128-bit vector of [2 x double]. The lower double-precision value is
 881///    compared to the lower double-precision value of \a __b.
 882/// \param __b
 883///    A 128-bit vector of [2 x double]. The lower double-precision value is
 884///    compared to the lower double-precision value of \a __a.
 885/// \returns A 128-bit vector. The lower 64 bits contains the comparison
 886///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
 887static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpneq_sd(__m128d __a,
 888                                                           __m128d __b) {
 889  return (__m128d)__builtin_ia32_cmpneqsd((__v2df)__a, (__v2df)__b);
 890}
 891
 892/// Compares the lower double-precision floating-point values in each of
 893///    the two 128-bit floating-point vectors of [2 x double] to determine if
 894///    the value in the first parameter is not less than the corresponding
 895///    value in the second parameter.
 896///
 897///    The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
 898///    If either value in a comparison is NaN, returns true.
 899///
 900/// \headerfile <x86intrin.h>
 901///
 902/// This intrinsic corresponds to the <c> VCMPNLTSD / CMPNLTSD </c> instruction.
 903///
 904/// \param __a
 905///    A 128-bit vector of [2 x double]. The lower double-precision value is
 906///    compared to the lower double-precision value of \a __b.
 907/// \param __b
 908///    A 128-bit vector of [2 x double]. The lower double-precision value is
 909///    compared to the lower double-precision value of \a __a.
 910/// \returns A 128-bit vector. The lower 64 bits contains the comparison
 911///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
 912static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnlt_sd(__m128d __a,
 913                                                           __m128d __b) {
 914  return (__m128d)__builtin_ia32_cmpnltsd((__v2df)__a, (__v2df)__b);
 915}
 916
 917/// Compares the lower double-precision floating-point values in each of
 918///    the two 128-bit floating-point vectors of [2 x double] to determine if
 919///    the value in the first parameter is not less than or equal to the
 920///    corresponding value in the second parameter.
 921///
 922///    The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
 923///    If either value in a comparison is NaN, returns true.
 924///
 925/// \headerfile <x86intrin.h>
 926///
 927/// This intrinsic corresponds to the <c> VCMPNLESD / CMPNLESD </c> instruction.
 928///
 929/// \param __a
 930///    A 128-bit vector of [2 x double]. The lower double-precision value is
 931///    compared to the lower double-precision value of \a __b.
 932/// \param __b
 933///    A 128-bit vector of [2 x double]. The lower double-precision value is
 934///    compared to the lower double-precision value of \a __a.
 935/// \returns  A 128-bit vector. The lower 64 bits contains the comparison
 936///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
 937static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnle_sd(__m128d __a,
 938                                                           __m128d __b) {
 939  return (__m128d)__builtin_ia32_cmpnlesd((__v2df)__a, (__v2df)__b);
 940}
 941
 942/// Compares the lower double-precision floating-point values in each of
 943///    the two 128-bit floating-point vectors of [2 x double] to determine if
 944///    the value in the first parameter is not greater than the corresponding
 945///    value in the second parameter.
 946///
 947///    The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
 948///    If either value in a comparison is NaN, returns true.
 949///
 950/// \headerfile <x86intrin.h>
 951///
 952/// This intrinsic corresponds to the <c> VCMPNLTSD / CMPNLTSD </c> instruction.
 953///
 954/// \param __a
 955///    A 128-bit vector of [2 x double]. The lower double-precision value is
 956///    compared to the lower double-precision value of \a __b.
 957/// \param __b
 958///    A 128-bit vector of [2 x double]. The lower double-precision value is
 959///    compared to the lower double-precision value of \a __a.
 960/// \returns A 128-bit vector. The lower 64 bits contains the comparison
 961///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
 962static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpngt_sd(__m128d __a,
 963                                                           __m128d __b) {
 964  __m128d __c = __builtin_ia32_cmpnltsd((__v2df)__b, (__v2df)__a);
 965  return __extension__(__m128d){__c[0], __a[1]};
 966}
 967
 968/// Compares the lower double-precision floating-point values in each of
 969///    the two 128-bit floating-point vectors of [2 x double] to determine if
 970///    the value in the first parameter is not greater than or equal to the
 971///    corresponding value in the second parameter.
 972///
 973///    The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
 974///    If either value in a comparison is NaN, returns true.
 975///
 976/// \headerfile <x86intrin.h>
 977///
 978/// This intrinsic corresponds to the <c> VCMPNLESD / CMPNLESD </c> instruction.
 979///
 980/// \param __a
 981///    A 128-bit vector of [2 x double]. The lower double-precision value is
 982///    compared to the lower double-precision value of \a __b.
 983/// \param __b
 984///    A 128-bit vector of [2 x double]. The lower double-precision value is
 985///    compared to the lower double-precision value of \a __a.
 986/// \returns A 128-bit vector. The lower 64 bits contains the comparison
 987///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
 988static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnge_sd(__m128d __a,
 989                                                           __m128d __b) {
 990  __m128d __c = __builtin_ia32_cmpnlesd((__v2df)__b, (__v2df)__a);
 991  return __extension__(__m128d){__c[0], __a[1]};
 992}
 993
 994/// Compares the lower double-precision floating-point values in each of
 995///    the two 128-bit floating-point vectors of [2 x double] for equality.
 996///
 997///    The comparison returns 0 for false, 1 for true. If either value in a
 998///    comparison is NaN, returns 0.
 999///
1000/// \headerfile <x86intrin.h>
1001///
1002/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1003///
1004/// \param __a
1005///    A 128-bit vector of [2 x double]. The lower double-precision value is
1006///    compared to the lower double-precision value of \a __b.
1007/// \param __b
1008///    A 128-bit vector of [2 x double]. The lower double-precision value is
1009///    compared to the lower double-precision value of \a __a.
1010/// \returns An integer containing the comparison results.
1011static __inline__ int __DEFAULT_FN_ATTRS _mm_comieq_sd(__m128d __a,
1012                                                       __m128d __b) {
1013  return __builtin_ia32_comisdeq((__v2df)__a, (__v2df)__b);
1014}
1015
1016/// Compares the lower double-precision floating-point values in each of
1017///    the two 128-bit floating-point vectors of [2 x double] to determine if
1018///    the value in the first parameter is less than the corresponding value in
1019///    the second parameter.
1020///
1021///    The comparison returns 0 for false, 1 for true. If either value in a
1022///    comparison is NaN, returns 0.
1023///
1024/// \headerfile <x86intrin.h>
1025///
1026/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1027///
1028/// \param __a
1029///    A 128-bit vector of [2 x double]. The lower double-precision value is
1030///    compared to the lower double-precision value of \a __b.
1031/// \param __b
1032///    A 128-bit vector of [2 x double]. The lower double-precision value is
1033///    compared to the lower double-precision value of \a __a.
1034/// \returns An integer containing the comparison results.
1035static __inline__ int __DEFAULT_FN_ATTRS _mm_comilt_sd(__m128d __a,
1036                                                       __m128d __b) {
1037  return __builtin_ia32_comisdlt((__v2df)__a, (__v2df)__b);
1038}
1039
1040/// Compares the lower double-precision floating-point values in each of
1041///    the two 128-bit floating-point vectors of [2 x double] to determine if
1042///    the value in the first parameter is less than or equal to the
1043///    corresponding value in the second parameter.
1044///
1045///    The comparison returns 0 for false, 1 for true. If either value in a
1046///    comparison is NaN, returns 0.
1047///
1048/// \headerfile <x86intrin.h>
1049///
1050/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1051///
1052/// \param __a
1053///    A 128-bit vector of [2 x double]. The lower double-precision value is
1054///    compared to the lower double-precision value of \a __b.
1055/// \param __b
1056///     A 128-bit vector of [2 x double]. The lower double-precision value is
1057///     compared to the lower double-precision value of \a __a.
1058/// \returns An integer containing the comparison results.
1059static __inline__ int __DEFAULT_FN_ATTRS _mm_comile_sd(__m128d __a,
1060                                                       __m128d __b) {
1061  return __builtin_ia32_comisdle((__v2df)__a, (__v2df)__b);
1062}
1063
1064/// Compares the lower double-precision floating-point values in each of
1065///    the two 128-bit floating-point vectors of [2 x double] to determine if
1066///    the value in the first parameter is greater than the corresponding value
1067///    in the second parameter.
1068///
1069///    The comparison returns 0 for false, 1 for true. If either value in a
1070///    comparison is NaN, returns 0.
1071///
1072/// \headerfile <x86intrin.h>
1073///
1074/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1075///
1076/// \param __a
1077///    A 128-bit vector of [2 x double]. The lower double-precision value is
1078///    compared to the lower double-precision value of \a __b.
1079/// \param __b
1080///    A 128-bit vector of [2 x double]. The lower double-precision value is
1081///    compared to the lower double-precision value of \a __a.
1082/// \returns An integer containing the comparison results.
1083static __inline__ int __DEFAULT_FN_ATTRS _mm_comigt_sd(__m128d __a,
1084                                                       __m128d __b) {
1085  return __builtin_ia32_comisdgt((__v2df)__a, (__v2df)__b);
1086}
1087
1088/// Compares the lower double-precision floating-point values in each of
1089///    the two 128-bit floating-point vectors of [2 x double] to determine if
1090///    the value in the first parameter is greater than or equal to the
1091///    corresponding value in the second parameter.
1092///
1093///    The comparison returns 0 for false, 1 for true. If either value in a
1094///    comparison is NaN, returns 0.
1095///
1096/// \headerfile <x86intrin.h>
1097///
1098/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1099///
1100/// \param __a
1101///    A 128-bit vector of [2 x double]. The lower double-precision value is
1102///    compared to the lower double-precision value of \a __b.
1103/// \param __b
1104///    A 128-bit vector of [2 x double]. The lower double-precision value is
1105///    compared to the lower double-precision value of \a __a.
1106/// \returns An integer containing the comparison results.
1107static __inline__ int __DEFAULT_FN_ATTRS _mm_comige_sd(__m128d __a,
1108                                                       __m128d __b) {
1109  return __builtin_ia32_comisdge((__v2df)__a, (__v2df)__b);
1110}
1111
1112/// Compares the lower double-precision floating-point values in each of
1113///    the two 128-bit floating-point vectors of [2 x double] to determine if
1114///    the value in the first parameter is unequal to the corresponding value in
1115///    the second parameter.
1116///
1117///    The comparison returns 0 for false, 1 for true. If either value in a
1118///    comparison is NaN, returns 1.
1119///
1120/// \headerfile <x86intrin.h>
1121///
1122/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1123///
1124/// \param __a
1125///    A 128-bit vector of [2 x double]. The lower double-precision value is
1126///    compared to the lower double-precision value of \a __b.
1127/// \param __b
1128///    A 128-bit vector of [2 x double]. The lower double-precision value is
1129///    compared to the lower double-precision value of \a __a.
1130/// \returns An integer containing the comparison results.
1131static __inline__ int __DEFAULT_FN_ATTRS _mm_comineq_sd(__m128d __a,
1132                                                        __m128d __b) {
1133  return __builtin_ia32_comisdneq((__v2df)__a, (__v2df)__b);
1134}
1135
1136/// Compares the lower double-precision floating-point values in each of
1137///    the two 128-bit floating-point vectors of [2 x double] for equality.
1138///
1139///    The comparison returns 0 for false, 1 for true. If either value in a
1140///    comparison is NaN, returns 0.
1141///
1142/// \headerfile <x86intrin.h>
1143///
1144/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1145///
1146/// \param __a
1147///    A 128-bit vector of [2 x double]. The lower double-precision value is
1148///    compared to the lower double-precision value of \a __b.
1149/// \param __b
1150///    A 128-bit vector of [2 x double]. The lower double-precision value is
1151///    compared to the lower double-precision value of \a __a.
1152/// \returns An integer containing the comparison results.
1153static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomieq_sd(__m128d __a,
1154                                                        __m128d __b) {
1155  return __builtin_ia32_ucomisdeq((__v2df)__a, (__v2df)__b);
1156}
1157
1158/// Compares the lower double-precision floating-point values in each of
1159///    the two 128-bit floating-point vectors of [2 x double] to determine if
1160///    the value in the first parameter is less than the corresponding value in
1161///    the second parameter.
1162///
1163///    The comparison returns 0 for false, 1 for true. If either value in a
1164///    comparison is NaN, returns 0.
1165///
1166/// \headerfile <x86intrin.h>
1167///
1168/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1169///
1170/// \param __a
1171///    A 128-bit vector of [2 x double]. The lower double-precision value is
1172///    compared to the lower double-precision value of \a __b.
1173/// \param __b
1174///    A 128-bit vector of [2 x double]. The lower double-precision value is
1175///    compared to the lower double-precision value of \a __a.
1176/// \returns An integer containing the comparison results.
1177static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomilt_sd(__m128d __a,
1178                                                        __m128d __b) {
1179  return __builtin_ia32_ucomisdlt((__v2df)__a, (__v2df)__b);
1180}
1181
1182/// Compares the lower double-precision floating-point values in each of
1183///    the two 128-bit floating-point vectors of [2 x double] to determine if
1184///    the value in the first parameter is less than or equal to the
1185///    corresponding value in the second parameter.
1186///
1187///    The comparison returns 0 for false, 1 for true. If either value in a
1188///    comparison is NaN, returns 0.
1189///
1190/// \headerfile <x86intrin.h>
1191///
1192/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1193///
1194/// \param __a
1195///    A 128-bit vector of [2 x double]. The lower double-precision value is
1196///    compared to the lower double-precision value of \a __b.
1197/// \param __b
1198///     A 128-bit vector of [2 x double]. The lower double-precision value is
1199///     compared to the lower double-precision value of \a __a.
1200/// \returns An integer containing the comparison results.
1201static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomile_sd(__m128d __a,
1202                                                        __m128d __b) {
1203  return __builtin_ia32_ucomisdle((__v2df)__a, (__v2df)__b);
1204}
1205
1206/// Compares the lower double-precision floating-point values in each of
1207///    the two 128-bit floating-point vectors of [2 x double] to determine if
1208///    the value in the first parameter is greater than the corresponding value
1209///    in the second parameter.
1210///
1211///    The comparison returns 0 for false, 1 for true. If either value in a
1212///    comparison is NaN, returns 0.
1213///
1214/// \headerfile <x86intrin.h>
1215///
1216/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1217///
1218/// \param __a
1219///    A 128-bit vector of [2 x double]. The lower double-precision value is
1220///    compared to the lower double-precision value of \a __b.
1221/// \param __b
1222///     A 128-bit vector of [2 x double]. The lower double-precision value is
1223///     compared to the lower double-precision value of \a __a.
1224/// \returns An integer containing the comparison results.
1225static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomigt_sd(__m128d __a,
1226                                                        __m128d __b) {
1227  return __builtin_ia32_ucomisdgt((__v2df)__a, (__v2df)__b);
1228}
1229
1230/// Compares the lower double-precision floating-point values in each of
1231///    the two 128-bit floating-point vectors of [2 x double] to determine if
1232///    the value in the first parameter is greater than or equal to the
1233///    corresponding value in the second parameter.
1234///
1235///    The comparison returns 0 for false, 1 for true. If either value in a
1236///    comparison is NaN, returns 0.
1237///
1238/// \headerfile <x86intrin.h>
1239///
1240/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1241///
1242/// \param __a
1243///    A 128-bit vector of [2 x double]. The lower double-precision value is
1244///    compared to the lower double-precision value of \a __b.
1245/// \param __b
1246///    A 128-bit vector of [2 x double]. The lower double-precision value is
1247///    compared to the lower double-precision value of \a __a.
1248/// \returns An integer containing the comparison results.
1249static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomige_sd(__m128d __a,
1250                                                        __m128d __b) {
1251  return __builtin_ia32_ucomisdge((__v2df)__a, (__v2df)__b);
1252}
1253
1254/// Compares the lower double-precision floating-point values in each of
1255///    the two 128-bit floating-point vectors of [2 x double] to determine if
1256///    the value in the first parameter is unequal to the corresponding value in
1257///    the second parameter.
1258///
1259///    The comparison returns 0 for false, 1 for true. If either value in a
1260///    comparison is NaN, returns 1.
1261///
1262/// \headerfile <x86intrin.h>
1263///
1264/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1265///
1266/// \param __a
1267///    A 128-bit vector of [2 x double]. The lower double-precision value is
1268///    compared to the lower double-precision value of \a __b.
1269/// \param __b
1270///    A 128-bit vector of [2 x double]. The lower double-precision value is
1271///    compared to the lower double-precision value of \a __a.
1272/// \returns An integer containing the comparison result.
1273static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomineq_sd(__m128d __a,
1274                                                         __m128d __b) {
1275  return __builtin_ia32_ucomisdneq((__v2df)__a, (__v2df)__b);
1276}
1277
1278/// Converts the two double-precision floating-point elements of a
1279///    128-bit vector of [2 x double] into two single-precision floating-point
1280///    values, returned in the lower 64 bits of a 128-bit vector of [4 x float].
1281///    The upper 64 bits of the result vector are set to zero.
1282///
1283/// \headerfile <x86intrin.h>
1284///
1285/// This intrinsic corresponds to the <c> VCVTPD2PS / CVTPD2PS </c> instruction.
1286///
1287/// \param __a
1288///    A 128-bit vector of [2 x double].
1289/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
1290///    converted values. The upper 64 bits are set to zero.
1291static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtpd_ps(__m128d __a) {
1292  return __builtin_ia32_cvtpd2ps((__v2df)__a);
1293}
1294
1295/// Converts the lower two single-precision floating-point elements of a
1296///    128-bit vector of [4 x float] into two double-precision floating-point
1297///    values, returned in a 128-bit vector of [2 x double]. The upper two
1298///    elements of the input vector are unused.
1299///
1300/// \headerfile <x86intrin.h>
1301///
1302/// This intrinsic corresponds to the <c> VCVTPS2PD / CVTPS2PD </c> instruction.
1303///
1304/// \param __a
1305///    A 128-bit vector of [4 x float]. The lower two single-precision
1306///    floating-point elements are converted to double-precision values. The
1307///    upper two elements are unused.
1308/// \returns A 128-bit vector of [2 x double] containing the converted values.
1309static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR
1310_mm_cvtps_pd(__m128 __a) {
1311  return (__m128d) __builtin_convertvector(
1312      __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1), __v2df);
1313}
1314
1315/// Converts the lower two integer elements of a 128-bit vector of
1316///    [4 x i32] into two double-precision floating-point values, returned in a
1317///    128-bit vector of [2 x double].
1318///
1319///    The upper two elements of the input vector are unused.
1320///
1321/// \headerfile <x86intrin.h>
1322///
1323/// This intrinsic corresponds to the <c> VCVTDQ2PD / CVTDQ2PD </c> instruction.
1324///
1325/// \param __a
1326///    A 128-bit integer vector of [4 x i32]. The lower two integer elements are
1327///    converted to double-precision values.
1328///
1329///    The upper two elements are unused.
1330/// \returns A 128-bit vector of [2 x double] containing the converted values.
1331static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR
1332_mm_cvtepi32_pd(__m128i __a) {
1333  return (__m128d) __builtin_convertvector(
1334      __builtin_shufflevector((__v4si)__a, (__v4si)__a, 0, 1), __v2df);
1335}
1336
1337/// Converts the two double-precision floating-point elements of a
1338///    128-bit vector of [2 x double] into two signed 32-bit integer values,
1339///    returned in the lower 64 bits of a 128-bit vector of [4 x i32]. The upper
1340///    64 bits of the result vector are set to zero.
1341///
1342///    If a converted value does not fit in a 32-bit integer, raises a
1343///    floating-point invalid exception. If the exception is masked, returns
1344///    the most negative integer.
1345///
1346/// \headerfile <x86intrin.h>
1347///
1348/// This intrinsic corresponds to the <c> VCVTPD2DQ / CVTPD2DQ </c> instruction.
1349///
1350/// \param __a
1351///    A 128-bit vector of [2 x double].
1352/// \returns A 128-bit vector of [4 x i32] whose lower 64 bits contain the
1353///    converted values. The upper 64 bits are set to zero.
1354static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtpd_epi32(__m128d __a) {
1355  return __builtin_ia32_cvtpd2dq((__v2df)__a);
1356}
1357
1358/// Converts the low-order element of a 128-bit vector of [2 x double]
1359///    into a 32-bit signed integer value.
1360///
1361///    If the converted value does not fit in a 32-bit integer, raises a
1362///    floating-point invalid exception. If the exception is masked, returns
1363///    the most negative integer.
1364///
1365/// \headerfile <x86intrin.h>
1366///
1367/// This intrinsic corresponds to the <c> VCVTSD2SI / CVTSD2SI </c> instruction.
1368///
1369/// \param __a
1370///    A 128-bit vector of [2 x double]. The lower 64 bits are used in the
1371///    conversion.
1372/// \returns A 32-bit signed integer containing the converted value.
1373static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtsd_si32(__m128d __a) {
1374  return __builtin_ia32_cvtsd2si((__v2df)__a);
1375}
1376
1377/// Converts the lower double-precision floating-point element of a
1378///    128-bit vector of [2 x double], in the second parameter, into a
1379///    single-precision floating-point value, returned in the lower 32 bits of a
1380///    128-bit vector of [4 x float]. The upper 96 bits of the result vector are
1381///    copied from the upper 96 bits of the first parameter.
1382///
1383/// \headerfile <x86intrin.h>
1384///
1385/// This intrinsic corresponds to the <c> VCVTSD2SS / CVTSD2SS </c> instruction.
1386///
1387/// \param __a
1388///    A 128-bit vector of [4 x float]. The upper 96 bits of this parameter are
1389///    copied to the upper 96 bits of the result.
1390/// \param __b
1391///    A 128-bit vector of [2 x double]. The lower double-precision
1392///    floating-point element is used in the conversion.
1393/// \returns A 128-bit vector of [4 x float]. The lower 32 bits contain the
1394///    converted value from the second parameter. The upper 96 bits are copied
1395///    from the upper 96 bits of the first parameter.
1396static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtsd_ss(__m128 __a,
1397                                                         __m128d __b) {
1398  return (__m128)__builtin_ia32_cvtsd2ss((__v4sf)__a, (__v2df)__b);
1399}
1400
1401/// Converts a 32-bit signed integer value, in the second parameter, into
1402///    a double-precision floating-point value, returned in the lower 64 bits of
1403///    a 128-bit vector of [2 x double]. The upper 64 bits of the result vector
1404///    are copied from the upper 64 bits of the first parameter.
1405///
1406/// \headerfile <x86intrin.h>
1407///
1408/// This intrinsic corresponds to the <c> VCVTSI2SD / CVTSI2SD </c> instruction.
1409///
1410/// \param __a
1411///    A 128-bit vector of [2 x double]. The upper 64 bits of this parameter are
1412///    copied to the upper 64 bits of the result.
1413/// \param __b
1414///    A 32-bit signed integer containing the value to be converted.
1415/// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain the
1416///    converted value from the second parameter. The upper 64 bits are copied
1417///    from the upper 64 bits of the first parameter.
1418static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR
1419_mm_cvtsi32_sd(__m128d __a, int __b) {
1420  __a[0] = __b;
1421  return __a;
1422}
1423
1424/// Converts the lower single-precision floating-point element of a
1425///    128-bit vector of [4 x float], in the second parameter, into a
1426///    double-precision floating-point value, returned in the lower 64 bits of
1427///    a 128-bit vector of [2 x double]. The upper 64 bits of the result vector
1428///    are copied from the upper 64 bits of the first parameter.
1429///
1430/// \headerfile <x86intrin.h>
1431///
1432/// This intrinsic corresponds to the <c> VCVTSS2SD / CVTSS2SD </c> instruction.
1433///
1434/// \param __a
1435///    A 128-bit vector of [2 x double]. The upper 64 bits of this parameter are
1436///    copied to the upper 64 bits of the result.
1437/// \param __b
1438///    A 128-bit vector of [4 x float]. The lower single-precision
1439///    floating-point element is used in the conversion.
1440/// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain the
1441///    converted value from the second parameter. The upper 64 bits are copied
1442///    from the upper 64 bits of the first parameter.
1443static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR
1444_mm_cvtss_sd(__m128d __a, __m128 __b) {
1445  __a[0] = __b[0];
1446  return __a;
1447}
1448
/// Converts the two double-precision floating-point elements of a
///    128-bit vector of [2 x double] into two signed truncated (rounded
///    toward zero) 32-bit integer values, returned in the lower 64 bits
///    of a 128-bit vector of [4 x i32]. The upper 64 bits of the result
///    vector are set to zero.
1453///
1454///    If a converted value does not fit in a 32-bit integer, raises a
1455///    floating-point invalid exception. If the exception is masked, returns
1456///    the most negative integer.
1457///
1458/// \headerfile <x86intrin.h>
1459///
1460/// This intrinsic corresponds to the <c> VCVTTPD2DQ / CVTTPD2DQ </c>
1461///   instruction.
1462///
1463/// \param __a
1464///    A 128-bit vector of [2 x double].
1465/// \returns A 128-bit vector of [4 x i32] whose lower 64 bits contain the
1466///    converted values. The upper 64 bits are set to zero.
1467static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvttpd_epi32(__m128d __a) {
1468  return (__m128i)__builtin_ia32_cvttpd2dq((__v2df)__a);
1469}
1470
1471/// Converts the low-order element of a [2 x double] vector into a 32-bit
1472///    signed truncated (rounded toward zero) integer value.
1473///
1474///    If the converted value does not fit in a 32-bit integer, raises a
1475///    floating-point invalid exception. If the exception is masked, returns
1476///    the most negative integer.
1477///
1478/// \headerfile <x86intrin.h>
1479///
1480/// This intrinsic corresponds to the <c> VCVTTSD2SI / CVTTSD2SI </c>
1481///   instruction.
1482///
1483/// \param __a
1484///    A 128-bit vector of [2 x double]. The lower 64 bits are used in the
1485///    conversion.
1486/// \returns A 32-bit signed integer containing the converted value.
1487static __inline__ int __DEFAULT_FN_ATTRS _mm_cvttsd_si32(__m128d __a) {
1488  return __builtin_ia32_cvttsd2si((__v2df)__a);
1489}
1490
1491/// Converts the two double-precision floating-point elements of a
1492///    128-bit vector of [2 x double] into two signed 32-bit integer values,
1493///    returned in a 64-bit vector of [2 x i32].
1494///
1495///    If a converted value does not fit in a 32-bit integer, raises a
1496///    floating-point invalid exception. If the exception is masked, returns
1497///    the most negative integer.
1498///
1499/// \headerfile <x86intrin.h>
1500///
1501/// This intrinsic corresponds to the <c> CVTPD2PI </c> instruction.
1502///
1503/// \param __a
1504///    A 128-bit vector of [2 x double].
1505/// \returns A 64-bit vector of [2 x i32] containing the converted values.
1506static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_cvtpd_pi32(__m128d __a) {
1507  return __trunc64(__builtin_ia32_cvtpd2dq((__v2df)__a));
1508}
1509
1510/// Converts the two double-precision floating-point elements of a
1511///    128-bit vector of [2 x double] into two signed truncated (rounded toward
1512///    zero) 32-bit integer values, returned in a 64-bit vector of [2 x i32].
1513///
1514///    If a converted value does not fit in a 32-bit integer, raises a
1515///    floating-point invalid exception. If the exception is masked, returns
1516///    the most negative integer.
1517///
1518/// \headerfile <x86intrin.h>
1519///
1520/// This intrinsic corresponds to the <c> CVTTPD2PI </c> instruction.
1521///
1522/// \param __a
1523///    A 128-bit vector of [2 x double].
1524/// \returns A 64-bit vector of [2 x i32] containing the converted values.
1525static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_cvttpd_pi32(__m128d __a) {
1526  return __trunc64(__builtin_ia32_cvttpd2dq((__v2df)__a));
1527}
1528
1529/// Converts the two signed 32-bit integer elements of a 64-bit vector of
1530///    [2 x i32] into two double-precision floating-point values, returned in a
1531///    128-bit vector of [2 x double].
1532///
1533/// \headerfile <x86intrin.h>
1534///
1535/// This intrinsic corresponds to the <c> CVTPI2PD </c> instruction.
1536///
1537/// \param __a
1538///    A 64-bit vector of [2 x i32].
1539/// \returns A 128-bit vector of [2 x double] containing the converted values.
1540static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR
1541_mm_cvtpi32_pd(__m64 __a) {
1542  return (__m128d) __builtin_convertvector((__v2si)__a, __v2df);
1543}
1544
1545/// Returns the low-order element of a 128-bit vector of [2 x double] as
1546///    a double-precision floating-point value.
1547///
1548/// \headerfile <x86intrin.h>
1549///
1550/// This intrinsic has no corresponding instruction.
1551///
1552/// \param __a
1553///    A 128-bit vector of [2 x double]. The lower 64 bits are returned.
1554/// \returns A double-precision floating-point value copied from the lower 64
1555///    bits of \a __a.
1556static __inline__ double __DEFAULT_FN_ATTRS_CONSTEXPR
1557_mm_cvtsd_f64(__m128d __a) {
1558  return __a[0];
1559}
1560
1561/// Loads a 128-bit floating-point vector of [2 x double] from an aligned
1562///    memory location.
1563///
1564/// \headerfile <x86intrin.h>
1565///
1566/// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction.
1567///
1568/// \param __dp
1569///    A pointer to a 128-bit memory location. The address of the memory
1570///    location has to be 16-byte aligned.
1571/// \returns A 128-bit vector of [2 x double] containing the loaded values.
1572static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load_pd(double const *__dp) {
1573  return *(const __m128d *)__dp;
1574}
1575
1576/// Loads a double-precision floating-point value from a specified memory
1577///    location and duplicates it to both vector elements of a 128-bit vector of
1578///    [2 x double].
1579///
1580/// \headerfile <x86intrin.h>
1581///
1582/// This intrinsic corresponds to the <c> VMOVDDUP / MOVDDUP </c> instruction.
1583///
1584/// \param __dp
1585///    A pointer to a memory location containing a double-precision value.
1586/// \returns A 128-bit vector of [2 x double] containing the loaded and
1587///    duplicated values.
1588static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load1_pd(double const *__dp) {
1589  struct __mm_load1_pd_struct {
1590    double __u;
1591  } __attribute__((__packed__, __may_alias__));
1592  double __u = ((const struct __mm_load1_pd_struct *)__dp)->__u;
1593  return __extension__(__m128d){__u, __u};
1594}
1595
1596#define _mm_load_pd1(dp) _mm_load1_pd(dp)
1597
1598/// Loads two double-precision values, in reverse order, from an aligned
1599///    memory location into a 128-bit vector of [2 x double].
1600///
1601/// \headerfile <x86intrin.h>
1602///
1603/// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction +
1604/// needed shuffling instructions. In AVX mode, the shuffling may be combined
1605/// with the \c VMOVAPD, resulting in only a \c VPERMILPD instruction.
1606///
1607/// \param __dp
1608///    A 16-byte aligned pointer to an array of double-precision values to be
1609///    loaded in reverse order.
1610/// \returns A 128-bit vector of [2 x double] containing the reversed loaded
1611///    values.
1612static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadr_pd(double const *__dp) {
1613  __m128d __u = *(const __m128d *)__dp;
1614  return __builtin_shufflevector((__v2df)__u, (__v2df)__u, 1, 0);
1615}
1616
1617/// Loads a 128-bit floating-point vector of [2 x double] from an
1618///    unaligned memory location.
1619///
1620/// \headerfile <x86intrin.h>
1621///
1622/// This intrinsic corresponds to the <c> VMOVUPD / MOVUPD </c> instruction.
1623///
1624/// \param __dp
1625///    A pointer to a 128-bit memory location. The address of the memory
1626///    location does not have to be aligned.
1627/// \returns A 128-bit vector of [2 x double] containing the loaded values.
1628static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadu_pd(double const *__dp) {
1629  struct __loadu_pd {
1630    __m128d_u __v;
1631  } __attribute__((__packed__, __may_alias__));
1632  return ((const struct __loadu_pd *)__dp)->__v;
1633}
1634
1635/// Loads a 64-bit integer value to the low element of a 128-bit integer
1636///    vector and clears the upper element.
1637///
1638/// \headerfile <x86intrin.h>
1639///
1640/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
1641///
1642/// \param __a
1643///    A pointer to a 64-bit memory location. The address of the memory
1644///    location does not have to be aligned.
1645/// \returns A 128-bit vector of [2 x i64] containing the loaded value.
1646static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si64(void const *__a) {
1647  struct __loadu_si64 {
1648    long long __v;
1649  } __attribute__((__packed__, __may_alias__));
1650  long long __u = ((const struct __loadu_si64 *)__a)->__v;
1651  return __extension__(__m128i)(__v2di){__u, 0LL};
1652}
1653
/// Loads a 32-bit integer value to the low element of a 128-bit integer
///    vector and clears the upper elements.
1656///
1657/// \headerfile <x86intrin.h>
1658///
1659/// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
1660///
1661/// \param __a
1662///    A pointer to a 32-bit memory location. The address of the memory
1663///    location does not have to be aligned.
1664/// \returns A 128-bit vector of [4 x i32] containing the loaded value.
1665static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si32(void const *__a) {
1666  struct __loadu_si32 {
1667    int __v;
1668  } __attribute__((__packed__, __may_alias__));
1669  int __u = ((const struct __loadu_si32 *)__a)->__v;
1670  return __extension__(__m128i)(__v4si){__u, 0, 0, 0};
1671}
1672
/// Loads a 16-bit integer value to the low element of a 128-bit integer
///    vector and clears the upper elements.
1675///
1676/// \headerfile <x86intrin.h>
1677///
1678/// This intrinsic does not correspond to a specific instruction.
1679///
1680/// \param __a
1681///    A pointer to a 16-bit memory location. The address of the memory
1682///    location does not have to be aligned.
1683/// \returns A 128-bit vector of [8 x i16] containing the loaded value.
1684static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si16(void const *__a) {
1685  struct __loadu_si16 {
1686    short __v;
1687  } __attribute__((__packed__, __may_alias__));
1688  short __u = ((const struct __loadu_si16 *)__a)->__v;
1689  return __extension__(__m128i)(__v8hi){__u, 0, 0, 0, 0, 0, 0, 0};
1690}
1691
/// Loads a 64-bit double-precision value to the low element of a
///    128-bit vector of [2 x double] and clears the upper element.
1694///
1695/// \headerfile <x86intrin.h>
1696///
1697/// This intrinsic corresponds to the <c> VMOVSD / MOVSD </c> instruction.
1698///
1699/// \param __dp
1700///    A pointer to a memory location containing a double-precision value.
1701///    The address of the memory location does not have to be aligned.
1702/// \returns A 128-bit vector of [2 x double] containing the loaded value.
1703static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load_sd(double const *__dp) {
1704  struct __mm_load_sd_struct {
1705    double __u;
1706  } __attribute__((__packed__, __may_alias__));
1707  double __u = ((const struct __mm_load_sd_struct *)__dp)->__u;
1708  return __extension__(__m128d){__u, 0};
1709}
1710
1711/// Loads a double-precision value into the high-order bits of a 128-bit
1712///    vector of [2 x double]. The low-order bits are copied from the low-order
1713///    bits of the first operand.
1714///
1715/// \headerfile <x86intrin.h>
1716///
1717/// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction.
1718///
1719/// \param __a
1720///    A 128-bit vector of [2 x double]. \n
1721///    Bits [63:0] are written to bits [63:0] of the result.
1722/// \param __dp
1723///    A pointer to a 64-bit memory location containing a double-precision
1724///    floating-point value that is loaded. The loaded value is written to bits
1725///    [127:64] of the result. The address of the memory location does not have
1726///    to be aligned.
1727/// \returns A 128-bit vector of [2 x double] containing the moved values.
1728static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadh_pd(__m128d __a,
1729                                                          double const *__dp) {
1730  struct __mm_loadh_pd_struct {
1731    double __u;
1732  } __attribute__((__packed__, __may_alias__));
1733  double __u = ((const struct __mm_loadh_pd_struct *)__dp)->__u;
1734  return __extension__(__m128d){__a[0], __u};
1735}
1736
1737/// Loads a double-precision value into the low-order bits of a 128-bit
1738///    vector of [2 x double]. The high-order bits are copied from the
1739///    high-order bits of the first operand.
1740///
1741/// \headerfile <x86intrin.h>
1742///
1743/// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction.
1744///
1745/// \param __a
1746///    A 128-bit vector of [2 x double]. \n
1747///    Bits [127:64] are written to bits [127:64] of the result.
1748/// \param __dp
1749///    A pointer to a 64-bit memory location containing a double-precision
1750///    floating-point value that is loaded. The loaded value is written to bits
1751///    [63:0] of the result. The address of the memory location does not have to
1752///    be aligned.
1753/// \returns A 128-bit vector of [2 x double] containing the moved values.
1754static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadl_pd(__m128d __a,
1755                                                          double const *__dp) {
1756  struct __mm_loadl_pd_struct {
1757    double __u;
1758  } __attribute__((__packed__, __may_alias__));
1759  double __u = ((const struct __mm_loadl_pd_struct *)__dp)->__u;
1760  return __extension__(__m128d){__u, __a[1]};
1761}
1762
1763/// Constructs a 128-bit floating-point vector of [2 x double] with
1764///    unspecified content. This could be used as an argument to another
1765///    intrinsic function where the argument is required but the value is not
1766///    actually used.
1767///
1768/// \headerfile <x86intrin.h>
1769///
1770/// This intrinsic has no corresponding instruction.
1771///
1772/// \returns A 128-bit floating-point vector of [2 x double] with unspecified
1773///    content.
1774static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_undefined_pd(void) {
1775  return (__m128d)__builtin_ia32_undef128();
1776}
1777
1778/// Constructs a 128-bit floating-point vector of [2 x double]. The lower
1779///    64 bits of the vector are initialized with the specified double-precision
1780///    floating-point value. The upper 64 bits are set to zero.
1781///
1782/// \headerfile <x86intrin.h>
1783///
1784/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
1785///
1786/// \param __w
1787///    A double-precision floating-point value used to initialize the lower 64
1788///    bits of the result.
1789/// \returns An initialized 128-bit floating-point vector of [2 x double]. The
1790///    lower 64 bits contain the value of the parameter. The upper 64 bits are
1791///    set to zero.
1792static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_set_sd(double __w) {
1793  return __extension__(__m128d){__w, 0.0};
1794}
1795
1796/// Constructs a 128-bit floating-point vector of [2 x double], with each
1797///    of the two double-precision floating-point vector elements set to the
1798///    specified double-precision floating-point value.
1799///
1800/// \headerfile <x86intrin.h>
1801///
1802/// This intrinsic corresponds to the <c> VMOVDDUP / MOVLHPS </c> instruction.
1803///
1804/// \param __w
1805///    A double-precision floating-point value used to initialize each vector
1806///    element of the result.
1807/// \returns An initialized 128-bit floating-point vector of [2 x double].
1808static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_set1_pd(double __w) {
1809  return __extension__(__m128d){__w, __w};
1810}
1811
1812/// Constructs a 128-bit floating-point vector of [2 x double], with each
1813///    of the two double-precision floating-point vector elements set to the
1814///    specified double-precision floating-point value.
1815///
1816/// \headerfile <x86intrin.h>
1817///
1818/// This intrinsic corresponds to the <c> VMOVDDUP / MOVLHPS </c> instruction.
1819///
1820/// \param __w
1821///    A double-precision floating-point value used to initialize each vector
1822///    element of the result.
1823/// \returns An initialized 128-bit floating-point vector of [2 x double].
1824static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_set_pd1(double __w) {
1825  return _mm_set1_pd(__w);
1826}
1827
1828/// Constructs a 128-bit floating-point vector of [2 x double]
1829///    initialized with the specified double-precision floating-point values.
1830///
1831/// \headerfile <x86intrin.h>
1832///
1833/// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
1834///
1835/// \param __w
1836///    A double-precision floating-point value used to initialize the upper 64
1837///    bits of the result.
1838/// \param __x
1839///    A double-precision floating-point value used to initialize the lower 64
1840///    bits of the result.
1841/// \returns An initialized 128-bit floating-point vector of [2 x double].
1842static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_set_pd(double __w,
1843                                                                  double __x) {
1844  return __extension__(__m128d){__x, __w};
1845}
1846
1847/// Constructs a 128-bit floating-point vector of [2 x double],
1848///    initialized in reverse order with the specified double-precision
1849///    floating-point values.
1850///
1851/// \headerfile <x86intrin.h>
1852///
1853/// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
1854///
1855/// \param __w
1856///    A double-precision floating-point value used to initialize the lower 64
1857///    bits of the result.
1858/// \param __x
1859///    A double-precision floating-point value used to initialize the upper 64
1860///    bits of the result.
1861/// \returns An initialized 128-bit floating-point vector of [2 x double].
1862static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_setr_pd(double __w,
1863                                                                   double __x) {
1864  return __extension__(__m128d){__w, __x};
1865}
1866
1867/// Constructs a 128-bit floating-point vector of [2 x double]
1868///    initialized to zero.
1869///
1870/// \headerfile <x86intrin.h>
1871///
1872/// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
1873///
1874/// \returns An initialized 128-bit floating-point vector of [2 x double] with
1875///    all elements set to zero.
1876static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_setzero_pd(void) {
1877  return __extension__(__m128d){0.0, 0.0};
1878}
1879
1880/// Constructs a 128-bit floating-point vector of [2 x double]. The lower
1881///    64 bits are set to the lower 64 bits of the second parameter. The upper
1882///    64 bits are set to the upper 64 bits of the first parameter.
1883///
1884/// \headerfile <x86intrin.h>
1885///
1886/// This intrinsic corresponds to the <c> VBLENDPD / BLENDPD </c> instruction.
1887///
1888/// \param __a
1889///    A 128-bit vector of [2 x double]. The upper 64 bits are written to the
1890///    upper 64 bits of the result.
1891/// \param __b
1892///    A 128-bit vector of [2 x double]. The lower 64 bits are written to the
1893///    lower 64 bits of the result.
1894/// \returns A 128-bit vector of [2 x double] containing the moved values.
1895static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR
1896_mm_move_sd(__m128d __a, __m128d __b) {
1897  __a[0] = __b[0];
1898  return __a;
1899}
1900
1901/// Stores the lower 64 bits of a 128-bit vector of [2 x double] to a
1902///    memory location.
1903///
1904/// \headerfile <x86intrin.h>
1905///
1906/// This intrinsic corresponds to the <c> VMOVSD / MOVSD </c> instruction.
1907///
1908/// \param __dp
1909///    A pointer to a 64-bit memory location.
1910/// \param __a
1911///    A 128-bit vector of [2 x double] containing the value to be stored.
1912static __inline__ void __DEFAULT_FN_ATTRS _mm_store_sd(double *__dp,
1913                                                       __m128d __a) {
1914  struct __mm_store_sd_struct {
1915    double __u;
1916  } __attribute__((__packed__, __may_alias__));
1917  ((struct __mm_store_sd_struct *)__dp)->__u = __a[0];
1918}
1919
1920/// Moves packed double-precision values from a 128-bit vector of
1921///    [2 x double] to a memory location.
1922///
1923/// \headerfile <x86intrin.h>
1924///
1925/// This intrinsic corresponds to the <c>VMOVAPD / MOVAPS</c> instruction.
1926///
1927/// \param __dp
1928///    A pointer to an aligned memory location that can store two
1929///    double-precision values.
1930/// \param __a
1931///    A packed 128-bit vector of [2 x double] containing the values to be
1932///    moved.
1933static __inline__ void __DEFAULT_FN_ATTRS _mm_store_pd(double *__dp,
1934                                                       __m128d __a) {
1935  *(__m128d *)__dp = __a;
1936}
1937
1938/// Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to
1939///    the upper and lower 64 bits of a memory location.
1940///
1941/// \headerfile <x86intrin.h>
1942///
1943/// This intrinsic corresponds to the
1944///   <c> VMOVDDUP + VMOVAPD / MOVLHPS + MOVAPS </c> instruction.
1945///
/// \param __dp
///    A pointer to a 16-byte aligned memory location that can store two
///    double-precision values.
1949/// \param __a
1950///    A 128-bit vector of [2 x double] whose lower 64 bits are copied to each
1951///    of the values in \a __dp.
1952static __inline__ void __DEFAULT_FN_ATTRS _mm_store1_pd(double *__dp,
1953                                                        __m128d __a) {
1954  __a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0);
1955  _mm_store_pd(__dp, __a);
1956}
1957
1958/// Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to
1959///    the upper and lower 64 bits of a memory location.
1960///
1961/// \headerfile <x86intrin.h>
1962///
1963/// This intrinsic corresponds to the
1964///   <c> VMOVDDUP + VMOVAPD / MOVLHPS + MOVAPS </c> instruction.
1965///
/// \param __dp
///    A pointer to a 16-byte aligned memory location that can store two
///    double-precision values.
1969/// \param __a
1970///    A 128-bit vector of [2 x double] whose lower 64 bits are copied to each
1971///    of the values in \a __dp.
1972static __inline__ void __DEFAULT_FN_ATTRS _mm_store_pd1(double *__dp,
1973                                                        __m128d __a) {
1974  _mm_store1_pd(__dp, __a);
1975}
1976
1977/// Stores a 128-bit vector of [2 x double] into an unaligned memory
1978///    location.
1979///
1980/// \headerfile <x86intrin.h>
1981///
1982/// This intrinsic corresponds to the <c> VMOVUPD / MOVUPD </c> instruction.
1983///
1984/// \param __dp
1985///    A pointer to a 128-bit memory location. The address of the memory
1986///    location does not have to be aligned.
1987/// \param __a
1988///    A 128-bit vector of [2 x double] containing the values to be stored.
1989static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_pd(double *__dp,
1990                                                        __m128d __a) {
1991  struct __storeu_pd {
1992    __m128d_u __v;
1993  } __attribute__((__packed__, __may_alias__));
1994  ((struct __storeu_pd *)__dp)->__v = __a;
1995}
1996
1997/// Stores two double-precision values, in reverse order, from a 128-bit
1998///    vector of [2 x double] to a 16-byte aligned memory location.
1999///
2000/// \headerfile <x86intrin.h>
2001///
2002/// This intrinsic corresponds to a shuffling instruction followed by a
2003/// <c> VMOVAPD / MOVAPD </c> instruction.
2004///
2005/// \param __dp
2006///    A pointer to a 16-byte aligned memory location that can store two
2007///    double-precision values.
2008/// \param __a
2009///    A 128-bit vector of [2 x double] containing the values to be reversed and
2010///    stored.
2011static __inline__ void __DEFAULT_FN_ATTRS _mm_storer_pd(double *__dp,
2012                                                        __m128d __a) {
2013  __a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 1, 0);
2014  *(__m128d *)__dp = __a;
2015}
2016
2017/// Stores the upper 64 bits of a 128-bit vector of [2 x double] to a
2018///    memory location.
2019///
2020/// \headerfile <x86intrin.h>
2021///
2022/// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction.
2023///
2024/// \param __dp
2025///    A pointer to a 64-bit memory location.
2026/// \param __a
2027///    A 128-bit vector of [2 x double] containing the value to be stored.
2028static __inline__ void __DEFAULT_FN_ATTRS _mm_storeh_pd(double *__dp,
2029                                                        __m128d __a) {
2030  struct __mm_storeh_pd_struct {
2031    double __u;
2032  } __attribute__((__packed__, __may_alias__));
2033  ((struct __mm_storeh_pd_struct *)__dp)->__u = __a[1];
2034}
2035
2036/// Stores the lower 64 bits of a 128-bit vector of [2 x double] to a
2037///    memory location.
2038///
2039/// \headerfile <x86intrin.h>
2040///
2041/// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction.
2042///
2043/// \param __dp
2044///    A pointer to a 64-bit memory location.
2045/// \param __a
2046///    A 128-bit vector of [2 x double] containing the value to be stored.
2047static __inline__ void __DEFAULT_FN_ATTRS _mm_storel_pd(double *__dp,
2048                                                        __m128d __a) {
2049  struct __mm_storeh_pd_struct {
2050    double __u;
2051  } __attribute__((__packed__, __may_alias__));
2052  ((struct __mm_storeh_pd_struct *)__dp)->__u = __a[0];
2053}
2054
2055/// Adds the corresponding elements of two 128-bit vectors of [16 x i8],
2056///    saving the lower 8 bits of each sum in the corresponding element of a
2057///    128-bit result vector of [16 x i8].
2058///
2059///    The integer elements of both parameters can be either signed or unsigned.
2060///
2061/// \headerfile <x86intrin.h>
2062///
2063/// This intrinsic corresponds to the <c> VPADDB / PADDB </c> instruction.
2064///
2065/// \param __a
2066///    A 128-bit vector of [16 x i8].
2067/// \param __b
2068///    A 128-bit vector of [16 x i8].
2069/// \returns A 128-bit vector of [16 x i8] containing the sums of both
2070///    parameters.
2071static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi8(__m128i __a,
2072                                                          __m128i __b) {
2073  return (__m128i)((__v16qu)__a + (__v16qu)__b);
2074}
2075
2076/// Adds the corresponding elements of two 128-bit vectors of [8 x i16],
2077///    saving the lower 16 bits of each sum in the corresponding element of a
2078///    128-bit result vector of [8 x i16].
2079///
2080///    The integer elements of both parameters can be either signed or unsigned.
2081///
2082/// \headerfile <x86intrin.h>
2083///
2084/// This intrinsic corresponds to the <c> VPADDW / PADDW </c> instruction.
2085///
2086/// \param __a
2087///    A 128-bit vector of [8 x i16].
2088/// \param __b
2089///    A 128-bit vector of [8 x i16].
2090/// \returns A 128-bit vector of [8 x i16] containing the sums of both
2091///    parameters.
2092static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi16(__m128i __a,
2093                                                           __m128i __b) {
2094  return (__m128i)((__v8hu)__a + (__v8hu)__b);
2095}
2096
2097/// Adds the corresponding elements of two 128-bit vectors of [4 x i32],
2098///    saving the lower 32 bits of each sum in the corresponding element of a
2099///    128-bit result vector of [4 x i32].
2100///
2101///    The integer elements of both parameters can be either signed or unsigned.
2102///
2103/// \headerfile <x86intrin.h>
2104///
2105/// This intrinsic corresponds to the <c> VPADDD / PADDD </c> instruction.
2106///
2107/// \param __a
2108///    A 128-bit vector of [4 x i32].
2109/// \param __b
2110///    A 128-bit vector of [4 x i32].
2111/// \returns A 128-bit vector of [4 x i32] containing the sums of both
2112///    parameters.
2113static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
2114_mm_add_epi32(__m128i __a, __m128i __b) {
2115  return (__m128i)((__v4su)__a + (__v4su)__b);
2116}
2117
2118/// Adds two signed or unsigned 64-bit integer values, returning the
2119///    lower 64 bits of the sum.
2120///
2121/// \headerfile <x86intrin.h>
2122///
2123/// This intrinsic corresponds to the <c> PADDQ </c> instruction.
2124///
2125/// \param __a
2126///    A 64-bit integer.
2127/// \param __b
2128///    A 64-bit integer.
2129/// \returns A 64-bit integer containing the sum of both parameters.
2130static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_add_si64(__m64 __a, __m64 __b) {
2131  return (__m64)(((unsigned long long)__a) + ((unsigned long long)__b));
2132}
2133
2134/// Adds the corresponding elements of two 128-bit vectors of [2 x i64],
2135///    saving the lower 64 bits of each sum in the corresponding element of a
2136///    128-bit result vector of [2 x i64].
2137///
2138///    The integer elements of both parameters can be either signed or unsigned.
2139///
2140/// \headerfile <x86intrin.h>
2141///
2142/// This intrinsic corresponds to the <c> VPADDQ / PADDQ </c> instruction.
2143///
2144/// \param __a
2145///    A 128-bit vector of [2 x i64].
2146/// \param __b
2147///    A 128-bit vector of [2 x i64].
2148/// \returns A 128-bit vector of [2 x i64] containing the sums of both
2149///    parameters.
2150static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
2151_mm_add_epi64(__m128i __a, __m128i __b) {
2152  return (__m128i)((__v2du)__a + (__v2du)__b);
2153}
2154
2155/// Adds, with saturation, the corresponding elements of two 128-bit
2156///    signed [16 x i8] vectors, saving each sum in the corresponding element
2157///    of a 128-bit result vector of [16 x i8].
2158///
2159///    Positive sums greater than 0x7F are saturated to 0x7F. Negative sums
2160///    less than 0x80 are saturated to 0x80.
2161///
2162/// \headerfile <x86intrin.h>
2163///
2164/// This intrinsic corresponds to the <c> VPADDSB / PADDSB </c> instruction.
2165///
2166/// \param __a
2167///    A 128-bit signed [16 x i8] vector.
2168/// \param __b
2169///    A 128-bit signed [16 x i8] vector.
2170/// \returns A 128-bit signed [16 x i8] vector containing the saturated sums of
2171///    both parameters.
2172static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epi8(__m128i __a,
2173                                                           __m128i __b) {
2174  return (__m128i)__builtin_elementwise_add_sat((__v16qs)__a, (__v16qs)__b);
2175}
2176
2177/// Adds, with saturation, the corresponding elements of two 128-bit
2178///    signed [8 x i16] vectors, saving each sum in the corresponding element
2179///    of a 128-bit result vector of [8 x i16].
2180///
2181///    Positive sums greater than 0x7FFF are saturated to 0x7FFF. Negative sums
2182///    less than 0x8000 are saturated to 0x8000.
2183///
2184/// \headerfile <x86intrin.h>
2185///
2186/// This intrinsic corresponds to the <c> VPADDSW / PADDSW </c> instruction.
2187///
2188/// \param __a
2189///    A 128-bit signed [8 x i16] vector.
2190/// \param __b
2191///    A 128-bit signed [8 x i16] vector.
2192/// \returns A 128-bit signed [8 x i16] vector containing the saturated sums of
2193///    both parameters.
2194static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epi16(__m128i __a,
2195                                                            __m128i __b) {
2196  return (__m128i)__builtin_elementwise_add_sat((__v8hi)__a, (__v8hi)__b);
2197}
2198
2199/// Adds, with saturation, the corresponding elements of two 128-bit
2200///    unsigned [16 x i8] vectors, saving each sum in the corresponding element
2201///    of a 128-bit result vector of [16 x i8].
2202///
2203///    Positive sums greater than 0xFF are saturated to 0xFF. Negative sums are
2204///    saturated to 0x00.
2205///
2206/// \headerfile <x86intrin.h>
2207///
2208/// This intrinsic corresponds to the <c> VPADDUSB / PADDUSB </c> instruction.
2209///
2210/// \param __a
2211///    A 128-bit unsigned [16 x i8] vector.
2212/// \param __b
2213///    A 128-bit unsigned [16 x i8] vector.
2214/// \returns A 128-bit unsigned [16 x i8] vector containing the saturated sums
2215///    of both parameters.
2216static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epu8(__m128i __a,
2217                                                           __m128i __b) {
2218  return (__m128i)__builtin_elementwise_add_sat((__v16qu)__a, (__v16qu)__b);
2219}
2220
2221/// Adds, with saturation, the corresponding elements of two 128-bit
2222///    unsigned [8 x i16] vectors, saving each sum in the corresponding element
2223///    of a 128-bit result vector of [8 x i16].
2224///
2225///    Positive sums greater than 0xFFFF are saturated to 0xFFFF. Negative sums
2226///    are saturated to 0x0000.
2227///
2228/// \headerfile <x86intrin.h>
2229///
2230/// This intrinsic corresponds to the <c> VPADDUSB / PADDUSB </c> instruction.
2231///
2232/// \param __a
2233///    A 128-bit unsigned [8 x i16] vector.
2234/// \param __b
2235///    A 128-bit unsigned [8 x i16] vector.
2236/// \returns A 128-bit unsigned [8 x i16] vector containing the saturated sums
2237///    of both parameters.
2238static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epu16(__m128i __a,
2239                                                            __m128i __b) {
2240  return (__m128i)__builtin_elementwise_add_sat((__v8hu)__a, (__v8hu)__b);
2241}
2242
2243/// Computes the rounded averages of corresponding elements of two
2244///    128-bit unsigned [16 x i8] vectors, saving each result in the
2245///    corresponding element of a 128-bit result vector of [16 x i8].
2246///
2247/// \headerfile <x86intrin.h>
2248///
2249/// This intrinsic corresponds to the <c> VPAVGB / PAVGB </c> instruction.
2250///
2251/// \param __a
2252///    A 128-bit unsigned [16 x i8] vector.
2253/// \param __b
2254///    A 128-bit unsigned [16 x i8] vector.
2255/// \returns A 128-bit unsigned [16 x i8] vector containing the rounded
2256///    averages of both parameters.
2257static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_avg_epu8(__m128i __a,
2258                                                          __m128i __b) {
2259  return (__m128i)__builtin_ia32_pavgb128((__v16qi)__a, (__v16qi)__b);
2260}
2261
2262/// Computes the rounded averages of corresponding elements of two
2263///    128-bit unsigned [8 x i16] vectors, saving each result in the
2264///    corresponding element of a 128-bit result vector of [8 x i16].
2265///
2266/// \headerfile <x86intrin.h>
2267///
2268/// This intrinsic corresponds to the <c> VPAVGW / PAVGW </c> instruction.
2269///
2270/// \param __a
2271///    A 128-bit unsigned [8 x i16] vector.
2272/// \param __b
2273///    A 128-bit unsigned [8 x i16] vector.
2274/// \returns A 128-bit unsigned [8 x i16] vector containing the rounded
2275///    averages of both parameters.
2276static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_avg_epu16(__m128i __a,
2277                                                           __m128i __b) {
2278  return (__m128i)__builtin_ia32_pavgw128((__v8hi)__a, (__v8hi)__b);
2279}
2280
2281/// Multiplies the corresponding elements of two 128-bit signed [8 x i16]
2282///    vectors, producing eight intermediate 32-bit signed integer products, and
2283///    adds the consecutive pairs of 32-bit products to form a 128-bit signed
2284///    [4 x i32] vector.
2285///
2286///    For example, bits [15:0] of both parameters are multiplied producing a
2287///    32-bit product, bits [31:16] of both parameters are multiplied producing
2288///    a 32-bit product, and the sum of those two products becomes bits [31:0]
2289///    of the result.
2290///
2291/// \headerfile <x86intrin.h>
2292///
2293/// This intrinsic corresponds to the <c> VPMADDWD / PMADDWD </c> instruction.
2294///
2295/// \param __a
2296///    A 128-bit signed [8 x i16] vector.
2297/// \param __b
2298///    A 128-bit signed [8 x i16] vector.
2299/// \returns A 128-bit signed [4 x i32] vector containing the sums of products
2300///    of both parameters.
2301static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_madd_epi16(__m128i __a,
2302                                                            __m128i __b) {
2303  return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)__a, (__v8hi)__b);
2304}
2305
2306/// Compares corresponding elements of two 128-bit signed [8 x i16]
2307///    vectors, saving the greater value from each comparison in the
2308///    corresponding element of a 128-bit result vector of [8 x i16].
2309///
2310/// \headerfile <x86intrin.h>
2311///
2312/// This intrinsic corresponds to the <c> VPMAXSW / PMAXSW </c> instruction.
2313///
2314/// \param __a
2315///    A 128-bit signed [8 x i16] vector.
2316/// \param __b
2317///    A 128-bit signed [8 x i16] vector.
2318/// \returns A 128-bit signed [8 x i16] vector containing the greater value of
2319///    each comparison.
2320static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epi16(__m128i __a,
2321                                                           __m128i __b) {
2322  return (__m128i)__builtin_elementwise_max((__v8hi)__a, (__v8hi)__b);
2323}
2324
2325/// Compares corresponding elements of two 128-bit unsigned [16 x i8]
2326///    vectors, saving the greater value from each comparison in the
2327///    corresponding element of a 128-bit result vector of [16 x i8].
2328///
2329/// \headerfile <x86intrin.h>
2330///
2331/// This intrinsic corresponds to the <c> VPMAXUB / PMAXUB </c> instruction.
2332///
2333/// \param __a
2334///    A 128-bit unsigned [16 x i8] vector.
2335/// \param __b
2336///    A 128-bit unsigned [16 x i8] vector.
2337/// \returns A 128-bit unsigned [16 x i8] vector containing the greater value of
2338///    each comparison.
2339static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epu8(__m128i __a,
2340                                                          __m128i __b) {
2341  return (__m128i)__builtin_elementwise_max((__v16qu)__a, (__v16qu)__b);
2342}
2343
2344/// Compares corresponding elements of two 128-bit signed [8 x i16]
2345///    vectors, saving the smaller value from each comparison in the
2346///    corresponding element of a 128-bit result vector of [8 x i16].
2347///
2348/// \headerfile <x86intrin.h>
2349///
2350/// This intrinsic corresponds to the <c> VPMINSW / PMINSW </c> instruction.
2351///
2352/// \param __a
2353///    A 128-bit signed [8 x i16] vector.
2354/// \param __b
2355///    A 128-bit signed [8 x i16] vector.
2356/// \returns A 128-bit signed [8 x i16] vector containing the smaller value of
2357///    each comparison.
2358static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epi16(__m128i __a,
2359                                                           __m128i __b) {
2360  return (__m128i)__builtin_elementwise_min((__v8hi)__a, (__v8hi)__b);
2361}
2362
2363/// Compares corresponding elements of two 128-bit unsigned [16 x i8]
2364///    vectors, saving the smaller value from each comparison in the
2365///    corresponding element of a 128-bit result vector of [16 x i8].
2366///
2367/// \headerfile <x86intrin.h>
2368///
2369/// This intrinsic corresponds to the <c> VPMINUB / PMINUB </c> instruction.
2370///
2371/// \param __a
2372///    A 128-bit unsigned [16 x i8] vector.
2373/// \param __b
2374///    A 128-bit unsigned [16 x i8] vector.
2375/// \returns A 128-bit unsigned [16 x i8] vector containing the smaller value of
2376///    each comparison.
2377static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epu8(__m128i __a,
2378                                                          __m128i __b) {
2379  return (__m128i)__builtin_elementwise_min((__v16qu)__a, (__v16qu)__b);
2380}
2381
2382/// Multiplies the corresponding elements of two signed [8 x i16]
2383///    vectors, saving the upper 16 bits of each 32-bit product in the
2384///    corresponding element of a 128-bit signed [8 x i16] result vector.
2385///
2386/// \headerfile <x86intrin.h>
2387///
2388/// This intrinsic corresponds to the <c> VPMULHW / PMULHW </c> instruction.
2389///
2390/// \param __a
2391///    A 128-bit signed [8 x i16] vector.
2392/// \param __b
2393///    A 128-bit signed [8 x i16] vector.
2394/// \returns A 128-bit signed [8 x i16] vector containing the upper 16 bits of
2395///    each of the eight 32-bit products.
2396static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mulhi_epi16(__m128i __a,
2397                                                             __m128i __b) {
2398  return (__m128i)__builtin_ia32_pmulhw128((__v8hi)__a, (__v8hi)__b);
2399}
2400
2401/// Multiplies the corresponding elements of two unsigned [8 x i16]
2402///    vectors, saving the upper 16 bits of each 32-bit product in the
2403///    corresponding element of a 128-bit unsigned [8 x i16] result vector.
2404///
2405/// \headerfile <x86intrin.h>
2406///
2407/// This intrinsic corresponds to the <c> VPMULHUW / PMULHUW </c> instruction.
2408///
2409/// \param __a
2410///    A 128-bit unsigned [8 x i16] vector.
2411/// \param __b
2412///    A 128-bit unsigned [8 x i16] vector.
2413/// \returns A 128-bit unsigned [8 x i16] vector containing the upper 16 bits
2414///    of each of the eight 32-bit products.
2415static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mulhi_epu16(__m128i __a,
2416                                                             __m128i __b) {
2417  return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)__a, (__v8hi)__b);
2418}
2419
2420/// Multiplies the corresponding elements of two signed [8 x i16]
2421///    vectors, saving the lower 16 bits of each 32-bit product in the
2422///    corresponding element of a 128-bit signed [8 x i16] result vector.
2423///
2424/// \headerfile <x86intrin.h>
2425///
2426/// This intrinsic corresponds to the <c> VPMULLW / PMULLW </c> instruction.
2427///
2428/// \param __a
2429///    A 128-bit signed [8 x i16] vector.
2430/// \param __b
2431///    A 128-bit signed [8 x i16] vector.
2432/// \returns A 128-bit signed [8 x i16] vector containing the lower 16 bits of
2433///    each of the eight 32-bit products.
2434static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mullo_epi16(__m128i __a,
2435                                                             __m128i __b) {
2436  return (__m128i)((__v8hu)__a * (__v8hu)__b);
2437}
2438
2439/// Multiplies 32-bit unsigned integer values contained in the lower bits
2440///    of the two 64-bit integer vectors and returns the 64-bit unsigned
2441///    product.
2442///
2443/// \headerfile <x86intrin.h>
2444///
2445/// This intrinsic corresponds to the <c> PMULUDQ </c> instruction.
2446///
2447/// \param __a
2448///    A 64-bit integer containing one of the source operands.
2449/// \param __b
2450///    A 64-bit integer containing one of the source operands.
2451/// \returns A 64-bit integer vector containing the product of both operands.
2452static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_mul_su32(__m64 __a, __m64 __b) {
2453  return __trunc64(__builtin_ia32_pmuludq128((__v4si)__anyext128(__a),
2454                                             (__v4si)__anyext128(__b)));
2455}
2456
2457/// Multiplies 32-bit unsigned integer values contained in the lower
2458///    bits of the corresponding elements of two [2 x i64] vectors, and returns
2459///    the 64-bit products in the corresponding elements of a [2 x i64] vector.
2460///
2461/// \headerfile <x86intrin.h>
2462///
2463/// This intrinsic corresponds to the <c> VPMULUDQ / PMULUDQ </c> instruction.
2464///
2465/// \param __a
2466///    A [2 x i64] vector containing one of the source operands.
2467/// \param __b
2468///    A [2 x i64] vector containing one of the source operands.
2469/// \returns A [2 x i64] vector containing the product of both operands.
2470static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mul_epu32(__m128i __a,
2471                                                           __m128i __b) {
2472  return __builtin_ia32_pmuludq128((__v4si)__a, (__v4si)__b);
2473}
2474
2475/// Computes the absolute differences of corresponding 8-bit integer
2476///    values in two 128-bit vectors. Sums the first 8 absolute differences, and
2477///    separately sums the second 8 absolute differences. Packs these two
2478///    unsigned 16-bit integer sums into the upper and lower elements of a
2479///    [2 x i64] vector.
2480///
2481/// \headerfile <x86intrin.h>
2482///
2483/// This intrinsic corresponds to the <c> VPSADBW / PSADBW </c> instruction.
2484///
2485/// \param __a
2486///    A 128-bit integer vector containing one of the source operands.
2487/// \param __b
2488///    A 128-bit integer vector containing one of the source operands.
2489/// \returns A [2 x i64] vector containing the sums of the sets of absolute
2490///    differences between both operands.
2491static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sad_epu8(__m128i __a,
2492                                                          __m128i __b) {
2493  return __builtin_ia32_psadbw128((__v16qi)__a, (__v16qi)__b);
2494}
2495
2496/// Subtracts the corresponding 8-bit integer values in the operands.
2497///
2498/// \headerfile <x86intrin.h>
2499///
2500/// This intrinsic corresponds to the <c> VPSUBB / PSUBB </c> instruction.
2501///
2502/// \param __a
2503///    A 128-bit integer vector containing the minuends.
2504/// \param __b
2505///    A 128-bit integer vector containing the subtrahends.
2506/// \returns A 128-bit integer vector containing the differences of the values
2507///    in the operands.
2508static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi8(__m128i __a,
2509                                                          __m128i __b) {
2510  return (__m128i)((__v16qu)__a - (__v16qu)__b);
2511}
2512
2513/// Subtracts the corresponding 16-bit integer values in the operands.
2514///
2515/// \headerfile <x86intrin.h>
2516///
2517/// This intrinsic corresponds to the <c> VPSUBW / PSUBW </c> instruction.
2518///
2519/// \param __a
2520///    A 128-bit integer vector containing the minuends.
2521/// \param __b
2522///    A 128-bit integer vector containing the subtrahends.
2523/// \returns A 128-bit integer vector containing the differences of the values
2524///    in the operands.
2525static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi16(__m128i __a,
2526                                                           __m128i __b) {
2527  return (__m128i)((__v8hu)__a - (__v8hu)__b);
2528}
2529
2530/// Subtracts the corresponding 32-bit integer values in the operands.
2531///
2532/// \headerfile <x86intrin.h>
2533///
2534/// This intrinsic corresponds to the <c> VPSUBD / PSUBD </c> instruction.
2535///
2536/// \param __a
2537///    A 128-bit integer vector containing the minuends.
2538/// \param __b
2539///    A 128-bit integer vector containing the subtrahends.
2540/// \returns A 128-bit integer vector containing the differences of the values
2541///    in the operands.
2542static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
2543_mm_sub_epi32(__m128i __a, __m128i __b) {
2544  return (__m128i)((__v4su)__a - (__v4su)__b);
2545}
2546
2547/// Subtracts signed or unsigned 64-bit integer values and writes the
2548///    difference to the corresponding bits in the destination.
2549///
2550/// \headerfile <x86intrin.h>
2551///
2552/// This intrinsic corresponds to the <c> PSUBQ </c> instruction.
2553///
2554/// \param __a
2555///    A 64-bit integer vector containing the minuend.
2556/// \param __b
2557///    A 64-bit integer vector containing the subtrahend.
2558/// \returns A 64-bit integer vector containing the difference of the values in
2559///    the operands.
2560static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_sub_si64(__m64 __a, __m64 __b) {
2561  return (__m64)((unsigned long long)__a - (unsigned long long)__b);
2562}
2563
2564/// Subtracts the corresponding elements of two [2 x i64] vectors.
2565///
2566/// \headerfile <x86intrin.h>
2567///
2568/// This intrinsic corresponds to the <c> VPSUBQ / PSUBQ </c> instruction.
2569///
2570/// \param __a
2571///    A 128-bit integer vector containing the minuends.
2572/// \param __b
2573///    A 128-bit integer vector containing the subtrahends.
2574/// \returns A 128-bit integer vector containing the differences of the values
2575///    in the operands.
2576static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
2577_mm_sub_epi64(__m128i __a, __m128i __b) {
2578  return (__m128i)((__v2du)__a - (__v2du)__b);
2579}
2580
2581/// Subtracts, with saturation, corresponding 8-bit signed integer values in
2582///    the input and returns the differences in the corresponding bytes in the
2583///    destination.
2584///
2585///    Differences greater than 0x7F are saturated to 0x7F, and differences
2586///    less than 0x80 are saturated to 0x80.
2587///
2588/// \headerfile <x86intrin.h>
2589///
2590/// This intrinsic corresponds to the <c> VPSUBSB / PSUBSB </c> instruction.
2591///
2592/// \param __a
2593///    A 128-bit integer vector containing the minuends.
2594/// \param __b
2595///    A 128-bit integer vector containing the subtrahends.
2596/// \returns A 128-bit integer vector containing the differences of the values
2597///    in the operands.
2598static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epi8(__m128i __a,
2599                                                           __m128i __b) {
2600  return (__m128i)__builtin_elementwise_sub_sat((__v16qs)__a, (__v16qs)__b);
2601}
2602
2603/// Subtracts, with saturation, corresponding 16-bit signed integer values in
2604///    the input and returns the differences in the corresponding bytes in the
2605///    destination.
2606///
2607///    Differences greater than 0x7FFF are saturated to 0x7FFF, and values less
2608///    than 0x8000 are saturated to 0x8000.
2609///
2610/// \headerfile <x86intrin.h>
2611///
2612/// This intrinsic corresponds to the <c> VPSUBSW / PSUBSW </c> instruction.
2613///
2614/// \param __a
2615///    A 128-bit integer vector containing the minuends.
2616/// \param __b
2617///    A 128-bit integer vector containing the subtrahends.
2618/// \returns A 128-bit integer vector containing the differences of the values
2619///    in the operands.
2620static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epi16(__m128i __a,
2621                                                            __m128i __b) {
2622  return (__m128i)__builtin_elementwise_sub_sat((__v8hi)__a, (__v8hi)__b);
2623}
2624
2625/// Subtracts, with saturation, corresponding 8-bit unsigned integer values in
2626///    the input and returns the differences in the corresponding bytes in the
2627///    destination.
2628///
2629///    Differences less than 0x00 are saturated to 0x00.
2630///
2631/// \headerfile <x86intrin.h>
2632///
2633/// This intrinsic corresponds to the <c> VPSUBUSB / PSUBUSB </c> instruction.
2634///
2635/// \param __a
2636///    A 128-bit integer vector containing the minuends.
2637/// \param __b
2638///    A 128-bit integer vector containing the subtrahends.
2639/// \returns A 128-bit integer vector containing the unsigned integer
2640///    differences of the values in the operands.
2641static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epu8(__m128i __a,
2642                                                           __m128i __b) {
2643  return (__m128i)__builtin_elementwise_sub_sat((__v16qu)__a, (__v16qu)__b);
2644}
2645
2646/// Subtracts, with saturation, corresponding 16-bit unsigned integer values in
2647///    the input and returns the differences in the corresponding bytes in the
2648///    destination.
2649///
2650///    Differences less than 0x0000 are saturated to 0x0000.
2651///
2652/// \headerfile <x86intrin.h>
2653///
2654/// This intrinsic corresponds to the <c> VPSUBUSW / PSUBUSW </c> instruction.
2655///
2656/// \param __a
2657///    A 128-bit integer vector containing the minuends.
2658/// \param __b
2659///    A 128-bit integer vector containing the subtrahends.
2660/// \returns A 128-bit integer vector containing the unsigned integer
2661///    differences of the values in the operands.
2662static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epu16(__m128i __a,
2663                                                            __m128i __b) {
2664  return (__m128i)__builtin_elementwise_sub_sat((__v8hu)__a, (__v8hu)__b);
2665}
2666
2667/// Performs a bitwise AND of two 128-bit integer vectors.
2668///
2669/// \headerfile <x86intrin.h>
2670///
2671/// This intrinsic corresponds to the <c> VPAND / PAND </c> instruction.
2672///
2673/// \param __a
2674///    A 128-bit integer vector containing one of the source operands.
2675/// \param __b
2676///    A 128-bit integer vector containing one of the source operands.
2677/// \returns A 128-bit integer vector containing the bitwise AND of the values
2678///    in both operands.
2679static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_and_si128(__m128i __a,
2680                                                           __m128i __b) {
2681  return (__m128i)((__v2du)__a & (__v2du)__b);
2682}
2683
2684/// Performs a bitwise AND of two 128-bit integer vectors, using the
2685///    one's complement of the values contained in the first source operand.
2686///
2687/// \headerfile <x86intrin.h>
2688///
2689/// This intrinsic corresponds to the <c> VPANDN / PANDN </c> instruction.
2690///
2691/// \param __a
2692///    A 128-bit vector containing the left source operand. The one's complement
2693///    of this value is used in the bitwise AND.
2694/// \param __b
2695///    A 128-bit vector containing the right source operand.
2696/// \returns A 128-bit integer vector containing the bitwise AND of the one's
2697///    complement of the first operand and the values in the second operand.
2698static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_andnot_si128(__m128i __a,
2699                                                              __m128i __b) {
2700  return (__m128i)(~(__v2du)__a & (__v2du)__b);
2701}
2702/// Performs a bitwise OR of two 128-bit integer vectors.
2703///
2704/// \headerfile <x86intrin.h>
2705///
2706/// This intrinsic corresponds to the <c> VPOR / POR </c> instruction.
2707///
2708/// \param __a
2709///    A 128-bit integer vector containing one of the source operands.
2710/// \param __b
2711///    A 128-bit integer vector containing one of the source operands.
2712/// \returns A 128-bit integer vector containing the bitwise OR of the values
2713///    in both operands.
2714static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_or_si128(__m128i __a,
2715                                                          __m128i __b) {
2716  return (__m128i)((__v2du)__a | (__v2du)__b);
2717}
2718
2719/// Performs a bitwise exclusive OR of two 128-bit integer vectors.
2720///
2721/// \headerfile <x86intrin.h>
2722///
2723/// This intrinsic corresponds to the <c> VPXOR / PXOR </c> instruction.
2724///
2725/// \param __a
2726///    A 128-bit integer vector containing one of the source operands.
2727/// \param __b
2728///    A 128-bit integer vector containing one of the source operands.
2729/// \returns A 128-bit integer vector containing the bitwise exclusive OR of the
2730///    values in both operands.
2731static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_xor_si128(__m128i __a,
2732                                                           __m128i __b) {
2733  return (__m128i)((__v2du)__a ^ (__v2du)__b);
2734}
2735
/// Shifts the 128-bit integer vector operand to the left by the given
///    number of bytes. The low-order bits are cleared.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_slli_si128(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPSLLDQ / PSLLDQ </c> instruction.
///
/// \param a
///    A 128-bit integer vector containing the source operand.
/// \param imm
///    An immediate value giving the number of bytes by which operand \a a is
///    shifted to the left.
/// \returns A 128-bit integer vector holding the left-shifted value.
#define _mm_slli_si128(a, imm)                                                 \
  ((__m128i)__builtin_ia32_pslldqi128_byteshift(                               \
      (__v2di)(__m128i)(a), (int)(imm)))
2756
/// Shifts the 128-bit integer vector operand to the left by the given
///    number of bytes. This macro expands to the same builtin call, with the
///    same arguments, as \c _mm_slli_si128.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_bslli_si128(__m128i a, const int imm);
/// \endcode
///
/// \param a
///    A 128-bit integer vector containing the source operand.
/// \param imm
///    An immediate value giving the number of bytes by which operand \a a is
///    shifted to the left.
/// \returns A 128-bit integer vector holding the left-shifted value.
#define _mm_bslli_si128(a, imm)                                                \
  ((__m128i)__builtin_ia32_pslldqi128_byteshift(                               \
      (__v2di)(__m128i)(a), (int)(imm)))
2760
2761/// Left-shifts each 16-bit value in the 128-bit integer vector operand
2762///    by the specified number of bits. Low-order bits are cleared.
2763///
2764/// \headerfile <x86intrin.h>
2765///
2766/// This intrinsic corresponds to the <c> VPSLLW / PSLLW </c> instruction.
2767///
2768/// \param __a
2769///    A 128-bit integer vector containing the source operand.
2770/// \param __count
2771///    An integer value specifying the number of bits to left-shift each value
2772///    in operand \a __a.
2773/// \returns A 128-bit integer vector containing the left-shifted values.
2774static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi16(__m128i __a,
2775                                                            int __count) {
2776  return (__m128i)__builtin_ia32_psllwi128((__v8hi)__a, __count);
2777}
2778
2779/// Left-shifts each 16-bit value in the 128-bit integer vector operand
2780///    by the specified number of bits. Low-order bits are cleared.
2781///
2782/// \headerfile <x86intrin.h>
2783///
2784/// This intrinsic corresponds to the <c> VPSLLW / PSLLW </c> instruction.
2785///
2786/// \param __a
2787///    A 128-bit integer vector containing the source operand.
2788/// \param __count
2789///    A 128-bit integer vector in which bits [63:0] specify the number of bits
2790///    to left-shift each value in operand \a __a.
2791/// \returns A 128-bit integer vector containing the left-shifted values.
2792static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi16(__m128i __a,
2793                                                           __m128i __count) {
2794  return (__m128i)__builtin_ia32_psllw128((__v8hi)__a, (__v8hi)__count);
2795}
2796
2797/// Left-shifts each 32-bit value in the 128-bit integer vector operand
2798///    by the specified number of bits. Low-order bits are cleared.
2799///
2800/// \headerfile <x86intrin.h>
2801///
2802/// This intrinsic corresponds to the <c> VPSLLD / PSLLD </c> instruction.
2803///
2804/// \param __a
2805///    A 128-bit integer vector containing the source operand.
2806/// \param __count
2807///    An integer value specifying the number of bits to left-shift each value
2808///    in operand \a __a.
2809/// \returns A 128-bit integer vector containing the left-shifted values.
2810static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi32(__m128i __a,
2811                                                            int __count) {
2812  return (__m128i)__builtin_ia32_pslldi128((__v4si)__a, __count);
2813}
2814
2815/// Left-shifts each 32-bit value in the 128-bit integer vector operand
2816///    by the specified number of bits. Low-order bits are cleared.
2817///
2818/// \headerfile <x86intrin.h>
2819///
2820/// This intrinsic corresponds to the <c> VPSLLD / PSLLD </c> instruction.
2821///
2822/// \param __a
2823///    A 128-bit integer vector containing the source operand.
2824/// \param __count
2825///    A 128-bit integer vector in which bits [63:0] specify the number of bits
2826///    to left-shift each value in operand \a __a.
2827/// \returns A 128-bit integer vector containing the left-shifted values.
2828static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi32(__m128i __a,
2829                                                           __m128i __count) {
2830  return (__m128i)__builtin_ia32_pslld128((__v4si)__a, (__v4si)__count);
2831}
2832
2833/// Left-shifts each 64-bit value in the 128-bit integer vector operand
2834///    by the specified number of bits. Low-order bits are cleared.
2835///
2836/// \headerfile <x86intrin.h>
2837///
2838/// This intrinsic corresponds to the <c> VPSLLQ / PSLLQ </c> instruction.
2839///
2840/// \param __a
2841///    A 128-bit integer vector containing the source operand.
2842/// \param __count
2843///    An integer value specifying the number of bits to left-shift each value
2844///    in operand \a __a.
2845/// \returns A 128-bit integer vector containing the left-shifted values.
2846static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi64(__m128i __a,
2847                                                            int __count) {
2848  return __builtin_ia32_psllqi128((__v2di)__a, __count);
2849}
2850
2851/// Left-shifts each 64-bit value in the 128-bit integer vector operand
2852///    by the specified number of bits. Low-order bits are cleared.
2853///
2854/// \headerfile <x86intrin.h>
2855///
2856/// This intrinsic corresponds to the <c> VPSLLQ / PSLLQ </c> instruction.
2857///
2858/// \param __a
2859///    A 128-bit integer vector containing the source operand.
2860/// \param __count
2861///    A 128-bit integer vector in which bits [63:0] specify the number of bits
2862///    to left-shift each value in operand \a __a.
2863/// \returns A 128-bit integer vector containing the left-shifted values.
2864static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi64(__m128i __a,
2865                                                           __m128i __count) {
2866  return __builtin_ia32_psllq128((__v2di)__a, (__v2di)__count);
2867}
2868
2869/// Right-shifts each 16-bit value in the 128-bit integer vector operand
2870///    by the specified number of bits. High-order bits are filled with the sign
2871///    bit of the initial value.
2872///
2873/// \headerfile <x86intrin.h>
2874///
2875/// This intrinsic corresponds to the <c> VPSRAW / PSRAW </c> instruction.
2876///
2877/// \param __a
2878///    A 128-bit integer vector containing the source operand.
2879/// \param __count
2880///    An integer value specifying the number of bits to right-shift each value
2881///    in operand \a __a.
2882/// \returns A 128-bit integer vector containing the right-shifted values.
2883static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srai_epi16(__m128i __a,
2884                                                            int __count) {
2885  return (__m128i)__builtin_ia32_psrawi128((__v8hi)__a, __count);
2886}
2887
2888/// Right-shifts each 16-bit value in the 128-bit integer vector operand
2889///    by the specified number of bits. High-order bits are filled with the sign
2890///    bit of the initial value.
2891///
2892/// \headerfile <x86intrin.h>
2893///
2894/// This intrinsic corresponds to the <c> VPSRAW / PSRAW </c> instruction.
2895///
2896/// \param __a
2897///    A 128-bit integer vector containing the source operand.
2898/// \param __count
2899///    A 128-bit integer vector in which bits [63:0] specify the number of bits
2900///    to right-shift each value in operand \a __a.
2901/// \returns A 128-bit integer vector containing the right-shifted values.
2902static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sra_epi16(__m128i __a,
2903                                                           __m128i __count) {
2904  return (__m128i)__builtin_ia32_psraw128((__v8hi)__a, (__v8hi)__count);
2905}
2906
2907/// Right-shifts each 32-bit value in the 128-bit integer vector operand
2908///    by the specified number of bits. High-order bits are filled with the sign
2909///    bit of the initial value.
2910///
2911/// \headerfile <x86intrin.h>
2912///
2913/// This intrinsic corresponds to the <c> VPSRAD / PSRAD </c> instruction.
2914///
2915/// \param __a
2916///    A 128-bit integer vector containing the source operand.
2917/// \param __count
2918///    An integer value specifying the number of bits to right-shift each value
2919///    in operand \a __a.
2920/// \returns A 128-bit integer vector containing the right-shifted values.
2921static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srai_epi32(__m128i __a,
2922                                                            int __count) {
2923  return (__m128i)__builtin_ia32_psradi128((__v4si)__a, __count);
2924}
2925
2926/// Right-shifts each 32-bit value in the 128-bit integer vector operand
2927///    by the specified number of bits. High-order bits are filled with the sign
2928///    bit of the initial value.
2929///
2930/// \headerfile <x86intrin.h>
2931///
2932/// This intrinsic corresponds to the <c> VPSRAD / PSRAD </c> instruction.
2933///
2934/// \param __a
2935///    A 128-bit integer vector containing the source operand.
2936/// \param __count
2937///    A 128-bit integer vector in which bits [63:0] specify the number of bits
2938///    to right-shift each value in operand \a __a.
2939/// \returns A 128-bit integer vector containing the right-shifted values.
2940static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sra_epi32(__m128i __a,
2941                                                           __m128i __count) {
2942  return (__m128i)__builtin_ia32_psrad128((__v4si)__a, (__v4si)__count);
2943}
2944
/// Shifts the whole 128-bit integer vector operand to the right by the
///    given number of bytes. High-order bits are cleared.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_srli_si128(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPSRLDQ / PSRLDQ </c> instruction.
///
/// \param a
///    A 128-bit integer vector containing the source operand.
/// \param imm
///    An immediate value specifying the number of bytes to right-shift operand
///    \a a.
/// \returns A 128-bit integer vector containing the right-shifted value.
#define _mm_srli_si128(a, imm)                                                 \
  ((__m128i)__builtin_ia32_psrldqi128_byteshift(                               \
      (__v2di)(__m128i)(a), (int)(imm)))
2965
/// Right-shifts the 128-bit integer vector operand by the specified
///    number of bytes. This is an alternate name for _mm_srli_si128 and
///    expands to the same operation.
///
/// \code
/// __m128i _mm_bsrli_si128(__m128i a, const int imm);
/// \endcode
///
/// \param a
///    A 128-bit integer vector containing the source operand.
/// \param imm
///    An immediate value specifying the number of bytes to right-shift operand
///    \a a.
/// \returns A 128-bit integer vector containing the right-shifted value.
#define _mm_bsrli_si128(a, imm) _mm_srli_si128((a), (imm))
2969
/// Right-shifts each of the 16-bit values in the 128-bit integer vector
2971///    operand by the specified number of bits. High-order bits are cleared.
2972///
2973/// \headerfile <x86intrin.h>
2974///
2975/// This intrinsic corresponds to the <c> VPSRLW / PSRLW </c> instruction.
2976///
2977/// \param __a
2978///    A 128-bit integer vector containing the source operand.
2979/// \param __count
2980///    An integer value specifying the number of bits to right-shift each value
2981///    in operand \a __a.
2982/// \returns A 128-bit integer vector containing the right-shifted values.
2983static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi16(__m128i __a,
2984                                                            int __count) {
2985  return (__m128i)__builtin_ia32_psrlwi128((__v8hi)__a, __count);
2986}
2987
/// Right-shifts each of the 16-bit values in the 128-bit integer vector
2989///    operand by the specified number of bits. High-order bits are cleared.
2990///
2991/// \headerfile <x86intrin.h>
2992///
2993/// This intrinsic corresponds to the <c> VPSRLW / PSRLW </c> instruction.
2994///
2995/// \param __a
2996///    A 128-bit integer vector containing the source operand.
2997/// \param __count
2998///    A 128-bit integer vector in which bits [63:0] specify the number of bits
2999///    to right-shift each value in operand \a __a.
3000/// \returns A 128-bit integer vector containing the right-shifted values.
3001static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi16(__m128i __a,
3002                                                           __m128i __count) {
3003  return (__m128i)__builtin_ia32_psrlw128((__v8hi)__a, (__v8hi)__count);
3004}
3005
/// Right-shifts each of the 32-bit values in the 128-bit integer vector
3007///    operand by the specified number of bits. High-order bits are cleared.
3008///
3009/// \headerfile <x86intrin.h>
3010///
3011/// This intrinsic corresponds to the <c> VPSRLD / PSRLD </c> instruction.
3012///
3013/// \param __a
3014///    A 128-bit integer vector containing the source operand.
3015/// \param __count
3016///    An integer value specifying the number of bits to right-shift each value
3017///    in operand \a __a.
3018/// \returns A 128-bit integer vector containing the right-shifted values.
3019static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi32(__m128i __a,
3020                                                            int __count) {
3021  return (__m128i)__builtin_ia32_psrldi128((__v4si)__a, __count);
3022}
3023
/// Right-shifts each of the 32-bit values in the 128-bit integer vector
3025///    operand by the specified number of bits. High-order bits are cleared.
3026///
3027/// \headerfile <x86intrin.h>
3028///
3029/// This intrinsic corresponds to the <c> VPSRLD / PSRLD </c> instruction.
3030///
3031/// \param __a
3032///    A 128-bit integer vector containing the source operand.
3033/// \param __count
3034///    A 128-bit integer vector in which bits [63:0] specify the number of bits
3035///    to right-shift each value in operand \a __a.
3036/// \returns A 128-bit integer vector containing the right-shifted values.
3037static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi32(__m128i __a,
3038                                                           __m128i __count) {
3039  return (__m128i)__builtin_ia32_psrld128((__v4si)__a, (__v4si)__count);
3040}
3041
/// Right-shifts each of the 64-bit values in the 128-bit integer vector
3043///    operand by the specified number of bits. High-order bits are cleared.
3044///
3045/// \headerfile <x86intrin.h>
3046///
3047/// This intrinsic corresponds to the <c> VPSRLQ / PSRLQ </c> instruction.
3048///
3049/// \param __a
3050///    A 128-bit integer vector containing the source operand.
3051/// \param __count
3052///    An integer value specifying the number of bits to right-shift each value
3053///    in operand \a __a.
3054/// \returns A 128-bit integer vector containing the right-shifted values.
3055static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi64(__m128i __a,
3056                                                            int __count) {
3057  return __builtin_ia32_psrlqi128((__v2di)__a, __count);
3058}
3059
/// Right-shifts each of the 64-bit values in the 128-bit integer vector
3061///    operand by the specified number of bits. High-order bits are cleared.
3062///
3063/// \headerfile <x86intrin.h>
3064///
3065/// This intrinsic corresponds to the <c> VPSRLQ / PSRLQ </c> instruction.
3066///
3067/// \param __a
3068///    A 128-bit integer vector containing the source operand.
3069/// \param __count
3070///    A 128-bit integer vector in which bits [63:0] specify the number of bits
3071///    to right-shift each value in operand \a __a.
3072/// \returns A 128-bit integer vector containing the right-shifted values.
3073static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi64(__m128i __a,
3074                                                           __m128i __count) {
3075  return __builtin_ia32_psrlq128((__v2di)__a, (__v2di)__count);
3076}
3077
3078/// Compares each of the corresponding 8-bit values of the 128-bit
3079///    integer vectors for equality.
3080///
3081///    Each comparison returns 0x0 for false, 0xFF for true.
3082///
3083/// \headerfile <x86intrin.h>
3084///
3085/// This intrinsic corresponds to the <c> VPCMPEQB / PCMPEQB </c> instruction.
3086///
3087/// \param __a
3088///    A 128-bit integer vector.
3089/// \param __b
3090///    A 128-bit integer vector.
3091/// \returns A 128-bit integer vector containing the comparison results.
3092static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi8(__m128i __a,
3093                                                            __m128i __b) {
3094  return (__m128i)((__v16qi)__a == (__v16qi)__b);
3095}
3096
3097/// Compares each of the corresponding 16-bit values of the 128-bit
3098///    integer vectors for equality.
3099///
3100///    Each comparison returns 0x0 for false, 0xFFFF for true.
3101///
3102/// \headerfile <x86intrin.h>
3103///
3104/// This intrinsic corresponds to the <c> VPCMPEQW / PCMPEQW </c> instruction.
3105///
3106/// \param __a
3107///    A 128-bit integer vector.
3108/// \param __b
3109///    A 128-bit integer vector.
3110/// \returns A 128-bit integer vector containing the comparison results.
3111static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi16(__m128i __a,
3112                                                             __m128i __b) {
3113  return (__m128i)((__v8hi)__a == (__v8hi)__b);
3114}
3115
3116/// Compares each of the corresponding 32-bit values of the 128-bit
3117///    integer vectors for equality.
3118///
3119///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
3120///
3121/// \headerfile <x86intrin.h>
3122///
3123/// This intrinsic corresponds to the <c> VPCMPEQD / PCMPEQD </c> instruction.
3124///
3125/// \param __a
3126///    A 128-bit integer vector.
3127/// \param __b
3128///    A 128-bit integer vector.
3129/// \returns A 128-bit integer vector containing the comparison results.
3130static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi32(__m128i __a,
3131                                                             __m128i __b) {
3132  return (__m128i)((__v4si)__a == (__v4si)__b);
3133}
3134
3135/// Compares each of the corresponding signed 8-bit values of the 128-bit
3136///    integer vectors to determine if the values in the first operand are
3137///    greater than those in the second operand.
3138///
3139///    Each comparison returns 0x0 for false, 0xFF for true.
3140///
3141/// \headerfile <x86intrin.h>
3142///
3143/// This intrinsic corresponds to the <c> VPCMPGTB / PCMPGTB </c> instruction.
3144///
3145/// \param __a
3146///    A 128-bit integer vector.
3147/// \param __b
3148///    A 128-bit integer vector.
3149/// \returns A 128-bit integer vector containing the comparison results.
3150static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi8(__m128i __a,
3151                                                            __m128i __b) {
3152  /* This function always performs a signed comparison, but __v16qi is a char
3153     which may be signed or unsigned, so use __v16qs. */
3154  return (__m128i)((__v16qs)__a > (__v16qs)__b);
3155}
3156
3157/// Compares each of the corresponding signed 16-bit values of the
3158///    128-bit integer vectors to determine if the values in the first operand
3159///    are greater than those in the second operand.
3160///
3161///    Each comparison returns 0x0 for false, 0xFFFF for true.
3162///
3163/// \headerfile <x86intrin.h>
3164///
3165/// This intrinsic corresponds to the <c> VPCMPGTW / PCMPGTW </c> instruction.
3166///
3167/// \param __a
3168///    A 128-bit integer vector.
3169/// \param __b
3170///    A 128-bit integer vector.
3171/// \returns A 128-bit integer vector containing the comparison results.
3172static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi16(__m128i __a,
3173                                                             __m128i __b) {
3174  return (__m128i)((__v8hi)__a > (__v8hi)__b);
3175}
3176
3177/// Compares each of the corresponding signed 32-bit values of the
3178///    128-bit integer vectors to determine if the values in the first operand
3179///    are greater than those in the second operand.
3180///
3181///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
3182///
3183/// \headerfile <x86intrin.h>
3184///
3185/// This intrinsic corresponds to the <c> VPCMPGTD / PCMPGTD </c> instruction.
3186///
3187/// \param __a
3188///    A 128-bit integer vector.
3189/// \param __b
3190///    A 128-bit integer vector.
3191/// \returns A 128-bit integer vector containing the comparison results.
3192static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi32(__m128i __a,
3193                                                             __m128i __b) {
3194  return (__m128i)((__v4si)__a > (__v4si)__b);
3195}
3196
3197/// Compares each of the corresponding signed 8-bit values of the 128-bit
3198///    integer vectors to determine if the values in the first operand are less
3199///    than those in the second operand.
3200///
3201///    Each comparison returns 0x0 for false, 0xFF for true.
3202///
3203/// \headerfile <x86intrin.h>
3204///
3205/// This intrinsic corresponds to the <c> VPCMPGTB / PCMPGTB </c> instruction.
3206///
3207/// \param __a
3208///    A 128-bit integer vector.
3209/// \param __b
3210///    A 128-bit integer vector.
3211/// \returns A 128-bit integer vector containing the comparison results.
3212static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi8(__m128i __a,
3213                                                            __m128i __b) {
3214  return _mm_cmpgt_epi8(__b, __a);
3215}
3216
3217/// Compares each of the corresponding signed 16-bit values of the
3218///    128-bit integer vectors to determine if the values in the first operand
3219///    are less than those in the second operand.
3220///
3221///    Each comparison returns 0x0 for false, 0xFFFF for true.
3222///
3223/// \headerfile <x86intrin.h>
3224///
3225/// This intrinsic corresponds to the <c> VPCMPGTW / PCMPGTW </c> instruction.
3226///
3227/// \param __a
3228///    A 128-bit integer vector.
3229/// \param __b
3230///    A 128-bit integer vector.
3231/// \returns A 128-bit integer vector containing the comparison results.
3232static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi16(__m128i __a,
3233                                                             __m128i __b) {
3234  return _mm_cmpgt_epi16(__b, __a);
3235}
3236
3237/// Compares each of the corresponding signed 32-bit values of the
3238///    128-bit integer vectors to determine if the values in the first operand
3239///    are less than those in the second operand.
3240///
3241///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
3242///
3243/// \headerfile <x86intrin.h>
3244///
3245/// This intrinsic corresponds to the <c> VPCMPGTD / PCMPGTD </c> instruction.
3246///
3247/// \param __a
3248///    A 128-bit integer vector.
3249/// \param __b
3250///    A 128-bit integer vector.
3251/// \returns A 128-bit integer vector containing the comparison results.
3252static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi32(__m128i __a,
3253                                                             __m128i __b) {
3254  return _mm_cmpgt_epi32(__b, __a);
3255}
3256
3257#ifdef __x86_64__
3258/// Converts a 64-bit signed integer value from the second operand into a
3259///    double-precision value and returns it in the lower element of a [2 x
3260///    double] vector; the upper element of the returned vector is copied from
3261///    the upper element of the first operand.
3262///
3263/// \headerfile <x86intrin.h>
3264///
3265/// This intrinsic corresponds to the <c> VCVTSI2SD / CVTSI2SD </c> instruction.
3266///
3267/// \param __a
3268///    A 128-bit vector of [2 x double]. The upper 64 bits of this operand are
3269///    copied to the upper 64 bits of the destination.
3270/// \param __b
3271///    A 64-bit signed integer operand containing the value to be converted.
3272/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
3273///    converted value of the second operand. The upper 64 bits are copied from
3274///    the upper 64 bits of the first operand.
3275static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR
3276_mm_cvtsi64_sd(__m128d __a, long long __b) {
3277  __a[0] = __b;
3278  return __a;
3279}
3280
3281/// Converts the first (lower) element of a vector of [2 x double] into a
3282///    64-bit signed integer value.
3283///
3284///    If the converted value does not fit in a 64-bit integer, raises a
3285///    floating-point invalid exception. If the exception is masked, returns
3286///    the most negative integer.
3287///
3288/// \headerfile <x86intrin.h>
3289///
3290/// This intrinsic corresponds to the <c> VCVTSD2SI / CVTSD2SI </c> instruction.
3291///
3292/// \param __a
3293///    A 128-bit vector of [2 x double]. The lower 64 bits are used in the
3294///    conversion.
3295/// \returns A 64-bit signed integer containing the converted value.
3296static __inline__ long long __DEFAULT_FN_ATTRS _mm_cvtsd_si64(__m128d __a) {
3297  return __builtin_ia32_cvtsd2si64((__v2df)__a);
3298}
3299
3300/// Converts the first (lower) element of a vector of [2 x double] into a
3301///    64-bit signed truncated (rounded toward zero) integer value.
3302///
3303///    If a converted value does not fit in a 64-bit integer, raises a
3304///    floating-point invalid exception. If the exception is masked, returns
3305///    the most negative integer.
3306///
3307/// \headerfile <x86intrin.h>
3308///
3309/// This intrinsic corresponds to the <c> VCVTTSD2SI / CVTTSD2SI </c>
3310///   instruction.
3311///
3312/// \param __a
3313///    A 128-bit vector of [2 x double]. The lower 64 bits are used in the
3314///    conversion.
3315/// \returns A 64-bit signed integer containing the converted value.
3316static __inline__ long long __DEFAULT_FN_ATTRS _mm_cvttsd_si64(__m128d __a) {
3317  return __builtin_ia32_cvttsd2si64((__v2df)__a);
3318}
3319#endif
3320
3321/// Converts a vector of [4 x i32] into a vector of [4 x float].
3322///
3323/// \headerfile <x86intrin.h>
3324///
3325/// This intrinsic corresponds to the <c> VCVTDQ2PS / CVTDQ2PS </c> instruction.
3326///
3327/// \param __a
3328///    A 128-bit integer vector.
3329/// \returns A 128-bit vector of [4 x float] containing the converted values.
3330static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
3331_mm_cvtepi32_ps(__m128i __a) {
3332  return (__m128) __builtin_convertvector((__v4si)__a, __v4sf);
3333}
3334
3335/// Converts a vector of [4 x float] into a vector of [4 x i32].
3336///
3337///    If a converted value does not fit in a 32-bit integer, raises a
3338///    floating-point invalid exception. If the exception is masked, returns
3339///    the most negative integer.
3340///
3341/// \headerfile <x86intrin.h>
3342///
3343/// This intrinsic corresponds to the <c> VCVTPS2DQ / CVTPS2DQ </c> instruction.
3344///
3345/// \param __a
3346///    A 128-bit vector of [4 x float].
3347/// \returns A 128-bit integer vector of [4 x i32] containing the converted
3348///    values.
3349static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtps_epi32(__m128 __a) {
3350  return (__m128i)__builtin_ia32_cvtps2dq((__v4sf)__a);
3351}
3352
3353/// Converts a vector of [4 x float] into four signed truncated (rounded toward
3354///    zero) 32-bit integers, returned in a vector of [4 x i32].
3355///
3356///    If a converted value does not fit in a 32-bit integer, raises a
3357///    floating-point invalid exception. If the exception is masked, returns
3358///    the most negative integer.
3359///
3360/// \headerfile <x86intrin.h>
3361///
3362/// This intrinsic corresponds to the <c> VCVTTPS2DQ / CVTTPS2DQ </c>
3363///   instruction.
3364///
3365/// \param __a
3366///    A 128-bit vector of [4 x float].
3367/// \returns A 128-bit vector of [4 x i32] containing the converted values.
3368static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvttps_epi32(__m128 __a) {
3369  return (__m128i)__builtin_ia32_cvttps2dq((__v4sf)__a);
3370}
3371
3372/// Returns a vector of [4 x i32] where the lowest element is the input
3373///    operand and the remaining elements are zero.
3374///
3375/// \headerfile <x86intrin.h>
3376///
3377/// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
3378///
3379/// \param __a
3380///    A 32-bit signed integer operand.
3381/// \returns A 128-bit vector of [4 x i32].
3382static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtsi32_si128(int __a) {
3383  return __extension__(__m128i)(__v4si){__a, 0, 0, 0};
3384}
3385
3386/// Returns a vector of [2 x i64] where the lower element is the input
3387///    operand and the upper element is zero.
3388///
3389/// \headerfile <x86intrin.h>
3390///
3391/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction
3392/// in 64-bit mode.
3393///
3394/// \param __a
3395///    A 64-bit signed integer operand containing the value to be converted.
3396/// \returns A 128-bit vector of [2 x i64] containing the converted value.
3397static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtsi64_si128(long long __a) {
3398  return __extension__(__m128i)(__v2di){__a, 0};
3399}
3400
3401/// Moves the least significant 32 bits of a vector of [4 x i32] to a
3402///    32-bit signed integer value.
3403///
3404/// \headerfile <x86intrin.h>
3405///
3406/// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
3407///
3408/// \param __a
3409///    A vector of [4 x i32]. The least significant 32 bits are moved to the
3410///    destination.
3411/// \returns A 32-bit signed integer containing the moved value.
3412static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtsi128_si32(__m128i __a) {
3413  __v4si __b = (__v4si)__a;
3414  return __b[0];
3415}
3416
3417/// Moves the least significant 64 bits of a vector of [2 x i64] to a
3418///    64-bit signed integer value.
3419///
3420/// \headerfile <x86intrin.h>
3421///
3422/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
3423///
3424/// \param __a
3425///    A vector of [2 x i64]. The least significant 64 bits are moved to the
3426///    destination.
3427/// \returns A 64-bit signed integer containing the moved value.
3428static __inline__ long long __DEFAULT_FN_ATTRS _mm_cvtsi128_si64(__m128i __a) {
3429  return __a[0];
3430}
3431
3432/// Moves packed integer values from an aligned 128-bit memory location
3433///    to elements in a 128-bit integer vector.
3434///
3435/// \headerfile <x86intrin.h>
3436///
3437/// This intrinsic corresponds to the <c> VMOVDQA / MOVDQA </c> instruction.
3438///
3439/// \param __p
3440///    An aligned pointer to a memory location containing integer values.
3441/// \returns A 128-bit integer vector containing the moved values.
3442static __inline__ __m128i __DEFAULT_FN_ATTRS
3443_mm_load_si128(__m128i const *__p) {
3444  return *__p;
3445}
3446
3447/// Moves packed integer values from an unaligned 128-bit memory location
3448///    to elements in a 128-bit integer vector.
3449///
3450/// \headerfile <x86intrin.h>
3451///
3452/// This intrinsic corresponds to the <c> VMOVDQU / MOVDQU </c> instruction.
3453///
3454/// \param __p
3455///    A pointer to a memory location containing integer values.
3456/// \returns A 128-bit integer vector containing the moved values.
3457static __inline__ __m128i __DEFAULT_FN_ATTRS
3458_mm_loadu_si128(__m128i_u const *__p) {
3459  struct __loadu_si128 {
3460    __m128i_u __v;
3461  } __attribute__((__packed__, __may_alias__));
3462  return ((const struct __loadu_si128 *)__p)->__v;
3463}
3464
/// Returns a vector of [2 x i64] where the lower element is loaded from
///    the 64-bit memory location addressed by the operand, and the upper
///    element is zero.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
///
/// \param __p
///    A pointer to a memory location. The 64 bits stored at this address are
///    written to bits [63:0] of the destination. No alignment is assumed.
/// \returns A 128-bit vector of [2 x i64]. The lower order bits contain the
///    moved value. The higher order bits are cleared.
3477static __inline__ __m128i __DEFAULT_FN_ATTRS
3478_mm_loadl_epi64(__m128i_u const *__p) {
3479  struct __mm_loadl_epi64_struct {
3480    long long __u;
3481  } __attribute__((__packed__, __may_alias__));
3482  return __extension__(__m128i){
3483      ((const struct __mm_loadl_epi64_struct *)__p)->__u, 0};
3484}
3485
3486/// Generates a 128-bit vector of [4 x i32] with unspecified content.
3487///    This could be used as an argument to another intrinsic function where the
3488///    argument is required but the value is not actually used.
3489///
3490/// \headerfile <x86intrin.h>
3491///
3492/// This intrinsic has no corresponding instruction.
3493///
3494/// \returns A 128-bit vector of [4 x i32] with unspecified content.
3495static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_undefined_si128(void) {
3496  return (__m128i)__builtin_ia32_undef128();
3497}
3498
3499/// Initializes both 64-bit values in a 128-bit vector of [2 x i64] with
3500///    the specified 64-bit integer values.
3501///
3502/// \headerfile <x86intrin.h>
3503///
3504/// This intrinsic is a utility function and does not correspond to a specific
3505///    instruction.
3506///
3507/// \param __q1
3508///    A 64-bit integer value used to initialize the upper 64 bits of the
3509///    destination vector of [2 x i64].
3510/// \param __q0
3511///    A 64-bit integer value used to initialize the lower 64 bits of the
3512///    destination vector of [2 x i64].
3513/// \returns An initialized 128-bit vector of [2 x i64] containing the values
3514///    provided in the operands.
3515static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
3516_mm_set_epi64x(long long __q1, long long __q0) {
3517  return __extension__(__m128i)(__v2di){__q0, __q1};
3518}
3519
3520/// Initializes both 64-bit values in a 128-bit vector of [2 x i64] with
3521///    the specified 64-bit integer values.
3522///
3523/// \headerfile <x86intrin.h>
3524///
3525/// This intrinsic is a utility function and does not correspond to a specific
3526///    instruction.
3527///
3528/// \param __q1
3529///    A 64-bit integer value used to initialize the upper 64 bits of the
3530///    destination vector of [2 x i64].
3531/// \param __q0
3532///    A 64-bit integer value used to initialize the lower 64 bits of the
3533///    destination vector of [2 x i64].
3534/// \returns An initialized 128-bit vector of [2 x i64] containing the values
3535///    provided in the operands.
3536static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
3537_mm_set_epi64(__m64 __q1, __m64 __q0) {
3538  return _mm_set_epi64x((long long)__q1[0], (long long)__q0[0]);
3539}
3540
/// Initializes the 32-bit values in a 128-bit vector of [4 x i32] with
///    the specified 32-bit integer values.
///
///    The parameters are listed from the most significant element down to the
///    least significant one: \a __i3 fills the highest 32 bits of the result
///    and \a __i0 the lowest 32 bits.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic is a utility function and does not correspond to a specific
///    instruction.
///
/// \param __i3
///    A 32-bit integer value used to initialize bits [127:96] of the
///    destination vector.
/// \param __i2
///    A 32-bit integer value used to initialize bits [95:64] of the destination
///    vector.
/// \param __i1
///    A 32-bit integer value used to initialize bits [63:32] of the destination
///    vector.
/// \param __i0
///    A 32-bit integer value used to initialize bits [31:0] of the destination
///    vector.
/// \returns An initialized 128-bit vector of [4 x i32] containing the values
///    provided in the operands.
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_set_epi32(int __i3,
                                                                     int __i2,
                                                                     int __i1,
                                                                     int __i0) {
  /* The vector literal is written lowest element first, hence the reversed
   * operand order relative to the parameter list. */
  return __extension__(__m128i)(__v4si){__i0, __i1, __i2, __i3};
}
3569
/// Initializes the 16-bit values in a 128-bit vector of [8 x i16] with
///    the specified 16-bit integer values.
///
///    The parameters are listed from the most significant element down to the
///    least significant one: \a __w7 fills the highest 16 bits of the result
///    and \a __w0 the lowest 16 bits.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic is a utility function and does not correspond to a specific
///    instruction.
///
/// \param __w7
///    A 16-bit integer value used to initialize bits [127:112] of the
///    destination vector.
/// \param __w6
///    A 16-bit integer value used to initialize bits [111:96] of the
///    destination vector.
/// \param __w5
///    A 16-bit integer value used to initialize bits [95:80] of the destination
///    vector.
/// \param __w4
///    A 16-bit integer value used to initialize bits [79:64] of the destination
///    vector.
/// \param __w3
///    A 16-bit integer value used to initialize bits [63:48] of the destination
///    vector.
/// \param __w2
///    A 16-bit integer value used to initialize bits [47:32] of the destination
///    vector.
/// \param __w1
///    A 16-bit integer value used to initialize bits [31:16] of the destination
///    vector.
/// \param __w0
///    A 16-bit integer value used to initialize bits [15:0] of the destination
///    vector.
/// \returns An initialized 128-bit vector of [8 x i16] containing the values
///    provided in the operands.
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_set_epi16(short __w7, short __w6, short __w5, short __w4, short __w3,
              short __w2, short __w1, short __w0) {
  /* The vector literal is written lowest element first, hence the reversed
   * operand order relative to the parameter list. */
  return __extension__(__m128i)(__v8hi){__w0, __w1, __w2, __w3,
                                        __w4, __w5, __w6, __w7};
}
3610
/// Initializes the 8-bit values in a 128-bit vector of [16 x i8] with
///    the specified 8-bit integer values.
///
///    The parameters are listed from the most significant element down to the
///    least significant one: \a __b15 fills the highest 8 bits of the result
///    and \a __b0 the lowest 8 bits.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic is a utility function and does not correspond to a specific
///    instruction.
///
/// \param __b15
///    An 8-bit integer value used to initialize bits [127:120] of the
///    destination vector.
/// \param __b14
///    An 8-bit integer value used to initialize bits [119:112] of the
///    destination vector.
/// \param __b13
///    An 8-bit integer value used to initialize bits [111:104] of the
///    destination vector.
/// \param __b12
///    An 8-bit integer value used to initialize bits [103:96] of the
///    destination vector.
/// \param __b11
///    An 8-bit integer value used to initialize bits [95:88] of the
///    destination vector.
/// \param __b10
///    An 8-bit integer value used to initialize bits [87:80] of the
///    destination vector.
/// \param __b9
///    An 8-bit integer value used to initialize bits [79:72] of the
///    destination vector.
/// \param __b8
///    An 8-bit integer value used to initialize bits [71:64] of the
///    destination vector.
/// \param __b7
///    An 8-bit integer value used to initialize bits [63:56] of the
///    destination vector.
/// \param __b6
///    An 8-bit integer value used to initialize bits [55:48] of the
///    destination vector.
/// \param __b5
///    An 8-bit integer value used to initialize bits [47:40] of the
///    destination vector.
/// \param __b4
///    An 8-bit integer value used to initialize bits [39:32] of the
///    destination vector.
/// \param __b3
///    An 8-bit integer value used to initialize bits [31:24] of the
///    destination vector.
/// \param __b2
///    An 8-bit integer value used to initialize bits [23:16] of the
///    destination vector.
/// \param __b1
///    An 8-bit integer value used to initialize bits [15:8] of the
///    destination vector.
/// \param __b0
///    An 8-bit integer value used to initialize bits [7:0] of the
///    destination vector.
/// \returns An initialized 128-bit vector of [16 x i8] containing the values
///    provided in the operands.
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_set_epi8(char __b15, char __b14, char __b13, char __b12, char __b11,
             char __b10, char __b9, char __b8, char __b7, char __b6, char __b5,
             char __b4, char __b3, char __b2, char __b1, char __b0) {
  /* The vector literal is written lowest element first, hence the reversed
   * operand order relative to the parameter list. */
  return __extension__(__m128i)(__v16qi){
      __b0, __b1, __b2,  __b3,  __b4,  __b5,  __b6,  __b7,
      __b8, __b9, __b10, __b11, __b12, __b13, __b14, __b15};
}
3661
3662/// Initializes both values in a 128-bit integer vector with the
3663///    specified 64-bit integer value.
3664///
3665/// \headerfile <x86intrin.h>
3666///
3667/// This intrinsic is a utility function and does not correspond to a specific
3668///    instruction.
3669///
3670/// \param __q
3671///    Integer value used to initialize the elements of the destination integer
3672///    vector.
3673/// \returns An initialized 128-bit integer vector of [2 x i64] with both
3674///    elements containing the value provided in the operand.
3675static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
3676_mm_set1_epi64x(long long __q) {
3677  return _mm_set_epi64x(__q, __q);
3678}
3679
3680/// Initializes both values in a 128-bit vector of [2 x i64] with the
3681///    specified 64-bit value.
3682///
3683/// \headerfile <x86intrin.h>
3684///
3685/// This intrinsic is a utility function and does not correspond to a specific
3686///    instruction.
3687///
3688/// \param __q
3689///    A 64-bit value used to initialize the elements of the destination integer
3690///    vector.
3691/// \returns An initialized 128-bit vector of [2 x i64] with all elements
3692///    containing the value provided in the operand.
3693static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
3694_mm_set1_epi64(__m64 __q) {
3695  return _mm_set_epi64(__q, __q);
3696}
3697
3698/// Initializes all values in a 128-bit vector of [4 x i32] with the
3699///    specified 32-bit value.
3700///
3701/// \headerfile <x86intrin.h>
3702///
3703/// This intrinsic is a utility function and does not correspond to a specific
3704///    instruction.
3705///
3706/// \param __i
3707///    A 32-bit value used to initialize the elements of the destination integer
3708///    vector.
3709/// \returns An initialized 128-bit vector of [4 x i32] with all elements
3710///    containing the value provided in the operand.
3711static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_set1_epi32(int __i) {
3712  return _mm_set_epi32(__i, __i, __i, __i);
3713}
3714
3715/// Initializes all values in a 128-bit vector of [8 x i16] with the
3716///    specified 16-bit value.
3717///
3718/// \headerfile <x86intrin.h>
3719///
3720/// This intrinsic is a utility function and does not correspond to a specific
3721///    instruction.
3722///
3723/// \param __w
3724///    A 16-bit value used to initialize the elements of the destination integer
3725///    vector.
3726/// \returns An initialized 128-bit vector of [8 x i16] with all elements
3727///    containing the value provided in the operand.
3728static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
3729_mm_set1_epi16(short __w) {
3730  return _mm_set_epi16(__w, __w, __w, __w, __w, __w, __w, __w);
3731}
3732
3733/// Initializes all values in a 128-bit vector of [16 x i8] with the
3734///    specified 8-bit value.
3735///
3736/// \headerfile <x86intrin.h>
3737///
3738/// This intrinsic is a utility function and does not correspond to a specific
3739///    instruction.
3740///
3741/// \param __b
3742///    An 8-bit value used to initialize the elements of the destination integer
3743///    vector.
3744/// \returns An initialized 128-bit vector of [16 x i8] with all elements
3745///    containing the value provided in the operand.
3746static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_set1_epi8(char __b) {
3747  return _mm_set_epi8(__b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b,
3748                      __b, __b, __b, __b, __b);
3749}
3750
3751/// Constructs a 128-bit integer vector, initialized in reverse order
3752///     with the specified 64-bit integral values.
3753///
3754/// \headerfile <x86intrin.h>
3755///
3756/// This intrinsic does not correspond to a specific instruction.
3757///
3758/// \param __q0
3759///    A 64-bit integral value used to initialize the lower 64 bits of the
3760///    result.
3761/// \param __q1
3762///    A 64-bit integral value used to initialize the upper 64 bits of the
3763///    result.
3764/// \returns An initialized 128-bit integer vector.
3765static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
3766_mm_setr_epi64(__m64 __q0, __m64 __q1) {
3767  return _mm_set_epi64(__q1, __q0);
3768}
3769
3770/// Constructs a 128-bit integer vector, initialized in reverse order
3771///     with the specified 32-bit integral values.
3772///
3773/// \headerfile <x86intrin.h>
3774///
3775/// This intrinsic is a utility function and does not correspond to a specific
3776///    instruction.
3777///
3778/// \param __i0
3779///    A 32-bit integral value used to initialize bits [31:0] of the result.
3780/// \param __i1
3781///    A 32-bit integral value used to initialize bits [63:32] of the result.
3782/// \param __i2
3783///    A 32-bit integral value used to initialize bits [95:64] of the result.
3784/// \param __i3
3785///    A 32-bit integral value used to initialize bits [127:96] of the result.
3786/// \returns An initialized 128-bit integer vector.
3787static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
3788_mm_setr_epi32(int __i0, int __i1, int __i2, int __i3) {
3789  return _mm_set_epi32(__i3, __i2, __i1, __i0);
3790}
3791
3792/// Constructs a 128-bit integer vector, initialized in reverse order
3793///     with the specified 16-bit integral values.
3794///
3795/// \headerfile <x86intrin.h>
3796///
3797/// This intrinsic is a utility function and does not correspond to a specific
3798///    instruction.
3799///
3800/// \param __w0
3801///    A 16-bit integral value used to initialize bits [15:0] of the result.
3802/// \param __w1
3803///    A 16-bit integral value used to initialize bits [31:16] of the result.
3804/// \param __w2
3805///    A 16-bit integral value used to initialize bits [47:32] of the result.
3806/// \param __w3
3807///    A 16-bit integral value used to initialize bits [63:48] of the result.
3808/// \param __w4
3809///    A 16-bit integral value used to initialize bits [79:64] of the result.
3810/// \param __w5
3811///    A 16-bit integral value used to initialize bits [95:80] of the result.
3812/// \param __w6
3813///    A 16-bit integral value used to initialize bits [111:96] of the result.
3814/// \param __w7
3815///    A 16-bit integral value used to initialize bits [127:112] of the result.
3816/// \returns An initialized 128-bit integer vector.
3817static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
3818_mm_setr_epi16(short __w0, short __w1, short __w2, short __w3, short __w4,
3819               short __w5, short __w6, short __w7) {
3820  return _mm_set_epi16(__w7, __w6, __w5, __w4, __w3, __w2, __w1, __w0);
3821}
3822
3823/// Constructs a 128-bit integer vector, initialized in reverse order
3824///     with the specified 8-bit integral values.
3825///
3826/// \headerfile <x86intrin.h>
3827///
3828/// This intrinsic is a utility function and does not correspond to a specific
3829///    instruction.
3830///
3831/// \param __b0
3832///    An 8-bit integral value used to initialize bits [7:0] of the result.
3833/// \param __b1
3834///    An 8-bit integral value used to initialize bits [15:8] of the result.
3835/// \param __b2
3836///    An 8-bit integral value used to initialize bits [23:16] of the result.
3837/// \param __b3
3838///    An 8-bit integral value used to initialize bits [31:24] of the result.
3839/// \param __b4
3840///    An 8-bit integral value used to initialize bits [39:32] of the result.
3841/// \param __b5
3842///    An 8-bit integral value used to initialize bits [47:40] of the result.
3843/// \param __b6
3844///    An 8-bit integral value used to initialize bits [55:48] of the result.
3845/// \param __b7
3846///    An 8-bit integral value used to initialize bits [63:56] of the result.
3847/// \param __b8
3848///    An 8-bit integral value used to initialize bits [71:64] of the result.
3849/// \param __b9
3850///    An 8-bit integral value used to initialize bits [79:72] of the result.
3851/// \param __b10
3852///    An 8-bit integral value used to initialize bits [87:80] of the result.
3853/// \param __b11
3854///    An 8-bit integral value used to initialize bits [95:88] of the result.
3855/// \param __b12
3856///    An 8-bit integral value used to initialize bits [103:96] of the result.
3857/// \param __b13
3858///    An 8-bit integral value used to initialize bits [111:104] of the result.
3859/// \param __b14
3860///    An 8-bit integral value used to initialize bits [119:112] of the result.
3861/// \param __b15
3862///    An 8-bit integral value used to initialize bits [127:120] of the result.
3863/// \returns An initialized 128-bit integer vector.
3864static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
3865_mm_setr_epi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5,
3866              char __b6, char __b7, char __b8, char __b9, char __b10,
3867              char __b11, char __b12, char __b13, char __b14, char __b15) {
3868  return _mm_set_epi8(__b15, __b14, __b13, __b12, __b11, __b10, __b9, __b8,
3869                      __b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
3870}
3871
3872/// Creates a 128-bit integer vector initialized to zero.
3873///
3874/// \headerfile <x86intrin.h>
3875///
3876/// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
3877///
3878/// \returns An initialized 128-bit integer vector with all elements set to
3879///    zero.
3880static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_setzero_si128(void) {
3881  return __extension__(__m128i)(__v2di){0LL, 0LL};
3882}
3883
3884/// Stores a 128-bit integer vector to a memory location aligned on a
3885///    128-bit boundary.
3886///
3887/// \headerfile <x86intrin.h>
3888///
3889/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
3890///
3891/// \param __p
3892///    A pointer to an aligned memory location that will receive the integer
3893///    values.
3894/// \param __b
3895///    A 128-bit integer vector containing the values to be moved.
3896static __inline__ void __DEFAULT_FN_ATTRS _mm_store_si128(__m128i *__p,
3897                                                          __m128i __b) {
3898  *__p = __b;
3899}
3900
3901/// Stores a 128-bit integer vector to an unaligned memory location.
3902///
3903/// \headerfile <x86intrin.h>
3904///
3905/// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
3906///
3907/// \param __p
3908///    A pointer to a memory location that will receive the integer values.
3909/// \param __b
3910///    A 128-bit integer vector containing the values to be moved.
3911static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si128(__m128i_u *__p,
3912                                                           __m128i __b) {
3913  struct __storeu_si128 {
3914    __m128i_u __v;
3915  } __attribute__((__packed__, __may_alias__));
3916  ((struct __storeu_si128 *)__p)->__v = __b;
3917}
3918
3919/// Stores a 64-bit integer value from the low element of a 128-bit integer
3920///    vector.
3921///
3922/// \headerfile <x86intrin.h>
3923///
3924/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
3925///
3926/// \param __p
3927///    A pointer to a 64-bit memory location. The address of the memory
3928///    location does not have to be aligned.
3929/// \param __b
3930///    A 128-bit integer vector containing the value to be stored.
3931static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si64(void *__p,
3932                                                          __m128i __b) {
3933  struct __storeu_si64 {
3934    long long __v;
3935  } __attribute__((__packed__, __may_alias__));
3936  ((struct __storeu_si64 *)__p)->__v = ((__v2di)__b)[0];
3937}
3938
3939/// Stores a 32-bit integer value from the low element of a 128-bit integer
3940///    vector.
3941///
3942/// \headerfile <x86intrin.h>
3943///
3944/// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
3945///
3946/// \param __p
3947///    A pointer to a 32-bit memory location. The address of the memory
3948///    location does not have to be aligned.
3949/// \param __b
3950///    A 128-bit integer vector containing the value to be stored.
3951static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si32(void *__p,
3952                                                          __m128i __b) {
3953  struct __storeu_si32 {
3954    int __v;
3955  } __attribute__((__packed__, __may_alias__));
3956  ((struct __storeu_si32 *)__p)->__v = ((__v4si)__b)[0];
3957}
3958
3959/// Stores a 16-bit integer value from the low element of a 128-bit integer
3960///    vector.
3961///
3962/// \headerfile <x86intrin.h>
3963///
3964/// This intrinsic does not correspond to a specific instruction.
3965///
3966/// \param __p
3967///    A pointer to a 16-bit memory location. The address of the memory
3968///    location does not have to be aligned.
3969/// \param __b
3970///    A 128-bit integer vector containing the value to be stored.
3971static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si16(void *__p,
3972                                                          __m128i __b) {
3973  struct __storeu_si16 {
3974    short __v;
3975  } __attribute__((__packed__, __may_alias__));
3976  ((struct __storeu_si16 *)__p)->__v = ((__v8hi)__b)[0];
3977}
3978
3979/// Moves bytes selected by the mask from the first operand to the
3980///    specified unaligned memory location. When a mask bit is 1, the
3981///    corresponding byte is written, otherwise it is not written.
3982///
3983///    To minimize caching, the data is flagged as non-temporal (unlikely to be
3984///    used again soon). Exception and trap behavior for elements not selected
3985///    for storage to memory are implementation dependent.
3986///
3987/// \headerfile <x86intrin.h>
3988///
3989/// This intrinsic corresponds to the <c> VMASKMOVDQU / MASKMOVDQU </c>
3990///   instruction.
3991///
3992/// \param __d
3993///    A 128-bit integer vector containing the values to be moved.
3994/// \param __n
3995///    A 128-bit integer vector containing the mask. The most significant bit of
3996///    each byte represents the mask bits.
3997/// \param __p
3998///    A pointer to an unaligned 128-bit memory location where the specified
3999///    values are moved.
4000static __inline__ void __DEFAULT_FN_ATTRS _mm_maskmoveu_si128(__m128i __d,
4001                                                              __m128i __n,
4002                                                              char *__p) {
4003  __builtin_ia32_maskmovdqu((__v16qi)__d, (__v16qi)__n, __p);
4004}
4005
4006/// Stores the lower 64 bits of a 128-bit integer vector of [2 x i64] to
4007///    a memory location.
4008///
4009/// \headerfile <x86intrin.h>
4010///
4011/// This intrinsic corresponds to the <c> VMOVLPS / MOVLPS </c> instruction.
4012///
4013/// \param __p
4014///    A pointer to a 64-bit memory location that will receive the lower 64 bits
4015///    of the integer vector parameter.
4016/// \param __a
4017///    A 128-bit integer vector of [2 x i64]. The lower 64 bits contain the
4018///    value to be stored.
4019static __inline__ void __DEFAULT_FN_ATTRS _mm_storel_epi64(__m128i_u *__p,
4020                                                           __m128i __a) {
4021  struct __mm_storel_epi64_struct {
4022    long long __u;
4023  } __attribute__((__packed__, __may_alias__));
4024  ((struct __mm_storel_epi64_struct *)__p)->__u = __a[0];
4025}
4026
4027/// Stores a 128-bit floating point vector of [2 x double] to a 128-bit
4028///    aligned memory location.
4029///
4030///    To minimize caching, the data is flagged as non-temporal (unlikely to be
4031///    used again soon).
4032///
4033/// \headerfile <x86intrin.h>
4034///
4035/// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction.
4036///
4037/// \param __p
4038///    A pointer to the 128-bit aligned memory location used to store the value.
4039/// \param __a
4040///    A vector of [2 x double] containing the 64-bit values to be stored.
4041static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_pd(void *__p,
4042                                                        __m128d __a) {
4043  __builtin_nontemporal_store((__v2df)__a, (__v2df *)__p);
4044}
4045
4046/// Stores a 128-bit integer vector to a 128-bit aligned memory location.
4047///
4048///    To minimize caching, the data is flagged as non-temporal (unlikely to be
4049///    used again soon).
4050///
4051/// \headerfile <x86intrin.h>
4052///
4053/// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction.
4054///
4055/// \param __p
4056///    A pointer to the 128-bit aligned memory location used to store the value.
4057/// \param __a
4058///    A 128-bit integer vector containing the values to be stored.
4059static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_si128(void *__p,
4060                                                           __m128i __a) {
4061  __builtin_nontemporal_store((__v2di)__a, (__v2di *)__p);
4062}
4063
/// Writes a 32-bit integer value to the specified memory location using a
///    non-temporal store.
///
///    To minimize caching, the data is flagged as non-temporal (unlikely to be
///    used again soon).
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> MOVNTI </c> instruction.
///
/// \param __p
///    A pointer to the 32-bit memory location used to store the value.
/// \param __a
///    A 32-bit integer containing the value to be stored.
static __inline__ void
    __attribute__((__always_inline__, __nodebug__, __target__("sse2")))
    _mm_stream_si32(void *__p, int __a) {
  int *__dst = (int *)__p;
  __builtin_ia32_movnti(__dst, __a);
}
4082
#ifdef __x86_64__
/// Writes a 64-bit integer value to the specified memory location using a
///    non-temporal store.
///
///    To minimize caching, the data is flagged as non-temporal (unlikely to be
///    used again soon).
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> MOVNTIQ </c> instruction.
///
/// \param __p
///    A pointer to the 64-bit memory location used to store the value.
/// \param __a
///    A 64-bit integer containing the value to be stored.
static __inline__ void
    __attribute__((__always_inline__, __nodebug__, __target__("sse2")))
    _mm_stream_si64(void *__p, long long __a) {
  long long *__dst = (long long *)__p;
  __builtin_ia32_movnti64(__dst, __a);
}
#endif
4103
#ifdef __cplusplus
extern "C" {
#endif

/// Flushes and invalidates the cache line containing \a __p from all
///    caches in the coherency domain.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> CLFLUSH </c> instruction.
///
/// \param __p
///    A pointer to the memory location used to identify the cache line to be
///    flushed.
void _mm_clflush(const void *__p);

/// Forces strong memory ordering (serialization) between load
///    instructions that precede this instruction and load instructions that
///    follow it, ensuring the system completes all previous loads before
///    executing subsequent loads.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> LFENCE </c> instruction.
///
void _mm_lfence(void);

/// Forces strong memory ordering (serialization) between load and store
///    instructions that precede this instruction and load and store
///    instructions that follow it, ensuring that the system completes all
///    previous memory accesses before executing subsequent memory accesses.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> MFENCE </c> instruction.
///
void _mm_mfence(void);

#ifdef __cplusplus
} // extern "C"
#endif
4145
4146/// Converts, with saturation, 16-bit signed integers from both 128-bit integer
4147///    vector operands into 8-bit signed integers, and packs the results into
4148///    the destination.
4149///
4150///    Positive values greater than 0x7F are saturated to 0x7F. Negative values
4151///    less than 0x80 are saturated to 0x80.
4152///
4153/// \headerfile <x86intrin.h>
4154///
4155/// This intrinsic corresponds to the <c> VPACKSSWB / PACKSSWB </c> instruction.
4156///
4157/// \param __a
4158///   A 128-bit integer vector of [8 x i16]. The converted [8 x i8] values are
4159///   written to the lower 64 bits of the result.
4160/// \param __b
4161///   A 128-bit integer vector of [8 x i16]. The converted [8 x i8] values are
4162///   written to the higher 64 bits of the result.
4163/// \returns A 128-bit vector of [16 x i8] containing the converted values.
4164static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi16(__m128i __a,
4165                                                             __m128i __b) {
4166  return (__m128i)__builtin_ia32_packsswb128((__v8hi)__a, (__v8hi)__b);
4167}
4168
4169/// Converts, with saturation, 32-bit signed integers from both 128-bit integer
4170///    vector operands into 16-bit signed integers, and packs the results into
4171///    the destination.
4172///
4173///    Positive values greater than 0x7FFF are saturated to 0x7FFF. Negative
4174///    values less than 0x8000 are saturated to 0x8000.
4175///
4176/// \headerfile <x86intrin.h>
4177///
4178/// This intrinsic corresponds to the <c> VPACKSSDW / PACKSSDW </c> instruction.
4179///
4180/// \param __a
4181///    A 128-bit integer vector of [4 x i32]. The converted [4 x i16] values
4182///    are written to the lower 64 bits of the result.
4183/// \param __b
4184///    A 128-bit integer vector of [4 x i32]. The converted [4 x i16] values
4185///    are written to the higher 64 bits of the result.
4186/// \returns A 128-bit vector of [8 x i16] containing the converted values.
4187static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi32(__m128i __a,
4188                                                             __m128i __b) {
4189  return (__m128i)__builtin_ia32_packssdw128((__v4si)__a, (__v4si)__b);
4190}
4191
4192/// Converts, with saturation, 16-bit signed integers from both 128-bit integer
4193///    vector operands into 8-bit unsigned integers, and packs the results into
4194///    the destination.
4195///
4196///    Values greater than 0xFF are saturated to 0xFF. Values less than 0x00
4197///    are saturated to 0x00.
4198///
4199/// \headerfile <x86intrin.h>
4200///
4201/// This intrinsic corresponds to the <c> VPACKUSWB / PACKUSWB </c> instruction.
4202///
4203/// \param __a
4204///    A 128-bit integer vector of [8 x i16]. The converted [8 x i8] values are
4205///    written to the lower 64 bits of the result.
4206/// \param __b
4207///    A 128-bit integer vector of [8 x i16]. The converted [8 x i8] values are
4208///    written to the higher 64 bits of the result.
4209/// \returns A 128-bit vector of [16 x i8] containing the converted values.
4210static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packus_epi16(__m128i __a,
4211                                                              __m128i __b) {
4212  return (__m128i)__builtin_ia32_packuswb128((__v8hi)__a, (__v8hi)__b);
4213}
4214
/// Extracts 16 bits from a 128-bit integer vector of [8 x i16], using
///    the immediate-value parameter as a selector.
///
///    The selected element is converted to unsigned short and then to int, so
///    the upper bits of the returned integer are zero.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_extract_epi16(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPEXTRW / PEXTRW </c> instruction.
///
/// \param a
///    A 128-bit integer vector.
/// \param imm
///    An immediate value. Bits [2:0] selects values from \a a to be assigned
///    to bits[15:0] of the result. \n
///    000: assign values from bits [15:0] of \a a. \n
///    001: assign values from bits [31:16] of \a a. \n
///    010: assign values from bits [47:32] of \a a. \n
///    011: assign values from bits [63:48] of \a a. \n
///    100: assign values from bits [79:64] of \a a. \n
///    101: assign values from bits [95:80] of \a a. \n
///    110: assign values from bits [111:96] of \a a. \n
///    111: assign values from bits [127:112] of \a a.
/// \returns An integer, whose lower 16 bits are selected from the 128-bit
///    integer vector parameter and the remaining bits are assigned zeros.
#define _mm_extract_epi16(a, imm)                                              \
  ((int)(unsigned short)__builtin_ia32_vec_ext_v8hi((__v8hi)(__m128i)(a),      \
                                                    (int)(imm)))
4244
/// Constructs a 128-bit integer vector by first making a copy of the
///    128-bit integer vector parameter, and then inserting the lower 16 bits
///    of an integer parameter into the element selected by the immediate-value
///    parameter.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_insert_epi16(__m128i a, int b, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPINSRW / PINSRW </c> instruction.
///
/// \param a
///    A 128-bit integer vector of [8 x i16]. This vector is copied to the
///    result and then one of the eight elements in the result is replaced by
///    the lower 16 bits of \a b.
/// \param b
///    An integer. The lower 16 bits of this parameter are written to the
///    element of the result selected by \a imm.
/// \param imm
///    An immediate value selecting which of the eight 16-bit elements of the
///    result is replaced by the lower 16 bits of \a b. It is an element index,
///    not a bit offset: element 0 is bits [15:0] and element 7 is bits
///    [127:112].
/// \returns A 128-bit integer vector containing the constructed values.
#define _mm_insert_epi16(a, b, imm)                                            \
  ((__m128i)__builtin_ia32_vec_set_v8hi((__v8hi)(__m128i)(a), (int)(b),        \
                                        (int)(imm)))
4272
4273/// Copies the values of the most significant bits from each 8-bit
4274///    element in a 128-bit integer vector of [16 x i8] to create a 16-bit mask
4275///    value, zero-extends the value, and writes it to the destination.
4276///
4277/// \headerfile <x86intrin.h>
4278///
4279/// This intrinsic corresponds to the <c> VPMOVMSKB / PMOVMSKB </c> instruction.
4280///
4281/// \param __a
4282///    A 128-bit integer vector containing the values with bits to be extracted.
4283/// \returns The most significant bits from each 8-bit element in \a __a,
4284///    written to bits [15:0]. The other bits are assigned zeros.
4285static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_epi8(__m128i __a) {
4286  return __builtin_ia32_pmovmskb128((__v16qi)__a);
4287}
4288
/// Constructs a 128-bit integer vector by shuffling four 32-bit
///    elements of a 128-bit integer vector parameter, using the immediate-value
///    parameter as a specifier.
///
///    Each 2-bit field of the immediate independently picks the source element
///    for one 32-bit element of the result, so the same source element may be
///    copied to several destinations.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_shuffle_epi32(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPSHUFD / PSHUFD </c> instruction.
///
/// \param a
///    A 128-bit integer vector containing the values to be copied.
/// \param imm
///    An immediate value containing an 8-bit value specifying which elements to
///    copy from \a a. The destinations within the 128-bit destination are
///    assigned values as follows: \n
///    Bits [1:0] are used to assign values to bits [31:0] of the result. \n
///    Bits [3:2] are used to assign values to bits [63:32] of the result. \n
///    Bits [5:4] are used to assign values to bits [95:64] of the result. \n
///    Bits [7:6] are used to assign values to bits [127:96] of the result. \n
///    Bit value assignments: \n
///    00: assign values from bits [31:0] of \a a. \n
///    01: assign values from bits [63:32] of \a a. \n
///    10: assign values from bits [95:64] of \a a. \n
///    11: assign values from bits [127:96] of \a a. \n
///    Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
///    <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
///    <c>[b6, b4, b2, b0]</c>.
/// \returns A 128-bit integer vector containing the shuffled values.
#define _mm_shuffle_epi32(a, imm)                                              \
  ((__m128i)__builtin_ia32_pshufd((__v4si)(__m128i)(a), (int)(imm)))
4322
/// Constructs a 128-bit integer vector by shuffling four lower 16-bit
///    elements of a 128-bit integer vector of [8 x i16], using the immediate
///    value parameter as a specifier.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_shufflelo_epi16(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPSHUFLW / PSHUFLW </c> instruction.
///
/// \param a
///    A 128-bit integer vector of [8 x i16]. Bits [127:64] are copied to bits
///    [127:64] of the result.
/// \param imm
///    An 8-bit immediate value specifying which elements to copy from \a a. \n
///    Bits [1:0] are used to assign values to bits [15:0] of the result. \n
///    Bits [3:2] are used to assign values to bits [31:16] of the result. \n
///    Bits [5:4] are used to assign values to bits [47:32] of the result. \n
///    Bits [7:6] are used to assign values to bits [63:48] of the result. \n
///    Bit value assignments: \n
///    00: assign values from bits [15:0] of \a a. \n
///    01: assign values from bits [31:16] of \a a. \n
///    10: assign values from bits [47:32] of \a a. \n
///    11: assign values from bits [63:48] of \a a. \n
///    Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
///    <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
///    <c>[b6, b4, b2, b0]</c>.
/// \returns A 128-bit integer vector containing the shuffled values.
#define _mm_shufflelo_epi16(a, imm)                                            \
  ((__m128i)__builtin_ia32_pshuflw((__v8hi)(__m128i)(a), (int)(imm)))
4355
/// Constructs a 128-bit integer vector by shuffling four upper 16-bit
///    elements of a 128-bit integer vector of [8 x i16], using the immediate
///    value parameter as a specifier.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_shufflehi_epi16(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPSHUFHW / PSHUFHW </c> instruction.
///
/// \param a
///    A 128-bit integer vector of [8 x i16]. Bits [63:0] are copied to bits
///    [63:0] of the result.
/// \param imm
///    An 8-bit immediate value specifying which elements to copy from \a a. \n
///    Bits [1:0] are used to assign values to bits [79:64] of the result. \n
///    Bits [3:2] are used to assign values to bits [95:80] of the result. \n
///    Bits [5:4] are used to assign values to bits [111:96] of the result. \n
///    Bits [7:6] are used to assign values to bits [127:112] of the result. \n
///    Bit value assignments: \n
///    00: assign values from bits [79:64] of \a a. \n
///    01: assign values from bits [95:80] of \a a. \n
///    10: assign values from bits [111:96] of \a a. \n
///    11: assign values from bits [127:112] of \a a. \n
///    Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
///    <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
///    <c>[b6, b4, b2, b0]</c>.
/// \returns A 128-bit integer vector containing the shuffled values.
#define _mm_shufflehi_epi16(a, imm)                                            \
  ((__m128i)__builtin_ia32_pshufhw((__v8hi)(__m128i)(a), (int)(imm)))
4388
4389/// Unpacks the high-order (index 8-15) values from two 128-bit vectors
4390///    of [16 x i8] and interleaves them into a 128-bit vector of [16 x i8].
4391///
4392/// \headerfile <x86intrin.h>
4393///
4394/// This intrinsic corresponds to the <c> VPUNPCKHBW / PUNPCKHBW </c>
4395///   instruction.
4396///
4397/// \param __a
4398///    A 128-bit vector of [16 x i8].
4399///    Bits [71:64] are written to bits [7:0] of the result. \n
4400///    Bits [79:72] are written to bits [23:16] of the result. \n
4401///    Bits [87:80] are written to bits [39:32] of the result. \n
4402///    Bits [95:88] are written to bits [55:48] of the result. \n
4403///    Bits [103:96] are written to bits [71:64] of the result. \n
4404///    Bits [111:104] are written to bits [87:80] of the result. \n
4405///    Bits [119:112] are written to bits [103:96] of the result. \n
4406///    Bits [127:120] are written to bits [119:112] of the result.
4407/// \param __b
4408///    A 128-bit vector of [16 x i8]. \n
4409///    Bits [71:64] are written to bits [15:8] of the result. \n
4410///    Bits [79:72] are written to bits [31:24] of the result. \n
4411///    Bits [87:80] are written to bits [47:40] of the result. \n
4412///    Bits [95:88] are written to bits [63:56] of the result. \n
4413///    Bits [103:96] are written to bits [79:72] of the result. \n
4414///    Bits [111:104] are written to bits [95:88] of the result. \n
4415///    Bits [119:112] are written to bits [111:104] of the result. \n
4416///    Bits [127:120] are written to bits [127:120] of the result.
4417/// \returns A 128-bit vector of [16 x i8] containing the interleaved values.
4418static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi8(__m128i __a,
4419                                                               __m128i __b) {
4420  return (__m128i)__builtin_shufflevector(
4421      (__v16qi)__a, (__v16qi)__b, 8, 16 + 8, 9, 16 + 9, 10, 16 + 10, 11,
4422      16 + 11, 12, 16 + 12, 13, 16 + 13, 14, 16 + 14, 15, 16 + 15);
4423}
4424
4425/// Unpacks the high-order (index 4-7) values from two 128-bit vectors of
4426///    [8 x i16] and interleaves them into a 128-bit vector of [8 x i16].
4427///
4428/// \headerfile <x86intrin.h>
4429///
4430/// This intrinsic corresponds to the <c> VPUNPCKHWD / PUNPCKHWD </c>
4431///   instruction.
4432///
4433/// \param __a
4434///    A 128-bit vector of [8 x i16].
4435///    Bits [79:64] are written to bits [15:0] of the result. \n
4436///    Bits [95:80] are written to bits [47:32] of the result. \n
4437///    Bits [111:96] are written to bits [79:64] of the result. \n
4438///    Bits [127:112] are written to bits [111:96] of the result.
4439/// \param __b
4440///    A 128-bit vector of [8 x i16].
4441///    Bits [79:64] are written to bits [31:16] of the result. \n
4442///    Bits [95:80] are written to bits [63:48] of the result. \n
4443///    Bits [111:96] are written to bits [95:80] of the result. \n
4444///    Bits [127:112] are written to bits [127:112] of the result.
4445/// \returns A 128-bit vector of [8 x i16] containing the interleaved values.
4446static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi16(__m128i __a,
4447                                                                __m128i __b) {
4448  return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 4, 8 + 4, 5,
4449                                          8 + 5, 6, 8 + 6, 7, 8 + 7);
4450}
4451
4452/// Unpacks the high-order (index 2,3) values from two 128-bit vectors of
4453///    [4 x i32] and interleaves them into a 128-bit vector of [4 x i32].
4454///
4455/// \headerfile <x86intrin.h>
4456///
4457/// This intrinsic corresponds to the <c> VPUNPCKHDQ / PUNPCKHDQ </c>
4458///   instruction.
4459///
4460/// \param __a
4461///    A 128-bit vector of [4 x i32]. \n
4462///    Bits [95:64] are written to bits [31:0] of the destination. \n
4463///    Bits [127:96] are written to bits [95:64] of the destination.
4464/// \param __b
4465///    A 128-bit vector of [4 x i32]. \n
4466///    Bits [95:64] are written to bits [64:32] of the destination. \n
4467///    Bits [127:96] are written to bits [127:96] of the destination.
4468/// \returns A 128-bit vector of [4 x i32] containing the interleaved values.
4469static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi32(__m128i __a,
4470                                                                __m128i __b) {
4471  return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 2, 4 + 2, 3,
4472                                          4 + 3);
4473}
4474
4475/// Unpacks the high-order 64-bit elements from two 128-bit vectors of
4476///    [2 x i64] and interleaves them into a 128-bit vector of [2 x i64].
4477///
4478/// \headerfile <x86intrin.h>
4479///
4480/// This intrinsic corresponds to the <c> VPUNPCKHQDQ / PUNPCKHQDQ </c>
4481///   instruction.
4482///
4483/// \param __a
4484///    A 128-bit vector of [2 x i64]. \n
4485///    Bits [127:64] are written to bits [63:0] of the destination.
4486/// \param __b
4487///    A 128-bit vector of [2 x i64]. \n
4488///    Bits [127:64] are written to bits [127:64] of the destination.
4489/// \returns A 128-bit vector of [2 x i64] containing the interleaved values.
4490static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi64(__m128i __a,
4491                                                                __m128i __b) {
4492  return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 1, 2 + 1);
4493}
4494
4495/// Unpacks the low-order (index 0-7) values from two 128-bit vectors of
4496///    [16 x i8] and interleaves them into a 128-bit vector of [16 x i8].
4497///
4498/// \headerfile <x86intrin.h>
4499///
4500/// This intrinsic corresponds to the <c> VPUNPCKLBW / PUNPCKLBW </c>
4501///   instruction.
4502///
4503/// \param __a
4504///    A 128-bit vector of [16 x i8]. \n
4505///    Bits [7:0] are written to bits [7:0] of the result. \n
4506///    Bits [15:8] are written to bits [23:16] of the result. \n
4507///    Bits [23:16] are written to bits [39:32] of the result. \n
4508///    Bits [31:24] are written to bits [55:48] of the result. \n
4509///    Bits [39:32] are written to bits [71:64] of the result. \n
4510///    Bits [47:40] are written to bits [87:80] of the result. \n
4511///    Bits [55:48] are written to bits [103:96] of the result. \n
4512///    Bits [63:56] are written to bits [119:112] of the result.
4513/// \param __b
4514///    A 128-bit vector of [16 x i8].
4515///    Bits [7:0] are written to bits [15:8] of the result. \n
4516///    Bits [15:8] are written to bits [31:24] of the result. \n
4517///    Bits [23:16] are written to bits [47:40] of the result. \n
4518///    Bits [31:24] are written to bits [63:56] of the result. \n
4519///    Bits [39:32] are written to bits [79:72] of the result. \n
4520///    Bits [47:40] are written to bits [95:88] of the result. \n
4521///    Bits [55:48] are written to bits [111:104] of the result. \n
4522///    Bits [63:56] are written to bits [127:120] of the result.
4523/// \returns A 128-bit vector of [16 x i8] containing the interleaved values.
4524static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi8(__m128i __a,
4525                                                               __m128i __b) {
4526  return (__m128i)__builtin_shufflevector(
4527      (__v16qi)__a, (__v16qi)__b, 0, 16 + 0, 1, 16 + 1, 2, 16 + 2, 3, 16 + 3, 4,
4528      16 + 4, 5, 16 + 5, 6, 16 + 6, 7, 16 + 7);
4529}
4530
4531/// Unpacks the low-order (index 0-3) values from each of the two 128-bit
4532///    vectors of [8 x i16] and interleaves them into a 128-bit vector of
4533///    [8 x i16].
4534///
4535/// \headerfile <x86intrin.h>
4536///
4537/// This intrinsic corresponds to the <c> VPUNPCKLWD / PUNPCKLWD </c>
4538///   instruction.
4539///
4540/// \param __a
4541///    A 128-bit vector of [8 x i16].
4542///    Bits [15:0] are written to bits [15:0] of the result. \n
4543///    Bits [31:16] are written to bits [47:32] of the result. \n
4544///    Bits [47:32] are written to bits [79:64] of the result. \n
4545///    Bits [63:48] are written to bits [111:96] of the result.
4546/// \param __b
4547///    A 128-bit vector of [8 x i16].
4548///    Bits [15:0] are written to bits [31:16] of the result. \n
4549///    Bits [31:16] are written to bits [63:48] of the result. \n
4550///    Bits [47:32] are written to bits [95:80] of the result. \n
4551///    Bits [63:48] are written to bits [127:112] of the result.
4552/// \returns A 128-bit vector of [8 x i16] containing the interleaved values.
4553static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi16(__m128i __a,
4554                                                                __m128i __b) {
4555  return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 0, 8 + 0, 1,
4556                                          8 + 1, 2, 8 + 2, 3, 8 + 3);
4557}
4558
4559/// Unpacks the low-order (index 0,1) values from two 128-bit vectors of
4560///    [4 x i32] and interleaves them into a 128-bit vector of [4 x i32].
4561///
4562/// \headerfile <x86intrin.h>
4563///
4564/// This intrinsic corresponds to the <c> VPUNPCKLDQ / PUNPCKLDQ </c>
4565///   instruction.
4566///
4567/// \param __a
4568///    A 128-bit vector of [4 x i32]. \n
4569///    Bits [31:0] are written to bits [31:0] of the destination. \n
4570///    Bits [63:32] are written to bits [95:64] of the destination.
4571/// \param __b
4572///    A 128-bit vector of [4 x i32]. \n
4573///    Bits [31:0] are written to bits [64:32] of the destination. \n
4574///    Bits [63:32] are written to bits [127:96] of the destination.
4575/// \returns A 128-bit vector of [4 x i32] containing the interleaved values.
4576static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi32(__m128i __a,
4577                                                                __m128i __b) {
4578  return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 0, 4 + 0, 1,
4579                                          4 + 1);
4580}
4581
4582/// Unpacks the low-order 64-bit elements from two 128-bit vectors of
4583///    [2 x i64] and interleaves them into a 128-bit vector of [2 x i64].
4584///
4585/// \headerfile <x86intrin.h>
4586///
4587/// This intrinsic corresponds to the <c> VPUNPCKLQDQ / PUNPCKLQDQ </c>
4588///   instruction.
4589///
4590/// \param __a
4591///    A 128-bit vector of [2 x i64]. \n
4592///    Bits [63:0] are written to bits [63:0] of the destination. \n
4593/// \param __b
4594///    A 128-bit vector of [2 x i64]. \n
4595///    Bits [63:0] are written to bits [127:64] of the destination. \n
4596/// \returns A 128-bit vector of [2 x i64] containing the interleaved values.
4597static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi64(__m128i __a,
4598                                                                __m128i __b) {
4599  return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 0, 2 + 0);
4600}
4601
4602/// Returns the lower 64 bits of a 128-bit integer vector as a 64-bit
4603///    integer.
4604///
4605/// \headerfile <x86intrin.h>
4606///
4607/// This intrinsic corresponds to the <c> MOVDQ2Q </c> instruction.
4608///
4609/// \param __a
4610///    A 128-bit integer vector operand. The lower 64 bits are moved to the
4611///    destination.
4612/// \returns A 64-bit integer containing the lower 64 bits of the parameter.
static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_movepi64_pi64(__m128i __a) {
  /* Element 0 of the two-element vector is its lower 64 bits; it is cast to
   * the 64-bit __m64 type. */
  return (__m64)__a[0];
}
4617
4618/// Moves the 64-bit operand to a 128-bit integer vector, zeroing the
4619///    upper bits.
4620///
4621/// \headerfile <x86intrin.h>
4622///
4623/// This intrinsic corresponds to the <c> MOVD+VMOVQ </c> instruction.
4624///
4625/// \param __a
4626///    A 64-bit value.
4627/// \returns A 128-bit integer vector. The lower 64 bits contain the value from
4628///    the operand. The upper 64 bits are assigned zeros.
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_movpi64_epi64(__m64 __a) {
  /* Index 0 selects the single element of __a (lower half of the result);
   * index 1 selects the element of the second operand, which supplies the
   * upper half. */
  return __builtin_shufflevector((__v1di)__a, _mm_setzero_si64(), 0, 1);
}
4633
4634/// Moves the lower 64 bits of a 128-bit integer vector to a 128-bit
4635///    integer vector, zeroing the upper bits.
4636///
4637/// \headerfile <x86intrin.h>
4638///
4639/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
4640///
4641/// \param __a
4642///    A 128-bit integer vector operand. The lower 64 bits are moved to the
4643///    destination.
4644/// \returns A 128-bit integer vector. The lower 64 bits contain the value from
4645///    the operand. The upper 64 bits are assigned zeros.
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_move_epi64(__m128i __a) {
  /* Index 0 selects the lower element of __a; index 2 selects element 0 of
   * the second operand, which supplies the upper half of the result. */
  return __builtin_shufflevector((__v2di)__a, _mm_setzero_si128(), 0, 2);
}
4650
4651/// Unpacks the high-order 64-bit elements from two 128-bit vectors of
4652///    [2 x double] and interleaves them into a 128-bit vector of [2 x
4653///    double].
4654///
4655/// \headerfile <x86intrin.h>
4656///
4657/// This intrinsic corresponds to the <c> VUNPCKHPD / UNPCKHPD </c> instruction.
4658///
4659/// \param __a
4660///    A 128-bit vector of [2 x double]. \n
4661///    Bits [127:64] are written to bits [63:0] of the destination.
4662/// \param __b
4663///    A 128-bit vector of [2 x double]. \n
4664///    Bits [127:64] are written to bits [127:64] of the destination.
4665/// \returns A 128-bit vector of [2 x double] containing the interleaved values.
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_unpackhi_pd(__m128d __a, __m128d __b) {
  /* Index 1 is the upper element of __a; index 2 + 1 is the upper element of
   * __b. */
  return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 1, 2 + 1);
}
4670
4671/// Unpacks the low-order 64-bit elements from two 128-bit vectors
4672///    of [2 x double] and interleaves them into a 128-bit vector of [2 x
4673///    double].
4674///
4675/// \headerfile <x86intrin.h>
4676///
4677/// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
4678///
4679/// \param __a
4680///    A 128-bit vector of [2 x double]. \n
4681///    Bits [63:0] are written to bits [63:0] of the destination.
4682/// \param __b
4683///    A 128-bit vector of [2 x double]. \n
4684///    Bits [63:0] are written to bits [127:64] of the destination.
4685/// \returns A 128-bit vector of [2 x double] containing the interleaved values.
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_unpacklo_pd(__m128d __a, __m128d __b) {
  /* Index 0 is the lower element of __a; index 2 + 0 is the lower element of
   * __b. */
  return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 0, 2 + 0);
}
4690
4691/// Extracts the sign bits of the double-precision values in the 128-bit
4692///    vector of [2 x double], zero-extends the value, and writes it to the
4693///    low-order bits of the destination.
4694///
4695/// \headerfile <x86intrin.h>
4696///
4697/// This intrinsic corresponds to the <c> VMOVMSKPD / MOVMSKPD </c> instruction.
4698///
4699/// \param __a
4700///    A 128-bit vector of [2 x double] containing the values with sign bits to
4701///    be extracted.
4702/// \returns The sign bits from each of the double-precision elements in \a __a,
4703///    written to bits [1:0]. The remaining bits are assigned values of zero.
4704static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_pd(__m128d __a) {
4705  return __builtin_ia32_movmskpd((__v2df)__a);
4706}
4707
/// Constructs a 128-bit floating-point vector of [2 x double] from two
///    128-bit vector parameters of [2 x double], using the immediate-value
///    parameter as a specifier.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_shuffle_pd(__m128d a, __m128d b, const int i);
/// \endcode
///
/// This intrinsic corresponds to the <c> VSHUFPD / SHUFPD </c> instruction.
///
/// \param a
///    A 128-bit vector of [2 x double].
/// \param b
///    A 128-bit vector of [2 x double].
/// \param i
///    An 8-bit immediate value. The least significant two bits specify which
///    elements to copy from \a a and \a b: \n
///    Bit[0] = 0: lower element of \a a copied to lower element of result. \n
///    Bit[0] = 1: upper element of \a a copied to lower element of result. \n
///    Bit[1] = 0: lower element of \a b copied to upper element of result. \n
///    Bit[1] = 1: upper element of \a b copied to upper element of result. \n
///    Note: To generate a mask, you can use the \c _MM_SHUFFLE2 macro.
///    <c>_MM_SHUFFLE2(b1, b0)</c> can create a 2-bit mask of the form
///    <c>[b1, b0]</c>.
/// \returns A 128-bit vector of [2 x double] containing the shuffled values.
#define _mm_shuffle_pd(a, b, i)                                                \
  ((__m128d)__builtin_ia32_shufpd((__v2df)(__m128d)(a), (__v2df)(__m128d)(b),  \
                                  (int)(i)))
4738
4739/// Casts a 128-bit floating-point vector of [2 x double] into a 128-bit
4740///    floating-point vector of [4 x float].
4741///
4742/// \headerfile <x86intrin.h>
4743///
4744/// This intrinsic has no corresponding instruction.
4745///
4746/// \param __a
4747///    A 128-bit floating-point vector of [2 x double].
4748/// \returns A 128-bit floating-point vector of [4 x float] containing the same
4749///    bitwise pattern as the parameter.
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_castpd_ps(__m128d __a) {
  /* Vector-to-vector cast: the [2 x double] bits are returned unchanged,
   * typed as __m128. */
  return (__m128)__a;
}
4754
4755/// Casts a 128-bit floating-point vector of [2 x double] into a 128-bit
4756///    integer vector.
4757///
4758/// \headerfile <x86intrin.h>
4759///
4760/// This intrinsic has no corresponding instruction.
4761///
4762/// \param __a
4763///    A 128-bit floating-point vector of [2 x double].
4764/// \returns A 128-bit integer vector containing the same bitwise pattern as the
4765///    parameter.
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_castpd_si128(__m128d __a) {
  /* Vector-to-vector cast: the [2 x double] bits are returned unchanged,
   * typed as __m128i. */
  return (__m128i)__a;
}
4770
4771/// Casts a 128-bit floating-point vector of [4 x float] into a 128-bit
4772///    floating-point vector of [2 x double].
4773///
4774/// \headerfile <x86intrin.h>
4775///
4776/// This intrinsic has no corresponding instruction.
4777///
4778/// \param __a
4779///    A 128-bit floating-point vector of [4 x float].
4780/// \returns A 128-bit floating-point vector of [2 x double] containing the same
4781///    bitwise pattern as the parameter.
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_castps_pd(__m128 __a) {
  /* Vector-to-vector cast: the [4 x float] bits are returned unchanged,
   * typed as __m128d. */
  return (__m128d)__a;
}
4786
4787/// Casts a 128-bit floating-point vector of [4 x float] into a 128-bit
4788///    integer vector.
4789///
4790/// \headerfile <x86intrin.h>
4791///
4792/// This intrinsic has no corresponding instruction.
4793///
4794/// \param __a
4795///    A 128-bit floating-point vector of [4 x float].
4796/// \returns A 128-bit integer vector containing the same bitwise pattern as the
4797///    parameter.
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_castps_si128(__m128 __a) {
  /* Vector-to-vector cast: the [4 x float] bits are returned unchanged,
   * typed as __m128i. */
  return (__m128i)__a;
}
4802
4803/// Casts a 128-bit integer vector into a 128-bit floating-point vector
4804///    of [4 x float].
4805///
4806/// \headerfile <x86intrin.h>
4807///
4808/// This intrinsic has no corresponding instruction.
4809///
4810/// \param __a
4811///    A 128-bit integer vector.
4812/// \returns A 128-bit floating-point vector of [4 x float] containing the same
4813///    bitwise pattern as the parameter.
static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_castsi128_ps(__m128i __a) {
  /* Vector-to-vector cast: the integer vector's bits are returned unchanged,
   * typed as __m128. */
  return (__m128)__a;
}
4818
4819/// Casts a 128-bit integer vector into a 128-bit floating-point vector
4820///    of [2 x double].
4821///
4822/// \headerfile <x86intrin.h>
4823///
4824/// This intrinsic has no corresponding instruction.
4825///
4826/// \param __a
4827///    A 128-bit integer vector.
4828/// \returns A 128-bit floating-point vector of [2 x double] containing the same
4829///    bitwise pattern as the parameter.
static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_castsi128_pd(__m128i __a) {
  /* Vector-to-vector cast: the integer vector's bits are returned unchanged,
   * typed as __m128d. */
  return (__m128d)__a;
}
4834
4835/// Compares each of the corresponding double-precision values of two
4836///    128-bit vectors of [2 x double], using the operation specified by the
4837///    immediate integer operand.
4838///
4839///    Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
4840///    If either value in a comparison is NaN, comparisons that are ordered
4841///    return false, and comparisons that are unordered return true.
4842///
4843/// \headerfile <x86intrin.h>
4844///
4845/// \code
4846/// __m128d _mm_cmp_pd(__m128d a, __m128d b, const int c);
4847/// \endcode
4848///
4849/// This intrinsic corresponds to the <c> (V)CMPPD </c> instruction.
4850///
4851/// \param a
4852///    A 128-bit vector of [2 x double].
4853/// \param b
4854///    A 128-bit vector of [2 x double].
4855/// \param c
4856///    An immediate integer operand, with bits [4:0] specifying which comparison
4857///    operation to use: \n
4858///    0x00: Equal (ordered, non-signaling) \n
4859///    0x01: Less-than (ordered, signaling) \n
4860///    0x02: Less-than-or-equal (ordered, signaling) \n
4861///    0x03: Unordered (non-signaling) \n
4862///    0x04: Not-equal (unordered, non-signaling) \n
4863///    0x05: Not-less-than (unordered, signaling) \n
4864///    0x06: Not-less-than-or-equal (unordered, signaling) \n
4865///    0x07: Ordered (non-signaling) \n
4866/// \returns A 128-bit vector of [2 x double] containing the comparison results.
/* a and b are cast to __m128d and then to __v2df before reaching the builtin;
 * c is forwarded to the builtin without a cast. */
#define _mm_cmp_pd(a, b, c)                                                    \
  ((__m128d)__builtin_ia32_cmppd((__v2df)(__m128d)(a), (__v2df)(__m128d)(b),   \
                                 (c)))
4870
4871/// Compares each of the corresponding scalar double-precision values of
4872///    two 128-bit vectors of [2 x double], using the operation specified by the
4873///    immediate integer operand.
4874///
4875///    Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
4876///    If either value in a comparison is NaN, comparisons that are ordered
4877///    return false, and comparisons that are unordered return true.
4878///
4879/// \headerfile <x86intrin.h>
4880///
4881/// \code
4882/// __m128d _mm_cmp_sd(__m128d a, __m128d b, const int c);
4883/// \endcode
4884///
4885/// This intrinsic corresponds to the <c> (V)CMPSD </c> instruction.
4886///
4887/// \param a
4888///    A 128-bit vector of [2 x double].
4889/// \param b
4890///    A 128-bit vector of [2 x double].
4891/// \param c
4892///    An immediate integer operand, with bits [4:0] specifying which comparison
4893///    operation to use: \n
4894///    0x00: Equal (ordered, non-signaling) \n
4895///    0x01: Less-than (ordered, signaling) \n
4896///    0x02: Less-than-or-equal (ordered, signaling) \n
4897///    0x03: Unordered (non-signaling) \n
4898///    0x04: Not-equal (unordered, non-signaling) \n
4899///    0x05: Not-less-than (unordered, signaling) \n
4900///    0x06: Not-less-than-or-equal (unordered, signaling) \n
4901///    0x07: Ordered (non-signaling) \n
4902/// \returns A 128-bit vector of [2 x double] containing the comparison results.
/* a and b are cast to __m128d and then to __v2df before reaching the builtin;
 * c is forwarded to the builtin without a cast. */
#define _mm_cmp_sd(a, b, c)                                                    \
  ((__m128d)__builtin_ia32_cmpsd((__v2df)(__m128d)(a), (__v2df)(__m128d)(b),   \
                                 (c)))
4906
/* _mm_pause is only declared here (no body is provided in this header). The
 * extern "C" wrapper gives the declaration C linkage when the header is
 * compiled as C++. */
#if defined(__cplusplus)
extern "C" {
#endif

/// Indicates that a spin loop is being executed for the purposes of
///    optimizing power consumption during the loop.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> PAUSE </c> instruction.
///
void _mm_pause(void);

#if defined(__cplusplus)
} // extern "C"
#endif
4923
/* Remove the header-private helper macros (presumably defined earlier in this
 * header) so they are not visible to code that includes it. */
#undef __anyext128
#undef __trunc64
#undef __DEFAULT_FN_ATTRS
#undef __DEFAULT_FN_ATTRS_CONSTEXPR

/* Builds a 2-bit selector with x in bit 1 and y in bit 0, for use as the
 * immediate operand of _mm_shuffle_pd. */
#define _MM_SHUFFLE2(x, y) (((x) << 1) | (y))

/* Values produced by _MM_GET_DENORMALS_ZERO_MODE and accepted by
 * _MM_SET_DENORMALS_ZERO_MODE. */
#define _MM_DENORMALS_ZERO_ON (0x0040U)
#define _MM_DENORMALS_ZERO_OFF (0x0000U)

/* Selects bit 6 (0x0040) of the value read by _mm_getcsr(). */
#define _MM_DENORMALS_ZERO_MASK (0x0040U)

/* Reads the current setting: the _mm_getcsr() value masked down to the
 * denormals-zero bit. */
#define _MM_GET_DENORMALS_ZERO_MODE() (_mm_getcsr() & _MM_DENORMALS_ZERO_MASK)
/* Writes a new setting: clears the denormals-zero bit in the _mm_getcsr()
 * value, ORs in x, and passes the result to _mm_setcsr(). */
#define _MM_SET_DENORMALS_ZERO_MODE(x)                                         \
  (_mm_setcsr((_mm_getcsr() & ~_MM_DENORMALS_ZERO_MASK) | (x)))
4939
4940#endif /* __EMMINTRIN_H */