zig/lib/include/smmintrin.h at master

   1/*===---- smmintrin.h - SSE4 intrinsics ------------------------------------===
   2 *
   3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4 * See https://llvm.org/LICENSE.txt for license information.
   5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6 *
   7 *===-----------------------------------------------------------------------===
   8 */
   9
  10#ifndef __SMMINTRIN_H
  11#define __SMMINTRIN_H
  12
  13#if !defined(__i386__) && !defined(__x86_64__)
  14#error "This header is only meant to be used on x86 and x64 architecture"
  15#endif
  16
  17#include <tmmintrin.h>
  18
/* Define the default attributes for the functions in this file.
 *
 * Both variants mark a function __always_inline__ and __nodebug__, enable the
 * "sse4.1" target feature for it, and set __min_vector_width__ to 128. The
 * first variant, chosen when __EVEX512__ is defined but __AVX10_1_512__ is
 * not, additionally puts "no-evex512" in the target string.
 */
#if defined(__EVEX512__) && !defined(__AVX10_1_512__)
#define __DEFAULT_FN_ATTRS                                                     \
  __attribute__((__always_inline__, __nodebug__,                               \
                 __target__("sse4.1,no-evex512"), __min_vector_width__(128)))
#else
#define __DEFAULT_FN_ATTRS                                                     \
  __attribute__((__always_inline__, __nodebug__, __target__("sse4.1"),         \
                 __min_vector_width__(128)))
#endif
  29
/* SSE4 Rounding macros. */
/* Rounding-control selection. The description of the rounding operand M
 * further down in this file assigns bits [1:0] to the rounding mode
 * (00 nearest, 01 downward, 10 upward, 11 truncated) and bit [2] to "use the
 * current MXCSR setting". */
#define _MM_FROUND_TO_NEAREST_INT 0x00
#define _MM_FROUND_TO_NEG_INF 0x01
#define _MM_FROUND_TO_POS_INF 0x02
#define _MM_FROUND_TO_ZERO 0x03
#define _MM_FROUND_CUR_DIRECTION 0x04

/* Precision-exception control, bit [3] of the rounding operand: 0 uses the
 * normal PE exception, 1 leaves the PE field unchanged. */
#define _MM_FROUND_RAISE_EXC 0x00
#define _MM_FROUND_NO_EXC 0x08

/* Ready-made operands: one exception setting OR-ed with one rounding-control
 * selection. */
#define _MM_FROUND_NINT (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEAREST_INT)
#define _MM_FROUND_FLOOR (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEG_INF)
#define _MM_FROUND_CEIL (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_POS_INF)
#define _MM_FROUND_TRUNC (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_ZERO)
#define _MM_FROUND_RINT (_MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION)
#define _MM_FROUND_NEARBYINT (_MM_FROUND_NO_EXC | _MM_FROUND_CUR_DIRECTION)
  46
  47/// Rounds up each element of the 128-bit vector of [4 x float] to an
  48///    integer and returns the rounded values in a 128-bit vector of
  49///    [4 x float].
  50///
  51/// \headerfile <x86intrin.h>
  52///
  53/// \code
  54/// __m128 _mm_ceil_ps(__m128 X);
  55/// \endcode
  56///
  57/// This intrinsic corresponds to the <c> VROUNDPS / ROUNDPS </c> instruction.
  58///
  59/// \param X
  60///    A 128-bit vector of [4 x float] values to be rounded up.
  61/// \returns A 128-bit vector of [4 x float] containing the rounded values.
///
/// \note Implemented as a macro that expands to
///    <c>_mm_round_ps((X), _MM_FROUND_CEIL)</c>.
#define _mm_ceil_ps(X) _mm_round_ps((X), _MM_FROUND_CEIL)
  63
  64/// Rounds up each element of the 128-bit vector of [2 x double] to an
  65///    integer and returns the rounded values in a 128-bit vector of
  66///    [2 x double].
  67///
  68/// \headerfile <x86intrin.h>
  69///
  70/// \code
  71/// __m128d _mm_ceil_pd(__m128d X);
  72/// \endcode
  73///
  74/// This intrinsic corresponds to the <c> VROUNDPD / ROUNDPD </c> instruction.
  75///
  76/// \param X
  77///    A 128-bit vector of [2 x double] values to be rounded up.
  78/// \returns A 128-bit vector of [2 x double] containing the rounded values.
///
/// \note Implemented as a macro that expands to
///    <c>_mm_round_pd((X), _MM_FROUND_CEIL)</c>.
#define _mm_ceil_pd(X) _mm_round_pd((X), _MM_FROUND_CEIL)
  80
  81/// Copies three upper elements of the first 128-bit vector operand to
  82///    the corresponding three upper elements of the 128-bit result vector of
  83///    [4 x float]. Rounds up the lowest element of the second 128-bit vector
  84///    operand to an integer and copies it to the lowest element of the 128-bit
  85///    result vector of [4 x float].
  86///
  87/// \headerfile <x86intrin.h>
  88///
  89/// \code
  90/// __m128 _mm_ceil_ss(__m128 X, __m128 Y);
  91/// \endcode
  92///
  93/// This intrinsic corresponds to the <c> VROUNDSS / ROUNDSS </c> instruction.
  94///
  95/// \param X
  96///    A 128-bit vector of [4 x float]. The values stored in bits [127:32] are
  97///    copied to the corresponding bits of the result.
  98/// \param Y
  99///    A 128-bit vector of [4 x float]. The value stored in bits [31:0] is
 100///    rounded up to the nearest integer and copied to the corresponding bits
 101///    of the result.
 102/// \returns A 128-bit vector of [4 x float] containing the copied and rounded
 103///    values.
///
/// \note Implemented as a macro that expands to
///    <c>_mm_round_ss((X), (Y), _MM_FROUND_CEIL)</c>.
#define _mm_ceil_ss(X, Y) _mm_round_ss((X), (Y), _MM_FROUND_CEIL)
 105
 106/// Copies the upper element of the first 128-bit vector operand to the
 107///    corresponding upper element of the 128-bit result vector of [2 x double].
 108///    Rounds up the lower element of the second 128-bit vector operand to an
 109///    integer and copies it to the lower element of the 128-bit result vector
 110///    of [2 x double].
 111///
 112/// \headerfile <x86intrin.h>
 113///
 114/// \code
 115/// __m128d _mm_ceil_sd(__m128d X, __m128d Y);
 116/// \endcode
 117///
 118/// This intrinsic corresponds to the <c> VROUNDSD / ROUNDSD </c> instruction.
 119///
 120/// \param X
 121///    A 128-bit vector of [2 x double]. The value stored in bits [127:64] is
 122///    copied to the corresponding bits of the result.
 123/// \param Y
 124///    A 128-bit vector of [2 x double]. The value stored in bits [63:0] is
 125///    rounded up to the nearest integer and copied to the corresponding bits
 126///    of the result.
 127/// \returns A 128-bit vector of [2 x double] containing the copied and rounded
 128///    values.
///
/// \note Implemented as a macro that expands to
///    <c>_mm_round_sd((X), (Y), _MM_FROUND_CEIL)</c>.
#define _mm_ceil_sd(X, Y) _mm_round_sd((X), (Y), _MM_FROUND_CEIL)
 130
/// Rounds down each element of the 128-bit vector of [4 x float] to an
///    integer and returns the rounded values in a 128-bit vector of
///    [4 x float].
 134///
 135/// \headerfile <x86intrin.h>
 136///
 137/// \code
 138/// __m128 _mm_floor_ps(__m128 X);
 139/// \endcode
 140///
 141/// This intrinsic corresponds to the <c> VROUNDPS / ROUNDPS </c> instruction.
 142///
 143/// \param X
 144///    A 128-bit vector of [4 x float] values to be rounded down.
 145/// \returns A 128-bit vector of [4 x float] containing the rounded values.
///
/// \note Implemented as a macro that expands to
///    <c>_mm_round_ps((X), _MM_FROUND_FLOOR)</c>.
#define _mm_floor_ps(X) _mm_round_ps((X), _MM_FROUND_FLOOR)
 147
 148/// Rounds down each element of the 128-bit vector of [2 x double] to an
 149///    integer and returns the rounded values in a 128-bit vector of
 150///    [2 x double].
 151///
 152/// \headerfile <x86intrin.h>
 153///
 154/// \code
 155/// __m128d _mm_floor_pd(__m128d X);
 156/// \endcode
 157///
 158/// This intrinsic corresponds to the <c> VROUNDPD / ROUNDPD </c> instruction.
 159///
 160/// \param X
///    A 128-bit vector of [2 x double] values to be rounded down.
 162/// \returns A 128-bit vector of [2 x double] containing the rounded values.
///
/// \note Implemented as a macro that expands to
///    <c>_mm_round_pd((X), _MM_FROUND_FLOOR)</c>.
#define _mm_floor_pd(X) _mm_round_pd((X), _MM_FROUND_FLOOR)
 164
 165/// Copies three upper elements of the first 128-bit vector operand to
 166///    the corresponding three upper elements of the 128-bit result vector of
 167///    [4 x float]. Rounds down the lowest element of the second 128-bit vector
 168///    operand to an integer and copies it to the lowest element of the 128-bit
 169///    result vector of [4 x float].
 170///
 171/// \headerfile <x86intrin.h>
 172///
 173/// \code
 174/// __m128 _mm_floor_ss(__m128 X, __m128 Y);
 175/// \endcode
 176///
 177/// This intrinsic corresponds to the <c> VROUNDSS / ROUNDSS </c> instruction.
 178///
 179/// \param X
 180///    A 128-bit vector of [4 x float]. The values stored in bits [127:32] are
 181///    copied to the corresponding bits of the result.
 182/// \param Y
 183///    A 128-bit vector of [4 x float]. The value stored in bits [31:0] is
 184///    rounded down to the nearest integer and copied to the corresponding bits
 185///    of the result.
 186/// \returns A 128-bit vector of [4 x float] containing the copied and rounded
 187///    values.
///
/// \note Implemented as a macro that expands to
///    <c>_mm_round_ss((X), (Y), _MM_FROUND_FLOOR)</c>.
#define _mm_floor_ss(X, Y) _mm_round_ss((X), (Y), _MM_FROUND_FLOOR)
 189
 190/// Copies the upper element of the first 128-bit vector operand to the
 191///    corresponding upper element of the 128-bit result vector of [2 x double].
 192///    Rounds down the lower element of the second 128-bit vector operand to an
 193///    integer and copies it to the lower element of the 128-bit result vector
 194///    of [2 x double].
 195///
 196/// \headerfile <x86intrin.h>
 197///
 198/// \code
 199/// __m128d _mm_floor_sd(__m128d X, __m128d Y);
 200/// \endcode
 201///
 202/// This intrinsic corresponds to the <c> VROUNDSD / ROUNDSD </c> instruction.
 203///
 204/// \param X
 205///    A 128-bit vector of [2 x double]. The value stored in bits [127:64] is
 206///    copied to the corresponding bits of the result.
 207/// \param Y
 208///    A 128-bit vector of [2 x double]. The value stored in bits [63:0] is
 209///    rounded down to the nearest integer and copied to the corresponding bits
 210///    of the result.
 211/// \returns A 128-bit vector of [2 x double] containing the copied and rounded
 212///    values.
///
/// \note Implemented as a macro that expands to
///    <c>_mm_round_sd((X), (Y), _MM_FROUND_FLOOR)</c>.
#define _mm_floor_sd(X, Y) _mm_round_sd((X), (Y), _MM_FROUND_FLOOR)
 214
 215/// Rounds each element of the 128-bit vector of [4 x float] to an
 216///    integer value according to the rounding control specified by the second
 217///    argument and returns the rounded values in a 128-bit vector of
 218///    [4 x float].
 219///
 220/// \headerfile <x86intrin.h>
 221///
 222/// \code
 223/// __m128 _mm_round_ps(__m128 X, const int M);
 224/// \endcode
 225///
 226/// This intrinsic corresponds to the <c> VROUNDPS / ROUNDPS </c> instruction.
 227///
 228/// \param X
 229///    A 128-bit vector of [4 x float].
 230/// \param M
 231///    An integer value that specifies the rounding operation. \n
 232///    Bits [7:4] are reserved. \n
 233///    Bit [3] is a precision exception value: \n
 234///      0: A normal PE exception is used \n
 235///      1: The PE field is not updated \n
 236///    Bit [2] is the rounding control source: \n
 237///      0: Use bits [1:0] of \a M \n
 238///      1: Use the current MXCSR setting \n
 239///    Bits [1:0] contain the rounding control definition: \n
 240///      00: Nearest \n
 241///      01: Downward (toward negative infinity) \n
 242///      10: Upward (toward positive infinity) \n
 243///      11: Truncated
 244/// \returns A 128-bit vector of [4 x float] containing the rounded values.
///
/// \note Implemented as a macro: \a X is cast to <c>__v4sf</c>, \a M is
///    forwarded unchanged to <c>__builtin_ia32_roundps</c>, and the result is
///    cast to <c>__m128</c>. NOTE(review): \a M presumably has to be a
///    compile-time constant for the builtin -- confirm.
#define _mm_round_ps(X, M)                                                     \
  ((__m128)__builtin_ia32_roundps((__v4sf)(__m128)(X), (M)))
 247
 248/// Copies three upper elements of the first 128-bit vector operand to
 249///    the corresponding three upper elements of the 128-bit result vector of
 250///    [4 x float]. Rounds the lowest element of the second 128-bit vector
 251///    operand to an integer value according to the rounding control specified
 252///    by the third argument and copies it to the lowest element of the 128-bit
 253///    result vector of [4 x float].
 254///
 255/// \headerfile <x86intrin.h>
 256///
 257/// \code
 258/// __m128 _mm_round_ss(__m128 X, __m128 Y, const int M);
 259/// \endcode
 260///
 261/// This intrinsic corresponds to the <c> VROUNDSS / ROUNDSS </c> instruction.
 262///
 263/// \param X
 264///    A 128-bit vector of [4 x float]. The values stored in bits [127:32] are
 265///    copied to the corresponding bits of the result.
 266/// \param Y
 267///    A 128-bit vector of [4 x float]. The value stored in bits [31:0] is
 268///    rounded to the nearest integer using the specified rounding control and
 269///    copied to the corresponding bits of the result.
 270/// \param M
 271///    An integer value that specifies the rounding operation. \n
 272///    Bits [7:4] are reserved. \n
 273///    Bit [3] is a precision exception value: \n
 274///      0: A normal PE exception is used \n
 275///      1: The PE field is not updated \n
 276///    Bit [2] is the rounding control source: \n
 277///      0: Use bits [1:0] of \a M \n
 278///      1: Use the current MXCSR setting \n
 279///    Bits [1:0] contain the rounding control definition: \n
 280///      00: Nearest \n
 281///      01: Downward (toward negative infinity) \n
 282///      10: Upward (toward positive infinity) \n
 283///      11: Truncated
 284/// \returns A 128-bit vector of [4 x float] containing the copied and rounded
 285///    values.
///
/// \note Implemented as a macro: \a X and \a Y are cast to <c>__v4sf</c>,
///    \a M is forwarded unchanged to <c>__builtin_ia32_roundss</c>, and the
///    result is cast to <c>__m128</c>. NOTE(review): \a M presumably has to
///    be a compile-time constant for the builtin -- confirm.
#define _mm_round_ss(X, Y, M)                                                  \
  ((__m128)__builtin_ia32_roundss((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y),    \
                                  (M)))
 289
 290/// Rounds each element of the 128-bit vector of [2 x double] to an
 291///    integer value according to the rounding control specified by the second
 292///    argument and returns the rounded values in a 128-bit vector of
 293///    [2 x double].
 294///
 295/// \headerfile <x86intrin.h>
 296///
 297/// \code
 298/// __m128d _mm_round_pd(__m128d X, const int M);
 299/// \endcode
 300///
 301/// This intrinsic corresponds to the <c> VROUNDPD / ROUNDPD </c> instruction.
 302///
 303/// \param X
 304///    A 128-bit vector of [2 x double].
 305/// \param M
 306///    An integer value that specifies the rounding operation. \n
 307///    Bits [7:4] are reserved. \n
 308///    Bit [3] is a precision exception value: \n
 309///      0: A normal PE exception is used \n
 310///      1: The PE field is not updated \n
 311///    Bit [2] is the rounding control source: \n
 312///      0: Use bits [1:0] of \a M \n
 313///      1: Use the current MXCSR setting \n
 314///    Bits [1:0] contain the rounding control definition: \n
 315///      00: Nearest \n
 316///      01: Downward (toward negative infinity) \n
 317///      10: Upward (toward positive infinity) \n
 318///      11: Truncated
 319/// \returns A 128-bit vector of [2 x double] containing the rounded values.
///
/// \note Implemented as a macro: \a X is cast to <c>__v2df</c>, \a M is
///    forwarded unchanged to <c>__builtin_ia32_roundpd</c>, and the result is
///    cast to <c>__m128d</c>. NOTE(review): \a M presumably has to be a
///    compile-time constant for the builtin -- confirm.
#define _mm_round_pd(X, M)                                                     \
  ((__m128d)__builtin_ia32_roundpd((__v2df)(__m128d)(X), (M)))
 322
 323/// Copies the upper element of the first 128-bit vector operand to the
 324///    corresponding upper element of the 128-bit result vector of [2 x double].
 325///    Rounds the lower element of the second 128-bit vector operand to an
 326///    integer value according to the rounding control specified by the third
 327///    argument and copies it to the lower element of the 128-bit result vector
 328///    of [2 x double].
 329///
 330/// \headerfile <x86intrin.h>
 331///
 332/// \code
 333/// __m128d _mm_round_sd(__m128d X, __m128d Y, const int M);
 334/// \endcode
 335///
 336/// This intrinsic corresponds to the <c> VROUNDSD / ROUNDSD </c> instruction.
 337///
 338/// \param X
 339///    A 128-bit vector of [2 x double]. The value stored in bits [127:64] is
 340///    copied to the corresponding bits of the result.
 341/// \param Y
 342///    A 128-bit vector of [2 x double]. The value stored in bits [63:0] is
 343///    rounded to the nearest integer using the specified rounding control and
 344///    copied to the corresponding bits of the result.
 345/// \param M
 346///    An integer value that specifies the rounding operation. \n
 347///    Bits [7:4] are reserved. \n
 348///    Bit [3] is a precision exception value: \n
 349///      0: A normal PE exception is used \n
 350///      1: The PE field is not updated \n
 351///    Bit [2] is the rounding control source: \n
 352///      0: Use bits [1:0] of \a M \n
 353///      1: Use the current MXCSR setting \n
 354///    Bits [1:0] contain the rounding control definition: \n
 355///      00: Nearest \n
 356///      01: Downward (toward negative infinity) \n
 357///      10: Upward (toward positive infinity) \n
 358///      11: Truncated
 359/// \returns A 128-bit vector of [2 x double] containing the copied and rounded
 360///    values.
///
/// \note Implemented as a macro: \a X and \a Y are cast to <c>__v2df</c>,
///    \a M is forwarded unchanged to <c>__builtin_ia32_roundsd</c>, and the
///    result is cast to <c>__m128d</c>. NOTE(review): \a M presumably has to
///    be a compile-time constant for the builtin -- confirm.
#define _mm_round_sd(X, Y, M)                                                  \
  ((__m128d)__builtin_ia32_roundsd((__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), \
                                   (M)))
 364
 365/* SSE4 Packed Blending Intrinsics.  */
 366/// Returns a 128-bit vector of [2 x double] where the values are
 367///    selected from either the first or second operand as specified by the
 368///    third operand, the control mask.
 369///
 370/// \headerfile <x86intrin.h>
 371///
 372/// \code
 373/// __m128d _mm_blend_pd(__m128d V1, __m128d V2, const int M);
 374/// \endcode
 375///
 376/// This intrinsic corresponds to the <c> VBLENDPD / BLENDPD </c> instruction.
 377///
 378/// \param V1
 379///    A 128-bit vector of [2 x double].
 380/// \param V2
 381///    A 128-bit vector of [2 x double].
 382/// \param M
 383///    An immediate integer operand, with mask bits [1:0] specifying how the
 384///    values are to be copied. The position of the mask bit corresponds to the
 385///    index of a copied value. When a mask bit is 0, the corresponding 64-bit
 386///    element in operand \a V1 is copied to the same position in the result.
 387///    When a mask bit is 1, the corresponding 64-bit element in operand \a V2
 388///    is copied to the same position in the result.
 389/// \returns A 128-bit vector of [2 x double] containing the copied values.
///
/// \note Implemented as a macro: \a V1 and \a V2 are cast to <c>__v2df</c>,
///    \a M is cast to <c>int</c>, and the result of
///    <c>__builtin_ia32_blendpd</c> is cast to <c>__m128d</c>.
#define _mm_blend_pd(V1, V2, M)                                                \
  ((__m128d)__builtin_ia32_blendpd((__v2df)(__m128d)(V1),                      \
                                   (__v2df)(__m128d)(V2), (int)(M)))
 393
 394/// Returns a 128-bit vector of [4 x float] where the values are selected
 395///    from either the first or second operand as specified by the third
 396///    operand, the control mask.
 397///
 398/// \headerfile <x86intrin.h>
 399///
 400/// \code
 401/// __m128 _mm_blend_ps(__m128 V1, __m128 V2, const int M);
 402/// \endcode
 403///
 404/// This intrinsic corresponds to the <c> VBLENDPS / BLENDPS </c> instruction.
 405///
 406/// \param V1
 407///    A 128-bit vector of [4 x float].
 408/// \param V2
 409///    A 128-bit vector of [4 x float].
 410/// \param M
 411///    An immediate integer operand, with mask bits [3:0] specifying how the
 412///    values are to be copied. The position of the mask bit corresponds to the
 413///    index of a copied value. When a mask bit is 0, the corresponding 32-bit
 414///    element in operand \a V1 is copied to the same position in the result.
 415///    When a mask bit is 1, the corresponding 32-bit element in operand \a V2
 416///    is copied to the same position in the result.
 417/// \returns A 128-bit vector of [4 x float] containing the copied values.
///
/// \note Implemented as a macro: \a V1 and \a V2 are cast to <c>__v4sf</c>,
///    \a M is cast to <c>int</c>, and the result of
///    <c>__builtin_ia32_blendps</c> is cast to <c>__m128</c>.
#define _mm_blend_ps(V1, V2, M)                                                \
  ((__m128)__builtin_ia32_blendps((__v4sf)(__m128)(V1), (__v4sf)(__m128)(V2),  \
                                  (int)(M)))
 421
 422/// Returns a 128-bit vector of [2 x double] where the values are
 423///    selected from either the first or second operand as specified by the
 424///    third operand, the control mask.
 425///
 426/// \headerfile <x86intrin.h>
 427///
 428/// This intrinsic corresponds to the <c> VBLENDVPD / BLENDVPD </c> instruction.
 429///
 430/// \param __V1
 431///    A 128-bit vector of [2 x double].
 432/// \param __V2
 433///    A 128-bit vector of [2 x double].
 434/// \param __M
 435///    A 128-bit vector operand, with mask bits 127 and 63 specifying how the
 436///    values are to be copied. The position of the mask bit corresponds to the
 437///    most significant bit of a copied value. When a mask bit is 0, the
 438///    corresponding 64-bit element in operand \a __V1 is copied to the same
 439///    position in the result. When a mask bit is 1, the corresponding 64-bit
 440///    element in operand \a __V2 is copied to the same position in the result.
 441/// \returns A 128-bit vector of [2 x double] containing the copied values.
 442static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_blendv_pd(__m128d __V1,
 443                                                           __m128d __V2,
 444                                                           __m128d __M) {
 445  return (__m128d)__builtin_ia32_blendvpd((__v2df)__V1, (__v2df)__V2,
 446                                          (__v2df)__M);
 447}
 448
 449/// Returns a 128-bit vector of [4 x float] where the values are
 450///    selected from either the first or second operand as specified by the
 451///    third operand, the control mask.
 452///
 453/// \headerfile <x86intrin.h>
 454///
 455/// This intrinsic corresponds to the <c> VBLENDVPS / BLENDVPS </c> instruction.
 456///
 457/// \param __V1
 458///    A 128-bit vector of [4 x float].
 459/// \param __V2
 460///    A 128-bit vector of [4 x float].
 461/// \param __M
 462///    A 128-bit vector operand, with mask bits 127, 95, 63, and 31 specifying
 463///    how the values are to be copied. The position of the mask bit corresponds
 464///    to the most significant bit of a copied value. When a mask bit is 0, the
 465///    corresponding 32-bit element in operand \a __V1 is copied to the same
 466///    position in the result. When a mask bit is 1, the corresponding 32-bit
 467///    element in operand \a __V2 is copied to the same position in the result.
 468/// \returns A 128-bit vector of [4 x float] containing the copied values.
 469static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_blendv_ps(__m128 __V1,
 470                                                          __m128 __V2,
 471                                                          __m128 __M) {
 472  return (__m128)__builtin_ia32_blendvps((__v4sf)__V1, (__v4sf)__V2,
 473                                         (__v4sf)__M);
 474}
 475
 476/// Returns a 128-bit vector of [16 x i8] where the values are selected
 477///    from either of the first or second operand as specified by the third
 478///    operand, the control mask.
 479///
 480/// \headerfile <x86intrin.h>
 481///
 482/// This intrinsic corresponds to the <c> VPBLENDVB / PBLENDVB </c> instruction.
 483///
 484/// \param __V1
 485///    A 128-bit vector of [16 x i8].
 486/// \param __V2
 487///    A 128-bit vector of [16 x i8].
 488/// \param __M
 489///    A 128-bit vector operand, with mask bits 127, 119, 111...7 specifying
 490///    how the values are to be copied. The position of the mask bit corresponds
 491///    to the most significant bit of a copied value. When a mask bit is 0, the
 492///    corresponding 8-bit element in operand \a __V1 is copied to the same
 493///    position in the result. When a mask bit is 1, the corresponding 8-bit
 494///    element in operand \a __V2 is copied to the same position in the result.
 495/// \returns A 128-bit vector of [16 x i8] containing the copied values.
 496static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_blendv_epi8(__m128i __V1,
 497                                                             __m128i __V2,
 498                                                             __m128i __M) {
 499  return (__m128i)__builtin_ia32_pblendvb128((__v16qi)__V1, (__v16qi)__V2,
 500                                             (__v16qi)__M);
 501}
 502
 503/// Returns a 128-bit vector of [8 x i16] where the values are selected
 504///    from either of the first or second operand as specified by the third
 505///    operand, the control mask.
 506///
 507/// \headerfile <x86intrin.h>
 508///
 509/// \code
 510/// __m128i _mm_blend_epi16(__m128i V1, __m128i V2, const int M);
 511/// \endcode
 512///
 513/// This intrinsic corresponds to the <c> VPBLENDW / PBLENDW </c> instruction.
 514///
 515/// \param V1
 516///    A 128-bit vector of [8 x i16].
 517/// \param V2
 518///    A 128-bit vector of [8 x i16].
 519/// \param M
 520///    An immediate integer operand, with mask bits [7:0] specifying how the
 521///    values are to be copied. The position of the mask bit corresponds to the
 522///    index of a copied value. When a mask bit is 0, the corresponding 16-bit
 523///    element in operand \a V1 is copied to the same position in the result.
 524///    When a mask bit is 1, the corresponding 16-bit element in operand \a V2
 525///    is copied to the same position in the result.
 526/// \returns A 128-bit vector of [8 x i16] containing the copied values.
///
/// \note Implemented as a macro: \a V1 and \a V2 are cast to <c>__v8hi</c>,
///    \a M is cast to <c>int</c>, and the result of
///    <c>__builtin_ia32_pblendw128</c> is cast to <c>__m128i</c>.
#define _mm_blend_epi16(V1, V2, M)                                             \
  ((__m128i)__builtin_ia32_pblendw128((__v8hi)(__m128i)(V1),                   \
                                      (__v8hi)(__m128i)(V2), (int)(M)))
 530
 531/* SSE4 Dword Multiply Instructions.  */
 532/// Multiples corresponding elements of two 128-bit vectors of [4 x i32]
 533///    and returns the lower 32 bits of the each product in a 128-bit vector of
 534///    [4 x i32].
 535///
 536/// \headerfile <x86intrin.h>
 537///
 538/// This intrinsic corresponds to the <c> VPMULLD / PMULLD </c> instruction.
 539///
 540/// \param __V1
 541///    A 128-bit integer vector.
 542/// \param __V2
 543///    A 128-bit integer vector.
 544/// \returns A 128-bit integer vector containing the products of both operands.
 545static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mullo_epi32(__m128i __V1,
 546                                                             __m128i __V2) {
 547  return (__m128i)((__v4su)__V1 * (__v4su)__V2);
 548}
 549
 550/// Multiplies corresponding even-indexed elements of two 128-bit
 551///    vectors of [4 x i32] and returns a 128-bit vector of [2 x i64]
 552///    containing the products.
 553///
 554/// \headerfile <x86intrin.h>
 555///
 556/// This intrinsic corresponds to the <c> VPMULDQ / PMULDQ </c> instruction.
 557///
 558/// \param __V1
 559///    A 128-bit vector of [4 x i32].
 560/// \param __V2
 561///    A 128-bit vector of [4 x i32].
 562/// \returns A 128-bit vector of [2 x i64] containing the products of both
 563///    operands.
 564static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mul_epi32(__m128i __V1,
 565                                                           __m128i __V2) {
 566  return (__m128i)__builtin_ia32_pmuldq128((__v4si)__V1, (__v4si)__V2);
 567}
 568
 569/* SSE4 Floating Point Dot Product Instructions.  */
 570/// Computes the dot product of the two 128-bit vectors of [4 x float]
 571///    and returns it in the elements of the 128-bit result vector of
 572///    [4 x float].
 573///
 574///    The immediate integer operand controls which input elements
 575///    will contribute to the dot product, and where the final results are
 576///    returned.
 577///
 578/// \headerfile <x86intrin.h>
 579///
 580/// \code
 581/// __m128 _mm_dp_ps(__m128 X, __m128 Y, const int M);
 582/// \endcode
 583///
 584/// This intrinsic corresponds to the <c> VDPPS / DPPS </c> instruction.
 585///
 586/// \param X
 587///    A 128-bit vector of [4 x float].
 588/// \param Y
 589///    A 128-bit vector of [4 x float].
 590/// \param M
 591///    An immediate integer operand. Mask bits [7:4] determine which elements
 592///    of the input vectors are used, with bit [4] corresponding to the lowest
 593///    element and bit [7] corresponding to the highest element of each [4 x
 594///    float] vector. If a bit is set, the corresponding elements from the two
 595///    input vectors are used as an input for dot product; otherwise that input
 596///    is treated as zero. Bits [3:0] determine which elements of the result
 597///    will receive a copy of the final dot product, with bit [0] corresponding
 598///    to the lowest element and bit [3] corresponding to the highest element of
 599///    each [4 x float] subvector. If a bit is set, the dot product is returned
 600///    in the corresponding element; otherwise that element is set to zero.
 601/// \returns A 128-bit vector of [4 x float] containing the dot product.
///
/// \note Implemented as a macro: \a X and \a Y are cast to <c>__v4sf</c>,
///    \a M is forwarded unchanged to <c>__builtin_ia32_dpps</c>, and the
///    result is cast to <c>__m128</c>.
#define _mm_dp_ps(X, Y, M)                                                     \
  ((__m128)__builtin_ia32_dpps((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), (M)))
 604
 605/// Computes the dot product of the two 128-bit vectors of [2 x double]
 606///    and returns it in the elements of the 128-bit result vector of
 607///    [2 x double].
 608///
 609///    The immediate integer operand controls which input
 610///    elements will contribute to the dot product, and where the final results
 611///    are returned.
 612///
 613/// \headerfile <x86intrin.h>
 614///
 615/// \code
 616/// __m128d _mm_dp_pd(__m128d X, __m128d Y, const int M);
 617/// \endcode
 618///
 619/// This intrinsic corresponds to the <c> VDPPD / DPPD </c> instruction.
 620///
 621/// \param X
 622///    A 128-bit vector of [2 x double].
 623/// \param Y
 624///    A 128-bit vector of [2 x double].
 625/// \param M
 626///    An immediate integer operand. Mask bits [5:4] determine which elements
 627///    of the input vectors are used, with bit [4] corresponding to the lowest
///    element and bit [5] corresponding to the highest element of each [2 x
 629///    double] vector. If a bit is set, the corresponding elements from the two
 630///    input vectors are used as an input for dot product; otherwise that input
 631///    is treated as zero. Bits [1:0] determine which elements of the result
 632///    will receive a copy of the final dot product, with bit [0] corresponding
 633///    to the lowest element and bit [1] corresponding to the highest element of
 634///    each [2 x double] vector. If a bit is set, the dot product is returned in
 635///    the corresponding element; otherwise that element is set to zero.
/// \returns A 128-bit vector of [2 x double] containing the dot product.
///
/// \note Implemented as a macro: \a X and \a Y are cast to <c>__v2df</c>,
///    \a M is forwarded unchanged to <c>__builtin_ia32_dppd</c>, and the
///    result is cast to <c>__m128d</c>.
#define _mm_dp_pd(X, Y, M)                                                     \
  ((__m128d)__builtin_ia32_dppd((__v2df)(__m128d)(X), (__v2df)(__m128d)(Y),    \
                                (M)))
 639
 640/* SSE4 Streaming Load Hint Instruction.  */
 641/// Loads integer values from a 128-bit aligned memory location to a
 642///    128-bit integer vector.
 643///
 644/// \headerfile <x86intrin.h>
 645///
 646/// This intrinsic corresponds to the <c> VMOVNTDQA / MOVNTDQA </c> instruction.
 647///
 648/// \param __V
 649///    A pointer to a 128-bit aligned memory location that contains the integer
 650///    values.
 651/// \returns A 128-bit integer vector containing the data stored at the
 652///    specified memory location.
 653static __inline__ __m128i __DEFAULT_FN_ATTRS
 654_mm_stream_load_si128(const void *__V) {
 655  return (__m128i)__builtin_nontemporal_load((const __v2di *)__V);
 656}
 657
 658/* SSE4 Packed Integer Min/Max Instructions.  */
 659/// Compares the corresponding elements of two 128-bit vectors of
 660///    [16 x i8] and returns a 128-bit vector of [16 x i8] containing the lesser
 661///    of the two values.
 662///
 663/// \headerfile <x86intrin.h>
 664///
 665/// This intrinsic corresponds to the <c> VPMINSB / PMINSB </c> instruction.
 666///
 667/// \param __V1
 668///    A 128-bit vector of [16 x i8].
 669/// \param __V2
 670///    A 128-bit vector of [16 x i8]
 671/// \returns A 128-bit vector of [16 x i8] containing the lesser values.
 672static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epi8(__m128i __V1,
 673                                                          __m128i __V2) {
 674  return (__m128i)__builtin_elementwise_min((__v16qs)__V1, (__v16qs)__V2);
 675}
 676
 677/// Compares the corresponding elements of two 128-bit vectors of
 678///    [16 x i8] and returns a 128-bit vector of [16 x i8] containing the
 679///    greater value of the two.
 680///
 681/// \headerfile <x86intrin.h>
 682///
 683/// This intrinsic corresponds to the <c> VPMAXSB / PMAXSB </c> instruction.
 684///
 685/// \param __V1
 686///    A 128-bit vector of [16 x i8].
 687/// \param __V2
 688///    A 128-bit vector of [16 x i8].
 689/// \returns A 128-bit vector of [16 x i8] containing the greater values.
 690static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epi8(__m128i __V1,
 691                                                          __m128i __V2) {
 692  return (__m128i)__builtin_elementwise_max((__v16qs)__V1, (__v16qs)__V2);
 693}
 694
 695/// Compares the corresponding elements of two 128-bit vectors of
 696///    [8 x u16] and returns a 128-bit vector of [8 x u16] containing the lesser
 697///    value of the two.
 698///
 699/// \headerfile <x86intrin.h>
 700///
 701/// This intrinsic corresponds to the <c> VPMINUW / PMINUW </c> instruction.
 702///
 703/// \param __V1
 704///    A 128-bit vector of [8 x u16].
 705/// \param __V2
 706///    A 128-bit vector of [8 x u16].
 707/// \returns A 128-bit vector of [8 x u16] containing the lesser values.
 708static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epu16(__m128i __V1,
 709                                                           __m128i __V2) {
 710  return (__m128i)__builtin_elementwise_min((__v8hu)__V1, (__v8hu)__V2);
 711}
 712
 713/// Compares the corresponding elements of two 128-bit vectors of
 714///    [8 x u16] and returns a 128-bit vector of [8 x u16] containing the
 715///    greater value of the two.
 716///
 717/// \headerfile <x86intrin.h>
 718///
 719/// This intrinsic corresponds to the <c> VPMAXUW / PMAXUW </c> instruction.
 720///
 721/// \param __V1
 722///    A 128-bit vector of [8 x u16].
 723/// \param __V2
 724///    A 128-bit vector of [8 x u16].
 725/// \returns A 128-bit vector of [8 x u16] containing the greater values.
 726static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epu16(__m128i __V1,
 727                                                           __m128i __V2) {
 728  return (__m128i)__builtin_elementwise_max((__v8hu)__V1, (__v8hu)__V2);
 729}
 730
 731/// Compares the corresponding elements of two 128-bit vectors of
 732///    [4 x i32] and returns a 128-bit vector of [4 x i32] containing the lesser
 733///    value of the two.
 734///
 735/// \headerfile <x86intrin.h>
 736///
 737/// This intrinsic corresponds to the <c> VPMINSD / PMINSD </c> instruction.
 738///
 739/// \param __V1
 740///    A 128-bit vector of [4 x i32].
 741/// \param __V2
 742///    A 128-bit vector of [4 x i32].
 743/// \returns A 128-bit vector of [4 x i32] containing the lesser values.
 744static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epi32(__m128i __V1,
 745                                                           __m128i __V2) {
 746  return (__m128i)__builtin_elementwise_min((__v4si)__V1, (__v4si)__V2);
 747}
 748
 749/// Compares the corresponding elements of two 128-bit vectors of
 750///    [4 x i32] and returns a 128-bit vector of [4 x i32] containing the
 751///    greater value of the two.
 752///
 753/// \headerfile <x86intrin.h>
 754///
 755/// This intrinsic corresponds to the <c> VPMAXSD / PMAXSD </c> instruction.
 756///
 757/// \param __V1
 758///    A 128-bit vector of [4 x i32].
 759/// \param __V2
 760///    A 128-bit vector of [4 x i32].
 761/// \returns A 128-bit vector of [4 x i32] containing the greater values.
 762static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epi32(__m128i __V1,
 763                                                           __m128i __V2) {
 764  return (__m128i)__builtin_elementwise_max((__v4si)__V1, (__v4si)__V2);
 765}
 766
 767/// Compares the corresponding elements of two 128-bit vectors of
 768///    [4 x u32] and returns a 128-bit vector of [4 x u32] containing the lesser
 769///    value of the two.
 770///
 771/// \headerfile <x86intrin.h>
 772///
 773/// This intrinsic corresponds to the <c> VPMINUD / PMINUD </c>  instruction.
 774///
 775/// \param __V1
 776///    A 128-bit vector of [4 x u32].
 777/// \param __V2
 778///    A 128-bit vector of [4 x u32].
 779/// \returns A 128-bit vector of [4 x u32] containing the lesser values.
 780static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epu32(__m128i __V1,
 781                                                           __m128i __V2) {
 782  return (__m128i)__builtin_elementwise_min((__v4su)__V1, (__v4su)__V2);
 783}
 784
 785/// Compares the corresponding elements of two 128-bit vectors of
 786///    [4 x u32] and returns a 128-bit vector of [4 x u32] containing the
 787///    greater value of the two.
 788///
 789/// \headerfile <x86intrin.h>
 790///
 791/// This intrinsic corresponds to the <c> VPMAXUD / PMAXUD </c> instruction.
 792///
 793/// \param __V1
 794///    A 128-bit vector of [4 x u32].
 795/// \param __V2
 796///    A 128-bit vector of [4 x u32].
 797/// \returns A 128-bit vector of [4 x u32] containing the greater values.
 798static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epu32(__m128i __V1,
 799                                                           __m128i __V2) {
 800  return (__m128i)__builtin_elementwise_max((__v4su)__V1, (__v4su)__V2);
 801}
 802
 803/* SSE4 Insertion and Extraction from XMM Register Instructions.  */
 804/// Takes the first argument \a X and inserts an element from the second
 805///    argument \a Y as selected by the third argument \a N. That result then
 806///    has elements zeroed out also as selected by the third argument \a N. The
 807///    resulting 128-bit vector of [4 x float] is then returned.
 808///
 809/// \headerfile <x86intrin.h>
 810///
 811/// \code
 812/// __m128 _mm_insert_ps(__m128 X, __m128 Y, const int N);
 813/// \endcode
 814///
 815/// This intrinsic corresponds to the <c> VINSERTPS </c> instruction.
 816///
 817/// \param X
 818///    A 128-bit vector source operand of [4 x float]. With the exception of
 819///    those bits in the result copied from parameter \a Y and zeroed by bits
 820///    [3:0] of \a N, all bits from this parameter are copied to the result.
 821/// \param Y
 822///    A 128-bit vector source operand of [4 x float]. One single-precision
 823///    floating-point element from this source, as determined by the immediate
 824///    parameter, is copied to the result.
 825/// \param N
 826///    Specifies which bits from operand \a Y will be copied, which bits in the
 827///    result they will be copied to, and which bits in the result will be
 828///    cleared. The following assignments are made: \n
 829///    Bits [7:6] specify the bits to copy from operand \a Y: \n
 830///      00: Selects bits [31:0] from operand \a Y. \n
 831///      01: Selects bits [63:32] from operand \a Y. \n
 832///      10: Selects bits [95:64] from operand \a Y. \n
 833///      11: Selects bits [127:96] from operand \a Y. \n
 834///    Bits [5:4] specify the bits in the result to which the selected bits
 835///    from operand \a Y are copied: \n
 836///      00: Copies the selected bits from \a Y to result bits [31:0]. \n
 837///      01: Copies the selected bits from \a Y to result bits [63:32]. \n
 838///      10: Copies the selected bits from \a Y to result bits [95:64]. \n
 839///      11: Copies the selected bits from \a Y to result bits [127:96]. \n
 840///    Bits[3:0]: If any of these bits are set, the corresponding result
 841///    element is cleared.
 842/// \returns A 128-bit vector of [4 x float] containing the copied
 843///    single-precision floating point elements from the operands.
 844#define _mm_insert_ps(X, Y, N) __builtin_ia32_insertps128((X), (Y), (N))
 845
 846/// Extracts a 32-bit integer from a 128-bit vector of [4 x float] and
 847///    returns it, using the immediate value parameter \a N as a selector.
 848///
 849/// \headerfile <x86intrin.h>
 850///
 851/// \code
 852/// int _mm_extract_ps(__m128 X, const int N);
 853/// \endcode
 854///
 855/// This intrinsic corresponds to the <c> VEXTRACTPS / EXTRACTPS </c>
 856/// instruction.
 857///
 858/// \param X
 859///    A 128-bit vector of [4 x float].
 860/// \param N
 861///    An immediate value. Bits [1:0] determines which bits from the argument
 862///    \a X are extracted and returned: \n
 863///    00: Bits [31:0] of parameter \a X are returned. \n
 864///    01: Bits [63:32] of parameter \a X are returned. \n
 865///    10: Bits [95:64] of parameter \a X are returned. \n
 866///    11: Bits [127:96] of parameter \a X are returned.
 867/// \returns A 32-bit integer containing the extracted 32 bits of float data.
 868#define _mm_extract_ps(X, N)                                                   \
 869  __builtin_bit_cast(                                                          \
 870      int, __builtin_ia32_vec_ext_v4sf((__v4sf)(__m128)(X), (int)(N)))
 871
 872/* Miscellaneous insert and extract macros.  */
 873/* Extract a single-precision float from X at index N into D.  */
 874#define _MM_EXTRACT_FLOAT(D, X, N)                                             \
 875  do {                                                                         \
 876    (D) = __builtin_ia32_vec_ext_v4sf((__v4sf)(__m128)(X), (int)(N));          \
 877  } while (0)
 878
 879/* Or together 2 sets of indexes (X and Y) with the zeroing bits (Z) to create
 880   an index suitable for _mm_insert_ps.  */
 881#define _MM_MK_INSERTPS_NDX(X, Y, Z) (((X) << 6) | ((Y) << 4) | (Z))
 882
 883/* Extract a float from X at index N into the first index of the return.  */
 884#define _MM_PICK_OUT_PS(X, N)                                                  \
 885  _mm_insert_ps(_mm_setzero_ps(), (X), _MM_MK_INSERTPS_NDX((N), 0, 0x0e))
 886
 887/* Insert int into packed integer array at index.  */
 888/// Constructs a 128-bit vector of [16 x i8] by first making a copy of
 889///    the 128-bit integer vector parameter, and then inserting the lower 8 bits
 890///    of an integer parameter \a I into an offset specified by the immediate
 891///    value parameter \a N.
 892///
 893/// \headerfile <x86intrin.h>
 894///
 895/// \code
 896/// __m128i _mm_insert_epi8(__m128i X, int I, const int N);
 897/// \endcode
 898///
 899/// This intrinsic corresponds to the <c> VPINSRB / PINSRB </c> instruction.
 900///
 901/// \param X
 902///    A 128-bit integer vector of [16 x i8]. This vector is copied to the
 903///    result and then one of the sixteen elements in the result vector is
 904///    replaced by the lower 8 bits of \a I.
 905/// \param I
 906///    An integer. The lower 8 bits of this operand are written to the result
 907///    beginning at the offset specified by \a N.
 908/// \param N
 909///    An immediate value. Bits [3:0] specify the bit offset in the result at
 910///    which the lower 8 bits of \a I are written. \n
 911///    0000: Bits [7:0] of the result are used for insertion. \n
 912///    0001: Bits [15:8] of the result are used for insertion. \n
 913///    0010: Bits [23:16] of the result are used for insertion. \n
 914///    0011: Bits [31:24] of the result are used for insertion. \n
 915///    0100: Bits [39:32] of the result are used for insertion. \n
 916///    0101: Bits [47:40] of the result are used for insertion. \n
 917///    0110: Bits [55:48] of the result are used for insertion. \n
 918///    0111: Bits [63:56] of the result are used for insertion. \n
 919///    1000: Bits [71:64] of the result are used for insertion. \n
 920///    1001: Bits [79:72] of the result are used for insertion. \n
 921///    1010: Bits [87:80] of the result are used for insertion. \n
 922///    1011: Bits [95:88] of the result are used for insertion. \n
 923///    1100: Bits [103:96] of the result are used for insertion. \n
 924///    1101: Bits [111:104] of the result are used for insertion. \n
 925///    1110: Bits [119:112] of the result are used for insertion. \n
 926///    1111: Bits [127:120] of the result are used for insertion.
 927/// \returns A 128-bit integer vector containing the constructed values.
 928#define _mm_insert_epi8(X, I, N)                                               \
 929  ((__m128i)__builtin_ia32_vec_set_v16qi((__v16qi)(__m128i)(X), (int)(I),      \
 930                                         (int)(N)))
 931
 932/// Constructs a 128-bit vector of [4 x i32] by first making a copy of
 933///    the 128-bit integer vector parameter, and then inserting the 32-bit
 934///    integer parameter \a I at the offset specified by the immediate value
 935///    parameter \a N.
 936///
 937/// \headerfile <x86intrin.h>
 938///
 939/// \code
 940/// __m128i _mm_insert_epi32(__m128i X, int I, const int N);
 941/// \endcode
 942///
 943/// This intrinsic corresponds to the <c> VPINSRD / PINSRD </c> instruction.
 944///
 945/// \param X
 946///    A 128-bit integer vector of [4 x i32]. This vector is copied to the
 947///    result and then one of the four elements in the result vector is
 948///    replaced by \a I.
 949/// \param I
 950///    A 32-bit integer that is written to the result beginning at the offset
 951///    specified by \a N.
 952/// \param N
 953///    An immediate value. Bits [1:0] specify the bit offset in the result at
 954///    which the integer \a I is written. \n
 955///    00: Bits [31:0] of the result are used for insertion. \n
 956///    01: Bits [63:32] of the result are used for insertion. \n
 957///    10: Bits [95:64] of the result are used for insertion. \n
 958///    11: Bits [127:96] of the result are used for insertion.
 959/// \returns A 128-bit integer vector containing the constructed values.
 960#define _mm_insert_epi32(X, I, N)                                              \
 961  ((__m128i)__builtin_ia32_vec_set_v4si((__v4si)(__m128i)(X), (int)(I),        \
 962                                        (int)(N)))
 963
 964#ifdef __x86_64__
 965/// Constructs a 128-bit vector of [2 x i64] by first making a copy of
 966///    the 128-bit integer vector parameter, and then inserting the 64-bit
 967///    integer parameter \a I, using the immediate value parameter \a N as an
 968///    insertion location selector.
 969///
 970/// \headerfile <x86intrin.h>
 971///
 972/// \code
 973/// __m128i _mm_insert_epi64(__m128i X, long long I, const int N);
 974/// \endcode
 975///
 976/// This intrinsic corresponds to the <c> VPINSRQ / PINSRQ </c> instruction.
 977///
 978/// \param X
 979///    A 128-bit integer vector of [2 x i64]. This vector is copied to the
 980///    result and then one of the two elements in the result vector is replaced
 981///    by \a I.
 982/// \param I
 983///    A 64-bit integer that is written to the result beginning at the offset
 984///    specified by \a N.
 985/// \param N
 986///    An immediate value. Bit [0] specifies the bit offset in the result at
 987///    which the integer \a I is written. \n
 988///    0: Bits [63:0] of the result are used for insertion. \n
 989///    1: Bits [127:64] of the result are used for insertion. \n
 990/// \returns A 128-bit integer vector containing the constructed values.
 991#define _mm_insert_epi64(X, I, N)                                              \
 992  ((__m128i)__builtin_ia32_vec_set_v2di((__v2di)(__m128i)(X), (long long)(I),  \
 993                                        (int)(N)))
 994#endif /* __x86_64__ */
 995
 996/* Extract int from packed integer array at index.  This returns the element
 997 * as a zero extended value, so it is unsigned.
 998 */
 999/// Extracts an 8-bit element from the 128-bit integer vector of
1000///    [16 x i8], using the immediate value parameter \a N as a selector.
1001///
1002/// \headerfile <x86intrin.h>
1003///
1004/// \code
1005/// int _mm_extract_epi8(__m128i X, const int N);
1006/// \endcode
1007///
1008/// This intrinsic corresponds to the <c> VPEXTRB / PEXTRB </c> instruction.
1009///
1010/// \param X
1011///    A 128-bit integer vector.
1012/// \param N
1013///    An immediate value. Bits [3:0] specify which 8-bit vector element from
1014///    the argument \a X to extract and copy to the result. \n
1015///    0000: Bits [7:0] of parameter \a X are extracted. \n
1016///    0001: Bits [15:8] of the parameter \a X are extracted. \n
1017///    0010: Bits [23:16] of the parameter \a X are extracted. \n
1018///    0011: Bits [31:24] of the parameter \a X are extracted. \n
1019///    0100: Bits [39:32] of the parameter \a X are extracted. \n
1020///    0101: Bits [47:40] of the parameter \a X are extracted. \n
1021///    0110: Bits [55:48] of the parameter \a X are extracted. \n
1022///    0111: Bits [63:56] of the parameter \a X are extracted. \n
1023///    1000: Bits [71:64] of the parameter \a X are extracted. \n
1024///    1001: Bits [79:72] of the parameter \a X are extracted. \n
1025///    1010: Bits [87:80] of the parameter \a X are extracted. \n
1026///    1011: Bits [95:88] of the parameter \a X are extracted. \n
1027///    1100: Bits [103:96] of the parameter \a X are extracted. \n
1028///    1101: Bits [111:104] of the parameter \a X are extracted. \n
1029///    1110: Bits [119:112] of the parameter \a X are extracted. \n
1030///    1111: Bits [127:120] of the parameter \a X are extracted.
1031/// \returns  An unsigned integer, whose lower 8 bits are selected from the
1032///    128-bit integer vector parameter and the remaining bits are assigned
1033///    zeros.
1034#define _mm_extract_epi8(X, N)                                                 \
1035  ((int)(unsigned char)__builtin_ia32_vec_ext_v16qi((__v16qi)(__m128i)(X),     \
1036                                                    (int)(N)))
1037
/// Reads one 32-bit element out of a 128-bit integer vector of [4 x i32].
///    The element is chosen by the immediate \a N.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_extract_epi32(__m128i X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPEXTRD / PEXTRD </c> instruction.
///
/// \param X
///    A 128-bit integer vector.
/// \param N
///    An immediate value. Bits [1:0] hold the index k of the element to
///    extract; element k occupies bits [32*k+31:32*k] of \a X
///    (00 -> [31:0], 01 -> [63:32], 10 -> [95:64], 11 -> [127:96]).
/// \returns A 32-bit integer containing the selected element.
#define _mm_extract_epi32(X, N)                                                \
  ((int)__builtin_ia32_vec_ext_v4si(                                           \
      (__v4si)(__m128i)(X), (int)(N)))
1062
/// Reads one 64-bit element out of a 128-bit integer vector of [2 x i64].
///    The element is chosen by the immediate \a N.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// long long _mm_extract_epi64(__m128i X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPEXTRQ / PEXTRQ </c> instruction
/// in 64-bit mode.
///
/// \param X
///    A 128-bit integer vector.
/// \param N
///    An immediate value. Bit [0] selects the element to return: \n
///    0: Bits [63:0] are returned. \n
///    1: Bits [127:64] are returned.
/// \returns A 64-bit integer containing the selected element.
#define _mm_extract_epi64(X, N)                                                \
  ((long long)__builtin_ia32_vec_ext_v2di(                                     \
      (__v2di)(__m128i)(X), (int)(N)))
1085
1086/* SSE4 128-bit Packed Integer Comparisons.  */
1087/// Tests whether the specified bits in a 128-bit integer vector are all
1088///    zeros.
1089///
1090/// \headerfile <x86intrin.h>
1091///
1092/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
1093///
1094/// \param __M
1095///    A 128-bit integer vector containing the bits to be tested.
1096/// \param __V
1097///    A 128-bit integer vector selecting which bits to test in operand \a __M.
1098/// \returns TRUE if the specified bits are all zeros; FALSE otherwise.
1099static __inline__ int __DEFAULT_FN_ATTRS _mm_testz_si128(__m128i __M,
1100                                                         __m128i __V) {
1101  return __builtin_ia32_ptestz128((__v2di)__M, (__v2di)__V);
1102}
1103
1104/// Tests whether the specified bits in a 128-bit integer vector are all
1105///    ones.
1106///
1107/// \headerfile <x86intrin.h>
1108///
1109/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
1110///
1111/// \param __M
1112///    A 128-bit integer vector containing the bits to be tested.
1113/// \param __V
1114///    A 128-bit integer vector selecting which bits to test in operand \a __M.
1115/// \returns TRUE if the specified bits are all ones; FALSE otherwise.
1116static __inline__ int __DEFAULT_FN_ATTRS _mm_testc_si128(__m128i __M,
1117                                                         __m128i __V) {
1118  return __builtin_ia32_ptestc128((__v2di)__M, (__v2di)__V);
1119}
1120
1121/// Tests whether the specified bits in a 128-bit integer vector are
1122///    neither all zeros nor all ones.
1123///
1124/// \headerfile <x86intrin.h>
1125///
1126/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
1127///
1128/// \param __M
1129///    A 128-bit integer vector containing the bits to be tested.
1130/// \param __V
1131///    A 128-bit integer vector selecting which bits to test in operand \a __M.
1132/// \returns TRUE if the specified bits are neither all zeros nor all ones;
1133///    FALSE otherwise.
1134static __inline__ int __DEFAULT_FN_ATTRS _mm_testnzc_si128(__m128i __M,
1135                                                           __m128i __V) {
1136  return __builtin_ia32_ptestnzc128((__v2di)__M, (__v2di)__V);
1137}
1138
/// Checks whether every bit of a 128-bit integer vector is set to 1.
///    Implemented as _mm_testc_si128 with an all-ones selection mask.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_test_all_ones(__m128i V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
///
/// \param V
///    A 128-bit integer vector containing the bits to be tested.
/// \returns TRUE if all bits of the operand are set to 1; FALSE otherwise.
#define _mm_test_all_ones(V)                                                   \
  _mm_testc_si128((V), _mm_set1_epi32(-1))
1155
/// Checks whether the bits of a 128-bit integer vector selected by a mask
///    are a mixture, that is, neither all zeros nor all ones. Alias for
///    _mm_testnzc_si128.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_test_mix_ones_zeros(__m128i M, __m128i V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
///
/// \param M
///    A 128-bit integer vector containing the bits to be tested.
/// \param V
///    A 128-bit integer vector selecting which bits of \a M to test.
/// \returns TRUE if the specified bits are neither all zeros nor all ones;
///    FALSE otherwise.
#define _mm_test_mix_ones_zeros(M, V)                                          \
  _mm_testnzc_si128((M), (V))
1174
/// Checks whether the bits of a 128-bit integer vector selected by a mask
///    are all zeros. Alias for _mm_testz_si128.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_test_all_zeros(__m128i M, __m128i V);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
///
/// \param M
///    A 128-bit integer vector containing the bits to be tested.
/// \param V
///    A 128-bit integer vector selecting which bits of \a M to test.
/// \returns TRUE if the specified bits are all zeros; FALSE otherwise.
#define _mm_test_all_zeros(M, V)                                               \
  _mm_testz_si128((M), (V))
1192
1193/* SSE4 64-bit Packed Integer Comparisons.  */
1194/// Compares each of the corresponding 64-bit values of the 128-bit
1195///    integer vectors for equality.
1196///
1197///    Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
1198///
1199/// \headerfile <x86intrin.h>
1200///
1201/// This intrinsic corresponds to the <c> VPCMPEQQ / PCMPEQQ </c> instruction.
1202///
1203/// \param __V1
1204///    A 128-bit integer vector.
1205/// \param __V2
1206///    A 128-bit integer vector.
1207/// \returns A 128-bit integer vector containing the comparison results.
1208static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi64(__m128i __V1,
1209                                                             __m128i __V2) {
1210  return (__m128i)((__v2di)__V1 == (__v2di)__V2);
1211}
1212
1213/* SSE4 Packed Integer Sign-Extension.  */
1214/// Sign-extends each of the lower eight 8-bit integer elements of a
1215///    128-bit vector of [16 x i8] to 16-bit values and returns them in a
1216///    128-bit vector of [8 x i16]. The upper eight elements of the input vector
1217///    are unused.
1218///
1219/// \headerfile <x86intrin.h>
1220///
1221/// This intrinsic corresponds to the <c> VPMOVSXBW / PMOVSXBW </c> instruction.
1222///
1223/// \param __V
1224///    A 128-bit vector of [16 x i8]. The lower eight 8-bit elements are
1225///    sign-extended to 16-bit values.
1226/// \returns A 128-bit vector of [8 x i16] containing the sign-extended values.
1227static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi8_epi16(__m128i __V) {
1228  /* This function always performs a signed extension, but __v16qi is a char
1229     which may be signed or unsigned, so use __v16qs. */
1230  return (__m128i) __builtin_convertvector(
1231      __builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3, 4, 5, 6,
1232                              7),
1233      __v8hi);
1234}
1235
1236/// Sign-extends each of the lower four 8-bit integer elements of a
1237///    128-bit vector of [16 x i8] to 32-bit values and returns them in a
1238///    128-bit vector of [4 x i32]. The upper twelve elements of the input
1239///    vector are unused.
1240///
1241/// \headerfile <x86intrin.h>
1242///
1243/// This intrinsic corresponds to the <c> VPMOVSXBD / PMOVSXBD </c> instruction.
1244///
1245/// \param __V
1246///    A 128-bit vector of [16 x i8]. The lower four 8-bit elements are
1247///    sign-extended to 32-bit values.
1248/// \returns A 128-bit vector of [4 x i32] containing the sign-extended values.
1249static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi8_epi32(__m128i __V) {
1250  /* This function always performs a signed extension, but __v16qi is a char
1251     which may be signed or unsigned, so use __v16qs. */
1252  return (__m128i) __builtin_convertvector(
1253      __builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3), __v4si);
1254}
1255
1256/// Sign-extends each of the lower two 8-bit integer elements of a
1257///    128-bit integer vector of [16 x i8] to 64-bit values and returns them in
1258///    a 128-bit vector of [2 x i64]. The upper fourteen elements of the input
1259///    vector are unused.
1260///
1261/// \headerfile <x86intrin.h>
1262///
1263/// This intrinsic corresponds to the <c> VPMOVSXBQ / PMOVSXBQ </c> instruction.
1264///
1265/// \param __V
1266///    A 128-bit vector of [16 x i8]. The lower two 8-bit elements are
1267///    sign-extended to 64-bit values.
1268/// \returns A 128-bit vector of [2 x i64] containing the sign-extended values.
1269static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi8_epi64(__m128i __V) {
1270  /* This function always performs a signed extension, but __v16qi is a char
1271     which may be signed or unsigned, so use __v16qs. */
1272  return (__m128i) __builtin_convertvector(
1273      __builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1), __v2di);
1274}
1275
1276/// Sign-extends each of the lower four 16-bit integer elements of a
1277///    128-bit integer vector of [8 x i16] to 32-bit values and returns them in
1278///    a 128-bit vector of [4 x i32]. The upper four elements of the input
1279///    vector are unused.
1280///
1281/// \headerfile <x86intrin.h>
1282///
1283/// This intrinsic corresponds to the <c> VPMOVSXWD / PMOVSXWD </c> instruction.
1284///
1285/// \param __V
1286///    A 128-bit vector of [8 x i16]. The lower four 16-bit elements are
1287///    sign-extended to 32-bit values.
1288/// \returns A 128-bit vector of [4 x i32] containing the sign-extended values.
1289static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi16_epi32(__m128i __V) {
1290  return (__m128i) __builtin_convertvector(
1291      __builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1, 2, 3), __v4si);
1292}
1293
1294/// Sign-extends each of the lower two 16-bit integer elements of a
1295///    128-bit integer vector of [8 x i16] to 64-bit values and returns them in
1296///    a 128-bit vector of [2 x i64]. The upper six elements of the input
1297///    vector are unused.
1298///
1299/// \headerfile <x86intrin.h>
1300///
1301/// This intrinsic corresponds to the <c> VPMOVSXWQ / PMOVSXWQ </c> instruction.
1302///
1303/// \param __V
1304///    A 128-bit vector of [8 x i16]. The lower two 16-bit elements are
1305///     sign-extended to 64-bit values.
1306/// \returns A 128-bit vector of [2 x i64] containing the sign-extended values.
1307static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi16_epi64(__m128i __V) {
1308  return (__m128i) __builtin_convertvector(
1309      __builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1), __v2di);
1310}
1311
1312/// Sign-extends each of the lower two 32-bit integer elements of a
1313///    128-bit integer vector of [4 x i32] to 64-bit values and returns them in
1314///    a 128-bit vector of [2 x i64]. The upper two elements of the input vector
1315///    are unused.
1316///
1317/// \headerfile <x86intrin.h>
1318///
1319/// This intrinsic corresponds to the <c> VPMOVSXDQ / PMOVSXDQ </c> instruction.
1320///
1321/// \param __V
1322///    A 128-bit vector of [4 x i32]. The lower two 32-bit elements are
1323///    sign-extended to 64-bit values.
1324/// \returns A 128-bit vector of [2 x i64] containing the sign-extended values.
1325static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi32_epi64(__m128i __V) {
1326  return (__m128i) __builtin_convertvector(
1327      __builtin_shufflevector((__v4si)__V, (__v4si)__V, 0, 1), __v2di);
1328}
1329
1330/* SSE4 Packed Integer Zero-Extension.  */
1331/// Zero-extends each of the lower eight 8-bit integer elements of a
1332///    128-bit vector of [16 x i8] to 16-bit values and returns them in a
1333///    128-bit vector of [8 x i16]. The upper eight elements of the input vector
1334///    are unused.
1335///
1336/// \headerfile <x86intrin.h>
1337///
1338/// This intrinsic corresponds to the <c> VPMOVZXBW / PMOVZXBW </c> instruction.
1339///
1340/// \param __V
1341///    A 128-bit vector of [16 x i8]. The lower eight 8-bit elements are
1342///    zero-extended to 16-bit values.
1343/// \returns A 128-bit vector of [8 x i16] containing the zero-extended values.
1344static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu8_epi16(__m128i __V) {
1345  return (__m128i) __builtin_convertvector(
1346      __builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3, 4, 5, 6,
1347                              7),
1348      __v8hi);
1349}
1350
1351/// Zero-extends each of the lower four 8-bit integer elements of a
1352///    128-bit vector of [16 x i8] to 32-bit values and returns them in a
1353///    128-bit vector of [4 x i32]. The upper twelve elements of the input
1354///    vector are unused.
1355///
1356/// \headerfile <x86intrin.h>
1357///
1358/// This intrinsic corresponds to the <c> VPMOVZXBD / PMOVZXBD </c> instruction.
1359///
1360/// \param __V
1361///    A 128-bit vector of [16 x i8]. The lower four 8-bit elements are
1362///    zero-extended to 32-bit values.
1363/// \returns A 128-bit vector of [4 x i32] containing the zero-extended values.
1364static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu8_epi32(__m128i __V) {
1365  return (__m128i) __builtin_convertvector(
1366      __builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3), __v4si);
1367}
1368
1369/// Zero-extends each of the lower two 8-bit integer elements of a
1370///    128-bit integer vector of [16 x i8] to 64-bit values and returns them in
1371///    a 128-bit vector of [2 x i64]. The upper fourteen elements of the input
1372///    vector are unused.
1373///
1374/// \headerfile <x86intrin.h>
1375///
1376/// This intrinsic corresponds to the <c> VPMOVZXBQ / PMOVZXBQ </c> instruction.
1377///
1378/// \param __V
1379///    A 128-bit vector of [16 x i8]. The lower two 8-bit elements are
1380///    zero-extended to 64-bit values.
1381/// \returns A 128-bit vector of [2 x i64] containing the zero-extended values.
1382static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu8_epi64(__m128i __V) {
1383  return (__m128i) __builtin_convertvector(
1384      __builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1), __v2di);
1385}
1386
1387/// Zero-extends each of the lower four 16-bit integer elements of a
1388///    128-bit integer vector of [8 x i16] to 32-bit values and returns them in
1389///    a 128-bit vector of [4 x i32]. The upper four elements of the input
1390///    vector are unused.
1391///
1392/// \headerfile <x86intrin.h>
1393///
1394/// This intrinsic corresponds to the <c> VPMOVZXWD / PMOVZXWD </c> instruction.
1395///
1396/// \param __V
1397///    A 128-bit vector of [8 x i16]. The lower four 16-bit elements are
1398///    zero-extended to 32-bit values.
1399/// \returns A 128-bit vector of [4 x i32] containing the zero-extended values.
1400static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu16_epi32(__m128i __V) {
1401  return (__m128i) __builtin_convertvector(
1402      __builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1, 2, 3), __v4si);
1403}
1404
1405/// Zero-extends each of the lower two 16-bit integer elements of a
1406///    128-bit integer vector of [8 x i16] to 64-bit values and returns them in
1407///    a 128-bit vector of [2 x i64]. The upper six elements of the input vector
1408///    are unused.
1409///
1410/// \headerfile <x86intrin.h>
1411///
1412/// This intrinsic corresponds to the <c> VPMOVZXWQ / PMOVZXWQ </c> instruction.
1413///
1414/// \param __V
1415///    A 128-bit vector of [8 x i16]. The lower two 16-bit elements are
1416///    zero-extended to 64-bit values.
1417/// \returns A 128-bit vector of [2 x i64] containing the zero-extended values.
1418static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu16_epi64(__m128i __V) {
1419  return (__m128i) __builtin_convertvector(
1420      __builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1), __v2di);
1421}
1422
1423/// Zero-extends each of the lower two 32-bit integer elements of a
1424///    128-bit integer vector of [4 x i32] to 64-bit values and returns them in
1425///    a 128-bit vector of [2 x i64]. The upper two elements of the input vector
1426///    are unused.
1427///
1428/// \headerfile <x86intrin.h>
1429///
1430/// This intrinsic corresponds to the <c> VPMOVZXDQ / PMOVZXDQ </c> instruction.
1431///
1432/// \param __V
1433///    A 128-bit vector of [4 x i32]. The lower two 32-bit elements are
1434///    zero-extended to 64-bit values.
1435/// \returns A 128-bit vector of [2 x i64] containing the zero-extended values.
1436static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu32_epi64(__m128i __V) {
1437  return (__m128i) __builtin_convertvector(
1438      __builtin_shufflevector((__v4su)__V, (__v4su)__V, 0, 1), __v2di);
1439}
1440
1441/* SSE4 Pack with Unsigned Saturation.  */
1442/// Converts, with saturation, 32-bit signed integers from both 128-bit integer
1443///    vector operands into 16-bit unsigned integers, and returns the packed
1444///    result.
1445///
1446///    Values greater than 0xFFFF are saturated to 0xFFFF. Values less than
1447///    0x0000 are saturated to 0x0000.
1448///
1449/// \headerfile <x86intrin.h>
1450///
1451/// This intrinsic corresponds to the <c> VPACKUSDW / PACKUSDW </c> instruction.
1452///
1453/// \param __V1
1454///    A 128-bit vector of [4 x i32]. The converted [4 x i16] values are
1455///    written to the lower 64 bits of the result.
1456/// \param __V2
1457///    A 128-bit vector of [4 x i32]. The converted [4 x i16] values are
1458///    written to the higher 64 bits of the result.
1459/// \returns A 128-bit vector of [8 x i16] containing the converted values.
1460static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packus_epi32(__m128i __V1,
1461                                                              __m128i __V2) {
1462  return (__m128i)__builtin_ia32_packusdw128((__v4si)__V1, (__v4si)__V2);
1463}
1464
/* SSE4 Multiple Packed Sums of Absolute Difference.  */
/// Computes eight sums of absolute differences between groups of four 8-bit
///    unsigned integers taken from the two operands. The starting offsets of
///    the groups within each operand are selected by bit fields of the
///    immediate operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_mpsadbw_epu8(__m128i X, __m128i Y, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VMPSADBW / MPSADBW </c> instruction.
///
/// \param X
///    A 128-bit vector of [16 x i8].
/// \param Y
///    A 128-bit vector of [16 x i8].
/// \param M
///    An 8-bit immediate operand that selects which elements take part in
///    the computation, as described by the following algorithm:
///    \code
///    // M2 represents bit 2 of the immediate operand
///    // M10 represents bits [1:0] of the immediate operand
///    i = M2 * 4;
///    j = M10 * 4;
///    for (k = 0; k < 8; k = k + 1) {
///      d0 = abs(X[i + k + 0] - Y[j + 0]);
///      d1 = abs(X[i + k + 1] - Y[j + 1]);
///      d2 = abs(X[i + k + 2] - Y[j + 2]);
///      d3 = abs(X[i + k + 3] - Y[j + 3]);
///      r[k] = d0 + d1 + d2 + d3;
///    }
///    \endcode
/// \returns A 128-bit integer vector containing the sums of the sets of
///    absolute differences between both operands.
#define _mm_mpsadbw_epu8(X, Y, M)                                              \
  ((__m128i)__builtin_ia32_mpsadbw128(                                         \
      (__v16qi)(__m128i)(X), (__v16qi)(__m128i)(Y), (M)))
1504
1505/// Finds the minimum unsigned 16-bit element in the input 128-bit
1506///    vector of [8 x u16] and returns it and along with its index.
1507///
1508/// \headerfile <x86intrin.h>
1509///
1510/// This intrinsic corresponds to the <c> VPHMINPOSUW / PHMINPOSUW </c>
1511/// instruction.
1512///
1513/// \param __V
1514///    A 128-bit vector of [8 x u16].
1515/// \returns A 128-bit value where bits [15:0] contain the minimum value found
1516///    in parameter \a __V, bits [18:16] contain the index of the minimum value
1517///    and the remaining bits are set to 0.
1518static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_minpos_epu16(__m128i __V) {
1519  return (__m128i)__builtin_ia32_phminposuw128((__v8hi)__V);
1520}
1521
/* The SSE4.2 definitions follow.  */

/* nmmintrin.h is the usual home for these definitions, but gcc provides them
   from this header, so this header does the same.  */

/* From here on the default function attributes target SSE4.2.  */
#undef __DEFAULT_FN_ATTRS
#define __DEFAULT_FN_ATTRS                                                     \
  __attribute__(                                                               \
      (__always_inline__, __nodebug__, __target__("sse4.2")))
1530
/* Bits [1:0] of the immediate: element format of the data being compared.  */
#define _SIDD_UBYTE_OPS 0x00
#define _SIDD_UWORD_OPS 0x01
#define _SIDD_SBYTE_OPS 0x02
#define _SIDD_SWORD_OPS 0x03

/* Bits [3:2] of the immediate: comparison operation to perform.  */
#define _SIDD_CMP_EQUAL_ANY 0x00
#define _SIDD_CMP_RANGES 0x04
#define _SIDD_CMP_EQUAL_EACH 0x08
#define _SIDD_CMP_EQUAL_ORDERED 0x0c

/* Bits [5:4] of the immediate: polarity of the operation.  */
#define _SIDD_POSITIVE_POLARITY 0x00
#define _SIDD_NEGATIVE_POLARITY 0x10
#define _SIDD_MASKED_POSITIVE_POLARITY 0x20
#define _SIDD_MASKED_NEGATIVE_POLARITY 0x30

/* Bit [6] of the immediate for _mm_cmpXstri(): selects whether the index of
   the least or the most significant set bit is returned.  */
#define _SIDD_LEAST_SIGNIFICANT 0x00
#define _SIDD_MOST_SIGNIFICANT 0x40

/* Bit [6] of the immediate for _mm_cmpXstrm(): selects whether the returned
   mask is a zero-extended bit mask or is expanded to one unit per element.  */
#define _SIDD_BIT_MASK 0x00
#define _SIDD_UNIT_MASK 0x40
1556
/* SSE4.2 Packed Comparison Intrinsics.  */
/// Compares the string data held in source operands \a A and \a B, whose
///    lengths are implicitly defined, as directed by the immediate operand
///    \a M. Returns a 128-bit integer vector representing the result mask of
///    the comparison.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_cmpistrm(__m128i A, __m128i B, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPISTRM / PCMPISTRM </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param M
///    An 8-bit immediate operand that selects byte or word characters, the
///    type of comparison to perform, and the format of the return value. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B. \n
///    Bit [6]: Determines whether the result is zero-extended or expanded to 16
///             bytes. \n
///      0: The result is zero-extended to 16 bytes. \n
///      1: The result is expanded to 16 bytes (this expansion is performed by
///         repeating each bit 8 or 16 times).
/// \returns A 128-bit integer vector representing the result mask of the
///    comparison.
#define _mm_cmpistrm(A, B, M)                                                  \
  ((__m128i)__builtin_ia32_pcmpistrm128(                                       \
      (__v16qi)(__m128i)(A), (__v16qi)(__m128i)(B), (int)(M)))
1613
/// Compares the string data held in source operands \a A and \a B, whose
///    lengths are implicitly defined, as directed by the immediate operand
///    \a M. Returns an integer representing the result index of the
///    comparison.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpistri(__m128i A, __m128i B, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param M
///    An 8-bit immediate operand that selects byte or word characters, the
///    type of comparison to perform, and the format of the return value. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B. \n
///    Bit [6]: Determines whether the index of the lowest set bit or the
///             highest set bit is returned. \n
///      0: The index of the least significant set bit. \n
///      1: The index of the most significant set bit. \n
/// \returns An integer representing the result index of the comparison.
#define _mm_cmpistri(A, B, M)                                                  \
  ((int)__builtin_ia32_pcmpistri128(                                           \
      (__v16qi)(__m128i)(A), (__v16qi)(__m128i)(B), (int)(M)))
1667
/// Compares the string data held in source operands \a A and \a B, whose
///    lengths are explicitly given by \a LA and \a LB, as directed by the
///    immediate operand \a M. Returns a 128-bit integer vector representing
///    the result mask of the comparison.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_cmpestrm(__m128i A, int LA, __m128i B, int LB, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPESTRM / PCMPESTRM </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LA
///    An integer that specifies the length of the string in \a A.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LB
///    An integer that specifies the length of the string in \a B.
/// \param M
///    An 8-bit immediate operand that selects byte or word characters, the
///    type of comparison to perform, and the format of the return value. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B. \n
///    Bit [6]: Determines whether the result is zero-extended or expanded to 16
///             bytes. \n
///      0: The result is zero-extended to 16 bytes. \n
///      1: The result is expanded to 16 bytes (this expansion is performed by
///         repeating each bit 8 or 16 times). \n
/// \returns A 128-bit integer vector representing the result mask of the
///    comparison.
#define _mm_cmpestrm(A, LA, B, LB, M)                                          \
  ((__m128i)__builtin_ia32_pcmpestrm128(                                       \
      (__v16qi)(__m128i)(A), (int)(LA), (__v16qi)(__m128i)(B), (int)(LB),      \
      (int)(M)))
1728
/// Compares the string data held in source operands \a A and \a B, whose
///    lengths are explicitly given by \a LA and \a LB, as directed by the
///    immediate operand \a M. Returns an integer representing the result
///    index of the comparison.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpestri(__m128i A, int LA, __m128i B, int LB, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LA
///    An integer that specifies the length of the string in \a A.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LB
///    An integer that specifies the length of the string in \a B.
/// \param M
///    An 8-bit immediate operand that selects byte or word characters, the
///    type of comparison to perform, and the format of the return value. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B. \n
///    Bit [6]: Determines whether the index of the lowest set bit or the
///             highest set bit is returned. \n
///      0: The index of the least significant set bit. \n
///      1: The index of the most significant set bit. \n
/// \returns An integer representing the result index of the comparison.
#define _mm_cmpestri(A, LA, B, LB, M)                                          \
  ((int)__builtin_ia32_pcmpestri128(                                           \
      (__v16qi)(__m128i)(A), (int)(LA), (__v16qi)(__m128i)(B), (int)(LB),      \
      (int)(M)))
1787
/* SSE4.2 Packed Comparison Intrinsics and EFlag Reading.  */
/// Compares the string data held in source operands \a A and \a B, whose
///    lengths are implicitly defined, as directed by the immediate operand
///    \a M. Returns 1 if the bit mask is zero and the length of the string in
///    \a B is the maximum, otherwise, returns 0.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpistra(__m128i A, __m128i B, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param M
///    An 8-bit immediate operand that selects byte or word characters and
///    the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B. \n
/// \returns 1 if the bit mask is zero and the length of the string in \a B is
///    the maximum; otherwise, returns 0.
#define _mm_cmpistra(A, B, M)                                                  \
  ((int)__builtin_ia32_pcmpistria128(                                          \
      (__v16qi)(__m128i)(A), (__v16qi)(__m128i)(B), (int)(M)))
1838
/// Compares the string data held in source operands \a A and \a B, whose
///    lengths are implicitly defined, as directed by the immediate operand
///    \a M. Returns 1 if the bit mask is non-zero, otherwise, returns 0.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpistrc(__m128i A, __m128i B, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param M
///    An 8-bit immediate operand that selects byte or word characters and
///    the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B.
/// \returns 1 if the bit mask is non-zero, otherwise, returns 0.
#define _mm_cmpistrc(A, B, M)                                                  \
  ((int)__builtin_ia32_pcmpistric128(                                          \
      (__v16qi)(__m128i)(A), (__v16qi)(__m128i)(B), (int)(M)))
1887
/// Compares the string data held in source operands \a A and \a B, whose
///    lengths are implicitly defined, as directed by the immediate operand
///    \a M. Returns bit 0 of the resulting bit mask.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpistro(__m128i A, __m128i B, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param M
///    An 8-bit immediate operand that selects byte or word characters and
///    the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B. \n
/// \returns Bit 0 of the resulting bit mask.
#define _mm_cmpistro(A, B, M)                                                  \
  ((int)__builtin_ia32_pcmpistrio128(                                          \
      (__v16qi)(__m128i)(A), (__v16qi)(__m128i)(B), (int)(M)))
1935
/// Compares the string data held in source operands \a A and \a B, whose
///    lengths are implicitly defined, as directed by the immediate operand
///    \a M. Returns 1 if the length of the string in \a A is less than the
///    maximum, otherwise, returns 0.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpistrs(__m128i A, __m128i B, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param M
///    An 8-bit immediate operand that selects byte or word characters and
///    the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B. \n
/// \returns 1 if the length of the string in \a A is less than the maximum,
///    otherwise, returns 0.
#define _mm_cmpistrs(A, B, M)                                                  \
  ((int)__builtin_ia32_pcmpistris128(                                          \
      (__v16qi)(__m128i)(A), (__v16qi)(__m128i)(B), (int)(M)))
1985
/// Compares the string data held in source operands \a A and \a B, whose
///    lengths are implicitly defined, as directed by the immediate operand
///    \a M. Returns 1 if the length of the string in \a B is less than the
///    maximum, otherwise, returns 0.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpistrz(__m128i A, __m128i B, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param M
///    An 8-bit immediate operand that selects byte or word characters and
///    the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B.
/// \returns 1 if the length of the string in \a B is less than the maximum,
///    otherwise, returns 0.
#define _mm_cmpistrz(A, B, M)                                                  \
  ((int)__builtin_ia32_pcmpistriz128(                                          \
      (__v16qi)(__m128i)(A), (__v16qi)(__m128i)(B), (int)(M)))
2035
2036/// Uses the immediate operand \a M to perform a comparison of string
2037///    data with explicitly defined lengths that is contained in source operands
2038///    \a A and \a B. Returns 1 if the bit mask is zero and the length of the
2039///    string in \a B is the maximum, otherwise, returns 0.
2040///
2041/// \headerfile <x86intrin.h>
2042///
2043/// \code
2044/// int _mm_cmpestra(__m128i A, int LA, __m128i B, int LB, const int M);
2045/// \endcode
2046///
2047/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
2048/// instruction.
2049///
2050/// \param A
2051///    A 128-bit integer vector containing one of the source operands to be
2052///    compared.
2053/// \param LA
2054///    An integer that specifies the length of the string in \a A.
2055/// \param B
2056///    A 128-bit integer vector containing one of the source operands to be
2057///    compared.
2058/// \param LB
2059///    An integer that specifies the length of the string in \a B.
2060/// \param M
2061///    An 8-bit immediate operand specifying whether the characters are bytes or
2062///    words and the type of comparison to perform. \n
2063///    Bits [1:0]: Determine source data format. \n
2064///      00: 16 unsigned bytes \n
2065///      01: 8 unsigned words \n
2066///      10: 16 signed bytes \n
2067///      11: 8 signed words \n
2068///    Bits [3:2]: Determine comparison type and aggregation method. \n
2069///      00: Subset: Each character in \a B is compared for equality with all
2070///          the characters in \a A. \n
2071///      01: Ranges: Each character in \a B is compared to \a A. The comparison
2072///          basis is greater than or equal for even-indexed elements in \a A,
2073///          and less than or equal for odd-indexed elements in \a A. \n
2074///      10: Match: Compare each pair of corresponding characters in \a A and
2075///          \a B for equality. \n
2076///      11: Substring: Search \a B for substring matches of \a A. \n
2077///    Bits [5:4]: Determine whether to perform a one's complement on the bit
2078///                mask of the comparison results. \n
2079///      00: No effect. \n
2080///      01: Negate the bit mask. \n
2081///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than the
///          length of the string in \a B.
2084/// \returns Returns 1 if the bit mask is zero and the length of the string in
2085///    \a B is the maximum, otherwise, returns 0.
#define _mm_cmpestra(A, LA, B, LB, M)                                          \
  ((int)__builtin_ia32_pcmpestria128(                                          \
      (__v16qi)(__m128i)(A), (int)(LA), /* string A and its length */          \
      (__v16qi)(__m128i)(B), (int)(LB), /* string B and its length */          \
      (int)(M))) /* 8-bit immediate operand */
2090
2091/// Uses the immediate operand \a M to perform a comparison of string
2092///    data with explicitly defined lengths that is contained in source operands
2093///    \a A and \a B. Returns 1 if the resulting mask is non-zero, otherwise,
2094///    returns 0.
2095///
2096/// \headerfile <x86intrin.h>
2097///
2098/// \code
2099/// int _mm_cmpestrc(__m128i A, int LA, __m128i B, int LB, const int M);
2100/// \endcode
2101///
2102/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
2103/// instruction.
2104///
2105/// \param A
2106///    A 128-bit integer vector containing one of the source operands to be
2107///    compared.
2108/// \param LA
2109///    An integer that specifies the length of the string in \a A.
2110/// \param B
2111///    A 128-bit integer vector containing one of the source operands to be
2112///    compared.
2113/// \param LB
2114///    An integer that specifies the length of the string in \a B.
2115/// \param M
2116///    An 8-bit immediate operand specifying whether the characters are bytes or
2117///    words and the type of comparison to perform. \n
2118///    Bits [1:0]: Determine source data format. \n
2119///      00: 16 unsigned bytes \n
2120///      01: 8 unsigned words \n
2121///      10: 16 signed bytes \n
2122///      11: 8 signed words \n
2123///    Bits [3:2]: Determine comparison type and aggregation method. \n
2124///      00: Subset: Each character in \a B is compared for equality with all
2125///          the characters in \a A. \n
2126///      01: Ranges: Each character in \a B is compared to \a A. The comparison
2127///          basis is greater than or equal for even-indexed elements in \a A,
2128///          and less than or equal for odd-indexed elements in \a A. \n
2129///      10: Match: Compare each pair of corresponding characters in \a A and
2130///          \a B for equality. \n
2131///      11: Substring: Search \a B for substring matches of \a A. \n
2132///    Bits [5:4]: Determine whether to perform a one's complement on the bit
2133///                mask of the comparison results. \n
2134///      00: No effect. \n
2135///      01: Negate the bit mask. \n
2136///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than the
///          length of the string in \a B. \n
2139/// \returns Returns 1 if the resulting mask is non-zero, otherwise, returns 0.
#define _mm_cmpestrc(A, LA, B, LB, M)                                          \
  ((int)__builtin_ia32_pcmpestric128(                                          \
      (__v16qi)(__m128i)(A), (int)(LA), /* string A and its length */          \
      (__v16qi)(__m128i)(B), (int)(LB), /* string B and its length */          \
      (int)(M))) /* 8-bit immediate operand */
2144
2145/// Uses the immediate operand \a M to perform a comparison of string
2146///    data with explicitly defined lengths that is contained in source operands
2147///    \a A and \a B. Returns bit 0 of the resulting bit mask.
2148///
2149/// \headerfile <x86intrin.h>
2150///
2151/// \code
2152/// int _mm_cmpestro(__m128i A, int LA, __m128i B, int LB, const int M);
2153/// \endcode
2154///
2155/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
2156/// instruction.
2157///
2158/// \param A
2159///    A 128-bit integer vector containing one of the source operands to be
2160///    compared.
2161/// \param LA
2162///    An integer that specifies the length of the string in \a A.
2163/// \param B
2164///    A 128-bit integer vector containing one of the source operands to be
2165///    compared.
2166/// \param LB
2167///    An integer that specifies the length of the string in \a B.
2168/// \param M
2169///    An 8-bit immediate operand specifying whether the characters are bytes or
2170///    words and the type of comparison to perform. \n
2171///    Bits [1:0]: Determine source data format. \n
2172///      00: 16 unsigned bytes \n
2173///      01: 8 unsigned words \n
2174///      10: 16 signed bytes \n
2175///      11: 8 signed words \n
2176///    Bits [3:2]: Determine comparison type and aggregation method. \n
2177///      00: Subset: Each character in \a B is compared for equality with all
2178///          the characters in \a A. \n
2179///      01: Ranges: Each character in \a B is compared to \a A. The comparison
2180///          basis is greater than or equal for even-indexed elements in \a A,
2181///          and less than or equal for odd-indexed elements in \a A. \n
2182///      10: Match: Compare each pair of corresponding characters in \a A and
2183///          \a B for equality. \n
2184///      11: Substring: Search \a B for substring matches of \a A. \n
2185///    Bits [5:4]: Determine whether to perform a one's complement on the bit
2186///                mask of the comparison results. \n
2187///      00: No effect. \n
2188///      01: Negate the bit mask. \n
2189///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than the
///          length of the string in \a B.
2192/// \returns Returns bit 0 of the resulting bit mask.
#define _mm_cmpestro(A, LA, B, LB, M)                                          \
  ((int)__builtin_ia32_pcmpestrio128(                                          \
      (__v16qi)(__m128i)(A), (int)(LA), /* string A and its length */          \
      (__v16qi)(__m128i)(B), (int)(LB), /* string B and its length */          \
      (int)(M))) /* 8-bit immediate operand */
2197
2198/// Uses the immediate operand \a M to perform a comparison of string
2199///    data with explicitly defined lengths that is contained in source operands
2200///    \a A and \a B. Returns 1 if the length of the string in \a A is less than
2201///    the maximum, otherwise, returns 0.
2202///
2203/// \headerfile <x86intrin.h>
2204///
2205/// \code
2206/// int _mm_cmpestrs(__m128i A, int LA, __m128i B, int LB, const int M);
2207/// \endcode
2208///
2209/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
2210/// instruction.
2211///
2212/// \param A
2213///    A 128-bit integer vector containing one of the source operands to be
2214///    compared.
2215/// \param LA
2216///    An integer that specifies the length of the string in \a A.
2217/// \param B
2218///    A 128-bit integer vector containing one of the source operands to be
2219///    compared.
2220/// \param LB
2221///    An integer that specifies the length of the string in \a B.
2222/// \param M
2223///    An 8-bit immediate operand specifying whether the characters are bytes or
2224///    words and the type of comparison to perform. \n
2225///    Bits [1:0]: Determine source data format. \n
2226///      00: 16 unsigned bytes \n
2227///      01: 8 unsigned words \n
2228///      10: 16 signed bytes \n
2229///      11: 8 signed words \n
2230///    Bits [3:2]: Determine comparison type and aggregation method. \n
2231///      00: Subset: Each character in \a B is compared for equality with all
2232///          the characters in \a A. \n
2233///      01: Ranges: Each character in \a B is compared to \a A. The comparison
2234///          basis is greater than or equal for even-indexed elements in \a A,
2235///          and less than or equal for odd-indexed elements in \a A. \n
2236///      10: Match: Compare each pair of corresponding characters in \a A and
2237///          \a B for equality. \n
2238///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
2240///                mask of the comparison results. \n
2241///      00: No effect. \n
2242///      01: Negate the bit mask. \n
2243///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than the
///          length of the string in \a B. \n
2246/// \returns Returns 1 if the length of the string in \a A is less than the
2247///    maximum, otherwise, returns 0.
#define _mm_cmpestrs(A, LA, B, LB, M)                                          \
  ((int)__builtin_ia32_pcmpestris128(                                          \
      (__v16qi)(__m128i)(A), (int)(LA), /* string A and its length */          \
      (__v16qi)(__m128i)(B), (int)(LB), /* string B and its length */          \
      (int)(M))) /* 8-bit immediate operand */
2252
2253/// Uses the immediate operand \a M to perform a comparison of string
2254///    data with explicitly defined lengths that is contained in source operands
2255///    \a A and \a B. Returns 1 if the length of the string in \a B is less than
2256///    the maximum, otherwise, returns 0.
2257///
2258/// \headerfile <x86intrin.h>
2259///
2260/// \code
2261/// int _mm_cmpestrz(__m128i A, int LA, __m128i B, int LB, const int M);
2262/// \endcode
2263///
/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
/// instruction.
2265///
2266/// \param A
2267///    A 128-bit integer vector containing one of the source operands to be
2268///    compared.
2269/// \param LA
2270///    An integer that specifies the length of the string in \a A.
2271/// \param B
2272///    A 128-bit integer vector containing one of the source operands to be
2273///    compared.
2274/// \param LB
2275///    An integer that specifies the length of the string in \a B.
2276/// \param M
2277///    An 8-bit immediate operand specifying whether the characters are bytes or
2278///    words and the type of comparison to perform. \n
2279///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
2281///      01: 8 unsigned words \n
2282///      10: 16 signed bytes \n
2283///      11: 8 signed words \n
2284///    Bits [3:2]: Determine comparison type and aggregation method. \n
2285///      00: Subset: Each character in \a B is compared for equality with all
2286///          the characters in \a A. \n
2287///      01: Ranges: Each character in \a B is compared to \a A. The comparison
2288///          basis is greater than or equal for even-indexed elements in \a A,
2289///          and less than or equal for odd-indexed elements in \a A. \n
2290///      10: Match: Compare each pair of corresponding characters in \a A and
2291///          \a B for equality. \n
2292///      11: Substring: Search \a B for substring matches of \a A. \n
2293///    Bits [5:4]: Determine whether to perform a one's complement on the bit
2294///                mask of the comparison results. \n
2295///      00: No effect. \n
2296///      01: Negate the bit mask. \n
2297///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than the
///          length of the string in \a B.
2300/// \returns Returns 1 if the length of the string in \a B is less than the
2301///    maximum, otherwise, returns 0.
#define _mm_cmpestrz(A, LA, B, LB, M)                                          \
  ((int)__builtin_ia32_pcmpestriz128(                                          \
      (__v16qi)(__m128i)(A), (int)(LA), /* string A and its length */          \
      (__v16qi)(__m128i)(B), (int)(LB), /* string B and its length */          \
      (int)(M))) /* 8-bit immediate operand */
2306
2307/* SSE4.2 Compare Packed Data -- Greater Than.  */
2308/// Compares each of the corresponding 64-bit values of the 128-bit
2309///    integer vectors to determine if the values in the first operand are
2310///    greater than those in the second operand.
2311///
2312///    Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
2313///
2314/// \headerfile <x86intrin.h>
2315///
2316/// This intrinsic corresponds to the <c> VPCMPGTQ / PCMPGTQ </c> instruction.
2317///
2318/// \param __V1
2319///    A 128-bit integer vector.
2320/// \param __V2
2321///    A 128-bit integer vector.
2322/// \returns A 128-bit integer vector containing the comparison results.
2323static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi64(__m128i __V1,
2324                                                             __m128i __V2) {
2325  return (__m128i)((__v2di)__V1 > (__v2di)__V2);
2326}
2327
2328#undef __DEFAULT_FN_ATTRS
2329
2330#include <popcntintrin.h>
2331
2332#include <crc32intrin.h>
2333
2334#endif /* __SMMINTRIN_H */