   1/*===---- emmintrin.h - Implementation of SSE2 intrinsics on PowerPC -------===
   2 *
   3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4 * See https://llvm.org/LICENSE.txt for license information.
   5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6 *
   7 *===-----------------------------------------------------------------------===
   8 */
   9
  10/* Implemented from the specification included in the Intel C++ Compiler
  11   User Guide and Reference, version 9.0.  */
  12
#ifndef NO_WARN_X86_INTRINSICS
/* This header file is to help porting code using Intel intrinsics
   explicitly from x86_64 to powerpc64/powerpc64le.

   Since X86 SSE2 intrinsics mainly handle __m128i and __m128d types,
   the PowerPC VMX/VSX ISA is a good match for vector float SIMD
   operations.  However, scalar float operations in vector (XMM)
   registers require the POWER8 VSX ISA (2.07) level.  There are
   differences in the data format and placement of float scalars in the
   vector register, which require extra steps to match SSE2 scalar
   float semantics on POWER.

   It should be noted that there are significant differences between
   X86_64's MXCSR and PowerISA's FPSCR/VSCR registers.  It is
   recommended to use portable <fenv.h> instead of accessing the MXCSR
   directly.

   Most SSE2 scalar float intrinsic operations can be performed more
   efficiently as C language float scalar operations or optimized to
   use vector SIMD operations.  We recommend this for new applications
   (see the illustrative sketch after this guard).
*/
#error                                                                         \
    "Please read comment above.  Use -DNO_WARN_X86_INTRINSICS to disable this error."
#endif
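
/* Illustrative sketch (not part of this header): ported code can often
   replace SSE2 scalar intrinsics with plain C scalars and use the portable
   <fenv.h> interfaces rather than touching the MXCSR.  The helper below is
   hypothetical and only shows the shape of such a rewrite.

     #include <fenv.h>
     #pragma STDC FENV_ACCESS ON

     static double scale_toward_zero(double x, double s) {
       int save = fegetround();      // instead of reading MXCSR state
       fesetround(FE_TOWARDZERO);    // instead of MXCSR rounding-control bits
       double r = x * s;             // instead of _mm_mul_sd / _mm_cvtsd_f64
       fesetround(save);
       return r;
     }
*/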
  35
  36#ifndef EMMINTRIN_H_
  37#define EMMINTRIN_H_
  38
  39#if defined(__powerpc64__) &&                                                  \
  40    (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX))
  41
  42#include <altivec.h>
  43
  44/* We need definitions from the SSE header files.  */
  45#include <xmmintrin.h>
  46
  47/* SSE2 */
  48typedef __vector double __v2df;
  49typedef __vector float __v4f;
  50typedef __vector long long __v2di;
  51typedef __vector unsigned long long __v2du;
  52typedef __vector int __v4si;
  53typedef __vector unsigned int __v4su;
  54typedef __vector short __v8hi;
  55typedef __vector unsigned short __v8hu;
  56typedef __vector signed char __v16qi;
  57typedef __vector unsigned char __v16qu;
  58
  59/* The Intel API is flexible enough that we must allow aliasing with other
  60   vector types, and their scalar components.  */
  61typedef long long __m128i __attribute__((__vector_size__(16), __may_alias__));
  62typedef double __m128d __attribute__((__vector_size__(16), __may_alias__));
  63
  64/* Unaligned version of the same types.  */
  65typedef long long __m128i_u
  66    __attribute__((__vector_size__(16), __may_alias__, __aligned__(1)));
  67typedef double __m128d_u
  68    __attribute__((__vector_size__(16), __may_alias__, __aligned__(1)));
  69
/* Define a two-element permute mask.  */
  71#define _MM_SHUFFLE2(x, y) (((x) << 1) | (y))
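
/* Usage sketch (hypothetical values): with _mm_shuffle_pd, bit 0 of the
   mask selects which element of the first operand goes to the low result
   element and bit 1 selects which element of the second operand goes to
   the high result element:

     __m128d __a = _mm_set_pd(2.0, 1.0);   // __a = {1.0, 2.0}
     __m128d __b = _mm_set_pd(4.0, 3.0);   // __b = {3.0, 4.0}
     __m128d __r = _mm_shuffle_pd(__a, __b, _MM_SHUFFLE2(1, 0));
     // __r = {__a[0], __b[1]} = {1.0, 4.0}
*/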
  72
  73/* Create a vector with element 0 as F and the rest zero.  */
  74extern __inline __m128d
  75    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  76    _mm_set_sd(double __F) {
  77  return __extension__(__m128d){__F, 0.0};
  78}
  79
  80/* Create a vector with both elements equal to F.  */
  81extern __inline __m128d
  82    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  83    _mm_set1_pd(double __F) {
  84  return __extension__(__m128d){__F, __F};
  85}
  86
  87extern __inline __m128d
  88    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  89    _mm_set_pd1(double __F) {
  90  return _mm_set1_pd(__F);
  91}
  92
  93/* Create a vector with the lower value X and upper value W.  */
  94extern __inline __m128d
  95    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  96    _mm_set_pd(double __W, double __X) {
  97  return __extension__(__m128d){__X, __W};
  98}
  99
 100/* Create a vector with the lower value W and upper value X.  */
 101extern __inline __m128d
 102    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 103    _mm_setr_pd(double __W, double __X) {
 104  return __extension__(__m128d){__W, __X};
 105}
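
/* Usage sketch (hypothetical values): _mm_set_pd takes the high element
   first while _mm_setr_pd takes elements in memory order, so both calls
   below build the vector {1.0, 2.0}:

     __m128d __x = _mm_set_pd(2.0, 1.0);
     __m128d __y = _mm_setr_pd(1.0, 2.0);
*/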
 106
 107/* Create an undefined vector.  */
 108extern __inline __m128d
 109    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 110    _mm_undefined_pd(void) {
 111  __m128d __Y = __Y;
 112  return __Y;
 113}
 114
 115/* Create a vector of zeros.  */
 116extern __inline __m128d
 117    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 118    _mm_setzero_pd(void) {
 119  return (__m128d)vec_splats(0);
 120}
 121
 122/* Sets the low DPFP value of A from the low value of B.  */
 123extern __inline __m128d
 124    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 125    _mm_move_sd(__m128d __A, __m128d __B) {
 126  __v2df __result = (__v2df)__A;
 127  __result[0] = ((__v2df)__B)[0];
 128  return (__m128d)__result;
 129}
 130
 131/* Load two DPFP values from P.  The address must be 16-byte aligned.  */
 132extern __inline __m128d
 133    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 134    _mm_load_pd(double const *__P) {
 135  return ((__m128d)vec_ld(0, (__v16qu *)__P));
 136}
 137
 138/* Load two DPFP values from P.  The address need not be 16-byte aligned.  */
 139extern __inline __m128d
 140    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 141    _mm_loadu_pd(double const *__P) {
 142  return (vec_vsx_ld(0, __P));
 143}
 144
/* Create a vector with both elements equal to *P.  */
 146extern __inline __m128d
 147    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 148    _mm_load1_pd(double const *__P) {
 149  return (vec_splats(*__P));
 150}
 151
 152/* Create a vector with element 0 as *P and the rest zero.  */
 153extern __inline __m128d
 154    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 155    _mm_load_sd(double const *__P) {
 156  return _mm_set_sd(*__P);
 157}
 158
 159extern __inline __m128d
 160    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 161    _mm_load_pd1(double const *__P) {
 162  return _mm_load1_pd(__P);
 163}
 164
 165/* Load two DPFP values in reverse order.  The address must be aligned.  */
 166extern __inline __m128d
 167    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 168    _mm_loadr_pd(double const *__P) {
 169  __v2df __tmp = _mm_load_pd(__P);
 170  return (__m128d)vec_xxpermdi(__tmp, __tmp, 2);
 171}
 172
 173/* Store two DPFP values.  The address must be 16-byte aligned.  */
 174extern __inline void
 175    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 176    _mm_store_pd(double *__P, __m128d __A) {
 177  vec_st((__v16qu)__A, 0, (__v16qu *)__P);
 178}
 179
 180/* Store two DPFP values.  The address need not be 16-byte aligned.  */
 181extern __inline void
 182    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 183    _mm_storeu_pd(double *__P, __m128d __A) {
 184  *(__m128d_u *)__P = __A;
 185}
 186
 187/* Stores the lower DPFP value.  */
 188extern __inline void
 189    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 190    _mm_store_sd(double *__P, __m128d __A) {
 191  *__P = ((__v2df)__A)[0];
 192}
 193
 194extern __inline double
 195    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 196    _mm_cvtsd_f64(__m128d __A) {
 197  return ((__v2df)__A)[0];
 198}
 199
 200extern __inline void
 201    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 202    _mm_storel_pd(double *__P, __m128d __A) {
 203  _mm_store_sd(__P, __A);
 204}
 205
 206/* Stores the upper DPFP value.  */
 207extern __inline void
 208    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 209    _mm_storeh_pd(double *__P, __m128d __A) {
 210  *__P = ((__v2df)__A)[1];
}

/* Store the lower DPFP value across two words.
   The address must be 16-byte aligned.  */
 214extern __inline void
 215    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 216    _mm_store1_pd(double *__P, __m128d __A) {
 217  _mm_store_pd(__P, vec_splat(__A, 0));
 218}
 219
 220extern __inline void
 221    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 222    _mm_store_pd1(double *__P, __m128d __A) {
 223  _mm_store1_pd(__P, __A);
 224}
 225
 226/* Store two DPFP values in reverse order.  The address must be aligned.  */
 227extern __inline void
 228    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 229    _mm_storer_pd(double *__P, __m128d __A) {
 230  _mm_store_pd(__P, vec_xxpermdi(__A, __A, 2));
 231}
 232
 233/* Intel intrinsic.  */
 234extern __inline long long
 235    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 236    _mm_cvtsi128_si64(__m128i __A) {
 237  return ((__v2di)__A)[0];
 238}
 239
 240/* Microsoft intrinsic.  */
 241extern __inline long long
 242    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 243    _mm_cvtsi128_si64x(__m128i __A) {
 244  return ((__v2di)__A)[0];
 245}
 246
 247extern __inline __m128d
 248    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 249    _mm_add_pd(__m128d __A, __m128d __B) {
 250  return (__m128d)((__v2df)__A + (__v2df)__B);
 251}
 252
 253/* Add the lower double-precision (64-bit) floating-point element in
 254   a and b, store the result in the lower element of dst, and copy
 255   the upper element from a to the upper element of dst. */
 256extern __inline __m128d
 257    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 258    _mm_add_sd(__m128d __A, __m128d __B) {
 259  __A[0] = __A[0] + __B[0];
 260  return (__A);
 261}
 262
 263extern __inline __m128d
 264    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 265    _mm_sub_pd(__m128d __A, __m128d __B) {
 266  return (__m128d)((__v2df)__A - (__v2df)__B);
 267}
 268
 269extern __inline __m128d
 270    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 271    _mm_sub_sd(__m128d __A, __m128d __B) {
 272  __A[0] = __A[0] - __B[0];
 273  return (__A);
 274}
 275
 276extern __inline __m128d
 277    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 278    _mm_mul_pd(__m128d __A, __m128d __B) {
 279  return (__m128d)((__v2df)__A * (__v2df)__B);
 280}
 281
 282extern __inline __m128d
 283    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 284    _mm_mul_sd(__m128d __A, __m128d __B) {
 285  __A[0] = __A[0] * __B[0];
 286  return (__A);
 287}
 288
 289extern __inline __m128d
 290    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 291    _mm_div_pd(__m128d __A, __m128d __B) {
 292  return (__m128d)((__v2df)__A / (__v2df)__B);
 293}
 294
 295extern __inline __m128d
 296    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 297    _mm_div_sd(__m128d __A, __m128d __B) {
 298  __A[0] = __A[0] / __B[0];
 299  return (__A);
 300}
 301
 302extern __inline __m128d
 303    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 304    _mm_sqrt_pd(__m128d __A) {
 305  return (vec_sqrt(__A));
 306}
 307
 308/* Return pair {sqrt (B[0]), A[1]}.  */
 309extern __inline __m128d
 310    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 311    _mm_sqrt_sd(__m128d __A, __m128d __B) {
 312  __v2df __c;
 313  __c = vec_sqrt((__v2df)_mm_set1_pd(__B[0]));
 314  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
 315}
 316
 317extern __inline __m128d
 318    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 319    _mm_min_pd(__m128d __A, __m128d __B) {
 320  return (vec_min(__A, __B));
 321}
 322
 323extern __inline __m128d
 324    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 325    _mm_min_sd(__m128d __A, __m128d __B) {
 326  __v2df __a, __b, __c;
 327  __a = vec_splats(__A[0]);
 328  __b = vec_splats(__B[0]);
 329  __c = vec_min(__a, __b);
 330  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
 331}
 332
 333extern __inline __m128d
 334    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 335    _mm_max_pd(__m128d __A, __m128d __B) {
 336  return (vec_max(__A, __B));
 337}
 338
 339extern __inline __m128d
 340    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 341    _mm_max_sd(__m128d __A, __m128d __B) {
 342  __v2df __a, __b, __c;
 343  __a = vec_splats(__A[0]);
 344  __b = vec_splats(__B[0]);
 345  __c = vec_max(__a, __b);
 346  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
 347}
 348
 349extern __inline __m128d
 350    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 351    _mm_cmpeq_pd(__m128d __A, __m128d __B) {
 352  return ((__m128d)vec_cmpeq((__v2df)__A, (__v2df)__B));
 353}
 354
 355extern __inline __m128d
 356    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 357    _mm_cmplt_pd(__m128d __A, __m128d __B) {
 358  return ((__m128d)vec_cmplt((__v2df)__A, (__v2df)__B));
 359}
 360
 361extern __inline __m128d
 362    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 363    _mm_cmple_pd(__m128d __A, __m128d __B) {
 364  return ((__m128d)vec_cmple((__v2df)__A, (__v2df)__B));
 365}
 366
 367extern __inline __m128d
 368    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 369    _mm_cmpgt_pd(__m128d __A, __m128d __B) {
 370  return ((__m128d)vec_cmpgt((__v2df)__A, (__v2df)__B));
 371}
 372
 373extern __inline __m128d
 374    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 375    _mm_cmpge_pd(__m128d __A, __m128d __B) {
 376  return ((__m128d)vec_cmpge((__v2df)__A, (__v2df)__B));
 377}
 378
 379extern __inline __m128d
 380    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 381    _mm_cmpneq_pd(__m128d __A, __m128d __B) {
 382  __v2df __temp = (__v2df)vec_cmpeq((__v2df)__A, (__v2df)__B);
 383  return ((__m128d)vec_nor(__temp, __temp));
 384}
 385
 386extern __inline __m128d
 387    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 388    _mm_cmpnlt_pd(__m128d __A, __m128d __B) {
 389  return ((__m128d)vec_cmpge((__v2df)__A, (__v2df)__B));
 390}
 391
 392extern __inline __m128d
 393    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 394    _mm_cmpnle_pd(__m128d __A, __m128d __B) {
 395  return ((__m128d)vec_cmpgt((__v2df)__A, (__v2df)__B));
 396}
 397
 398extern __inline __m128d
 399    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 400    _mm_cmpngt_pd(__m128d __A, __m128d __B) {
 401  return ((__m128d)vec_cmple((__v2df)__A, (__v2df)__B));
 402}
 403
 404extern __inline __m128d
 405    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 406    _mm_cmpnge_pd(__m128d __A, __m128d __B) {
 407  return ((__m128d)vec_cmplt((__v2df)__A, (__v2df)__B));
 408}
 409
 410extern __inline __m128d
 411    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 412    _mm_cmpord_pd(__m128d __A, __m128d __B) {
 413  __v2du __c, __d;
 414  /* Compare against self will return false (0's) if NAN.  */
 415  __c = (__v2du)vec_cmpeq(__A, __A);
 416  __d = (__v2du)vec_cmpeq(__B, __B);
 417  /* A != NAN and B != NAN.  */
 418  return ((__m128d)vec_and(__c, __d));
 419}
 420
 421extern __inline __m128d
 422    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 423    _mm_cmpunord_pd(__m128d __A, __m128d __B) {
 424#if _ARCH_PWR8
 425  __v2du __c, __d;
 426  /* Compare against self will return false (0's) if NAN.  */
 427  __c = (__v2du)vec_cmpeq((__v2df)__A, (__v2df)__A);
 428  __d = (__v2du)vec_cmpeq((__v2df)__B, (__v2df)__B);
  /* A == NAN OR B == NAN converts to:
     NOT(A != NAN) OR NOT(B != NAN).  */
 431  __c = vec_nor(__c, __c);
 432  return ((__m128d)vec_orc(__c, __d));
 433#else
 434  __v2du __c, __d;
 435  /* Compare against self will return false (0's) if NAN.  */
 436  __c = (__v2du)vec_cmpeq((__v2df)__A, (__v2df)__A);
 437  __d = (__v2du)vec_cmpeq((__v2df)__B, (__v2df)__B);
  /* Invert the results so '1's indicate a NAN.  */
 439  __c = vec_nor(__c, __c);
 440  __d = vec_nor(__d, __d);
 441  return ((__m128d)vec_or(__c, __d));
 442#endif
 443}
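
/* Usage sketch (hypothetical values): cmpord yields an all-ones mask where
   both elements are numeric, cmpunord where either element is a NaN:

     __m128d __a = _mm_set_pd(__builtin_nan(""), 1.0);   // {1.0, NaN}
     __m128d __b = _mm_set_pd(3.0, 2.0);                 // {2.0, 3.0}
     // _mm_cmpord_pd(__a, __b):   element 0 all ones,  element 1 all zeros
     // _mm_cmpunord_pd(__a, __b): element 0 all zeros, element 1 all ones
*/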
 444
 445extern __inline __m128d
 446    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 447    _mm_cmpeq_sd(__m128d __A, __m128d __B) {
 448  __v2df __a, __b, __c;
  /* PowerISA VSX does not allow partial (for just the lower double)
     results.  So to ensure we don't generate spurious exceptions
     (from the upper double values) we splat the lower double
     before we do the operation.  */
 453  __a = vec_splats(__A[0]);
 454  __b = vec_splats(__B[0]);
 455  __c = (__v2df)vec_cmpeq(__a, __b);
 456  /* Then we merge the lower double result with the original upper
 457     double from __A.  */
 458  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
 459}
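
/* Usage sketch (hypothetical values): the scalar (_sd) compares return the
   compare mask in the low element and pass the upper element of the first
   operand through unchanged:

     __m128d __a = _mm_set_pd(5.0, 1.0);   // __a = {1.0, 5.0}
     __m128d __b = _mm_set_pd(7.0, 1.0);   // __b = {1.0, 7.0}
     __m128d __r = _mm_cmpeq_sd(__a, __b);
     // low 64 bits of __r are all ones (1.0 == 1.0), and __r[1] == 5.0
*/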
 460
 461extern __inline __m128d
 462    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 463    _mm_cmplt_sd(__m128d __A, __m128d __B) {
 464  __v2df __a, __b, __c;
 465  __a = vec_splats(__A[0]);
 466  __b = vec_splats(__B[0]);
 467  __c = (__v2df)vec_cmplt(__a, __b);
 468  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
 469}
 470
 471extern __inline __m128d
 472    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 473    _mm_cmple_sd(__m128d __A, __m128d __B) {
 474  __v2df __a, __b, __c;
 475  __a = vec_splats(__A[0]);
 476  __b = vec_splats(__B[0]);
 477  __c = (__v2df)vec_cmple(__a, __b);
 478  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
 479}
 480
 481extern __inline __m128d
 482    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 483    _mm_cmpgt_sd(__m128d __A, __m128d __B) {
 484  __v2df __a, __b, __c;
 485  __a = vec_splats(__A[0]);
 486  __b = vec_splats(__B[0]);
 487  __c = (__v2df)vec_cmpgt(__a, __b);
 488  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
 489}
 490
 491extern __inline __m128d
 492    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 493    _mm_cmpge_sd(__m128d __A, __m128d __B) {
 494  __v2df __a, __b, __c;
 495  __a = vec_splats(__A[0]);
 496  __b = vec_splats(__B[0]);
 497  __c = (__v2df)vec_cmpge(__a, __b);
 498  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
 499}
 500
 501extern __inline __m128d
 502    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 503    _mm_cmpneq_sd(__m128d __A, __m128d __B) {
 504  __v2df __a, __b, __c;
 505  __a = vec_splats(__A[0]);
 506  __b = vec_splats(__B[0]);
 507  __c = (__v2df)vec_cmpeq(__a, __b);
 508  __c = vec_nor(__c, __c);
 509  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
 510}
 511
 512extern __inline __m128d
 513    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 514    _mm_cmpnlt_sd(__m128d __A, __m128d __B) {
 515  __v2df __a, __b, __c;
 516  __a = vec_splats(__A[0]);
 517  __b = vec_splats(__B[0]);
 518  /* Not less than is just greater than or equal.  */
 519  __c = (__v2df)vec_cmpge(__a, __b);
 520  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
 521}
 522
 523extern __inline __m128d
 524    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 525    _mm_cmpnle_sd(__m128d __A, __m128d __B) {
 526  __v2df __a, __b, __c;
 527  __a = vec_splats(__A[0]);
 528  __b = vec_splats(__B[0]);
 529  /* Not less than or equal is just greater than.  */
  __c = (__v2df)vec_cmpgt(__a, __b);
 531  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
 532}
 533
 534extern __inline __m128d
 535    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 536    _mm_cmpngt_sd(__m128d __A, __m128d __B) {
 537  __v2df __a, __b, __c;
 538  __a = vec_splats(__A[0]);
 539  __b = vec_splats(__B[0]);
 540  /* Not greater than is just less than or equal.  */
 541  __c = (__v2df)vec_cmple(__a, __b);
 542  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
 543}
 544
 545extern __inline __m128d
 546    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 547    _mm_cmpnge_sd(__m128d __A, __m128d __B) {
 548  __v2df __a, __b, __c;
 549  __a = vec_splats(__A[0]);
 550  __b = vec_splats(__B[0]);
 551  /* Not greater than or equal is just less than.  */
 552  __c = (__v2df)vec_cmplt(__a, __b);
 553  return (__m128d)_mm_setr_pd(__c[0], __A[1]);
 554}
 555
 556extern __inline __m128d
 557    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 558    _mm_cmpord_sd(__m128d __A, __m128d __B) {
 559  __v2df __r;
 560  __r = (__v2df)_mm_cmpord_pd(vec_splats(__A[0]), vec_splats(__B[0]));
 561  return (__m128d)_mm_setr_pd(__r[0], ((__v2df)__A)[1]);
 562}
 563
 564extern __inline __m128d
 565    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 566    _mm_cmpunord_sd(__m128d __A, __m128d __B) {
 567  __v2df __r;
 568  __r = _mm_cmpunord_pd(vec_splats(__A[0]), vec_splats(__B[0]));
 569  return (__m128d)_mm_setr_pd(__r[0], __A[1]);
 570}
 571
/* FIXME
   The _mm_comi??_sd and _mm_ucomi??_sd implementations below are
   exactly the same because GCC for PowerPC only generates unordered
   compares (scalar and vector).
   Technically _mm_comieq_sd et al. should use the ordered compare and
   signal for QNaNs.  The _mm_ucomieq_sd et al. should be OK as is.
   (See the usage sketch after this group of functions.)  */
 579extern __inline int
 580    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 581    _mm_comieq_sd(__m128d __A, __m128d __B) {
 582  return (__A[0] == __B[0]);
 583}
 584
 585extern __inline int
 586    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 587    _mm_comilt_sd(__m128d __A, __m128d __B) {
 588  return (__A[0] < __B[0]);
 589}
 590
 591extern __inline int
 592    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 593    _mm_comile_sd(__m128d __A, __m128d __B) {
 594  return (__A[0] <= __B[0]);
 595}
 596
 597extern __inline int
 598    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 599    _mm_comigt_sd(__m128d __A, __m128d __B) {
 600  return (__A[0] > __B[0]);
 601}
 602
 603extern __inline int
 604    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 605    _mm_comige_sd(__m128d __A, __m128d __B) {
 606  return (__A[0] >= __B[0]);
 607}
 608
 609extern __inline int
 610    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 611    _mm_comineq_sd(__m128d __A, __m128d __B) {
 612  return (__A[0] != __B[0]);
 613}
 614
 615extern __inline int
 616    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 617    _mm_ucomieq_sd(__m128d __A, __m128d __B) {
 618  return (__A[0] == __B[0]);
 619}
 620
 621extern __inline int
 622    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 623    _mm_ucomilt_sd(__m128d __A, __m128d __B) {
 624  return (__A[0] < __B[0]);
 625}
 626
 627extern __inline int
 628    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 629    _mm_ucomile_sd(__m128d __A, __m128d __B) {
 630  return (__A[0] <= __B[0]);
 631}
 632
 633extern __inline int
 634    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 635    _mm_ucomigt_sd(__m128d __A, __m128d __B) {
 636  return (__A[0] > __B[0]);
 637}
 638
 639extern __inline int
 640    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 641    _mm_ucomige_sd(__m128d __A, __m128d __B) {
 642  return (__A[0] >= __B[0]);
 643}
 644
 645extern __inline int
 646    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 647    _mm_ucomineq_sd(__m128d __A, __m128d __B) {
 648  return (__A[0] != __B[0]);
 649}
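
/* Usage sketch (hypothetical values), referring to the FIXME above: the
   comi/ucomi forms return a plain int.  On x86 the _mm_comi* forms are the
   ordered (signaling) compares and the _mm_ucomi* forms are quiet; this
   port currently implements both the same way.

     __m128d __a = _mm_set_sd(1.0);
     __m128d __b = _mm_set_sd(2.0);
     int __lt = _mm_comilt_sd(__a, __b);   // 1, since 1.0 < 2.0
*/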
 650
 651/* Create a vector of Qi, where i is the element number.  */
 652extern __inline __m128i
 653    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 654    _mm_set_epi64x(long long __q1, long long __q0) {
 655  return __extension__(__m128i)(__v2di){__q0, __q1};
 656}
 657
 658extern __inline __m128i
 659    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 660    _mm_set_epi64(__m64 __q1, __m64 __q0) {
 661  return _mm_set_epi64x((long long)__q1, (long long)__q0);
 662}
 663
 664extern __inline __m128i
 665    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 666    _mm_set_epi32(int __q3, int __q2, int __q1, int __q0) {
 667  return __extension__(__m128i)(__v4si){__q0, __q1, __q2, __q3};
 668}
 669
 670extern __inline __m128i
 671    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 672    _mm_set_epi16(short __q7, short __q6, short __q5, short __q4, short __q3,
 673                  short __q2, short __q1, short __q0) {
 674  return __extension__(__m128i)(__v8hi){__q0, __q1, __q2, __q3,
 675                                        __q4, __q5, __q6, __q7};
 676}
 677
 678extern __inline __m128i
 679    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 680    _mm_set_epi8(char __q15, char __q14, char __q13, char __q12, char __q11,
 681                 char __q10, char __q09, char __q08, char __q07, char __q06,
 682                 char __q05, char __q04, char __q03, char __q02, char __q01,
 683                 char __q00) {
 684  return __extension__(__m128i)(__v16qi){
 685      __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
 686      __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15};
 687}
 688
 689/* Set all of the elements of the vector to A.  */
 690extern __inline __m128i
 691    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 692    _mm_set1_epi64x(long long __A) {
 693  return _mm_set_epi64x(__A, __A);
 694}
 695
 696extern __inline __m128i
 697    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 698    _mm_set1_epi64(__m64 __A) {
 699  return _mm_set_epi64(__A, __A);
 700}
 701
 702extern __inline __m128i
 703    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 704    _mm_set1_epi32(int __A) {
 705  return _mm_set_epi32(__A, __A, __A, __A);
 706}
 707
 708extern __inline __m128i
 709    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 710    _mm_set1_epi16(short __A) {
 711  return _mm_set_epi16(__A, __A, __A, __A, __A, __A, __A, __A);
 712}
 713
 714extern __inline __m128i
 715    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 716    _mm_set1_epi8(char __A) {
 717  return _mm_set_epi8(__A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A,
 718                      __A, __A, __A, __A, __A);
 719}
 720
 721/* Create a vector of Qi, where i is the element number.
 722   The parameter order is reversed from the _mm_set_epi* functions.  */
 723extern __inline __m128i
 724    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 725    _mm_setr_epi64(__m64 __q0, __m64 __q1) {
 726  return _mm_set_epi64(__q1, __q0);
 727}
 728
 729extern __inline __m128i
 730    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 731    _mm_setr_epi32(int __q0, int __q1, int __q2, int __q3) {
 732  return _mm_set_epi32(__q3, __q2, __q1, __q0);
 733}
 734
 735extern __inline __m128i
 736    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 737    _mm_setr_epi16(short __q0, short __q1, short __q2, short __q3, short __q4,
 738                   short __q5, short __q6, short __q7) {
 739  return _mm_set_epi16(__q7, __q6, __q5, __q4, __q3, __q2, __q1, __q0);
 740}
 741
 742extern __inline __m128i
 743    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 744    _mm_setr_epi8(char __q00, char __q01, char __q02, char __q03, char __q04,
 745                  char __q05, char __q06, char __q07, char __q08, char __q09,
 746                  char __q10, char __q11, char __q12, char __q13, char __q14,
 747                  char __q15) {
 748  return _mm_set_epi8(__q15, __q14, __q13, __q12, __q11, __q10, __q09, __q08,
 749                      __q07, __q06, __q05, __q04, __q03, __q02, __q01, __q00);
 750}
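
/* Usage sketch (hypothetical values): the _mm_setr_epi* forms take their
   arguments in memory (element) order, so both calls below build the same
   vector, with element 0 equal to 0:

     __m128i __x = _mm_setr_epi32(0, 1, 2, 3);
     __m128i __y = _mm_set_epi32(3, 2, 1, 0);
*/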
 751
/* Load 128 bits of integer data.  The address must be 16-byte aligned.  */
 753extern __inline __m128i
 754    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 755    _mm_load_si128(__m128i const *__P) {
 756  return *__P;
 757}
 758
 759extern __inline __m128i
 760    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 761    _mm_loadu_si128(__m128i_u const *__P) {
 762  return (__m128i)(vec_vsx_ld(0, (signed int const *)__P));
 763}
 764
 765extern __inline __m128i
 766    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 767    _mm_loadl_epi64(__m128i_u const *__P) {
 768  return _mm_set_epi64((__m64)0LL, *(__m64 *)__P);
 769}
 770
 771extern __inline void
 772    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 773    _mm_store_si128(__m128i *__P, __m128i __B) {
 774  vec_st((__v16qu)__B, 0, (__v16qu *)__P);
 775}
 776
 777extern __inline void
 778    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 779    _mm_storeu_si128(__m128i_u *__P, __m128i __B) {
 780  *__P = __B;
 781}
 782
 783extern __inline void
 784    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 785    _mm_storel_epi64(__m128i_u *__P, __m128i __B) {
 786  *(long long *)__P = ((__v2di)__B)[0];
 787}
 788
 789extern __inline __m64
 790    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 791    _mm_movepi64_pi64(__m128i_u __B) {
 792  return (__m64)((__v2di)__B)[0];
 793}
 794
 795extern __inline __m128i
 796    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 797    _mm_movpi64_epi64(__m64 __A) {
 798  return _mm_set_epi64((__m64)0LL, __A);
 799}
 800
 801extern __inline __m128i
 802    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 803    _mm_move_epi64(__m128i __A) {
 804  return _mm_set_epi64((__m64)0LL, (__m64)__A[0]);
 805}
 806
 807/* Create an undefined vector.  */
 808extern __inline __m128i
 809    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 810    _mm_undefined_si128(void) {
 811  __m128i __Y = __Y;
 812  return __Y;
 813}
 814
 815/* Create a vector of zeros.  */
 816extern __inline __m128i
 817    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 818    _mm_setzero_si128(void) {
 819  return __extension__(__m128i)(__v4si){0, 0, 0, 0};
 820}
 821
 822#ifdef _ARCH_PWR8
 823extern __inline __m128d
 824    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 825    _mm_cvtepi32_pd(__m128i __A) {
 826  __v2di __val;
  /* For LE we need to generate Vector Unpack Low Signed Word,
     which is produced here by vec_unpackh.  */
 829  __val = (__v2di)vec_unpackh((__v4si)__A);
 830
 831  return (__m128d)vec_ctf(__val, 0);
 832}
 833#endif
 834
 835extern __inline __m128
 836    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 837    _mm_cvtepi32_ps(__m128i __A) {
 838  return ((__m128)vec_ctf((__v4si)__A, 0));
 839}
 840
 841extern __inline __m128i
 842    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 843    _mm_cvtpd_epi32(__m128d __A) {
 844  __v2df __rounded = vec_rint(__A);
 845  __v4si __result, __temp;
 846  const __v4si __vzero = {0, 0, 0, 0};
 847
 848  /* VSX Vector truncate Double-Precision to integer and Convert to
 849   Signed Integer Word format with Saturate.  */
 850  __asm__("xvcvdpsxws %x0,%x1" : "=wa"(__temp) : "wa"(__rounded) :);
 851
 852#ifdef _ARCH_PWR8
 853#ifdef __LITTLE_ENDIAN__
 854  __temp = vec_mergeo(__temp, __temp);
 855#else
 856  __temp = vec_mergee(__temp, __temp);
 857#endif
 858  __result = (__v4si)vec_vpkudum((__vector long long)__temp,
 859                                 (__vector long long)__vzero);
 860#else
 861  {
 862    const __v16qu __pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
 863                              0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f};
 864    __result = (__v4si)vec_perm((__v16qu)__temp, (__v16qu)__vzero, __pkperm);
 865  }
 866#endif
 867  return (__m128i)__result;
 868}
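
/* Usage sketch (hypothetical values, assuming the default round-to-nearest
   rounding mode): the doubles are first rounded with vec_rint and then
   converted, and the packed results land in the two low int elements with
   the two high elements zeroed:

     __m128d __d = _mm_set_pd(-2.5, 1.5);   // {1.5, -2.5}
     __m128i __i = _mm_cvtpd_epi32(__d);
     // __i = {2, -2, 0, 0}  (ties round to even)
*/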
 869
 870extern __inline __m64
 871    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 872    _mm_cvtpd_pi32(__m128d __A) {
 873  __m128i __result = _mm_cvtpd_epi32(__A);
 874
 875  return (__m64)__result[0];
 876}
 877
 878extern __inline __m128
 879    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 880    _mm_cvtpd_ps(__m128d __A) {
 881  __v4sf __result;
 882  __v4si __temp;
 883  const __v4si __vzero = {0, 0, 0, 0};
 884
 885  __asm__("xvcvdpsp %x0,%x1" : "=wa"(__temp) : "wa"(__A) :);
 886
 887#ifdef _ARCH_PWR8
 888#ifdef __LITTLE_ENDIAN__
 889  __temp = vec_mergeo(__temp, __temp);
 890#else
 891  __temp = vec_mergee(__temp, __temp);
 892#endif
 893  __result = (__v4sf)vec_vpkudum((__vector long long)__temp,
 894                                 (__vector long long)__vzero);
 895#else
 896  {
 897    const __v16qu __pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
 898                              0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f};
 899    __result = (__v4sf)vec_perm((__v16qu)__temp, (__v16qu)__vzero, __pkperm);
 900  }
 901#endif
 902  return ((__m128)__result);
 903}
 904
 905extern __inline __m128i
 906    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 907    _mm_cvttpd_epi32(__m128d __A) {
 908  __v4si __result;
 909  __v4si __temp;
 910  const __v4si __vzero = {0, 0, 0, 0};
 911
 912  /* VSX Vector truncate Double-Precision to integer and Convert to
 913   Signed Integer Word format with Saturate.  */
 914  __asm__("xvcvdpsxws %x0,%x1" : "=wa"(__temp) : "wa"(__A) :);
 915
 916#ifdef _ARCH_PWR8
 917#ifdef __LITTLE_ENDIAN__
 918  __temp = vec_mergeo(__temp, __temp);
 919#else
 920  __temp = vec_mergee(__temp, __temp);
 921#endif
 922  __result = (__v4si)vec_vpkudum((__vector long long)__temp,
 923                                 (__vector long long)__vzero);
 924#else
 925  {
 926    const __v16qu __pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
 927                              0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f};
 928    __result = (__v4si)vec_perm((__v16qu)__temp, (__v16qu)__vzero, __pkperm);
 929  }
 930#endif
 931
 932  return ((__m128i)__result);
 933}
 934
 935extern __inline __m64
 936    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 937    _mm_cvttpd_pi32(__m128d __A) {
 938  __m128i __result = _mm_cvttpd_epi32(__A);
 939
 940  return (__m64)__result[0];
 941}
 942
 943extern __inline int
 944    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 945    _mm_cvtsi128_si32(__m128i __A) {
 946  return ((__v4si)__A)[0];
 947}
 948
 949#ifdef _ARCH_PWR8
 950extern __inline __m128d
 951    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 952    _mm_cvtpi32_pd(__m64 __A) {
 953  __v4si __temp;
 954  __v2di __tmp2;
  __v2df __result;
 956
 957  __temp = (__v4si)vec_splats(__A);
 958  __tmp2 = (__v2di)vec_unpackl(__temp);
 959  __result = vec_ctf((__vector signed long long)__tmp2, 0);
 960  return (__m128d)__result;
 961}
 962#endif
 963
 964extern __inline __m128i
 965    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 966    _mm_cvtps_epi32(__m128 __A) {
 967  __v4sf __rounded;
 968  __v4si __result;
 969
 970  __rounded = vec_rint((__v4sf)__A);
 971  __result = vec_cts(__rounded, 0);
 972  return (__m128i)__result;
 973}
 974
 975extern __inline __m128i
 976    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 977    _mm_cvttps_epi32(__m128 __A) {
 978  __v4si __result;
 979
 980  __result = vec_cts((__v4sf)__A, 0);
 981  return (__m128i)__result;
 982}
 983
 984extern __inline __m128d
 985    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 986    _mm_cvtps_pd(__m128 __A) {
 987  /* Check if vec_doubleh is defined by <altivec.h>. If so use that. */
 988#ifdef vec_doubleh
 989  return (__m128d)vec_doubleh((__v4sf)__A);
 990#else
  /* Otherwise the compiler is not current and so we need to generate the
     equivalent code.  */
 993  __v4sf __a = (__v4sf)__A;
 994  __v4sf __temp;
 995  __v2df __result;
 996#ifdef __LITTLE_ENDIAN__
 997  /* The input float values are in elements {[0], [1]} but the convert
 998     instruction needs them in elements {[1], [3]}, So we use two
 999     shift left double vector word immediates to get the elements
1000     lined up.  */
1001  __temp = __builtin_vsx_xxsldwi(__a, __a, 3);
1002  __temp = __builtin_vsx_xxsldwi(__a, __temp, 2);
1003#else
  /* The input float values are in elements {[0], [1]} but the convert
     instruction needs them in elements {[0], [2]}, so we use a merge of
     the high words to get the elements lined up.  */
1008  __temp = vec_vmrghw(__a, __a);
1009#endif
1010  __asm__(" xvcvspdp %x0,%x1" : "=wa"(__result) : "wa"(__temp) :);
1011  return (__m128d)__result;
1012#endif
1013}
1014
1015extern __inline int
1016    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1017    _mm_cvtsd_si32(__m128d __A) {
1018  __v2df __rounded = vec_rint((__v2df)__A);
1019  int __result = ((__v2df)__rounded)[0];
1020
1021  return __result;
}

/* Intel intrinsic.  */
1024extern __inline long long
1025    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1026    _mm_cvtsd_si64(__m128d __A) {
1027  __v2df __rounded = vec_rint((__v2df)__A);
1028  long long __result = ((__v2df)__rounded)[0];
1029
1030  return __result;
1031}
1032
1033/* Microsoft intrinsic.  */
1034extern __inline long long
1035    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1036    _mm_cvtsd_si64x(__m128d __A) {
1037  return _mm_cvtsd_si64((__v2df)__A);
1038}
1039
1040extern __inline int
1041    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1042    _mm_cvttsd_si32(__m128d __A) {
1043  int __result = ((__v2df)__A)[0];
1044
1045  return __result;
1046}
1047
1048/* Intel intrinsic.  */
1049extern __inline long long
1050    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1051    _mm_cvttsd_si64(__m128d __A) {
1052  long long __result = ((__v2df)__A)[0];
1053
1054  return __result;
1055}
1056
1057/* Microsoft intrinsic.  */
1058extern __inline long long
1059    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1060    _mm_cvttsd_si64x(__m128d __A) {
1061  return _mm_cvttsd_si64(__A);
1062}
1063
1064extern __inline __m128
1065    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1066    _mm_cvtsd_ss(__m128 __A, __m128d __B) {
1067  __v4sf __result = (__v4sf)__A;
1068
1069#ifdef __LITTLE_ENDIAN__
1070  __v4sf __temp_s;
1071  /* Copy double element[0] to element [1] for conversion.  */
1072  __v2df __temp_b = vec_splat((__v2df)__B, 0);
1073
1074  /* Pre-rotate __A left 3 (logically right 1) elements.  */
1075  __result = __builtin_vsx_xxsldwi(__result, __result, 3);
1076  /* Convert double to single float scalar in a vector.  */
1077  __asm__("xscvdpsp %x0,%x1" : "=wa"(__temp_s) : "wa"(__temp_b) :);
1078  /* Shift the resulting scalar into vector element [0].  */
1079  __result = __builtin_vsx_xxsldwi(__result, __temp_s, 1);
1080#else
1081  __result[0] = ((__v2df)__B)[0];
1082#endif
1083  return (__m128)__result;
1084}
1085
1086extern __inline __m128d
1087    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1088    _mm_cvtsi32_sd(__m128d __A, int __B) {
1089  __v2df __result = (__v2df)__A;
1090  double __db = __B;
1091  __result[0] = __db;
1092  return (__m128d)__result;
1093}
1094
1095/* Intel intrinsic.  */
1096extern __inline __m128d
1097    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1098    _mm_cvtsi64_sd(__m128d __A, long long __B) {
1099  __v2df __result = (__v2df)__A;
1100  double __db = __B;
1101  __result[0] = __db;
1102  return (__m128d)__result;
1103}
1104
1105/* Microsoft intrinsic.  */
1106extern __inline __m128d
1107    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1108    _mm_cvtsi64x_sd(__m128d __A, long long __B) {
1109  return _mm_cvtsi64_sd(__A, __B);
1110}
1111
1112extern __inline __m128d
1113    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1114    _mm_cvtss_sd(__m128d __A, __m128 __B) {
1115#ifdef __LITTLE_ENDIAN__
1116  /* Use splat to move element [0] into position for the convert. */
1117  __v4sf __temp = vec_splat((__v4sf)__B, 0);
1118  __v2df __res;
1119  /* Convert single float scalar to double in a vector.  */
1120  __asm__("xscvspdp %x0,%x1" : "=wa"(__res) : "wa"(__temp) :);
1121  return (__m128d)vec_mergel(__res, (__v2df)__A);
1122#else
1123  __v2df __res = (__v2df)__A;
1124  __res[0] = ((__v4sf)__B)[0];
1125  return (__m128d)__res;
1126#endif
1127}
1128
1129extern __inline __m128d
1130    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1131    _mm_shuffle_pd(__m128d __A, __m128d __B, const int __mask) {
1132  __vector double __result;
1133  const int __litmsk = __mask & 0x3;
1134
1135  if (__litmsk == 0)
1136    __result = vec_mergeh(__A, __B);
1137#if __GNUC__ < 6
1138  else if (__litmsk == 1)
1139    __result = vec_xxpermdi(__B, __A, 2);
1140  else if (__litmsk == 2)
1141    __result = vec_xxpermdi(__B, __A, 1);
1142#else
1143  else if (__litmsk == 1)
1144    __result = vec_xxpermdi(__A, __B, 2);
1145  else if (__litmsk == 2)
1146    __result = vec_xxpermdi(__A, __B, 1);
1147#endif
1148  else
1149    __result = vec_mergel(__A, __B);
1150
1151  return __result;
1152}
1153
1154extern __inline __m128d
1155    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1156    _mm_unpackhi_pd(__m128d __A, __m128d __B) {
1157  return (__m128d)vec_mergel((__v2df)__A, (__v2df)__B);
1158}
1159
1160extern __inline __m128d
1161    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1162    _mm_unpacklo_pd(__m128d __A, __m128d __B) {
1163  return (__m128d)vec_mergeh((__v2df)__A, (__v2df)__B);
1164}
1165
1166extern __inline __m128d
1167    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1168    _mm_loadh_pd(__m128d __A, double const *__B) {
1169  __v2df __result = (__v2df)__A;
1170  __result[1] = *__B;
1171  return (__m128d)__result;
1172}
1173
1174extern __inline __m128d
1175    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1176    _mm_loadl_pd(__m128d __A, double const *__B) {
1177  __v2df __result = (__v2df)__A;
1178  __result[0] = *__B;
1179  return (__m128d)__result;
1180}
1181
1182#ifdef _ARCH_PWR8
1183/* Intrinsic functions that require PowerISA 2.07 minimum.  */
1184
1185/* Creates a 2-bit mask from the most significant bits of the DPFP values.  */
1186extern __inline int
1187    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1188    _mm_movemask_pd(__m128d __A) {
1189#ifdef _ARCH_PWR10
1190  return vec_extractm((__v2du)__A);
1191#else
1192  __vector unsigned long long __result;
1193  static const __vector unsigned int __perm_mask = {
1194#ifdef __LITTLE_ENDIAN__
1195      0x80800040, 0x80808080, 0x80808080, 0x80808080
1196#else
1197      0x80808080, 0x80808080, 0x80808080, 0x80804000
1198#endif
1199  };
1200
1201  __result = ((__vector unsigned long long)vec_vbpermq(
1202      (__vector unsigned char)__A, (__vector unsigned char)__perm_mask));
1203
1204#ifdef __LITTLE_ENDIAN__
1205  return __result[1];
1206#else
1207  return __result[0];
1208#endif
1209#endif /* !_ARCH_PWR10 */
1210}
1211#endif /* _ARCH_PWR8 */
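
/* Usage sketch (hypothetical values): each bit i of the result is the sign
   bit of double element i, so a vector whose low element is negative and
   whose high element is positive yields a mask of 0x1:

     __m128d __v = _mm_set_pd(2.0, -1.0);   // {-1.0, 2.0}
     int __m = _mm_movemask_pd(__v);        // __m == 0x1
*/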
1212
1213extern __inline __m128i
1214    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1215    _mm_packs_epi16(__m128i __A, __m128i __B) {
1216  return (__m128i)vec_packs((__v8hi)__A, (__v8hi)__B);
1217}
1218
1219extern __inline __m128i
1220    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1221    _mm_packs_epi32(__m128i __A, __m128i __B) {
1222  return (__m128i)vec_packs((__v4si)__A, (__v4si)__B);
1223}
1224
1225extern __inline __m128i
1226    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1227    _mm_packus_epi16(__m128i __A, __m128i __B) {
1228  return (__m128i)vec_packsu((__v8hi)__A, (__v8hi)__B);
1229}
1230
1231extern __inline __m128i
1232    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1233    _mm_unpackhi_epi8(__m128i __A, __m128i __B) {
1234  return (__m128i)vec_mergel((__v16qu)__A, (__v16qu)__B);
1235}
1236
1237extern __inline __m128i
1238    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1239    _mm_unpackhi_epi16(__m128i __A, __m128i __B) {
1240  return (__m128i)vec_mergel((__v8hu)__A, (__v8hu)__B);
1241}
1242
1243extern __inline __m128i
1244    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1245    _mm_unpackhi_epi32(__m128i __A, __m128i __B) {
1246  return (__m128i)vec_mergel((__v4su)__A, (__v4su)__B);
1247}
1248
1249extern __inline __m128i
1250    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1251    _mm_unpackhi_epi64(__m128i __A, __m128i __B) {
1252  return (__m128i)vec_mergel((__vector long long)__A, (__vector long long)__B);
1253}
1254
1255extern __inline __m128i
1256    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1257    _mm_unpacklo_epi8(__m128i __A, __m128i __B) {
1258  return (__m128i)vec_mergeh((__v16qu)__A, (__v16qu)__B);
1259}
1260
1261extern __inline __m128i
1262    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1263    _mm_unpacklo_epi16(__m128i __A, __m128i __B) {
1264  return (__m128i)vec_mergeh((__v8hi)__A, (__v8hi)__B);
1265}
1266
1267extern __inline __m128i
1268    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1269    _mm_unpacklo_epi32(__m128i __A, __m128i __B) {
1270  return (__m128i)vec_mergeh((__v4si)__A, (__v4si)__B);
1271}
1272
1273extern __inline __m128i
1274    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1275    _mm_unpacklo_epi64(__m128i __A, __m128i __B) {
1276  return (__m128i)vec_mergeh((__vector long long)__A, (__vector long long)__B);
1277}
1278
1279extern __inline __m128i
1280    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1281    _mm_add_epi8(__m128i __A, __m128i __B) {
1282  return (__m128i)((__v16qu)__A + (__v16qu)__B);
1283}
1284
1285extern __inline __m128i
1286    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1287    _mm_add_epi16(__m128i __A, __m128i __B) {
1288  return (__m128i)((__v8hu)__A + (__v8hu)__B);
1289}
1290
1291extern __inline __m128i
1292    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1293    _mm_add_epi32(__m128i __A, __m128i __B) {
1294  return (__m128i)((__v4su)__A + (__v4su)__B);
1295}
1296
1297extern __inline __m128i
1298    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1299    _mm_add_epi64(__m128i __A, __m128i __B) {
1300  return (__m128i)((__v2du)__A + (__v2du)__B);
1301}
1302
1303extern __inline __m128i
1304    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1305    _mm_adds_epi8(__m128i __A, __m128i __B) {
1306  return (__m128i)vec_adds((__v16qi)__A, (__v16qi)__B);
1307}
1308
1309extern __inline __m128i
1310    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1311    _mm_adds_epi16(__m128i __A, __m128i __B) {
1312  return (__m128i)vec_adds((__v8hi)__A, (__v8hi)__B);
1313}
1314
1315extern __inline __m128i
1316    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1317    _mm_adds_epu8(__m128i __A, __m128i __B) {
1318  return (__m128i)vec_adds((__v16qu)__A, (__v16qu)__B);
1319}
1320
1321extern __inline __m128i
1322    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1323    _mm_adds_epu16(__m128i __A, __m128i __B) {
1324  return (__m128i)vec_adds((__v8hu)__A, (__v8hu)__B);
1325}
1326
1327extern __inline __m128i
1328    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1329    _mm_sub_epi8(__m128i __A, __m128i __B) {
1330  return (__m128i)((__v16qu)__A - (__v16qu)__B);
1331}
1332
1333extern __inline __m128i
1334    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1335    _mm_sub_epi16(__m128i __A, __m128i __B) {
1336  return (__m128i)((__v8hu)__A - (__v8hu)__B);
1337}
1338
1339extern __inline __m128i
1340    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1341    _mm_sub_epi32(__m128i __A, __m128i __B) {
1342  return (__m128i)((__v4su)__A - (__v4su)__B);
1343}
1344
1345extern __inline __m128i
1346    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1347    _mm_sub_epi64(__m128i __A, __m128i __B) {
1348  return (__m128i)((__v2du)__A - (__v2du)__B);
1349}
1350
1351extern __inline __m128i
1352    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1353    _mm_subs_epi8(__m128i __A, __m128i __B) {
1354  return (__m128i)vec_subs((__v16qi)__A, (__v16qi)__B);
1355}
1356
1357extern __inline __m128i
1358    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1359    _mm_subs_epi16(__m128i __A, __m128i __B) {
1360  return (__m128i)vec_subs((__v8hi)__A, (__v8hi)__B);
1361}
1362
1363extern __inline __m128i
1364    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1365    _mm_subs_epu8(__m128i __A, __m128i __B) {
1366  return (__m128i)vec_subs((__v16qu)__A, (__v16qu)__B);
1367}
1368
1369extern __inline __m128i
1370    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1371    _mm_subs_epu16(__m128i __A, __m128i __B) {
1372  return (__m128i)vec_subs((__v8hu)__A, (__v8hu)__B);
1373}
1374
1375extern __inline __m128i
1376    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1377    _mm_madd_epi16(__m128i __A, __m128i __B) {
1378  __vector signed int __zero = {0, 0, 0, 0};
1379
1380  return (__m128i)vec_vmsumshm((__v8hi)__A, (__v8hi)__B, __zero);
1381}
1382
1383extern __inline __m128i
1384    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1385    _mm_mulhi_epi16(__m128i __A, __m128i __B) {
1386  __vector signed int __w0, __w1;
1387
1388  __vector unsigned char __xform1 = {
1389#ifdef __LITTLE_ENDIAN__
1390      0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17, 0x0A,
1391      0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
1392#else
1393      0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15, 0x08,
1394      0x09, 0x18, 0x19, 0x0C, 0x0D, 0x1C, 0x1D
1395#endif
1396  };
1397
1398  __w0 = vec_vmulesh((__v8hi)__A, (__v8hi)__B);
1399  __w1 = vec_vmulosh((__v8hi)__A, (__v8hi)__B);
1400  return (__m128i)vec_perm(__w0, __w1, __xform1);
1401}
1402
1403extern __inline __m128i
1404    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1405    _mm_mullo_epi16(__m128i __A, __m128i __B) {
1406  return (__m128i)((__v8hi)__A * (__v8hi)__B);
1407}
1408
1409extern __inline __m64
1410    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1411    _mm_mul_su32(__m64 __A, __m64 __B) {
1412  unsigned int __a = __A;
1413  unsigned int __b = __B;
1414
1415  return ((__m64)__a * (__m64)__b);
1416}
1417
1418#ifdef _ARCH_PWR8
1419extern __inline __m128i
1420    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1421    _mm_mul_epu32(__m128i __A, __m128i __B) {
1422#if __GNUC__ < 8
1423  __v2du __result;
1424
1425#ifdef __LITTLE_ENDIAN__
1426  /* VMX Vector Multiply Odd Unsigned Word.  */
1427  __asm__("vmulouw %0,%1,%2" : "=v"(__result) : "v"(__A), "v"(__B) :);
1428#else
1429  /* VMX Vector Multiply Even Unsigned Word.  */
1430  __asm__("vmuleuw %0,%1,%2" : "=v"(__result) : "v"(__A), "v"(__B) :);
1431#endif
1432  return (__m128i)__result;
1433#else
1434  return (__m128i)vec_mule((__v4su)__A, (__v4su)__B);
1435#endif
1436}
1437#endif
1438
1439extern __inline __m128i
1440    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1441    _mm_slli_epi16(__m128i __A, int __B) {
1442  __v8hu __lshift;
1443  __v8hi __result = {0, 0, 0, 0, 0, 0, 0, 0};
1444
1445  if (__B >= 0 && __B < 16) {
1446    if (__builtin_constant_p(__B))
1447      __lshift = (__v8hu)vec_splat_s16(__B);
1448    else
1449      __lshift = vec_splats((unsigned short)__B);
1450
1451    __result = vec_sl((__v8hi)__A, __lshift);
1452  }
1453
1454  return (__m128i)__result;
1455}
1456
1457extern __inline __m128i
1458    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1459    _mm_slli_epi32(__m128i __A, int __B) {
1460  __v4su __lshift;
1461  __v4si __result = {0, 0, 0, 0};
1462
1463  if (__B >= 0 && __B < 32) {
1464    if (__builtin_constant_p(__B) && __B < 16)
1465      __lshift = (__v4su)vec_splat_s32(__B);
1466    else
1467      __lshift = vec_splats((unsigned int)__B);
1468
1469    __result = vec_sl((__v4si)__A, __lshift);
1470  }
1471
1472  return (__m128i)__result;
1473}
1474
1475#ifdef _ARCH_PWR8
1476extern __inline __m128i
1477    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1478    _mm_slli_epi64(__m128i __A, int __B) {
1479  __v2du __lshift;
1480  __v2di __result = {0, 0};
1481
1482  if (__B >= 0 && __B < 64) {
1483    if (__builtin_constant_p(__B) && __B < 16)
1484      __lshift = (__v2du)vec_splat_s32(__B);
1485    else
1486      __lshift = (__v2du)vec_splats((unsigned int)__B);
1487
1488    __result = vec_sl((__v2di)__A, __lshift);
1489  }
1490
1491  return (__m128i)__result;
1492}
1493#endif
1494
1495extern __inline __m128i
1496    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1497    _mm_srai_epi16(__m128i __A, int __B) {
1498  __v8hu __rshift = {15, 15, 15, 15, 15, 15, 15, 15};
1499  __v8hi __result;
1500
1501  if (__B < 16) {
1502    if (__builtin_constant_p(__B))
1503      __rshift = (__v8hu)vec_splat_s16(__B);
1504    else
1505      __rshift = vec_splats((unsigned short)__B);
1506  }
1507  __result = vec_sra((__v8hi)__A, __rshift);
1508
1509  return (__m128i)__result;
1510}
1511
1512extern __inline __m128i
1513    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1514    _mm_srai_epi32(__m128i __A, int __B) {
1515  __v4su __rshift = {31, 31, 31, 31};
1516  __v4si __result;
1517
1518  if (__B < 32) {
1519    if (__builtin_constant_p(__B)) {
1520      if (__B < 16)
1521        __rshift = (__v4su)vec_splat_s32(__B);
1522      else
1523        __rshift = (__v4su)vec_splats((unsigned int)__B);
1524    } else
1525      __rshift = vec_splats((unsigned int)__B);
1526  }
1527  __result = vec_sra((__v4si)__A, __rshift);
1528
1529  return (__m128i)__result;
1530}
1531
1532extern __inline __m128i
1533    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1534    _mm_bslli_si128(__m128i __A, const int __N) {
1535  __v16qu __result;
1536  const __v16qu __zeros = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
1537
1538  if (__N < 16)
1539    __result = vec_sld((__v16qu)__A, __zeros, __N);
1540  else
1541    __result = __zeros;
1542
1543  return (__m128i)__result;
1544}
1545
1546extern __inline __m128i
1547    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1548    _mm_bsrli_si128(__m128i __A, const int __N) {
1549  __v16qu __result;
1550  const __v16qu __zeros = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
1551
1552  if (__N < 16)
1553#ifdef __LITTLE_ENDIAN__
1554    if (__builtin_constant_p(__N))
1555      /* Would like to use Vector Shift Left Double by Octet
1556         Immediate here to use the immediate form and avoid
1557         load of __N * 8 value into a separate VR.  */
1558      __result = vec_sld(__zeros, (__v16qu)__A, (16 - __N));
1559    else
1560#endif
1561    {
1562      __v16qu __shift = vec_splats((unsigned char)(__N * 8));
1563#ifdef __LITTLE_ENDIAN__
1564      __result = vec_sro((__v16qu)__A, __shift);
1565#else
1566    __result = vec_slo((__v16qu)__A, __shift);
1567#endif
1568    }
1569  else
1570    __result = __zeros;
1571
1572  return (__m128i)__result;
1573}
1574
1575extern __inline __m128i
1576    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1577    _mm_srli_si128(__m128i __A, const int __N) {
1578  return _mm_bsrli_si128(__A, __N);
1579}
1580
1581extern __inline __m128i
1582    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1583    _mm_slli_si128(__m128i __A, const int _imm5) {
1584  __v16qu __result;
1585  const __v16qu __zeros = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
1586
1587  if (_imm5 < 16)
1588#ifdef __LITTLE_ENDIAN__
1589    __result = vec_sld((__v16qu)__A, __zeros, _imm5);
1590#else
1591    __result = vec_sld(__zeros, (__v16qu)__A, (16 - _imm5));
1592#endif
1593  else
1594    __result = __zeros;
1595
1596  return (__m128i)__result;
1597}
1598
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srli_epi16(__m128i __A, int __B) {
1603  __v8hu __rshift;
1604  __v8hi __result = {0, 0, 0, 0, 0, 0, 0, 0};
1605
1606  if (__B < 16) {
1607    if (__builtin_constant_p(__B))
1608      __rshift = (__v8hu)vec_splat_s16(__B);
1609    else
1610      __rshift = vec_splats((unsigned short)__B);
1611
1612    __result = vec_sr((__v8hi)__A, __rshift);
1613  }
1614
1615  return (__m128i)__result;
1616}
1617
1618extern __inline __m128i
1619    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1620    _mm_srli_epi32(__m128i __A, int __B) {
1621  __v4su __rshift;
1622  __v4si __result = {0, 0, 0, 0};
1623
1624  if (__B < 32) {
1625    if (__builtin_constant_p(__B)) {
1626      if (__B < 16)
1627        __rshift = (__v4su)vec_splat_s32(__B);
1628      else
1629        __rshift = (__v4su)vec_splats((unsigned int)__B);
1630    } else
1631      __rshift = vec_splats((unsigned int)__B);
1632
1633    __result = vec_sr((__v4si)__A, __rshift);
1634  }
1635
1636  return (__m128i)__result;
1637}
1638
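/* The 64-bit shifts use the vector doubleword shift instructions
   added in PowerISA 2.07, hence the _ARCH_PWR8 guard.  */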
#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srli_epi64(__m128i __A, int __B) {
  __v2du __rshift;
  __v2di __result = {0, 0};

  if (__B < 64) {
    if (__builtin_constant_p(__B)) {
      if (__B < 16)
        __rshift = (__v2du)vec_splat_s32(__B);
      else
        __rshift = (__v2du)vec_splats((unsigned long long)__B);
    } else
      __rshift = (__v2du)vec_splats((unsigned int)__B);

    __result = vec_sr((__v2di)__A, __rshift);
  }

  return (__m128i)__result;
}
#endif

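/* Shift-by-vector forms: the count is taken from the low 64 bits of
   __B and splatted to every element.  Because vec_sl shifts modulo
   the element width, a compare against the maximum legal count and a
   vec_sel are used to force out-of-range counts to produce zero.
   Roughly, for each element:  r[i] = (n <= max) ? a[i] << n : 0.  */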
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sll_epi16(__m128i __A, __m128i __B) {
  __v8hu __lshift;
  __vector __bool short __shmask;
  const __v8hu __shmax = {15, 15, 15, 15, 15, 15, 15, 15};
  __v8hu __result;

#ifdef __LITTLE_ENDIAN__
  __lshift = vec_splat((__v8hu)__B, 0);
#else
  __lshift = vec_splat((__v8hu)__B, 3);
#endif
  __shmask = vec_cmple(__lshift, __shmax);
  __result = vec_sl((__v8hu)__A, __lshift);
  __result = vec_sel((__v8hu)__shmask, __result, __shmask);

  return (__m128i)__result;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sll_epi32(__m128i __A, __m128i __B) {
  __v4su __lshift;
  __vector __bool int __shmask;
  const __v4su __shmax = {32, 32, 32, 32};
  __v4su __result;
#ifdef __LITTLE_ENDIAN__
  __lshift = vec_splat((__v4su)__B, 0);
#else
  __lshift = vec_splat((__v4su)__B, 1);
#endif
  __shmask = vec_cmplt(__lshift, __shmax);
  __result = vec_sl((__v4su)__A, __lshift);
  __result = vec_sel((__v4su)__shmask, __result, __shmask);

  return (__m128i)__result;
}

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sll_epi64(__m128i __A, __m128i __B) {
  __v2du __lshift;
  __vector __bool long long __shmask;
  const __v2du __shmax = {64, 64};
  __v2du __result;

  __lshift = vec_splat((__v2du)__B, 0);
  __shmask = vec_cmplt(__lshift, __shmax);
  __result = vec_sl((__v2du)__A, __lshift);
  __result = vec_sel((__v2du)__shmask, __result, __shmask);

  return (__m128i)__result;
}
#endif

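/* Arithmetic shift right by the count in the low 64 bits of __B.  No
   select is needed here: clamping the count to 15 (or 31) with
   vec_min already gives the all-sign-bits result that SSE2 defines
   for larger counts.  */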
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sra_epi16(__m128i __A, __m128i __B) {
  const __v8hu __rshmax = {15, 15, 15, 15, 15, 15, 15, 15};
  __v8hu __rshift;
  __v8hi __result;

#ifdef __LITTLE_ENDIAN__
  __rshift = vec_splat((__v8hu)__B, 0);
#else
  __rshift = vec_splat((__v8hu)__B, 3);
#endif
  __rshift = vec_min(__rshift, __rshmax);
  __result = vec_sra((__v8hi)__A, __rshift);

  return (__m128i)__result;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sra_epi32(__m128i __A, __m128i __B) {
  const __v4su __rshmax = {31, 31, 31, 31};
  __v4su __rshift;
  __v4si __result;

#ifdef __LITTLE_ENDIAN__
  __rshift = vec_splat((__v4su)__B, 0);
#else
  __rshift = vec_splat((__v4su)__B, 1);
#endif
  __rshift = vec_min(__rshift, __rshmax);
  __result = vec_sra((__v4si)__A, __rshift);

  return (__m128i)__result;
}

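/* Logical shift right by the count in the low 64 bits of __B; as with
   the left shifts above, the compare/select pair forces out-of-range
   counts to produce zero.  */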
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srl_epi16(__m128i __A, __m128i __B) {
  __v8hu __rshift;
  __vector __bool short __shmask;
  const __v8hu __shmax = {15, 15, 15, 15, 15, 15, 15, 15};
  __v8hu __result;

#ifdef __LITTLE_ENDIAN__
  __rshift = vec_splat((__v8hu)__B, 0);
#else
  __rshift = vec_splat((__v8hu)__B, 3);
#endif
  __shmask = vec_cmple(__rshift, __shmax);
  __result = vec_sr((__v8hu)__A, __rshift);
  __result = vec_sel((__v8hu)__shmask, __result, __shmask);

  return (__m128i)__result;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srl_epi32(__m128i __A, __m128i __B) {
  __v4su __rshift;
  __vector __bool int __shmask;
  const __v4su __shmax = {32, 32, 32, 32};
  __v4su __result;

#ifdef __LITTLE_ENDIAN__
  __rshift = vec_splat((__v4su)__B, 0);
#else
  __rshift = vec_splat((__v4su)__B, 1);
#endif
  __shmask = vec_cmplt(__rshift, __shmax);
  __result = vec_sr((__v4su)__A, __rshift);
  __result = vec_sel((__v4su)__shmask, __result, __shmask);

  return (__m128i)__result;
}

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srl_epi64(__m128i __A, __m128i __B) {
  __v2du __rshift;
  __vector __bool long long __shmask;
  const __v2du __shmax = {64, 64};
  __v2du __result;

  __rshift = vec_splat((__v2du)__B, 0);
  __shmask = vec_cmplt(__rshift, __shmax);
  __result = vec_sr((__v2du)__A, __rshift);
  __result = vec_sel((__v2du)__shmask, __result, __shmask);

  return (__m128i)__result;
}
#endif

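/* Bitwise logic on the 128-bit value, for both the double-precision
   and integer views.  These map directly onto vec_and, vec_andc,
   vec_or and vec_xor; note that the "andnot" forms compute (~A) & B.  */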
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_and_pd(__m128d __A, __m128d __B) {
  return (vec_and((__v2df)__A, (__v2df)__B));
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_andnot_pd(__m128d __A, __m128d __B) {
  return (vec_andc((__v2df)__B, (__v2df)__A));
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_or_pd(__m128d __A, __m128d __B) {
  return (vec_or((__v2df)__A, (__v2df)__B));
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_xor_pd(__m128d __A, __m128d __B) {
  return (vec_xor((__v2df)__A, (__v2df)__B));
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_and_si128(__m128i __A, __m128i __B) {
  return (__m128i)vec_and((__v2di)__A, (__v2di)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_andnot_si128(__m128i __A, __m128i __B) {
  return (__m128i)vec_andc((__v2di)__B, (__v2di)__A);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_or_si128(__m128i __A, __m128i __B) {
  return (__m128i)vec_or((__v2di)__A, (__v2di)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_xor_si128(__m128i __A, __m128i __B) {
  return (__m128i)vec_xor((__v2di)__A, (__v2di)__B);
}

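/* Element-wise integer comparisons.  Each result element is all ones
   when the relation holds and all zeros otherwise, the usual SSE2
   mask convention.  */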
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpeq_epi8(__m128i __A, __m128i __B) {
  return (__m128i)vec_cmpeq((__v16qi)__A, (__v16qi)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpeq_epi16(__m128i __A, __m128i __B) {
  return (__m128i)vec_cmpeq((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpeq_epi32(__m128i __A, __m128i __B) {
  return (__m128i)vec_cmpeq((__v4si)__A, (__v4si)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmplt_epi8(__m128i __A, __m128i __B) {
  return (__m128i)vec_cmplt((__v16qi)__A, (__v16qi)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmplt_epi16(__m128i __A, __m128i __B) {
  return (__m128i)vec_cmplt((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmplt_epi32(__m128i __A, __m128i __B) {
  return (__m128i)vec_cmplt((__v4si)__A, (__v4si)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpgt_epi8(__m128i __A, __m128i __B) {
  return (__m128i)vec_cmpgt((__v16qi)__A, (__v16qi)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpgt_epi16(__m128i __A, __m128i __B) {
  return (__m128i)vec_cmpgt((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpgt_epi32(__m128i __A, __m128i __B) {
  return (__m128i)vec_cmpgt((__v4si)__A, (__v4si)__B);
}

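/* Extract or replace one of the eight halfwords.  Only the low three
   bits of __N are used, as with the SSE2 pextrw/pinsrw selectors, and
   the extracted value is zero-extended.  */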
extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_extract_epi16(__m128i const __A, int const __N) {
  return (unsigned short)((__v8hi)__A)[__N & 7];
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_insert_epi16(__m128i const __A, int const __D, int const __N) {
  __v8hi __result = (__v8hi)__A;

  __result[(__N & 7)] = __D;

  return (__m128i)__result;
}

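/* Signed halfword and unsigned byte min/max map directly onto the
   corresponding vec_min/vec_max operations.  */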
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_max_epi16(__m128i __A, __m128i __B) {
  return (__m128i)vec_max((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_max_epu8(__m128i __A, __m128i __B) {
  return (__m128i)vec_max((__v16qu)__A, (__v16qu)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_min_epi16(__m128i __A, __m128i __B) {
  return (__m128i)vec_min((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_min_epu8(__m128i __A, __m128i __B) {
  return (__m128i)vec_min((__v16qu)__A, (__v16qu)__B);
}

#ifdef _ARCH_PWR8
/* Intrinsic functions that require PowerISA 2.07 minimum.  */

/* Return a mask created from the most significant bit of each 8-bit
   element in A.  */
extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_movemask_epi8(__m128i __A) {
#ifdef _ARCH_PWR10
  return vec_extractm((__v16qu)__A);
#else
  __vector unsigned long long __result;
  static const __vector unsigned char __perm_mask = {
      0x78, 0x70, 0x68, 0x60, 0x58, 0x50, 0x48, 0x40,
      0x38, 0x30, 0x28, 0x20, 0x18, 0x10, 0x08, 0x00};

  __result = ((__vector unsigned long long)vec_vbpermq(
      (__vector unsigned char)__A, (__vector unsigned char)__perm_mask));

#ifdef __LITTLE_ENDIAN__
  return __result[1];
#else
  return __result[0];
#endif
#endif /* !_ARCH_PWR10 */
}
#endif /* _ARCH_PWR8 */

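/* Multiply the unsigned halfwords and keep the high 16 bits of each
   32-bit product.  The even/odd multiplies produce the full products;
   the permute control then gathers the high half of each product back
   into element order.  */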
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mulhi_epu16(__m128i __A, __m128i __B) {
  __v4su __w0, __w1;
  __v16qu __xform1 = {
#ifdef __LITTLE_ENDIAN__
      0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17, 0x0A,
      0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
#else
      0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15, 0x08,
      0x09, 0x18, 0x19, 0x0C, 0x0D, 0x1C, 0x1D
#endif
  };

  __w0 = vec_vmuleuh((__v8hu)__A, (__v8hu)__B);
  __w1 = vec_vmulouh((__v8hu)__A, (__v8hu)__B);
  return (__m128i)vec_perm(__w0, __w1, __xform1);
}

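/* The shuffle intrinsics translate the 8-bit immediate selector into
   a vec_perm control vector: each 2-bit field picks one source
   element, and for the hi/lo halfword forms the other half of the
   register is passed through unchanged.  As an illustration only,
     _mm_shufflehi_epi16 (__A, _MM_SHUFFLE (0, 1, 2, 3))
   reverses the order of the four high halfwords while leaving the
   low halfwords untouched.  */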
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_shufflehi_epi16(__m128i __A, const int __mask) {
  unsigned long __element_selector_98 = __mask & 0x03;
  unsigned long __element_selector_BA = (__mask >> 2) & 0x03;
  unsigned long __element_selector_DC = (__mask >> 4) & 0x03;
  unsigned long __element_selector_FE = (__mask >> 6) & 0x03;
  static const unsigned short __permute_selectors[4] = {
#ifdef __LITTLE_ENDIAN__
      0x0908, 0x0B0A, 0x0D0C, 0x0F0E
#else
      0x0809, 0x0A0B, 0x0C0D, 0x0E0F
#endif
  };
  __v2du __pmask =
#ifdef __LITTLE_ENDIAN__
      {0x1716151413121110UL, 0UL};
#else
      {0x1011121314151617UL, 0UL};
#endif
  __m64_union __t;
  __v2du __a, __r;

  __t.as_short[0] = __permute_selectors[__element_selector_98];
  __t.as_short[1] = __permute_selectors[__element_selector_BA];
  __t.as_short[2] = __permute_selectors[__element_selector_DC];
  __t.as_short[3] = __permute_selectors[__element_selector_FE];
  __pmask[1] = __t.as_m64;
  __a = (__v2du)__A;
  __r = vec_perm(__a, __a, (__vector unsigned char)__pmask);
  return (__m128i)__r;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_shufflelo_epi16(__m128i __A, const int __mask) {
  unsigned long __element_selector_10 = __mask & 0x03;
  unsigned long __element_selector_32 = (__mask >> 2) & 0x03;
  unsigned long __element_selector_54 = (__mask >> 4) & 0x03;
  unsigned long __element_selector_76 = (__mask >> 6) & 0x03;
  static const unsigned short __permute_selectors[4] = {
#ifdef __LITTLE_ENDIAN__
      0x0100, 0x0302, 0x0504, 0x0706
#else
      0x0001, 0x0203, 0x0405, 0x0607
#endif
  };
  __v2du __pmask =
#ifdef __LITTLE_ENDIAN__
      {0UL, 0x1f1e1d1c1b1a1918UL};
#else
      {0UL, 0x18191a1b1c1d1e1fUL};
#endif
  __m64_union __t;
  __v2du __a, __r;
  __t.as_short[0] = __permute_selectors[__element_selector_10];
  __t.as_short[1] = __permute_selectors[__element_selector_32];
  __t.as_short[2] = __permute_selectors[__element_selector_54];
  __t.as_short[3] = __permute_selectors[__element_selector_76];
  __pmask[0] = __t.as_m64;
  __a = (__v2du)__A;
  __r = vec_perm(__a, __a, (__vector unsigned char)__pmask);
  return (__m128i)__r;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_shuffle_epi32(__m128i __A, const int __mask) {
  unsigned long __element_selector_10 = __mask & 0x03;
  unsigned long __element_selector_32 = (__mask >> 2) & 0x03;
  unsigned long __element_selector_54 = (__mask >> 4) & 0x03;
  unsigned long __element_selector_76 = (__mask >> 6) & 0x03;
  static const unsigned int __permute_selectors[4] = {
#ifdef __LITTLE_ENDIAN__
      0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
#else
      0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F
#endif
  };
  __v4su __t;

  __t[0] = __permute_selectors[__element_selector_10];
  __t[1] = __permute_selectors[__element_selector_32];
  __t[2] = __permute_selectors[__element_selector_54] + 0x10101010;
  __t[3] = __permute_selectors[__element_selector_76] + 0x10101010;
  return (__m128i)vec_perm((__v4si)__A, (__v4si)__A,
                           (__vector unsigned char)__t);
}

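/* Conditionally store bytes of __A to __C wherever the corresponding
   byte of __B has its most significant bit set.  This is emulated as
   a full 16-byte load, vec_sel merge and store, so unlike MASKMOVDQU
   it reads and rewrites all 16 bytes at __C and is not non-temporal.  */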
extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_maskmoveu_si128(__m128i __A, __m128i __B, char *__C) {
  __v2du __hibit = {0x7f7f7f7f7f7f7f7fUL, 0x7f7f7f7f7f7f7f7fUL};
  __v16qu __mask, __tmp;
  __m128i_u *__p = (__m128i_u *)__C;

  __tmp = (__v16qu)_mm_loadu_si128(__p);
  __mask = (__v16qu)vec_cmpgt((__v16qu)__B, (__v16qu)__hibit);
  __tmp = vec_sel(__tmp, (__v16qu)__A, __mask);
  _mm_storeu_si128(__p, (__m128i)__tmp);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_avg_epu8(__m128i __A, __m128i __B) {
  return (__m128i)vec_avg((__v16qu)__A, (__v16qu)__B);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_avg_epu16(__m128i __A, __m128i __B) {
  return (__m128i)vec_avg((__v8hu)__A, (__v8hu)__B);
}

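/* Sum of absolute differences: |a[i] - b[i]| is computed for all 16
   bytes, and each group of eight differences is summed into a 16-bit
   value returned zero-extended in the corresponding 64-bit element.
   Without the POWER9 vec_absd, the absolute difference is formed as
   max - min.  */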
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sad_epu8(__m128i __A, __m128i __B) {
  __v16qu __a, __b;
  __v16qu __vabsdiff;
  __v4si __vsum;
  const __v4su __zero = {0, 0, 0, 0};
  __v4si __result;

  __a = (__v16qu)__A;
  __b = (__v16qu)__B;
#ifndef _ARCH_PWR9
  __v16qu __vmin = vec_min(__a, __b);
  __v16qu __vmax = vec_max(__a, __b);
  __vabsdiff = vec_sub(__vmax, __vmin);
#else
  __vabsdiff = vec_absd(__a, __b);
#endif
  /* Sum four groups of bytes into integers.  */
  __vsum = (__vector signed int)vec_sum4s(__vabsdiff, __zero);
#ifdef __LITTLE_ENDIAN__
  /* Sum across four integers with two integer results.  */
  __asm__("vsum2sws %0,%1,%2" : "=v"(__result) : "v"(__vsum), "v"(__zero));
  /* Note: vec_sum2s could be used here, but on little-endian, vector
     shifts are added that are not needed for this use-case.
     A vector shift to correctly position the 32-bit integer results
     (currently at [0] and [2]) to [1] and [3] would then need to be
     swapped back again since the desired results are two 64-bit
     integers ([1]|[0] and [3]|[2]).  Thus, no shift is performed.  */
#else
  /* Sum across four integers with two integer results.  */
  __result = vec_sum2s(__vsum, (__vector signed int)__zero);
  /* Rotate the sums into the correct position.  */
  __result = vec_sld(__result, __result, 6);
#endif
  return (__m128i)__result;
}

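/* Non-temporal store hints.  The PowerISA has no direct equivalent of
   the x86 non-temporal stores, so these issue a "data cache block
   touch for store, transient" hint (dcbtstt) and then perform an
   ordinary store.  */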
extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_stream_si32(int *__A, int __B) {
  /* Use the data cache block touch for store transient.  */
  __asm__("dcbtstt 0,%0" : : "b"(__A) : "memory");
  *__A = __B;
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_stream_si64(long long int *__A, long long int __B) {
  /* Use the data cache block touch for store transient.  */
  __asm__("dcbtstt 0,%0" : : "b"(__A) : "memory");
  *__A = __B;
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_stream_si128(__m128i *__A, __m128i __B) {
  /* Use the data cache block touch for store transient.  */
  __asm__("dcbtstt 0,%0" : : "b"(__A) : "memory");
  *__A = __B;
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_stream_pd(double *__A, __m128d __B) {
  /* Use the data cache block touch for store transient.  */
  __asm__("dcbtstt 0,%0" : : "b"(__A) : "memory");
  *(__m128d *)__A = __B;
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_clflush(void const *__A) {
  /* Use the data cache block flush.  */
  __asm__("dcbf 0,%0" : : "b"(__A) : "memory");
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_lfence(void) {
  /* Use light weight sync for load to load ordering.  */
  __atomic_thread_fence(__ATOMIC_RELEASE);
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mfence(void) {
  /* Use heavy weight sync for any to any ordering.  */
  __atomic_thread_fence(__ATOMIC_SEQ_CST);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi32_si128(int __A) {
  return _mm_set_epi32(0, 0, 0, __A);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi64_si128(long long __A) {
  return __extension__(__m128i)(__v2di){__A, 0LL};
}

/* Microsoft intrinsic.  */
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi64x_si128(long long __A) {
  return __extension__(__m128i)(__v2di){__A, 0LL};
}

/* Casts between various SP, DP, INT vector types.  Note that these do no
   conversion of values, they just change the type.  */
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_castpd_ps(__m128d __A) {
  return (__m128)__A;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_castpd_si128(__m128d __A) {
  return (__m128i)__A;
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_castps_pd(__m128 __A) {
  return (__m128d)__A;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_castps_si128(__m128 __A) {
  return (__m128i)__A;
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_castsi128_ps(__m128i __A) {
  return (__m128)__A;
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_castsi128_pd(__m128i __A) {
  return (__m128d)__A;
}

#else
#include_next <emmintrin.h>
#endif /* defined(__powerpc64__) &&                                            \
        *   (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)) */

#endif /* EMMINTRIN_H_ */