/*===---- mmintrin.h - Implementation of MMX intrinsics on PowerPC ---------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0.  */

#ifndef NO_WARN_X86_INTRINSICS
/* This header is intended to help port code that uses Intel intrinsics
   explicitly from x86_64 to powerpc64/powerpc64le.

   Since the PowerPC target does not support a native 64-bit vector type,
   these MMX intrinsics typedef __m64 to a 64-bit unsigned long long,
   which works well for _si64 and some _pi32 operations.

   For _pi16 and _pi8 operations, it is better to transfer the __m64 into
   a 128-bit PowerPC vector first. Power8 introduced direct register
   move instructions, which help make such implementations more efficient.

   It is the user's responsibility to determine whether the results of such
   a port are acceptable or whether further changes are needed. Please note
   that much code using Intel intrinsics CAN BE REWRITTEN in more portable
   and efficient standard C or GNU C extensions with 64-bit scalar
   operations, or with 128-bit SSE/Altivec operations, which is the
   recommended approach. (An illustrative sketch of such a rewrite follows
   this block.) */
#error                                                                         \
    "Please read comment above.  Use -DNO_WARN_X86_INTRINSICS to disable this error."
#endif
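
/* Illustrative sketch only (never compiled): the hypothetical helpers below
   show, assuming VSX support (-mvsx / POWER8 or later) and 8-byte-aligned
   data, how a loop ported with the MMX-style intrinsics from this header
   can be rewritten with 128-bit Altivec/VSX operations, as the comment
   above recommends.  The function and parameter names are invented for the
   example.  */
#if 0
#include <altivec.h>

/* Ported as-is: four 16-bit lanes per iteration via __m64.  */
static void add_shorts_mmx(short *__dst, short *__a, short *__b,
                           unsigned long __n) {
  for (unsigned long __i = 0; __i + 4 <= __n; __i += 4) {
    __m64 __va = *(__m64 *)(__a + __i);
    __m64 __vb = *(__m64 *)(__b + __i);
    *(__m64 *)(__dst + __i) = _mm_add_pi16(__va, __vb);
  }
}

/* Preferred rewrite: eight 16-bit lanes per iteration with Altivec/VSX.  */
static void add_shorts_vmx(short *__dst, short *__a, short *__b,
                           unsigned long __n) {
  for (unsigned long __i = 0; __i + 8 <= __n; __i += 8) {
    __vector signed short __va = vec_xl(0, __a + __i);
    __vector signed short __vb = vec_xl(0, __b + __i);
    vec_xst(vec_add(__va, __vb), 0, __dst + __i);
  }
}
#endif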

#ifndef _MMINTRIN_H_INCLUDED
#define _MMINTRIN_H_INCLUDED

#if defined(__powerpc64__) &&                                                  \
    (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX))

#include <altivec.h>
/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components.  */
typedef __attribute__((__aligned__(8))) unsigned long long __m64;

typedef __attribute__((__aligned__(8))) union {
  __m64 as_m64;
  char as_char[8];
  signed char as_signed_char[8];
  short as_short[4];
  int as_int[2];
  long long as_long_long;
  float as_float[2];
  double as_double;
} __m64_union;
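
/* Illustrative sketch only (never compiled): the union above is how the
   scalar fallbacks in this header view an __m64 as lanes.  Lane 0 is the
   element that _mm_set_pi16()/_mm_set_pi8() fill from their last argument.
   The helper name is invented for the example.  */
#if 0
static int lowest_short_lane(__m64 __m) {
  __m64_union __u;
  __u.as_m64 = __m;
  /* For example, _mm_set_pi16(4, 3, 2, 1) yields 1 here.  */
  return __u.as_short[0];
}
#endif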

/* Empty the multimedia state.  */
extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_empty(void) {
  /* nothing to do on PowerPC.  */
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_empty(void) {
  /* nothing to do on PowerPC.  */
}

/* Convert I to a __m64 object.  The integer is zero-extended to 64 bits.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi32_si64(int __i) {
  return (__m64)(unsigned int)__i;
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_from_int(int __i) {
  return _mm_cvtsi32_si64(__i);
}

/* Convert the lower 32 bits of the __m64 object into an integer.  */
extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi64_si32(__m64 __i) {
  return ((int)__i);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_to_int(__m64 __i) {
  return _mm_cvtsi64_si32(__i);
}

/* Convert I to a __m64 object.  */

/* Intel intrinsic.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_from_int64(long long __i) {
  return (__m64)__i;
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi64_m64(long long __i) {
  return (__m64)__i;
}

/* Microsoft intrinsic.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi64x_si64(long long __i) {
  return (__m64)__i;
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set_pi64x(long long __i) {
  return (__m64)__i;
}

/* Convert the __m64 object to a 64-bit integer.  */

/* Intel intrinsic.  */
extern __inline long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_to_int64(__m64 __i) {
  return (long long)__i;
}

extern __inline long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtm64_si64(__m64 __i) {
  return (long long)__i;
}

/* Microsoft intrinsic.  */
extern __inline long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi64_si64x(__m64 __i) {
  return (long long)__i;
}

#ifdef _ARCH_PWR8
/* Pack the four 16-bit values from M1 into the lower four 8-bit values of
   the result, and the four 16-bit values from M2 into the upper four 8-bit
   values of the result, all with signed saturation.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_packs_pi16(__m64 __m1, __m64 __m2) {
  __vector signed short __vm1;
  __vector signed char __vresult;

  __vm1 = (__vector signed short)(__vector unsigned long long)
#ifdef __LITTLE_ENDIAN__
      {__m1, __m2};
#else
      {__m2, __m1};
#endif
  __vresult = vec_packs(__vm1, __vm1);
  return (__m64)((__vector long long)__vresult)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_packsswb(__m64 __m1, __m64 __m2) {
  return _mm_packs_pi16(__m1, __m2);
}

/* Pack the two 32-bit values from M1 into the lower two 16-bit values of
   the result, and the two 32-bit values from M2 into the upper two 16-bit
   values of the result, all with signed saturation.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_packs_pi32(__m64 __m1, __m64 __m2) {
  __vector signed int __vm1;
  __vector signed short __vresult;

  __vm1 = (__vector signed int)(__vector unsigned long long)
#ifdef __LITTLE_ENDIAN__
      {__m1, __m2};
#else
      {__m2, __m1};
#endif
  __vresult = vec_packs(__vm1, __vm1);
  return (__m64)((__vector long long)__vresult)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_packssdw(__m64 __m1, __m64 __m2) {
  return _mm_packs_pi32(__m1, __m2);
}

/* Pack the four 16-bit values from M1 into the lower four 8-bit values of
   the result, and the four 16-bit values from M2 into the upper four 8-bit
   values of the result, all with unsigned saturation.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_packs_pu16(__m64 __m1, __m64 __m2) {
  __vector unsigned char __r;
  __vector signed short __vm1 = (__vector signed short)(__vector long long)
#ifdef __LITTLE_ENDIAN__
      {__m1, __m2};
#else
      {__m2, __m1};
#endif
  const __vector signed short __zero = {0};
  __vector __bool short __select = vec_cmplt(__vm1, __zero);
  __r =
      vec_packs((__vector unsigned short)__vm1, (__vector unsigned short)__vm1);
  __vector __bool char __packsel = vec_pack(__select, __select);
  __r = vec_sel(__r, (const __vector unsigned char)__zero, __packsel);
  return (__m64)((__vector long long)__r)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_packuswb(__m64 __m1, __m64 __m2) {
  return _mm_packs_pu16(__m1, __m2);
}
#endif /* end ARCH_PWR8 */
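
/* Illustrative sketch only (never compiled): signed vs. unsigned saturation
   in the pack operations above.  The helper name is invented for the
   example.  */
#if 0
static void pack_examples(void) {
  __m64 __a = _mm_set_pi16(300, -300, 7, -7);
  __m64 __b = _mm_setzero_si64();
  /* Signed saturation clamps to [-128, 127]: the low four result bytes
     (from __a) are -7, 7, -128, 127.  */
  __m64 __s = _mm_packs_pi16(__a, __b);
  /* Unsigned saturation clamps to [0, 255]: the low four result bytes
     (from __a) are 0, 7, 0, 255.  */
  __m64 __u = _mm_packs_pu16(__a, __b);
  (void)__s;
  (void)__u;
}
#endif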

/* Interleave the four 8-bit values from the high half of M1 with the four
   8-bit values from the high half of M2.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpackhi_pi8(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector unsigned char __a, __b, __c;

  __a = (__vector unsigned char)vec_splats(__m1);
  __b = (__vector unsigned char)vec_splats(__m2);
  __c = vec_mergel(__a, __b);
  return (__m64)((__vector long long)__c)[1];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_char[0] = __mu1.as_char[4];
  __res.as_char[1] = __mu2.as_char[4];
  __res.as_char[2] = __mu1.as_char[5];
  __res.as_char[3] = __mu2.as_char[5];
  __res.as_char[4] = __mu1.as_char[6];
  __res.as_char[5] = __mu2.as_char[6];
  __res.as_char[6] = __mu1.as_char[7];
  __res.as_char[7] = __mu2.as_char[7];

  return (__m64)__res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_punpckhbw(__m64 __m1, __m64 __m2) {
  return _mm_unpackhi_pi8(__m1, __m2);
}

/* Interleave the two 16-bit values from the high half of M1 with the two
   16-bit values from the high half of M2.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpackhi_pi16(__m64 __m1, __m64 __m2) {
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_short[0] = __mu1.as_short[2];
  __res.as_short[1] = __mu2.as_short[2];
  __res.as_short[2] = __mu1.as_short[3];
  __res.as_short[3] = __mu2.as_short[3];

  return (__m64)__res.as_m64;
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_punpckhwd(__m64 __m1, __m64 __m2) {
  return _mm_unpackhi_pi16(__m1, __m2);
}
/* Interleave the 32-bit value from the high half of M1 with the 32-bit
   value from the high half of M2.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpackhi_pi32(__m64 __m1, __m64 __m2) {
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_int[0] = __mu1.as_int[1];
  __res.as_int[1] = __mu2.as_int[1];

  return (__m64)__res.as_m64;
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_punpckhdq(__m64 __m1, __m64 __m2) {
  return _mm_unpackhi_pi32(__m1, __m2);
}
/* Interleave the four 8-bit values from the low half of M1 with the four
   8-bit values from the low half of M2.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpacklo_pi8(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector unsigned char __a, __b, __c;

  __a = (__vector unsigned char)vec_splats(__m1);
  __b = (__vector unsigned char)vec_splats(__m2);
  __c = vec_mergel(__a, __b);
  return (__m64)((__vector long long)__c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_char[0] = __mu1.as_char[0];
  __res.as_char[1] = __mu2.as_char[0];
  __res.as_char[2] = __mu1.as_char[1];
  __res.as_char[3] = __mu2.as_char[1];
  __res.as_char[4] = __mu1.as_char[2];
  __res.as_char[5] = __mu2.as_char[2];
  __res.as_char[6] = __mu1.as_char[3];
  __res.as_char[7] = __mu2.as_char[3];

  return (__m64)__res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_punpcklbw(__m64 __m1, __m64 __m2) {
  return _mm_unpacklo_pi8(__m1, __m2);
}
/* Interleave the two 16-bit values from the low half of M1 with the two
   16-bit values from the low half of M2.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpacklo_pi16(__m64 __m1, __m64 __m2) {
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_short[0] = __mu1.as_short[0];
  __res.as_short[1] = __mu2.as_short[0];
  __res.as_short[2] = __mu1.as_short[1];
  __res.as_short[3] = __mu2.as_short[1];

  return (__m64)__res.as_m64;
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_punpcklwd(__m64 __m1, __m64 __m2) {
  return _mm_unpacklo_pi16(__m1, __m2);
}

/* Interleave the 32-bit value from the low half of M1 with the 32-bit
   value from the low half of M2.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpacklo_pi32(__m64 __m1, __m64 __m2) {
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_int[0] = __mu1.as_int[0];
  __res.as_int[1] = __mu2.as_int[0];

  return (__m64)__res.as_m64;
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_punpckldq(__m64 __m1, __m64 __m2) {
  return _mm_unpacklo_pi32(__m1, __m2);
}

/* Add the 8-bit values in M1 to the 8-bit values in M2.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_add_pi8(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector signed char __a, __b, __c;

  __a = (__vector signed char)vec_splats(__m1);
  __b = (__vector signed char)vec_splats(__m2);
  __c = vec_add(__a, __b);
  return (__m64)((__vector long long)__c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_char[0] = __mu1.as_char[0] + __mu2.as_char[0];
  __res.as_char[1] = __mu1.as_char[1] + __mu2.as_char[1];
  __res.as_char[2] = __mu1.as_char[2] + __mu2.as_char[2];
  __res.as_char[3] = __mu1.as_char[3] + __mu2.as_char[3];
  __res.as_char[4] = __mu1.as_char[4] + __mu2.as_char[4];
  __res.as_char[5] = __mu1.as_char[5] + __mu2.as_char[5];
  __res.as_char[6] = __mu1.as_char[6] + __mu2.as_char[6];
  __res.as_char[7] = __mu1.as_char[7] + __mu2.as_char[7];

  return (__m64)__res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_paddb(__m64 __m1, __m64 __m2) {
  return _mm_add_pi8(__m1, __m2);
}

/* Add the 16-bit values in M1 to the 16-bit values in M2.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_add_pi16(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector signed short __a, __b, __c;

  __a = (__vector signed short)vec_splats(__m1);
  __b = (__vector signed short)vec_splats(__m2);
  __c = vec_add(__a, __b);
  return (__m64)((__vector long long)__c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_short[0] = __mu1.as_short[0] + __mu2.as_short[0];
  __res.as_short[1] = __mu1.as_short[1] + __mu2.as_short[1];
  __res.as_short[2] = __mu1.as_short[2] + __mu2.as_short[2];
  __res.as_short[3] = __mu1.as_short[3] + __mu2.as_short[3];

  return (__m64)__res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_paddw(__m64 __m1, __m64 __m2) {
  return _mm_add_pi16(__m1, __m2);
}

/* Add the 32-bit values in M1 to the 32-bit values in M2.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_add_pi32(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR9
  __vector signed int __a, __b, __c;

  __a = (__vector signed int)vec_splats(__m1);
  __b = (__vector signed int)vec_splats(__m2);
  __c = vec_add(__a, __b);
  return (__m64)((__vector long long)__c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_int[0] = __mu1.as_int[0] + __mu2.as_int[0];
  __res.as_int[1] = __mu1.as_int[1] + __mu2.as_int[1];

  return (__m64)__res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_paddd(__m64 __m1, __m64 __m2) {
  return _mm_add_pi32(__m1, __m2);
}

/* Subtract the 8-bit values in M2 from the 8-bit values in M1.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sub_pi8(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector signed char __a, __b, __c;

  __a = (__vector signed char)vec_splats(__m1);
  __b = (__vector signed char)vec_splats(__m2);
  __c = vec_sub(__a, __b);
  return (__m64)((__vector long long)__c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_char[0] = __mu1.as_char[0] - __mu2.as_char[0];
  __res.as_char[1] = __mu1.as_char[1] - __mu2.as_char[1];
  __res.as_char[2] = __mu1.as_char[2] - __mu2.as_char[2];
  __res.as_char[3] = __mu1.as_char[3] - __mu2.as_char[3];
  __res.as_char[4] = __mu1.as_char[4] - __mu2.as_char[4];
  __res.as_char[5] = __mu1.as_char[5] - __mu2.as_char[5];
  __res.as_char[6] = __mu1.as_char[6] - __mu2.as_char[6];
  __res.as_char[7] = __mu1.as_char[7] - __mu2.as_char[7];

  return (__m64)__res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psubb(__m64 __m1, __m64 __m2) {
  return _mm_sub_pi8(__m1, __m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sub_pi16(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector signed short __a, __b, __c;

  __a = (__vector signed short)vec_splats(__m1);
  __b = (__vector signed short)vec_splats(__m2);
  __c = vec_sub(__a, __b);
  return (__m64)((__vector long long)__c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_short[0] = __mu1.as_short[0] - __mu2.as_short[0];
  __res.as_short[1] = __mu1.as_short[1] - __mu2.as_short[1];
  __res.as_short[2] = __mu1.as_short[2] - __mu2.as_short[2];
  __res.as_short[3] = __mu1.as_short[3] - __mu2.as_short[3];

  return (__m64)__res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psubw(__m64 __m1, __m64 __m2) {
  return _mm_sub_pi16(__m1, __m2);
}

/* Subtract the 32-bit values in M2 from the 32-bit values in M1.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sub_pi32(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR9
  __vector signed int __a, __b, __c;

  __a = (__vector signed int)vec_splats(__m1);
  __b = (__vector signed int)vec_splats(__m2);
  __c = vec_sub(__a, __b);
  return (__m64)((__vector long long)__c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_int[0] = __mu1.as_int[0] - __mu2.as_int[0];
  __res.as_int[1] = __mu1.as_int[1] - __mu2.as_int[1];

  return (__m64)__res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psubd(__m64 __m1, __m64 __m2) {
  return _mm_sub_pi32(__m1, __m2);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_add_si64(__m64 __m1, __m64 __m2) {
  return (__m1 + __m2);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sub_si64(__m64 __m1, __m64 __m2) {
  return (__m1 - __m2);
}

/* Shift the 64-bit value in M left by COUNT.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sll_si64(__m64 __m, __m64 __count) {
  return (__m << __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psllq(__m64 __m, __m64 __count) {
  return _mm_sll_si64(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_slli_si64(__m64 __m, const int __count) {
  return (__m << __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psllqi(__m64 __m, const int __count) {
  return _mm_slli_si64(__m, __count);
}

/* Shift the 64-bit value in M right by COUNT; shift in zeros.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srl_si64(__m64 __m, __m64 __count) {
  return (__m >> __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psrlq(__m64 __m, __m64 __count) {
  return _mm_srl_si64(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srli_si64(__m64 __m, const int __count) {
  return (__m >> __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psrlqi(__m64 __m, const int __count) {
  return _mm_srli_si64(__m, __count);
}

/* Bit-wise AND the 64-bit values in M1 and M2.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_and_si64(__m64 __m1, __m64 __m2) {
  return (__m1 & __m2);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pand(__m64 __m1, __m64 __m2) {
  return _mm_and_si64(__m1, __m2);
}

/* Bit-wise complement the 64-bit value in M1 and bit-wise AND it with the
   64-bit value in M2.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_andnot_si64(__m64 __m1, __m64 __m2) {
  return (~__m1 & __m2);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pandn(__m64 __m1, __m64 __m2) {
  return _mm_andnot_si64(__m1, __m2);
}

/* Bit-wise inclusive OR the 64-bit values in M1 and M2.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_or_si64(__m64 __m1, __m64 __m2) {
  return (__m1 | __m2);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_por(__m64 __m1, __m64 __m2) {
  return _mm_or_si64(__m1, __m2);
}

/* Bit-wise exclusive OR the 64-bit values in M1 and M2.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_xor_si64(__m64 __m1, __m64 __m2) {
  return (__m1 ^ __m2);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pxor(__m64 __m1, __m64 __m2) {
  return _mm_xor_si64(__m1, __m2);
}

/* Creates a 64-bit zero.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_setzero_si64(void) {
  return (__m64)0;
}

/* Compare eight 8-bit values.  The result of the comparison is 0xFF if the
   test is true and zero if false.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpeq_pi8(__m64 __m1, __m64 __m2) {
#if defined(_ARCH_PWR6) && defined(__powerpc64__)
  __m64 __res;
  __asm__("cmpb %0,%1,%2;\n" : "=r"(__res) : "r"(__m1), "r"(__m2) :);
  return (__res);
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_char[0] = (__mu1.as_char[0] == __mu2.as_char[0]) ? -1 : 0;
  __res.as_char[1] = (__mu1.as_char[1] == __mu2.as_char[1]) ? -1 : 0;
  __res.as_char[2] = (__mu1.as_char[2] == __mu2.as_char[2]) ? -1 : 0;
  __res.as_char[3] = (__mu1.as_char[3] == __mu2.as_char[3]) ? -1 : 0;
  __res.as_char[4] = (__mu1.as_char[4] == __mu2.as_char[4]) ? -1 : 0;
  __res.as_char[5] = (__mu1.as_char[5] == __mu2.as_char[5]) ? -1 : 0;
  __res.as_char[6] = (__mu1.as_char[6] == __mu2.as_char[6]) ? -1 : 0;
  __res.as_char[7] = (__mu1.as_char[7] == __mu2.as_char[7]) ? -1 : 0;

  return (__m64)__res.as_m64;
#endif
}
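
/* Illustrative sketch only (never compiled): each byte of the comparison
   result is 0xFF where the corresponding bytes compare equal and 0x00
   elsewhere.  The helper name is invented for the example.  */
#if 0
static void cmpeq_example(void) {
  __m64 __a = _mm_set_pi8(8, 7, 6, 5, 4, 3, 2, 1);
  __m64 __b = _mm_set_pi8(8, 0, 6, 0, 4, 0, 2, 0);
  /* The lanes holding 2, 4, 6 and 8 compare equal, so those result bytes
     are 0xFF; the remaining bytes are 0x00.  */
  __m64 __r = _mm_cmpeq_pi8(__a, __b);
  (void)__r;
}
#endif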

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pcmpeqb(__m64 __m1, __m64 __m2) {
  return _mm_cmpeq_pi8(__m1, __m2);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpgt_pi8(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector signed char __a, __b, __c;

  __a = (__vector signed char)vec_splats(__m1);
  __b = (__vector signed char)vec_splats(__m2);
  __c = (__vector signed char)vec_cmpgt(__a, __b);
  return (__m64)((__vector long long)__c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_char[0] = (__mu1.as_char[0] > __mu2.as_char[0]) ? -1 : 0;
  __res.as_char[1] = (__mu1.as_char[1] > __mu2.as_char[1]) ? -1 : 0;
  __res.as_char[2] = (__mu1.as_char[2] > __mu2.as_char[2]) ? -1 : 0;
  __res.as_char[3] = (__mu1.as_char[3] > __mu2.as_char[3]) ? -1 : 0;
  __res.as_char[4] = (__mu1.as_char[4] > __mu2.as_char[4]) ? -1 : 0;
  __res.as_char[5] = (__mu1.as_char[5] > __mu2.as_char[5]) ? -1 : 0;
  __res.as_char[6] = (__mu1.as_char[6] > __mu2.as_char[6]) ? -1 : 0;
  __res.as_char[7] = (__mu1.as_char[7] > __mu2.as_char[7]) ? -1 : 0;

  return (__m64)__res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pcmpgtb(__m64 __m1, __m64 __m2) {
  return _mm_cmpgt_pi8(__m1, __m2);
}

/* Compare four 16-bit values.  The result of the comparison is 0xFFFF if
   the test is true and zero if false.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpeq_pi16(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector signed short __a, __b, __c;

  __a = (__vector signed short)vec_splats(__m1);
  __b = (__vector signed short)vec_splats(__m2);
  __c = (__vector signed short)vec_cmpeq(__a, __b);
  return (__m64)((__vector long long)__c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_short[0] = (__mu1.as_short[0] == __mu2.as_short[0]) ? -1 : 0;
  __res.as_short[1] = (__mu1.as_short[1] == __mu2.as_short[1]) ? -1 : 0;
  __res.as_short[2] = (__mu1.as_short[2] == __mu2.as_short[2]) ? -1 : 0;
  __res.as_short[3] = (__mu1.as_short[3] == __mu2.as_short[3]) ? -1 : 0;

  return (__m64)__res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pcmpeqw(__m64 __m1, __m64 __m2) {
  return _mm_cmpeq_pi16(__m1, __m2);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpgt_pi16(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector signed short __a, __b, __c;

  __a = (__vector signed short)vec_splats(__m1);
  __b = (__vector signed short)vec_splats(__m2);
  __c = (__vector signed short)vec_cmpgt(__a, __b);
  return (__m64)((__vector long long)__c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_short[0] = (__mu1.as_short[0] > __mu2.as_short[0]) ? -1 : 0;
  __res.as_short[1] = (__mu1.as_short[1] > __mu2.as_short[1]) ? -1 : 0;
  __res.as_short[2] = (__mu1.as_short[2] > __mu2.as_short[2]) ? -1 : 0;
  __res.as_short[3] = (__mu1.as_short[3] > __mu2.as_short[3]) ? -1 : 0;

  return (__m64)__res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pcmpgtw(__m64 __m1, __m64 __m2) {
  return _mm_cmpgt_pi16(__m1, __m2);
}

/* Compare two 32-bit values.  The result of the comparison is 0xFFFFFFFF if
   the test is true and zero if false.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpeq_pi32(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR9
  __vector signed int __a, __b, __c;

  __a = (__vector signed int)vec_splats(__m1);
  __b = (__vector signed int)vec_splats(__m2);
  __c = (__vector signed int)vec_cmpeq(__a, __b);
  return (__m64)((__vector long long)__c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_int[0] = (__mu1.as_int[0] == __mu2.as_int[0]) ? -1 : 0;
  __res.as_int[1] = (__mu1.as_int[1] == __mu2.as_int[1]) ? -1 : 0;

  return (__m64)__res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pcmpeqd(__m64 __m1, __m64 __m2) {
  return _mm_cmpeq_pi32(__m1, __m2);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpgt_pi32(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR9
  __vector signed int __a, __b, __c;

  __a = (__vector signed int)vec_splats(__m1);
  __b = (__vector signed int)vec_splats(__m2);
  __c = (__vector signed int)vec_cmpgt(__a, __b);
  return (__m64)((__vector long long)__c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_int[0] = (__mu1.as_int[0] > __mu2.as_int[0]) ? -1 : 0;
  __res.as_int[1] = (__mu1.as_int[1] > __mu2.as_int[1]) ? -1 : 0;

  return (__m64)__res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pcmpgtd(__m64 __m1, __m64 __m2) {
  return _mm_cmpgt_pi32(__m1, __m2);
}

#if _ARCH_PWR8
/* Add the 8-bit values in M1 to the 8-bit values in M2 using signed
   saturated arithmetic.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_adds_pi8(__m64 __m1, __m64 __m2) {
  __vector signed char __a, __b, __c;

  __a = (__vector signed char)vec_splats(__m1);
  __b = (__vector signed char)vec_splats(__m2);
  __c = vec_adds(__a, __b);
  return (__m64)((__vector long long)__c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_paddsb(__m64 __m1, __m64 __m2) {
  return _mm_adds_pi8(__m1, __m2);
}
/* Add the 16-bit values in M1 to the 16-bit values in M2 using signed
   saturated arithmetic.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_adds_pi16(__m64 __m1, __m64 __m2) {
  __vector signed short __a, __b, __c;

  __a = (__vector signed short)vec_splats(__m1);
  __b = (__vector signed short)vec_splats(__m2);
  __c = vec_adds(__a, __b);
  return (__m64)((__vector long long)__c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_paddsw(__m64 __m1, __m64 __m2) {
  return _mm_adds_pi16(__m1, __m2);
}
/* Add the 8-bit values in M1 to the 8-bit values in M2 using unsigned
   saturated arithmetic.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_adds_pu8(__m64 __m1, __m64 __m2) {
  __vector unsigned char __a, __b, __c;

  __a = (__vector unsigned char)vec_splats(__m1);
  __b = (__vector unsigned char)vec_splats(__m2);
  __c = vec_adds(__a, __b);
  return (__m64)((__vector long long)__c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_paddusb(__m64 __m1, __m64 __m2) {
  return _mm_adds_pu8(__m1, __m2);
}

/* Add the 16-bit values in M1 to the 16-bit values in M2 using unsigned
   saturated arithmetic.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_adds_pu16(__m64 __m1, __m64 __m2) {
  __vector unsigned short __a, __b, __c;

  __a = (__vector unsigned short)vec_splats(__m1);
  __b = (__vector unsigned short)vec_splats(__m2);
  __c = vec_adds(__a, __b);
  return (__m64)((__vector long long)__c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_paddusw(__m64 __m1, __m64 __m2) {
  return _mm_adds_pu16(__m1, __m2);
}

/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using signed
   saturating arithmetic.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_subs_pi8(__m64 __m1, __m64 __m2) {
  __vector signed char __a, __b, __c;

  __a = (__vector signed char)vec_splats(__m1);
  __b = (__vector signed char)vec_splats(__m2);
  __c = vec_subs(__a, __b);
  return (__m64)((__vector long long)__c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psubsb(__m64 __m1, __m64 __m2) {
  return _mm_subs_pi8(__m1, __m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
   signed saturating arithmetic.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_subs_pi16(__m64 __m1, __m64 __m2) {
  __vector signed short __a, __b, __c;

  __a = (__vector signed short)vec_splats(__m1);
  __b = (__vector signed short)vec_splats(__m2);
  __c = vec_subs(__a, __b);
  return (__m64)((__vector long long)__c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psubsw(__m64 __m1, __m64 __m2) {
  return _mm_subs_pi16(__m1, __m2);
}

/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using
   unsigned saturating arithmetic.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_subs_pu8(__m64 __m1, __m64 __m2) {
  __vector unsigned char __a, __b, __c;

  __a = (__vector unsigned char)vec_splats(__m1);
  __b = (__vector unsigned char)vec_splats(__m2);
  __c = vec_subs(__a, __b);
  return (__m64)((__vector long long)__c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psubusb(__m64 __m1, __m64 __m2) {
  return _mm_subs_pu8(__m1, __m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
   unsigned saturating arithmetic.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_subs_pu16(__m64 __m1, __m64 __m2) {
  __vector unsigned short __a, __b, __c;

  __a = (__vector unsigned short)vec_splats(__m1);
  __b = (__vector unsigned short)vec_splats(__m2);
  __c = vec_subs(__a, __b);
  return (__m64)((__vector long long)__c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psubusw(__m64 __m1, __m64 __m2) {
  return _mm_subs_pu16(__m1, __m2);
}

/* Multiply four 16-bit values in M1 by four 16-bit values in M2 producing
   four 32-bit intermediate results, which are then summed by pairs to
   produce two 32-bit results.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_madd_pi16(__m64 __m1, __m64 __m2) {
  __vector signed short __a, __b;
  __vector signed int __c;
  __vector signed int __zero = {0, 0, 0, 0};

  __a = (__vector signed short)vec_splats(__m1);
  __b = (__vector signed short)vec_splats(__m2);
  __c = vec_vmsumshm(__a, __b, __zero);
  return (__m64)((__vector long long)__c)[0];
}
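
/* Illustrative sketch only (never compiled): _mm_madd_pi16 forms dot
   products of adjacent 16-bit lane pairs.  The helper name is invented for
   the example.  */
#if 0
static void madd_example(void) {
  __m64 __a = _mm_set_pi16(4, 3, 2, 1);
  __m64 __b = _mm_set_pi16(8, 7, 6, 5);
  /* Result lanes: 1*5 + 2*6 = 17 and 3*7 + 4*8 = 53, i.e. the same value
     as _mm_set_pi32(53, 17).  */
  __m64 __r = _mm_madd_pi16(__a, __b);
  (void)__r;
}
#endif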

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pmaddwd(__m64 __m1, __m64 __m2) {
  return _mm_madd_pi16(__m1, __m2);
}
/* Multiply four signed 16-bit values in M1 by four signed 16-bit values in
   M2 and produce the high 16 bits of the 32-bit results.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mulhi_pi16(__m64 __m1, __m64 __m2) {
  __vector signed short __a, __b;
  __vector signed short __c;
  __vector signed int __w0, __w1;
  __vector unsigned char __xform1 = {
#ifdef __LITTLE_ENDIAN__
      0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17, 0x0A,
      0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
#else
      0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15, 0x00,
      0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15
#endif
  };

  __a = (__vector signed short)vec_splats(__m1);
  __b = (__vector signed short)vec_splats(__m2);

  __w0 = vec_vmulesh(__a, __b);
  __w1 = vec_vmulosh(__a, __b);
  __c = (__vector signed short)vec_perm(__w0, __w1, __xform1);

  return (__m64)((__vector long long)__c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pmulhw(__m64 __m1, __m64 __m2) {
  return _mm_mulhi_pi16(__m1, __m2);
}

/* Multiply four 16-bit values in M1 by four 16-bit values in M2 and produce
   the low 16 bits of the results.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mullo_pi16(__m64 __m1, __m64 __m2) {
  __vector signed short __a, __b, __c;

  __a = (__vector signed short)vec_splats(__m1);
  __b = (__vector signed short)vec_splats(__m2);
  __c = __a * __b;
  return (__m64)((__vector long long)__c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pmullw(__m64 __m1, __m64 __m2) {
  return _mm_mullo_pi16(__m1, __m2);
}

/* Shift four 16-bit values in M left by COUNT.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sll_pi16(__m64 __m, __m64 __count) {
  __vector signed short __r;
  __vector unsigned short __c;

  if (__count <= 15) {
    __r = (__vector signed short)vec_splats(__m);
    __c = (__vector unsigned short)vec_splats((unsigned short)__count);
    __r = vec_sl(__r, (__vector unsigned short)__c);
    return (__m64)((__vector long long)__r)[0];
  } else
    return (0);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psllw(__m64 __m, __m64 __count) {
  return _mm_sll_pi16(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_slli_pi16(__m64 __m, int __count) {
  /* Promote int to long then invoke mm_sll_pi16.  */
  return _mm_sll_pi16(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psllwi(__m64 __m, int __count) {
  return _mm_slli_pi16(__m, __count);
}

/* Shift two 32-bit values in M left by COUNT.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sll_pi32(__m64 __m, __m64 __count) {
  __m64_union __res;

  __res.as_m64 = __m;

  __res.as_int[0] = __res.as_int[0] << __count;
  __res.as_int[1] = __res.as_int[1] << __count;
  return (__res.as_m64);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pslld(__m64 __m, __m64 __count) {
  return _mm_sll_pi32(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_slli_pi32(__m64 __m, int __count) {
  /* Promote int to long then invoke mm_sll_pi32.  */
  return _mm_sll_pi32(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pslldi(__m64 __m, int __count) {
  return _mm_slli_pi32(__m, __count);
}

/* Shift four 16-bit values in M right by COUNT; shift in the sign bit.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sra_pi16(__m64 __m, __m64 __count) {
  __vector signed short __r;
  __vector unsigned short __c;

  if (__count <= 15) {
    __r = (__vector signed short)vec_splats(__m);
    __c = (__vector unsigned short)vec_splats((unsigned short)__count);
    __r = vec_sra(__r, (__vector unsigned short)__c);
    return (__m64)((__vector long long)__r)[0];
  } else
    return (0);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psraw(__m64 __m, __m64 __count) {
  return _mm_sra_pi16(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srai_pi16(__m64 __m, int __count) {
  /* Promote int to long then invoke mm_sra_pi16.  */
  return _mm_sra_pi16(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psrawi(__m64 __m, int __count) {
  return _mm_srai_pi16(__m, __count);
}

/* Shift two 32-bit values in M right by COUNT; shift in the sign bit.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sra_pi32(__m64 __m, __m64 __count) {
  __m64_union __res;

  __res.as_m64 = __m;

  __res.as_int[0] = __res.as_int[0] >> __count;
  __res.as_int[1] = __res.as_int[1] >> __count;
  return (__res.as_m64);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psrad(__m64 __m, __m64 __count) {
  return _mm_sra_pi32(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srai_pi32(__m64 __m, int __count) {
  /* Promote int to long then invoke mm_sra_pi32.  */
  return _mm_sra_pi32(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psradi(__m64 __m, int __count) {
  return _mm_srai_pi32(__m, __count);
}

/* Shift four 16-bit values in M right by COUNT; shift in zeros.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srl_pi16(__m64 __m, __m64 __count) {
  __vector unsigned short __r;
  __vector unsigned short __c;

  if (__count <= 15) {
    __r = (__vector unsigned short)vec_splats(__m);
    __c = (__vector unsigned short)vec_splats((unsigned short)__count);
    __r = vec_sr(__r, (__vector unsigned short)__c);
    return (__m64)((__vector long long)__r)[0];
  } else
    return (0);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psrlw(__m64 __m, __m64 __count) {
  return _mm_srl_pi16(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srli_pi16(__m64 __m, int __count) {
  /* Promote int to long then invoke mm_srl_pi16.  */
  return _mm_srl_pi16(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psrlwi(__m64 __m, int __count) {
  return _mm_srli_pi16(__m, __count);
}

/* Shift two 32-bit values in M right by COUNT; shift in zeros.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srl_pi32(__m64 __m, __m64 __count) {
  __m64_union __res;

  __res.as_m64 = __m;

  __res.as_int[0] = (unsigned int)__res.as_int[0] >> __count;
  __res.as_int[1] = (unsigned int)__res.as_int[1] >> __count;
  return (__res.as_m64);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psrld(__m64 __m, __m64 __count) {
  return _mm_srl_pi32(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srli_pi32(__m64 __m, int __count) {
  /* Promote int to long then invoke mm_srl_pi32.  */
  return _mm_srl_pi32(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psrldi(__m64 __m, int __count) {
  return _mm_srli_pi32(__m, __count);
}
#endif /* _ARCH_PWR8 */

/* Creates a vector of two 32-bit values; I0 is least significant.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set_pi32(int __i1, int __i0) {
  __m64_union __res;

  __res.as_int[0] = __i0;
  __res.as_int[1] = __i1;
  return (__res.as_m64);
}

/* Creates a vector of four 16-bit values; W0 is least significant.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set_pi16(short __w3, short __w2, short __w1, short __w0) {
  __m64_union __res;

  __res.as_short[0] = __w0;
  __res.as_short[1] = __w1;
  __res.as_short[2] = __w2;
  __res.as_short[3] = __w3;
  return (__res.as_m64);
}

/* Creates a vector of eight 8-bit values; B0 is least significant.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set_pi8(char __b7, char __b6, char __b5, char __b4, char __b3,
                char __b2, char __b1, char __b0) {
  __m64_union __res;

  __res.as_char[0] = __b0;
  __res.as_char[1] = __b1;
  __res.as_char[2] = __b2;
  __res.as_char[3] = __b3;
  __res.as_char[4] = __b4;
  __res.as_char[5] = __b5;
  __res.as_char[6] = __b6;
  __res.as_char[7] = __b7;
  return (__res.as_m64);
}

/* Similar, but with the arguments in reverse order.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_setr_pi32(int __i0, int __i1) {
  __m64_union __res;

  __res.as_int[0] = __i0;
  __res.as_int[1] = __i1;
  return (__res.as_m64);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_setr_pi16(short __w0, short __w1, short __w2, short __w3) {
  return _mm_set_pi16(__w3, __w2, __w1, __w0);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_setr_pi8(char __b0, char __b1, char __b2, char __b3, char __b4,
                 char __b5, char __b6, char __b7) {
  return _mm_set_pi8(__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
}

/* Creates a vector of two 32-bit values, both elements containing I.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set1_pi32(int __i) {
  __m64_union __res;

  __res.as_int[0] = __i;
  __res.as_int[1] = __i;
  return (__res.as_m64);
}

/* Creates a vector of four 16-bit values, all elements containing W.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set1_pi16(short __w) {
#if _ARCH_PWR9
  __vector signed short w;

  w = (__vector signed short)vec_splats(__w);
  return (__m64)((__vector long long)w)[0];
#else
  __m64_union __res;

  __res.as_short[0] = __w;
  __res.as_short[1] = __w;
  __res.as_short[2] = __w;
  __res.as_short[3] = __w;
  return (__res.as_m64);
#endif
}

/* Creates a vector of eight 8-bit values, all elements containing B.  */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set1_pi8(signed char __b) {
#if _ARCH_PWR8
  __vector signed char __res;

  __res = (__vector signed char)vec_splats(__b);
  return (__m64)((__vector long long)__res)[0];
#else
  __m64_union __res;

  __res.as_char[0] = __b;
  __res.as_char[1] = __b;
  __res.as_char[2] = __b;
  __res.as_char[3] = __b;
  __res.as_char[4] = __b;
  __res.as_char[5] = __b;
  __res.as_char[6] = __b;
  __res.as_char[7] = __b;
  return (__res.as_m64);
#endif
}

#else
#include_next <mmintrin.h>
#endif /* defined(__powerpc64__) &&                                            \
        *   (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)) */

#endif /* _MMINTRIN_H_INCLUDED */