/*===---- smmintrin.h - Implementation of SSE4 intrinsics on PowerPC -------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0.

   NOTE: This is NOT a complete implementation of the SSE4 intrinsics!  */

#ifndef NO_WARN_X86_INTRINSICS
/* This header is distributed to simplify porting x86_64 code that
   makes explicit use of Intel intrinsics to powerpc64/powerpc64le.

   It is the user's responsibility to determine if the results are
   acceptable and make additional changes as necessary.

   Note that much code that uses Intel intrinsics can be rewritten in
   standard C or GNU C extensions, which are more portable and better
   optimized across multiple targets.  */
#error                                                                         \
    "Please read comment above.  Use -DNO_WARN_X86_INTRINSICS to disable this error."
#endif

#ifndef SMMINTRIN_H_
#define SMMINTRIN_H_

#if defined(__powerpc64__) &&                                                  \
    (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX))

#include <altivec.h>
#include <tmmintrin.h>

/* Rounding mode macros. */
#define _MM_FROUND_TO_NEAREST_INT 0x00
#define _MM_FROUND_TO_ZERO 0x01
#define _MM_FROUND_TO_POS_INF 0x02
#define _MM_FROUND_TO_NEG_INF 0x03
#define _MM_FROUND_CUR_DIRECTION 0x04

#define _MM_FROUND_NINT (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_FLOOR (_MM_FROUND_TO_NEG_INF | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_CEIL (_MM_FROUND_TO_POS_INF | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_TRUNC (_MM_FROUND_TO_ZERO | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_RINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_NEARBYINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC)

#define _MM_FROUND_RAISE_EXC 0x00
#define _MM_FROUND_NO_EXC 0x08

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_round_pd(__m128d __A, int __rounding) {
  __v2df __r;
  union {
    double __fr;
    long long __fpscr;
  } __enables_save, __fpscr_save;

  if (__rounding & _MM_FROUND_NO_EXC) {
    /* Save enabled exceptions, disable all exceptions,
       and preserve the rounding mode.  */
#ifdef _ARCH_PWR9
    __asm__("mffsce %0" : "=f"(__fpscr_save.__fr));
    __enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
#else
    __fpscr_save.__fr = __builtin_ppc_mffs();
    __enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
    __fpscr_save.__fpscr &= ~0xf8;
    __builtin_ppc_mtfsf(0b00000011, __fpscr_save.__fr);
#endif
    /* Insert an artificial "read/write" reference to the variable
       read below, to ensure the compiler does not schedule
       a read/use of the variable before the FPSCR is modified, above.
       This can be removed if and when GCC PR102783 is fixed.
     */
    __asm__("" : "+wa"(__A));
  }

  switch (__rounding) {
  case _MM_FROUND_TO_NEAREST_INT:
#ifdef _ARCH_PWR9
    __fpscr_save.__fr = __builtin_ppc_mffsl();
#else
    __fpscr_save.__fr = __builtin_ppc_mffs();
    __fpscr_save.__fpscr &= 0x70007f0ffL;
#endif
    __attribute__((fallthrough));
  case _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC:
    __builtin_ppc_set_fpscr_rn(0b00);
    /* Insert an artificial "read/write" reference to the variable
       read below, to ensure the compiler does not schedule
       a read/use of the variable before the FPSCR is modified, above.
       This can be removed if and when GCC PR102783 is fixed.
     */
    __asm__("" : "+wa"(__A));

    __r = vec_rint((__v2df)__A);

    /* Insert an artificial "read" reference to the variable written
       above, to ensure the compiler does not schedule the computation
       of the value after the manipulation of the FPSCR, below.
       This can be removed if and when GCC PR102783 is fixed.
     */
    __asm__("" : : "wa"(__r));
    __builtin_ppc_set_fpscr_rn(__fpscr_save.__fpscr);
    break;
  case _MM_FROUND_TO_NEG_INF:
  case _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC:
    __r = vec_floor((__v2df)__A);
    break;
  case _MM_FROUND_TO_POS_INF:
  case _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC:
    __r = vec_ceil((__v2df)__A);
    break;
  case _MM_FROUND_TO_ZERO:
  case _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC:
    __r = vec_trunc((__v2df)__A);
    break;
  case _MM_FROUND_CUR_DIRECTION:
    __r = vec_rint((__v2df)__A);
    break;
  }
  if (__rounding & _MM_FROUND_NO_EXC) {
    /* Insert an artificial "read" reference to the variable written
       above, to ensure the compiler does not schedule the computation
       of the value after the manipulation of the FPSCR, below.
       This can be removed if and when GCC PR102783 is fixed.
     */
    __asm__("" : : "wa"(__r));
    /* Restore enabled exceptions.  */
#ifdef _ARCH_PWR9
    __fpscr_save.__fr = __builtin_ppc_mffsl();
#else
    __fpscr_save.__fr = __builtin_ppc_mffs();
    __fpscr_save.__fpscr &= 0x70007f0ffL;
#endif
    __fpscr_save.__fpscr |= __enables_save.__fpscr;
    __builtin_ppc_mtfsf(0b00000011, __fpscr_save.__fr);
  }
  return (__m128d)__r;
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_round_sd(__m128d __A, __m128d __B, int __rounding) {
  __B = _mm_round_pd(__B, __rounding);
  __v2df __r = {((__v2df)__B)[0], ((__v2df)__A)[1]};
  return (__m128d)__r;
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_round_ps(__m128 __A, int __rounding) {
  __v4sf __r;
  union {
    double __fr;
    long long __fpscr;
  } __enables_save, __fpscr_save;

  if (__rounding & _MM_FROUND_NO_EXC) {
    /* Save enabled exceptions, disable all exceptions,
       and preserve the rounding mode.  */
#ifdef _ARCH_PWR9
    __asm__("mffsce %0" : "=f"(__fpscr_save.__fr));
    __enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
#else
    __fpscr_save.__fr = __builtin_ppc_mffs();
    __enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
    __fpscr_save.__fpscr &= ~0xf8;
    __builtin_ppc_mtfsf(0b00000011, __fpscr_save.__fr);
#endif
    /* Insert an artificial "read/write" reference to the variable
       read below, to ensure the compiler does not schedule
       a read/use of the variable before the FPSCR is modified, above.
       This can be removed if and when GCC PR102783 is fixed.
     */
    __asm__("" : "+wa"(__A));
  }

  switch (__rounding) {
  case _MM_FROUND_TO_NEAREST_INT:
#ifdef _ARCH_PWR9
    __fpscr_save.__fr = __builtin_ppc_mffsl();
#else
    __fpscr_save.__fr = __builtin_ppc_mffs();
    __fpscr_save.__fpscr &= 0x70007f0ffL;
#endif
    __attribute__((fallthrough));
  case _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC:
    __builtin_ppc_set_fpscr_rn(0b00);
    /* Insert an artificial "read/write" reference to the variable
       read below, to ensure the compiler does not schedule
       a read/use of the variable before the FPSCR is modified, above.
       This can be removed if and when GCC PR102783 is fixed.
     */
    __asm__("" : "+wa"(__A));

    __r = vec_rint((__v4sf)__A);

    /* Insert an artificial "read" reference to the variable written
       above, to ensure the compiler does not schedule the computation
       of the value after the manipulation of the FPSCR, below.
       This can be removed if and when GCC PR102783 is fixed.
     */
    __asm__("" : : "wa"(__r));
    __builtin_ppc_set_fpscr_rn(__fpscr_save.__fpscr);
    break;
  case _MM_FROUND_TO_NEG_INF:
  case _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC:
    __r = vec_floor((__v4sf)__A);
    break;
  case _MM_FROUND_TO_POS_INF:
  case _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC:
    __r = vec_ceil((__v4sf)__A);
    break;
  case _MM_FROUND_TO_ZERO:
  case _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC:
    __r = vec_trunc((__v4sf)__A);
    break;
  case _MM_FROUND_CUR_DIRECTION:
    __r = vec_rint((__v4sf)__A);
    break;
  }
  if (__rounding & _MM_FROUND_NO_EXC) {
    /* Insert an artificial "read" reference to the variable written
       above, to ensure the compiler does not schedule the computation
       of the value after the manipulation of the FPSCR, below.
       This can be removed if and when GCC PR102783 is fixed.
     */
    __asm__("" : : "wa"(__r));
    /* Restore enabled exceptions.  */
#ifdef _ARCH_PWR9
    __fpscr_save.__fr = __builtin_ppc_mffsl();
#else
    __fpscr_save.__fr = __builtin_ppc_mffs();
    __fpscr_save.__fpscr &= 0x70007f0ffL;
#endif
    __fpscr_save.__fpscr |= __enables_save.__fpscr;
    __builtin_ppc_mtfsf(0b00000011, __fpscr_save.__fr);
  }
  return (__m128)__r;
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_round_ss(__m128 __A, __m128 __B, int __rounding) {
  __B = _mm_round_ps(__B, __rounding);
  __v4sf __r = (__v4sf)__A;
  __r[0] = ((__v4sf)__B)[0];
  return (__m128)__r;
}

#define _mm_ceil_pd(V) _mm_round_pd((V), _MM_FROUND_CEIL)
#define _mm_ceil_sd(D, V) _mm_round_sd((D), (V), _MM_FROUND_CEIL)

#define _mm_floor_pd(V) _mm_round_pd((V), _MM_FROUND_FLOOR)
#define _mm_floor_sd(D, V) _mm_round_sd((D), (V), _MM_FROUND_FLOOR)

#define _mm_ceil_ps(V) _mm_round_ps((V), _MM_FROUND_CEIL)
#define _mm_ceil_ss(D, V) _mm_round_ss((D), (V), _MM_FROUND_CEIL)

#define _mm_floor_ps(V) _mm_round_ps((V), _MM_FROUND_FLOOR)
#define _mm_floor_ss(D, V) _mm_round_ss((D), (V), _MM_FROUND_FLOOR)
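
/* Illustrative usage sketch (an assumption, not part of the intrinsic
   definitions above): rounding toward negative infinity without raising
   floating-point exceptions can be requested by combining the
   _MM_FROUND_* flags, e.g., assuming _mm_set_pd is available from the
   headers pulled in via <tmmintrin.h>:

     __m128d __v = _mm_set_pd(2.5, -1.5);
     __m128d __f = _mm_round_pd(__v, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);

   This yields the same values as _mm_floor_pd(__v), but _MM_FROUND_NO_EXC
   additionally saves, disables, and restores the FPSCR exception enables
   around the rounding.  */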

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_insert_epi8(__m128i const __A, int const __D, int const __N) {
  __v16qi __result = (__v16qi)__A;

  __result[__N & 0xf] = __D;

  return (__m128i)__result;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_insert_epi32(__m128i const __A, int const __D, int const __N) {
  __v4si __result = (__v4si)__A;

  __result[__N & 3] = __D;

  return (__m128i)__result;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_insert_epi64(__m128i const __A, long long const __D, int const __N) {
  __v2di __result = (__v2di)__A;

  __result[__N & 1] = __D;

  return (__m128i)__result;
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_extract_epi8(__m128i __X, const int __N) {
  return (unsigned char)((__v16qi)__X)[__N & 15];
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_extract_epi32(__m128i __X, const int __N) {
  return ((__v4si)__X)[__N & 3];
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_extract_epi64(__m128i __X, const int __N) {
  return ((__v2di)__X)[__N & 1];
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_extract_ps(__m128 __X, const int __N) {
  return ((__v4si)__X)[__N & 3];
}
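
/* Illustrative usage sketch (an assumption for documentation only): as
   with the Intel intrinsic, _mm_extract_ps returns the raw bit pattern
   of the selected float lane as an int.  Assuming _mm_set_ps from the
   transitively included headers, the bits can be reinterpreted back
   into a float, for example:

     __m128 __v = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);
     int __bits = _mm_extract_ps(__v, 2);
     float __f;
     __builtin_memcpy(&__f, &__bits, sizeof(__f));

   after which __f compares equal to 3.0f.  */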

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_blend_epi16(__m128i __A, __m128i __B, const int __imm8) {
  __v16qu __charmask = vec_splats((unsigned char)__imm8);
  __charmask = vec_gb(__charmask);
  __v8hu __shortmask = (__v8hu)vec_unpackh((__v16qi)__charmask);
#ifdef __BIG_ENDIAN__
  __shortmask = vec_reve(__shortmask);
#endif
  return (__m128i)vec_sel((__v8hu)__A, (__v8hu)__B, __shortmask);
}
#endif

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_blendv_epi8(__m128i __A, __m128i __B, __m128i __mask) {
#ifdef _ARCH_PWR10
  return (__m128i)vec_blendv((__v16qi)__A, (__v16qi)__B, (__v16qu)__mask);
#else
  const __v16qu __seven = vec_splats((unsigned char)0x07);
  __v16qu __lmask = vec_sra((__v16qu)__mask, __seven);
  return (__m128i)vec_sel((__v16qi)__A, (__v16qi)__B, __lmask);
#endif
}
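
/* Illustrative usage sketch (an assumption for documentation only): as
   with the Intel intrinsic, only the most significant bit of each mask
   byte selects between the inputs, so the mask is commonly produced by
   a comparison.  Assuming _mm_cmpgt_epi8 from the transitively included
   <emmintrin.h>:

     __m128i __mask = _mm_cmpgt_epi8(__a, __b);
     __m128i __max = _mm_blendv_epi8(__b, __a, __mask);

   computes a per-byte signed maximum of __a and __b.  */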

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_blend_ps(__m128 __A, __m128 __B, const int __imm8) {
  __v16qu __pcv[] = {
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
      {16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
      {0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15},
      {16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15},
      {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 12, 13, 14, 15},
      {16, 17, 18, 19, 4, 5, 6, 7, 24, 25, 26, 27, 12, 13, 14, 15},
      {0, 1, 2, 3, 20, 21, 22, 23, 24, 25, 26, 27, 12, 13, 14, 15},
      {16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 12, 13, 14, 15},
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 28, 29, 30, 31},
      {16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 28, 29, 30, 31},
      {0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31},
      {16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31},
      {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31},
      {16, 17, 18, 19, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31},
      {0, 1, 2, 3, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31},
      {16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31},
  };
  __v16qu __r = vec_perm((__v16qu)__A, (__v16qu)__B, __pcv[__imm8]);
  return (__m128)__r;
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_blendv_ps(__m128 __A, __m128 __B, __m128 __mask) {
#ifdef _ARCH_PWR10
  return (__m128)vec_blendv((__v4sf)__A, (__v4sf)__B, (__v4su)__mask);
#else
  const __v4si __zero = {0};
  const __vector __bool int __boolmask = vec_cmplt((__v4si)__mask, __zero);
  return (__m128)vec_sel((__v4su)__A, (__v4su)__B, (__v4su)__boolmask);
#endif
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_blend_pd(__m128d __A, __m128d __B, const int __imm8) {
  __v16qu __pcv[] = {
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
      {16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15},
      {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31},
      {16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}};
  __v16qu __r = vec_perm((__v16qu)__A, (__v16qu)__B, __pcv[__imm8]);
  return (__m128d)__r;
}

#ifdef _ARCH_PWR8
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_blendv_pd(__m128d __A, __m128d __B, __m128d __mask) {
#ifdef _ARCH_PWR10
  return (__m128d)vec_blendv((__v2df)__A, (__v2df)__B, (__v2du)__mask);
#else
  const __v2di __zero = {0};
  const __vector __bool long long __boolmask =
      vec_cmplt((__v2di)__mask, __zero);
  return (__m128d)vec_sel((__v2du)__A, (__v2du)__B, (__v2du)__boolmask);
#endif
}
#endif

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_testz_si128(__m128i __A, __m128i __B) {
  /* Note: This implementation does NOT set "zero" or "carry" flags.  */
  const __v16qu __zero = {0};
  return vec_all_eq(vec_and((__v16qu)__A, (__v16qu)__B), __zero);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_testc_si128(__m128i __A, __m128i __B) {
  /* Note: This implementation does NOT set "zero" or "carry" flags.  */
  const __v16qu __zero = {0};
  const __v16qu __notA = vec_nor((__v16qu)__A, (__v16qu)__A);
  return vec_all_eq(vec_and((__v16qu)__notA, (__v16qu)__B), __zero);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_testnzc_si128(__m128i __A, __m128i __B) {
  /* Note: This implementation does NOT set "zero" or "carry" flags.  */
  return _mm_testz_si128(__A, __B) == 0 && _mm_testc_si128(__A, __B) == 0;
}

#define _mm_test_all_zeros(M, V) _mm_testz_si128((M), (V))

#define _mm_test_all_ones(V) _mm_testc_si128((V), _mm_cmpeq_epi32((V), (V)))

#define _mm_test_mix_ones_zeros(M, V) _mm_testnzc_si128((M), (V))

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpeq_epi64(__m128i __X, __m128i __Y) {
  return (__m128i)vec_cmpeq((__v2di)__X, (__v2di)__Y);
}
#endif

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_min_epi8(__m128i __X, __m128i __Y) {
  return (__m128i)vec_min((__v16qi)__X, (__v16qi)__Y);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_min_epu16(__m128i __X, __m128i __Y) {
  return (__m128i)vec_min((__v8hu)__X, (__v8hu)__Y);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_min_epi32(__m128i __X, __m128i __Y) {
  return (__m128i)vec_min((__v4si)__X, (__v4si)__Y);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_min_epu32(__m128i __X, __m128i __Y) {
  return (__m128i)vec_min((__v4su)__X, (__v4su)__Y);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_max_epi8(__m128i __X, __m128i __Y) {
  return (__m128i)vec_max((__v16qi)__X, (__v16qi)__Y);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_max_epu16(__m128i __X, __m128i __Y) {
  return (__m128i)vec_max((__v8hu)__X, (__v8hu)__Y);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_max_epi32(__m128i __X, __m128i __Y) {
  return (__m128i)vec_max((__v4si)__X, (__v4si)__Y);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_max_epu32(__m128i __X, __m128i __Y) {
  return (__m128i)vec_max((__v4su)__X, (__v4su)__Y);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mullo_epi32(__m128i __X, __m128i __Y) {
  return (__m128i)vec_mul((__v4su)__X, (__v4su)__Y);
}

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mul_epi32(__m128i __X, __m128i __Y) {
  return (__m128i)vec_mule((__v4si)__X, (__v4si)__Y);
}
#endif

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepi8_epi16(__m128i __A) {
  return (__m128i)vec_unpackh((__v16qi)__A);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepi8_epi32(__m128i __A) {
  __A = (__m128i)vec_unpackh((__v16qi)__A);
  return (__m128i)vec_unpackh((__v8hi)__A);
}

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepi8_epi64(__m128i __A) {
  __A = (__m128i)vec_unpackh((__v16qi)__A);
  __A = (__m128i)vec_unpackh((__v8hi)__A);
  return (__m128i)vec_unpackh((__v4si)__A);
}
#endif

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepi16_epi32(__m128i __A) {
  return (__m128i)vec_unpackh((__v8hi)__A);
}

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepi16_epi64(__m128i __A) {
  __A = (__m128i)vec_unpackh((__v8hi)__A);
  return (__m128i)vec_unpackh((__v4si)__A);
}
#endif

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepi32_epi64(__m128i __A) {
  return (__m128i)vec_unpackh((__v4si)__A);
}
#endif

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepu8_epi16(__m128i __A) {
  const __v16qu __zero = {0};
#ifdef __LITTLE_ENDIAN__
  __A = (__m128i)vec_mergeh((__v16qu)__A, __zero);
#else  /* __BIG_ENDIAN__.  */
  __A = (__m128i)vec_mergeh(__zero, (__v16qu)__A);
#endif /* __BIG_ENDIAN__.  */
  return __A;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepu8_epi32(__m128i __A) {
  const __v16qu __zero = {0};
#ifdef __LITTLE_ENDIAN__
  __A = (__m128i)vec_mergeh((__v16qu)__A, __zero);
  __A = (__m128i)vec_mergeh((__v8hu)__A, (__v8hu)__zero);
#else  /* __BIG_ENDIAN__.  */
  __A = (__m128i)vec_mergeh(__zero, (__v16qu)__A);
  __A = (__m128i)vec_mergeh((__v8hu)__zero, (__v8hu)__A);
#endif /* __BIG_ENDIAN__.  */
  return __A;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepu8_epi64(__m128i __A) {
  const __v16qu __zero = {0};
#ifdef __LITTLE_ENDIAN__
  __A = (__m128i)vec_mergeh((__v16qu)__A, __zero);
  __A = (__m128i)vec_mergeh((__v8hu)__A, (__v8hu)__zero);
  __A = (__m128i)vec_mergeh((__v4su)__A, (__v4su)__zero);
#else  /* __BIG_ENDIAN__.  */
  __A = (__m128i)vec_mergeh(__zero, (__v16qu)__A);
  __A = (__m128i)vec_mergeh((__v8hu)__zero, (__v8hu)__A);
  __A = (__m128i)vec_mergeh((__v4su)__zero, (__v4su)__A);
#endif /* __BIG_ENDIAN__.  */
  return __A;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepu16_epi32(__m128i __A) {
  const __v8hu __zero = {0};
#ifdef __LITTLE_ENDIAN__
  __A = (__m128i)vec_mergeh((__v8hu)__A, __zero);
#else  /* __BIG_ENDIAN__.  */
  __A = (__m128i)vec_mergeh(__zero, (__v8hu)__A);
#endif /* __BIG_ENDIAN__.  */
  return __A;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepu16_epi64(__m128i __A) {
  const __v8hu __zero = {0};
#ifdef __LITTLE_ENDIAN__
  __A = (__m128i)vec_mergeh((__v8hu)__A, __zero);
  __A = (__m128i)vec_mergeh((__v4su)__A, (__v4su)__zero);
#else  /* __BIG_ENDIAN__.  */
  __A = (__m128i)vec_mergeh(__zero, (__v8hu)__A);
  __A = (__m128i)vec_mergeh((__v4su)__zero, (__v4su)__A);
#endif /* __BIG_ENDIAN__.  */
  return __A;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepu32_epi64(__m128i __A) {
  const __v4su __zero = {0};
#ifdef __LITTLE_ENDIAN__
  __A = (__m128i)vec_mergeh((__v4su)__A, __zero);
#else  /* __BIG_ENDIAN__.  */
  __A = (__m128i)vec_mergeh(__zero, (__v4su)__A);
#endif /* __BIG_ENDIAN__.  */
  return __A;
}

/* Return horizontal packed word minimum and its index in bits [15:0]
   and bits [18:16] respectively.  */
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_minpos_epu16(__m128i __A) {
  union __u {
    __m128i __m;
    __v8hu __uh;
  };
  union __u __u = {.__m = __A}, __r = {.__m = {0}};
  unsigned short __ridx = 0;
  unsigned short __rmin = __u.__uh[__ridx];
  unsigned long __i;
  for (__i = 1; __i < 8; __i++) {
    if (__u.__uh[__i] < __rmin) {
      __rmin = __u.__uh[__i];
      __ridx = __i;
    }
  }
  __r.__uh[0] = __rmin;
  __r.__uh[1] = __ridx;
  return __r.__m;
}
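
/* Illustrative usage sketch (an assumption for documentation only): the
   minimum value and its index can be read back from the result with the
   SSE2 extract intrinsic, assuming _mm_extract_epi16 is available from
   the transitively included <emmintrin.h>:

     __m128i __res = _mm_minpos_epu16(__v);
     unsigned short __min = _mm_extract_epi16(__res, 0);
     unsigned short __idx = _mm_extract_epi16(__res, 1);

   This mirrors the layout documented for the Intel intrinsic.  */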

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_packus_epi32(__m128i __X, __m128i __Y) {
  return (__m128i)vec_packsu((__v4si)__X, (__v4si)__Y);
}

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpgt_epi64(__m128i __X, __m128i __Y) {
  return (__m128i)vec_cmpgt((__v2di)__X, (__v2di)__Y);
}
#endif

#else
#include_next <smmintrin.h>
#endif /* defined(__powerpc64__) &&                                            \
        *   (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)) */

#endif /* SMMINTRIN_H_ */