master
/*===---- bmi2intrin.h - Implementation of BMI2 intrinsics on PowerPC ------===
  2 *
  3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  4 * See https://llvm.org/LICENSE.txt for license information.
  5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  6 *
  7 *===-----------------------------------------------------------------------===
  8 */
  9
 10#if !defined X86GPRINTRIN_H_
 11#error "Never use <bmi2intrin.h> directly; include <x86gprintrin.h> instead."
 12#endif
 13
 14#ifndef BMI2INTRIN_H_
 15#define BMI2INTRIN_H_
 16
 17extern __inline unsigned int
 18    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 19    _bzhi_u32(unsigned int __X, unsigned int __Y) {
 20  return ((__X << (32 - __Y)) >> (32 - __Y));
 21}
 22
 23extern __inline unsigned int
 24    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 25    _mulx_u32(unsigned int __X, unsigned int __Y, unsigned int *__P) {
 26  unsigned long long __res = (unsigned long long)__X * __Y;
 27  *__P = (unsigned int)(__res >> 32);
 28  return (unsigned int)__res;
 29}
 30
 31#ifdef __PPC64__
 32extern __inline unsigned long long
 33    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 34    _bzhi_u64(unsigned long long __X, unsigned long long __Y) {
 35  return ((__X << (64 - __Y)) >> (64 - __Y));
 36}
 37
 38/* __int128 requires base 64-bit.  */
 39extern __inline unsigned long long
 40    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 41    _mulx_u64(unsigned long long __X, unsigned long long __Y,
 42              unsigned long long *__P) {
 43  unsigned __int128 __res = (unsigned __int128)__X * __Y;
 44  *__P = (unsigned long long)(__res >> 64);
 45  return (unsigned long long)__res;
 46}
 47
 48#ifdef _ARCH_PWR7
 49/* popcount and bpermd require power7 minimum.  */
 50extern __inline unsigned long long
 51    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 52    _pdep_u64(unsigned long long __X, unsigned long long __M) {
 53  unsigned long __result = 0x0UL;
 54  const unsigned long __mask = 0x8000000000000000UL;
 55  unsigned long __m = __M;
 56  unsigned long __c, __t;
 57  unsigned long __p;
 58
 59  /* The pop-count of the mask gives the number of the bits from
 60   source to process.  This is also needed to shift bits from the
 61   source into the correct position for the result.  */
 62  __p = 64 - __builtin_popcountl(__M);
 63
 64  /* The loop is for the number of '1' bits in the mask and clearing
 65   each mask bit as it is processed.  */
 66  while (__m != 0) {
 67    __c = __builtin_clzl(__m);
 68    __t = __X << (__p - __c);
 69    __m ^= (__mask >> __c);
 70    __result |= (__t & (__mask >> __c));
 71    __p++;
 72  }
 73  return __result;
 74}
 75
 76extern __inline unsigned long long
 77    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 78    _pext_u64(unsigned long long __X, unsigned long long __M) {
 79  unsigned long __p = 0x4040404040404040UL; // initial bit permute control
 80  const unsigned long __mask = 0x8000000000000000UL;
 81  unsigned long __m = __M;
 82  unsigned long __c;
 83  unsigned long __result;
 84
 85  /* if the mask is constant and selects 8 bits or less we can use
 86   the Power8 Bit permute instruction.  */
 87  if (__builtin_constant_p(__M) && (__builtin_popcountl(__M) <= 8)) {
 88    /* Also if the pext mask is constant, then the popcount is
 89     constant, we can evaluate the following loop at compile
 90     time and use a constant bit permute vector.  */
 91    long __i;
 92    for (__i = 0; __i < __builtin_popcountl(__M); __i++) {
 93      __c = __builtin_clzl(__m);
 94      __p = (__p << 8) | __c;
 95      __m ^= (__mask >> __c);
 96    }
 97    __result = __builtin_bpermd(__p, __X);
 98  } else {
 99    __p = 64 - __builtin_popcountl(__M);
100    __result = 0;
101    /* We could a use a for loop here, but that combined with
102     -funroll-loops can expand to a lot of code.  The while
103     loop avoids unrolling and the compiler commons the xor
104     from clearing the mask bit with the (m != 0) test.  The
105     result is a more compact loop setup and body.  */
106    while (__m != 0) {
107      unsigned long __t;
108      __c = __builtin_clzl(__m);
109      __t = (__X & (__mask >> __c)) >> (__p - __c);
110      __m ^= (__mask >> __c);
111      __result |= (__t);
112      __p++;
113    }
114  }
115  return __result;
116}
117
/* These 32-bit implementations depend on the 64-bit pdep/pext,
   which in turn depend on _ARCH_PWR7.  */
/* 32-bit parallel bit deposit.  Both operands are widened to 64 bits,
   handed to _pdep_u64, and the result is narrowed back to 32 bits.  */
extern __inline unsigned int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _pdep_u32(unsigned int __X, unsigned int __Y) {
  const unsigned long long __wide =
      _pdep_u64((unsigned long long)__X, (unsigned long long)__Y);
  return (unsigned int)__wide;
}
125
/* 32-bit parallel bit extract.  Both operands are widened to 64 bits,
   handed to _pext_u64, and the result is narrowed back to 32 bits.  */
extern __inline unsigned int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _pext_u32(unsigned int __X, unsigned int __Y) {
  const unsigned long long __wide =
      _pext_u64((unsigned long long)__X, (unsigned long long)__Y);
  return (unsigned int)__wide;
}
131#endif /* _ARCH_PWR7  */
132#endif /* __PPC64__  */
133
134#endif /* BMI2INTRIN_H_ */