zig/lib/include/velintrin_approx.h at master

/*===---- velintrin_approx.h - VEL intrinsics helper for VE ----------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
  9#ifndef __VEL_INTRIN_APPROX_H__
 10#define __VEL_INTRIN_APPROX_H__
 11
 12static inline __vr _vel_approx_vfdivs_vvvl(__vr v0, __vr v1, int l) {
 13  float s0;
 14  __vr v2, v3, v4, v5;
 15  v5 = _vel_vrcps_vvl(v1, l);
 16  s0 = 1.0;
 17  v4 = _vel_vfnmsbs_vsvvl(s0, v1, v5, l);
 18  v3 = _vel_vfmads_vvvvl(v5, v5, v4, l);
 19  v2 = _vel_vfmuls_vvvl(v0, v3, l);
 20  v4 = _vel_vfnmsbs_vvvvl(v0, v2, v1, l);
 21  v2 = _vel_vfmads_vvvvl(v2, v5, v4, l);
 22  v0 = _vel_vfnmsbs_vvvvl(v0, v2, v1, l);
 23  v0 = _vel_vfmads_vvvvl(v2, v3, v0, l);
 24  return v0;
 25}
 26
 27static inline __vr _vel_approx_pvfdiv_vvvl(__vr v0, __vr v1, int l) {
 28  float s0;
 29  __vr v2, v3, v4, v5;
 30  v5 = _vel_pvrcp_vvl(v1, l);
 31  s0 = 1.0;
 32  v4 = _vel_pvfnmsb_vsvvl(s0, v1, v5, l);
 33  v3 = _vel_pvfmad_vvvvl(v5, v5, v4, l);
 34  v2 = _vel_pvfmul_vvvl(v0, v3, l);
 35  v4 = _vel_pvfnmsb_vvvvl(v0, v2, v1, l);
 36  v2 = _vel_pvfmad_vvvvl(v2, v5, v4, l);
 37  v0 = _vel_pvfnmsb_vvvvl(v0, v2, v1, l);
 38  v0 = _vel_pvfmad_vvvvl(v2, v3, v0, l);
 39  return v0;
 40}
 41
 42static inline __vr _vel_approx_vfdivs_vsvl(float s0, __vr v0, int l) {
 43  float s1;
 44  __vr v1, v2, v3, v4;
 45  v4 = _vel_vrcps_vvl(v0, l);
 46  s1 = 1.0;
 47  v2 = _vel_vfnmsbs_vsvvl(s1, v0, v4, l);
 48  v2 = _vel_vfmads_vvvvl(v4, v4, v2, l);
 49  v1 = _vel_vfmuls_vsvl(s0, v2, l);
 50  v3 = _vel_vfnmsbs_vsvvl(s0, v1, v0, l);
 51  v1 = _vel_vfmads_vvvvl(v1, v4, v3, l);
 52  v3 = _vel_vfnmsbs_vsvvl(s0, v1, v0, l);
 53  v0 = _vel_vfmads_vvvvl(v1, v2, v3, l);
 54  return v0;
 55}
 56
 57static inline __vr _vel_approx_vfdivs_vvsl(__vr v0, float s0, int l) {
 58  float s1;
 59  __vr v1, v2;
 60  s1 = 1.0f / s0;
 61  v1 = _vel_vfmuls_vsvl(s1, v0, l);
 62  v2 = _vel_vfnmsbs_vvsvl(v0, s0, v1, l);
 63  v0 = _vel_vfmads_vvsvl(v1, s1, v2, l);
 64  return v0;
 65}
 66
 67static inline __vr _vel_approx_vfdivd_vsvl(double s0, __vr v0, int l) {
 68  __vr v1, v2, v3;
 69  v2 = _vel_vrcpd_vvl(v0, l);
 70  double s1 = 1.0;
 71  v3 = _vel_vfnmsbd_vsvvl(s1, v0, v2, l);
 72  v2 = _vel_vfmadd_vvvvl(v2, v2, v3, l);
 73  v1 = _vel_vfnmsbd_vsvvl(s1, v0, v2, l);
 74  v1 = _vel_vfmadd_vvvvl(v2, v2, v1, l);
 75  v1 = _vel_vaddul_vsvl(1, v1, l);
 76  v3 = _vel_vfnmsbd_vsvvl(s1, v0, v1, l);
 77  v3 = _vel_vfmadd_vvvvl(v1, v1, v3, l);
 78  v1 = _vel_vfmuld_vsvl(s0, v3, l);
 79  v0 = _vel_vfnmsbd_vsvvl(s0, v1, v0, l);
 80  v0 = _vel_vfmadd_vvvvl(v1, v3, v0, l);
 81  return v0;
 82}
 83
 84static inline __vr _vel_approx_vfsqrtd_vvl(__vr v0, int l) {
 85  double s0, s1;
 86  __vr v1, v2, v3;
 87  v2 = _vel_vrsqrtdnex_vvl(v0, l);
 88  v1 = _vel_vfmuld_vvvl(v0, v2, l);
 89  s0 = 1.0;
 90  s1 = 0.5;
 91  v3 = _vel_vfnmsbd_vsvvl(s0, v1, v2, l);
 92  v3 = _vel_vfmuld_vsvl(s1, v3, l);
 93  v2 = _vel_vfmadd_vvvvl(v2, v2, v3, l);
 94  v1 = _vel_vfmuld_vvvl(v0, v2, l);
 95  v3 = _vel_vfnmsbd_vsvvl(s0, v1, v2, l);
 96  v3 = _vel_vfmuld_vsvl(s1, v3, l);
 97  v0 = _vel_vfmadd_vvvvl(v1, v1, v3, l);
 98  return v0;
 99}
100
101static inline __vr _vel_approx_vfsqrts_vvl(__vr v0, int l) {
102  float s0, s1;
103  __vr v1, v2, v3;
104  v0 = _vel_vcvtds_vvl(v0, l);
105  v2 = _vel_vrsqrtdnex_vvl(v0, l);
106  v1 = _vel_vfmuld_vvvl(v0, v2, l);
107  s0 = 1.0;
108  s1 = 0.5;
109  v3 = _vel_vfnmsbd_vsvvl(s0, v1, v2, l);
110  v3 = _vel_vfmuld_vsvl(s1, v3, l);
111  v2 = _vel_vfmadd_vvvvl(v2, v2, v3, l);
112  v1 = _vel_vfmuld_vvvl(v0, v2, l);
113  v3 = _vel_vfnmsbd_vsvvl(s0, v1, v2, l);
114  v3 = _vel_vfmuld_vsvl(s1, v3, l);
115  v0 = _vel_vfmadd_vvvvl(v1, v1, v3, l);
116  v0 = _vel_vcvtsd_vvl(v0, l);
117  return v0;
118}
119
120#endif