/*===-------- avx10_2minmaxintrin.h - AVX10_2MINMAX intrinsics -------------===
  2 *
  3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  4 * See https://llvm.org/LICENSE.txt for license information.
  5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  6 *
  7 *===-----------------------------------------------------------------------===
  8 */
// Reject direct inclusion: unless __IMMINTRIN_H is already defined at this
// point, stop the build with a message pointing at <immintrin.h>.
#ifndef __IMMINTRIN_H
#error                                                                         \
    "Never use <avx10_2minmaxintrin.h> directly; include <immintrin.h> instead."
#endif // __IMMINTRIN_H
 13
 14#ifndef __AVX10_2MINMAXINTRIN_H
 15#define __AVX10_2MINMAXINTRIN_H
 16
 17#define _mm_minmax_pbh(A, B, C)                                                \
 18  ((__m128bh)__builtin_ia32_vminmaxbf16128((__m128bh)(__v8bf)(A),              \
 19                                           (__m128bh)(__v8bf)(B), (int)(C)))
 20
 21#define _mm_mask_minmax_pbh(W, U, A, B, C)                                     \
 22  ((__m128bh)__builtin_ia32_selectpbf_128(                                     \
 23      (__mmask8)(U),                                                           \
 24      (__v8bf)_mm_minmax_pbh((__m128bh)(__v8bf)(A), (__m128bh)(__v8bf)(B),     \
 25                             (int)(C)),                                        \
 26      (__v8bf)(W)))
 27
 28#define _mm_maskz_minmax_pbh(U, A, B, C)                                       \
 29  ((__m128bh)__builtin_ia32_selectpbf_128(                                     \
 30      (__mmask8)(U),                                                           \
 31      (__v8bf)_mm_minmax_pbh((__m128bh)(__v8bf)(A), (__m128bh)(__v8bf)(B),     \
 32                             (int)(C)),                                        \
 33      (__v8bf) __builtin_bit_cast(__m128bh, _mm_setzero_ps())))
 34
 35#define _mm256_minmax_pbh(A, B, C)                                             \
 36  ((__m256bh)__builtin_ia32_vminmaxbf16256((__m256bh)(__v16bf)(A),             \
 37                                           (__m256bh)(__v16bf)(B), (int)(C)))
 38
 39#define _mm256_mask_minmax_pbh(W, U, A, B, C)                                  \
 40  ((__m256bh)__builtin_ia32_selectpbf_256(                                     \
 41      (__mmask16)(U),                                                          \
 42      (__v16bf)_mm256_minmax_pbh((__m256bh)(__v16bf)(A),                       \
 43                                 (__m256bh)(__v16bf)(B), (int)(C)),            \
 44      (__v16bf)(W)))
 45
 46#define _mm256_maskz_minmax_pbh(U, A, B, C)                                    \
 47  ((__m256bh)__builtin_ia32_selectpbf_256(                                     \
 48      (__mmask16)(U),                                                          \
 49      (__v16bf)_mm256_minmax_pbh((__m256bh)(__v16bf)(A),                       \
 50                                 (__m256bh)(__v16bf)(B), (int)(C)),            \
 51      (__v16bf) __builtin_bit_cast(__m256bh, _mm256_setzero_ps())))
 52
 53#define _mm_minmax_pd(A, B, C)                                                 \
 54  ((__m128d)__builtin_ia32_vminmaxpd128_mask(                                  \
 55      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C),                    \
 56      (__v2df)_mm_setzero_pd(), (__mmask8)-1))
 57
 58#define _mm_mask_minmax_pd(W, U, A, B, C)                                      \
 59  ((__m128d)__builtin_ia32_vminmaxpd128_mask(                                  \
 60      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C),                    \
 61      (__v2df)(__m128d)(W), (__mmask8)(U)))
 62
 63#define _mm_maskz_minmax_pd(U, A, B, C)                                        \
 64  ((__m128d)__builtin_ia32_vminmaxpd128_mask(                                  \
 65      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C),                    \
 66      (__v2df)_mm_setzero_pd(), (__mmask8)(U)))
 67
 68#define _mm256_minmax_pd(A, B, C)                                              \
 69  ((__m256d)__builtin_ia32_vminmaxpd256_mask(                                  \
 70      (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (int)(C),                    \
 71      (__v4df)_mm256_setzero_pd(), (__mmask8)-1))
 72
 73#define _mm256_mask_minmax_pd(W, U, A, B, C)                                   \
 74  ((__m256d)__builtin_ia32_vminmaxpd256_mask(                                  \
 75      (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (int)(C),                    \
 76      (__v4df)(__m256d)(W), (__mmask8)(U)))
 77
 78#define _mm256_maskz_minmax_pd(U, A, B, C)                                     \
 79  ((__m256d)__builtin_ia32_vminmaxpd256_mask(                                  \
 80      (__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (int)(C),                    \
 81      (__v4df)_mm256_setzero_pd(), (__mmask8)(U)))
 82
 83#define _mm_minmax_ph(A, B, C)                                                 \
 84  ((__m128h)__builtin_ia32_vminmaxph128_mask(                                  \
 85      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(C),                    \
 86      (__v8hf)_mm_setzero_ph(), (__mmask8)-1))
 87
 88#define _mm_mask_minmax_ph(W, U, A, B, C)                                      \
 89  ((__m128h)__builtin_ia32_vminmaxph128_mask(                                  \
 90      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(C),                    \
 91      (__v8hf)(__m128h)(W), (__mmask16)-1))
 92
 93#define _mm_maskz_minmax_ph(U, A, B, C)                                        \
 94  ((__m128h)__builtin_ia32_vminmaxph128_mask(                                  \
 95      (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(C),                    \
 96      (__v8hf)_mm_setzero_ph(), (__mmask8)(U)))
 97
 98#define _mm256_minmax_ph(A, B, C)                                              \
 99  ((__m256h)__builtin_ia32_vminmaxph256_mask(                                  \
100      (__v16hf)(__m256h)(A), (__v16hf)(__m256h)(B), (int)(C),                  \
101      (__v16hf)_mm256_setzero_ph(), (__mmask16)-1))
102
/// Masked 256-bit half-precision form: calls
/// __builtin_ia32_vminmaxph256_mask with \a A, \a B, the int control \a C,
/// \a W as the fourth operand and \a U (as __mmask16) as the mask.
#define _mm256_mask_minmax_ph(W, U, A, B, C)                                   \
  ((__m256h)__builtin_ia32_vminmaxph256_mask(                                  \
      (__v16hf)(__m256h)(A),                                                   \
      (__v16hf)(__m256h)(B),                                                   \
      (int)(C),                                                                \
      (__v16hf)(__m256h)(W),                                                   \
      (__mmask16)(U)))
107
/// Zero-source masked 256-bit half-precision form: calls
/// __builtin_ia32_vminmaxph256_mask with \a A, \a B, the int control \a C,
/// _mm256_setzero_ph() as the fourth operand and \a U (as __mmask16) as the
/// mask.
#define _mm256_maskz_minmax_ph(U, A, B, C)                                     \
  ((__m256h)__builtin_ia32_vminmaxph256_mask(                                  \
      (__v16hf)(__m256h)(A),                                                   \
      (__v16hf)(__m256h)(B),                                                   \
      (int)(C),                                                                \
      (__v16hf)_mm256_setzero_ph(),                                            \
      (__mmask16)(U)))
112
/// Unmasked 128-bit single-precision form: calls
/// __builtin_ia32_vminmaxps128_mask with \a A, \a B, the int control \a C,
/// _mm_setzero_ps() as the fourth operand and an all-ones mask
/// ((__mmask8)-1).
#define _mm_minmax_ps(A, B, C)                                                 \
  ((__m128)__builtin_ia32_vminmaxps128_mask(                                   \
      (__v4sf)(__m128)(A),                                                     \
      (__v4sf)(__m128)(B),                                                     \
      (int)(C),                                                                \
      (__v4sf)_mm_setzero_ps(),                                                \
      (__mmask8)-1))
117
/// Masked 128-bit single-precision form: calls
/// __builtin_ia32_vminmaxps128_mask with \a A, \a B, the int control \a C,
/// \a W as the fourth operand and \a U (as __mmask8) as the mask.
#define _mm_mask_minmax_ps(W, U, A, B, C)                                      \
  ((__m128)__builtin_ia32_vminmaxps128_mask(                                   \
      (__v4sf)(__m128)(A),                                                     \
      (__v4sf)(__m128)(B),                                                     \
      (int)(C),                                                                \
      (__v4sf)(__m128)(W),                                                     \
      (__mmask8)(U)))
122
/// Zero-source masked 128-bit single-precision form: calls
/// __builtin_ia32_vminmaxps128_mask with \a A, \a B, the int control \a C,
/// _mm_setzero_ps() as the fourth operand and \a U (as __mmask8) as the mask.
#define _mm_maskz_minmax_ps(U, A, B, C)                                        \
  ((__m128)__builtin_ia32_vminmaxps128_mask(                                   \
      (__v4sf)(__m128)(A),                                                     \
      (__v4sf)(__m128)(B),                                                     \
      (int)(C),                                                                \
      (__v4sf)_mm_setzero_ps(),                                                \
      (__mmask8)(U)))
127
/// Unmasked 256-bit single-precision form: calls
/// __builtin_ia32_vminmaxps256_mask with \a A, \a B, the int control \a C,
/// _mm256_setzero_ps() as the fourth operand and an all-ones mask
/// ((__mmask8)-1).
#define _mm256_minmax_ps(A, B, C)                                              \
  ((__m256)__builtin_ia32_vminmaxps256_mask(                                   \
      (__v8sf)(__m256)(A),                                                     \
      (__v8sf)(__m256)(B),                                                     \
      (int)(C),                                                                \
      (__v8sf)_mm256_setzero_ps(),                                             \
      (__mmask8)-1))
132
/// Masked 256-bit single-precision form: calls
/// __builtin_ia32_vminmaxps256_mask with \a A, \a B, the int control \a C,
/// \a W as the fourth operand and \a U (as __mmask8) as the mask.
#define _mm256_mask_minmax_ps(W, U, A, B, C)                                   \
  ((__m256)__builtin_ia32_vminmaxps256_mask(                                   \
      (__v8sf)(__m256)(A),                                                     \
      (__v8sf)(__m256)(B),                                                     \
      (int)(C),                                                                \
      (__v8sf)(__m256)(W),                                                     \
      (__mmask8)(U)))
137
/// Zero-source masked 256-bit single-precision form: calls
/// __builtin_ia32_vminmaxps256_mask with \a A, \a B, the int control \a C,
/// _mm256_setzero_ps() as the fourth operand and \a U (as __mmask8) as the
/// mask.
#define _mm256_maskz_minmax_ps(U, A, B, C)                                     \
  ((__m256)__builtin_ia32_vminmaxps256_mask(                                   \
      (__v8sf)(__m256)(A),                                                     \
      (__v8sf)(__m256)(B),                                                     \
      (int)(C),                                                                \
      (__v8sf)_mm256_setzero_ps(),                                             \
      (__mmask8)(U)))
142
/// Unmasked scalar double form: calls __builtin_ia32_vminmaxsd_round_mask
/// with \a A, \a B, the int control \a C, _mm_undefined_pd() as the fourth
/// operand, an all-ones mask ((__mmask8)-1) and _MM_FROUND_CUR_DIRECTION as
/// the final argument.
#define _mm_minmax_sd(A, B, C)                                                 \
  ((__m128d)__builtin_ia32_vminmaxsd_round_mask(                               \
      (__v2df)(__m128d)(A),                                                    \
      (__v2df)(__m128d)(B),                                                    \
      (int)(C),                                                                \
      (__v2df)_mm_undefined_pd(),                                              \
      (__mmask8)-1,                                                            \
      _MM_FROUND_CUR_DIRECTION))
147
/// Masked scalar double form: calls __builtin_ia32_vminmaxsd_round_mask with
/// \a A, \a B, the int control \a C, \a W as the fourth operand, \a U (as
/// __mmask8) as the mask and _MM_FROUND_CUR_DIRECTION as the final argument.
#define _mm_mask_minmax_sd(W, U, A, B, C)                                      \
  ((__m128d)__builtin_ia32_vminmaxsd_round_mask(                               \
      (__v2df)(__m128d)(A),                                                    \
      (__v2df)(__m128d)(B),                                                    \
      (int)(C),                                                                \
      (__v2df)(__m128d)(W),                                                    \
      (__mmask8)(U),                                                           \
      _MM_FROUND_CUR_DIRECTION))
152
/// Zero-source masked scalar double form: calls
/// __builtin_ia32_vminmaxsd_round_mask with \a A, \a B, the int control \a C,
/// _mm_setzero_pd() as the fourth operand, \a U (as __mmask8) as the mask and
/// _MM_FROUND_CUR_DIRECTION as the final argument.
#define _mm_maskz_minmax_sd(U, A, B, C)                                        \
  ((__m128d)__builtin_ia32_vminmaxsd_round_mask(                               \
      (__v2df)(__m128d)(A),                                                    \
      (__v2df)(__m128d)(B),                                                    \
      (int)(C),                                                                \
      (__v2df)_mm_setzero_pd(),                                                \
      (__mmask8)(U),                                                           \
      _MM_FROUND_CUR_DIRECTION))
157
/// Unmasked scalar double form with a caller-supplied final argument: calls
/// __builtin_ia32_vminmaxsd_round_mask with \a A, \a B, the int control \a C,
/// _mm_undefined_pd() as the fourth operand, an all-ones mask ((__mmask8)-1)
/// and \a R cast to int.
#define _mm_minmax_round_sd(A, B, C, R)                                        \
  ((__m128d)__builtin_ia32_vminmaxsd_round_mask(                               \
      (__v2df)(__m128d)(A),                                                    \
      (__v2df)(__m128d)(B),                                                    \
      (int)(C),                                                                \
      (__v2df)_mm_undefined_pd(),                                              \
      (__mmask8)-1,                                                            \
      (int)(R)))
162
/// Masked scalar double form with a caller-supplied final argument: calls
/// __builtin_ia32_vminmaxsd_round_mask with \a A, \a B, the int control \a C,
/// \a W as the fourth operand, \a U (as __mmask8) as the mask and \a R cast
/// to int.
#define _mm_mask_minmax_round_sd(W, U, A, B, C, R)                             \
  ((__m128d)__builtin_ia32_vminmaxsd_round_mask(                               \
      (__v2df)(__m128d)(A),                                                    \
      (__v2df)(__m128d)(B),                                                    \
      (int)(C),                                                                \
      (__v2df)(__m128d)(W),                                                    \
      (__mmask8)(U),                                                           \
      (int)(R)))
167
/// Zero-source masked scalar double form with a caller-supplied final
/// argument: calls __builtin_ia32_vminmaxsd_round_mask with \a A, \a B, the
/// int control \a C, _mm_setzero_pd() as the fourth operand, \a U (as
/// __mmask8) as the mask and \a R cast to int.
#define _mm_maskz_minmax_round_sd(U, A, B, C, R)                               \
  ((__m128d)__builtin_ia32_vminmaxsd_round_mask(                               \
      (__v2df)(__m128d)(A),                                                    \
      (__v2df)(__m128d)(B),                                                    \
      (int)(C),                                                                \
      (__v2df)_mm_setzero_pd(),                                                \
      (__mmask8)(U),                                                           \
      (int)(R)))
172
/// Unmasked scalar half-precision form: calls
/// __builtin_ia32_vminmaxsh_round_mask with \a A, \a B, the int control \a C,
/// _mm_undefined_ph() as the fourth operand, an all-ones mask ((__mmask8)-1)
/// and _MM_FROUND_CUR_DIRECTION as the final argument.
#define _mm_minmax_sh(A, B, C)                                                 \
  ((__m128h)__builtin_ia32_vminmaxsh_round_mask(                               \
      (__v8hf)(__m128h)(A),                                                    \
      (__v8hf)(__m128h)(B),                                                    \
      (int)(C),                                                                \
      (__v8hf)_mm_undefined_ph(),                                              \
      (__mmask8)-1,                                                            \
      _MM_FROUND_CUR_DIRECTION))
177
/// Masked scalar half-precision form: calls
/// __builtin_ia32_vminmaxsh_round_mask with \a A, \a B, the int control \a C,
/// \a W as the fourth operand, \a U (as __mmask8) as the mask and
/// _MM_FROUND_CUR_DIRECTION as the final argument.
#define _mm_mask_minmax_sh(W, U, A, B, C)                                      \
  ((__m128h)__builtin_ia32_vminmaxsh_round_mask(                               \
      (__v8hf)(__m128h)(A),                                                    \
      (__v8hf)(__m128h)(B),                                                    \
      (int)(C),                                                                \
      (__v8hf)(__m128h)(W),                                                    \
      (__mmask8)(U),                                                           \
      _MM_FROUND_CUR_DIRECTION))
182
/// Zero-source masked scalar half-precision form: calls
/// __builtin_ia32_vminmaxsh_round_mask with \a A, \a B, the int control \a C,
/// _mm_setzero_ph() as the fourth operand, \a U (as __mmask8) as the mask and
/// _MM_FROUND_CUR_DIRECTION as the final argument.
#define _mm_maskz_minmax_sh(U, A, B, C)                                        \
  ((__m128h)__builtin_ia32_vminmaxsh_round_mask(                               \
      (__v8hf)(__m128h)(A),                                                    \
      (__v8hf)(__m128h)(B),                                                    \
      (int)(C),                                                                \
      (__v8hf)_mm_setzero_ph(),                                                \
      (__mmask8)(U),                                                           \
      _MM_FROUND_CUR_DIRECTION))
187
/// Unmasked scalar half-precision form with a caller-supplied final
/// argument: calls __builtin_ia32_vminmaxsh_round_mask with \a A, \a B, the
/// int control \a C, _mm_undefined_ph() as the fourth operand, an all-ones
/// mask ((__mmask8)-1) and \a R cast to int.
#define _mm_minmax_round_sh(A, B, C, R)                                        \
  ((__m128h)__builtin_ia32_vminmaxsh_round_mask(                               \
      (__v8hf)(__m128h)(A),                                                    \
      (__v8hf)(__m128h)(B),                                                    \
      (int)(C),                                                                \
      (__v8hf)_mm_undefined_ph(),                                              \
      (__mmask8)-1,                                                            \
      (int)(R)))
192
/// Masked scalar half-precision form with a caller-supplied final argument:
/// calls __builtin_ia32_vminmaxsh_round_mask with \a A, \a B, the int control
/// \a C, \a W as the fourth operand, \a U (as __mmask8) as the mask and \a R
/// cast to int.
#define _mm_mask_minmax_round_sh(W, U, A, B, C, R)                             \
  ((__m128h)__builtin_ia32_vminmaxsh_round_mask(                               \
      (__v8hf)(__m128h)(A),                                                    \
      (__v8hf)(__m128h)(B),                                                    \
      (int)(C),                                                                \
      (__v8hf)(__m128h)(W),                                                    \
      (__mmask8)(U),                                                           \
      (int)(R)))
197
/// Zero-source masked scalar half-precision form with a caller-supplied
/// final argument: calls __builtin_ia32_vminmaxsh_round_mask with \a A, \a B,
/// the int control \a C, _mm_setzero_ph() as the fourth operand, \a U (as
/// __mmask8) as the mask and \a R cast to int.
#define _mm_maskz_minmax_round_sh(U, A, B, C, R)                               \
  ((__m128h)__builtin_ia32_vminmaxsh_round_mask(                               \
      (__v8hf)(__m128h)(A),                                                    \
      (__v8hf)(__m128h)(B),                                                    \
      (int)(C),                                                                \
      (__v8hf)_mm_setzero_ph(),                                                \
      (__mmask8)(U),                                                           \
      (int)(R)))
202
/// Unmasked scalar single-precision form: calls
/// __builtin_ia32_vminmaxss_round_mask with \a A, \a B, the int control \a C,
/// _mm_undefined_ps() as the fourth operand, an all-ones mask ((__mmask8)-1)
/// and _MM_FROUND_CUR_DIRECTION as the final argument.
#define _mm_minmax_ss(A, B, C)                                                 \
  ((__m128)__builtin_ia32_vminmaxss_round_mask(                                \
      (__v4sf)(__m128)(A),                                                     \
      (__v4sf)(__m128)(B),                                                     \
      (int)(C),                                                                \
      (__v4sf)_mm_undefined_ps(),                                              \
      (__mmask8)-1,                                                            \
      _MM_FROUND_CUR_DIRECTION))
207
/// Masked scalar single-precision form: calls
/// __builtin_ia32_vminmaxss_round_mask with \a A, \a B, the int control \a C,
/// \a W as the fourth operand, \a U (as __mmask8) as the mask and
/// _MM_FROUND_CUR_DIRECTION as the final argument.
///
/// \a W is cast through __m128 before __v4sf, the same way \a A and \a B are
/// and the same way the other masked macros in this header treat \a W.
#define _mm_mask_minmax_ss(W, U, A, B, C)                                      \
  ((__m128)__builtin_ia32_vminmaxss_round_mask(                                \
      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C),                      \
      (__v4sf)(__m128)(W), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))
212
/// Zero-source masked scalar single-precision form: calls
/// __builtin_ia32_vminmaxss_round_mask with \a A, \a B, the int control \a C,
/// _mm_setzero_ps() as the fourth operand, \a U (as __mmask8) as the mask and
/// _MM_FROUND_CUR_DIRECTION as the final argument.
#define _mm_maskz_minmax_ss(U, A, B, C)                                        \
  ((__m128)__builtin_ia32_vminmaxss_round_mask(                                \
      (__v4sf)(__m128)(A),                                                     \
      (__v4sf)(__m128)(B),                                                     \
      (int)(C),                                                                \
      (__v4sf)_mm_setzero_ps(),                                                \
      (__mmask8)(U),                                                           \
      _MM_FROUND_CUR_DIRECTION))
217
/// Unmasked scalar single-precision form with a caller-supplied final
/// argument: calls __builtin_ia32_vminmaxss_round_mask with \a A, \a B, the
/// int control \a C, _mm_undefined_ps() as the fourth operand, an all-ones
/// mask ((__mmask8)-1) and \a R cast to int.
#define _mm_minmax_round_ss(A, B, C, R)                                        \
  ((__m128)__builtin_ia32_vminmaxss_round_mask(                                \
      (__v4sf)(__m128)(A),                                                     \
      (__v4sf)(__m128)(B),                                                     \
      (int)(C),                                                                \
      (__v4sf)_mm_undefined_ps(),                                              \
      (__mmask8)-1,                                                            \
      (int)(R)))
222
/// Masked scalar single-precision form with a caller-supplied final
/// argument: calls __builtin_ia32_vminmaxss_round_mask with \a A, \a B, the
/// int control \a C, \a W as the fourth operand, \a U (as __mmask8) as the
/// mask and \a R cast to int.
///
/// \a W is cast through __m128 before __v4sf, the same way \a A and \a B are
/// and the same way the other masked macros in this header treat \a W.
#define _mm_mask_minmax_round_ss(W, U, A, B, C, R)                             \
  ((__m128)__builtin_ia32_vminmaxss_round_mask(                                \
      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C),                      \
      (__v4sf)(__m128)(W), (__mmask8)(U), (int)(R)))
227
/// Zero-source masked scalar single-precision form with a caller-supplied
/// final argument: calls __builtin_ia32_vminmaxss_round_mask with \a A, \a B,
/// the int control \a C, _mm_setzero_ps() as the fourth operand, \a U (as
/// __mmask8) as the mask and \a R cast to int.
#define _mm_maskz_minmax_round_ss(U, A, B, C, R)                               \
  ((__m128)__builtin_ia32_vminmaxss_round_mask(                                \
      (__v4sf)(__m128)(A),                                                     \
      (__v4sf)(__m128)(B),                                                     \
      (int)(C),                                                                \
      (__v4sf)_mm_setzero_ps(),                                                \
      (__mmask8)(U),                                                           \
      (int)(R)))
232#endif // __AVX10_2MINMAXINTRIN_H