zig/lib/include/avxvnniint16intrin.h at master

  1/*===----------- avxvnniint16intrin.h - AVXVNNIINT16 intrinsics-------------===
  2 *
  3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  4 * See https://llvm.org/LICENSE.txt for license information.
  5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  6 *
  7 *===-----------------------------------------------------------------------===
  8 */
  9
 10#ifndef __IMMINTRIN_H
 11#error                                                                         \
 12    "Never use <avxvnniint16intrin.h> directly; include <immintrin.h> instead."
 13#endif // __IMMINTRIN_H
 14
 15#ifndef __AVXVNNIINT16INTRIN_H
 16#define __AVXVNNIINT16INTRIN_H
 17
 18/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
 19///    corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
 20///    signed 32-bit results. Sum these 2 results with the corresponding
 21///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
 22///
 23/// \headerfile <immintrin.h>
 24///
 25/// \code
 26/// __m128i _mm_dpwsud_epi32(__m128i __W, __m128i __A, __m128i __B)
 27/// \endcode
 28///
 29/// This intrinsic corresponds to the \c VPDPWSUD instruction.
 30///
 31/// \param __W
 32///    A 128-bit vector of [4 x int].
 33/// \param __A
 34///    A 128-bit vector of [8 x short].
 35/// \param __B
 36///    A 128-bit vector of [8 x unsigned short].
 37/// \returns
 38///    A 128-bit vector of [4 x int].
 39///
 40/// \code{.operation}
 41/// FOR j := 0 to 3
 42/// 	tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
 43/// 	tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
 44/// 	dst.dword[j] := __W.dword[j] + tmp1 + tmp2
 45/// ENDFOR
 46/// dst[MAX:128] := 0
 47/// \endcode
 48#define _mm_dpwsud_epi32(__W, __A, __B)                                        \
 49  ((__m128i)__builtin_ia32_vpdpwsud128((__v4si)(__W), (__v4si)(__A),           \
 50                                       (__v4si)(__B)))
 51
 52/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
 53///    corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
 54///    signed 32-bit results. Sum these 2 results with the corresponding
 55///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
 56///
 57/// \headerfile <immintrin.h>
 58///
 59/// \code
 60/// __m256i _mm256_dpwsud_epi32(__m256i __W, __m256i __A, __m256i __B)
 61/// \endcode
 62///
 63/// This intrinsic corresponds to the \c VPDPWSUD instruction.
 64///
 65/// \param __W
 66///    A 256-bit vector of [8 x int].
 67/// \param __A
 68///    A 256-bit vector of [16 x short].
 69/// \param __B
 70///    A 256-bit vector of [16 x unsigned short].
 71/// \returns
 72///    A 256-bit vector of [8 x int].
 73///
 74/// \code{.operation}
 75/// FOR j := 0 to 7
 76/// 	tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
 77/// 	tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
 78/// 	dst.dword[j] := __W.dword[j] + tmp1 + tmp2
 79/// ENDFOR
 80/// dst[MAX:256] := 0
 81/// \endcode
 82#define _mm256_dpwsud_epi32(__W, __A, __B)                                     \
 83  ((__m256i)__builtin_ia32_vpdpwsud256((__v8si)(__W), (__v8si)(__A),           \
 84                                       (__v8si)(__B)))
 85
 86/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
 87///    corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
 88///    signed 32-bit results. Sum these 2 results with the corresponding
 89///    32-bit integer in \a __W with signed saturation, and store the packed
 90///    32-bit results in \a dst.
 91///
 92/// \headerfile <immintrin.h>
 93///
 94/// \code
 95/// __m128i _mm_dpwsuds_epi32(__m128i __W, __m128i __A, __m128i __B)
 96/// \endcode
 97///
 98/// This intrinsic corresponds to the \c VPDPWSUDS instruction.
 99///
100/// \param __W
101///    A 128-bit vector of [4 x int].
102/// \param __A
103///    A 128-bit vector of [8 x short].
104/// \param __B
105///    A 128-bit vector of [8 x unsigned short].
106/// \returns
107///    A 128-bit vector of [4 x int].
108///
109/// \code{.operation}
110/// FOR j := 0 to 3
111/// 	tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
112/// 	tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
113/// 	dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
114/// ENDFOR
115/// dst[MAX:128] := 0
116/// \endcode
117#define _mm_dpwsuds_epi32(__W, __A, __B)                                       \
118  ((__m128i)__builtin_ia32_vpdpwsuds128((__v4si)(__W), (__v4si)(__A),          \
119                                        (__v4si)(__B)))
120
121/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
122///    corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
123///    signed 32-bit results. Sum these 2 results with the corresponding
124///    32-bit integer in \a __W with signed saturation, and store the packed
125///    32-bit results in \a dst.
126///
127/// \headerfile <immintrin.h>
128///
129/// \code
130/// __m256i _mm256_dpwsuds_epi32(__m256i __W, __m256i __A, __m256i __B)
131/// \endcode
132///
133/// This intrinsic corresponds to the \c VPDPWSUDS instruction.
134///
135/// \param __W
136///    A 256-bit vector of [8 x int].
137/// \param __A
138///    A 256-bit vector of [16 x short].
139/// \param __B
140///    A 256-bit vector of [16 x unsigned short].
141/// \returns
142///    A 256-bit vector of [8 x int].
143///
144/// \code{.operation}
145/// FOR j := 0 to 7
146/// 	tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
147/// 	tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
148/// 	dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
149/// ENDFOR
150/// dst[MAX:256] := 0
151/// \endcode
152#define _mm256_dpwsuds_epi32(__W, __A, __B)                                    \
153  ((__m256i)__builtin_ia32_vpdpwsuds256((__v8si)(__W), (__v8si)(__A),          \
154                                        (__v8si)(__B)))
155
156/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
157///    corresponding signed 16-bit integers in \a __B, producing 2 intermediate
158///    signed 32-bit results. Sum these 2 results with the corresponding
159///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
160///
161/// \headerfile <immintrin.h>
162///
163/// \code
164/// __m128i _mm_dpwusd_epi32(__m128i __W, __m128i __A, __m128i __B)
165/// \endcode
166///
167/// This intrinsic corresponds to the \c VPDPWUSD instruction.
168///
169/// \param __W
170///    A 128-bit vector of [4 x int].
171/// \param __A
172///    A 128-bit vector of [8 x unsigned short].
173/// \param __B
174///    A 128-bit vector of [8 x short].
175/// \returns
176///    A 128-bit vector of [4 x int].
177///
178/// \code{.operation}
179/// FOR j := 0 to 3
180/// 	tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
181/// 	tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
182/// 	dst.dword[j] := __W.dword[j] + tmp1 + tmp2
183/// ENDFOR
184/// dst[MAX:128] := 0
185/// \endcode
186#define _mm_dpwusd_epi32(__W, __A, __B)                                        \
187  ((__m128i)__builtin_ia32_vpdpwusd128((__v4si)(__W), (__v4si)(__A),           \
188                                       (__v4si)(__B)))
189
190/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
191///    corresponding signed 16-bit integers in \a __B, producing 2 intermediate
192///    signed 32-bit results. Sum these 2 results with the corresponding
193///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
194///
195/// \headerfile <immintrin.h>
196///
197/// \code
198/// __m256i _mm256_dpwusd_epi32(__m256i __W, __m256i __A, __m256i __B)
199/// \endcode
200///
201/// This intrinsic corresponds to the \c VPDPWUSD instruction.
202///
203/// \param __W
204///    A 256-bit vector of [8 x int].
205/// \param __A
206///    A 256-bit vector of [16 x unsigned short].
207/// \param __B
208///    A 256-bit vector of [16 x short].
209/// \returns
210///    A 256-bit vector of [8 x int].
211///
212/// \code{.operation}
213/// FOR j := 0 to 7
214/// 	tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
215/// 	tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
216/// 	dst.dword[j] := __W.dword[j] + tmp1 + tmp2
217/// ENDFOR
218/// dst[MAX:256] := 0
219/// \endcode
220#define _mm256_dpwusd_epi32(__W, __A, __B)                                     \
221  ((__m256i)__builtin_ia32_vpdpwusd256((__v8si)(__W), (__v8si)(__A),           \
222                                       (__v8si)(__B)))
223
224/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
225///    corresponding signed 16-bit integers in \a __B, producing 2 intermediate
226///    signed 32-bit results. Sum these 2 results with the corresponding
227///    32-bit integer in \a __W with signed saturation, and store the packed
228///    32-bit results in \a dst.
229///
230/// \headerfile <immintrin.h>
231///
232/// \code
233/// __m128i _mm_dpwusds_epi32(__m128i __W, __m128i __A, __m128i __B)
234/// \endcode
235///
236/// This intrinsic corresponds to the \c VPDPWUSDS instruction.
237///
238/// \param __W
239///    A 128-bit vector of [4 x int].
240/// \param __A
241///    A 128-bit vector of [8 x unsigned short].
242/// \param __B
243///    A 128-bit vector of [8 x short].
244/// \returns
245///    A 128-bit vector of [4 x int].
246///
247/// \code{.operation}
248/// FOR j := 0 to 3
249/// 	tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
250/// 	tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
251/// 	dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
252/// ENDFOR
253/// dst[MAX:128] := 0
254/// \endcode
255#define _mm_dpwusds_epi32(__W, __A, __B)                                       \
256  ((__m128i)__builtin_ia32_vpdpwusds128((__v4si)(__W), (__v4si)(__A),          \
257                                        (__v4si)(__B)))
258
259/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
260///    corresponding signed 16-bit integers in \a __B, producing 2 intermediate
261///    signed 32-bit results. Sum these 2 results with the corresponding
262///    32-bit integer in \a __W with signed saturation, and store the packed
263///    32-bit results in \a dst.
264///
265/// \headerfile <immintrin.h>
266///
267/// \code
268/// __m256i _mm256_dpwusds_epi32(__m256i __W, __m256i __A, __m256i __B)
269/// \endcode
270///
271/// This intrinsic corresponds to the \c VPDPWUSDS instruction.
272///
273/// \param __W
274///    A 256-bit vector of [8 x int].
275/// \param __A
276///    A 256-bit vector of [16 x unsigned short].
277/// \param __B
278///    A 256-bit vector of [16 x short].
279/// \returns
280///    A 256-bit vector of [8 x int].
281///
282/// \code{.operation}
283/// FOR j := 0 to 7
284/// 	tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
285/// 	tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
286/// 	dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
287/// ENDFOR
288/// dst[MAX:256] := 0
289/// \endcode
290#define _mm256_dpwusds_epi32(__W, __A, __B)                                    \
291  ((__m256i)__builtin_ia32_vpdpwusds256((__v8si)(__W), (__v8si)(__A),          \
292                                        (__v8si)(__B)))
293
294/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
295///    corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
296///    unsigned 32-bit results. Sum these 2 results with the corresponding
297///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
298///
299/// \headerfile <immintrin.h>
300///
301/// \code
302/// __m128i _mm_dpwuud_epi32(__m128i __W, __m128i __A, __m128i __B)
303/// \endcode
304///
305/// This intrinsic corresponds to the \c VPDPWUUD instruction.
306///
307/// \param __W
308///    A 128-bit vector of [4 x unsigned int].
309/// \param __A
310///    A 128-bit vector of [8 x unsigned short].
311/// \param __B
312///    A 128-bit vector of [8 x unsigned short].
313/// \returns
314///    A 128-bit vector of [4 x unsigned int].
315///
316/// \code{.operation}
317/// FOR j := 0 to 3
318/// 	tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
319/// 	tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
320/// 	dst.dword[j] := __W.dword[j] + tmp1 + tmp2
321/// ENDFOR
322/// dst[MAX:128] := 0
323/// \endcode
324#define _mm_dpwuud_epi32(__W, __A, __B)                                        \
325  ((__m128i)__builtin_ia32_vpdpwuud128((__v4si)(__W), (__v4si)(__A),           \
326                                       (__v4si)(__B)))
327
328/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
329///    corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
330///    unsigned 32-bit results. Sum these 2 results with the corresponding
331///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
332///
333/// \headerfile <immintrin.h>
334///
335/// \code
336/// __m256i _mm256_dpwuud_epi32(__m256i __W, __m256i __A, __m256i __B)
337/// \endcode
338///
339/// This intrinsic corresponds to the \c VPDPWUUD instruction.
340///
341/// \param __W
342///    A 256-bit vector of [8 x unsigned int].
343/// \param __A
344///    A 256-bit vector of [16 x unsigned short].
345/// \param __B
346///    A 256-bit vector of [16 x unsigned short].
347/// \returns
348///    A 256-bit vector of [8 x unsigned int].
349///
350/// \code{.operation}
351/// FOR j := 0 to 7
352/// 	tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
353/// 	tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
354/// 	dst.dword[j] := __W.dword[j] + tmp1 + tmp2
355/// ENDFOR
356/// dst[MAX:256] := 0
357/// \endcode
358#define _mm256_dpwuud_epi32(__W, __A, __B)                                     \
359  ((__m256i)__builtin_ia32_vpdpwuud256((__v8si)(__W), (__v8si)(__A),           \
360                                       (__v8si)(__B)))
361
362/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
363///    corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
364///    unsigned 32-bit results. Sum these 2 results with the corresponding
365///    32-bit integer in \a __W with unsigned saturation, and store the packed
366///    32-bit results in \a dst.
367///
368/// \headerfile <immintrin.h>
369///
370/// \code
371/// __m128i _mm_dpwuuds_epi32(__m128i __W, __m128i __A, __m128i __B)
372/// \endcode
373///
374/// This intrinsic corresponds to the \c VPDPWUUDS instruction.
375///
376/// \param __W
377///    A 128-bit vector of [4 x unsigned int].
378/// \param __A
379///    A 128-bit vector of [8 x unsigned short].
380/// \param __B
381///    A 128-bit vector of [8 x unsigned short].
382/// \returns
383///    A 128-bit vector of [4 x unsigned int].
384///
385/// \code{.operation}
386/// FOR j := 0 to 3
387/// 	tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
388/// 	tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
389/// 	dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
390/// ENDFOR
391/// dst[MAX:128] := 0
392/// \endcode
393#define _mm_dpwuuds_epi32(__W, __A, __B)                                       \
394  ((__m128i)__builtin_ia32_vpdpwuuds128((__v4si)(__W), (__v4si)(__A),          \
395                                        (__v4si)(__B)))
396
397/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
398///    corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
399///    unsigned 32-bit results. Sum these 2 results with the corresponding
400///    32-bit integer in \a __W with unsigned saturation, and store the packed
401///    32-bit results in \a dst.
402///
403/// \headerfile <immintrin.h>
404///
405/// \code
406/// __m256i _mm256_dpwuuds_epi32(__m256i __W, __m256i __A, __m256i __B)
407/// \endcode
408///
409/// This intrinsic corresponds to the \c VPDPWUUDS instruction.
410///
411/// \param __W
412///    A 256-bit vector of [8 x unsigned int].
413/// \param __A
414///    A 256-bit vector of [16 x unsigned short].
415/// \param __B
416///    A 256-bit vector of [16 x unsigned short].
417/// \returns
418///    A 256-bit vector of [8 x unsigned int].
419///
420/// \code{.operation}
421/// FOR j := 0 to 7
422/// 	tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
423/// 	tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
424/// 	dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
425/// ENDFOR
426/// dst[MAX:256] := 0
427/// \endcode
428#define _mm256_dpwuuds_epi32(__W, __A, __B)                                    \
429  ((__m256i)__builtin_ia32_vpdpwuuds256((__v8si)(__W), (__v8si)(__A),          \
430                                        (__v8si)(__B)))
431
432#endif // __AVXVNNIINT16INTRIN_H