// zig/lib/include/avxvnniint8intrin.h at master

/*===-------- avxvnniint8intrin.h - AVXVNNIINT8 intrinsics -----------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
  9#ifndef __IMMINTRIN_H
 10#error                                                                         \
 11    "Never use <avxvnniint8intrin.h> directly; include <immintrin.h> instead."
 12#endif
 13
#ifndef __AVXVNNIINT8INTRIN_H
#define __AVXVNNIINT8INTRIN_H

 17/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
 18///    corresponding signed 8-bit integers in \a __B, producing 4 intermediate
 19///    signed 16-bit results. Sum these 4 results with the corresponding
 20///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
 21///
 22/// \headerfile <x86intrin.h>
 23///
 24/// \code
 25/// _mm_dpbssd_epi32(__m128i __W, __m128i __A, __m128i __B);
 26/// \endcode
 27///
 28/// This intrinsic corresponds to the \c VPDPBSSD instruction.
 29///
 30/// \param __A
 31///    A 128-bit vector of [16 x char].
 32/// \param __B
 33///    A 128-bit vector of [16 x char].
 34/// \returns
 35///    A 128-bit vector of [4 x int].
 36///
 37/// \code{.operation}
 38/// FOR j := 0 to 3
 39/// 	tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
 40/// 	tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
 41/// 	tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
 42/// 	tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
 43/// 	dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
 44/// ENDFOR
 45/// dst[MAX:128] := 0
 46/// \endcode
 47#define _mm_dpbssd_epi32(__W, __A, __B)                                        \
 48  ((__m128i)__builtin_ia32_vpdpbssd128((__v4si)(__W), (__v4si)(__A),           \
 49                                       (__v4si)(__B)))
 50
 51/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
 52///    corresponding signed 8-bit integers in \a __B, producing 4 intermediate
 53///    signed 16-bit results. Sum these 4 results with the corresponding
 54///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
 55///
 56/// \headerfile <x86intrin.h>
 57///
 58/// \code
 59/// _mm256_dpbssd_epi32(__m256i __W, __m256i __A, __m256i __B);
 60/// \endcode
 61///
 62/// This intrinsic corresponds to the \c VPDPBSSD instruction.
 63///
 64/// \param __A
 65///    A 256-bit vector of [32 x char].
 66/// \param __B
 67///    A 256-bit vector of [32 x char].
 68/// \returns
 69///    A 256-bit vector of [8 x int].
 70///
 71/// \code{.operation}
 72/// FOR j := 0 to 7
 73/// 	tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
 74/// 	tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
 75/// 	tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
 76/// 	tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
 77/// 	dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
 78/// ENDFOR
 79/// dst[MAX:256] := 0
 80/// \endcode
 81#define _mm256_dpbssd_epi32(__W, __A, __B)                                     \
 82  ((__m256i)__builtin_ia32_vpdpbssd256((__v8si)(__W), (__v8si)(__A),           \
 83                                       (__v8si)(__B)))
 84
 85/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
 86///    corresponding signed 8-bit integers in \a __B, producing 4 intermediate
 87///    signed 16-bit results. Sum these 4 results with the corresponding
 88///    32-bit integer in \a __W with signed saturation, and store the packed
 89///    32-bit results in \a dst.
 90///
 91/// \headerfile <x86intrin.h>
 92///
 93/// \code
 94/// _mm_dpbssds_epi32( __m128i __W, __m128i __A, __m128i __B);
 95/// \endcode
 96///
 97/// This intrinsic corresponds to the \c VPDPBSSD instruction.
 98///
 99/// \param __A
100///    A 128-bit vector of [16 x char].
101/// \param __B
102///    A 128-bit vector of [16 x char].
103/// \returns
104///    A 128-bit vector of [4 x int].
105///
106/// \code{.operation}
107/// FOR j := 0 to 3
108/// 	tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
109/// 	tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
110/// 	tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
111/// 	tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
112/// 	dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
113/// ENDFOR
114/// dst[MAX:128] := 0
115/// \endcode
116#define _mm_dpbssds_epi32(__W, __A, __B)                                       \
117  ((__m128i)__builtin_ia32_vpdpbssds128((__v4si)(__W), (__v4si)(__A),          \
118                                        (__v4si)(__B)))
119
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
///    corresponding signed 8-bit integers in \a __B, producing 4 intermediate
///    signed 16-bit results. Sum these 4 results with the corresponding
///    32-bit integer in \a __W with signed saturation, and store the packed
///    32-bit results in \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// _mm256_dpbssds_epi32(__m256i __W, __m256i __A, __m256i __B);
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPBSSDS instruction.
///
/// \param __W
///    A 256-bit vector of [8 x int] holding the values to accumulate into.
/// \param __A
///    A 256-bit vector of [32 x char].
/// \param __B
///    A 256-bit vector of [32 x char].
/// \returns
///    A 256-bit vector of [8 x int].
///
/// \code{.operation}
/// FOR j := 0 to 7
/// 	tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
/// 	tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
/// 	tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
/// 	tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
/// 	dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
#define _mm256_dpbssds_epi32(__W, __A, __B)                                    \
  ((__m256i)__builtin_ia32_vpdpbssds256((__v8si)(__W), (__v8si)(__A),          \
                                        (__v8si)(__B)))

/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
///    corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
///    signed 16-bit results. Sum these 4 results with the corresponding
///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// _mm_dpbsud_epi32(__m128i __W, __m128i __A, __m128i __B);
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPBSUD instruction.
///
/// \param __W
///    A 128-bit vector of [4 x int] holding the values to accumulate into.
/// \param __A
///    A 128-bit vector of [16 x char].
/// \param __B
///    A 128-bit vector of [16 x unsigned char].
/// \returns
///    A 128-bit vector of [4 x int].
///
/// \code{.operation}
/// FOR j := 0 to 3
/// 	tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
/// 	tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
/// 	tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
/// 	tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
/// 	dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
#define _mm_dpbsud_epi32(__W, __A, __B)                                        \
  ((__m128i)__builtin_ia32_vpdpbsud128((__v4si)(__W), (__v4si)(__A),           \
                                       (__v4si)(__B)))

/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
///    corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
///    signed 16-bit results. Sum these 4 results with the corresponding
///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// _mm256_dpbsud_epi32(__m256i __W, __m256i __A, __m256i __B);
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPBSUD instruction.
///
/// \param __W
///    A 256-bit vector of [8 x int] holding the values to accumulate into.
/// \param __A
///    A 256-bit vector of [32 x char].
/// \param __B
///    A 256-bit vector of [32 x unsigned char].
/// \returns
///    A 256-bit vector of [8 x int].
///
/// \code{.operation}
/// FOR j := 0 to 7
/// 	tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
/// 	tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
/// 	tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
/// 	tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
/// 	dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
#define _mm256_dpbsud_epi32(__W, __A, __B)                                     \
  ((__m256i)__builtin_ia32_vpdpbsud256((__v8si)(__W), (__v8si)(__A),           \
                                       (__v8si)(__B)))

/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
///    corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
///    signed 16-bit results. Sum these 4 results with the corresponding
///    32-bit integer in \a __W with signed saturation, and store the packed
///    32-bit results in \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// _mm_dpbsuds_epi32(__m128i __W, __m128i __A, __m128i __B);
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPBSUDS instruction.
///
/// \param __W
///    A 128-bit vector of [4 x int] holding the values to accumulate into.
/// \param __A
///    A 128-bit vector of [16 x char].
/// \param __B
///    A 128-bit vector of [16 x unsigned char].
/// \returns
///    A 128-bit vector of [4 x int].
///
/// \code{.operation}
/// FOR j := 0 to 3
/// 	tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
/// 	tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
/// 	tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
/// 	tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
/// 	dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
#define _mm_dpbsuds_epi32(__W, __A, __B)                                       \
  ((__m128i)__builtin_ia32_vpdpbsuds128((__v4si)(__W), (__v4si)(__A),          \
                                        (__v4si)(__B)))

/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
///    corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
///    signed 16-bit results. Sum these 4 results with the corresponding
///    32-bit integer in \a __W with signed saturation, and store the packed
///    32-bit results in \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// _mm256_dpbsuds_epi32(__m256i __W, __m256i __A, __m256i __B);
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPBSUDS instruction.
///
/// \param __W
///    A 256-bit vector of [8 x int] holding the values to accumulate into.
/// \param __A
///    A 256-bit vector of [32 x char].
/// \param __B
///    A 256-bit vector of [32 x unsigned char].
/// \returns
///    A 256-bit vector of [8 x int].
///
/// \code{.operation}
/// FOR j := 0 to 7
/// 	tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
/// 	tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
/// 	tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
/// 	tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
/// 	dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
#define _mm256_dpbsuds_epi32(__W, __A, __B)                                    \
  ((__m256i)__builtin_ia32_vpdpbsuds256((__v8si)(__W), (__v8si)(__A),          \
                                        (__v8si)(__B)))

/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
///    corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
///    unsigned 16-bit results. Sum these 4 results with the corresponding
///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// _mm_dpbuud_epi32(__m128i __W, __m128i __A, __m128i __B);
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPBUUD instruction.
///
/// \param __W
///    A 128-bit vector of [4 x int] holding the values to accumulate into.
/// \param __A
///    A 128-bit vector of [16 x unsigned char].
/// \param __B
///    A 128-bit vector of [16 x unsigned char].
/// \returns
///    A 128-bit vector of [4 x int].
///
/// \code{.operation}
/// FOR j := 0 to 3
/// 	tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
/// 	tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
/// 	tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
/// 	tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
/// 	dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
#define _mm_dpbuud_epi32(__W, __A, __B)                                        \
  ((__m128i)__builtin_ia32_vpdpbuud128((__v4si)(__W), (__v4si)(__A),           \
                                       (__v4si)(__B)))

/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
///    corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
///    unsigned 16-bit results. Sum these 4 results with the corresponding
///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// _mm256_dpbuud_epi32(__m256i __W, __m256i __A, __m256i __B);
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPBUUD instruction.
///
/// \param __W
///    A 256-bit vector of [8 x int] holding the values to accumulate into.
/// \param __A
///    A 256-bit vector of [32 x unsigned char].
/// \param __B
///    A 256-bit vector of [32 x unsigned char].
/// \returns
///    A 256-bit vector of [8 x int].
///
/// \code{.operation}
/// FOR j := 0 to 7
/// 	tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
/// 	tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
/// 	tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
/// 	tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
/// 	dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
#define _mm256_dpbuud_epi32(__W, __A, __B)                                     \
  ((__m256i)__builtin_ia32_vpdpbuud256((__v8si)(__W), (__v8si)(__A),           \
                                       (__v8si)(__B)))

/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
///    corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
///    unsigned 16-bit results. Sum these 4 results with the corresponding
///    32-bit integer in \a __W with unsigned saturation, and store the packed
///    32-bit results in \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// _mm_dpbuuds_epi32(__m128i __W, __m128i __A, __m128i __B);
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPBUUDS instruction.
///
/// \param __W
///    A 128-bit vector of [4 x int] holding the values to accumulate into.
/// \param __A
///    A 128-bit vector of [16 x unsigned char].
/// \param __B
///    A 128-bit vector of [16 x unsigned char].
/// \returns
///    A 128-bit vector of [4 x int].
///
/// \code{.operation}
/// FOR j := 0 to 3
/// 	tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
/// 	tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
/// 	tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
/// 	tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
/// 	dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
#define _mm_dpbuuds_epi32(__W, __A, __B)                                       \
  ((__m128i)__builtin_ia32_vpdpbuuds128((__v4si)(__W), (__v4si)(__A),          \
                                        (__v4si)(__B)))

/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
///    corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
///    unsigned 16-bit results. Sum these 4 results with the corresponding
///    32-bit integer in \a __W with unsigned saturation, and store the packed
///    32-bit results in \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// _mm256_dpbuuds_epi32(__m256i __W, __m256i __A, __m256i __B);
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPBUUDS instruction.
///
/// \param __W
///    A 256-bit vector of [8 x int] holding the values to accumulate into.
/// \param __A
///    A 256-bit vector of [32 x unsigned char].
/// \param __B
///    A 256-bit vector of [32 x unsigned char].
/// \returns
///    A 256-bit vector of [8 x int].
///
/// \code{.operation}
/// FOR j := 0 to 7
/// 	tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
/// 	tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
/// 	tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
/// 	tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
/// 	dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
#define _mm256_dpbuuds_epi32(__W, __A, __B)                                    \
  ((__m256i)__builtin_ia32_vpdpbuuds256((__v8si)(__W), (__v8si)(__A),          \
                                        (__v8si)(__B)))

#endif // __AVXVNNIINT8INTRIN_H