master
  1/*===--------------- avxvnniintrin.h - VNNI intrinsics --------------------===
  2 *
  3 *
  4 * Permission is hereby granted, free of charge, to any person obtaining a copy
  5 * of this software and associated documentation files (the "Software"), to deal
  6 * in the Software without restriction, including without limitation the rights
  7 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  8 * copies of the Software, and to permit persons to whom the Software is
  9 * furnished to do so, subject to the following conditions:
 10 *
 11 * The above copyright notice and this permission notice shall be included in
 12 * all copies or substantial portions of the Software.
 13 *
 14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 17 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 18 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 19 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 20 * THE SOFTWARE.
 21 *
 22 *===-----------------------------------------------------------------------===
 23 */
 24#ifndef __IMMINTRIN_H
 25#error "Never use <avxvnniintrin.h> directly; include <immintrin.h> instead."
 26#endif
 27
 28#ifndef __AVXVNNIINTRIN_H
 29#define __AVXVNNIINTRIN_H
 30
/* The following intrinsics, defined in avx512vlvnniintrin.h, can be used for AVXVNNI. */
 32/// \fn __m256i _mm256_dpbusd_epi32(__m256i __S, __m256i __A, __m256i __B)
 33/// \fn __m256i _mm256_dpbusds_epi32(__m256i __S, __m256i __A, __m256i __B)
 34/// \fn __m256i _mm256_dpwssd_epi32(__m256i __S, __m256i __A, __m256i __B)
 35/// \fn __m256i _mm256_dpwssds_epi32(__m256i __S, __m256i __A, __m256i __B)
 36/// \fn __m128i _mm_dpbusd_epi32(__m128i __S, __m128i __A, __m128i __B)
 37/// \fn __m128i _mm_dpbusds_epi32(__m128i __S, __m128i __A, __m128i __B)
 38/// \fn __m128i _mm_dpwssd_epi32(__m128i __S, __m128i __A, __m128i __B)
 39/// \fn __m128i _mm_dpwssds_epi32(__m128i __S, __m128i __A, __m128i __B)
 40
/* Intrinsics with _avx_ in their names are provided for compatibility with MSVC. */
 42/* Define the default attributes for the functions in this file. */
 43#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avxvnni"), __min_vector_width__(256)))
 44#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avxvnni"), __min_vector_width__(128)))
 45
 46/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
 47/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate signed
 48/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
 49/// in \a __S, and store the packed 32-bit results in DST.
 50///
 51/// This intrinsic corresponds to the <c> VPDPBUSD </c> instructions.
 52///
 53/// \code{.operation}
 54///    FOR j := 0 to 7
 55///      tmp1.word := Signed(ZeroExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j]))
 56///      tmp2.word := Signed(ZeroExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1]))
 57///      tmp3.word := Signed(ZeroExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2]))
 58///      tmp4.word := Signed(ZeroExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3]))
 59///      DST.dword[j] := __S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
 60///    ENDFOR
 61///    DST[MAX:256] := 0
 62/// \endcode
 63static __inline__ __m256i __DEFAULT_FN_ATTRS256
 64_mm256_dpbusd_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
 65{
 66  return (__m256i)__builtin_ia32_vpdpbusd256((__v8si)__S, (__v8si)__A, (__v8si)__B);
 67}
 68
 69/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
 70/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate signed
 71/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
 72/// in \a __S using signed saturation, and store the packed 32-bit results in DST.
 73///
 74/// This intrinsic corresponds to the <c> VPDPBUSDS </c> instructions.
 75///
 76/// \code{.operation}
 77///    FOR j := 0 to 7
 78///      tmp1.word := Signed(ZeroExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j]))
 79///      tmp2.word := Signed(ZeroExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1]))
 80///      tmp3.word := Signed(ZeroExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2]))
 81///      tmp4.word := Signed(ZeroExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3]))
 82///      DST.dword[j] := Saturate32(__S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
 83///    ENDFOR
 84///    DST[MAX:256] := 0
 85/// \endcode
 86static __inline__ __m256i __DEFAULT_FN_ATTRS256
 87_mm256_dpbusds_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
 88{
 89  return (__m256i)__builtin_ia32_vpdpbusds256((__v8si)__S, (__v8si)__A, (__v8si)__B);
 90}
 91
 92/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
 93/// corresponding 16-bit integers in \a __B, producing 2 intermediate signed 32-bit
 94/// results. Sum these 2 results with the corresponding 32-bit integer in \a __S,
 95///  and store the packed 32-bit results in DST.
 96///
 97/// This intrinsic corresponds to the <c> VPDPWSSD </c> instructions.
 98///
 99/// \code{.operation}
100///    FOR j := 0 to 7
101///      tmp1.dword := SignExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
102///      tmp2.dword := SignExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
103///      DST.dword[j] := __S.dword[j] + tmp1 + tmp2
104///    ENDFOR
105///    DST[MAX:256] := 0
106/// \endcode
107static __inline__ __m256i __DEFAULT_FN_ATTRS256
108_mm256_dpwssd_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
109{
110  return (__m256i)__builtin_ia32_vpdpwssd256((__v8si)__S, (__v8si)__A, (__v8si)__B);
111}
112
113/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
114/// corresponding 16-bit integers in \a __B, producing 2 intermediate signed 32-bit
115/// results. Sum these 2 results with the corresponding 32-bit integer in \a __S
116/// using signed saturation, and store the packed 32-bit results in DST.
117///
118/// This intrinsic corresponds to the <c> VPDPWSSDS </c> instructions.
119///
120/// \code{.operation}
121///    FOR j := 0 to 7
122///      tmp1.dword := SignExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
123///      tmp2.dword := SignExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
124///      DST.dword[j] := Saturate32(__S.dword[j] + tmp1 + tmp2)
125///    ENDFOR
126///    DST[MAX:256] := 0
127/// \endcode
128static __inline__ __m256i __DEFAULT_FN_ATTRS256
129_mm256_dpwssds_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
130{
131  return (__m256i)__builtin_ia32_vpdpwssds256((__v8si)__S, (__v8si)__A, (__v8si)__B);
132}
133
134/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
135/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate signed
136/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
137/// in \a __S, and store the packed 32-bit results in DST.
138///
139/// This intrinsic corresponds to the <c> VPDPBUSD </c> instructions.
140///
141/// \code{.operation}
142///    FOR j := 0 to 3
143///      tmp1.word := Signed(ZeroExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j]))
144///      tmp2.word := Signed(ZeroExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1]))
145///      tmp3.word := Signed(ZeroExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2]))
146///      tmp4.word := Signed(ZeroExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3]))
147///      DST.dword[j] := __S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
148///    ENDFOR
149///    DST[MAX:128] := 0
150/// \endcode
151static __inline__ __m128i __DEFAULT_FN_ATTRS128
152_mm_dpbusd_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
153{
154  return (__m128i)__builtin_ia32_vpdpbusd128((__v4si)__S, (__v4si)__A, (__v4si)__B);
155}
156
157/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
158/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate signed
159/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
160/// in \a __S using signed saturation, and store the packed 32-bit results in DST.
161///
162/// This intrinsic corresponds to the <c> VPDPBUSDS </c> instructions.
163///
164/// \code{.operation}
165///    FOR j := 0 to 3
166///      tmp1.word := Signed(ZeroExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j]))
167///      tmp2.word := Signed(ZeroExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1]))
168///      tmp3.word := Signed(ZeroExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2]))
169///      tmp4.word := Signed(ZeroExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3]))
170///      DST.dword[j] := Saturate32(__S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
171///    ENDFOR
172///    DST[MAX:128] := 0
173/// \endcode
174static __inline__ __m128i __DEFAULT_FN_ATTRS128
175_mm_dpbusds_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
176{
177  return (__m128i)__builtin_ia32_vpdpbusds128((__v4si)__S, (__v4si)__A, (__v4si)__B);
178}
179
180/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
181/// corresponding 16-bit integers in \a __B, producing 2 intermediate signed 32-bit
182/// results. Sum these 2 results with the corresponding 32-bit integer in \a __S,
183/// and store the packed 32-bit results in DST.
184///
185/// This intrinsic corresponds to the <c> VPDPWSSD </c> instructions.
186///
187/// \code{.operation}
188///    FOR j := 0 to 3
189///      tmp1.dword := SignExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
190///      tmp2.dword := SignExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
191///      DST.dword[j] := __S.dword[j] + tmp1 + tmp2
192///    ENDFOR
193///    DST[MAX:128] := 0
194/// \endcode
195static __inline__ __m128i __DEFAULT_FN_ATTRS128
196_mm_dpwssd_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
197{
198  return (__m128i)__builtin_ia32_vpdpwssd128((__v4si)__S, (__v4si)__A, (__v4si)__B);
199}
200
201/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
202/// corresponding 16-bit integers in \a __B, producing 2 intermediate signed 32-bit
203/// results. Sum these 2 results with the corresponding 32-bit integer in \a __S
204/// using signed saturation, and store the packed 32-bit results in DST.
205///
206/// This intrinsic corresponds to the <c> VPDPWSSDS </c> instructions.
207///
208/// \code{.operation}
209///    FOR j := 0 to 3
210///      tmp1.dword := SignExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
211///      tmp2.dword := SignExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
212///      DST.dword[j] := Saturate32(__S.dword[j] + tmp1 + tmp2)
213///    ENDFOR
214///    DST[MAX:128] := 0
215/// \endcode
216static __inline__ __m128i __DEFAULT_FN_ATTRS128
217_mm_dpwssds_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
218{
219  return (__m128i)__builtin_ia32_vpdpwssds128((__v4si)__S, (__v4si)__A, (__v4si)__B);
220}
221
222#undef __DEFAULT_FN_ATTRS128
223#undef __DEFAULT_FN_ATTRS256
224
225#endif // __AVXVNNIINTRIN_H