master
  1/* ===--- amxtransposeintrin.h - AMX_TRANSPOSE intrinsics -*- C++ -*---------===
  2 *
  3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  4 * See https://llvm.org/LICENSE.txt for license information.
  5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  6 *
  7 * ===-----------------------------------------------------------------------===
  8 */
  9
 10#ifndef __IMMINTRIN_H
 11#error "Never use <amxtransposeintrin.h> directly; use <immintrin.h> instead."
 12#endif /* __IMMINTRIN_H */
 13
 14#ifndef __AMX_TRANSPOSEINTRIN_H
 15#define __AMX_TRANSPOSEINTRIN_H
 16#ifdef __x86_64__
 17
 18#define __DEFAULT_FN_ATTRS_TRANSPOSE                                           \
 19  __attribute__((__always_inline__, __nodebug__, __target__("amx-transpose")))
 20
 21#define _tile_2rpntlvwz0(tdst, base, stride)                                   \
 22  __builtin_ia32_t2rpntlvwz0(tdst, base, stride)
 23#define _tile_2rpntlvwz0t1(tdst, base, stride)                                 \
 24  __builtin_ia32_t2rpntlvwz0t1(tdst, base, stride)
 25#define _tile_2rpntlvwz1(tdst, base, stride)                                   \
 26  __builtin_ia32_t2rpntlvwz1(tdst, base, stride)
 27#define _tile_2rpntlvwz1t1(tdst, base, stride)                                 \
 28  __builtin_ia32_t2rpntlvwz1t1(tdst, base, stride)
 29
 30/// Transpose 32-bit elements from \a src and write the result to \a dst.
 31///
 32/// \headerfile <immintrin.h>
 33///
 34/// \code
 35/// void _tile_transposed(__tile dst, __tile src);
 36/// \endcode
 37///
 38/// This intrinsic corresponds to the <c> TTRANSPOSED </c> instruction.
 39///
 40/// \param dst
 41/// 	The destination tile. Max size is 1024 Bytes.
 42/// \param src
 43/// 	The source tile. Max size is 1024 Bytes.
 44///
 45/// \code{.operation}
 46///
 47/// FOR i := 0 TO (dst.rows-1)
 48/// 	tmp[511:0] := 0
 49/// 	FOR j := 0 TO (dst.colsb/4-1)
 50/// 		tmp.dword[j] := src.row[j].dword[i]
 51/// 	ENDFOR
 52/// 	dst.row[i] := tmp
 53/// ENDFOR
 54///
 55/// zero_upper_rows(dst, dst.rows)
 56/// zero_tileconfig_start()
 57/// \endcode
 58#define _tile_transposed(dst, src) __builtin_ia32_ttransposed(dst, src)
 59
 60static __inline__ void __DEFAULT_FN_ATTRS_TRANSPOSE _tile_2rpntlvwz0_internal(
 61    unsigned short row, unsigned short col0, unsigned short col1,
 62    _tile1024i *dst0, _tile1024i *dst1, const void *base,
 63    __SIZE_TYPE__ stride) {
 64  // Use __tile1024i_1024a* to escape the alignment check in
 65  // clang/test/Headers/x86-intrinsics-headers-clean.cpp
 66  __builtin_ia32_t2rpntlvwz0_internal(row, col0, col1, (_tile1024i_1024a *)dst0,
 67                                      (_tile1024i_1024a *)dst1, base,
 68                                      (__SIZE_TYPE__)(stride));
 69}
 70
 71static __inline__ void __DEFAULT_FN_ATTRS_TRANSPOSE _tile_2rpntlvwz0t1_internal(
 72    unsigned short row, unsigned short col0, unsigned short col1,
 73    _tile1024i *dst0, _tile1024i *dst1, const void *base,
 74    __SIZE_TYPE__ stride) {
 75  __builtin_ia32_t2rpntlvwz0t1_internal(
 76      row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base,
 77      (__SIZE_TYPE__)(stride));
 78}
 79
 80static __inline__ void __DEFAULT_FN_ATTRS_TRANSPOSE _tile_2rpntlvwz1_internal(
 81    unsigned short row, unsigned short col0, unsigned short col1,
 82    _tile1024i *dst0, _tile1024i *dst1, const void *base,
 83    __SIZE_TYPE__ stride) {
 84  __builtin_ia32_t2rpntlvwz1_internal(row, col0, col1, (_tile1024i_1024a *)dst0,
 85                                      (_tile1024i_1024a *)dst1, base,
 86                                      (__SIZE_TYPE__)(stride));
 87}
 88
 89static __inline__ void __DEFAULT_FN_ATTRS_TRANSPOSE _tile_2rpntlvwz1t1_internal(
 90    unsigned short row, unsigned short col0, unsigned short col1,
 91    _tile1024i *dst0, _tile1024i *dst1, const void *base,
 92    __SIZE_TYPE__ stride) {
 93  __builtin_ia32_t2rpntlvwz1t1_internal(
 94      row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base,
 95      (__SIZE_TYPE__)(stride));
 96}
 97
 98// This is internal intrinsic. C/C++ user should avoid calling it directly.
 99static __inline__ _tile1024i __DEFAULT_FN_ATTRS_TRANSPOSE
100_tile_transposed_internal(unsigned short m, unsigned short n, _tile1024i src) {
101  return __builtin_ia32_ttransposed_internal(m, n, src);
102}
103
104/// Converts a pair of tiles from memory into VNNI format, and places the
105/// results in a pair of destinations specified by dst. The pair of tiles
106/// in memory is specified via a tsib; the second tile is after the first
107/// one, separated by the same stride that separates each row.
108/// The tile configuration for the destination tiles indicates the amount
109/// of data to read from memory. The instruction will load a number of rows
110/// that is equal to twice the number of rows in tmm1. The size of each row
111/// is equal to the average width of the destination tiles. If the second
112/// tile is configured with zero rows and columns, only the first tile will
113/// be written.
114/// Provides a hint to the implementation that the data will likely not be
115/// reused in the near future and the data caching can be optimized.
116///
117/// \headerfile <immintrin.h>
118///
119/// This intrinsic corresponds to the <c> T2RPNTLVWZ0 </c> instruction.
120///
121/// \param dst0
122///    First tile of destination tile pair. Max size is 1024i*2 Bytes.
123/// \param dst1
124///    Second tile of destination tile pair. Max size is 1024i*2 Bytes.
125/// \param base
126///    A pointer to base address.
127/// \param stride
128///    The stride between the rows' data to be loaded in memory.
129__DEFAULT_FN_ATTRS_TRANSPOSE
130static void __tile_2rpntlvwz0(__tile1024i *dst0, __tile1024i *dst1,
131                              const void *base, __SIZE_TYPE__ stride) {
132  _tile_2rpntlvwz0_internal(dst0->row, dst0->col, dst1->col, &dst0->tile,
133                            &dst1->tile, base, stride);
134}
135
136/// Converts a pair of tiles from memory into VNNI format, and places the
137/// results in a pair of destinations specified by dst. The pair of tiles
138/// in memory is specified via a tsib; the second tile is after the first
139/// one, separated by the same stride that separates each row.
140/// The tile configuration for the destination tiles indicates the amount
141/// of data to read from memory. The instruction will load a number of rows
142/// that is equal to twice the number of rows in tmm1. The size of each row
143/// is equal to the average width of the destination tiles. If the second
144/// tile is configured with zero rows and columns, only the first tile will
145/// be written.
146///
147/// \headerfile <immintrin.h>
148///
149/// This intrinsic corresponds to the <c> T2RPNTLVWZ0T1 </c> instruction.
150///
151/// \param dst0
152///    First tile of destination tile pair. Max size is 1024i*2 Bytes.
153/// \param dst1
154///    Second tile of destination tile pair. Max size is 1024i*2 Bytes.
155/// \param base
156///    A pointer to base address.
157/// \param stride
158///    The stride between the rows' data to be loaded in memory.
159__DEFAULT_FN_ATTRS_TRANSPOSE
160static void __tile_2rpntlvwz0t1(__tile1024i *dst0, __tile1024i *dst1,
161                                const void *base, __SIZE_TYPE__ stride) {
162  _tile_2rpntlvwz0t1_internal(dst0->row, dst0->col, dst1->col, &dst0->tile,
163                              &dst1->tile, base, stride);
164}
165
166/// Converts a pair of tiles from memory into VNNI format, and places the
167/// results in a pair of destinations specified by dst. The pair of tiles
168/// in memory is specified via a tsib; the second tile is after the first
169/// one, separated by the same stride that separates each row.
170/// The tile configuration for the destination tiles indicates the amount
171/// of data to read from memory. The instruction will load a number of rows
172/// that is equal to twice the number of rows in tmm1. The size of each row
173/// is equal to the average width of the destination tiles. If the second
174/// tile is configured with zero rows and columns, only the first tile will
175/// be written. The last row will be not be read from memory but instead
176/// filled with zeros.
177/// Provides a hint to the implementation that the data will likely not be
178/// reused in the near future and the data caching can be optimized.
179///
180/// \headerfile <immintrin.h>
181///
182/// This intrinsic corresponds to the <c> T2RPNTLVWZ1 </c> instruction.
183///
184/// \param dst0
185///    First tile of destination tile pair. Max size is 1024i*2 Bytes.
186/// \param dst1
187///    Second tile of destination tile pair. Max size is 1024i*2 Bytes.
188/// \param base
189///    A pointer to base address.
190/// \param stride
191///    The stride between the rows' data to be loaded in memory.
192__DEFAULT_FN_ATTRS_TRANSPOSE
193static void __tile_2rpntlvwz1(__tile1024i *dst0, __tile1024i *dst1,
194                              const void *base, __SIZE_TYPE__ stride) {
195  _tile_2rpntlvwz1_internal(dst0->row, dst0->col, dst1->col, &dst0->tile,
196                            &dst1->tile, base, stride);
197}
198
199/// Converts a pair of tiles from memory into VNNI format, and places the
200/// results in a pair of destinations specified by dst. The pair of tiles
201/// in memory is specified via a tsib; the second tile is after the first
202/// one, separated by the same stride that separates each row.
203/// The tile configuration for the destination tiles indicates the amount
204/// of data to read from memory. The instruction will load a number of rows
205/// that is equal to twice the number of rows in tmm1. The size of each row
206/// is equal to the average width of the destination tiles. If the second
207/// tile is configured with zero rows and columns, only the first tile will
208/// be written. The last row will be not be read from memory but instead
209/// filled with zeros.
210/// Provides a hint to the implementation that the data will likely not be
211/// reused in the near future and the data caching can be optimized.
212///
213/// \headerfile <immintrin.h>
214///
215/// This intrinsic corresponds to the <c> T2RPNTLVWZ1T1 </c> instruction.
216///
217/// \param dst0
218///    First tile of destination tile pair. Max size is 1024i*2 Bytes.
219/// \param dst1
220///    Second tile of destination tile pair. Max size is 1024i*2 Bytes.
221/// \param base
222///    A pointer to base address.
223/// \param stride
224///    The stride between the rows' data to be loaded in memory.
225__DEFAULT_FN_ATTRS_TRANSPOSE
226static void __tile_2rpntlvwz1t1(__tile1024i *dst0, __tile1024i *dst1,
227                                const void *base, __SIZE_TYPE__ stride) {
228  _tile_2rpntlvwz1t1_internal(dst0->row, dst0->col, dst1->col, &dst0->tile,
229                              &dst1->tile, base, stride);
230}
231
232/// Transpose 32-bit elements from src and write the result to dst.
233///
234/// \headerfile <immintrin.h>
235///
236/// This intrinsic corresponds to the <c> TTRANSPOSED </c> instruction.
237///
238/// \param dst
239///    The destination tile. Max size is 1024 Bytes.
240/// \param src
241///    The source tile. Max size is 1024 Bytes.
242__DEFAULT_FN_ATTRS_TRANSPOSE
243static void __tile_transposed(__tile1024i *dst, __tile1024i src) {
244  dst->tile = _tile_transposed_internal(dst->row, dst->col, src.tile);
245}
246
247#endif /* __x86_64__ */
248#endif /* __AMX_TRANSPOSEINTRIN_H */