master
  1/*===--------------- amxintrin.h - AMX intrinsics -*- C/C++ -*---------------===
  2 *
  3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  4 * See https://llvm.org/LICENSE.txt for license information.
  5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  6 *
  7 *===------------------------------------------------------------------------===
  8 */
  9
 10#ifndef __IMMINTRIN_H
 11#error "Never use <amxintrin.h> directly; include <immintrin.h> instead."
 12#endif /* __IMMINTRIN_H */
 13
 14#ifndef __AMXINTRIN_H
 15#define __AMXINTRIN_H
 16#ifdef __x86_64__
 17
 18/* Define the default attributes for the functions in this file. */
 19#define __DEFAULT_FN_ATTRS_TILE                                                \
 20  __attribute__((__always_inline__, __nodebug__, __target__("amx-tile")))
 21#define __DEFAULT_FN_ATTRS_INT8                                                \
 22  __attribute__((__always_inline__, __nodebug__, __target__("amx-int8")))
 23#define __DEFAULT_FN_ATTRS_BF16                                                \
 24  __attribute__((__always_inline__, __nodebug__, __target__("amx-bf16")))
 25
 26/// Load tile configuration from a 64-byte memory location specified by
 27/// "mem_addr". The tile configuration includes the tile type palette, the
 28/// number of bytes per row, and the number of rows. If the specified
 29/// palette_id is zero, that signifies the init state for both the tile
 30/// config and the tile data, and the tiles are zeroed. Any invalid
 31/// configurations will result in #GP fault.
 32///
 33/// \headerfile <immintrin.h>
 34///
 35/// This intrinsic corresponds to the <c> LDTILECFG </c> instruction.
 36///
 37/// \param __config
 38///    A pointer to 512-bits configuration
 39static __inline__ void __DEFAULT_FN_ATTRS_TILE
 40_tile_loadconfig(const void *__config) {
 41  __builtin_ia32_tile_loadconfig(__config);
 42}
 43
 44/// Stores the current tile configuration to a 64-byte memory location
 45/// specified by "mem_addr". The tile configuration includes the tile type
 46/// palette, the number of bytes per row, and the number of rows. If tiles
 47/// are not configured, all zeroes will be stored to memory.
 48///
 49/// \headerfile <immintrin.h>
 50///
 51/// This intrinsic corresponds to the <c> STTILECFG </c> instruction.
 52///
 53/// \param __config
 54///    A pointer to 512-bits configuration
 55static __inline__ void __DEFAULT_FN_ATTRS_TILE
 56_tile_storeconfig(void *__config) {
 57  __builtin_ia32_tile_storeconfig(__config);
 58}
 59
 60/// Release the tile configuration to return to the init state, which
 61/// releases all storage it currently holds.
 62///
 63/// \headerfile <immintrin.h>
 64///
 65/// This intrinsic corresponds to the <c> TILERELEASE </c> instruction.
 66static __inline__ void __DEFAULT_FN_ATTRS_TILE _tile_release(void) {
 67  __builtin_ia32_tilerelease();
 68}
 69
 70/// Load tile rows from memory specifieid by "base" address and "stride" into
 71/// destination tile "dst" using the tile configuration previously configured
 72/// via "_tile_loadconfig".
 73///
 74/// \headerfile <immintrin.h>
 75///
 76/// This intrinsic corresponds to the <c> TILELOADD </c> instruction.
 77///
 78/// \param dst
 79///    A destination tile. Max size is 1024 Bytes.
 80/// \param base
 81///    A pointer to base address.
 82/// \param stride
 83///    The stride between the rows' data to be loaded in memory.
 84#define _tile_loadd(dst, base, stride)                                         \
 85  __builtin_ia32_tileloadd64((dst), ((const void *)(base)),                    \
 86                             (__SIZE_TYPE__)(stride))
 87
 88/// Load tile rows from memory specifieid by "base" address and "stride" into
 89/// destination tile "dst" using the tile configuration previously configured
 90/// via "_tile_loadconfig". This intrinsic provides a hint to the implementation
 91/// that the data will likely not be reused in the near future and the data
 92/// caching can be optimized accordingly.
 93///
 94/// \headerfile <immintrin.h>
 95///
 96/// This intrinsic corresponds to the <c> TILELOADDT1 </c> instruction.
 97///
 98/// \param dst
 99///    A destination tile. Max size is 1024 Bytes.
100/// \param base
101///    A pointer to base address.
102/// \param stride
103///    The stride between the rows' data to be loaded in memory.
104#define _tile_stream_loadd(dst, base, stride)                                  \
105  __builtin_ia32_tileloaddt164((dst), ((const void *)(base)),                  \
106                               (__SIZE_TYPE__)(stride))
107
108/// Store the tile specified by "src" to memory specifieid by "base" address and
109/// "stride" using the tile configuration previously configured via
110/// "_tile_loadconfig".
111///
112/// \headerfile <immintrin.h>
113///
114/// This intrinsic corresponds to the <c> TILESTORED </c> instruction.
115///
116/// \param dst
117///    A destination tile. Max size is 1024 Bytes.
118/// \param base
119///    A pointer to base address.
120/// \param stride
121///    The stride between the rows' data to be stored in memory.
122#define _tile_stored(dst, base, stride)                                        \
123  __builtin_ia32_tilestored64((dst), ((void *)(base)), (__SIZE_TYPE__)(stride))
124
125/// Zero the tile specified by "tdest".
126///
127/// \headerfile <immintrin.h>
128///
129/// This intrinsic corresponds to the <c> TILEZERO </c> instruction.
130///
131/// \param tile
132///    The destination tile to be zero. Max size is 1024 Bytes.
133#define _tile_zero(tile) __builtin_ia32_tilezero((tile))
134
135/// Compute dot-product of bytes in tiles with a source/destination accumulator.
136/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
137/// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
138/// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
139/// and store the 32-bit result back to tile "dst".
140///
141/// \headerfile <immintrin.h>
142///
143/// This intrinsic corresponds to the <c> TDPBSSD </c> instruction.
144///
145/// \param dst
146///    The destination tile. Max size is 1024 Bytes.
147/// \param src0
148///    The 1st source tile. Max size is 1024 Bytes.
149/// \param src1
150///    The 2nd source tile. Max size is 1024 Bytes.
151#define _tile_dpbssd(dst, src0, src1)                                          \
152  __builtin_ia32_tdpbssd((dst), (src0), (src1))
153
154/// Compute dot-product of bytes in tiles with a source/destination accumulator.
155/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
156/// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
157/// 32-bit results. Sum these 4 results with the corresponding 32-bit integer
158/// in "dst", and store the 32-bit result back to tile "dst".
159///
160/// \headerfile <immintrin.h>
161///
162/// This intrinsic corresponds to the <c> TDPBSUD </c> instruction.
163///
164/// \param dst
165///    The destination tile. Max size is 1024 Bytes.
166/// \param src0
167///    The 1st source tile. Max size is 1024 Bytes.
168/// \param src1
169///    The 2nd source tile. Max size is 1024 Bytes.
170#define _tile_dpbsud(dst, src0, src1)                                          \
171  __builtin_ia32_tdpbsud((dst), (src0), (src1))
172
173/// Compute dot-product of bytes in tiles with a source/destination accumulator.
174/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
175/// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
176/// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
177/// and store the 32-bit result back to tile "dst".
178///
179/// \headerfile <immintrin.h>
180///
181/// This intrinsic corresponds to the <c> TDPBUSD </c> instruction.
182///
183/// \param dst
184///    The destination tile. Max size is 1024 Bytes.
185/// \param src0
186///    The 1st source tile. Max size is 1024 Bytes.
187/// \param src1
188///    The 2nd source tile. Max size is 1024 Bytes.
189#define _tile_dpbusd(dst, src0, src1)                                          \
190  __builtin_ia32_tdpbusd((dst), (src0), (src1))
191
192/// Compute dot-product of bytes in tiles with a source/destination accumulator.
193/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
194/// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
195/// 32-bit results. Sum these 4 results with the corresponding 32-bit integer in
196/// "dst", and store the 32-bit result back to tile "dst".
197///
198/// \headerfile <immintrin.h>
199///
200/// This intrinsic corresponds to the <c> TDPBUUD </c> instruction.
201///
202/// \param dst
203///    The destination tile. Max size is 1024 Bytes.
204/// \param src0
205///    The 1st source tile. Max size is 1024 Bytes.
206/// \param src1
207///    The 2nd source tile. Max size is 1024 Bytes.
208#define _tile_dpbuud(dst, src0, src1)                                          \
209  __builtin_ia32_tdpbuud((dst), (src0), (src1))
210
211/// Compute dot-product of BF16 (16-bit) floating-point pairs in tiles src0 and
212/// src1, accumulating the intermediate single-precision (32-bit) floating-point
213/// elements with elements in "dst", and store the 32-bit result back to tile
214/// "dst".
215///
216/// \headerfile <immintrin.h>
217///
218/// This intrinsic corresponds to the <c> TDPBF16PS </c> instruction.
219///
220/// \param dst
221///    The destination tile. Max size is 1024 Bytes.
222/// \param src0
223///    The 1st source tile. Max size is 1024 Bytes.
224/// \param src1
225///    The 2nd source tile. Max size is 1024 Bytes.
226#define _tile_dpbf16ps(dst, src0, src1)                                        \
227  __builtin_ia32_tdpbf16ps((dst), (src0), (src1))
228
229/// AMX tile register size can be configured, the maximum size is 16x64=1024
230/// bytes. Since there is no 2D type in llvm IR, we use vector type to
231/// represent 2D tile and the fixed size is maximum amx tile register size.
232typedef int _tile1024i __attribute__((__vector_size__(1024), __aligned__(64)));
233typedef int _tile1024i_1024a
234    __attribute__((__vector_size__(1024), __aligned__(1024)));
235
236/// This is internal intrinsic. C/C++ user should avoid calling it directly.
237static __inline__ _tile1024i __DEFAULT_FN_ATTRS_TILE
238_tile_loadd_internal(unsigned short m, unsigned short n, const void *base,
239                     __SIZE_TYPE__ stride) {
240  return __builtin_ia32_tileloadd64_internal(m, n, base,
241                                             (__SIZE_TYPE__)(stride));
242}
243
244/// This is internal intrinsic. C/C++ user should avoid calling it directly.
245static __inline__ _tile1024i __DEFAULT_FN_ATTRS_TILE
246_tile_loaddt1_internal(unsigned short m, unsigned short n, const void *base,
247                       __SIZE_TYPE__ stride) {
248  return __builtin_ia32_tileloaddt164_internal(m, n, base,
249                                               (__SIZE_TYPE__)(stride));
250}
251
252/// This is internal intrinsic. C/C++ user should avoid calling it directly.
253static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
254_tile_dpbssd_internal(unsigned short m, unsigned short n, unsigned short k,
255                      _tile1024i dst, _tile1024i src1, _tile1024i src2) {
256  return __builtin_ia32_tdpbssd_internal(m, n, k, dst, src1, src2);
257}
258
259/// This is internal intrinsic. C/C++ user should avoid calling it directly.
260static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
261_tile_dpbsud_internal(unsigned short m, unsigned short n, unsigned short k,
262                      _tile1024i dst, _tile1024i src1, _tile1024i src2) {
263  return __builtin_ia32_tdpbsud_internal(m, n, k, dst, src1, src2);
264}
265
266/// This is internal intrinsic. C/C++ user should avoid calling it directly.
267static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
268_tile_dpbusd_internal(unsigned short m, unsigned short n, unsigned short k,
269                      _tile1024i dst, _tile1024i src1, _tile1024i src2) {
270  return __builtin_ia32_tdpbusd_internal(m, n, k, dst, src1, src2);
271}
272
273/// This is internal intrinsic. C/C++ user should avoid calling it directly.
274static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
275_tile_dpbuud_internal(unsigned short m, unsigned short n, unsigned short k,
276                      _tile1024i dst, _tile1024i src1, _tile1024i src2) {
277  return __builtin_ia32_tdpbuud_internal(m, n, k, dst, src1, src2);
278}
279
280/// This is internal intrinsic. C/C++ user should avoid calling it directly.
281static __inline__ void __DEFAULT_FN_ATTRS_TILE
282_tile_stored_internal(unsigned short m, unsigned short n, void *base,
283                      __SIZE_TYPE__ stride, _tile1024i tile) {
284  return __builtin_ia32_tilestored64_internal(m, n, base,
285                                              (__SIZE_TYPE__)(stride), tile);
286}
287
288/// This is internal intrinsic. C/C++ user should avoid calling it directly.
289static __inline__ _tile1024i __DEFAULT_FN_ATTRS_BF16
290_tile_dpbf16ps_internal(unsigned short m, unsigned short n, unsigned short k,
291                        _tile1024i dst, _tile1024i src1, _tile1024i src2) {
292  return __builtin_ia32_tdpbf16ps_internal(m, n, k, dst, src1, src2);
293}
294
295/// This struct pack the shape and tile data together for user. We suggest
296/// initializing the struct as early as possible, because compiler depends
297/// on the shape information to do configure. The constant value is preferred
298/// for optimization by compiler.
299typedef struct __tile1024i_str {
300  const unsigned short row;
301  const unsigned short col;
302  _tile1024i tile;
303} __tile1024i;
304
305/// Load tile rows from memory specifieid by "base" address and "stride" into
306/// destination tile "dst".
307///
308/// \headerfile <immintrin.h>
309///
310/// This intrinsic corresponds to the <c> TILELOADD </c> instruction.
311///
312/// \param dst
313///    A destination tile. Max size is 1024 Bytes.
314/// \param base
315///    A pointer to base address.
316/// \param stride
317///    The stride between the rows' data to be loaded in memory.
318__DEFAULT_FN_ATTRS_TILE
319static __inline__ void __tile_loadd(__tile1024i *dst, const void *base,
320                                    __SIZE_TYPE__ stride) {
321  dst->tile = _tile_loadd_internal(dst->row, dst->col, base, stride);
322}
323
324/// Load tile rows from memory specifieid by "base" address and "stride" into
325/// destination tile "dst". This intrinsic provides a hint to the implementation
326/// that the data will likely not be reused in the near future and the data
327/// caching can be optimized accordingly.
328///
329/// \headerfile <immintrin.h>
330///
331/// This intrinsic corresponds to the <c> TILELOADDT1 </c> instruction.
332///
333/// \param dst
334///    A destination tile. Max size is 1024 Bytes.
335/// \param base
336///    A pointer to base address.
337/// \param stride
338///    The stride between the rows' data to be loaded in memory.
339__DEFAULT_FN_ATTRS_TILE
340static __inline__ void __tile_stream_loadd(__tile1024i *dst, const void *base,
341                                           __SIZE_TYPE__ stride) {
342  dst->tile = _tile_loaddt1_internal(dst->row, dst->col, base, stride);
343}
344
345/// Compute dot-product of bytes in tiles with a source/destination accumulator.
346/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
347/// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
348/// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
349/// and store the 32-bit result back to tile "dst".
350///
351/// \headerfile <immintrin.h>
352///
353/// This intrinsic corresponds to the <c> TDPBSSD </c> instruction.
354///
355/// \param dst
356///    The destination tile. Max size is 1024 Bytes.
357/// \param src0
358///    The 1st source tile. Max size is 1024 Bytes.
359/// \param src1
360///    The 2nd source tile. Max size is 1024 Bytes.
361__DEFAULT_FN_ATTRS_INT8
362static __inline__ void __tile_dpbssd(__tile1024i *dst, __tile1024i src0,
363                                     __tile1024i src1) {
364  dst->tile = _tile_dpbssd_internal(src0.row, src1.col, src0.col, dst->tile,
365                                    src0.tile, src1.tile);
366}
367
368/// Compute dot-product of bytes in tiles with a source/destination accumulator.
369/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
370/// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
371/// 32-bit results. Sum these 4 results with the corresponding 32-bit integer
372/// in "dst", and store the 32-bit result back to tile "dst".
373///
374/// \headerfile <immintrin.h>
375///
376/// This intrinsic corresponds to the <c> TDPBSUD </c> instruction.
377///
378/// \param dst
379///    The destination tile. Max size is 1024 Bytes.
380/// \param src0
381///    The 1st source tile. Max size is 1024 Bytes.
382/// \param src1
383///    The 2nd source tile. Max size is 1024 Bytes.
384__DEFAULT_FN_ATTRS_INT8
385static __inline__ void __tile_dpbsud(__tile1024i *dst, __tile1024i src0,
386                                     __tile1024i src1) {
387  dst->tile = _tile_dpbsud_internal(src0.row, src1.col, src0.col, dst->tile,
388                                    src0.tile, src1.tile);
389}
390
391/// Compute dot-product of bytes in tiles with a source/destination accumulator.
392/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
393/// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
394/// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
395/// and store the 32-bit result back to tile "dst".
396///
397/// \headerfile <immintrin.h>
398///
399/// This intrinsic corresponds to the <c> TDPBUSD </c> instruction.
400///
401/// \param dst
402///    The destination tile. Max size is 1024 Bytes.
403/// \param src0
404///    The 1st source tile. Max size is 1024 Bytes.
405/// \param src1
406///    The 2nd source tile. Max size is 1024 Bytes.
407__DEFAULT_FN_ATTRS_INT8
408static __inline__ void __tile_dpbusd(__tile1024i *dst, __tile1024i src0,
409                                     __tile1024i src1) {
410  dst->tile = _tile_dpbusd_internal(src0.row, src1.col, src0.col, dst->tile,
411                                    src0.tile, src1.tile);
412}
413
414/// Compute dot-product of bytes in tiles with a source/destination accumulator.
415/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
416/// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
417/// 32-bit results. Sum these 4 results with the corresponding 32-bit integer in
418/// "dst", and store the 32-bit result back to tile "dst".
419///
420/// \headerfile <immintrin.h>
421///
422/// This intrinsic corresponds to the <c> TDPBUUD </c> instruction.
423///
424/// \param dst
425///    The destination tile. Max size is 1024 Bytes.
426/// \param src0
427///    The 1st source tile. Max size is 1024 Bytes.
428/// \param src1
429///    The 2nd source tile. Max size is 1024 Bytes.
430__DEFAULT_FN_ATTRS_INT8
431static __inline__ void __tile_dpbuud(__tile1024i *dst, __tile1024i src0,
432                                     __tile1024i src1) {
433  dst->tile = _tile_dpbuud_internal(src0.row, src1.col, src0.col, dst->tile,
434                                    src0.tile, src1.tile);
435}
436
437/// Store the tile specified by "src" to memory specifieid by "base" address and
438/// "stride".
439///
440/// \headerfile <immintrin.h>
441///
442/// This intrinsic corresponds to the <c> TILESTORED </c> instruction.
443///
444/// \param base
445///    A pointer to base address.
446/// \param stride
447///    The stride between the rows' data to be stored in memory.
448__DEFAULT_FN_ATTRS_TILE
449static __inline__ void __tile_stored(void *base, __SIZE_TYPE__ stride,
450                                     __tile1024i src) {
451  _tile_stored_internal(src.row, src.col, base, stride, src.tile);
452}
453
454/// Zero the tile specified by "dst".
455///
456/// \headerfile <immintrin.h>
457///
458/// This intrinsic corresponds to the <c> TILEZERO </c> instruction.
459///
460/// \param dst
461///    The destination tile to be zero. Max size is 1024 Bytes.
462__DEFAULT_FN_ATTRS_TILE
463static __inline__ void __tile_zero(__tile1024i *dst) {
464  dst->tile = __builtin_ia32_tilezero_internal(dst->row, dst->col);
465}
466
467/// Compute dot-product of BF16 (16-bit) floating-point pairs in tiles src0 and
468/// src1, accumulating the intermediate single-precision (32-bit) floating-point
469/// elements with elements in "dst", and store the 32-bit result back to tile
470/// "dst".
471///
472/// \headerfile <immintrin.h>
473///
474/// This intrinsic corresponds to the <c> TDPBF16PS </c> instruction.
475///
476/// \param dst
477///    The destination tile. Max size is 1024 Bytes.
478/// \param src0
479///    The 1st source tile. Max size is 1024 Bytes.
480/// \param src1
481///    The 2nd source tile. Max size is 1024 Bytes.
482__DEFAULT_FN_ATTRS_BF16
483static __inline__ void __tile_dpbf16ps(__tile1024i *dst, __tile1024i src0,
484                                       __tile1024i src1) {
485  dst->tile = _tile_dpbf16ps_internal(src0.row, src1.col, src0.col, dst->tile,
486                                      src0.tile, src1.tile);
487}
488
489#undef __DEFAULT_FN_ATTRS_TILE
490#undef __DEFAULT_FN_ATTRS_INT8
491#undef __DEFAULT_FN_ATTRS_BF16
492
493#endif /* __x86_64__ */
494#endif /* __AMXINTRIN_H */