master
  1/* ===--- amxmovrstransposeintrin.h - AMX_MOVRS_TRANSPOSE intrinsics --------===
  2 *
  3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  4 * See https://llvm.org/LICENSE.txt for license information.
  5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  6 *
  7 * ===-----------------------------------------------------------------------===
  8 */
  9
 10#ifndef __IMMINTRIN_H
 11#error                                                                         \
 12    "Never use <amxmovrstransposeintrin.h> directly; use <immintrin.h> instead."
 13#endif /* __IMMINTRIN_H */
 14
 15#ifndef __AMX_MOVRS_TRANSPOSEINTRIN_H
 16#define __AMX_MOVRS_TRANSPOSEINTRIN_H
 17#ifdef __x86_64__
 18
 19#define __DEFAULT_FN_ATTRS                                                     \
 20  __attribute__((__always_inline__, __nodebug__,                               \
 21                 __target__("amx-transpose,amx-movrs")))
 22
 23#define _tile_2rpntlvwz0rs(tdst, base, stride)                                 \
 24  __builtin_ia32_t2rpntlvwz0rs(tdst, base, stride)
 25#define _tile_2rpntlvwz0rst1(tdst, base, stride)                               \
 26  __builtin_ia32_t2rpntlvwz0rst1(tdst, base, stride)
 27#define _tile_2rpntlvwz1rs(tdst, base, stride)                                 \
 28  __builtin_ia32_t2rpntlvwz1rs(tdst, base, stride)
 29#define _tile_2rpntlvwz1rst1(tdst, base, stride)                               \
 30  __builtin_ia32_t2rpntlvwz1rst1(tdst, base, stride)
 31
 32static __inline__ void __DEFAULT_FN_ATTRS _tile_2rpntlvwz0rs_internal(
 33    unsigned short row, unsigned short col0, unsigned short col1,
 34    _tile1024i *dst0, _tile1024i *dst1, const void *base,
 35    __SIZE_TYPE__ stride) {
 36  // Use __tile1024i_1024a* to escape the alignment check in
 37  // clang/test/Headers/x86-intrinsics-headers-clean.cpp
 38  __builtin_ia32_t2rpntlvwz0rs_internal(
 39      row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base,
 40      (__SIZE_TYPE__)(stride));
 41}
 42
 43static __inline__ void __DEFAULT_FN_ATTRS _tile_2rpntlvwz0rst1_internal(
 44    unsigned short row, unsigned short col0, unsigned short col1,
 45    _tile1024i *dst0, _tile1024i *dst1, const void *base,
 46    __SIZE_TYPE__ stride) {
 47  __builtin_ia32_t2rpntlvwz0rst1_internal(
 48      row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base,
 49      (__SIZE_TYPE__)(stride));
 50}
 51
 52static __inline__ void __DEFAULT_FN_ATTRS _tile_2rpntlvwz1rs_internal(
 53    unsigned short row, unsigned short col0, unsigned short col1,
 54    _tile1024i *dst0, _tile1024i *dst1, const void *base,
 55    __SIZE_TYPE__ stride) {
 56  __builtin_ia32_t2rpntlvwz1rs_internal(
 57      row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base,
 58      (__SIZE_TYPE__)(stride));
 59}
 60
 61static __inline__ void __DEFAULT_FN_ATTRS _tile_2rpntlvwz1rst1_internal(
 62    unsigned short row, unsigned short col0, unsigned short col1,
 63    _tile1024i *dst0, _tile1024i *dst1, const void *base,
 64    __SIZE_TYPE__ stride) {
 65  __builtin_ia32_t2rpntlvwz1rst1_internal(
 66      row, col0, col1, (_tile1024i_1024a *)dst0, (_tile1024i_1024a *)dst1, base,
 67      (__SIZE_TYPE__)(stride));
 68}
 69
 70/// Converts a pair of tiles from memory into VNNI format, and places the
 71/// results in a pair of destinations specified by dst. The pair of tiles
 72/// in memory is specified via a tsib; the second tile is after the first
 73/// one, separated by the same stride that separates each row.
 74/// The tile configuration for the destination tiles indicates the amount
 75/// of data to read from memory. The instruction will load a number of rows
 76/// that is equal to twice the number of rows in tmm1. The size of each row
 77/// is equal to the average width of the destination tiles. If the second
 78/// tile is configured with zero rows and columns, only the first tile will
 79/// be written.
 80/// Provides a hint to the implementation that the data will likely become
 81/// read shared in the near future and the data caching can be optimized.
 82///
 83/// \headerfile <immintrin.h>
 84///
 85/// This intrinsic corresponds to the <c> T2RPNTLVWZ0RS </c> instruction.
 86///
 87/// \param dst0
 88///    First tile of destination tile pair. Max size is 1024i*2 Bytes.
 89/// \param dst1
 90///    Second tile of destination tile pair. Max size is 1024i*2 Bytes.
 91/// \param base
 92///    A pointer to base address.
 93/// \param stride
 94///    The stride between the rows' data to be loaded in memory.
 95__DEFAULT_FN_ATTRS
 96static void __tile_2rpntlvwz0rs(__tile1024i *dst0, __tile1024i *dst1,
 97                                const void *base, __SIZE_TYPE__ stride) {
 98  _tile_2rpntlvwz0rs_internal(dst0->row, dst0->col, dst1->col, &dst0->tile,
 99                              &dst1->tile, base, stride);
100}
101
102/// Converts a pair of tiles from memory into VNNI format, and places the
103/// results in a pair of destinations specified by dst. The pair of tiles
104/// in memory is specified via a tsib; the second tile is after the first
105/// one, separated by the same stride that separates each row.
106/// The tile configuration for the destination tiles indicates the amount
107/// of data to read from memory. The instruction will load a number of rows
108/// that is equal to twice the number of rows in tmm1. The size of each row
109/// is equal to the average width of the destination tiles. If the second
110/// tile is configured with zero rows and columns, only the first tile will
111/// be written.
112///
113/// \headerfile <immintrin.h>
114///
115/// This intrinsic corresponds to the <c> T2RPNTLVWZ0T1RS </c> instruction.
116///
117/// \param dst0
118///    First tile of destination tile pair. Max size is 1024i*2 Bytes.
119/// \param dst1
120///    Second tile of destination tile pair. Max size is 1024i*2 Bytes.
121/// \param base
122///    A pointer to base address.
123/// \param stride
124///    The stride between the rows' data to be loaded in memory.
125__DEFAULT_FN_ATTRS
126static void __tile_2rpntlvwz0rst1(__tile1024i *dst0, __tile1024i *dst1,
127                                  const void *base, __SIZE_TYPE__ stride) {
128  _tile_2rpntlvwz0rst1_internal(dst0->row, dst0->col, dst1->col, &dst0->tile,
129                                &dst1->tile, base, stride);
130}
131
132/// Converts a pair of tiles from memory into VNNI format, and places the
133/// results in a pair of destinations specified by dst. The pair of tiles
134/// in memory is specified via a tsib; the second tile is after the first
135/// one, separated by the same stride that separates each row.
136/// The tile configuration for the destination tiles indicates the amount
137/// of data to read from memory. The instruction will load a number of rows
138/// that is equal to twice the number of rows in tmm1. The size of each row
139/// is equal to the average width of the destination tiles. If the second
140/// tile is configured with zero rows and columns, only the first tile will
141/// be written. The last row will be not be read from memory but instead
142/// filled with zeros.
143/// Provides a hint to the implementation that the data will likely become
144/// read shared in the near future and the data caching can be optimized.
145///
146/// \headerfile <immintrin.h>
147///
148/// This intrinsic corresponds to the <c> T2RPNTLVWZ1 </c> instruction.
149///
150/// \param dst0
151///    First tile of destination tile pair. Max size is 1024i*2 Bytes.
152/// \param dst1
153///    Second tile of destination tile pair. Max size is 1024i*2 Bytes.
154/// \param base
155///    A pointer to base address.
156/// \param stride
157///    The stride between the rows' data to be loaded in memory.
158__DEFAULT_FN_ATTRS
159static void __tile_2rpntlvwz1rs(__tile1024i *dst0, __tile1024i *dst1,
160                                const void *base, __SIZE_TYPE__ stride) {
161  _tile_2rpntlvwz1rs_internal(dst0->row, dst0->col, dst1->col, &dst0->tile,
162                              &dst1->tile, base, stride);
163}
164
165/// Converts a pair of tiles from memory into VNNI format, and places the
166/// results in a pair of destinations specified by dst. The pair of tiles
167/// in memory is specified via a tsib; the second tile is after the first
168/// one, separated by the same stride that separates each row.
169/// The tile configuration for the destination tiles indicates the amount
170/// of data to read from memory. The instruction will load a number of rows
171/// that is equal to twice the number of rows in tmm1. The size of each row
172/// is equal to the average width of the destination tiles. If the second
173/// tile is configured with zero rows and columns, only the first tile will
174/// be written. The last row will be not be read from memory but instead
175/// filled with zeros.
176/// Provides a hint to the implementation that the data will likely become
177/// read shared in the near future and the data caching can be optimized.
178///
179/// \headerfile <immintrin.h>
180///
181/// This intrinsic corresponds to the <c> T2RPNTLVWZ1T1RS </c> instruction.
182///
183/// \param dst0
184///    First tile of destination tile pair. Max size is 1024i*2 Bytes.
185/// \param dst1
186///    Second tile of destination tile pair. Max size is 1024i*2 Bytes.
187/// \param base
188///    A pointer to base address.
189/// \param stride
190///    The stride between the rows' data to be loaded in memory.
191__DEFAULT_FN_ATTRS
192static void __tile_2rpntlvwz1rst1(__tile1024i *dst0, __tile1024i *dst1,
193                                  const void *base, __SIZE_TYPE__ stride) {
194  _tile_2rpntlvwz1rst1_internal(dst0->row, dst0->col, dst1->col, &dst0->tile,
195                                &dst1->tile, base, stride);
196}
197
198#undef __DEFAULT_FN_ATTRS
199#endif /* __x86_64__ */
200#endif /* __AMX_MOVRS_TRANSPOSEINTRIN_H */