master
  1/*===--------- amxcomplexintrin.h - AMXCOMPLEX intrinsics -*- C++ -*---------===
  2 *
  3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  4 * See https://llvm.org/LICENSE.txt for license information.
  5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  6 *
  7 *===------------------------------------------------------------------------===
  8 */
  9
 10#ifndef __IMMINTRIN_H
 11#error "Never use <amxcomplexintrin.h> directly; include <immintrin.h> instead."
 12#endif // __IMMINTRIN_H
 13
 14#ifndef __AMX_COMPLEXINTRIN_H
 15#define __AMX_COMPLEXINTRIN_H
 16#ifdef __x86_64__
 17
 18#define __DEFAULT_FN_ATTRS_COMPLEX                                             \
 19  __attribute__((__always_inline__, __nodebug__, __target__("amx-complex")))
 20
 21/// Perform matrix multiplication of two tiles containing complex elements and
 22///    accumulate the results into a packed single precision tile. Each dword
 23///    element in input tiles \a a and \a b is interpreted as a complex number
 24///    with FP16 real part and FP16 imaginary part.
 25/// Calculates the imaginary part of the result. For each possible combination
 26///    of (row of \a a, column of \a b), it performs a set of multiplication
 27///    and accumulations on all corresponding complex numbers (one from \a a
 28///    and one from \a b). The imaginary part of the \a a element is multiplied
 29///    with the real part of the corresponding \a b element, and the real part
 30///    of the \a a element is multiplied with the imaginary part of the
 31///    corresponding \a b elements. The two accumulated results are added, and
 32///    then accumulated into the corresponding row and column of \a dst.
 33///
 34/// \headerfile <x86intrin.h>
 35///
 36/// \code
 37/// void _tile_cmmimfp16ps(__tile dst, __tile a, __tile b);
 38/// \endcode
 39///
 40/// \code{.operation}
 41/// FOR m := 0 TO dst.rows - 1
 42///	tmp := dst.row[m]
 43///	FOR k := 0 TO (a.colsb / 4) - 1
 44///		FOR n := 0 TO (dst.colsb / 4) - 1
 45///			tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) * FP32(b.row[k].fp16[2*n+1])
 46///			tmp.fp32[n] += FP32(a.row[m].fp16[2*k+1]) * FP32(b.row[k].fp16[2*n+0])
 47///		ENDFOR
 48///	ENDFOR
 49///	write_row_and_zero(dst, m, tmp, dst.colsb)
 50/// ENDFOR
 51/// zero_upper_rows(dst, dst.rows)
 52/// zero_tileconfig_start()
 53/// \endcode
 54///
 55/// This intrinsic corresponds to the \c TCMMIMFP16PS instruction.
 56///
 57/// \param dst
 58///    The destination tile. Max size is 1024 Bytes.
 59/// \param a
 60///    The 1st source tile. Max size is 1024 Bytes.
 61/// \param b
 62///    The 2nd source tile. Max size is 1024 Bytes.
 63#define _tile_cmmimfp16ps(dst, a, b) __builtin_ia32_tcmmimfp16ps(dst, a, b)
 64
 65/// Perform matrix multiplication of two tiles containing complex elements and
 66///    accumulate the results into a packed single precision tile. Each dword
 67///    element in input tiles \a a and \a b is interpreted as a complex number
 68///    with FP16 real part and FP16 imaginary part.
 69/// Calculates the real part of the result. For each possible combination
 70///    of (row of \a a, column of \a b), it performs a set of multiplication
 71///    and accumulations on all corresponding complex numbers (one from \a a
 72///    and one from \a b). The real part of the \a a element is multiplied
 73///    with the real part of the corresponding \a b element, and the negated
 74///    imaginary part of the \a a element is multiplied with the imaginary
 75///    part of the corresponding \a b elements. The two accumulated results
 76///    are added, and then accumulated into the corresponding row and column
 77///    of \a dst.
 78///
 79/// \headerfile <x86intrin.h>
 80///
 81/// \code
 82/// void _tile_cmmrlfp16ps(__tile dst, __tile a, __tile b);
 83/// \endcode
 84///
 85/// \code{.operation}
 86/// FOR m := 0 TO dst.rows - 1
 87///	tmp := dst.row[m]
 88///	FOR k := 0 TO (a.colsb / 4) - 1
 89///		FOR n := 0 TO (dst.colsb / 4) - 1
 90///			tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) * FP32(b.row[k].fp16[2*n+0])
 91///			tmp.fp32[n] += FP32(-a.row[m].fp16[2*k+1]) * FP32(b.row[k].fp16[2*n+1])
 92///		ENDFOR
 93///	ENDFOR
 94///	write_row_and_zero(dst, m, tmp, dst.colsb)
 95/// ENDFOR
 96/// zero_upper_rows(dst, dst.rows)
 97/// zero_tileconfig_start()
 98/// \endcode
 99///
100/// This intrinsic corresponds to the \c TCMMIMFP16PS instruction.
101///
102/// \param dst
103///    The destination tile. Max size is 1024 Bytes.
104/// \param a
105///    The 1st source tile. Max size is 1024 Bytes.
106/// \param b
107///    The 2nd source tile. Max size is 1024 Bytes.
108#define _tile_cmmrlfp16ps(dst, a, b) __builtin_ia32_tcmmrlfp16ps(dst, a, b)
109
110static __inline__ _tile1024i __DEFAULT_FN_ATTRS_COMPLEX
111_tile_cmmimfp16ps_internal(unsigned short m, unsigned short n, unsigned short k,
112                           _tile1024i dst, _tile1024i src1, _tile1024i src2) {
113  return __builtin_ia32_tcmmimfp16ps_internal(m, n, k, dst, src1, src2);
114}
115
116static __inline__ _tile1024i __DEFAULT_FN_ATTRS_COMPLEX
117_tile_cmmrlfp16ps_internal(unsigned short m, unsigned short n, unsigned short k,
118                           _tile1024i dst, _tile1024i src1, _tile1024i src2) {
119  return __builtin_ia32_tcmmrlfp16ps_internal(m, n, k, dst, src1, src2);
120}
121
122/// Perform matrix multiplication of two tiles containing complex elements and
123/// accumulate the results into a packed single precision tile. Each dword
124/// element in input tiles src0 and src1 is interpreted as a complex number with
125/// FP16 real part and FP16 imaginary part.
126/// This function calculates the imaginary part of the result.
127///
128/// \headerfile <immintrin.h>
129///
130/// This intrinsic corresponds to the <c> TCMMIMFP16PS </c> instruction.
131///
132/// \param dst
133///    The destination tile. Max size is 1024 Bytes.
134/// \param src0
135///    The 1st source tile. Max size is 1024 Bytes.
136/// \param src1
137///    The 2nd source tile. Max size is 1024 Bytes.
138static __inline__ void __DEFAULT_FN_ATTRS_COMPLEX
139__tile_cmmimfp16ps(__tile1024i *dst, __tile1024i src0, __tile1024i src1) {
140  dst->tile = _tile_cmmimfp16ps_internal(src0.row, src1.col, src0.col,
141                                         dst->tile, src0.tile, src1.tile);
142}
143
144/// Perform matrix multiplication of two tiles containing complex elements and
145/// accumulate the results into a packed single precision tile. Each dword
146/// element in input tiles src0 and src1 is interpreted as a complex number with
147/// FP16 real part and FP16 imaginary part.
148/// This function calculates the real part of the result.
149///
150/// \headerfile <immintrin.h>
151///
152/// This intrinsic corresponds to the <c> TCMMRLFP16PS </c> instruction.
153///
154/// \param dst
155///    The destination tile. Max size is 1024 Bytes.
156/// \param src0
157///    The 1st source tile. Max size is 1024 Bytes.
158/// \param src1
159///    The 2nd source tile. Max size is 1024 Bytes.
160static __inline__ void __DEFAULT_FN_ATTRS_COMPLEX
161__tile_cmmrlfp16ps(__tile1024i *dst, __tile1024i src0, __tile1024i src1) {
162  dst->tile = _tile_cmmrlfp16ps_internal(src0.row, src1.col, src0.col,
163                                         dst->tile, src0.tile, src1.tile);
164}
165
166#endif // __x86_64__
167#endif // __AMX_COMPLEXINTRIN_H