master
  1/*===--------------- sm4intrin.h - SM4 intrinsics -----------------===
  2 *
  3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  4 * See https://llvm.org/LICENSE.txt for license information.
  5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  6 *
  7 *===-----------------------------------------------------------------------===
  8 */
  9
 10#ifndef __IMMINTRIN_H
 11#error "Never use <sm4intrin.h> directly; include <immintrin.h> instead."
 12#endif // __IMMINTRIN_H
 13
 14#ifndef __SM4INTRIN_H
 15#define __SM4INTRIN_H
 16
 17/// This intrinsic performs four rounds of SM4 key expansion. The intrinsic
 18///    operates on independent 128-bit lanes. The calculated results are
 19///    stored in \a dst.
 20/// \headerfile <immintrin.h>
 21///
 22/// \code
 23/// __m128i _mm_sm4key4_epi32(__m128i __A, __m128i __B)
 24/// \endcode
 25///
 26/// This intrinsic corresponds to the \c VSM4KEY4 instruction.
 27///
 28/// \param __A
 29///    A 128-bit vector of [4 x int].
 30/// \param __B
 31///    A 128-bit vector of [4 x int].
 32/// \returns
 33///    A 128-bit vector of [4 x int].
 34///
 35/// \code{.operation}
 36/// DEFINE ROL32(dword, n) {
 37/// 	count := n % 32
 38/// 	dest := (dword << count) | (dword >> (32-count))
 39/// 	RETURN dest
 40/// }
 41/// DEFINE SBOX_BYTE(dword, i) {
 42/// 	RETURN sbox[dword.byte[i]]
 43/// }
 44/// DEFINE lower_t(dword) {
 45/// 	tmp.byte[0] := SBOX_BYTE(dword, 0)
 46/// 	tmp.byte[1] := SBOX_BYTE(dword, 1)
 47/// 	tmp.byte[2] := SBOX_BYTE(dword, 2)
 48/// 	tmp.byte[3] := SBOX_BYTE(dword, 3)
 49/// 	RETURN tmp
 50/// }
 51/// DEFINE L_KEY(dword) {
 52/// 	RETURN dword ^ ROL32(dword, 13) ^ ROL32(dword, 23)
 53/// }
 54/// DEFINE T_KEY(dword) {
 55/// 	RETURN L_KEY(lower_t(dword))
 56/// }
 57/// DEFINE F_KEY(X0, X1, X2, X3, round_key) {
 58/// 	RETURN X0 ^ T_KEY(X1 ^ X2 ^ X3 ^ round_key)
 59/// }
 60/// FOR i:= 0 to 0
 61/// 	P[0] := __B.xmm[i].dword[0]
 62/// 	P[1] := __B.xmm[i].dword[1]
 63/// 	P[2] := __B.xmm[i].dword[2]
 64/// 	P[3] := __B.xmm[i].dword[3]
 65/// 	C[0] := F_KEY(P[0], P[1], P[2], P[3], __A.xmm[i].dword[0])
 66/// 	C[1] := F_KEY(P[1], P[2], P[3], C[0], __A.xmm[i].dword[1])
 67/// 	C[2] := F_KEY(P[2], P[3], C[0], C[1], __A.xmm[i].dword[2])
 68/// 	C[3] := F_KEY(P[3], C[0], C[1], C[2], __A.xmm[i].dword[3])
 69/// 	DEST.xmm[i].dword[0] := C[0]
 70/// 	DEST.xmm[i].dword[1] := C[1]
 71/// 	DEST.xmm[i].dword[2] := C[2]
 72/// 	DEST.xmm[i].dword[3] := C[3]
 73/// ENDFOR
 74/// DEST[MAX:128] := 0
 75/// \endcode
 76#define _mm_sm4key4_epi32(A, B)                                                \
 77  (__m128i) __builtin_ia32_vsm4key4128((__v4su)A, (__v4su)B)
 78
 79/// This intrinsic performs four rounds of SM4 key expansion. The intrinsic
 80///    operates on independent 128-bit lanes. The calculated results are
 81///    stored in \a dst.
 82/// \headerfile <immintrin.h>
 83///
 84/// \code
 85/// __m256i _mm256_sm4key4_epi32(__m256i __A, __m256i __B)
 86/// \endcode
 87///
 88/// This intrinsic corresponds to the \c VSM4KEY4 instruction.
 89///
 90/// \param __A
 91///    A 256-bit vector of [8 x int].
 92/// \param __B
 93///    A 256-bit vector of [8 x int].
 94/// \returns
 95///    A 256-bit vector of [8 x int].
 96///
 97/// \code{.operation}
 98/// DEFINE ROL32(dword, n) {
 99/// 	count := n % 32
100/// 	dest := (dword << count) | (dword >> (32-count))
101/// 	RETURN dest
102/// }
103/// DEFINE SBOX_BYTE(dword, i) {
104/// 	RETURN sbox[dword.byte[i]]
105/// }
106/// DEFINE lower_t(dword) {
107/// 	tmp.byte[0] := SBOX_BYTE(dword, 0)
108/// 	tmp.byte[1] := SBOX_BYTE(dword, 1)
109/// 	tmp.byte[2] := SBOX_BYTE(dword, 2)
110/// 	tmp.byte[3] := SBOX_BYTE(dword, 3)
111/// 	RETURN tmp
112/// }
113/// DEFINE L_KEY(dword) {
114/// 	RETURN dword ^ ROL32(dword, 13) ^ ROL32(dword, 23)
115/// }
116/// DEFINE T_KEY(dword) {
117/// 	RETURN L_KEY(lower_t(dword))
118/// }
119/// DEFINE F_KEY(X0, X1, X2, X3, round_key) {
120/// 	RETURN X0 ^ T_KEY(X1 ^ X2 ^ X3 ^ round_key)
121/// }
122/// FOR i:= 0 to 1
123/// 	P[0] := __B.xmm[i].dword[0]
124/// 	P[1] := __B.xmm[i].dword[1]
125/// 	P[2] := __B.xmm[i].dword[2]
126/// 	P[3] := __B.xmm[i].dword[3]
127/// 	C[0] := F_KEY(P[0], P[1], P[2], P[3], __A.xmm[i].dword[0])
128/// 	C[1] := F_KEY(P[1], P[2], P[3], C[0], __A.xmm[i].dword[1])
129/// 	C[2] := F_KEY(P[2], P[3], C[0], C[1], __A.xmm[i].dword[2])
130/// 	C[3] := F_KEY(P[3], C[0], C[1], C[2], __A.xmm[i].dword[3])
131/// 	DEST.xmm[i].dword[0] := C[0]
132/// 	DEST.xmm[i].dword[1] := C[1]
133/// 	DEST.xmm[i].dword[2] := C[2]
134/// 	DEST.xmm[i].dword[3] := C[3]
135/// ENDFOR
136/// DEST[MAX:256] := 0
137/// \endcode
138#define _mm256_sm4key4_epi32(A, B)                                             \
139  (__m256i) __builtin_ia32_vsm4key4256((__v8su)A, (__v8su)B)
140
/// This intrinsic performs four rounds of SM4 encryption. The intrinsic
///    operates on independent 128-bit lanes. The calculated results are
///    returned.
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_sm4rnds4_epi32(__m128i __A, __m128i __B)
/// \endcode
///
/// This intrinsic corresponds to the \c VSM4RNDS4 instruction.
///
/// \param __A
///    A 128-bit vector of [4 x int]. Its four dwords are the \c round_key
///    inputs of the four rounds in the operation below.
/// \param __B
///    A 128-bit vector of [4 x int]. Its four dwords are the initial words
///    \c P[0] .. \c P[3] in the operation below.
/// \returns
///    A 128-bit vector of [4 x int].
///
/// \code{.operation}
/// DEFINE ROL32(dword, n) {
///   count := n % 32
///   dest := (dword << count) | (dword >> (32-count))
///   RETURN dest
/// }
/// DEFINE SBOX_BYTE(dword, i) {
///   RETURN sbox[dword.byte[i]]
/// }
/// DEFINE lower_t(dword) {
///   tmp.byte[0] := SBOX_BYTE(dword, 0)
///   tmp.byte[1] := SBOX_BYTE(dword, 1)
///   tmp.byte[2] := SBOX_BYTE(dword, 2)
///   tmp.byte[3] := SBOX_BYTE(dword, 3)
///   RETURN tmp
/// }
/// DEFINE L_RND(dword) {
///   tmp := dword
///   tmp := tmp ^ ROL32(dword, 2)
///   tmp := tmp ^ ROL32(dword, 10)
///   tmp := tmp ^ ROL32(dword, 18)
///   tmp := tmp ^ ROL32(dword, 24)
///   RETURN tmp
/// }
/// DEFINE T_RND(dword) {
///   RETURN L_RND(lower_t(dword))
/// }
/// DEFINE F_RND(X0, X1, X2, X3, round_key) {
///   RETURN X0 ^ T_RND(X1 ^ X2 ^ X3 ^ round_key)
/// }
/// FOR i:= 0 to 0
///   P[0] := __B.xmm[i].dword[0]
///   P[1] := __B.xmm[i].dword[1]
///   P[2] := __B.xmm[i].dword[2]
///   P[3] := __B.xmm[i].dword[3]
///   C[0] := F_RND(P[0], P[1], P[2], P[3], __A.xmm[i].dword[0])
///   C[1] := F_RND(P[1], P[2], P[3], C[0], __A.xmm[i].dword[1])
///   C[2] := F_RND(P[2], P[3], C[0], C[1], __A.xmm[i].dword[2])
///   C[3] := F_RND(P[3], C[0], C[1], C[2], __A.xmm[i].dword[3])
///   DEST.xmm[i].dword[0] := C[0]
///   DEST.xmm[i].dword[1] := C[1]
///   DEST.xmm[i].dword[2] := C[2]
///   DEST.xmm[i].dword[3] := C[3]
/// ENDFOR
/// DEST[MAX:128] := 0
/// \endcode
// Both arguments and the whole expansion are parenthesized so that operator
// expressions can be passed as arguments and the result can be used as an
// operand of a larger expression without precedence surprises.
#define _mm_sm4rnds4_epi32(A, B)                                               \
  ((__m128i)__builtin_ia32_vsm4rnds4128((__v4su)(A), (__v4su)(B)))
204
/// This intrinsic performs four rounds of SM4 encryption. The intrinsic
///    operates on independent 128-bit lanes. The calculated results are
///    returned.
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_sm4rnds4_epi32(__m256i __A, __m256i __B)
/// \endcode
///
/// This intrinsic corresponds to the \c VSM4RNDS4 instruction.
///
/// \param __A
///    A 256-bit vector of [8 x int]. Within each 128-bit lane, its four
///    dwords are the \c round_key inputs of the four rounds in the operation
///    below.
/// \param __B
///    A 256-bit vector of [8 x int]. Within each 128-bit lane, its four
///    dwords are the initial words \c P[0] .. \c P[3] in the operation below.
/// \returns
///    A 256-bit vector of [8 x int].
///
/// \code{.operation}
/// DEFINE ROL32(dword, n) {
///   count := n % 32
///   dest := (dword << count) | (dword >> (32-count))
///   RETURN dest
/// }
/// DEFINE SBOX_BYTE(dword, i) {
///   RETURN sbox[dword.byte[i]]
/// }
/// DEFINE lower_t(dword) {
///   tmp.byte[0] := SBOX_BYTE(dword, 0)
///   tmp.byte[1] := SBOX_BYTE(dword, 1)
///   tmp.byte[2] := SBOX_BYTE(dword, 2)
///   tmp.byte[3] := SBOX_BYTE(dword, 3)
///   RETURN tmp
/// }
/// DEFINE L_RND(dword) {
///   tmp := dword
///   tmp := tmp ^ ROL32(dword, 2)
///   tmp := tmp ^ ROL32(dword, 10)
///   tmp := tmp ^ ROL32(dword, 18)
///   tmp := tmp ^ ROL32(dword, 24)
///   RETURN tmp
/// }
/// DEFINE T_RND(dword) {
///   RETURN L_RND(lower_t(dword))
/// }
/// DEFINE F_RND(X0, X1, X2, X3, round_key) {
///   RETURN X0 ^ T_RND(X1 ^ X2 ^ X3 ^ round_key)
/// }
/// FOR i:= 0 to 1
///   P[0] := __B.xmm[i].dword[0]
///   P[1] := __B.xmm[i].dword[1]
///   P[2] := __B.xmm[i].dword[2]
///   P[3] := __B.xmm[i].dword[3]
///   C[0] := F_RND(P[0], P[1], P[2], P[3], __A.xmm[i].dword[0])
///   C[1] := F_RND(P[1], P[2], P[3], C[0], __A.xmm[i].dword[1])
///   C[2] := F_RND(P[2], P[3], C[0], C[1], __A.xmm[i].dword[2])
///   C[3] := F_RND(P[3], C[0], C[1], C[2], __A.xmm[i].dword[3])
///   DEST.xmm[i].dword[0] := C[0]
///   DEST.xmm[i].dword[1] := C[1]
///   DEST.xmm[i].dword[2] := C[2]
///   DEST.xmm[i].dword[3] := C[3]
/// ENDFOR
/// DEST[MAX:256] := 0
/// \endcode
// Both arguments and the whole expansion are parenthesized so that operator
// expressions can be passed as arguments and the result can be used as an
// operand of a larger expression without precedence surprises.
#define _mm256_sm4rnds4_epi32(A, B)                                            \
  ((__m256i)__builtin_ia32_vsm4rnds4256((__v8su)(A), (__v8su)(B)))
268
269#endif // __SM4INTRIN_H