zig/lib/include/fmaintrin.h at master

  1/*===---- fmaintrin.h - FMA intrinsics -------------------------------------===
  2 *
  3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  4 * See https://llvm.org/LICENSE.txt for license information.
  5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  6 *
  7 *===-----------------------------------------------------------------------===
  8 */
  9
 10#ifndef __IMMINTRIN_H
 11#error "Never use <fmaintrin.h> directly; include <immintrin.h> instead."
 12#endif
 13
 14#ifndef __FMAINTRIN_H
 15#define __FMAINTRIN_H
 16
 /* Default attributes for the functions defined in this file. */
/* The two sets are identical except for __min_vector_width__: 128 for the
 * functions declared with __DEFAULT_FN_ATTRS128 and 256 for the functions
 * declared with __DEFAULT_FN_ATTRS256. */
#define __DEFAULT_FN_ATTRS128                                                  \
  __attribute__((__always_inline__, __nodebug__, __target__("fma"),            \
                 __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256                                                  \
  __attribute__((__always_inline__, __nodebug__, __target__("fma"),            \
                 __min_vector_width__(256)))
 20
 21/// Computes a multiply-add of 128-bit vectors of [4 x float].
 22///    For each element, computes <c> (__A * __B) + __C </c>.
 23///
 24/// \headerfile <immintrin.h>
 25///
 26/// This intrinsic corresponds to the \c VFMADD213PS instruction.
 27///
 28/// \param __A
 29///    A 128-bit vector of [4 x float] containing the multiplicand.
 30/// \param __B
 31///    A 128-bit vector of [4 x float] containing the multiplier.
 32/// \param __C
 33///    A 128-bit vector of [4 x float] containing the addend.
 34/// \returns A 128-bit vector of [4 x float] containing the result.
 35static __inline__ __m128 __DEFAULT_FN_ATTRS128
 36_mm_fmadd_ps(__m128 __A, __m128 __B, __m128 __C)
 37{
 38  return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
 39}
 40
 41/// Computes a multiply-add of 128-bit vectors of [2 x double].
 42///    For each element, computes <c> (__A * __B) + __C </c>.
 43///
 44/// \headerfile <immintrin.h>
 45///
 46/// This intrinsic corresponds to the \c VFMADD213PD instruction.
 47///
 48/// \param __A
 49///    A 128-bit vector of [2 x double] containing the multiplicand.
 50/// \param __B
 51///    A 128-bit vector of [2 x double] containing the multiplier.
 52/// \param __C
 53///    A 128-bit vector of [2 x double] containing the addend.
 54/// \returns A 128-bit [2 x double] vector containing the result.
 55static __inline__ __m128d __DEFAULT_FN_ATTRS128
 56_mm_fmadd_pd(__m128d __A, __m128d __B, __m128d __C)
 57{
 58  return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
 59}
 60
 61/// Computes a scalar multiply-add of the single-precision values in the
 62///    low 32 bits of 128-bit vectors of [4 x float].
 63///
 64/// \code{.operation}
 65/// result[31:0] = (__A[31:0] * __B[31:0]) + __C[31:0]
 66/// result[127:32] = __A[127:32]
 67/// \endcode
 68///
 69/// \headerfile <immintrin.h>
 70///
 71/// This intrinsic corresponds to the \c VFMADD213SS instruction.
 72///
 73/// \param __A
 74///    A 128-bit vector of [4 x float] containing the multiplicand in the low
 75///    32 bits.
 76/// \param __B
 77///    A 128-bit vector of [4 x float] containing the multiplier in the low
 78///    32 bits.
 79/// \param __C
 80///    A 128-bit vector of [4 x float] containing the addend in the low
 81///    32 bits.
 82/// \returns A 128-bit vector of [4 x float] containing the result in the low
 83///    32 bits and a copy of \a __A[127:32] in the upper 96 bits.
 84static __inline__ __m128 __DEFAULT_FN_ATTRS128
 85_mm_fmadd_ss(__m128 __A, __m128 __B, __m128 __C)
 86{
 87  return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
 88}
 89
 90/// Computes a scalar multiply-add of the double-precision values in the
 91///    low 64 bits of 128-bit vectors of [2 x double].
 92///
 93/// \code{.operation}
 94/// result[63:0] = (__A[63:0] * __B[63:0]) + __C[63:0]
 95/// result[127:64] = __A[127:64]
 96/// \endcode
 97///
 98/// \headerfile <immintrin.h>
 99///
100/// This intrinsic corresponds to the \c VFMADD213SD instruction.
101///
102/// \param __A
103///    A 128-bit vector of [2 x double] containing the multiplicand in the low
104///    64 bits.
105/// \param __B
106///    A 128-bit vector of [2 x double] containing the multiplier in the low
107///    64 bits.
108/// \param __C
109///    A 128-bit vector of [2 x double] containing the addend in the low
110///    64 bits.
111/// \returns A 128-bit vector of [2 x double] containing the result in the low
112///    64 bits and a copy of \a __A[127:64] in the upper 64 bits.
113static __inline__ __m128d __DEFAULT_FN_ATTRS128
114_mm_fmadd_sd(__m128d __A, __m128d __B, __m128d __C)
115{
116  return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, (__v2df)__B, (__v2df)__C);
117}
118
119/// Computes a multiply-subtract of 128-bit vectors of [4 x float].
120///    For each element, computes <c> (__A * __B) - __C </c>.
121///
122/// \headerfile <immintrin.h>
123///
124/// This intrinsic corresponds to the \c VFMSUB213PS instruction.
125///
126/// \param __A
127///    A 128-bit vector of [4 x float] containing the multiplicand.
128/// \param __B
129///    A 128-bit vector of [4 x float] containing the multiplier.
130/// \param __C
131///    A 128-bit vector of [4 x float] containing the subtrahend.
132/// \returns A 128-bit vector of [4 x float] containing the result.
133static __inline__ __m128 __DEFAULT_FN_ATTRS128
134_mm_fmsub_ps(__m128 __A, __m128 __B, __m128 __C)
135{
136  return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
137}
138
139/// Computes a multiply-subtract of 128-bit vectors of [2 x double].
140///    For each element, computes <c> (__A * __B) - __C </c>.
141///
142/// \headerfile <immintrin.h>
143///
144/// This intrinsic corresponds to the \c VFMSUB213PD instruction.
145///
146/// \param __A
147///    A 128-bit vector of [2 x double] containing the multiplicand.
148/// \param __B
149///    A 128-bit vector of [2 x double] containing the multiplier.
150/// \param __C
151///    A 128-bit vector of [2 x double] containing the addend.
152/// \returns A 128-bit vector of [2 x double] containing the result.
153static __inline__ __m128d __DEFAULT_FN_ATTRS128
154_mm_fmsub_pd(__m128d __A, __m128d __B, __m128d __C)
155{
156  return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, -(__v2df)__C);
157}
158
159/// Computes a scalar multiply-subtract of the single-precision values in
160///    the low 32 bits of 128-bit vectors of [4 x float].
161///
162/// \code{.operation}
163/// result[31:0] = (__A[31:0] * __B[31:0]) - __C[31:0]
164/// result[127:32] = __A[127:32]
165/// \endcode
166///
167/// \headerfile <immintrin.h>
168///
169/// This intrinsic corresponds to the \c VFMSUB213SS instruction.
170///
171/// \param __A
172///    A 128-bit vector of [4 x float] containing the multiplicand in the low
173///    32 bits.
174/// \param __B
175///    A 128-bit vector of [4 x float] containing the multiplier in the low
176///    32 bits.
177/// \param __C
178///    A 128-bit vector of [4 x float] containing the subtrahend in the low
179///   32 bits.
180/// \returns A 128-bit vector of [4 x float] containing the result in the low
181///    32 bits, and a copy of \a __A[127:32] in the upper 96 bits.
182static __inline__ __m128 __DEFAULT_FN_ATTRS128
183_mm_fmsub_ss(__m128 __A, __m128 __B, __m128 __C)
184{
185  return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
186}
187
188/// Computes a scalar multiply-subtract of the double-precision values in
189///    the low 64 bits of 128-bit vectors of [2 x double].
190///
191/// \code{.operation}
192/// result[63:0] = (__A[63:0] * __B[63:0]) - __C[63:0]
193/// result[127:64] = __A[127:64]
194/// \endcode
195///
196/// \headerfile <immintrin.h>
197///
198/// This intrinsic corresponds to the \c VFMSUB213SD instruction.
199///
200/// \param __A
201///    A 128-bit vector of [2 x double] containing the multiplicand in the low
202///    64 bits.
203/// \param __B
204///    A 128-bit vector of [2 x double] containing the multiplier in the low
205///    64 bits.
206/// \param __C
207///    A 128-bit vector of [2 x double] containing the subtrahend in the low
208///    64 bits.
209/// \returns A 128-bit vector of [2 x double] containing the result in the low
210///    64 bits, and a copy of \a __A[127:64] in the upper 64 bits.
211static __inline__ __m128d __DEFAULT_FN_ATTRS128
212_mm_fmsub_sd(__m128d __A, __m128d __B, __m128d __C)
213{
214  return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, (__v2df)__B, -(__v2df)__C);
215}
216
217/// Computes a negated multiply-add of 128-bit vectors of [4 x float].
218///    For each element, computes <c> -(__A * __B) + __C </c>.
219///
220/// \headerfile <immintrin.h>
221///
222/// This intrinsic corresponds to the \c VFNMADD213DPS instruction.
223///
224/// \param __A
225///    A 128-bit vector of [4 x float] containing the multiplicand.
226/// \param __B
227///    A 128-bit vector of [4 x float] containing the multiplier.
228/// \param __C
229///    A 128-bit vector of [4 x float] containing the addend.
230/// \returns A 128-bit [4 x float] vector containing the result.
231static __inline__ __m128 __DEFAULT_FN_ATTRS128
232_mm_fnmadd_ps(__m128 __A, __m128 __B, __m128 __C)
233{
234  return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
235}
236
237/// Computes a negated multiply-add of 128-bit vectors of [2 x double].
238///    For each element, computes <c> -(__A * __B) + __C </c>.
239///
240/// \headerfile <immintrin.h>
241///
242/// This intrinsic corresponds to the \c VFNMADD213PD instruction.
243///
244/// \param __A
245///    A 128-bit vector of [2 x double] containing the multiplicand.
246/// \param __B
247///    A 128-bit vector of [2 x double] containing the multiplier.
248/// \param __C
249///    A 128-bit vector of [2 x double] containing the addend.
250/// \returns A 128-bit vector of [2 x double] containing the result.
251static __inline__ __m128d __DEFAULT_FN_ATTRS128
252_mm_fnmadd_pd(__m128d __A, __m128d __B, __m128d __C)
253{
254  return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, (__v2df)__C);
255}
256
257/// Computes a scalar negated multiply-add of the single-precision values in
258///    the low 32 bits of 128-bit vectors of [4 x float].
259///
260/// \code{.operation}
261/// result[31:0] = -(__A[31:0] * __B[31:0]) + __C[31:0]
262/// result[127:32] = __A[127:32]
263/// \endcode
264///
265/// \headerfile <immintrin.h>
266///
267/// This intrinsic corresponds to the \c VFNMADD213SS instruction.
268///
269/// \param __A
270///    A 128-bit vector of [4 x float] containing the multiplicand in the low
271///    32 bits.
272/// \param __B
273///    A 128-bit vector of [4 x float] containing the multiplier in the low
274///    32 bits.
275/// \param __C
276///    A 128-bit vector of [4 x float] containing the addend in the low
277///    32 bits.
278/// \returns A 128-bit vector of [4 x float] containing the result in the low
279///    32 bits, and a copy of \a __A[127:32] in the upper 96 bits.
280static __inline__ __m128 __DEFAULT_FN_ATTRS128
281_mm_fnmadd_ss(__m128 __A, __m128 __B, __m128 __C)
282{
283  return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, -(__v4sf)__B, (__v4sf)__C);
284}
285
286/// Computes a scalar negated multiply-add of the double-precision values
287///    in the low 64 bits of 128-bit vectors of [2 x double].
288///
289/// \code{.operation}
290/// result[63:0] = -(__A[63:0] * __B[63:0]) + __C[63:0]
291/// result[127:64] = __A[127:64]
292/// \endcode
293///
294/// \headerfile <immintrin.h>
295///
296/// This intrinsic corresponds to the \c VFNMADD213SD instruction.
297///
298/// \param __A
299///    A 128-bit vector of [2 x double] containing the multiplicand in the low
300///    64 bits.
301/// \param __B
302///    A 128-bit vector of [2 x double] containing the multiplier in the low
303///    64 bits.
304/// \param __C
305///    A 128-bit vector of [2 x double] containing the addend in the low
306///    64 bits.
307/// \returns A 128-bit vector of [2 x double] containing the result in the low
308///    64 bits, and a copy of \a __A[127:64] in the upper 64 bits.
309static __inline__ __m128d __DEFAULT_FN_ATTRS128
310_mm_fnmadd_sd(__m128d __A, __m128d __B, __m128d __C)
311{
312  return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, -(__v2df)__B, (__v2df)__C);
313}
314
315/// Computes a negated multiply-subtract of 128-bit vectors of [4 x float].
316///    For each element, computes <c> -(__A * __B) - __C </c>.
317///
318/// \headerfile <immintrin.h>
319///
320/// This intrinsic corresponds to the \c VFNMSUB213PS instruction.
321///
322/// \param __A
323///    A 128-bit vector of [4 x float] containing the multiplicand.
324/// \param __B
325///    A 128-bit vector of [4 x float] containing the multiplier.
326/// \param __C
327///    A 128-bit vector of [4 x float] containing the subtrahend.
328/// \returns A 128-bit vector of [4 x float] containing the result.
329static __inline__ __m128 __DEFAULT_FN_ATTRS128
330_mm_fnmsub_ps(__m128 __A, __m128 __B, __m128 __C)
331{
332  return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
333}
334
335/// Computes a negated multiply-subtract of 128-bit vectors of [2 x double].
336///    For each element, computes <c> -(__A * __B) - __C </c>.
337///
338/// \headerfile <immintrin.h>
339///
340/// This intrinsic corresponds to the \c VFNMSUB213PD instruction.
341///
342/// \param __A
343///    A 128-bit vector of [2 x double] containing the multiplicand.
344/// \param __B
345///    A 128-bit vector of [2 x double] containing the multiplier.
346/// \param __C
347///    A 128-bit vector of [2 x double] containing the subtrahend.
348/// \returns A 128-bit vector of [2 x double] containing the result.
349static __inline__ __m128d __DEFAULT_FN_ATTRS128
350_mm_fnmsub_pd(__m128d __A, __m128d __B, __m128d __C)
351{
352  return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, -(__v2df)__C);
353}
354
355/// Computes a scalar negated multiply-subtract of the single-precision
356///    values in the low 32 bits of 128-bit vectors of [4 x float].
357///
358/// \code{.operation}
359/// result[31:0] = -(__A[31:0] * __B[31:0]) - __C[31:0]
360/// result[127:32] = __A[127:32]
361/// \endcode
362///
363/// \headerfile <immintrin.h>
364///
365/// This intrinsic corresponds to the \c VFNMSUB213SS instruction.
366///
367/// \param __A
368///    A 128-bit vector of [4 x float] containing the multiplicand in the low
369///    32 bits.
370/// \param __B
371///    A 128-bit vector of [4 x float] containing the multiplier in the low
372///    32 bits.
373/// \param __C
374///    A 128-bit vector of [4 x float] containing the subtrahend in the low
375///    32 bits.
376/// \returns A 128-bit vector of [4 x float] containing the result in the low
377///    32 bits, and a copy of \a __A[127:32] in the upper 96 bits.
378static __inline__ __m128 __DEFAULT_FN_ATTRS128
379_mm_fnmsub_ss(__m128 __A, __m128 __B, __m128 __C)
380{
381  return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, -(__v4sf)__B, -(__v4sf)__C);
382}
383
384/// Computes a scalar negated multiply-subtract of the double-precision
385///    values in the low 64 bits of 128-bit vectors of [2 x double].
386///
387/// \code{.operation}
388/// result[63:0] = -(__A[63:0] * __B[63:0]) - __C[63:0]
389/// result[127:64] = __A[127:64]
390/// \endcode
391///
392/// \headerfile <immintrin.h>
393///
394/// This intrinsic corresponds to the \c VFNMSUB213SD instruction.
395///
396/// \param __A
397///    A 128-bit vector of [2 x double] containing the multiplicand in the low
398///    64 bits.
399/// \param __B
400///    A 128-bit vector of [2 x double] containing the multiplier in the low
401///    64 bits.
402/// \param __C
403///    A 128-bit vector of [2 x double] containing the subtrahend in the low
404///    64 bits.
405/// \returns A 128-bit vector of [2 x double] containing the result in the low
406///    64 bits, and a copy of \a __A[127:64] in the upper 64 bits.
407static __inline__ __m128d __DEFAULT_FN_ATTRS128
408_mm_fnmsub_sd(__m128d __A, __m128d __B, __m128d __C)
409{
410  return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, -(__v2df)__B, -(__v2df)__C);
411}
412
413/// Computes a multiply with alternating add/subtract of 128-bit vectors of
414///    [4 x float].
415///
416/// \code{.operation}
417/// result[31:0]  = (__A[31:0] * __B[31:0]) - __C[31:0]
418/// result[63:32] = (__A[63:32] * __B[63:32]) + __C[63:32]
419/// result[95:64] = (__A[95:64] * __B[95:64]) - __C[95:64]
420/// result[127:96] = (__A[127:96] * __B[127:96]) + __C[127:96]
421/// \endcode
422///
423/// \headerfile <immintrin.h>
424///
425/// This intrinsic corresponds to the \c VFMADDSUB213PS instruction.
426///
427/// \param __A
428///    A 128-bit vector of [4 x float] containing the multiplicand.
429/// \param __B
430///    A 128-bit vector of [4 x float] containing the multiplier.
431/// \param __C
432///    A 128-bit vector of [4 x float] containing the addend/subtrahend.
433/// \returns A 128-bit vector of [4 x float] containing the result.
434static __inline__ __m128 __DEFAULT_FN_ATTRS128
435_mm_fmaddsub_ps(__m128 __A, __m128 __B, __m128 __C)
436{
437  return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
438}
439
440/// Computes a multiply with alternating add/subtract of 128-bit vectors of
441///    [2 x double].
442///
443/// \code{.operation}
444/// result[63:0]  = (__A[63:0] * __B[63:0]) - __C[63:0]
445/// result[127:64] = (__A[127:64] * __B[127:64]) + __C[127:64]
446/// \endcode
447///
448/// \headerfile <immintrin.h>
449///
450/// This intrinsic corresponds to the \c VFMADDSUB213PD instruction.
451///
452/// \param __A
453///    A 128-bit vector of [2 x double] containing the multiplicand.
454/// \param __B
455///    A 128-bit vector of [2 x double] containing the multiplier.
456/// \param __C
457///    A 128-bit vector of [2 x double] containing the addend/subtrahend.
458/// \returns A 128-bit vector of [2 x double] containing the result.
459static __inline__ __m128d __DEFAULT_FN_ATTRS128
460_mm_fmaddsub_pd(__m128d __A, __m128d __B, __m128d __C)
461{
462  return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
463}
464
465/// Computes a multiply with alternating add/subtract of 128-bit vectors of
466///    [4 x float].
467///
468/// \code{.operation}
469/// result[31:0]  = (__A[31:0] * __B[31:0]) + __C[31:0]
470/// result[63:32] = (__A[63:32] * __B[63:32]) - __C[63:32]
471/// result[95:64] = (__A[95:64] * __B[95:64]) + __C[95:64]
472/// result[127:96 = (__A[127:96] * __B[127:96]) - __C[127:96]
473/// \endcode
474///
475/// \headerfile <immintrin.h>
476///
477/// This intrinsic corresponds to the \c VFMSUBADD213PS instruction.
478///
479/// \param __A
480///    A 128-bit vector of [4 x float] containing the multiplicand.
481/// \param __B
482///    A 128-bit vector of [4 x float] containing the multiplier.
483/// \param __C
484///    A 128-bit vector of [4 x float] containing the addend/subtrahend.
485/// \returns A 128-bit vector of [4 x float] containing the result.
486static __inline__ __m128 __DEFAULT_FN_ATTRS128
487_mm_fmsubadd_ps(__m128 __A, __m128 __B, __m128 __C)
488{
489  return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
490}
491
492/// Computes a multiply with alternating add/subtract of 128-bit vectors of
493///    [2 x double].
494///
495/// \code{.operation}
496/// result[63:0]  = (__A[63:0] * __B[63:0]) + __C[63:0]
497/// result[127:64] = (__A[127:64] * __B[127:64]) - __C[127:64]
498/// \endcode
499///
500/// \headerfile <immintrin.h>
501///
502/// This intrinsic corresponds to the \c VFMADDSUB213PD instruction.
503///
504/// \param __A
505///    A 128-bit vector of [2 x double] containing the multiplicand.
506/// \param __B
507///    A 128-bit vector of [2 x double] containing the multiplier.
508/// \param __C
509///    A 128-bit vector of [2 x double] containing the addend/subtrahend.
510/// \returns A 128-bit vector of [2 x double] containing the result.
511static __inline__ __m128d __DEFAULT_FN_ATTRS128
512_mm_fmsubadd_pd(__m128d __A, __m128d __B, __m128d __C)
513{
514  return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, -(__v2df)__C);
515}
516
517/// Computes a multiply-add of 256-bit vectors of [8 x float].
518///    For each element, computes <c> (__A * __B) + __C </c>.
519///
520/// \headerfile <immintrin.h>
521///
522/// This intrinsic corresponds to the \c VFMADD213PS instruction.
523///
524/// \param __A
525///    A 256-bit vector of [8 x float] containing the multiplicand.
526/// \param __B
527///    A 256-bit vector of [8 x float] containing the multiplier.
528/// \param __C
529///    A 256-bit vector of [8 x float] containing the addend.
530/// \returns A 256-bit vector of [8 x float] containing the result.
531static __inline__ __m256 __DEFAULT_FN_ATTRS256
532_mm256_fmadd_ps(__m256 __A, __m256 __B, __m256 __C)
533{
534  return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
535}
536
537/// Computes a multiply-add of 256-bit vectors of [4 x double].
538///    For each element, computes <c> (__A * __B) + __C </c>.
539///
540/// \headerfile <immintrin.h>
541///
542/// This intrinsic corresponds to the \c VFMADD213PD instruction.
543///
544/// \param __A
545///    A 256-bit vector of [4 x double] containing the multiplicand.
546/// \param __B
547///    A 256-bit vector of [4 x double] containing the multiplier.
548/// \param __C
549///    A 256-bit vector of [4 x double] containing the addend.
550/// \returns A 256-bit vector of [4 x double] containing the result.
551static __inline__ __m256d __DEFAULT_FN_ATTRS256
552_mm256_fmadd_pd(__m256d __A, __m256d __B, __m256d __C)
553{
554  return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
555}
556
557/// Computes a multiply-subtract of 256-bit vectors of [8 x float].
558///    For each element, computes <c> (__A * __B) - __C </c>.
559///
560/// \headerfile <immintrin.h>
561///
562/// This intrinsic corresponds to the \c VFMSUB213PS instruction.
563///
564/// \param __A
565///    A 256-bit vector of [8 x float] containing the multiplicand.
566/// \param __B
567///    A 256-bit vector of [8 x float] containing the multiplier.
568/// \param __C
569///    A 256-bit vector of [8 x float] containing the subtrahend.
570/// \returns A 256-bit vector of [8 x float] containing the result.
571static __inline__ __m256 __DEFAULT_FN_ATTRS256
572_mm256_fmsub_ps(__m256 __A, __m256 __B, __m256 __C)
573{
574  return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
575}
576
577/// Computes a multiply-subtract of 256-bit vectors of [4 x double].
578///    For each element, computes <c> (__A * __B) - __C </c>.
579///
580/// \headerfile <immintrin.h>
581///
582/// This intrinsic corresponds to the \c VFMSUB213PD instruction.
583///
584/// \param __A
585///    A 256-bit vector of [4 x double] containing the multiplicand.
586/// \param __B
587///    A 256-bit vector of [4 x double] containing the multiplier.
588/// \param __C
589///    A 256-bit vector of [4 x double] containing the subtrahend.
590/// \returns A 256-bit vector of [4 x double] containing the result.
591static __inline__ __m256d __DEFAULT_FN_ATTRS256
592_mm256_fmsub_pd(__m256d __A, __m256d __B, __m256d __C)
593{
594  return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C);
595}
596
597/// Computes a negated multiply-add of 256-bit vectors of [8 x float].
598///    For each element, computes <c> -(__A * __B) + __C </c>.
599///
600/// \headerfile <immintrin.h>
601///
602/// This intrinsic corresponds to the \c VFNMADD213PS instruction.
603///
604/// \param __A
605///    A 256-bit vector of [8 x float] containing the multiplicand.
606/// \param __B
607///    A 256-bit vector of [8 x float] containing the multiplier.
608/// \param __C
609///    A 256-bit vector of [8 x float] containing the addend.
610/// \returns A 256-bit vector of [8 x float] containing the result.
611static __inline__ __m256 __DEFAULT_FN_ATTRS256
612_mm256_fnmadd_ps(__m256 __A, __m256 __B, __m256 __C)
613{
614  return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
615}
616
617/// Computes a negated multiply-add of 256-bit vectors of [4 x double].
618///    For each element, computes <c> -(__A * __B) + __C </c>.
619///
620/// \headerfile <immintrin.h>
621///
622/// This intrinsic corresponds to the \c VFNMADD213PD instruction.
623///
624/// \param __A
625///    A 256-bit vector of [4 x double] containing the multiplicand.
626/// \param __B
627///    A 256-bit vector of [4 x double] containing the multiplier.
628/// \param __C
629///    A 256-bit vector of [4 x double] containing the addend.
630/// \returns A 256-bit vector of [4 x double] containing the result.
631static __inline__ __m256d __DEFAULT_FN_ATTRS256
632_mm256_fnmadd_pd(__m256d __A, __m256d __B, __m256d __C)
633{
634  return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, (__v4df)__C);
635}
636
637/// Computes a negated multiply-subtract of 256-bit vectors of [8 x float].
638///    For each element, computes <c> -(__A * __B) - __C </c>.
639///
640/// \headerfile <immintrin.h>
641///
642/// This intrinsic corresponds to the \c VFNMSUB213PS instruction.
643///
644/// \param __A
645///    A 256-bit vector of [8 x float] containing the multiplicand.
646/// \param __B
647///    A 256-bit vector of [8 x float] containing the multiplier.
648/// \param __C
649///    A 256-bit vector of [8 x float] containing the subtrahend.
650/// \returns A 256-bit vector of [8 x float] containing the result.
651static __inline__ __m256 __DEFAULT_FN_ATTRS256
652_mm256_fnmsub_ps(__m256 __A, __m256 __B, __m256 __C)
653{
654  return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
655}
656
657/// Computes a negated multiply-subtract of 256-bit vectors of [4 x double].
658///    For each element, computes <c> -(__A * __B) - __C </c>.
659///
660/// \headerfile <immintrin.h>
661///
662/// This intrinsic corresponds to the \c VFNMSUB213PD instruction.
663///
664/// \param __A
665///    A 256-bit vector of [4 x double] containing the multiplicand.
666/// \param __B
667///    A 256-bit vector of [4 x double] containing the multiplier.
668/// \param __C
669///    A 256-bit vector of [4 x double] containing the subtrahend.
670/// \returns A 256-bit vector of [4 x double] containing the result.
671static __inline__ __m256d __DEFAULT_FN_ATTRS256
672_mm256_fnmsub_pd(__m256d __A, __m256d __B, __m256d __C)
673{
674  return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, -(__v4df)__C);
675}
676
677/// Computes a multiply with alternating add/subtract of 256-bit vectors of
678///    [8 x float].
679///
680/// \code{.operation}
681/// result[31:0] = (__A[31:0] * __B[31:0]) - __C[31:0]
682/// result[63:32] = (__A[63:32] * __B[63:32]) + __C[63:32]
683/// result[95:64] = (__A[95:64] * __B[95:64]) - __C[95:64]
684/// result[127:96] = (__A[127:96] * __B[127:96]) + __C[127:96]
685/// result[159:128] = (__A[159:128] * __B[159:128]) - __C[159:128]
686/// result[191:160] = (__A[191:160] * __B[191:160]) + __C[191:160]
687/// result[223:192] = (__A[223:192] * __B[223:192]) - __C[223:192]
688/// result[255:224] = (__A[255:224] * __B[255:224]) + __C[255:224]
689/// \endcode
690///
691/// \headerfile <immintrin.h>
692///
693/// This intrinsic corresponds to the \c VFMADDSUB213PS instruction.
694///
695/// \param __A
696///    A 256-bit vector of [8 x float] containing the multiplicand.
697/// \param __B
698///    A 256-bit vector of [8 x float] containing the multiplier.
699/// \param __C
700///    A 256-bit vector of [8 x float] containing the addend/subtrahend.
701/// \returns A 256-bit vector of [8 x float] containing the result.
702static __inline__ __m256 __DEFAULT_FN_ATTRS256
703_mm256_fmaddsub_ps(__m256 __A, __m256 __B, __m256 __C)
704{
705  return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
706}
707
708/// Computes a multiply with alternating add/subtract of 256-bit vectors of
709///    [4 x double].
710///
711/// \code{.operation}
712/// result[63:0] = (__A[63:0] * __B[63:0]) - __C[63:0]
713/// result[127:64] = (__A[127:64] * __B[127:64]) + __C[127:64]
714/// result[191:128] = (__A[191:128] * __B[191:128]) - __C[191:128]
715/// result[255:192] = (__A[255:192] * __B[255:192]) + __C[255:192]
716/// \endcode
717///
718/// \headerfile <immintrin.h>
719///
720/// This intrinsic corresponds to the \c VFMADDSUB213PD instruction.
721///
722/// \param __A
723///    A 256-bit vector of [4 x double] containing the multiplicand.
724/// \param __B
725///    A 256-bit vector of [4 x double] containing the multiplier.
726/// \param __C
727///    A 256-bit vector of [4 x double] containing the addend/subtrahend.
728/// \returns A 256-bit vector of [4 x double] containing the result.
729static __inline__ __m256d __DEFAULT_FN_ATTRS256
730_mm256_fmaddsub_pd(__m256d __A, __m256d __B, __m256d __C)
731{
732  return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
733}
734
735/// Computes a vector multiply with alternating add/subtract of 256-bit
736///    vectors of [8 x float].
737///
738/// \code{.operation}
739/// result[31:0] = (__A[31:0] * __B[31:0]) + __C[31:0]
740/// result[63:32] = (__A[63:32] * __B[63:32]) - __C[63:32]
741/// result[95:64] = (__A[95:64] * __B[95:64]) + __C[95:64]
742/// result[127:96] = (__A[127:96] * __B[127:96]) - __C[127:96]
743/// result[159:128] = (__A[159:128] * __B[159:128]) + __C[159:128]
744/// result[191:160] = (__A[191:160] * __B[191:160]) - __C[191:160]
745/// result[223:192] = (__A[223:192] * __B[223:192]) + __C[223:192]
746/// result[255:224] = (__A[255:224] * __B[255:224]) - __C[255:224]
747/// \endcode
748///
749/// \headerfile <immintrin.h>
750///
751/// This intrinsic corresponds to the \c VFMSUBADD213PS instruction.
752///
753/// \param __A
754///    A 256-bit vector of [8 x float] containing the multiplicand.
755/// \param __B
756///    A 256-bit vector of [8 x float] containing the multiplier.
757/// \param __C
758///    A 256-bit vector of [8 x float] containing the addend/subtrahend.
759/// \returns A 256-bit vector of [8 x float] containing the result.
760static __inline__ __m256 __DEFAULT_FN_ATTRS256
761_mm256_fmsubadd_ps(__m256 __A, __m256 __B, __m256 __C)
762{
763  return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
764}
765
766/// Computes a vector multiply with alternating add/subtract of 256-bit
767///    vectors of [4 x double].
768///
769/// \code{.operation}
770/// result[63:0] = (__A[63:0] * __B[63:0]) + __C[63:0]
771/// result[127:64] = (__A[127:64] * __B[127:64]) - __C[127:64]
772/// result[191:128] = (__A[191:128] * __B[191:128]) + __C[191:128]
773/// result[255:192] = (__A[255:192] * __B[255:192]) - __C[255:192]
774/// \endcode
775///
776/// \headerfile <immintrin.h>
777///
778/// This intrinsic corresponds to the \c VFMSUBADD213PD instruction.
779///
780/// \param __A
781///    A 256-bit vector of [4 x double] containing the multiplicand.
782/// \param __B
783///    A 256-bit vector of [4 x double] containing the multiplier.
784/// \param __C
785///    A 256-bit vector of [4 x double] containing the addend/subtrahend.
786/// \returns A 256-bit vector of [4 x double] containing the result.
787static __inline__ __m256d __DEFAULT_FN_ATTRS256
788_mm256_fmsubadd_pd(__m256d __A, __m256d __B, __m256d __C)
789{
790  return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C);
791}
792
/* Remove the attribute helper macros defined near the top of this file. */
#undef __DEFAULT_FN_ATTRS256
#undef __DEFAULT_FN_ATTRS128
795
796#endif /* __FMAINTRIN_H */