Fused Multiply-Add

__m128d __lsx_vfmadd_d (__m128d a, __m128d b, __m128d c)

Synopsis

__m128d __lsx_vfmadd_d (__m128d a, __m128d b, __m128d c)
#include <lsxintrin.h>
Instruction: vfmadd.d vr, vr, vr, vr
CPU Flags: LSX

Description

Compute packed double precision floating point FMA(Fused Multiply-Add): multiply elements in a and b, accumulate to elements in c and store the result in dst.

Operation

for (int i = 0; i < 2; i++) {
  dst.fp64[i] = a.fp64[i] * b.fp64[i] + c.fp64[i];
}

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 5 2
3C5000 5 2

__m128 __lsx_vfmadd_s (__m128 a, __m128 b, __m128 c)

Synopsis

__m128 __lsx_vfmadd_s (__m128 a, __m128 b, __m128 c)
#include <lsxintrin.h>
Instruction: vfmadd.s vr, vr, vr, vr
CPU Flags: LSX

Description

Compute packed single precision floating point FMA(Fused Multiply-Add): multiply elements in a and b, accumulate to elements in c and store the result in dst.

Operation

for (int i = 0; i < 4; i++) {
  dst.fp32[i] = a.fp32[i] * b.fp32[i] + c.fp32[i];
}

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 5 2
3C5000 5 2

__m128d __lsx_vfmsub_d (__m128d a, __m128d b, __m128d c)

Synopsis

__m128d __lsx_vfmsub_d (__m128d a, __m128d b, __m128d c)
#include <lsxintrin.h>
Instruction: vfmsub.d vr, vr, vr, vr
CPU Flags: LSX

Description

Compute packed double precision floating point FMA(Fused Multiply-Add): multiply elements in a and b, subtract elements in c and store the result in dst.

Operation

for (int i = 0; i < 2; i++) {
  dst.fp64[i] = a.fp64[i] * b.fp64[i] - c.fp64[i];
}

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 5 2
3C5000 5 2

__m128 __lsx_vfmsub_s (__m128 a, __m128 b, __m128 c)

Synopsis

__m128 __lsx_vfmsub_s (__m128 a, __m128 b, __m128 c)
#include <lsxintrin.h>
Instruction: vfmsub.s vr, vr, vr, vr
CPU Flags: LSX

Description

Compute packed single precision floating point FMA(Fused Multiply-Add): multiply elements in a and b, subtract elements in c and store the result in dst.

Operation

for (int i = 0; i < 4; i++) {
  dst.fp32[i] = a.fp32[i] * b.fp32[i] - c.fp32[i];
}

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 5 2
3C5000 5 2

__m128d __lsx_vfnmadd_d (__m128d a, __m128d b, __m128d c)

Synopsis

__m128d __lsx_vfnmadd_d (__m128d a, __m128d b, __m128d c)
#include <lsxintrin.h>
Instruction: vfnmadd.d vr, vr, vr, vr
CPU Flags: LSX

Description

Compute packed double precision floating point FMA(Fused Multiply-Add): multiply elements in a and b, accumulate to elements in c and store the negated result in dst.

Operation

for (int i = 0; i < 2; i++) {
  dst.fp64[i] = (a.fp64[i] * b.fp64[i] + c.fp64[i]);
}

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 5 2
3C5000 5 2

__m128 __lsx_vfnmadd_s (__m128 a, __m128 b, __m128 c)

Synopsis

__m128 __lsx_vfnmadd_s (__m128 a, __m128 b, __m128 c)
#include <lsxintrin.h>
Instruction: vfnmadd.s vr, vr, vr, vr
CPU Flags: LSX

Description

Compute packed single precision floating point FMA(Fused Multiply-Add): multiply elements in a and b, accumulate to elements in c and store the negated result in dst.

Operation

for (int i = 0; i < 4; i++) {
  dst.fp32[i] = -(a.fp32[i] * b.fp32[i] + c.fp32[i]);
}

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 5 2
3C5000 5 2

__m128d __lsx_vfnmsub_d (__m128d a, __m128d b, __m128d c)

Synopsis

__m128d __lsx_vfnmsub_d (__m128d a, __m128d b, __m128d c)
#include <lsxintrin.h>
Instruction: vfnmsub.d vr, vr, vr, vr
CPU Flags: LSX

Description

Compute packed double precision floating point FMA(Fused Multiply-Add): multiply elements in a and b, subtract elements in c and store the negated result in dst.

Operation

for (int i = 0; i < 2; i++) {
  dst.fp64[i] = -(a.fp64[i] * b.fp64[i] - c.fp64[i]);
}

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 5 2
3C5000 5 2

__m128 __lsx_vfnmsub_s (__m128 a, __m128 b, __m128 c)

Synopsis

__m128 __lsx_vfnmsub_s (__m128 a, __m128 b, __m128 c)
#include <lsxintrin.h>
Instruction: vfnmsub.s vr, vr, vr, vr
CPU Flags: LSX

Description

Compute packed single precision floating point FMA(Fused Multiply-Add): multiply elements in a and b, subtract elements in c and store the negated result in dst.

Operation

for (int i = 0; i < 4; i++) {
  dst.fp32[i] = -(a.fp32[i] * b.fp32[i] - c.fp32[i]);
}

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 5 2
3C5000 5 2