Fused Multiply-Add

__m256d __lasx_xvfmadd_d (__m256d a, __m256d b, __m256d c)

Synopsis

__m256d __lasx_xvfmadd_d (__m256d a, __m256d b, __m256d c)
#include <lasxintrin.h>
Instruction: xvfmadd.d xr, xr, xr, xr
CPU Flags: LASX

Description

Compute packed double precision floating point FMA(Fused Multiply-Add): multiply elements in a and b, accumulate to elements in c and store the result in dst.

Operation

for (int i = 0; i < 4; i++) {
  dst.fp64[i] = a.fp64[i] * b.fp64[i] + c.fp64[i];
}

Latency and Throughput

CPU	Latency	Throughput (CPI)
3A6000	5	2
3C5000	5	2

__m256 __lasx_xvfmadd_s (__m256 a, __m256 b, __m256 c)

Synopsis

__m256 __lasx_xvfmadd_s (__m256 a, __m256 b, __m256 c)
#include <lasxintrin.h>
Instruction: xvfmadd.s xr, xr, xr, xr
CPU Flags: LASX

Description

Compute packed single precision floating point FMA(Fused Multiply-Add): multiply elements in a and b, accumulate to elements in c and store the result in dst.

Operation

for (int i = 0; i < 8; i++) {
  dst.fp32[i] = a.fp32[i] * b.fp32[i] + c.fp32[i];
}

Latency and Throughput

CPU	Latency	Throughput (CPI)
3A6000	5	2
3C5000	5	2

__m256d __lasx_xvfmsub_d (__m256d a, __m256d b, __m256d c)

Synopsis

__m256d __lasx_xvfmsub_d (__m256d a, __m256d b, __m256d c)
#include <lasxintrin.h>
Instruction: xvfmsub.d xr, xr, xr, xr
CPU Flags: LASX

Description

Compute packed double precision floating point FMA(Fused Multiply-Add): multiply elements in a and b, subtract elements in c and store the result in dst.

Operation

for (int i = 0; i < 4; i++) {
  dst.fp64[i] = a.fp64[i] * b.fp64[i] - c.fp64[i];
}

Latency and Throughput

CPU	Latency	Throughput (CPI)
3A6000	5	2
3C5000	5	2

__m256 __lasx_xvfmsub_s (__m256 a, __m256 b, __m256 c)

Synopsis

__m256 __lasx_xvfmsub_s (__m256 a, __m256 b, __m256 c)
#include <lasxintrin.h>
Instruction: xvfmsub.s xr, xr, xr, xr
CPU Flags: LASX

Description

Compute packed single precision floating point FMA(Fused Multiply-Add): multiply elements in a and b, subtract elements in c and store the result in dst.

Operation

for (int i = 0; i < 8; i++) {
  dst.fp32[i] = a.fp32[i] * b.fp32[i] - c.fp32[i];
}

Latency and Throughput

CPU	Latency	Throughput (CPI)
3A6000	5	2
3C5000	5	2

__m256d __lasx_xvfnmadd_d (__m256d a, __m256d b, __m256d c)

Synopsis

__m256d __lasx_xvfnmadd_d (__m256d a, __m256d b, __m256d c)
#include <lasxintrin.h>
Instruction: xvfnmadd.d xr, xr, xr, xr
CPU Flags: LASX

Description

Compute packed double precision floating point FMA(Fused Multiply-Add): multiply elements in a and b, accumulate to elements in c and store the negated result in dst.

Operation

for (int i = 0; i < 4; i++) {
  dst.fp64[i] = -(a.fp64[i] * b.fp64[i] + c.fp64[i]);
}

Latency and Throughput

CPU	Latency	Throughput (CPI)
3A6000	5	2
3C5000	5	2

__m256 __lasx_xvfnmadd_s (__m256 a, __m256 b, __m256 c)

Synopsis

__m256 __lasx_xvfnmadd_s (__m256 a, __m256 b, __m256 c)
#include <lasxintrin.h>
Instruction: xvfnmadd.s xr, xr, xr, xr
CPU Flags: LASX

Description

Compute packed single precision floating point FMA(Fused Multiply-Add): multiply elements in a and b, accumulate to elements in c and store the negated result in dst.

Operation

for (int i = 0; i < 8; i++) {
  dst.fp32[i] = -(a.fp32[i] * b.fp32[i] + c.fp32[i]);
}

Latency and Throughput

CPU	Latency	Throughput (CPI)
3A6000	5	2
3C5000	5	2

__m256d __lasx_xvfnmsub_d (__m256d a, __m256d b, __m256d c)

Synopsis

__m256d __lasx_xvfnmsub_d (__m256d a, __m256d b, __m256d c)
#include <lasxintrin.h>
Instruction: xvfnmsub.d xr, xr, xr, xr
CPU Flags: LASX

Description

Compute packed double precision floating point FMA(Fused Multiply-Add): multiply elements in a and b, subtract elements in c and store the negated result in dst.

Operation

for (int i = 0; i < 4; i++) {
  dst.fp64[i] = -(a.fp64[i] * b.fp64[i] - c.fp64[i]);
}

Latency and Throughput

CPU	Latency	Throughput (CPI)
3A6000	5	2
3C5000	5	2

__m256 __lasx_xvfnmsub_s (__m256 a, __m256 b, __m256 c)

Synopsis

__m256 __lasx_xvfnmsub_s (__m256 a, __m256 b, __m256 c)
#include <lasxintrin.h>
Instruction: xvfnmsub.s xr, xr, xr, xr
CPU Flags: LASX

Description

Compute packed single precision floating point FMA(Fused Multiply-Add): multiply elements in a and b, subtract elements in c and store the negated result in dst.

Operation

for (int i = 0; i < 8; i++) {
  dst.fp32[i] = -(a.fp32[i] * b.fp32[i] - c.fp32[i]);
}

Latency and Throughput

CPU	Latency	Throughput (CPI)
3A6000	5	2
3C5000	5	2