Floating Point Computation
__m128 __lsx_vfadd_s (__m128 a, __m128 b)
Synopsis
__m128 __lsx_vfadd_s (__m128 a, __m128 b)
#include <lsxintrin.h>
Instruction: vfadd.s vr, vr, vr
CPU Flags: LSX
Description
Add single precision floating point elements in `a` to elements in `b`.
Operation
for (int i = 0; i < 4; i++) {
dst.fp32[i] = a.fp32[i] + b.fp32[i];
}
Tested on real machine.
Latency and Throughput
CPU | Latency | Throughput (IPC) |
---|---|---|
3A6000 | 3 | 4 |
3C5000 | 5 | 2 |
__m128d __lsx_vfadd_d (__m128d a, __m128d b)
Synopsis
__m128d __lsx_vfadd_d (__m128d a, __m128d b)
#include <lsxintrin.h>
Instruction: vfadd.d vr, vr, vr
CPU Flags: LSX
Description
Add double precision floating point elements in `a` to elements in `b`.
Operation
for (int i = 0; i < 2; i++) {
dst.fp64[i] = a.fp64[i] + b.fp64[i];
}
Tested on real machine.
Latency and Throughput
CPU | Latency | Throughput (IPC) |
---|---|---|
3A6000 | 3 | 4 |
3C5000 | 5 | 2 |
__m128 __lsx_vfdiv_s (__m128 a, __m128 b)
Synopsis
__m128 __lsx_vfdiv_s (__m128 a, __m128 b)
#include <lsxintrin.h>
Instruction: vfdiv.s vr, vr, vr
CPU Flags: LSX
Description
Divide single precision floating point elements in `a` by elements in `b`.
Operation
for (int i = 0; i < 4; i++) {
dst.fp32[i] = a.fp32[i] / b.fp32[i];
}
Latency and Throughput
CPU | Latency | Throughput (IPC) |
---|---|---|
3A6000 | 11 | 0.18(1/5.5) |
3C5000 | 11, 19.5 | 0.13(1/7.5) |
__m128d __lsx_vfdiv_d (__m128d a, __m128d b)
Synopsis
__m128d __lsx_vfdiv_d (__m128d a, __m128d b)
#include <lsxintrin.h>
Instruction: vfdiv.d vr, vr, vr
CPU Flags: LSX
Description
Divide double precision floating point elements in `a` by elements in `b`.
Operation
for (int i = 0; i < 2; i++) {
dst.fp64[i] = a.fp64[i] / b.fp64[i];
}
Latency and Throughput
CPU | Latency | Throughput (IPC) |
---|---|---|
3A6000 | 8, 21.5 | 0.25(1/4) |
3C5000 | 8, 16.5 | 0.08(1/12.5) |
__m128 __lsx_vfmax_s (__m128 a, __m128 b)
Synopsis
__m128 __lsx_vfmax_s (__m128 a, __m128 b)
#include <lsxintrin.h>
Instruction: vfmax.s vr, vr, vr
CPU Flags: LSX
Description
Compute maximum of single precision floating point elements in `a` and `b`.
Operation
for (int i = 0; i < 4; i++) {
dst.fp32[i] = fmax(a.fp32[i], b.fp32[i]);
}
Latency and Throughput
CPU | Latency | Throughput (IPC) |
---|---|---|
3A6000 | 2 | 4 |
3C5000 | 2 | 2 |
__m128d __lsx_vfmax_d (__m128d a, __m128d b)
Synopsis
__m128d __lsx_vfmax_d (__m128d a, __m128d b)
#include <lsxintrin.h>
Instruction: vfmax.d vr, vr, vr
CPU Flags: LSX
Description
Compute maximum of double precision floating point elements in `a` and `b`.
Operation
for (int i = 0; i < 2; i++) {
dst.fp64[i] = fmax(a.fp64[i], b.fp64[i]);
}
Latency and Throughput
CPU | Latency | Throughput (IPC) |
---|---|---|
3A6000 | 2 | 4 |
3C5000 | 2 | 2 |
__m128 __lsx_vfmaxa_s (__m128 a, __m128 b)
Synopsis
__m128 __lsx_vfmaxa_s (__m128 a, __m128 b)
#include <lsxintrin.h>
Instruction: vfmaxa.s vr, vr, vr
CPU Flags: LSX
Description
Compute maximum of single precision floating point elements in `a` and `b` by magnitude.
Operation
for (int i = 0; i < 4; i++) {
dst.fp32[i] = (abs(a.fp32[i]) > abs(b.fp32[i])) ? a.fp32[i] : b.fp32[i];
}
Latency and Throughput
CPU | Latency | Throughput (IPC) |
---|---|---|
3A6000 | 2 | 4 |
3C5000 | 2 | 2 |
__m128d __lsx_vfmaxa_d (__m128d a, __m128d b)
Synopsis
__m128d __lsx_vfmaxa_d (__m128d a, __m128d b)
#include <lsxintrin.h>
Instruction: vfmaxa.d vr, vr, vr
CPU Flags: LSX
Description
Compute maximum of double precision floating point elements in `a` and `b` by magnitude.
Operation
for (int i = 0; i < 2; i++) {
dst.fp64[i] = (abs(a.fp64[i]) > abs(b.fp64[i])) ? a.fp64[i] : b.fp64[i];
}
Latency and Throughput
CPU | Latency | Throughput (IPC) |
---|---|---|
3A6000 | 2 | 4 |
3C5000 | 2 | 2 |
__m128 __lsx_vfmin_s (__m128 a, __m128 b)
Synopsis
__m128 __lsx_vfmin_s (__m128 a, __m128 b)
#include <lsxintrin.h>
Instruction: vfmin.s vr, vr, vr
CPU Flags: LSX
Description
Compute minimum of single precision floating point elements in `a` and `b`.
Operation
for (int i = 0; i < 4; i++) {
dst.fp32[i] = fmin(a.fp32[i], b.fp32[i]);
}
Latency and Throughput
CPU | Latency | Throughput (IPC) |
---|---|---|
3A6000 | 2 | 4 |
3C5000 | 2 | 2 |
__m128d __lsx_vfmin_d (__m128d a, __m128d b)
Synopsis
__m128d __lsx_vfmin_d (__m128d a, __m128d b)
#include <lsxintrin.h>
Instruction: vfmin.d vr, vr, vr
CPU Flags: LSX
Description
Compute minimum of double precision floating point elements in `a` and `b`.
Operation
for (int i = 0; i < 2; i++) {
dst.fp64[i] = fmin(a.fp64[i], b.fp64[i]);
}
Latency and Throughput
CPU | Latency | Throughput (IPC) |
---|---|---|
3A6000 | 2 | 4 |
3C5000 | 2 | 2 |
__m128 __lsx_vfmina_s (__m128 a, __m128 b)
Synopsis
__m128 __lsx_vfmina_s (__m128 a, __m128 b)
#include <lsxintrin.h>
Instruction: vfmina.s vr, vr, vr
CPU Flags: LSX
Description
Compute minimum of single precision floating point elements in `a` and `b` by magnitude.
Operation
for (int i = 0; i < 4; i++) {
dst.fp32[i] = (abs(a.fp32[i]) < abs(b.fp32[i])) ? a.fp32[i] : b.fp32[i];
}
Latency and Throughput
CPU | Latency | Throughput (IPC) |
---|---|---|
3A6000 | 2 | 4 |
3C5000 | 2 | 2 |
__m128d __lsx_vfmina_d (__m128d a, __m128d b)
Synopsis
__m128d __lsx_vfmina_d (__m128d a, __m128d b)
#include <lsxintrin.h>
Instruction: vfmina.d vr, vr, vr
CPU Flags: LSX
Description
Compute minimum of double precision floating point elements in `a` and `b` by magnitude.
Operation
for (int i = 0; i < 2; i++) {
dst.fp64[i] = (abs(a.fp64[i]) < abs(b.fp64[i])) ? a.fp64[i] : b.fp64[i];
}
Latency and Throughput
CPU | Latency | Throughput (IPC) |
---|---|---|
3A6000 | 2 | 4 |
3C5000 | 2 | 2 |
__m128 __lsx_vfmul_s (__m128 a, __m128 b)
Synopsis
__m128 __lsx_vfmul_s (__m128 a, __m128 b)
#include <lsxintrin.h>
Instruction: vfmul.s vr, vr, vr
CPU Flags: LSX
Description
Multiply single precision floating point elements in `a` by elements in `b`.
Operation
for (int i = 0; i < 4; i++) {
dst.fp32[i] = a.fp32[i] * b.fp32[i];
}
Latency and Throughput
CPU | Latency | Throughput (IPC) |
---|---|---|
3A6000 | 5 | 2 |
3C5000 | 5 | 2 |
__m128d __lsx_vfmul_d (__m128d a, __m128d b)
Synopsis
__m128d __lsx_vfmul_d (__m128d a, __m128d b)
#include <lsxintrin.h>
Instruction: vfmul.d vr, vr, vr
CPU Flags: LSX
Description
Multiply double precision floating point elements in `a` by elements in `b`.
Operation
for (int i = 0; i < 2; i++) {
dst.fp64[i] = a.fp64[i] * b.fp64[i];
}
Latency and Throughput
CPU | Latency | Throughput (IPC) |
---|---|---|
3A6000 | 5 | 2 |
3C5000 | 5 | 2 |
__m128 __lsx_vfsub_s (__m128 a, __m128 b)
Synopsis
__m128 __lsx_vfsub_s (__m128 a, __m128 b)
#include <lsxintrin.h>
Instruction: vfsub.s vr, vr, vr
CPU Flags: LSX
Description
Subtract elements in `b` from single precision floating point elements in `a`.
Operation
for (int i = 0; i < 4; i++) {
dst.fp32[i] = a.fp32[i] - b.fp32[i];
}
Tested on real machine.
Latency and Throughput
CPU | Latency | Throughput (IPC) |
---|---|---|
3A6000 | 3 | 4 |
3C5000 | 5 | 2 |
__m128d __lsx_vfsub_d (__m128d a, __m128d b)
Synopsis
__m128d __lsx_vfsub_d (__m128d a, __m128d b)
#include <lsxintrin.h>
Instruction: vfsub.d vr, vr, vr
CPU Flags: LSX
Description
Subtract elements in `b` from double precision floating point elements in `a`.
Operation
for (int i = 0; i < 2; i++) {
dst.fp64[i] = a.fp64[i] - b.fp64[i];
}
Tested on real machine.
Latency and Throughput
CPU | Latency | Throughput (IPC) |
---|---|---|
3A6000 | 3 | 4 |
3C5000 | 5 | 2 |
__m128 __lsx_vflogb_s (__m128 a)
Synopsis
__m128 __lsx_vflogb_s (__m128 a)
#include <lsxintrin.h>
Instruction: vflogb.s vr, vr
CPU Flags: LSX
Description
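Compute the base-2 logarithm of single precision floating point elements in `a`. Per the LoongArch reference manual, this follows the IEEE 754-2008 logB(x) operation, so each result is the integral exponent of the input rather than a fractional logarithm.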
Operation
for (int i = 0; i < 4; i++) {
dst.fp32[i] = log2(a.fp32[i]);
}
Latency and Throughput
CPU | Latency | Throughput (IPC) |
---|---|---|
3A6000 | 4 | 4 |
3C5000 | 4 | 2 |
__m128d __lsx_vflogb_d (__m128d a)
Synopsis
__m128d __lsx_vflogb_d (__m128d a)
#include <lsxintrin.h>
Instruction: vflogb.d vr, vr
CPU Flags: LSX
Description
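Compute the base-2 logarithm of double precision floating point elements in `a`. Per the LoongArch reference manual, this follows the IEEE 754-2008 logB(x) operation, so each result is the integral exponent of the input rather than a fractional logarithm.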
Operation
for (int i = 0; i < 2; i++) {
dst.fp64[i] = log2(a.fp64[i]);
}
Latency and Throughput
CPU | Latency | Throughput (IPC) |
---|---|---|
3A6000 | 4 | 4 |
3C5000 | 4 | 2 |
__m128 __lsx_vfsqrt_s (__m128 a)
Synopsis
__m128 __lsx_vfsqrt_s (__m128 a)
#include <lsxintrin.h>
Instruction: vfsqrt.s vr, vr
CPU Flags: LSX
Description
Compute square root of single precision floating point elements in `a`.
Operation
for (int i = 0; i < 4; i++) {
dst.fp32[i] = sqrt(a.fp32[i]);
}
Latency and Throughput
CPU | Latency | Throughput (IPC) |
---|---|---|
3A6000 | 11 | 0.08(1/12) |
3C5000 | 27 | 0.17(1/6) |
__m128d __lsx_vfsqrt_d (__m128d a)
Synopsis
__m128d __lsx_vfsqrt_d (__m128d a)
#include <lsxintrin.h>
Instruction: vfsqrt.d vr, vr
CPU Flags: LSX
Description
Compute square root of double precision floating point elements in `a`.
Operation
for (int i = 0; i < 2; i++) {
dst.fp64[i] = sqrt(a.fp64[i]);
}
Latency and Throughput
CPU | Latency | Throughput (IPC) |
---|---|---|
3A6000 | 36 | 0.06(1/17.5) |
3C5000 | 36 | 0.05(1/18.5) |
__m128 __lsx_vfrsqrt_s (__m128 a)
Synopsis
__m128 __lsx_vfrsqrt_s (__m128 a)
#include <lsxintrin.h>
Instruction: vfrsqrt.s vr, vr
CPU Flags: LSX
Description
Compute reciprocal of square root of single precision floating point elements in `a`.
Operation
for (int i = 0; i < 4; i++) {
dst.fp32[i] = 1.0 / sqrt(a.fp32[i]);
}
Latency and Throughput
CPU | Latency | Throughput (IPC) |
---|---|---|
3A6000 | 17 | 0.05(1/19) |
3C5000 | 21 | 0.11(1/9) |
__m128d __lsx_vfrsqrt_d (__m128d a)
Synopsis
__m128d __lsx_vfrsqrt_d (__m128d a)
#include <lsxintrin.h>
Instruction: vfrsqrt.d vr, vr
CPU Flags: LSX
Description
Compute reciprocal of square root of double precision floating point elements in `a`.
Operation
for (int i = 0; i < 2; i++) {
dst.fp64[i] = 1.0 / sqrt(a.fp64[i]);
}
Latency and Throughput
CPU | Latency | Throughput (IPC) |
---|---|---|
3A6000 | 15 | 0.04(1/26.5) |
3C5000 | 15 | 0.04(1/27.5) |
__m128 __lsx_vfrecip_s (__m128 a)
Synopsis
__m128 __lsx_vfrecip_s (__m128 a)
#include <lsxintrin.h>
Instruction: vfrecip.s vr, vr
CPU Flags: LSX
Description
Compute reciprocal of single precision floating point elements in `a`.
Operation
for (int i = 0; i < 4; i++) {
dst.fp32[i] = 1 / a.fp32[i];
}
Latency and Throughput
CPU | Latency | Throughput (IPC) |
---|---|---|
3A6000 | 11 | 0.18(1/5.5) |
3C5000 | 27 | 0.14(1/7) |
__m128d __lsx_vfrecip_d (__m128d a)
Synopsis
__m128d __lsx_vfrecip_d (__m128d a)
#include <lsxintrin.h>
Instruction: vfrecip.d vr, vr
CPU Flags: LSX
Description
Compute reciprocal of double precision floating point elements in `a`.
Operation
for (int i = 0; i < 2; i++) {
dst.fp64[i] = 1 / a.fp64[i];
}
Latency and Throughput
CPU | Latency | Throughput (IPC) |
---|---|---|
3A6000 | 8 | 0.25(1/4) |
3C5000 | 23 | 0.08(1/12) |
__m128 __lsx_vfrsqrte_s (__m128 a)
Synopsis
__m128 __lsx_vfrsqrte_s (__m128 a)
#include <lsxintrin.h>
Instruction: vfrsqrte.s vr, vr
CPU Flags: LSX
Description
Compute estimated reciprocal of square root of single precision floating point elements in `a`.
Operation
for (int i = 0; i < 4; i++) {
dst.fp32[i] = 1.0 / sqrt(a.fp32[i]); // estimated
}
__m128d __lsx_vfrsqrte_d (__m128d a)
Synopsis
__m128d __lsx_vfrsqrte_d (__m128d a)
#include <lsxintrin.h>
Instruction: vfrsqrte.d vr, vr
CPU Flags: LSX
Description
Compute estimated reciprocal of square root of double precision floating point elements in `a`.
Operation
for (int i = 0; i < 2; i++) {
dst.fp64[i] = 1.0 / sqrt(a.fp64[i]); // estimated
}
__m128 __lsx_vfrecipe_s (__m128 a)
Synopsis
__m128 __lsx_vfrecipe_s (__m128 a)
#include <lsxintrin.h>
Instruction: vfrecipe.s vr, vr
CPU Flags: LSX
Description
Compute estimated reciprocal of single precision floating point elements in `a`.
Operation
for (int i = 0; i < 4; i++) {
dst.fp32[i] = 1 / a.fp32[i]; // estimated
}
__m128d __lsx_vfrecipe_d (__m128d a)
Synopsis
__m128d __lsx_vfrecipe_d (__m128d a)
#include <lsxintrin.h>
Instruction: vfrecipe.d vr, vr
CPU Flags: LSX
Description
Compute estimated reciprocal of double precision floating point elements in `a`.
Operation
for (int i = 0; i < 2; i++) {
dst.fp64[i] = 1 / a.fp64[i]; // estimated
}