Misc
__m128i __lsx_vexth_h_b (__m128i a)
Synopsis
__m128i __lsx_vexth_h_b (__m128i a)
#include <lsxintrin.h>
Instruction: vexth.h.b vr, vr
CPU Flags: LSX
Description
Extend signed 8-bit elements in the higher half of a
to 16-bit.
Operation
for (int i = 0; i < 8; i++) {
dst.half[i] = (s16)(s8)a.byte[8 + i];
}
Tested on real machine.
Latency and Throughput
CPU | Latency | Throughput (IPC) |
---|---|---|
3A6000 | 1 | 4 |
3C5000 | 1 | 2 |
__m128i __lsx_vexth_hu_bu (__m128i a)
Synopsis
__m128i __lsx_vexth_hu_bu (__m128i a)
#include <lsxintrin.h>
Instruction: vexth.hu.bu vr, vr
CPU Flags: LSX
Description
Extend unsigned 8-bit elements in the higher half of a
to 16-bit.
Operation
for (int i = 0; i < 8; i++) {
dst.half[i] = (u16)(u8)a.byte[8 + i];
}
Tested on real machine.
Latency and Throughput
CPU | Latency | Throughput (IPC) |
---|---|---|
3A6000 | 1 | 4 |
3C5000 | 1 | 2 |
__m128i __lsx_vexth_w_h (__m128i a)
Synopsis
__m128i __lsx_vexth_w_h (__m128i a)
#include <lsxintrin.h>
Instruction: vexth.w.h vr, vr
CPU Flags: LSX
Description
Extend signed 16-bit elements in the higher half of a
to 32-bit.
Operation
for (int i = 0; i < 4; i++) {
dst.word[i] = (s32)(s16)a.half[4 + i];
}
Tested on real machine.
Latency and Throughput
CPU | Latency | Throughput (IPC) |
---|---|---|
3A6000 | 1 | 4 |
3C5000 | 1 | 2 |
__m128i __lsx_vexth_wu_hu (__m128i a)
Synopsis
__m128i __lsx_vexth_wu_hu (__m128i a)
#include <lsxintrin.h>
Instruction: vexth.wu.hu vr, vr
CPU Flags: LSX
Description
Extend unsigned 16-bit elements in the higher half of a
to 32-bit.
Operation
for (int i = 0; i < 4; i++) {
dst.word[i] = (u32)(u16)a.half[4 + i];
}
Tested on real machine.
Latency and Throughput
CPU | Latency | Throughput (IPC) |
---|---|---|
3A6000 | 1 | 4 |
3C5000 | 1 | 2 |
__m128i __lsx_vexth_d_w (__m128i a)
Synopsis
__m128i __lsx_vexth_d_w (__m128i a)
#include <lsxintrin.h>
Instruction: vexth.d.w vr, vr
CPU Flags: LSX
Description
Extend signed 32-bit elements in the higher half of a
to 64-bit.
Operation
for (int i = 0; i < 2; i++) {
dst.dword[i] = (s64)(s32)a.word[2 + i];
}
Tested on real machine.
Latency and Throughput
CPU | Latency | Throughput (IPC) |
---|---|---|
3A6000 | 1 | 4 |
3C5000 | 1 | 2 |
__m128i __lsx_vexth_du_wu (__m128i a)
Synopsis
__m128i __lsx_vexth_du_wu (__m128i a)
#include <lsxintrin.h>
Instruction: vexth.du.wu vr, vr
CPU Flags: LSX
Description
Extend unsigned 32-bit elements in the higher half of a
to 64-bit.
Operation
for (int i = 0; i < 2; i++) {
dst.dword[i] = (u64)(u32)a.word[2 + i];
}
Tested on real machine.
Latency and Throughput
CPU | Latency | Throughput (IPC) |
---|---|---|
3A6000 | 1 | 4 |
3C5000 | 1 | 2 |
__m128i __lsx_vexth_q_d (__m128i a)
Synopsis
__m128i __lsx_vexth_q_d (__m128i a)
#include <lsxintrin.h>
Instruction: vexth.q.d vr, vr
CPU Flags: LSX
Description
Extend signed 64-bit elements in the higher half of a
to 128-bit.
Operation
for (int i = 0; i < 1; i++) {
dst.qword[i] = (s128)(s64)a.dword[1 + i];
}
Tested on real machine.
Latency and Throughput
CPU | Latency | Throughput (IPC) |
---|---|---|
3A6000 | 1 | 4 |
3C5000 | 1 | 2 |
__m128i __lsx_vexth_qu_du (__m128i a)
Synopsis
__m128i __lsx_vexth_qu_du (__m128i a)
#include <lsxintrin.h>
Instruction: vexth.qu.du vr, vr
CPU Flags: LSX
Description
Extend unsigned 64-bit elements in the higher half of a
to 128-bit.
Operation
for (int i = 0; i < 1; i++) {
dst.qword[i] = (u128)(u64)a.dword[1 + i];
}
Tested on real machine.
Latency and Throughput
CPU | Latency | Throughput (IPC) |
---|---|---|
3A6000 | 1 | 4 |
3C5000 | 1 | 2 |
__m128i __lsx_vextl_q_d (__m128i a)
Synopsis
__m128i __lsx_vextl_q_d (__m128i a)
#include <lsxintrin.h>
Instruction: vextl.q.d vr, vr
CPU Flags: LSX
Description
Extend signed 64-bit elements in the lower half of a
to 128-bit.
Operation
for (int i = 0; i < 1; i++) {
dst.qword[i] = (s128)(s64)a.dword[i];
}
Tested on real machine.
Latency and Throughput
CPU | Latency | Throughput (IPC) |
---|---|---|
3A6000 | 1 | 4 |
3C5000 | 1 | 2 |
__m128i __lsx_vextl_qu_du (__m128i a)
Synopsis
__m128i __lsx_vextl_qu_du (__m128i a)
#include <lsxintrin.h>
Instruction: vextl.qu.du vr, vr
CPU Flags: LSX
Description
Extend unsigned 64-bit elements in the lower half of a
to 128-bit.
Operation
for (int i = 0; i < 1; i++) {
dst.qword[i] = (u128)(u64)a.dword[i];
}
Tested on real machine.
Latency and Throughput
CPU | Latency | Throughput (IPC) |
---|---|---|
3A6000 | 1 | 4 |
3C5000 | 1 | 2 |
__m128i __lsx_vextrins_b (__m128i a, __m128i b, imm0_255 imm)
Synopsis
__m128i __lsx_vextrins_b (__m128i a, __m128i b, imm0_255 imm)
#include <lsxintrin.h>
Instruction: vextrins.b vr, vr, imm
CPU Flags: LSX
Description
Extract one 8-bit element from b and insert it into a: imm[3:0] selects the source element in b and imm[7:4] selects the destination lane in a.
Operation
for (int i = 0; i < 16; i++) {
dst.byte[i] = (i == ((imm >> 4) & 15)) ? b.byte[imm & 15] : a.byte[i];
}
Tested on real machine.
Latency and Throughput
CPU | Latency | Throughput (IPC) |
---|---|---|
3A6000 | 1 | 4 |
3C5000 | 1 | 2 |
__m128i __lsx_vextrins_h (__m128i a, __m128i b, imm0_255 imm)
Synopsis
__m128i __lsx_vextrins_h (__m128i a, __m128i b, imm0_255 imm)
#include <lsxintrin.h>
Instruction: vextrins.h vr, vr, imm
CPU Flags: LSX
Description
Extract one 16-bit element from b and insert it into a: imm[2:0] selects the source element in b and imm[6:4] selects the destination lane in a.
Operation
for (int i = 0; i < 8; i++) {
dst.half[i] = (i == ((imm >> 4) & 7)) ? b.half[imm & 7] : a.half[i];
}
Tested on real machine.
Latency and Throughput
CPU | Latency | Throughput (IPC) |
---|---|---|
3A6000 | 1 | 4 |
3C5000 | 1 | 2 |
__m128i __lsx_vextrins_w (__m128i a, __m128i b, imm0_255 imm)
Synopsis
__m128i __lsx_vextrins_w (__m128i a, __m128i b, imm0_255 imm)
#include <lsxintrin.h>
Instruction: vextrins.w vr, vr, imm
CPU Flags: LSX
Description
Extract one 32-bit element from b and insert it into a: imm[1:0] selects the source element in b and imm[5:4] selects the destination lane in a.
Operation
for (int i = 0; i < 4; i++) {
dst.word[i] = (i == ((imm >> 4) & 3)) ? b.word[imm & 3] : a.word[i];
}
Tested on real machine.
Latency and Throughput
CPU | Latency | Throughput (IPC) |
---|---|---|
3A6000 | 1 | 4 |
3C5000 | 1 | 2 |
__m128i __lsx_vextrins_d (__m128i a, __m128i b, imm0_255 imm)
Synopsis
__m128i __lsx_vextrins_d (__m128i a, __m128i b, imm0_255 imm)
#include <lsxintrin.h>
Instruction: vextrins.d vr, vr, imm
CPU Flags: LSX
Description
Extract one 64-bit element from b and insert it into a: imm[0] selects the source element in b and imm[4] selects the destination lane in a.
Operation
for (int i = 0; i < 2; i++) {
dst.dword[i] = (i == ((imm >> 4) & 1)) ? b.dword[imm & 1] : a.dword[i];
}
Tested on real machine.
Latency and Throughput
CPU | Latency | Throughput (IPC) |
---|---|---|
3A6000 | 1 | 4 |
3C5000 | 1 | 2 |
__m128i __lsx_vilvh_b (__m128i a, __m128i b)
Synopsis
__m128i __lsx_vilvh_b (__m128i a, __m128i b)
#include <lsxintrin.h>
Instruction: vilvh.b vr, vr, vr
CPU Flags: LSX
Description
Interleave the 8-bit elements in the higher halves of a and b: even lanes of dst come from b, odd lanes from a.
Operation
for (int i = 0; i < 16; i++) {
dst.byte[i] = (i % 2 == 1) ? a.byte[i / 2 + 8] : b.byte[i / 2 + 8];
}
Tested on real machine.
Latency and Throughput
CPU | Latency | Throughput (IPC) |
---|---|---|
3A6000 | 1 | 4 |
3C5000 | 1 | 2 |
__m128i __lsx_vilvh_h (__m128i a, __m128i b)
Synopsis
__m128i __lsx_vilvh_h (__m128i a, __m128i b)
#include <lsxintrin.h>
Instruction: vilvh.h vr, vr, vr
CPU Flags: LSX
Description
Interleave the 16-bit elements in the higher halves of a and b: even lanes of dst come from b, odd lanes from a.
Operation
for (int i = 0; i < 8; i++) {
dst.half[i] = (i % 2 == 1) ? a.half[i / 2 + 4] : b.half[i / 2 + 4];
}
Tested on real machine.
Latency and Throughput
CPU | Latency | Throughput (IPC) |
---|---|---|
3A6000 | 1 | 4 |
3C5000 | 1 | 2 |
__m128i __lsx_vilvh_w (__m128i a, __m128i b)
Synopsis
__m128i __lsx_vilvh_w (__m128i a, __m128i b)
#include <lsxintrin.h>
Instruction: vilvh.w vr, vr, vr
CPU Flags: LSX
Description
Interleave the 32-bit elements in the higher halves of a and b: even lanes of dst come from b, odd lanes from a.
Operation
for (int i = 0; i < 4; i++) {
dst.word[i] = (i % 2 == 1) ? a.word[i / 2 + 2] : b.word[i / 2 + 2];
}
Tested on real machine.
Latency and Throughput
CPU | Latency | Throughput (IPC) |
---|---|---|
3A6000 | 1 | 4 |
3C5000 | 1 | 2 |
__m128i __lsx_vilvh_d (__m128i a, __m128i b)
Synopsis
__m128i __lsx_vilvh_d (__m128i a, __m128i b)
#include <lsxintrin.h>
Instruction: vilvh.d vr, vr, vr
CPU Flags: LSX
Description
Interleave the 64-bit elements in the higher halves of a and b: even lanes of dst come from b, odd lanes from a.
Operation
for (int i = 0; i < 2; i++) {
dst.dword[i] = (i % 2 == 1) ? a.dword[i / 2 + 1] : b.dword[i / 2 + 1];
}
Tested on real machine.
Latency and Throughput
CPU | Latency | Throughput (IPC) |
---|---|---|
3A6000 | 1 | 4 |
3C5000 | 1 | 2 |
__m128i __lsx_vilvl_b (__m128i a, __m128i b)
Synopsis
__m128i __lsx_vilvl_b (__m128i a, __m128i b)
#include <lsxintrin.h>
Instruction: vilvl.b vr, vr, vr
CPU Flags: LSX
Description
Interleave the 8-bit elements in the lower halves of a and b: even lanes of dst come from b, odd lanes from a.
Operation
for (int i = 0; i < 16; i++) {
dst.byte[i] = (i % 2 == 1) ? a.byte[i / 2] : b.byte[i / 2];
}
Tested on real machine.
Latency and Throughput
CPU | Latency | Throughput (IPC) |
---|---|---|
3A6000 | 1 | 4 |
3C5000 | 1 | 2 |
__m128i __lsx_vilvl_h (__m128i a, __m128i b)
Synopsis
__m128i __lsx_vilvl_h (__m128i a, __m128i b)
#include <lsxintrin.h>
Instruction: vilvl.h vr, vr, vr
CPU Flags: LSX
Description
Interleave the 16-bit elements in the lower halves of a and b: even lanes of dst come from b, odd lanes from a.
Operation
for (int i = 0; i < 8; i++) {
dst.half[i] = (i % 2 == 1) ? a.half[i / 2] : b.half[i / 2];
}
Tested on real machine.
Latency and Throughput
CPU | Latency | Throughput (IPC) |
---|---|---|
3A6000 | 1 | 4 |
3C5000 | 1 | 2 |
__m128i __lsx_vilvl_w (__m128i a, __m128i b)
Synopsis
__m128i __lsx_vilvl_w (__m128i a, __m128i b)
#include <lsxintrin.h>
Instruction: vilvl.w vr, vr, vr
CPU Flags: LSX
Description
Interleave the 32-bit elements in the lower halves of a and b: even lanes of dst come from b, odd lanes from a.
Operation
for (int i = 0; i < 4; i++) {
dst.word[i] = (i % 2 == 1) ? a.word[i / 2] : b.word[i / 2];
}
Tested on real machine.
Latency and Throughput
CPU | Latency | Throughput (IPC) |
---|---|---|
3A6000 | 1 | 4 |
3C5000 | 1 | 2 |
__m128i __lsx_vilvl_d (__m128i a, __m128i b)
Synopsis
__m128i __lsx_vilvl_d (__m128i a, __m128i b)
#include <lsxintrin.h>
Instruction: vilvl.d vr, vr, vr
CPU Flags: LSX
Description
Interleave the 64-bit elements in the lower halves of a and b: even lanes of dst come from b, odd lanes from a.
Operation
for (int i = 0; i < 2; i++) {
dst.dword[i] = (i % 2 == 1) ? a.dword[i / 2] : b.dword[i / 2];
}
Tested on real machine.
Latency and Throughput
CPU | Latency | Throughput (IPC) |
---|---|---|
3A6000 | 1 | 4 |
3C5000 | 1 | 2 |
__m128i __lsx_vinsgr2vr_b (__m128i a, int b, imm0_15 imm)
Synopsis
__m128i __lsx_vinsgr2vr_b (__m128i a, int b, imm0_15 imm)
#include <lsxintrin.h>
Instruction: vinsgr2vr.b vr, r, imm
CPU Flags: LSX
Description
Insert b as an 8-bit element into the lane of a indexed by imm; all other lanes are copied from a unchanged.
Operation
for (int i = 0; i < 16; i++) {
dst.byte[i] = (i == imm) ? b : a.byte[i];
}
Tested on real machine.
Latency and Throughput
CPU | Latency | Throughput (IPC) |
---|---|---|
3A6000 | 1 | 1 |
3C5000 | 1 | 1 |
__m128i __lsx_vinsgr2vr_h (__m128i a, int b, imm0_7 imm)
Synopsis
__m128i __lsx_vinsgr2vr_h (__m128i a, int b, imm0_7 imm)
#include <lsxintrin.h>
Instruction: vinsgr2vr.h vr, r, imm
CPU Flags: LSX
Description
Insert b as a 16-bit element into the lane of a indexed by imm; all other lanes are copied from a unchanged.
Operation
for (int i = 0; i < 8; i++) {
dst.half[i] = (i == imm) ? b : a.half[i];
}
Tested on real machine.
Latency and Throughput
CPU | Latency | Throughput (IPC) |
---|---|---|
3A6000 | 1 | 1 |
3C5000 | 1 | 1 |
__m128i __lsx_vinsgr2vr_w (__m128i a, int b, imm0_3 imm)
Synopsis
__m128i __lsx_vinsgr2vr_w (__m128i a, int b, imm0_3 imm)
#include <lsxintrin.h>
Instruction: vinsgr2vr.w vr, r, imm
CPU Flags: LSX
Description
Insert b as a 32-bit element into the lane of a indexed by imm; all other lanes are copied from a unchanged.
Operation
for (int i = 0; i < 4; i++) {
dst.word[i] = (i == imm) ? b : a.word[i];
}
Tested on real machine.
Latency and Throughput
CPU | Latency | Throughput (IPC) |
---|---|---|
3A6000 | 1 | 1 |
3C5000 | 1 | 1 |
__m128i __lsx_vinsgr2vr_d (__m128i a, long int b, imm0_1 imm)
Synopsis
__m128i __lsx_vinsgr2vr_d (__m128i a, long int b, imm0_1 imm)
#include <lsxintrin.h>
Instruction: vinsgr2vr.d vr, r, imm
CPU Flags: LSX
Description
Insert b as a 64-bit element into the lane of a indexed by imm; all other lanes are copied from a unchanged.
Operation
for (int i = 0; i < 2; i++) {
dst.dword[i] = (i == imm) ? b : a.dword[i];
}
Tested on real machine.
Latency and Throughput
CPU | Latency | Throughput (IPC) |
---|---|---|
3A6000 | 1 | 1 |
3C5000 | 1 | 1 |
__m128i __lsx_vfrstp_b (__m128i a, __m128i b, __m128i c)
Synopsis
__m128i __lsx_vfrstp_b (__m128i a, __m128i b, __m128i c)
#include <lsxintrin.h>
Instruction: vfrstp.b vr, vr, vr
CPU Flags: LSX
Description
Find the first negative 8-bit element in b and store its index into the lane of a selected by c (the lowest 8-bit element of c, modulo 16). If no element of b is negative, the stored index is 16.
Operation
for (int i = 0; i < 16; i++) {
dst.byte[i] = a.byte[i];
}
int i;
for (i = 0; i < 16; i++) {
if ((s8)b.byte[i] < 0) {
break;
}
}
dst.byte[c.byte[0] % 16] = i;
Tested on real machine.
Latency and Throughput
CPU | Latency | Throughput (IPC) |
---|---|---|
3A6000 | 2 | 2 |
3C5000 | 2 | 2 |
__m128i __lsx_vfrstp_h (__m128i a, __m128i b, __m128i c)
Synopsis
__m128i __lsx_vfrstp_h (__m128i a, __m128i b, __m128i c)
#include <lsxintrin.h>
Instruction: vfrstp.h vr, vr, vr
CPU Flags: LSX
Description
Find the first negative 16-bit element in b and store its index into the lane of a selected by c (the lowest 16-bit element of c, modulo 8). If no element of b is negative, the stored index is 8.
Operation
for (int i = 0; i < 8; i++) {
dst.half[i] = a.half[i];
}
int i;
for (i = 0; i < 8; i++) {
if ((s16)b.half[i] < 0) {
break;
}
}
dst.half[c.half[0] % 8] = i;
Tested on real machine.
Latency and Throughput
CPU | Latency | Throughput (IPC) |
---|---|---|
3A6000 | 2 | 2 |
3C5000 | 2 | 2 |
__m128i __lsx_vfrstpi_b (__m128i a, __m128i b, imm0_31 imm)
Synopsis
__m128i __lsx_vfrstpi_b (__m128i a, __m128i b, imm0_31 imm)
#include <lsxintrin.h>
Instruction: vfrstpi.b vr, vr, imm
CPU Flags: LSX
Description
Find the first negative 8-bit element in b and store its index into the lane of a selected by imm (modulo 16). If no element of b is negative, the stored index is 16.
Operation
for (int i = 0; i < 16; i++) {
dst.byte[i] = a.byte[i];
}
int i;
for (i = 0; i < 16; i++) {
if ((s8)b.byte[i] < 0) {
break;
}
}
dst.byte[imm % 16] = i;
Tested on real machine.
Latency and Throughput
CPU | Latency | Throughput (IPC) |
---|---|---|
3A6000 | 2 | 2 |
3C5000 | 2 | 2 |
__m128i __lsx_vfrstpi_h (__m128i a, __m128i b, imm0_31 imm)
Synopsis
__m128i __lsx_vfrstpi_h (__m128i a, __m128i b, imm0_31 imm)
#include <lsxintrin.h>
Instruction: vfrstpi.h vr, vr, imm
CPU Flags: LSX
Description
Find the first negative 16-bit element in b and store its index into the lane of a selected by imm (modulo 8). If no element of b is negative, the stored index is 8.
Operation
for (int i = 0; i < 8; i++) {
dst.half[i] = a.half[i];
}
int i;
for (i = 0; i < 8; i++) {
if ((s16)b.half[i] < 0) {
break;
}
}
dst.half[imm % 8] = i;
Tested on real machine.
Latency and Throughput
CPU | Latency | Throughput (IPC) |
---|---|---|
3A6000 | 2 | 2 |
3C5000 | 2 | 2 |
__m128i __lsx_vmskgez_b (__m128i a)
Synopsis
__m128i __lsx_vmskgez_b (__m128i a)
#include <lsxintrin.h>
Instruction: vmskgez.b vr, vr
CPU Flags: LSX
Description
For each 8-bit element in a, if the element is greater than or equal to zero, set the corresponding bit of dst (bit i for element i), otherwise clear it. All remaining bits of dst are zero.
Examples
__m128i __lsx_vmskgez_b(__m128i{0x1122334455667788, 0x99aabbccddeeff00})
= 0x00000000000001fe 0x0000000000000000
__m128i __lsx_vmskgez_b(__m128i{0x0000808000000000, 0x0081000081716151})
= 0x000000000000b7cf 0x0000000000000000
Operation
u64 m = 0x8080808080808080;
u64 c = m & a.dword[0];
c |= c << 7;
c |= c << 14;
c |= c << 28;
c >>= 56;
dst.dword[0] = c;
c = m & a.dword[1];
c |= c << 7;
c |= c << 14;
c |= c << 28;
c >>= 56;
dst.dword[0] |= c << 8;
dst.dword[0] = (u16)~dst.dword[0];
dst.dword[1] = 0;
Tested on real machine.
Latency and Throughput
CPU | Latency | Throughput (IPC) |
---|---|---|
3A6000 | 1 | 4 |
3C5000 | 1 | 2 |
__m128i __lsx_vmskltz_b (__m128i a)
Synopsis
__m128i __lsx_vmskltz_b (__m128i a)
#include <lsxintrin.h>
Instruction: vmskltz.b vr, vr
CPU Flags: LSX
Description
For each 8-bit element in a, if the element is less than zero, set the corresponding bit of dst (bit i for element i), otherwise clear it. All remaining bits of dst are zero.
Examples
__m128i __lsx_vmskltz_b(__m128i{0x1122334455667788, 0x99aabbccddeeff00})
= 0x000000000000fe01 0x0000000000000000
__m128i __lsx_vmskltz_b(__m128i{0x0000808000000000, 0x0081000081716151})
= 0x0000000000004830 0x0000000000000000
Operation
u64 m = 0x8080808080808080;
u64 c = m & a.dword[0];
c |= c << 7;
c |= c << 14;
c |= c << 28;
c >>= 56;
dst.dword[0] = c;
c = m & a.dword[1];
c |= c << 7;
c |= c << 14;
c |= c << 28;
c >>= 56;
dst.dword[0] |= c << 8;
dst.dword[1] = 0;
Tested on real machine.
Latency and Throughput
CPU | Latency | Throughput (IPC) |
---|---|---|
3A6000 | 1 | 4 |
3C5000 | 1 | 2 |
__m128i __lsx_vmskltz_h (__m128i a)
Synopsis
__m128i __lsx_vmskltz_h (__m128i a)
#include <lsxintrin.h>
Instruction: vmskltz.h vr, vr
CPU Flags: LSX
Description
For each 16-bit element in a, if the element is less than zero, set the corresponding bit of dst (bit i for element i), otherwise clear it. All remaining bits of dst are zero.
Examples
__m128i __lsx_vmskltz_h(__m128i{0x1122334455667788, 0x99aabbccddeeff00})
= 0x00000000000000f0 0x0000000000000000
__m128i __lsx_vmskltz_h(__m128i{0x0000808000000000, 0x0081000081716151})
= 0x0000000000000024 0x0000000000000000
Operation
u64 m = 0x8000800080008000;
u64 c = m & a.dword[0];
c |= c << 15;
c |= c << 30;
c >>= 60;
dst.dword[0] = c;
c = m & a.dword[1];
c |= c << 15;
c |= c << 30;
c >>= 60;
dst.dword[0] |= c << 4;
dst.dword[1] = 0;
Tested on real machine.
Latency and Throughput
CPU | Latency | Throughput (IPC) |
---|---|---|
3A6000 | 1 | 4 |
3C5000 | 1 | 2 |
__m128i __lsx_vmskltz_w (__m128i a)
Synopsis
__m128i __lsx_vmskltz_w (__m128i a)
#include <lsxintrin.h>
Instruction: vmskltz.w vr, vr
CPU Flags: LSX
Description
For each 32-bit element in a, if the element is less than zero, set the corresponding bit of dst (bit i for element i), otherwise clear it. All remaining bits of dst are zero.
Examples
__m128i __lsx_vmskltz_w(__m128i{0x1122334455667788, 0x99aabbccddeeff00})
= 0x000000000000000c 0x0000000000000000
__m128i __lsx_vmskltz_w(__m128i{0x0000808000000000, 0x0081000081716151})
= 0x0000000000000004 0x0000000000000000
Operation
u64 m = 0x8000000080000000;
u64 c = m & a.dword[0];
c |= c << 31;
c >>= 62;
dst.dword[0] = c;
c = m & a.dword[1];
c |= c << 31;
c >>= 62;
dst.dword[0] |= c << 2;
dst.dword[1] = 0;
Tested on real machine.
Latency and Throughput
CPU | Latency | Throughput (IPC) |
---|---|---|
3A6000 | 1 | 4 |
3C5000 | 1 | 2 |
__m128i __lsx_vmskltz_d (__m128i a)
Synopsis
__m128i __lsx_vmskltz_d (__m128i a)
#include <lsxintrin.h>
Instruction: vmskltz.d vr, vr
CPU Flags: LSX
Description
For each 64-bit element in a, if the element is less than zero, set the corresponding bit of dst (bit i for element i), otherwise clear it. All remaining bits of dst are zero.
Examples
__m128i __lsx_vmskltz_d(__m128i{0x1122334455667788, 0x99aabbccddeeff00})
= 0x0000000000000002 0x0000000000000000
__m128i __lsx_vmskltz_d(__m128i{0x0000808000000000, 0x0081000081716151})
= 0x0000000000000000 0x0000000000000000
Operation
u64 m = 0x8000000000000000;
u64 c = m & a.dword[0];
c >>= 63;
dst.dword[0] = c;
c = m & a.dword[1];
c >>= 63;
dst.dword[0] |= c << 1;
dst.dword[1] = 0;
Tested on real machine.
Latency and Throughput
CPU | Latency | Throughput (IPC) |
---|---|---|
3A6000 | 1 | 4 |
3C5000 | 1 | 2 |
__m128i __lsx_vmsknz_b (__m128i a)
Synopsis
__m128i __lsx_vmsknz_b (__m128i a)
#include <lsxintrin.h>
Instruction: vmsknz.b vr, vr
CPU Flags: LSX
Description
For each 8-bit element in a, if the element is non-zero, set the corresponding bit of dst (bit i for element i), otherwise clear it. All remaining bits of dst are zero.
Examples
__m128i __lsx_vmsknz_b(__m128i{0x1122334455667788, 0x99aabbccddeeff00})
= 0x000000000000feff 0x0000000000000000
__m128i __lsx_vmsknz_b(__m128i{0x0000111100000000, 0x0011000011111111})
= 0x0000000000004f30 0x0000000000000000
Operation
u64 m = 0x7F7F7F7F7F7F7F7F;
u64 c = ~(((a.dword[0] & m) + m) | a.dword[0] | m);
c |= c << 7;
c |= c << 14;
c |= c << 28;
c >>= 56;
dst.dword[0] = c;
c = ~(((a.dword[1] & m) + m) | a.dword[1] | m);
c |= c << 7;
c |= c << 14;
c |= c << 28;
c >>= 56;
dst.dword[0] |= c << 8;
dst.dword[0] = (u16)~dst.dword[0];
dst.dword[1] = 0;
Tested on real machine.
Latency and Throughput
CPU | Latency | Throughput (IPC) |
---|---|---|
3A6000 | 1 | 4 |
3C5000 | 1 | 2 |
__m128i __lsx_vpackev_b (__m128i a, __m128i b)
Synopsis
__m128i __lsx_vpackev_b (__m128i a, __m128i b)
#include <lsxintrin.h>
Instruction: vpackev.b vr, vr, vr
CPU Flags: LSX
Description
Collect the even-positioned 8-bit elements of a and b and pack them into dst: even lanes of dst come from b, odd lanes from a.
Operation
for (int i = 0; i < 16; i++) {
dst.byte[i] = (i % 2 == 1) ? a.byte[i - 1] : b.byte[i];
}
Tested on real machine.
Latency and Throughput
CPU | Latency | Throughput (IPC) |
---|---|---|
3A6000 | 1 | 4 |
3C5000 | 1 | 2 |
__m128i __lsx_vpackev_h (__m128i a, __m128i b)
Synopsis
__m128i __lsx_vpackev_h (__m128i a, __m128i b)
#include <lsxintrin.h>
Instruction: vpackev.h vr, vr, vr
CPU Flags: LSX
Description
Collect the even-positioned 16-bit elements of a and b and pack them into dst: even lanes of dst come from b, odd lanes from a.
Operation
for (int i = 0; i < 8; i++) {
dst.half[i] = (i % 2 == 1) ? a.half[i - 1] : b.half[i];
}
Tested on real machine.
Latency and Throughput
CPU | Latency | Throughput (IPC) |
---|---|---|
3A6000 | 1 | 4 |
3C5000 | 1 | 2 |
__m128i __lsx_vpackev_w (__m128i a, __m128i b)
Synopsis
__m128i __lsx_vpackev_w (__m128i a, __m128i b)
#include <lsxintrin.h>
Instruction: vpackev.w vr, vr, vr
CPU Flags: LSX
Description
Collect the even-positioned 32-bit elements of a and b and pack them into dst: even lanes of dst come from b, odd lanes from a.
Operation
for (int i = 0; i < 4; i++) {
dst.word[i] = (i % 2 == 1) ? a.word[i - 1] : b.word[i];
}
Tested on real machine.
Latency and Throughput
CPU | Latency | Throughput (IPC) |
---|---|---|
3A6000 | 1 | 4 |
3C5000 | 1 | 2 |
__m128i __lsx_vpackev_d (__m128i a, __m128i b)
Synopsis
__m128i __lsx_vpackev_d (__m128i a, __m128i b)
#include <lsxintrin.h>
Instruction: vpackev.d vr, vr, vr
CPU Flags: LSX
Description
Collect the even-positioned 64-bit elements of a and b and pack them into dst: even lanes of dst come from b, odd lanes from a.
Operation
for (int i = 0; i < 2; i++) {
dst.dword[i] = (i % 2 == 1) ? a.dword[i - 1] : b.dword[i];
}
Tested on real machine.
Latency and Throughput
CPU | Latency | Throughput (IPC) |
---|---|---|
3A6000 | 1 | 4 |
3C5000 | 1 | 2 |
__m128i __lsx_vpackod_b (__m128i a, __m128i b)
Synopsis
__m128i __lsx_vpackod_b (__m128i a, __m128i b)
#include <lsxintrin.h>
Instruction: vpackod.b vr, vr, vr
CPU Flags: LSX
Description
Collect the odd-positioned 8-bit elements of a and b and pack them into dst: even lanes of dst come from b, odd lanes from a.
Operation
for (int i = 0; i < 16; i++) {
dst.byte[i] = (i % 2 == 1) ? a.byte[i] : b.byte[i + 1];
}
Tested on real machine.
Latency and Throughput
CPU | Latency | Throughput (IPC) |
---|---|---|
3A6000 | 1 | 4 |
3C5000 | 1 | 2 |
__m128i __lsx_vpackod_h (__m128i a, __m128i b)
Synopsis
__m128i __lsx_vpackod_h (__m128i a, __m128i b)
#include <lsxintrin.h>
Instruction: vpackod.h vr, vr, vr
CPU Flags: LSX
Description
Collect the odd-positioned 16-bit elements of a and b and pack them into dst: even lanes of dst come from b, odd lanes from a.
Operation
for (int i = 0; i < 8; i++) {
dst.half[i] = (i % 2 == 1) ? a.half[i] : b.half[i + 1];
}
Tested on real machine.
Latency and Throughput
CPU | Latency | Throughput (IPC) |
---|---|---|
3A6000 | 1 | 4 |
3C5000 | 1 | 2 |
__m128i __lsx_vpackod_w (__m128i a, __m128i b)
Synopsis
__m128i __lsx_vpackod_w (__m128i a, __m128i b)
#include <lsxintrin.h>
Instruction: vpackod.w vr, vr, vr
CPU Flags: LSX
Description
Collect the odd-positioned 32-bit elements of a and b and pack them into dst: even lanes of dst come from b, odd lanes from a.
Operation
for (int i = 0; i < 4; i++) {
dst.word[i] = (i % 2 == 1) ? a.word[i] : b.word[i + 1];
}
Tested on real machine.
Latency and Throughput
CPU | Latency | Throughput (IPC) |
---|---|---|
3A6000 | 1 | 4 |
3C5000 | 1 | 2 |
__m128i __lsx_vpackod_d (__m128i a, __m128i b)
Synopsis
__m128i __lsx_vpackod_d (__m128i a, __m128i b)
#include <lsxintrin.h>
Instruction: vpackod.d vr, vr, vr
CPU Flags: LSX
Description
Collect the odd-positioned 64-bit elements of a and b and pack them into dst: even lanes of dst come from b, odd lanes from a.
Operation
for (int i = 0; i < 2; i++) {
dst.dword[i] = (i % 2 == 1) ? a.dword[i] : b.dword[i + 1];
}
Tested on real machine.
Latency and Throughput
CPU | Latency | Throughput (IPC) |
---|---|---|
3A6000 | 1 | 4 |
3C5000 | 1 | 2 |
__m128i __lsx_vpickev_b (__m128i a, __m128i b)
Synopsis
__m128i __lsx_vpickev_b (__m128i a, __m128i b)
#include <lsxintrin.h>
Instruction: vpickev.b vr, vr, vr
CPU Flags: LSX
Description
Pick the even-positioned 8-bit elements of b into the lower half of dst, then the even-positioned 8-bit elements of a into the higher half.
Operation
for (int i = 0; i < 16; i++) {
dst.byte[i] = (i < 8) ? b.byte[i * 2] : a.byte[(i - 8) * 2];
}
Tested on real machine.
Latency and Throughput
CPU | Latency | Throughput (IPC) |
---|---|---|
3A6000 | 1 | 4 |
3C5000 | 1 | 2 |
__m128i __lsx_vpickev_h (__m128i a, __m128i b)
Synopsis
__m128i __lsx_vpickev_h (__m128i a, __m128i b)
#include <lsxintrin.h>
Instruction: vpickev.h vr, vr, vr
CPU Flags: LSX
Description
Pick the even-positioned 16-bit elements of b into the lower half of dst, then the even-positioned 16-bit elements of a into the higher half.
Operation
for (int i = 0; i < 8; i++) {
dst.half[i] = (i < 4) ? b.half[i * 2] : a.half[(i - 4) * 2];
}
Tested on real machine.
Latency and Throughput
CPU | Latency | Throughput (IPC) |
---|---|---|
3A6000 | 1 | 4 |
3C5000 | 1 | 2 |
__m128i __lsx_vpickev_w (__m128i a, __m128i b)
Synopsis
__m128i __lsx_vpickev_w (__m128i a, __m128i b)
#include <lsxintrin.h>
Instruction: vpickev.w vr, vr, vr
CPU Flags: LSX
Description
Pick the even-positioned 32-bit elements of b into the lower half of dst, then the even-positioned 32-bit elements of a into the higher half.
Operation
for (int i = 0; i < 4; i++) {
dst.word[i] = (i < 2) ? b.word[i * 2] : a.word[(i - 2) * 2];
}
Tested on real machine.
Latency and Throughput
CPU | Latency | Throughput (IPC) |
---|---|---|
3A6000 | 1 | 4 |
3C5000 | 1 | 2 |
__m128i __lsx_vpickev_d (__m128i a, __m128i b)
Synopsis
__m128i __lsx_vpickev_d (__m128i a, __m128i b)
#include <lsxintrin.h>
Instruction: vpickev.d vr, vr, vr
CPU Flags: LSX
Description
Pick the even-positioned 64-bit element of b into the lower half of dst, then the even-positioned 64-bit element of a into the higher half.
Operation
for (int i = 0; i < 2; i++) {
dst.dword[i] = (i < 1) ? b.dword[i * 2] : a.dword[(i - 1) * 2];
}
Tested on real machine.
Latency and Throughput
CPU | Latency | Throughput (IPC) |
---|---|---|
3A6000 | 1 | 4 |
3C5000 | 1 | 2 |
int __lsx_vpickve2gr_b (__m128i a, imm0_15 idx)
Synopsis
int __lsx_vpickve2gr_b (__m128i a, imm0_15 idx)
#include <lsxintrin.h>
Instruction: vpickve2gr.b r, vr, imm
CPU Flags: LSX
Description
Pick the 8-bit lane of a specified by idx and store it into dst, sign-extended.
Operation
dst = (s8)a.byte[idx];
Tested on real machine.
Latency and Throughput
CPU | Latency | Throughput (IPC) |
---|---|---|
3A6000 | 1 | 1 |
3C5000 | 1 | 1 |
unsigned int __lsx_vpickve2gr_bu (__m128i a, imm0_15 idx)
Synopsis
unsigned int __lsx_vpickve2gr_bu (__m128i a, imm0_15 idx)
#include <lsxintrin.h>
Instruction: vpickve2gr.bu r, vr, imm
CPU Flags: LSX
Description
Pick the 8-bit lane of a specified by idx and store it into dst, zero-extended.
Operation
dst = (u8)a.byte[idx];
Tested on real machine.
Latency and Throughput
CPU | Latency | Throughput (IPC) |
---|---|---|
3A6000 | 1 | 1 |
3C5000 | 1 | 1 |
int __lsx_vpickve2gr_h (__m128i a, imm0_7 idx)
Synopsis
int __lsx_vpickve2gr_h (__m128i a, imm0_7 idx)
#include <lsxintrin.h>
Instruction: vpickve2gr.h r, vr, imm
CPU Flags: LSX
Description
Pick the 16-bit lane of a specified by idx and store it into dst, sign-extended.
Operation
dst = (s16)a.half[idx];
Tested on real machine.
Latency and Throughput
CPU | Latency | Throughput (IPC) |
---|---|---|
3A6000 | 1 | 1 |
3C5000 | 1 | 1 |
unsigned int __lsx_vpickve2gr_hu (__m128i a, imm0_7 idx)
Synopsis
unsigned int __lsx_vpickve2gr_hu (__m128i a, imm0_7 idx)
#include <lsxintrin.h>
Instruction: vpickve2gr.hu r, vr, imm
CPU Flags: LSX
Description
Pick the 16-bit lane of a specified by idx and store it into dst, zero-extended.
Operation
dst = (u16)a.half[idx];
Tested on real machine.
Latency and Throughput
CPU | Latency | Throughput (IPC) |
---|---|---|
3A6000 | 1 | 1 |
3C5000 | 1 | 1 |
int __lsx_vpickve2gr_w (__m128i a, imm0_3 idx)
Synopsis
int __lsx_vpickve2gr_w (__m128i a, imm0_3 idx)
#include <lsxintrin.h>
Instruction: vpickve2gr.w r, vr, imm
CPU Flags: LSX
Description
Pick the 32-bit lane of a specified by idx and store it into dst as a signed 32-bit integer.
Operation
dst = (s32)a.word[idx];
Tested on real machine.
Latency and Throughput
CPU | Latency | Throughput (IPC) |
---|---|---|
3A6000 | 1 | 1 |
3C5000 | 1 | 1 |
unsigned int __lsx_vpickve2gr_wu (__m128i a, imm0_3 idx)
Synopsis
unsigned int __lsx_vpickve2gr_wu (__m128i a, imm0_3 idx)
#include <lsxintrin.h>
Instruction: vpickve2gr.wu r, vr, imm
CPU Flags: LSX
Description
Pick the 32-bit lane of a specified by idx and store it into dst as an unsigned 32-bit integer.
Operation
dst = (u32)a.word[idx];
Tested on real machine.
Latency and Throughput
CPU | Latency | Throughput (IPC) |
---|---|---|
3A6000 | 1 | 1 |
3C5000 | 1 | 1 |
long int __lsx_vpickve2gr_d (__m128i a, imm0_1 idx)
Synopsis
long int __lsx_vpickve2gr_d (__m128i a, imm0_1 idx)
#include <lsxintrin.h>
Instruction: vpickve2gr.d r, vr, imm
CPU Flags: LSX
Description
Pick the 64-bit lane of a specified by idx and store it into dst as a signed 64-bit integer.
Operation
dst = (s64)a.dword[idx];
Tested on real machine.
Latency and Throughput
CPU | Latency | Throughput (IPC) |
---|---|---|
3A6000 | 1 | 1 |
3C5000 | 1 | 1 |
unsigned long int __lsx_vpickve2gr_du (__m128i a, imm0_1 idx)
Synopsis
unsigned long int __lsx_vpickve2gr_du (__m128i a, imm0_1 idx)
#include <lsxintrin.h>
Instruction: vpickve2gr.du r, vr, imm
CPU Flags: LSX
Description
Pick the 64-bit lane of a specified by idx and store it into dst as an unsigned 64-bit integer.
Operation
dst = (u64)a.dword[idx];
Tested on real machine.
Latency and Throughput
CPU | Latency | Throughput (IPC) |
---|---|---|
3A6000 | 1 | 1 |
3C5000 | 1 | 1 |
__m128i __lsx_vpickod_b (__m128i a, __m128i b)
Synopsis
__m128i __lsx_vpickod_b (__m128i a, __m128i b)
#include <lsxintrin.h>
Instruction: vpickod.b vr, vr, vr
CPU Flags: LSX
Description
Pick the odd-positioned 8-bit elements of b into the lower half of dst, then the odd-positioned 8-bit elements of a into the higher half.
Operation
for (int i = 0; i < 16; i++) {
dst.byte[i] = (i < 8) ? b.byte[i * 2 + 1] : a.byte[(i - 8) * 2 + 1];
}
Tested on real machine.
Latency and Throughput
CPU | Latency | Throughput (IPC) |
---|---|---|
3A6000 | 1 | 4 |
3C5000 | 1 | 2 |
__m128i __lsx_vpickod_h (__m128i a, __m128i b)
Synopsis
__m128i __lsx_vpickod_h (__m128i a, __m128i b)
#include <lsxintrin.h>
Instruction: vpickod.h vr, vr, vr
CPU Flags: LSX
Description
Pick the odd-positioned 16-bit elements of b into the lower half of dst, then the odd-positioned 16-bit elements of a into the higher half.
Operation
for (int i = 0; i < 8; i++) {
dst.half[i] = (i < 4) ? b.half[i * 2 + 1] : a.half[(i - 4) * 2 + 1];
}
Tested on real machine.
Latency and Throughput
CPU | Latency | Throughput (IPC) |
---|---|---|
3A6000 | 1 | 4 |
3C5000 | 1 | 2 |
__m128i __lsx_vpickod_w (__m128i a, __m128i b)
Synopsis
__m128i __lsx_vpickod_w (__m128i a, __m128i b)
#include <lsxintrin.h>
Instruction: vpickod.w vr, vr, vr
CPU Flags: LSX
Description
Pick the odd-positioned 32-bit elements of b into the lower half of dst, then the odd-positioned 32-bit elements of a into the higher half.
Operation
for (int i = 0; i < 4; i++) {
dst.word[i] = (i < 2) ? b.word[i * 2 + 1] : a.word[(i - 2) * 2 + 1];
}
Tested on real machine.
Latency and Throughput
CPU | Latency | Throughput (IPC) |
---|---|---|
3A6000 | 1 | 4 |
3C5000 | 1 | 2 |
__m128i __lsx_vpickod_d (__m128i a, __m128i b)
Synopsis
__m128i __lsx_vpickod_d (__m128i a, __m128i b)
#include <lsxintrin.h>
Instruction: vpickod.d vr, vr, vr
CPU Flags: LSX
Description
Pick the odd-positioned 64-bit element of b into the lower half of dst, then the odd-positioned 64-bit element of a into the higher half.
Operation
for (int i = 0; i < 2; i++) {
dst.dword[i] = (i < 1) ? b.dword[i * 2 + 1] : a.dword[(i - 1) * 2 + 1];
}
Tested on real machine.
Latency and Throughput
CPU | Latency | Throughput (IPC) |
---|---|---|
3A6000 | 1 | 4 |
3C5000 | 1 | 2 |
__m128i __lsx_vrepli_b (imm_n512_511 imm)
Synopsis
__m128i __lsx_vrepli_b (imm_n512_511 imm)
#include <lsxintrin.h>
Instruction: vldi vr, imm
CPU Flags: LSX
Description
Repeat imm
to fill whole vector.
Operation
for (int i = 0; i < 16; i++) {
dst.byte[i] = imm;
}
Tested on real machine.
__m128i __lsx_vrepli_h (imm_n512_511 imm)
Synopsis
__m128i __lsx_vrepli_h (imm_n512_511 imm)
#include <lsxintrin.h>
Instruction: vldi vr, imm
CPU Flags: LSX
Description
Repeat imm
to fill whole vector.
Operation
for (int i = 0; i < 8; i++) {
dst.half[i] = imm;
}
Tested on real machine.
__m128i __lsx_vrepli_w (imm_n512_511 imm)
Synopsis
__m128i __lsx_vrepli_w (imm_n512_511 imm)
#include <lsxintrin.h>
Instruction: vldi vr, imm
CPU Flags: LSX
Description
Repeat `imm` as 32-bit elements to fill the whole vector.
Operation
for (int i = 0; i < 4; i++) {
dst.word[i] = imm;
}
Tested on real machine.
__m128i __lsx_vrepli_d (imm_n512_511 imm)
Synopsis
__m128i __lsx_vrepli_d (imm_n512_511 imm)
#include <lsxintrin.h>
Instruction: vldi vr, imm
CPU Flags: LSX
Description
Repeat `imm` as 64-bit elements to fill the whole vector.
Operation
for (int i = 0; i < 2; i++) {
dst.dword[i] = imm;
}
Tested on real machine.
__m128i __lsx_vreplgr2vr_b (int val)
Synopsis
__m128i __lsx_vreplgr2vr_b (int val)
#include <lsxintrin.h>
Instruction: vreplgr2vr.b vr, r
CPU Flags: LSX
Description
Repeat `val` as 8-bit elements to fill the whole vector.
Operation
for (int i = 0; i < 16; i++) {
dst.byte[i] = val;
}
Tested on real machine.
Latency and Throughput
CPU | Latency | Throughput (IPC) |
---|---|---|
3A6000 | N/A | 1 |
3C5000 | N/A | 1 |
__m128i __lsx_vreplgr2vr_h (int val)
Synopsis
__m128i __lsx_vreplgr2vr_h (int val)
#include <lsxintrin.h>
Instruction: vreplgr2vr.h vr, r
CPU Flags: LSX
Description
Repeat `val` as 16-bit elements to fill the whole vector.
Operation
for (int i = 0; i < 8; i++) {
dst.half[i] = val;
}
Tested on real machine.
Latency and Throughput
CPU | Latency | Throughput (IPC) |
---|---|---|
3A6000 | N/A | 1 |
3C5000 | N/A | 1 |
__m128i __lsx_vreplgr2vr_w (int val)
Synopsis
__m128i __lsx_vreplgr2vr_w (int val)
#include <lsxintrin.h>
Instruction: vreplgr2vr.w vr, r
CPU Flags: LSX
Description
Repeat `val` as 32-bit elements to fill the whole vector.
Operation
for (int i = 0; i < 4; i++) {
dst.word[i] = val;
}
Tested on real machine.
Latency and Throughput
CPU | Latency | Throughput (IPC) |
---|---|---|
3A6000 | N/A | 1 |
3C5000 | N/A | 1 |
__m128i __lsx_vreplgr2vr_d (long int val)
Synopsis
__m128i __lsx_vreplgr2vr_d (long int val)
#include <lsxintrin.h>
Instruction: vreplgr2vr.d vr, r
CPU Flags: LSX
Description
Repeat `val` as 64-bit elements to fill the whole vector.
Operation
for (int i = 0; i < 2; i++) {
dst.dword[i] = val;
}
Tested on real machine.
Latency and Throughput
CPU | Latency | Throughput (IPC) |
---|---|---|
3A6000 | N/A | 1 |
3C5000 | N/A | 1 |
__m128i __lsx_vreplve_b (__m128i a, int idx)
Synopsis
__m128i __lsx_vreplve_b (__m128i a, int idx)
#include <lsxintrin.h>
Instruction: vreplve.b vr, vr, r
CPU Flags: LSX
Description
Repeat the 8-bit element in lane `idx` of `a` to fill the whole vector. The lane index is taken modulo 16 (`idx % 16`).
Operation
for (int i = 0; i < 16; i++) {
dst.byte[i] = a.byte[idx % 16];
}
Tested on real machine.
Latency and Throughput
CPU | Latency | Throughput (IPC) |
---|---|---|
3A6000 | 1 | 1 |
3C5000 | 1 | 1 |
__m128i __lsx_vreplve_h (__m128i a, int idx)
Synopsis
__m128i __lsx_vreplve_h (__m128i a, int idx)
#include <lsxintrin.h>
Instruction: vreplve.h vr, vr, r
CPU Flags: LSX
Description
Repeat the 16-bit element in lane `idx` of `a` to fill the whole vector. The lane index is taken modulo 8 (`idx % 8`).
Operation
for (int i = 0; i < 8; i++) {
dst.half[i] = a.half[idx % 8];
}
Tested on real machine.
Latency and Throughput
CPU | Latency | Throughput (IPC) |
---|---|---|
3A6000 | 1 | 1 |
3C5000 | 1 | 1 |
__m128i __lsx_vreplve_w (__m128i a, int idx)
Synopsis
__m128i __lsx_vreplve_w (__m128i a, int idx)
#include <lsxintrin.h>
Instruction: vreplve.w vr, vr, r
CPU Flags: LSX
Description
Repeat the 32-bit element in lane `idx` of `a` to fill the whole vector. The lane index is taken modulo 4 (`idx % 4`).
Operation
for (int i = 0; i < 4; i++) {
dst.word[i] = a.word[idx % 4];
}
Tested on real machine.
Latency and Throughput
CPU | Latency | Throughput (IPC) |
---|---|---|
3A6000 | 1 | 1 |
3C5000 | 1 | 1 |
__m128i __lsx_vreplve_d (__m128i a, int idx)
Synopsis
__m128i __lsx_vreplve_d (__m128i a, int idx)
#include <lsxintrin.h>
Instruction: vreplve.d vr, vr, r
CPU Flags: LSX
Description
Repeat the 64-bit element in lane `idx` of `a` to fill the whole vector. The lane index is taken modulo 2 (`idx % 2`).
Operation
for (int i = 0; i < 2; i++) {
dst.dword[i] = a.dword[idx % 2];
}
Tested on real machine.
Latency and Throughput
CPU | Latency | Throughput (IPC) |
---|---|---|
3A6000 | 1 | 1 |
3C5000 | 1 | 1 |
__m128i __lsx_vreplvei_b (__m128i a, imm0_15 idx)
Synopsis
__m128i __lsx_vreplvei_b (__m128i a, imm0_15 idx)
#include <lsxintrin.h>
Instruction: vreplvei.b vr, vr, imm
CPU Flags: LSX
Description
Repeat the 8-bit element in lane `idx` of `a` to fill the whole vector.
Operation
for (int i = 0; i < 16; i++) {
dst.byte[i] = a.byte[idx];
}
Tested on real machine.
Latency and Throughput
CPU | Latency | Throughput (IPC) |
---|---|---|
3A6000 | 1 | 4 |
3C5000 | 1 | 2 |
__m128i __lsx_vreplvei_h (__m128i a, imm0_7 idx)
Synopsis
__m128i __lsx_vreplvei_h (__m128i a, imm0_7 idx)
#include <lsxintrin.h>
Instruction: vreplvei.h vr, vr, imm
CPU Flags: LSX
Description
Repeat the 16-bit element in lane `idx` of `a` to fill the whole vector.
Operation
for (int i = 0; i < 8; i++) {
dst.half[i] = a.half[idx];
}
Tested on real machine.
Latency and Throughput
CPU | Latency | Throughput (IPC) |
---|---|---|
3A6000 | 1 | 4 |
3C5000 | 1 | 2 |
__m128i __lsx_vreplvei_w (__m128i a, imm0_3 idx)
Synopsis
__m128i __lsx_vreplvei_w (__m128i a, imm0_3 idx)
#include <lsxintrin.h>
Instruction: vreplvei.w vr, vr, imm
CPU Flags: LSX
Description
Repeat the 32-bit element in lane `idx` of `a` to fill the whole vector.
Operation
for (int i = 0; i < 4; i++) {
dst.word[i] = a.word[idx];
}
Tested on real machine.
Latency and Throughput
CPU | Latency | Throughput (IPC) |
---|---|---|
3A6000 | 1 | 4 |
3C5000 | 1 | 2 |
__m128i __lsx_vreplvei_d (__m128i a, imm0_1 idx)
Synopsis
__m128i __lsx_vreplvei_d (__m128i a, imm0_1 idx)
#include <lsxintrin.h>
Instruction: vreplvei.d vr, vr, imm
CPU Flags: LSX
Description
Repeat the 64-bit element in lane `idx` of `a` to fill the whole vector.
Operation
for (int i = 0; i < 2; i++) {
dst.dword[i] = a.dword[idx];
}
Tested on real machine.
Latency and Throughput
CPU | Latency | Throughput (IPC) |
---|---|---|
3A6000 | 1 | 4 |
3C5000 | 1 | 2 |
__m128i __lsx_vsat_b (__m128i a, imm0_7 imm)
Synopsis
__m128i __lsx_vsat_b (__m128i a, imm0_7 imm)
#include <lsxintrin.h>
Instruction: vsat.b vr, vr, imm
CPU Flags: LSX
Description
Clamp signed 8-bit elements in `a` to the range specified by `imm`, i.e. `[-2^imm, 2^imm - 1]`.
Operation
for (int i = 0; i < 16; i++) {
dst.byte[i] = clamp<s8>(a.byte[i], -(1 << imm), (1 << imm) - 1);
}
Tested on real machine.
Latency and Throughput
CPU | Latency | Throughput (IPC) |
---|---|---|
3A6000 | 2 | 2 |
3C5000 | 2 | 2 |
__m128i __lsx_vsat_bu (__m128i a, imm0_7 imm)
Synopsis
__m128i __lsx_vsat_bu (__m128i a, imm0_7 imm)
#include <lsxintrin.h>
Instruction: vsat.bu vr, vr, imm
CPU Flags: LSX
Description
Clamp unsigned 8-bit elements in `a` to the range specified by `imm`, i.e. `[0, 2^(imm + 1) - 1]`.
Operation
for (int i = 0; i < 16; i++) {
dst.byte[i] = clamp<u8>(a.byte[i], 0, (1 << (imm + 1)) - 1);
}
Tested on real machine.
Latency and Throughput
CPU | Latency | Throughput (IPC) |
---|---|---|
3A6000 | 2 | 2 |
3C5000 | 2 | 2 |
__m128i __lsx_vsat_h (__m128i a, imm0_15 imm)
Synopsis
__m128i __lsx_vsat_h (__m128i a, imm0_15 imm)
#include <lsxintrin.h>
Instruction: vsat.h vr, vr, imm
CPU Flags: LSX
Description
Clamp signed 16-bit elements in `a` to the range specified by `imm`, i.e. `[-2^imm, 2^imm - 1]`.
Operation
for (int i = 0; i < 8; i++) {
dst.half[i] = clamp<s16>(a.half[i], -(1 << imm), (1 << imm) - 1);
}
Tested on real machine.
Latency and Throughput
CPU | Latency | Throughput (IPC) |
---|---|---|
3A6000 | 2 | 2 |
3C5000 | 2 | 2 |
__m128i __lsx_vsat_hu (__m128i a, imm0_15 imm)
Synopsis
__m128i __lsx_vsat_hu (__m128i a, imm0_15 imm)
#include <lsxintrin.h>
Instruction: vsat.hu vr, vr, imm
CPU Flags: LSX
Description
Clamp unsigned 16-bit elements in `a` to the range specified by `imm`, i.e. `[0, 2^(imm + 1) - 1]`.
Operation
for (int i = 0; i < 8; i++) {
dst.half[i] = clamp<u16>(a.half[i], 0, (1 << (imm + 1)) - 1);
}
Tested on real machine.
Latency and Throughput
CPU | Latency | Throughput (IPC) |
---|---|---|
3A6000 | 2 | 2 |
3C5000 | 2 | 2 |
__m128i __lsx_vsat_w (__m128i a, imm0_31 imm)
Synopsis
__m128i __lsx_vsat_w (__m128i a, imm0_31 imm)
#include <lsxintrin.h>
Instruction: vsat.w vr, vr, imm
CPU Flags: LSX
Description
Clamp signed 32-bit elements in `a` to the range specified by `imm`, i.e. `[-2^imm, 2^imm - 1]`.
Operation
for (int i = 0; i < 4; i++) {
dst.word[i] = clamp<s32>(a.word[i], -(1 << imm), (1 << imm) - 1);
}
Tested on real machine.
Latency and Throughput
CPU | Latency | Throughput (IPC) |
---|---|---|
3A6000 | 2 | 2 |
3C5000 | 2 | 2 |
__m128i __lsx_vsat_wu (__m128i a, imm0_31 imm)
Synopsis
__m128i __lsx_vsat_wu (__m128i a, imm0_31 imm)
#include <lsxintrin.h>
Instruction: vsat.wu vr, vr, imm
CPU Flags: LSX
Description
Clamp unsigned 32-bit elements in `a` to the range specified by `imm`, i.e. `[0, 2^(imm + 1) - 1]`.
Operation
for (int i = 0; i < 4; i++) {
dst.word[i] = clamp<u32>(a.word[i], 0, (1 << (imm + 1)) - 1);
}
Tested on real machine.
Latency and Throughput
CPU | Latency | Throughput (IPC) |
---|---|---|
3A6000 | 2 | 2 |
3C5000 | 2 | 2 |
__m128i __lsx_vsat_d (__m128i a, imm0_63 imm)
Synopsis
__m128i __lsx_vsat_d (__m128i a, imm0_63 imm)
#include <lsxintrin.h>
Instruction: vsat.d vr, vr, imm
CPU Flags: LSX
Description
Clamp signed 64-bit elements in `a` to the range specified by `imm`, i.e. `[-2^imm, 2^imm - 1]`.
Operation
for (int i = 0; i < 2; i++) {
dst.dword[i] = clamp<s64>(a.dword[i], -(1 << imm), (1 << imm) - 1);
}
Tested on real machine.
Latency and Throughput
CPU | Latency | Throughput (IPC) |
---|---|---|
3A6000 | 2 | 2 |
3C5000 | 2 | 2 |
__m128i __lsx_vsat_du (__m128i a, imm0_63 imm)
Synopsis
__m128i __lsx_vsat_du (__m128i a, imm0_63 imm)
#include <lsxintrin.h>
Instruction: vsat.du vr, vr, imm
CPU Flags: LSX
Description
Clamp unsigned 64-bit elements in `a` to the range specified by `imm`, i.e. `[0, 2^(imm + 1) - 1]`.
Operation
for (int i = 0; i < 2; i++) {
dst.dword[i] = clamp<u64>(a.dword[i], 0, (1 << (imm + 1)) - 1);
}
Tested on real machine.
Latency and Throughput
CPU | Latency | Throughput (IPC) |
---|---|---|
3A6000 | 2 | 2 |
3C5000 | 2 | 2 |
__m128i __lsx_vsigncov_b (__m128i a, __m128i b)
Synopsis
__m128i __lsx_vsigncov_b (__m128i a, __m128i b)
#include <lsxintrin.h>
Instruction: vsigncov.b vr, vr, vr
CPU Flags: LSX
Description
If the 8-bit element in `a` equals zero, set the result to zero. If the signed 8-bit element in `a` is positive, copy the element in `b` to the result. Otherwise, copy the negated element in `b` to the result. If `a` and `b` are the same vector, this is equivalent to computing the absolute value.
Operation
for (int i = 0; i < 16; i++) {
dst.byte[i] =
(a.byte[i] == 0) ? 0 : ((s8)a.byte[i] > 0 ? b.byte[i] : -b.byte[i]);
}
Tested on real machine.
Latency and Throughput
CPU | Latency | Throughput (IPC) |
---|---|---|
3A6000 | 1 | 2 |
3C5000 | 1 | 2 |
__m128i __lsx_vsigncov_h (__m128i a, __m128i b)
Synopsis
__m128i __lsx_vsigncov_h (__m128i a, __m128i b)
#include <lsxintrin.h>
Instruction: vsigncov.h vr, vr, vr
CPU Flags: LSX
Description
If the 16-bit element in `a` equals zero, set the result to zero. If the signed 16-bit element in `a` is positive, copy the element in `b` to the result. Otherwise, copy the negated element in `b` to the result. If `a` and `b` are the same vector, this is equivalent to computing the absolute value.
Operation
for (int i = 0; i < 8; i++) {
dst.half[i] =
(a.half[i] == 0) ? 0 : ((s16)a.half[i] > 0 ? b.half[i] : -b.half[i]);
}
Tested on real machine.
Latency and Throughput
CPU | Latency | Throughput (IPC) |
---|---|---|
3A6000 | 1 | 2 |
3C5000 | 1 | 2 |
__m128i __lsx_vsigncov_w (__m128i a, __m128i b)
Synopsis
__m128i __lsx_vsigncov_w (__m128i a, __m128i b)
#include <lsxintrin.h>
Instruction: vsigncov.w vr, vr, vr
CPU Flags: LSX
Description
If the 32-bit element in `a` equals zero, set the result to zero. If the signed 32-bit element in `a` is positive, copy the element in `b` to the result. Otherwise, copy the negated element in `b` to the result. If `a` and `b` are the same vector, this is equivalent to computing the absolute value.
Operation
for (int i = 0; i < 4; i++) {
dst.word[i] =
(a.word[i] == 0) ? 0 : ((s32)a.word[i] > 0 ? b.word[i] : -b.word[i]);
}
Tested on real machine.
Latency and Throughput
CPU | Latency | Throughput (IPC) |
---|---|---|
3A6000 | 1 | 2 |
3C5000 | 1 | 2 |
__m128i __lsx_vsigncov_d (__m128i a, __m128i b)
Synopsis
__m128i __lsx_vsigncov_d (__m128i a, __m128i b)
#include <lsxintrin.h>
Instruction: vsigncov.d vr, vr, vr
CPU Flags: LSX
Description
If the 64-bit element in `a` equals zero, set the result to zero. If the signed 64-bit element in `a` is positive, copy the element in `b` to the result. Otherwise, copy the negated element in `b` to the result. If `a` and `b` are the same vector, this is equivalent to computing the absolute value.
Operation
for (int i = 0; i < 2; i++) {
dst.dword[i] =
(a.dword[i] == 0) ? 0 : ((s64)a.dword[i] > 0 ? b.dword[i] : -b.dword[i]);
}
Tested on real machine.
Latency and Throughput
CPU | Latency | Throughput (IPC) |
---|---|---|
3A6000 | 1 | 2 |
3C5000 | 1 | 2 |
__m128i __lsx_vldi (imm_n1024_1023 imm)
Synopsis
__m128i __lsx_vldi (imm_n1024_1023 imm)
#include <lsxintrin.h>
Instruction: vldi vr, imm
CPU Flags: LSX
Description
Initialize `dst` using predefined patterns:

- `imm[12:10]=0b000`: broadcast `imm[7:0]` as 8-bit elements to all lanes
- `imm[12:10]=0b001`: broadcast sign-extended `imm[9:0]` as 16-bit elements to all lanes
- `imm[12:10]=0b010`: broadcast sign-extended `imm[9:0]` as 32-bit elements to all lanes
- `imm[12:10]=0b011`: broadcast sign-extended `imm[9:0]` as 64-bit elements to all lanes
- `imm[12:8]=0b10000`: broadcast `imm[7:0]` as 32-bit elements to all lanes
- `imm[12:8]=0b10001`: broadcast `imm[7:0] << 8` as 32-bit elements to all lanes
- `imm[12:8]=0b10010`: broadcast `imm[7:0] << 16` as 32-bit elements to all lanes
- `imm[12:8]=0b10011`: broadcast `imm[7:0] << 24` as 32-bit elements to all lanes
- `imm[12:8]=0b10100`: broadcast `imm[7:0]` as 16-bit elements to all lanes
- `imm[12:8]=0b10101`: broadcast `imm[7:0] << 8` as 16-bit elements to all lanes
- `imm[12:8]=0b10110`: broadcast `(imm[7:0] << 8) | 0xFF` as 32-bit elements to all lanes
- `imm[12:8]=0b10111`: broadcast `(imm[7:0] << 16) | 0xFFFF` as 32-bit elements to all lanes
- `imm[12:8]=0b11000`: broadcast `imm[7:0]` as 8-bit elements to all lanes
- `imm[12:8]=0b11001`: repeat each bit of `imm[7:0]` eight times, and broadcast the result as 64-bit elements to all lanes
- `imm[12:8]=0b11010`: broadcast `(imm[7] << 31) | ((1-imm[6]) << 30) | ((imm[6] * 0x1F) << 25) | (imm[5:0] << 19)` as 32-bit elements to all lanes
- `imm[12:8]=0b11011`: broadcast `(imm[7] << 31) | ((1-imm[6]) << 30) | ((imm[6] * 0x1F) << 25) | (imm[5:0] << 19)` as 64-bit elements to all lanes
- `imm[12:8]=0b11100`: broadcast `(imm[7] << 63) | ((1-imm[6]) << 62) | ((imm[6] * 0xFF) << 54) | (imm[5:0] << 48)` as 64-bit elements to all lanes
Operation
u64 imm12_10 = (imm >> 10) & 0b111;
u64 imm12_8 = (imm >> 8) & 0b11111;
u64 imm9_0 = imm & 0x3FF;
s64 simm9_0 = ((s64)imm9_0 << 54) >> 54;
u64 imm7_0 = imm & 0xFF;
u64 imm7 = (imm >> 7) & 0x1;
u64 imm6 = (imm >> 6) & 0x1;
u64 imm5 = (imm >> 5) & 0x1;
u64 imm5_0 = imm & 0x3F;
u64 imm4 = (imm >> 4) & 0x1;
u64 imm3 = (imm >> 3) & 0x1;
u64 imm2 = (imm >> 2) & 0x1;
u64 imm1 = (imm >> 1) & 0x1;
u64 imm0 = imm & 0x1;
u64 broadcast_value;
u64 broadcast_width;
if (imm12_10 == 0b000) {
broadcast_value = imm7_0;
broadcast_width = 8;
} else if (imm12_10 == 0b001) {
broadcast_value = simm9_0;
broadcast_width = 16;
} else if (imm12_10 == 0b010) {
broadcast_value = simm9_0;
broadcast_width = 32;
} else if (imm12_10 == 0b011) {
broadcast_value = simm9_0;
broadcast_width = 64;
} else if (imm12_8 == 0b10000) {
broadcast_value = imm7_0;
broadcast_width = 32;
} else if (imm12_8 == 0b10001) {
broadcast_value = imm7_0 << 8;
broadcast_width = 32;
} else if (imm12_8 == 0b10010) {
broadcast_value = imm7_0 << 16;
broadcast_width = 32;
} else if (imm12_8 == 0b10011) {
broadcast_value = imm7_0 << 24;
broadcast_width = 32;
} else if (imm12_8 == 0b10100) {
broadcast_value = imm7_0;
broadcast_width = 16;
} else if (imm12_8 == 0b10101) {
broadcast_value = imm7_0 << 8;
broadcast_width = 16;
} else if (imm12_8 == 0b10110) {
broadcast_value = (imm7_0 << 8) | 0xFF;
broadcast_width = 32;
} else if (imm12_8 == 0b10111) {
broadcast_value = (imm7_0 << 16) | 0xFFFF;
broadcast_width = 32;
} else if (imm12_8 == 0b11000) {
broadcast_value = imm7_0;
broadcast_width = 8;
} else if (imm12_8 == 0b11001) {
broadcast_value = imm0 * 0xFF + imm1 * 0xFF00 + imm2 * 0xFF0000 +
imm3 * 0xFF000000 + imm4 * 0xFF00000000 +
imm5 * 0xFF0000000000 + imm6 * 0xFF000000000000 +
imm7 * 0xFF00000000000000;
broadcast_width = 64;
} else if (imm12_8 == 0b11010) {
broadcast_value = (imm7 << 31) | ((1 - imm6) << 30) | ((imm6 * 0x1F) << 25) |
(imm5_0 << 19);
broadcast_width = 32;
} else if (imm12_8 == 0b11011) {
broadcast_value = (imm7 << 31) | ((1 - imm6) << 30) | ((imm6 * 0x1F) << 25) |
(imm5_0 << 19);
broadcast_width = 64;
} else if (imm12_8 == 0b11100) {
broadcast_value = (imm7 << 63) | ((1 - imm6) << 62) | ((imm6 * 0xFF) << 54) |
(imm5_0 << 48);
broadcast_width = 64;
}
if (broadcast_width == 8) {
for (int i = 0; i < 16; i++) {
dst.byte[i] = broadcast_value;
}
} else if (broadcast_width == 16) {
for (int i = 0; i < 8; i++) {
dst.half[i] = broadcast_value;
}
} else if (broadcast_width == 32) {
for (int i = 0; i < 4; i++) {
dst.word[i] = broadcast_value;
}
} else if (broadcast_width == 64) {
for (int i = 0; i < 2; i++) {
dst.dword[i] = broadcast_value;
}
}
Tested on real machine.