Permutation
__m256i __lasx_xvpermi_w (__m256i a, __m256i b, imm0_255 imm)
Synopsis
__m256i __lasx_xvpermi_w (__m256i a, __m256i b, imm0_255 imm)
#include <lasxintrin.h>
Instruction: xvpermi.w xr, xr, imm
CPU Flags: LASX
Description
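Permute words from a and b with indices recorded in imm and store into dst. Within each 128-bit lane, the lower two words of dst are selected from b and the upper two words from a.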
Operation
dst.word[0] = b.word[imm & 0x3];
dst.word[1] = b.word[(imm >> 2) & 0x3];
dst.word[2] = a.word[(imm >> 4) & 0x3];
dst.word[3] = a.word[(imm >> 6) & 0x3];
dst.word[4] = b.word[4 + (imm & 0x3)];
dst.word[5] = b.word[4 + ((imm >> 2) & 0x3)];
dst.word[6] = a.word[4 + ((imm >> 4) & 0x3)];
dst.word[7] = a.word[4 + ((imm >> 6) & 0x3)];
Tested on real machine.
Latency and Throughput
CPU | Latency | Throughput (IPC) |
---|---|---|
3A6000 | 1 | 4 |
3C5000 | 1 | 2 |
__m256i __lasx_xvpermi_d (__m256i a, imm0_255 imm)
Synopsis
__m256i __lasx_xvpermi_d (__m256i a, imm0_255 imm)
#include <lasxintrin.h>
Instruction: xvpermi.d xr, xr, imm
CPU Flags: LASX
Description
Permute double words from a with indices recorded in imm and store into dst.
Operation
dst.dword[0] = a.dword[imm & 0x3];
dst.dword[1] = a.dword[(imm >> 2) & 0x3];
dst.dword[2] = a.dword[(imm >> 4) & 0x3];
dst.dword[3] = a.dword[(imm >> 6) & 0x3];
Tested on real machine.
Latency and Throughput
CPU | Latency | Throughput (IPC) |
---|---|---|
3A6000 | 3 | 4 |
3C5000 | 3 | 2 |
__m256i __lasx_xvpermi_q (__m256i a, __m256i b, imm0_255 imm)
Synopsis
__m256i __lasx_xvpermi_q (__m256i a, __m256i b, imm0_255 imm)
#include <lasxintrin.h>
Instruction: xvpermi.q xr, xr, imm
CPU Flags: LASX
Description
Permute quad words from a and b with indices recorded in imm and store into dst.
Operation
if ((imm & 0x4) && MACHINE_3C5000) {
// Caveat: observed in 3C5000
dst.qword[0] = 0;
} else {
dst.qword[0] = (imm & 2) ? a.qword[imm & 0x1] : b.qword[imm & 0x1];
}
if ((imm & 0x80) && MACHINE_3C5000) {
// Caveat: observed in 3C5000
dst.qword[1] = 0;
} else {
dst.qword[1] =
(imm & 0x20) ? a.qword[(imm >> 4) & 0x1] : b.qword[(imm >> 4) & 0x1];
}
Tested on real machine.
Latency and Throughput
CPU | Latency | Throughput (IPC) |
---|---|---|
3A6000 | 3 | 4 |
3C5000 | 3 | 2 |
__m256i __lasx_xvperm_w (__m256i a, __m256i b)
Synopsis
__m256i __lasx_xvperm_w (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvperm.w xr, xr, xr
CPU Flags: LASX
Description
Permute words from a with indices recorded in b and store into dst.
Operation
for (int i = 0; i < 8; i++) {
dst.word[i] = a.word[b.word[i] % 0x8];
}
Tested on real machine.
Latency and Throughput
CPU | Latency | Throughput (IPC) |
---|---|---|
3A6000 | 3 | 4 |
3C5000 | 3 | 2 |