Shuffling

__m128i __lsx_vshuf_b (__m128i a, __m128i b, __m128i c)

Synopsis

__m128i __lsx_vshuf_b (__m128i a, __m128i b, __m128i c)
#include <lsxintrin.h>
Instruction: vshuf.b vr, vr, vr, vr
CPU Flags: LSX

Description

Shuffle bytes from a and b with indices from c, save the result to dst.

Caveat: the indices are placed in c, while in other vshuf intrinsics, they are placed in a.

Examples

__m128i __lsx_vshuf_b(__m128i{0x1122334455667788, 0x99aabbccddeeff00}, __m128i{0xabcdef1314156678, 0x1234123443214321}, __m128i{0x0011021304050607, 0x0811120213031404})
= 0x7877155513efcdab 0x2177661555144413

Operation

for (int i = 0; i < 16; i++) {
  if (c.byte[i] >= 64 && MACHINE_3C5000) {
    // Caveat: observed in 3C5000
    dst.byte[i] = 0;
  } else if ((c.byte[i] % 32) < 16) {
    dst.byte[i] = b.byte[c.byte[i] % 16];
  } else {
    dst.byte[i] = a.byte[c.byte[i] % 16];
  }
}

Tested on real machine.

Latency and Throughput

CPU	Latency	Throughput (IPC)
3A6000	1	2
3C5000	1	2

__m128i __lsx_vshuf_h (__m128i a, __m128i b, __m128i c)

Synopsis

__m128i __lsx_vshuf_h (__m128i a, __m128i b, __m128i c)
#include <lsxintrin.h>
Instruction: vshuf.h vr, vr, vr
CPU Flags: LSX

Description

Shuffle 16-bit elements in b and c with indices from a, save the result to dst.

Examples

__m128i __lsx_vshuf_h(__m128i{0x0001000200030004, 0x0005000a000b000c}, __m128i{0x1122334455667788, 0x99aabbccddeeff00}, __m128i{0xabcdef1314156678, 0x1234123443214321})
= 0x1415ef13abcd4321 0x432133441122ff00

Operation

for (int i = 0; i < 8; i++) {
  if ((a.half[i] % 256) >= 64 && MACHINE_3C5000) {
    // Caveat: observed in 3C5000
    dst.half[i] = 0;
  } else if ((a.half[i] % 16) < 8) {
    dst.half[i] = c.half[a.half[i] % 8];
  } else {
    dst.half[i] = b.half[a.half[i] % 8];
  }
}

Tested on real machine.

Latency and Throughput

CPU	Latency	Throughput (IPC)
3A6000	1	2
3C5000	1	2

__m128i __lsx_vshuf_w (__m128i a, __m128i b, __m128i c)

Synopsis

__m128i __lsx_vshuf_w (__m128i a, __m128i b, __m128i c)
#include <lsxintrin.h>
Instruction: vshuf.w vr, vr, vr
CPU Flags: LSX

Description

Shuffle 32-bit elements in b and c with indices from a, save the result to dst.

Examples

__m128i __lsx_vshuf_w(__m128i{0x0000000200000004, 0x0000000700000005}, __m128i{0x1122334455667788, 0x99aabbccddeeff00}, __m128i{0xabcdef1314156678, 0x1234123443214321})
= 0x4321432155667788 0x99aabbcc11223344

Operation

for (int i = 0; i < 4; i++) {
  if ((a.word[i] % 256) >= 64 && MACHINE_3C5000) {
    // Caveat: observed in 3C5000
    dst.word[i] = 0;
  } else if ((a.word[i] % 8) < 4) {
    dst.word[i] = c.word[a.word[i] % 4];
  } else {
    dst.word[i] = b.word[a.word[i] % 4];
  }
}

Tested on real machine.

Latency and Throughput

CPU	Latency	Throughput (IPC)
3A6000	1	2
3C5000	1	2

__m128i __lsx_vshuf_d (__m128i a, __m128i b, __m128i c)

Synopsis

__m128i __lsx_vshuf_d (__m128i a, __m128i b, __m128i c)
#include <lsxintrin.h>
Instruction: vshuf.d vr, vr, vr
CPU Flags: LSX

Description

Shuffle 64-bit elements in b and c with indices from a, save the result to dst.

Examples

__m128i __lsx_vshuf_d(__m128i{0x0000000000000001, 0x0000000000000002}, __m128i{0x1122334455667788, 0x99aabbccddeeff00}, __m128i{0xabcdef1314156678, 0x1234123443214321})
= 0x1234123443214321 0x1122334455667788

Operation

for (int i = 0; i < 2; i++) {
  if ((a.dword[i] % 256) >= 64 && MACHINE_3C5000) {
    // Caveat: observed in 3C5000
    dst.dword[i] = 0;
  } else if ((a.dword[i] % 4) < 2) {
    dst.dword[i] = c.dword[a.dword[i] % 2];
  } else {
    dst.dword[i] = b.dword[a.dword[i] % 2];
  }
}

Tested on real machine.

Latency and Throughput

CPU	Latency	Throughput (IPC)
3A6000	1	2
3C5000	1	2

__m128i __lsx_vshuf4i_b (__m128i a, imm0_255 imm)

Synopsis

__m128i __lsx_vshuf4i_b (__m128i a, imm0_255 imm)
#include <lsxintrin.h>
Instruction: vshuf4i.b vr, vr, imm
CPU Flags: LSX

Description

Shuffle every four 8-bit elements in a with indices packed in imm, save the result to dst.

Examples

__m128i __lsx_vshuf4i_b(__m128i{0xabcdef1314156678, 0x1234123443214321}, 0x12)
= 0x13ef13cd78667815 0x3412343421432121

Operation

for (int i = 0; i < 16; i++) {
  dst.byte[i] = a.byte[(i & ~0x3) + ((imm >> (2 * (i & 0x3))) & 0x3)];
}

Tested on real machine.

Latency and Throughput

CPU	Latency	Throughput (IPC)
3A6000	1	4
3C5000	1	2

__m128i __lsx_vshuf4i_h (__m128i a, imm0_255 imm)

Synopsis

__m128i __lsx_vshuf4i_h (__m128i a, imm0_255 imm)
#include <lsxintrin.h>
Instruction: vshuf4i.h vr, vr, imm
CPU Flags: LSX

Description

Shuffle every four 16-bit elements in a with indices packed in imm, save the result to dst.

Examples

__m128i __lsx_vshuf4i_h(__m128i{0xabcdef1314156678, 0x1234123443214321}, 0x12)
= 0x667814156678ef13 0x4321432143211234

Operation

for (int i = 0; i < 8; i++) {
  dst.half[i] = a.half[(i & ~0x3) + ((imm >> (2 * (i & 0x3))) & 0x3)];
}

Tested on real machine.

Latency and Throughput

CPU	Latency	Throughput (IPC)
3A6000	1	4
3C5000	1	2

__m128i __lsx_vshuf4i_w (__m128i a, imm0_255 imm)

Synopsis

__m128i __lsx_vshuf4i_w (__m128i a, imm0_255 imm)
#include <lsxintrin.h>
Instruction: vshuf4i.w vr, vr, imm
CPU Flags: LSX

Description

Shuffle every four 32-bit elements in a with indices packed in imm, save the result to dst.

Examples

__m128i __lsx_vshuf4i_w(__m128i{0xabcdef1314156678, 0x1234123443214321}, 0x12)
= 0x1415667843214321 0x14156678abcdef13

Operation

for (int i = 0; i < 4; i++) {
  dst.word[i] = a.word[(i & ~0x3) + ((imm >> (2 * (i & 0x3))) & 0x3)];
}

Tested on real machine.

Latency and Throughput

CPU	Latency	Throughput (IPC)
3A6000	1	4
3C5000	1	2

__m128i __lsx_vshuf4i_d (__m128i a, __m128i b, imm0_255 imm)

Synopsis

__m128i __lsx_vshuf4i_d (__m128i a, __m128i b, imm0_255 imm)
#include <lsxintrin.h>
Instruction: vshuf4i.d vr, vr, imm
CPU Flags: LSX

Description

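Select two 64-bit elements from the four 64-bit elements of a and b (two in each) with indices packed in imm, save the result to dst. For each destination element, a 2-bit field of imm is used: the low bit picks the element index and the high bit picks the source (0 for a, 1 for b).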

Examples

__m128i __lsx_vshuf4i_d(__m128i{0x1122334455667788, 0x99aabbccddeeff00}, __m128i{0xabcdef1314156678, 0x1234123443214321}, 0x12)
= 0xabcdef1314156678 0x1122334455667788

Operation

dst.dword[0] = (imm & 2) ? b.dword[(imm & 1)] : a.dword[(imm & 1)];
dst.dword[1] =
    (imm & 8) ? b.dword[((imm >> 2) & 1)] : a.dword[((imm >> 2) & 1)];

Tested on real machine.

Latency and Throughput

CPU	Latency	Throughput (CPI)
3A6000	1	4
3C5000	1	2