Bitwise Operations

__m128i __lsx_vbitsel_v (__m128i a, __m128i b, __m128i c)

Synopsis

__m128i __lsx_vbitsel_v (__m128i a, __m128i b, __m128i c)
#include <lsxintrin.h>
Instruction: vbitsel.v vr, vr, vr, vr
CPU Flags: LSX

Description

Compute bitwise selection: for each bit position, if the bit in c equals to one, copy the bit from b to dst, otherwise copy from a.

Examples

__m128i __lsx_vbitsel_v(__m128i{0x1122334455667788, 0x99aabbccddeeff00}, __m128i{0xabababababababab, 0x1234123443214321}, __m128i{0xffff0000aaaabbbb, 0x1111222233334444})
= 0xabab3344ffeeefab 0x98ba9beccfedfb00

Operation

for (int i = 0; i < 2; i++) {
  dst.dword[i] = (c.dword[i] & b.dword[i]) | (~c.dword[i] & a.dword[i]);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 1 2
3C5000 1 2

__m128i __lsx_vbitseli_b (__m128i a, __m128i b, imm0_255 imm)

Synopsis

__m128i __lsx_vbitseli_b (__m128i a, __m128i b, imm0_255 imm)
#include <lsxintrin.h>
Instruction: vbitseli.b vr, vr, imm
CPU Flags: LSX

Description

Compute bitwise selection: for each bit position, if the bit in a equals to one, copy the bit from imm to dst, otherwise copy from b.

Examples

__m128i __lsx_vbitseli_b(__m128i{0x1122334455667788, 0x99aabbccddeeff00}, __m128i{0xabababababababab, 0x1234123443214321}, 0x12)
= 0xba8b9aabba8b9a23 0x1216123012031221

Operation

for (int i = 0; i < 16; i++) {
  dst.byte[i] = (~a.byte[i] & b.byte[i]) | (a.byte[i] & (u8)imm);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 1 2
3C5000 1 2

__m128i __lsx_vbitclr_b (__m128i a, __m128i b)

Synopsis

__m128i __lsx_vbitclr_b (__m128i a, __m128i b)
#include <lsxintrin.h>
Instruction: vbitclr.b vr, vr, vr
CPU Flags: LSX

Description

Clear the bit specified by elements in b from 8-bit elements in a, save the result in dst.

Examples

__m128i __lsx_vbitclr_b(__m128i{0xffffffffffffffff, 0x99aabbccddeeff00}, __m128i{0xabababababababab, 0x1234123443214321})
= 0xf7f7f7f7f7f7f7f7 0x99aabbccd5ecf700

Operation

for (int i = 0; i < 16; i++) {
  dst.byte[i] = a.byte[i] & (~((u8)1 << (b.byte[i] % 8)));
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 2 2
3C5000 2 2

__m128i __lsx_vbitclr_h (__m128i a, __m128i b)

Synopsis

__m128i __lsx_vbitclr_h (__m128i a, __m128i b)
#include <lsxintrin.h>
Instruction: vbitclr.h vr, vr, vr
CPU Flags: LSX

Description

Clear the bit specified by elements in b from 16-bit elements in a, save the result in dst.

Examples

__m128i __lsx_vbitclr_h(__m128i{0xffffffffffffffff, 0x99aabbccddeeff00}, __m128i{0xabababababababab, 0x1234123443214321})
= 0xf7fff7fff7fff7ff 0x99aabbccddecff00

Operation

for (int i = 0; i < 8; i++) {
  dst.half[i] = a.half[i] & (~((u16)1 << (b.half[i] % 16)));
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 2 2
3C5000 2 2

__m128i __lsx_vbitclr_w (__m128i a, __m128i b)

Synopsis

__m128i __lsx_vbitclr_w (__m128i a, __m128i b)
#include <lsxintrin.h>
Instruction: vbitclr.w vr, vr, vr
CPU Flags: LSX

Description

Clear the bit specified by elements in b from 32-bit elements in a, save the result in dst.

Examples

__m128i __lsx_vbitclr_w(__m128i{0xffffffffffffffff, 0x99aabbccddeeff00}, __m128i{0xabababababababab, 0x1234123443214321})
= 0xfffff7fffffff7ff 0x99aabbccddeeff00

Operation

for (int i = 0; i < 4; i++) {
  dst.word[i] = a.word[i] & (~((u32)1 << (b.word[i] % 32)));
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 2 2
3C5000 2 2

__m128i __lsx_vbitclr_d (__m128i a, __m128i b)

Synopsis

__m128i __lsx_vbitclr_d (__m128i a, __m128i b)
#include <lsxintrin.h>
Instruction: vbitclr.d vr, vr, vr
CPU Flags: LSX

Description

Clear the bit specified by elements in b from 64-bit elements in a, save the result in dst.

Examples

__m128i __lsx_vbitclr_d(__m128i{0xffffffffffffffff, 0x99aabbccddeeff00}, __m128i{0xabababababababab, 0x1234123443214321})
= 0xfffff7ffffffffff 0x99aabbccddeeff00

Operation

for (int i = 0; i < 2; i++) {
  dst.dword[i] = a.dword[i] & (~((u64)1 << (b.dword[i] % 64)));
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 2 2
3C5000 2 2

__m128i __lsx_vbitclri_b (__m128i a, imm0_7 imm)

Synopsis

__m128i __lsx_vbitclri_b (__m128i a, imm0_7 imm)
#include <lsxintrin.h>
Instruction: vbitclri.b vr, vr, imm
CPU Flags: LSX

Description

Clear the bit specified by imm from 8-bit elements in a, save the result in dst.

Examples

__m128i __lsx_vbitclri_b(__m128i{0xffffffffffffffff, 0x99aabbccddeeff00}, 1)
= 0xfdfdfdfdfdfdfdfd 0x99a8b9ccddecfd00

Operation

for (int i = 0; i < 16; i++) {
  dst.byte[i] = a.byte[i] & (~((u8)1 << imm));
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 2 2
3C5000 2 2

__m128i __lsx_vbitclri_h (__m128i a, imm0_15 imm)

Synopsis

__m128i __lsx_vbitclri_h (__m128i a, imm0_15 imm)
#include <lsxintrin.h>
Instruction: vbitclri.h vr, vr, imm
CPU Flags: LSX

Description

Clear the bit specified by imm from 16-bit elements in a, save the result in dst.

Examples

__m128i __lsx_vbitclri_h(__m128i{0xffffffffffffffff, 0x99aabbccddeeff00}, 1)
= 0xfffdfffdfffdfffd 0x99a8bbccddecff00

Operation

for (int i = 0; i < 8; i++) {
  dst.half[i] = a.half[i] & (~((u16)1 << imm));
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 2 2
3C5000 2 2

__m128i __lsx_vbitclri_w (__m128i a, imm0_31 imm)

Synopsis

__m128i __lsx_vbitclri_w (__m128i a, imm0_31 imm)
#include <lsxintrin.h>
Instruction: vbitclri.w vr, vr, imm
CPU Flags: LSX

Description

Clear the bit specified by imm from 32-bit elements in a, save the result in dst.

Examples

__m128i __lsx_vbitclri_w(__m128i{0xffffffffffffffff, 0x99aabbccddeeff00}, 1)
= 0xfffffffdfffffffd 0x99aabbccddeeff00

Operation

for (int i = 0; i < 4; i++) {
  dst.word[i] = a.word[i] & (~((u32)1 << imm));
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 2 2
3C5000 2 2

__m128i __lsx_vbitclri_d (__m128i a, imm0_63 imm)

Synopsis

__m128i __lsx_vbitclri_d (__m128i a, imm0_63 imm)
#include <lsxintrin.h>
Instruction: vbitclri.d vr, vr, imm
CPU Flags: LSX

Description

Clear the bit specified by imm from 64-bit elements in a, save the result in dst.

Examples

__m128i __lsx_vbitclri_d(__m128i{0xffffffffffffffff, 0x99aabbccddeeff00}, 1)
= 0xfffffffffffffffd 0x99aabbccddeeff00

Operation

for (int i = 0; i < 2; i++) {
  dst.dword[i] = a.dword[i] & (~((u64)1 << imm));
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 2 2
3C5000 2 2

__m128i __lsx_vbitset_b (__m128i a, __m128i b)

Synopsis

__m128i __lsx_vbitset_b (__m128i a, __m128i b)
#include <lsxintrin.h>
Instruction: vbitset.b vr, vr, vr
CPU Flags: LSX

Description

Set the bit specified by elements in b from 8-bit elements in a, save the result in dst.

Examples

__m128i __lsx_vbitset_b(__m128i{0x0000000000000000, 0x99aabbccddeeff00}, __m128i{0xabababababababab, 0x1234123443214321})
= 0x0808080808080808 0x9dbabfdcddeeff02

Operation

for (int i = 0; i < 16; i++) {
  dst.byte[i] = a.byte[i] | ((u8)1 << (b.byte[i] % 8));
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 2 2
3C5000 2 2

__m128i __lsx_vbitset_h (__m128i a, __m128i b)

Synopsis

__m128i __lsx_vbitset_h (__m128i a, __m128i b)
#include <lsxintrin.h>
Instruction: vbitset.h vr, vr, vr
CPU Flags: LSX

Description

Set the bit specified by elements in b from 16-bit elements in a, save the result in dst.

Examples

__m128i __lsx_vbitset_h(__m128i{0x0000000000000000, 0x99aabbccddeeff00}, __m128i{0xabababababababab, 0x1234123443214321})
= 0x0800080008000800 0x99babbdcddeeff02

Operation

for (int i = 0; i < 8; i++) {
  dst.half[i] = a.half[i] | ((u16)1 << (b.half[i] % 16));
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 2 2
3C5000 2 2

__m128i __lsx_vbitset_w (__m128i a, __m128i b)

Synopsis

__m128i __lsx_vbitset_w (__m128i a, __m128i b)
#include <lsxintrin.h>
Instruction: vbitset.w vr, vr, vr
CPU Flags: LSX

Description

Set the bit specified by elements in b from 32-bit elements in a, save the result in dst.

Examples

__m128i __lsx_vbitset_w(__m128i{0x0000000000000000, 0x99aabbccddeeff00}, __m128i{0xabababababababab, 0x1234123443214321})
= 0x0000080000000800 0x99babbccddeeff02

Operation

for (int i = 0; i < 4; i++) {
  dst.word[i] = a.word[i] | ((u32)1 << (b.word[i] % 32));
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 2 2
3C5000 2 2

__m128i __lsx_vbitset_d (__m128i a, __m128i b)

Synopsis

__m128i __lsx_vbitset_d (__m128i a, __m128i b)
#include <lsxintrin.h>
Instruction: vbitset.d vr, vr, vr
CPU Flags: LSX

Description

Set the bit specified by elements in b from 64-bit elements in a, save the result in dst.

Examples

__m128i __lsx_vbitset_d(__m128i{0x0000000000000000, 0x99aabbccddeeff00}, __m128i{0xabababababababab, 0x1234123443214321})
= 0x0000080000000000 0x99aabbceddeeff00

Operation

for (int i = 0; i < 2; i++) {
  dst.dword[i] = a.dword[i] | ((u64)1 << (b.dword[i] % 64));
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 2 2
3C5000 2 2

__m128i __lsx_vbitseti_b (__m128i a, imm0_7 imm)

Synopsis

__m128i __lsx_vbitseti_b (__m128i a, imm0_7 imm)
#include <lsxintrin.h>
Instruction: vbitseti.b vr, vr, imm
CPU Flags: LSX

Description

Set the bit specified by imm from 8-bit elements in a, save the result in dst.

Examples

__m128i __lsx_vbitseti_b(__m128i{0x0000000000000000, 0x99aabbccddeeff00}, 1)
= 0x0202020202020202 0x9baabbcedfeeff02

Operation

for (int i = 0; i < 16; i++) {
  dst.byte[i] = a.byte[i] | ((u8)1 << imm);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 2 2
3C5000 2 2

__m128i __lsx_vbitseti_h (__m128i a, imm0_15 imm)

Synopsis

__m128i __lsx_vbitseti_h (__m128i a, imm0_15 imm)
#include <lsxintrin.h>
Instruction: vbitseti.h vr, vr, imm
CPU Flags: LSX

Description

Set the bit specified by imm from 16-bit elements in a, save the result in dst.

Examples

__m128i __lsx_vbitseti_h(__m128i{0x0000000000000000, 0x99aabbccddeeff00}, 1)
= 0x0002000200020002 0x99aabbceddeeff02

Operation

for (int i = 0; i < 8; i++) {
  dst.half[i] = a.half[i] | ((u16)1 << imm);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 2 2
3C5000 2 2

__m128i __lsx_vbitseti_w (__m128i a, imm0_31 imm)

Synopsis

__m128i __lsx_vbitseti_w (__m128i a, imm0_31 imm)
#include <lsxintrin.h>
Instruction: vbitseti.w vr, vr, imm
CPU Flags: LSX

Description

Set the bit specified by imm from 32-bit elements in a, save the result in dst.

Examples

__m128i __lsx_vbitseti_w(__m128i{0x0000000000000000, 0x99aabbccddeeff00}, 1)
= 0x0000000200000002 0x99aabbceddeeff02

Operation

for (int i = 0; i < 4; i++) {
  dst.word[i] = a.word[i] | ((u32)1 << imm);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 2 2
3C5000 2 2

__m128i __lsx_vbitseti_d (__m128i a, imm0_63 imm)

Synopsis

__m128i __lsx_vbitseti_d (__m128i a, imm0_63 imm)
#include <lsxintrin.h>
Instruction: vbitseti.d vr, vr, imm
CPU Flags: LSX

Description

Set the bit specified by imm from 64-bit elements in a, save the result in dst.

Examples

__m128i __lsx_vbitseti_d(__m128i{0x0000000000000000, 0x99aabbccddeeff00}, 1)
= 0x0000000000000002 0x99aabbccddeeff02

Operation

for (int i = 0; i < 2; i++) {
  dst.dword[i] = a.dword[i] | ((u64)1 << imm);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 2 2
3C5000 2 2

__m128i __lsx_vbitrev_b (__m128i a, __m128i b)

Synopsis

__m128i __lsx_vbitrev_b (__m128i a, __m128i b)
#include <lsxintrin.h>
Instruction: vbitrev.b vr, vr, vr
CPU Flags: LSX

Description

Toggle the bit specified by elements in b from 8-bit elements in a, save the result in dst.

Examples

__m128i __lsx_vbitrev_b(__m128i{0x0f0f0f0f0f0f0f0f, 0x99aabbccddeeff00}, __m128i{0xabababababababab, 0x1234123443214321})
= 0x0707070707070707 0x9dbabfdcd5ecf702

Operation

for (int i = 0; i < 16; i++) {
  dst.byte[i] = a.byte[i] ^ ((u8)1 << (b.byte[i] % 8));
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 2 2
3C5000 2 2

__m128i __lsx_vbitrev_h (__m128i a, __m128i b)

Synopsis

__m128i __lsx_vbitrev_h (__m128i a, __m128i b)
#include <lsxintrin.h>
Instruction: vbitrev.h vr, vr, vr
CPU Flags: LSX

Description

Toggle the bit specified by elements in b from 16-bit elements in a, save the result in dst.

Examples

__m128i __lsx_vbitrev_h(__m128i{0x0f0f0f0f0f0f0f0f, 0x99aabbccddeeff00}, __m128i{0xabababababababab, 0x1234123443214321})
= 0x070f070f070f070f 0x99babbdcddecff02

Operation

for (int i = 0; i < 8; i++) {
  dst.half[i] = a.half[i] ^ ((u16)1 << (b.half[i] % 16));
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 2 2
3C5000 2 2

__m128i __lsx_vbitrev_w (__m128i a, __m128i b)

Synopsis

__m128i __lsx_vbitrev_w (__m128i a, __m128i b)
#include <lsxintrin.h>
Instruction: vbitrev.w vr, vr, vr
CPU Flags: LSX

Description

Toggle the bit specified by elements in b from 32-bit elements in a, save the result in dst.

Examples

__m128i __lsx_vbitrev_w(__m128i{0x0f0f0f0f0f0f0f0f, 0x99aabbccddeeff00}, __m128i{0xabababababababab, 0x1234123443214321})
= 0x0f0f070f0f0f070f 0x99babbccddeeff02

Operation

for (int i = 0; i < 4; i++) {
  dst.word[i] = a.word[i] ^ ((u32)1 << (b.word[i] % 32));
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 2 2
3C5000 2 2

__m128i __lsx_vbitrev_d (__m128i a, __m128i b)

Synopsis

__m128i __lsx_vbitrev_d (__m128i a, __m128i b)
#include <lsxintrin.h>
Instruction: vbitrev.d vr, vr, vr
CPU Flags: LSX

Description

Toggle the bit specified by elements in b from 64-bit elements in a, save the result in dst.

Examples

__m128i __lsx_vbitrev_d(__m128i{0x0f0f0f0f0f0f0f0f, 0x99aabbccddeeff00}, __m128i{0xabababababababab, 0x1234123443214321})
= 0x0f0f070f0f0f0f0f 0x99aabbceddeeff00

Operation

for (int i = 0; i < 2; i++) {
  dst.dword[i] = a.dword[i] ^ ((u64)1 << (b.dword[i] % 64));
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 2 2
3C5000 2 2

__m128i __lsx_vbitrevi_b (__m128i a, imm0_7 imm)

Synopsis

__m128i __lsx_vbitrevi_b (__m128i a, imm0_7 imm)
#include <lsxintrin.h>
Instruction: vbitrevi.b vr, vr, imm
CPU Flags: LSX

Description

Toggle the bit specified by imm from 8-bit elements in a, save the result in dst.

Examples

__m128i __lsx_vbitrevi_b(__m128i{0x0f0f0f0f0f0f0f0f, 0x99aabbccddeeff00}, 1)
= 0x0d0d0d0d0d0d0d0d 0x9ba8b9cedfecfd02

Operation

for (int i = 0; i < 16; i++) {
  dst.byte[i] = a.byte[i] ^ ((u8)1 << imm);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 2 2
3C5000 2 2

__m128i __lsx_vbitrevi_h (__m128i a, imm0_15 imm)

Synopsis

__m128i __lsx_vbitrevi_h (__m128i a, imm0_15 imm)
#include <lsxintrin.h>
Instruction: vbitrevi.h vr, vr, imm
CPU Flags: LSX

Description

Toggle the bit specified by imm from 16-bit elements in a, save the result in dst.

Examples

__m128i __lsx_vbitrevi_h(__m128i{0x0f0f0f0f0f0f0f0f, 0x99aabbccddeeff00}, 1)
= 0x0f0d0f0d0f0d0f0d 0x99a8bbceddecff02

Operation

for (int i = 0; i < 8; i++) {
  dst.half[i] = a.half[i] ^ ((u16)1 << imm);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 2 2
3C5000 2 2

__m128i __lsx_vbitrevi_w (__m128i a, imm0_31 imm)

Synopsis

__m128i __lsx_vbitrevi_w (__m128i a, imm0_31 imm)
#include <lsxintrin.h>
Instruction: vbitrevi.w vr, vr, imm
CPU Flags: LSX

Description

Toggle the bit specified by imm from 32-bit elements in a, save the result in dst.

Examples

__m128i __lsx_vbitrevi_w(__m128i{0x0f0f0f0f0f0f0f0f, 0x99aabbccddeeff00}, 1)
= 0x0f0f0f0d0f0f0f0d 0x99aabbceddeeff02

Operation

for (int i = 0; i < 4; i++) {
  dst.word[i] = a.word[i] ^ ((u32)1 << imm);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 2 2
3C5000 2 2

__m128i __lsx_vbitrevi_d (__m128i a, imm0_63 imm)

Synopsis

__m128i __lsx_vbitrevi_d (__m128i a, imm0_63 imm)
#include <lsxintrin.h>
Instruction: vbitrevi.d vr, vr, imm
CPU Flags: LSX

Description

Toggle the bit specified by imm from 64-bit elements in a, save the result in dst.

Examples

__m128i __lsx_vbitrevi_d(__m128i{0x0f0f0f0f0f0f0f0f, 0x99aabbccddeeff00}, 1)
= 0x0f0f0f0f0f0f0f0d 0x99aabbccddeeff02

Operation

for (int i = 0; i < 2; i++) {
  dst.dword[i] = a.dword[i] ^ ((u64)1 << imm);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 2 2
3C5000 2 2

__m128i __lsx_vclo_b (__m128i a)

Synopsis

__m128i __lsx_vclo_b (__m128i a)
#include <lsxintrin.h>
Instruction: vclo.b vr, vr
CPU Flags: LSX

Description

Count leading ones of 8-bit elements in a.

Examples

__m128i __lsx_vclo_b(__m128i{0x1122334455667788, 0x99aabbccddeeff00})
= 0x0000000000000001 0x0101010202030800

Operation

for (int i = 0; i < 16; i++) {
  dst.byte[i] = clo(a.byte[i]);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 2 4
3C5000 2 2

__m128i __lsx_vclo_h (__m128i a)

Synopsis

__m128i __lsx_vclo_h (__m128i a)
#include <lsxintrin.h>
Instruction: vclo.h vr, vr
CPU Flags: LSX

Description

Count leading ones of 16-bit elements in a.

Examples

__m128i __lsx_vclo_h(__m128i{0x1122334455667788, 0x99aabbccddeeff00})
= 0x0000000000000000 0x0001000100020008

Operation

for (int i = 0; i < 8; i++) {
  dst.half[i] = clo(a.half[i]);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 2 4
3C5000 2 2

__m128i __lsx_vclo_w (__m128i a)

Synopsis

__m128i __lsx_vclo_w (__m128i a)
#include <lsxintrin.h>
Instruction: vclo.w vr, vr
CPU Flags: LSX

Description

Count leading ones of 32-bit elements in a.

Examples

__m128i __lsx_vclo_w(__m128i{0x1122334455667788, 0x99aabbccddeeff00})
= 0x0000000000000000 0x0000000100000002

Operation

for (int i = 0; i < 4; i++) {
  dst.word[i] = clo(a.word[i]);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 2 4
3C5000 2 2

__m128i __lsx_vclo_d (__m128i a)

Synopsis

__m128i __lsx_vclo_d (__m128i a)
#include <lsxintrin.h>
Instruction: vclo.d vr, vr
CPU Flags: LSX

Description

Count leading ones of 64-bit elements in a.

Examples

__m128i __lsx_vclo_d(__m128i{0x1122334455667788, 0x99aabbccddeeff00})
= 0x0000000000000000 0x0000000000000001

Operation

for (int i = 0; i < 2; i++) {
  dst.dword[i] = clo(a.dword[i]);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 2 4
3C5000 2 2

__m128i __lsx_vclz_b (__m128i a)

Synopsis

__m128i __lsx_vclz_b (__m128i a)
#include <lsxintrin.h>
Instruction: vclz.b vr, vr
CPU Flags: LSX

Description

Count leading zeros of 8-bit elements in a.

Examples

__m128i __lsx_vclz_b(__m128i{0x1122334455667788, 0x99aabbccddeeff00})
= 0x0302020101010100 0x0000000000000008

Operation

for (int i = 0; i < 16; i++) {
  dst.byte[i] = clz(a.byte[i]);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 2 4
3C5000 2 2

__m128i __lsx_vclz_h (__m128i a)

Synopsis

__m128i __lsx_vclz_h (__m128i a)
#include <lsxintrin.h>
Instruction: vclz.h vr, vr
CPU Flags: LSX

Description

Count leading zeros of 16-bit elements in a.

Examples

__m128i __lsx_vclz_h(__m128i{0x1122334455667788, 0x99aabbccddeeff00})
= 0x0003000200010001 0x0000000000000000

Operation

for (int i = 0; i < 8; i++) {
  dst.half[i] = clz(a.half[i]);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 2 4
3C5000 2 2

__m128i __lsx_vclz_w (__m128i a)

Synopsis

__m128i __lsx_vclz_w (__m128i a)
#include <lsxintrin.h>
Instruction: vclz.w vr, vr
CPU Flags: LSX

Description

Count leading zeros of 32-bit elements in a.

Examples

__m128i __lsx_vclz_w(__m128i{0x1122334455667788, 0x99aabbccddeeff00})
= 0x0000000300000001 0x0000000000000000

Operation

for (int i = 0; i < 4; i++) {
  dst.word[i] = clz(a.word[i]);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 2 4
3C5000 2 2

__m128i __lsx_vclz_d (__m128i a)

Synopsis

__m128i __lsx_vclz_d (__m128i a)
#include <lsxintrin.h>
Instruction: vclz.d vr, vr
CPU Flags: LSX

Description

Count leading zeros of 64-bit elements in a.

Examples

__m128i __lsx_vclz_d(__m128i{0x1122334455667788, 0x99aabbccddeeff00})
= 0x0000000000000003 0x0000000000000000

Operation

for (int i = 0; i < 2; i++) {
  dst.dword[i] = clz(a.dword[i]);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 2 4
3C5000 2 2

__m128i __lsx_vpcnt_b (__m128i a)

Synopsis

__m128i __lsx_vpcnt_b (__m128i a)
#include <lsxintrin.h>
Instruction: vpcnt.b vr, vr
CPU Flags: LSX

Description

Count the number of ones (population, popcount) in 8-bit elements in a.

Examples

__m128i __lsx_vpcnt_b(__m128i{0x1122334455667788, 0x99aabbccddeeff00})
= 0x0202040204040602 0x0404060406060800

Operation

for (int i = 0; i < 16; i++) {
  dst.byte[i] = popcount(a.byte[i]);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 2 2
3C5000 2 2

__m128i __lsx_vpcnt_h (__m128i a)

Synopsis

__m128i __lsx_vpcnt_h (__m128i a)
#include <lsxintrin.h>
Instruction: vpcnt.h vr, vr
CPU Flags: LSX

Description

Count the number of ones (population, popcount) in 16-bit elements in a.

Examples

__m128i __lsx_vpcnt_h(__m128i{0x1122334455667788, 0x99aabbccddeeff00})
= 0x0004000600080008 0x0008000a000c0008

Operation

for (int i = 0; i < 8; i++) {
  dst.half[i] = popcount(a.half[i]);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 2 2
3C5000 2 2

__m128i __lsx_vpcnt_w (__m128i a)

Synopsis

__m128i __lsx_vpcnt_w (__m128i a)
#include <lsxintrin.h>
Instruction: vpcnt.w vr, vr
CPU Flags: LSX

Description

Count the number of ones (population, popcount) in 32-bit elements in a.

Examples

__m128i __lsx_vpcnt_w(__m128i{0x1122334455667788, 0x99aabbccddeeff00})
= 0x0000000a00000010 0x0000001200000014

Operation

for (int i = 0; i < 4; i++) {
  dst.word[i] = popcount(a.word[i]);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 2 2
3C5000 2 2

__m128i __lsx_vpcnt_d (__m128i a)

Synopsis

__m128i __lsx_vpcnt_d (__m128i a)
#include <lsxintrin.h>
Instruction: vpcnt.d vr, vr
CPU Flags: LSX

Description

Count the number of ones (population, popcount) in 64-bit elements in a.

Examples

__m128i __lsx_vpcnt_d(__m128i{0x1122334455667788, 0x99aabbccddeeff00})
= 0x000000000000001a 0x0000000000000026

Operation

for (int i = 0; i < 2; i++) {
  dst.dword[i] = popcount(a.dword[i]);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 2 2
3C5000 2 2