Misc

__m256i __lasx_xvexth_h_b (__m256i a)

Synopsis

__m256i __lasx_xvexth_h_b (__m256i a)
#include <lasxintrin.h>
Instruction: xvexth.h.b xr, xr
CPU Flags: LASX

Description

Extend signed 8-bit elements in the higher half of each 128-bit lane of a to 16-bit.

Operation

int i;
for (i = 0; i < 8; i++) {
  dst.half[i] = (s16)(s8)a.byte[8 + i];
}
for (; i < 16; i++) {
  dst.half[i] = (s16)(s8)a.byte[16 + i];
}

Tested on real machine.

Latency and Throughput

CPU	Latency	Throughput (CPI)
3A6000	1	4
3C5000	1	2

__m256i __lasx_xvexth_hu_bu (__m256i a)

Synopsis

__m256i __lasx_xvexth_hu_bu (__m256i a)
#include <lasxintrin.h>
Instruction: xvexth.hu.bu xr, xr
CPU Flags: LASX

Description

Extend unsigned 8-bit elements in the higher half of each 128-bit lane of a to 16-bit.

Operation

int i;
for (i = 0; i < 8; i++) {
  dst.half[i] = (u16)(u8)a.byte[8 + i];
}
for (; i < 16; i++) {
  dst.half[i] = (u16)(u8)a.byte[16 + i];
}

Tested on real machine.

Latency and Throughput

CPU	Latency	Throughput (CPI)
3A6000	1	4
3C5000	1	2

__m256i __lasx_xvexth_w_h (__m256i a)

Synopsis

__m256i __lasx_xvexth_w_h (__m256i a)
#include <lasxintrin.h>
Instruction: xvexth.w.h xr, xr
CPU Flags: LASX

Description

Extend signed 16-bit elements in the higher half of each 128-bit lane of a to 32-bit.

Operation

int i;
for (i = 0; i < 4; i++) {
  dst.word[i] = (s32)(s16)a.half[4 + i];
}
for (; i < 8; i++) {
  dst.word[i] = (s32)(s16)a.half[8 + i];
}

Tested on real machine.

Latency and Throughput

CPU	Latency	Throughput (CPI)
3A6000	1	4
3C5000	1	2

__m256i __lasx_xvexth_wu_hu (__m256i a)

Synopsis

__m256i __lasx_xvexth_wu_hu (__m256i a)
#include <lasxintrin.h>
Instruction: xvexth.wu.hu xr, xr
CPU Flags: LASX

Description

Extend unsigned 16-bit elements in the higher half of each 128-bit lane of a to 32-bit.

Operation

int i;
for (i = 0; i < 4; i++) {
  dst.word[i] = (u32)(u16)a.half[4 + i];
}
for (; i < 8; i++) {
  dst.word[i] = (u32)(u16)a.half[8 + i];
}

Tested on real machine.

Latency and Throughput

CPU	Latency	Throughput (CPI)
3A6000	1	4
3C5000	1	2

__m256i __lasx_xvexth_d_w (__m256i a)

Synopsis

__m256i __lasx_xvexth_d_w (__m256i a)
#include <lasxintrin.h>
Instruction: xvexth.d.w xr, xr
CPU Flags: LASX

Description

Extend signed 32-bit elements in the higher half of each 128-bit lane of a to 64-bit.

Operation

int i;
for (i = 0; i < 2; i++) {
  dst.dword[i] = (s64)(s32)a.word[2 + i];
}
for (; i < 4; i++) {
  dst.dword[i] = (s64)(s32)a.word[4 + i];
}

Tested on real machine.

Latency and Throughput

CPU	Latency	Throughput (CPI)
3A6000	1	4
3C5000	1	2

__m256i __lasx_xvexth_du_wu (__m256i a)

Synopsis

__m256i __lasx_xvexth_du_wu (__m256i a)
#include <lasxintrin.h>
Instruction: xvexth.du.wu xr, xr
CPU Flags: LASX

Description

Extend unsigned 32-bit elements in the higher half of each 128-bit lane of a to 64-bit.

Operation

int i;
for (i = 0; i < 2; i++) {
  dst.dword[i] = (u64)(u32)a.word[2 + i];
}
for (; i < 4; i++) {
  dst.dword[i] = (u64)(u32)a.word[4 + i];
}

Tested on real machine.

Latency and Throughput

CPU	Latency	Throughput (CPI)
3A6000	1	4
3C5000	1	2

__m256i __lasx_xvexth_q_d (__m256i a)

Synopsis

__m256i __lasx_xvexth_q_d (__m256i a)
#include <lasxintrin.h>
Instruction: xvexth.q.d xr, xr
CPU Flags: LASX

Description

Extend signed 64-bit elements in the higher half of each 128-bit lane of a to 128-bit.

Operation

int i;
for (i = 0; i < 1; i++) {
  dst.qword[i] = (s128)(s64)a.dword[1 + i];
}
for (; i < 2; i++) {
  dst.qword[i] = (s128)(s64)a.dword[2 + i];
}

Tested on real machine.

Latency and Throughput

CPU	Latency	Throughput (CPI)
3A6000	1	4
3C5000	1	2

__m256i __lasx_xvexth_qu_du (__m256i a)

Synopsis

__m256i __lasx_xvexth_qu_du (__m256i a)
#include <lasxintrin.h>
Instruction: xvexth.qu.du xr, xr
CPU Flags: LASX

Description

Extend unsigned 64-bit elements in the higher half of each 128-bit lane of a to 128-bit.

Operation

int i;
for (i = 0; i < 1; i++) {
  dst.qword[i] = (u128)(u64)a.dword[1 + i];
}
for (; i < 2; i++) {
  dst.qword[i] = (u128)(u64)a.dword[2 + i];
}

Tested on real machine.

Latency and Throughput

CPU	Latency	Throughput (CPI)
3A6000	1	4
3C5000	1	2

__m256i __lasx_xvextl_q_d (__m256i a)

Synopsis

__m256i __lasx_xvextl_q_d (__m256i a)
#include <lasxintrin.h>
Instruction: xvextl.q.d xr, xr
CPU Flags: LASX

Description

Extend signed 64-bit elements in the lower half of each 128-bit lane of a to 128-bit.

Operation

int i;
for (i = 0; i < 1; i++) {
  dst.qword[i] = (s128)(s64)a.dword[i];
}
for (; i < 2; i++) {
  dst.qword[i] = (s128)(s64)a.dword[i + 1];
}

Tested on real machine.

Latency and Throughput

CPU	Latency	Throughput (CPI)
3A6000	1	4
3C5000	1	2

__m256i __lasx_xvextl_qu_du (__m256i a)

Synopsis

__m256i __lasx_xvextl_qu_du (__m256i a)
#include <lasxintrin.h>
Instruction: xvextl.qu.du xr, xr
CPU Flags: LASX

Description

Extend unsigned 64-bit elements in the lower half of each 128-bit lane of a to 128-bit.

Operation

int i;
for (i = 0; i < 1; i++) {
  dst.qword[i] = (u128)(u64)a.dword[i];
}
for (; i < 2; i++) {
  dst.qword[i] = (u128)(u64)a.dword[i + 1];
}

Tested on real machine.

Latency and Throughput

CPU	Latency	Throughput (CPI)
3A6000	1	4
3C5000	1	2

__m256i __lasx_xvextrins_b (__m256i a, __m256i b, imm0_255 imm)

Synopsis

__m256i __lasx_xvextrins_b (__m256i a, __m256i b, imm0_255 imm)
#include <lasxintrin.h>
Instruction: xvextrins.b xr, xr, imm
CPU Flags: LASX

Description

Extract one 8-bit element in b and insert it to a according to imm.

Operation

int i;
for (i = 0; i < 16; i++) {
  dst.byte[i] = (i == ((imm >> 4) & 15)) ? b.byte[imm & 15] : a.byte[i];
}
for (; i < 32; i++) {
  dst.byte[i] =
      (i - 16 == ((imm >> 4) & 15)) ? b.byte[(imm & 15) + 16] : a.byte[i];
}

Tested on real machine.

Latency and Throughput

CPU	Latency	Throughput (CPI)
3A6000	1	4
3C5000	1	2

__m256i __lasx_xvextrins_h (__m256i a, __m256i b, imm0_255 imm)

Synopsis

__m256i __lasx_xvextrins_h (__m256i a, __m256i b, imm0_255 imm)
#include <lasxintrin.h>
Instruction: xvextrins.h xr, xr, imm
CPU Flags: LASX

Description

Extract one 16-bit element in b and insert it to a according to imm.

Operation

int i;
for (i = 0; i < 8; i++) {
  dst.half[i] = (i == ((imm >> 4) & 7)) ? b.half[imm & 7] : a.half[i];
}
for (; i < 16; i++) {
  dst.half[i] = (i - 8 == ((imm >> 4) & 7)) ? b.half[(imm & 7) + 8] : a.half[i];
}

Tested on real machine.

Latency and Throughput

CPU	Latency	Throughput (CPI)
3A6000	1	4
3C5000	1	2

__m256i __lasx_xvextrins_w (__m256i a, __m256i b, imm0_255 imm)

Synopsis

__m256i __lasx_xvextrins_w (__m256i a, __m256i b, imm0_255 imm)
#include <lasxintrin.h>
Instruction: xvextrins.w xr, xr, imm
CPU Flags: LASX

Description

Extract one 32-bit element in b and insert it to a according to imm.

Operation

int i;
for (i = 0; i < 4; i++) {
  dst.word[i] = (i == ((imm >> 4) & 3)) ? b.word[imm & 3] : a.word[i];
}
for (; i < 8; i++) {
  dst.word[i] = (i - 4 == ((imm >> 4) & 3)) ? b.word[(imm & 3) + 4] : a.word[i];
}

Tested on real machine.

Latency and Throughput

CPU	Latency	Throughput (CPI)
3A6000	1	4
3C5000	1	2

__m256i __lasx_xvextrins_d (__m256i a, __m256i b, imm0_255 imm)

Synopsis

__m256i __lasx_xvextrins_d (__m256i a, __m256i b, imm0_255 imm)
#include <lasxintrin.h>
Instruction: xvextrins.d xr, xr, imm
CPU Flags: LASX

Description

Extract one 64-bit element in b and insert it to a according to imm.

Operation

int i;
for (i = 0; i < 2; i++) {
  dst.dword[i] = (i == ((imm >> 4) & 1)) ? b.dword[imm & 1] : a.dword[i];
}
for (; i < 4; i++) {
  dst.dword[i] =
      (i - 2 == ((imm >> 4) & 1)) ? b.dword[(imm & 1) + 2] : a.dword[i];
}

Tested on real machine.

Latency and Throughput

CPU	Latency	Throughput (CPI)
3A6000	1	4
3C5000	1	2

__m256i __lasx_vext2xv_h_b (__m256i a)

Synopsis

__m256i __lasx_vext2xv_h_b (__m256i a)
#include <lasxintrin.h>
Instruction: vext2xv.h.b xr, xr
CPU Flags: LASX

Description

Extend signed 8-bit lane of a to signed 16-bit elements.

Operation

for (int i = 0; i < 16; i++) {
  dst.half[i] = (s16)(s8)a.byte[i];
}

Tested on real machine.

Latency and Throughput

CPU	Latency	Throughput (CPI)
3A6000	3	4
3C5000	3	2

__m256i __lasx_vext2xv_hu_bu (__m256i a)

Synopsis

__m256i __lasx_vext2xv_hu_bu (__m256i a)
#include <lasxintrin.h>
Instruction: vext2xv.hu.bu xr, xr
CPU Flags: LASX

Description

Extend unsigned 8-bit lane of a to unsigned 16-bit elements.

Operation

for (int i = 0; i < 16; i++) {
  dst.half[i] = (u16)(u8)a.byte[i];
}

Tested on real machine.

Latency and Throughput

CPU	Latency	Throughput (CPI)
3A6000	3	4
3C5000	3	2

__m256i __lasx_vext2xv_w_b (__m256i a)

Synopsis

__m256i __lasx_vext2xv_w_b (__m256i a)
#include <lasxintrin.h>
Instruction: vext2xv.w.b xr, xr
CPU Flags: LASX

Description

Extend signed 8-bit lane of a to signed 32-bit elements.

Operation

for (int i = 0; i < 8; i++) {
  dst.word[i] = (s32)(s8)a.byte[i];
}

Tested on real machine.

Latency and Throughput

CPU	Latency	Throughput (CPI)
3A6000	3	4
3C5000	3	2

__m256i __lasx_vext2xv_wu_bu (__m256i a)

Synopsis

__m256i __lasx_vext2xv_wu_bu (__m256i a)
#include <lasxintrin.h>
Instruction: vext2xv.wu.bu xr, xr
CPU Flags: LASX

Description

Extend unsigned 8-bit lane of a to unsigned 32-bit elements.

Operation

for (int i = 0; i < 8; i++) {
  dst.word[i] = (u32)(u8)a.byte[i];
}

Tested on real machine.

Latency and Throughput

CPU	Latency	Throughput (CPI)
3A6000	3	4
3C5000	3	2

__m256i __lasx_vext2xv_w_h (__m256i a)

Synopsis

__m256i __lasx_vext2xv_w_h (__m256i a)
#include <lasxintrin.h>
Instruction: vext2xv.w.h xr, xr
CPU Flags: LASX

Description

Extend signed 16-bit lane of a to signed 32-bit elements.

Operation

for (int i = 0; i < 8; i++) {
  dst.word[i] = (s32)(s16)a.half[i];
}

Tested on real machine.

Latency and Throughput

CPU	Latency	Throughput (CPI)
3A6000	3	4
3C5000	3	2

__m256i __lasx_vext2xv_wu_hu (__m256i a)

Synopsis

__m256i __lasx_vext2xv_wu_hu (__m256i a)
#include <lasxintrin.h>
Instruction: vext2xv.wu.hu xr, xr
CPU Flags: LASX

Description

Extend unsigned 16-bit lane of a to unsigned 32-bit elements.

Operation

for (int i = 0; i < 8; i++) {
  dst.word[i] = (u32)(u16)a.half[i];
}

Tested on real machine.

Latency and Throughput

CPU	Latency	Throughput (CPI)
3A6000	3	4
3C5000	3	2

__m256i __lasx_vext2xv_d_b (__m256i a)

Synopsis

__m256i __lasx_vext2xv_d_b (__m256i a)
#include <lasxintrin.h>
Instruction: vext2xv.d.b xr, xr
CPU Flags: LASX

Description

Extend signed 8-bit lane of a to signed 64-bit elements.

Operation

for (int i = 0; i < 4; i++) {
  dst.dword[i] = (s64)(s8)a.byte[i];
}

Tested on real machine.

Latency and Throughput

CPU	Latency	Throughput (CPI)
3A6000	3	4
3C5000	3	2

__m256i __lasx_vext2xv_du_bu (__m256i a)

Synopsis

__m256i __lasx_vext2xv_du_bu (__m256i a)
#include <lasxintrin.h>
Instruction: vext2xv.du.bu xr, xr
CPU Flags: LASX

Description

Extend unsigned 8-bit lane of a to unsigned 64-bit elements.

Operation

for (int i = 0; i < 4; i++) {
  dst.dword[i] = (u64)(u8)a.byte[i];
}

Tested on real machine.

Latency and Throughput

CPU	Latency	Throughput (CPI)
3A6000	3	4
3C5000	3	2

__m256i __lasx_vext2xv_d_h (__m256i a)

Synopsis

__m256i __lasx_vext2xv_d_h (__m256i a)
#include <lasxintrin.h>
Instruction: vext2xv.d.h xr, xr
CPU Flags: LASX

Description

Extend signed 16-bit lane of a to signed 64-bit elements.

Operation

for (int i = 0; i < 4; i++) {
  dst.dword[i] = (s64)(s16)a.half[i];
}

Tested on real machine.

Latency and Throughput

CPU	Latency	Throughput (CPI)
3A6000	3	4
3C5000	3	2

__m256i __lasx_vext2xv_du_hu (__m256i a)

Synopsis

__m256i __lasx_vext2xv_du_hu (__m256i a)
#include <lasxintrin.h>
Instruction: vext2xv.du.hu xr, xr
CPU Flags: LASX

Description

Extend unsigned 16-bit lane of a to unsigned 64-bit elements.

Operation

for (int i = 0; i < 4; i++) {
  dst.dword[i] = (u64)(u16)a.half[i];
}

Tested on real machine.

Latency and Throughput

CPU	Latency	Throughput (CPI)
3A6000	3	4
3C5000	3	2

__m256i __lasx_vext2xv_d_w (__m256i a)

Synopsis

__m256i __lasx_vext2xv_d_w (__m256i a)
#include <lasxintrin.h>
Instruction: vext2xv.d.w xr, xr
CPU Flags: LASX

Description

Extend signed 32-bit lane of a to signed 64-bit elements.

Operation

for (int i = 0; i < 4; i++) {
  dst.dword[i] = (s64)(s32)a.word[i];
}

Tested on real machine.

Latency and Throughput

CPU	Latency	Throughput (CPI)
3A6000	3	4
3C5000	3	2

__m256i __lasx_vext2xv_du_wu (__m256i a)

Synopsis

__m256i __lasx_vext2xv_du_wu (__m256i a)
#include <lasxintrin.h>
Instruction: vext2xv.du.wu xr, xr
CPU Flags: LASX

Description

Extend unsigned 32-bit lane of a to unsigned 64-bit elements.

Operation

for (int i = 0; i < 4; i++) {
  dst.dword[i] = (u64)(u32)a.word[i];
}

Tested on real machine.

Latency and Throughput

CPU	Latency	Throughput (CPI)
3A6000	3	4
3C5000	3	2

__m256i __lasx_xvilvh_b (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvilvh_b (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvilvh.b xr, xr, xr
CPU Flags: LASX

Description

Interleave 8-bit elements in the higher half of each 128-bit lane of a and b.

Operation

int i;
for (i = 0; i < 16; i++) {
  dst.byte[i] = (i % 2 == 1) ? a.byte[i / 2 + 8] : b.byte[i / 2 + 8];
}
for (; i < 32; i++) {
  dst.byte[i] = (i % 2 == 1) ? a.byte[i / 2 + 16] : b.byte[i / 2 + 16];
}

Tested on real machine.

Latency and Throughput

CPU	Latency	Throughput (CPI)
3A6000	1	4
3C5000	1	2

__m256i __lasx_xvilvh_h (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvilvh_h (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvilvh.h xr, xr, xr
CPU Flags: LASX

Description

Interleave 16-bit elements in the higher half of each 128-bit lane of a and b.

Operation

int i;
for (i = 0; i < 8; i++) {
  dst.half[i] = (i % 2 == 1) ? a.half[i / 2 + 4] : b.half[i / 2 + 4];
}
for (; i < 16; i++) {
  dst.half[i] = (i % 2 == 1) ? a.half[i / 2 + 8] : b.half[i / 2 + 8];
}

Tested on real machine.

Latency and Throughput

CPU	Latency	Throughput (CPI)
3A6000	1	4
3C5000	1	2

__m256i __lasx_xvilvh_w (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvilvh_w (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvilvh.w xr, xr, xr
CPU Flags: LASX

Description

Interleave 32-bit elements in the higher half of each 128-bit lane of a and b.

Operation

int i;
for (i = 0; i < 4; i++) {
  dst.word[i] = (i % 2 == 1) ? a.word[i / 2 + 2] : b.word[i / 2 + 2];
}
for (; i < 8; i++) {
  dst.word[i] = (i % 2 == 1) ? a.word[i / 2 + 4] : b.word[i / 2 + 4];
}

Tested on real machine.

Latency and Throughput

CPU	Latency	Throughput (CPI)
3A6000	1	4
3C5000	1	2

__m256i __lasx_xvilvh_d (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvilvh_d (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvilvh.d xr, xr, xr
CPU Flags: LASX

Description

Interleave 64-bit elements in the higher half of each 128-bit lane of a and b.

Operation

int i;
for (i = 0; i < 2; i++) {
  dst.dword[i] = (i % 2 == 1) ? a.dword[i / 2 + 1] : b.dword[i / 2 + 1];
}
for (; i < 4; i++) {
  dst.dword[i] = (i % 2 == 1) ? a.dword[i / 2 + 2] : b.dword[i / 2 + 2];
}

Tested on real machine.

Latency and Throughput

CPU	Latency	Throughput (CPI)
3A6000	1	4
3C5000	1	2

__m256i __lasx_xvilvl_b (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvilvl_b (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvilvl.b xr, xr, xr
CPU Flags: LASX

Description

Interleave 8-bit elements in the lower half of each 128-bit lane of a and b.

Operation

int i;
for (i = 0; i < 16; i++) {
  dst.byte[i] = (i % 2 == 1) ? a.byte[i / 2] : b.byte[i / 2];
}
for (; i < 32; i++) {
  dst.byte[i] = (i % 2 == 1) ? a.byte[i / 2 + 8] : b.byte[i / 2 + 8];
}

Tested on real machine.

Latency and Throughput

CPU	Latency	Throughput (CPI)
3A6000	1	4
3C5000	1	2

__m256i __lasx_xvilvl_h (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvilvl_h (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvilvl.h xr, xr, xr
CPU Flags: LASX

Description

Interleave 16-bit elements in the lower half of each 128-bit lane of a and b.

Operation

int i;
for (i = 0; i < 8; i++) {
  dst.half[i] = (i % 2 == 1) ? a.half[i / 2] : b.half[i / 2];
}
for (; i < 16; i++) {
  dst.half[i] = (i % 2 == 1) ? a.half[i / 2 + 4] : b.half[i / 2 + 4];
}

Tested on real machine.

Latency and Throughput

CPU	Latency	Throughput (CPI)
3A6000	1	4
3C5000	1	2

__m256i __lasx_xvilvl_w (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvilvl_w (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvilvl.w xr, xr, xr
CPU Flags: LASX

Description

Interleave 32-bit elements in the lower half of each 128-bit lane of a and b.

Operation

int i;
for (i = 0; i < 4; i++) {
  dst.word[i] = (i % 2 == 1) ? a.word[i / 2] : b.word[i / 2];
}
for (; i < 8; i++) {
  dst.word[i] = (i % 2 == 1) ? a.word[i / 2 + 2] : b.word[i / 2 + 2];
}

Tested on real machine.

Latency and Throughput

CPU	Latency	Throughput (CPI)
3A6000	1	4
3C5000	1	2

__m256i __lasx_xvilvl_d (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvilvl_d (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvilvl.d xr, xr, xr
CPU Flags: LASX

Description

Interleave 64-bit elements in the lower half of each 128-bit lane of a and b.

Operation

int i;
for (i = 0; i < 2; i++) {
  dst.dword[i] = (i % 2 == 1) ? a.dword[i / 2] : b.dword[i / 2];
}
for (; i < 4; i++) {
  dst.dword[i] = (i % 2 == 1) ? a.dword[i / 2 + 1] : b.dword[i / 2 + 1];
}

Tested on real machine.

Latency and Throughput

CPU	Latency	Throughput (CPI)
3A6000	1	4
3C5000	1	2

__m256i __lasx_xvinsgr2vr_w (__m256i a, int b, imm0_7 imm)

Synopsis

__m256i __lasx_xvinsgr2vr_w (__m256i a, int b, imm0_7 imm)
#include <lasxintrin.h>
Instruction: xvinsgr2vr.w xr, r, imm
CPU Flags: LASX

Description

Insert 32-bit element into lane indexed imm.

Operation

for (int i = 0; i < 8; i++) {
  dst.word[i] = (i == imm) ? b : a.word[i];
}

Tested on real machine.

Latency and Throughput

CPU	Latency	Throughput (CPI)
3A6000	1	1
3C5000	1	1

__m256i __lasx_xvinsgr2vr_d (__m256i a, long int b, imm0_3 imm)

Synopsis

__m256i __lasx_xvinsgr2vr_d (__m256i a, long int b, imm0_3 imm)
#include <lasxintrin.h>
Instruction: xvinsgr2vr.d xr, r, imm
CPU Flags: LASX

Description

Insert 64-bit element into lane indexed imm.

Operation

for (int i = 0; i < 4; i++) {
  dst.dword[i] = (i == imm) ? b : a.dword[i];
}

Tested on real machine.

Latency and Throughput

CPU	Latency	Throughput (CPI)
3A6000	1	1
3C5000	1	1

__m256i __lasx_xvinsve0_w (__m256i a, __m256i b, imm0_7 imm)

Synopsis

__m256i __lasx_xvinsve0_w (__m256i a, __m256i b, imm0_7 imm)
#include <lasxintrin.h>
Instruction: xvinsve0.w xr, xr, imm
CPU Flags: LASX

Description

Insert the first 32-bit lane of b into lane indexed imm of a.

Operation

for (int i = 0; i < 8; i++) {
  dst.word[i] = (i == imm) ? b.word[0] : a.word[i];
}

Tested on real machine.

Latency and Throughput

CPU	Latency	Throughput (CPI)
3A6000	1	4
3C5000	1	2

__m256i __lasx_xvinsve0_d (__m256i a, __m256i b, imm0_3 imm)

Synopsis

__m256i __lasx_xvinsve0_d (__m256i a, __m256i b, imm0_3 imm)
#include <lasxintrin.h>
Instruction: xvinsve0.d xr, xr, imm
CPU Flags: LASX

Description

Insert the first 64-bit lane of b into lane indexed imm of a.

Operation

for (int i = 0; i < 4; i++) {
  dst.dword[i] = (i == imm) ? b.dword[0] : a.dword[i];
}

Tested on real machine.

Latency and Throughput

CPU	Latency	Throughput (CPI)
3A6000	1	4
3C5000	1	2

__m256i __lasx_xvfrstp_b (__m256i a, __m256i b, __m256i c)

Synopsis

__m256i __lasx_xvfrstp_b (__m256i a, __m256i b, __m256i c)
#include <lasxintrin.h>
Instruction: xvfrstp.b xr, xr, xr
CPU Flags: LASX

Description

Find the first negative 8-bit element in b, set the index of the element to the lane of a specified by c.

Operation

for (int i = 0; i < 32; i++) {
  dst.byte[i] = a.byte[i];
}
int i;
for (i = 0; i < 16; i++) {
  if ((s8)b.byte[i] < 0) {
    break;
  }
}
dst.byte[c.byte[0] % 16] = i;
for (i = 16; i < 32; i++) {
  if ((s8)b.byte[i] < 0) {
    break;
  }
}
dst.byte[(c.byte[16] % 16) + 16] = i - 16;

Tested on real machine.

Latency and Throughput

CPU	Latency	Throughput (CPI)
3A6000	2	2
3C5000	2	2

__m256i __lasx_xvfrstp_h (__m256i a, __m256i b, __m256i c)

Synopsis

__m256i __lasx_xvfrstp_h (__m256i a, __m256i b, __m256i c)
#include <lasxintrin.h>
Instruction: xvfrstp.h xr, xr, xr
CPU Flags: LASX

Description

Find the first negative 16-bit element in b, set the index of the element to the lane of a specified by c.

Operation

for (int i = 0; i < 16; i++) {
  dst.half[i] = a.half[i];
}
int i;
for (i = 0; i < 8; i++) {
  if ((s16)b.half[i] < 0) {
    break;
  }
}
dst.half[c.half[0] % 8] = i;
for (i = 8; i < 16; i++) {
  if ((s16)b.half[i] < 0) {
    break;
  }
}
dst.half[(c.half[8] % 8) + 8] = i - 8;

Tested on real machine.

Latency and Throughput

CPU	Latency	Throughput (CPI)
3A6000	2	2
3C5000	2	2

__m256i __lasx_xvfrstpi_b (__m256i a, __m256i b, imm0_31 imm)

Synopsis

__m256i __lasx_xvfrstpi_b (__m256i a, __m256i b, imm0_31 imm)
#include <lasxintrin.h>
Instruction: xvfrstpi.b xr, xr, imm
CPU Flags: LASX

Description

Find the first negative 8-bit element in b, set the index of the element to the lane of a specified by imm.

Operation

for (int i = 0; i < 32; i++) {
  dst.byte[i] = a.byte[i];
}
int i;
for (i = 0; i < 16; i++) {
  if ((s8)b.byte[i] < 0) {
    break;
  }
}
dst.byte[imm % 16] = i;
for (i = 16; i < 32; i++) {
  if ((s8)b.byte[i] < 0) {
    break;
  }
}
dst.byte[(imm % 16) + 16] = i - 16;

Tested on real machine.

Latency and Throughput

CPU	Latency	Throughput (CPI)
3A6000	2	2
3C5000	2	2

__m256i __lasx_xvfrstpi_h (__m256i a, __m256i b, imm0_31 imm)

Synopsis

__m256i __lasx_xvfrstpi_h (__m256i a, __m256i b, imm0_31 imm)
#include <lasxintrin.h>
Instruction: xvfrstpi.h xr, xr, imm
CPU Flags: LASX

Description

Find the first negative 16-bit element in b, set the index of the element to the lane of a specified by imm.

Operation

for (int i = 0; i < 16; i++) {
  dst.half[i] = a.half[i];
}
int i;
for (i = 0; i < 8; i++) {
  if ((s16)b.half[i] < 0) {
    break;
  }
}
dst.half[imm % 8] = i;
for (i = 8; i < 16; i++) {
  if ((s16)b.half[i] < 0) {
    break;
  }
}
dst.half[(imm % 8) + 8] = i - 8;

Tested on real machine.

Latency and Throughput

CPU	Latency	Throughput (CPI)
3A6000	2	2
3C5000	2	2

__m256i __lasx_xvmskgez_b (__m256i a)

Synopsis

__m256i __lasx_xvmskgez_b (__m256i a)
#include <lasxintrin.h>
Instruction: xvmskgez.b xr, xr
CPU Flags: LASX

Description

For each 8-bit element in a, if the element is greater than or equal to zero, set one bit in dst, otherwise clear it.

Examples

__m256i __lasx_xvmskgez_b(__m256i{0x1122334455667788, 0x99aabbccddeeff00, 0xabababab12121212, 0x1234567812345678})
= 0x00000000000001fe 0x0000000000000000 0x000000000000ff0f 0x0000000000000000
__m256i __lasx_xvmskgez_b(__m256i{0x0000191100000000, 0x00a1000011b11c11, 0x1181000008010101, 0x0000000000000000})
= 0x000000000000bbff 0x0000000000000000 0x000000000000ffbf 0x0000000000000000

Operation

u64 m = 0x8080808080808080;
u64 c = m & a.dword[0];
c |= c << 7;
c |= c << 14;
c |= c << 28;
c >>= 56;
dst.dword[0] = c;
c = m & a.dword[1];
c |= c << 7;
c |= c << 14;
c |= c << 28;
c >>= 56;
dst.dword[0] |= c << 8;
dst.dword[0] = (u16)~dst.dword[0];
dst.dword[1] = 0;

c = m & a.dword[2];
c |= c << 7;
c |= c << 14;
c |= c << 28;
c >>= 56;
dst.dword[2] = c;
c = m & a.dword[3];
c |= c << 7;
c |= c << 14;
c |= c << 28;
c >>= 56;
dst.dword[2] |= c << 8;
dst.dword[2] = (u16)~dst.dword[2];
dst.dword[3] = 0;

Tested on real machine.

Latency and Throughput

CPU	Latency	Throughput (CPI)
3A6000	1	4
3C5000	1	2

__m256i __lasx_xvmskltz_b (__m256i a)

Synopsis

__m256i __lasx_xvmskltz_b (__m256i a)
#include <lasxintrin.h>
Instruction: xvmskltz.b xr, xr
CPU Flags: LASX

Description

For each 8-bit element in a, if the element is less than zero, set one bit in dst, otherwise clear it.

Examples

__m256i __lasx_xvmskltz_b(__m256i{0x1122334455667788, 0x99aabbccddeeff00, 0xabababab12121212, 0x1234567812345678})
= 0x000000000000fe01 0x0000000000000000 0x00000000000000f0 0x0000000000000000
__m256i __lasx_xvmskltz_b(__m256i{0x0000118100000000, 0x0081000081111118, 0x1181000001010801, 0x0000000000000000})
= 0x0000000000004810 0x0000000000000000 0x0000000000000040 0x0000000000000000

Operation

u64 m = 0x8080808080808080;
u64 c = m & a.dword[0];
c |= c << 7;
c |= c << 14;
c |= c << 28;
c >>= 56;
dst.dword[0] = c;
c = m & a.dword[1];
c |= c << 7;
c |= c << 14;
c |= c << 28;
c >>= 56;
dst.dword[0] |= c << 8;
dst.dword[1] = 0;

c = m & a.dword[2];
c |= c << 7;
c |= c << 14;
c |= c << 28;
c >>= 56;
dst.dword[2] = c;
c = m & a.dword[3];
c |= c << 7;
c |= c << 14;
c |= c << 28;
c >>= 56;
dst.dword[2] |= c << 8;
dst.dword[3] = 0;

Tested on real machine.

Latency and Throughput

CPU	Latency	Throughput (CPI)
3A6000	1	4
3C5000	1	2

__m256i __lasx_xvmskltz_h (__m256i a)

Synopsis

__m256i __lasx_xvmskltz_h (__m256i a)
#include <lasxintrin.h>
Instruction: xvmskltz.h xr, xr
CPU Flags: LASX

Description

For each 16-bit element in a, if the element is less than zero, set one bit in dst, otherwise clear it.

Examples

__m256i __lasx_xvmskltz_h(__m256i{0x1122334455667788, 0x99aabbccddeeff00, 0xabababab12121212, 0x1234567812345678})
= 0x00000000000000f0 0x0000000000000000 0x000000000000000c 0x0000000000000000
__m256i __lasx_xvmskltz_h(__m256i{0x0000818100000000, 0x0018000018181881, 0x1181000008080808, 0x0000000000000000})
= 0x0000000000000004 0x0000000000000000 0x0000000000000000 0x0000000000000000

Operation

u64 m = 0x8000800080008000;
u64 c = m & a.dword[0];
c |= c << 15;
c |= c << 30;
c >>= 60;
dst.dword[0] = c;
c = m & a.dword[1];
c |= c << 15;
c |= c << 30;
c >>= 60;
dst.dword[0] |= c << 4;
dst.dword[1] = 0;

c = m & a.dword[2];
c |= c << 15;
c |= c << 30;
c >>= 60;
dst.dword[2] = c;
c = m & a.dword[3];
c |= c << 15;
c |= c << 30;
c >>= 60;
dst.dword[2] |= c << 4;
dst.dword[3] = 0;

Tested on real machine.

Latency and Throughput

CPU	Latency	Throughput (CPI)
3A6000	1	4
3C5000	1	2

__m256i __lasx_xvmskltz_w (__m256i a)

Synopsis

__m256i __lasx_xvmskltz_w (__m256i a)
#include <lasxintrin.h>
Instruction: xvmskltz.w xr, xr
CPU Flags: LASX

Description

For each 32-bit element in a, if the element is less than zero, set one bit in dst, otherwise clear it.

Examples

__m256i __lasx_xvmskltz_w(__m256i{0x1122334455667788, 0x99aabbccddeeff00, 0xabababab12121212, 0x1234567812345678})
= 0x000000000000000c 0x0000000000000000 0x0000000000000002 0x0000000000000000
__m256i __lasx_xvmskltz_w(__m256i{0x0000811100000000, 0x0018000081111111, 0x8111000001010108, 0x0000000000000000})
= 0x0000000000000004 0x0000000000000000 0x0000000000000002 0x0000000000000000

Operation

u64 m = 0x8000000080000000;
u64 c = m & a.dword[0];
c |= c << 31;
c >>= 62;
dst.dword[0] = c;
c = m & a.dword[1];
c |= c << 31;
c >>= 62;
dst.dword[0] |= c << 2;
dst.dword[1] = 0;

c = m & a.dword[2];
c |= c << 31;
c >>= 62;
dst.dword[2] = c;
c = m & a.dword[3];
c |= c << 31;
c >>= 62;
dst.dword[2] |= c << 2;
dst.dword[3] = 0;

Tested on real machine.

Latency and Throughput

CPU	Latency	Throughput (CPI)
3A6000	1	4
3C5000	1	2

__m256i __lasx_xvmskltz_d (__m256i a)

Synopsis

__m256i __lasx_xvmskltz_d (__m256i a)
#include <lasxintrin.h>
Instruction: xvmskltz.d xr, xr
CPU Flags: LASX

Description

For each 64-bit element in a, if the element is less than zero, set one bit in dst, otherwise clear it.

Examples

__m256i __lasx_xvmskltz_d(__m256i{0x1122334455667788, 0x99aabbccddeeff00, 0xabababab12121212, 0x1234567812345678})
= 0x0000000000000002 0x0000000000000000 0x0000000000000001 0x0000000000000000
__m256i __lasx_xvmskltz_d(__m256i{0x0000111800000000, 0x0081000081111111, 0x8111000008010101, 0x0000000000000000})
= 0x0000000000000000 0x0000000000000000 0x0000000000000001 0x0000000000000000

Operation

u64 m = 0x8000000000000000;
u64 c = m & a.dword[0];
c >>= 63;
dst.dword[0] = c;
c = m & a.dword[1];
c >>= 63;
dst.dword[0] |= c << 1;
dst.dword[1] = 0;

c = m & a.dword[2];
c >>= 63;
dst.dword[2] = c;
c = m & a.dword[3];
c >>= 63;
dst.dword[2] |= c << 1;
dst.dword[3] = 0;

Tested on real machine.

Latency and Throughput

CPU	Latency	Throughput (CPI)
3A6000	1	4
3C5000	1	2

__m256i __lasx_xvmsknz_b (__m256i a)

Synopsis

__m256i __lasx_xvmsknz_b (__m256i a)
#include <lasxintrin.h>
Instruction: xvmsknz.b xr, xr
CPU Flags: LASX

Description

For each 8-bit element in a, if the element is non-zero, set one bit in dst, otherwise clear it.

Examples

__m256i __lasx_xvmsknz_b(__m256i{0x1122334455667788, 0x99aabbccddeeff00, 0xabababab12121212, 0x1234567812345678})
= 0x000000000000feff 0x0000000000000000 0x000000000000ffff 0x0000000000000000
__m256i __lasx_xvmsknz_b(__m256i{0x0000111100000000, 0x0011000011111111, 0x1111000001010101, 0x0000000000000000})
= 0x0000000000004f30 0x0000000000000000 0x00000000000000cf 0x0000000000000000

Operation

u64 m = 0x7F7F7F7F7F7F7F7F;
u64 c = ~(((a.dword[0] & m) + m) | a.dword[0] | m);
c |= c << 7;
c |= c << 14;
c |= c << 28;
c >>= 56;
dst.dword[0] = c;
c = ~(((a.dword[1] & m) + m) | a.dword[1] | m);
c |= c << 7;
c |= c << 14;
c |= c << 28;
c >>= 56;
dst.dword[0] |= c << 8;
dst.dword[0] = (u16)~dst.dword[0];
dst.dword[1] = 0;

c = ~(((a.dword[2] & m) + m) | a.dword[2] | m);
c |= c << 7;
c |= c << 14;
c |= c << 28;
c >>= 56;
dst.dword[2] = c;
c = ~(((a.dword[3] & m) + m) | a.dword[3] | m);
c |= c << 7;
c |= c << 14;
c |= c << 28;
c >>= 56;
dst.dword[2] |= c << 8;
dst.dword[2] = (u16)~dst.dword[2];
dst.dword[3] = 0;

Tested on real machine.

Latency and Throughput

CPU	Latency	Throughput (CPI)
3A6000	1	4
3C5000	1	2

__m256i __lasx_xvpackev_b (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvpackev_b (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvpackev.b xr, xr, xr
CPU Flags: LASX

Description

Collect and pack even-positioned 8-bit elements in a and b and store the result in dst.

Operation

for (int i = 0; i < 32; i++) {
  dst.byte[i] = (i % 2 == 1) ? a.byte[i - 1] : b.byte[i];
}

Tested on real machine.

Latency and Throughput

CPU	Latency	Throughput (CPI)
3A6000	1	4
3C5000	1	2

__m256i __lasx_xvpackev_h (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvpackev_h (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvpackev.h xr, xr, xr
CPU Flags: LASX

Description

Collect and pack even-positioned 16-bit elements in a and b and store the result in dst.

Operation

for (int i = 0; i < 16; i++) {
  dst.half[i] = (i % 2 == 1) ? a.half[i - 1] : b.half[i];
}

Tested on real machine.

Latency and Throughput

CPU	Latency	Throughput (CPI)
3A6000	1	4
3C5000	1	2

__m256i __lasx_xvpackev_w (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvpackev_w (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvpackev.w xr, xr, xr
CPU Flags: LASX

Description

Collect and pack even-positioned 32-bit elements in a and b and store the result in dst.

Operation

for (int i = 0; i < 8; i++) {
  dst.word[i] = (i % 2 == 1) ? a.word[i - 1] : b.word[i];
}

Tested on real machine.

Latency and Throughput

CPU	Latency	Throughput (CPI)
3A6000	1	4
3C5000	1	2

__m256i __lasx_xvpackev_d (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvpackev_d (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvpackev.d xr, xr, xr
CPU Flags: LASX

Description

Collect and pack even-positioned 64-bit elements in a and b and store the result in dst.

Operation

for (int i = 0; i < 4; i++) {
  dst.dword[i] = (i % 2 == 1) ? a.dword[i - 1] : b.dword[i];
}

Tested on real machine.

Latency and Throughput

CPU	Latency	Throughput (CPI)
3A6000	1	4
3C5000	1	2

__m256i __lasx_xvpackod_b (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvpackod_b (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvpackod.b xr, xr, xr
CPU Flags: LASX

Description

Collect and pack odd-positioned 8-bit elements in a and b and store the result in dst.

Operation

for (int i = 0; i < 32; i++) {
  dst.byte[i] = (i % 2 == 1) ? a.byte[i] : b.byte[i + 1];
}

Tested on real machine.

Latency and Throughput

CPU	Latency	Throughput (CPI)
3A6000	1	4
3C5000	1	2

__m256i __lasx_xvpackod_h (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvpackod_h (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvpackod.h xr, xr, xr
CPU Flags: LASX

Description

Collect and pack odd-positioned 16-bit elements in a and b and store the result in dst.

Operation

for (int i = 0; i < 16; i++) {
  dst.half[i] = (i % 2 == 1) ? a.half[i] : b.half[i + 1];
}

Tested on real machine.

Latency and Throughput

CPU	Latency	Throughput (CPI)
3A6000	1	4
3C5000	1	2

__m256i __lasx_xvpackod_w (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvpackod_w (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvpackod.w xr, xr, xr
CPU Flags: LASX

Description

Collect and pack odd-positioned 32-bit elements in a and b and store the result in dst.

Operation

for (int i = 0; i < 8; i++) {
  dst.word[i] = (i % 2 == 1) ? a.word[i] : b.word[i + 1];
}

Tested on real machine.

Latency and Throughput

CPU	Latency	Throughput (CPI)
3A6000	1	4
3C5000	1	2

__m256i __lasx_xvpackod_d (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvpackod_d (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvpackod.d xr, xr, xr
CPU Flags: LASX

Description

Collect the odd-positioned 64-bit elements of a and b and pack them interleaved into dst: even lanes of dst come from b, odd lanes from a.

Operation

for (int i = 0; i < 4; i++) {
  dst.dword[i] = (i % 2 == 1) ? a.dword[i] : b.dword[i + 1];
}

Tested on real machine.

Latency and Throughput

CPU	Latency	Throughput (CPI)
3A6000	1	4
3C5000	1	2

__m256i __lasx_xvpickev_b (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvpickev_b (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvpickev.b xr, xr, xr
CPU Flags: LASX

Description

For each 128-bit half, pick the even-positioned 8-bit elements of b first, then the even-positioned 8-bit elements of a.

Operation

for (int i = 0; i < 16; i++) {
  dst.byte[i] = (i < 8) ? b.byte[i * 2] : a.byte[(i - 8) * 2];
}
for (int i = 16; i < 32; i++) {
  dst.byte[i] = (i < 24) ? b.byte[(i - 8) * 2] : a.byte[(i - 16) * 2];
}

Tested on real machine.

Latency and Throughput

CPU	Latency	Throughput (CPI)
3A6000	1	4
3C5000	1	2

__m256i __lasx_xvpickev_h (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvpickev_h (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvpickev.h xr, xr, xr
CPU Flags: LASX

Description

For each 128-bit half, pick the even-positioned 16-bit elements of b first, then the even-positioned 16-bit elements of a.

Operation

for (int i = 0; i < 8; i++) {
  dst.half[i] = (i < 4) ? b.half[i * 2] : a.half[(i - 4) * 2];
}
for (int i = 8; i < 16; i++) {
  dst.half[i] = (i < 12) ? b.half[(i - 4) * 2] : a.half[(i - 8) * 2];
}

Tested on real machine.

Latency and Throughput

CPU	Latency	Throughput (CPI)
3A6000	1	4
3C5000	1	2

__m256i __lasx_xvpickev_w (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvpickev_w (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvpickev.w xr, xr, xr
CPU Flags: LASX

Description

For each 128-bit half, pick the even-positioned 32-bit elements of b first, then the even-positioned 32-bit elements of a.

Operation

for (int i = 0; i < 4; i++) {
  dst.word[i] = (i < 2) ? b.word[i * 2] : a.word[(i - 2) * 2];
}
for (int i = 4; i < 8; i++) {
  dst.word[i] = (i < 6) ? b.word[(i - 2) * 2] : a.word[(i - 4) * 2];
}

Tested on real machine.

Latency and Throughput

CPU	Latency	Throughput (CPI)
3A6000	1	4
3C5000	1	2

__m256i __lasx_xvpickev_d (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvpickev_d (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvpickev.d xr, xr, xr
CPU Flags: LASX

Description

For each 128-bit half, pick the even-positioned 64-bit element of b first, then the even-positioned 64-bit element of a.

Operation

for (int i = 0; i < 2; i++) {
  dst.dword[i] = (i < 1) ? b.dword[i * 2] : a.dword[(i - 1) * 2];
}
for (int i = 2; i < 4; i++) {
  dst.dword[i] = (i < 3) ? b.dword[(i - 1) * 2] : a.dword[(i - 2) * 2];
}

Tested on real machine.

Latency and Throughput

CPU	Latency	Throughput (CPI)
3A6000	1	4
3C5000	1	2

__m256i __lasx_xvpickve_w (__m256i a, imm0_7 imm)

Synopsis

__m256i __lasx_xvpickve_w (__m256i a, imm0_7 imm)
#include <lasxintrin.h>
Instruction: xvpickve.w xr, xr, imm
CPU Flags: LASX

Description

Copy one 32-bit lane from a specified by imm to the first lane of dst, and set the other lanes to zero.

Operation

for (int i = 0; i < 8; i++) {
  dst.word[i] = (i == 0) ? a.word[imm] : 0;
}

Tested on real machine.

Latency and Throughput

CPU	Latency	Throughput (CPI)
3A6000	3	4
3C5000	3	2

__m256i __lasx_xvpickve_d (__m256i a, imm0_3 imm)

Synopsis

__m256i __lasx_xvpickve_d (__m256i a, imm0_3 imm)
#include <lasxintrin.h>
Instruction: xvpickve.d xr, xr, imm
CPU Flags: LASX

Description

Copy one 64-bit lane from a specified by imm to the first lane of dst, and set the other lanes to zero.

Operation

for (int i = 0; i < 4; i++) {
  dst.dword[i] = (i == 0) ? a.dword[imm] : 0;
}

Tested on real machine.

Latency and Throughput

CPU	Latency	Throughput (CPI)
3A6000	3	4
3C5000	3	2

__m256 __lasx_xvpickve_w_f (__m256 a, imm0_7 imm)

Synopsis

__m256 __lasx_xvpickve_w_f (__m256 a, imm0_7 imm)
#include <lasxintrin.h>
Instruction: xvpickve.w xr, xr, imm
CPU Flags: LASX

Description

Copy one 32-bit lane from a specified by imm to the first lane of dst, and set the other lanes to zero.

Operation

for (int i = 0; i < 8; i++) {
  dst.word[i] = (i == 0) ? a.word[imm] : 0;
}

Tested on real machine.

Latency and Throughput

CPU	Latency	Throughput (CPI)
3A6000	3	4
3C5000	3	2

__m256d __lasx_xvpickve_d_f (__m256d a, imm0_3 imm)

Synopsis

__m256d __lasx_xvpickve_d_f (__m256d a, imm0_3 imm)
#include <lasxintrin.h>
Instruction: xvpickve.d xr, xr, imm
CPU Flags: LASX

Description

Copy one 64-bit lane from a specified by imm to the first lane of dst, and set the other lanes to zero.

Operation

for (int i = 0; i < 4; i++) {
  dst.dword[i] = (i == 0) ? a.dword[imm] : 0;
}

Tested on real machine.

Latency and Throughput

CPU	Latency	Throughput (CPI)
3A6000	3	4
3C5000	3	2

int __lasx_xvpickve2gr_w (__m256i a, imm0_7 idx)

Synopsis

int __lasx_xvpickve2gr_w (__m256i a, imm0_7 idx)
#include <lasxintrin.h>
Instruction: xvpickve2gr.w r, xr, imm
CPU Flags: LASX

Description

Pick the 32-bit lane specified by idx from a and store it into dst as a signed value.

Operation

dst = (s32)a.word[idx];

Tested on real machine.

Latency and Throughput

CPU	Latency	Throughput (CPI)
3A6000	1	1
3C5000	1	1

unsigned int __lasx_xvpickve2gr_wu (__m256i a, imm0_7 idx)

Synopsis

unsigned int __lasx_xvpickve2gr_wu (__m256i a, imm0_7 idx)
#include <lasxintrin.h>
Instruction: xvpickve2gr.wu r, xr, imm
CPU Flags: LASX

Description

Pick the 32-bit lane specified by idx from a and store it into dst as an unsigned value.

Operation

dst = (u32)a.word[idx];

Tested on real machine.

Latency and Throughput

CPU	Latency	Throughput (CPI)
3A6000	1	1
3C5000	1	1

long int __lasx_xvpickve2gr_d (__m256i a, imm0_3 idx)

Synopsis

long int __lasx_xvpickve2gr_d (__m256i a, imm0_3 idx)
#include <lasxintrin.h>
Instruction: xvpickve2gr.d r, xr, imm
CPU Flags: LASX

Description

Pick the 64-bit lane specified by idx from a and store it into dst as a signed value.

Operation

dst = (s64)a.dword[idx];

Tested on real machine.

Latency and Throughput

CPU	Latency	Throughput (CPI)
3A6000	1	1
3C5000	1	1

unsigned long int __lasx_xvpickve2gr_du (__m256i a, imm0_3 idx)

Synopsis

unsigned long int __lasx_xvpickve2gr_du (__m256i a, imm0_3 idx)
#include <lasxintrin.h>
Instruction: xvpickve2gr.du r, xr, imm
CPU Flags: LASX

Description

Pick the 64-bit lane specified by idx from a and store it into dst as an unsigned value.

Operation

dst = (u64)a.dword[idx];

Tested on real machine.

Latency and Throughput

CPU	Latency	Throughput (CPI)
3A6000	1	1
3C5000	1	1

__m256i __lasx_xvpickod_b (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvpickod_b (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvpickod.b xr, xr, xr
CPU Flags: LASX

Description

For each 128-bit half, pick the odd-positioned 8-bit elements of b first, then the odd-positioned 8-bit elements of a.

Operation

for (int i = 0; i < 16; i++) {
  dst.byte[i] = (i < 8) ? b.byte[i * 2 + 1] : a.byte[(i - 8) * 2 + 1];
}
for (int i = 16; i < 32; i++) {
  dst.byte[i] = (i < 24) ? b.byte[(i - 8) * 2 + 1] : a.byte[(i - 16) * 2 + 1];
}

Tested on real machine.

Latency and Throughput

CPU	Latency	Throughput (CPI)
3A6000	1	4
3C5000	1	2

__m256i __lasx_xvpickod_h (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvpickod_h (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvpickod.h xr, xr, xr
CPU Flags: LASX

Description

For each 128-bit half, pick the odd-positioned 16-bit elements of b first, then the odd-positioned 16-bit elements of a.

Operation

for (int i = 0; i < 8; i++) {
  dst.half[i] = (i < 4) ? b.half[i * 2 + 1] : a.half[(i - 4) * 2 + 1];
}
for (int i = 8; i < 16; i++) {
  dst.half[i] = (i < 12) ? b.half[(i - 4) * 2 + 1] : a.half[(i - 8) * 2 + 1];
}

Tested on real machine.

Latency and Throughput

CPU	Latency	Throughput (CPI)
3A6000	1	4
3C5000	1	2

__m256i __lasx_xvpickod_w (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvpickod_w (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvpickod.w xr, xr, xr
CPU Flags: LASX

Description

For each 128-bit half, pick the odd-positioned 32-bit elements of b first, then the odd-positioned 32-bit elements of a.

Operation

for (int i = 0; i < 4; i++) {
  dst.word[i] = (i < 2) ? b.word[i * 2 + 1] : a.word[(i - 2) * 2 + 1];
}
for (int i = 4; i < 8; i++) {
  dst.word[i] = (i < 6) ? b.word[(i - 2) * 2 + 1] : a.word[(i - 4) * 2 + 1];
}

Tested on real machine.

Latency and Throughput

CPU	Latency	Throughput (CPI)
3A6000	1	4
3C5000	1	2

__m256i __lasx_xvpickod_d (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvpickod_d (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvpickod.d xr, xr, xr
CPU Flags: LASX

Description

For each 128-bit half, pick the odd-positioned 64-bit element of b first, then the odd-positioned 64-bit element of a.

Operation

for (int i = 0; i < 2; i++) {
  dst.dword[i] = (i < 1) ? b.dword[i * 2 + 1] : a.dword[(i - 1) * 2 + 1];
}
for (int i = 2; i < 4; i++) {
  dst.dword[i] = (i < 3) ? b.dword[(i - 1) * 2 + 1] : a.dword[(i - 2) * 2 + 1];
}

Tested on real machine.

Latency and Throughput

CPU	Latency	Throughput (CPI)
3A6000	1	4
3C5000	1	2

__m256i __lasx_xvrepli_b (imm_n512_511 imm)

Synopsis

__m256i __lasx_xvrepli_b (imm_n512_511 imm)
#include <lasxintrin.h>
Instruction: xvldi xr, imm
CPU Flags: LASX

Description

Repeat imm as 8-bit elements to fill the whole vector.

Operation

for (int i = 0; i < 32; i++) {
  dst.byte[i] = imm;
}

Tested on real machine.

__m256i __lasx_xvrepli_h (imm_n512_511 imm)

Synopsis

__m256i __lasx_xvrepli_h (imm_n512_511 imm)
#include <lasxintrin.h>
Instruction: xvldi xr, imm
CPU Flags: LASX

Description

Repeat imm as 16-bit elements to fill the whole vector.

Operation

for (int i = 0; i < 16; i++) {
  dst.half[i] = imm;
}

Tested on real machine.

__m256i __lasx_xvrepli_w (imm_n512_511 imm)

Synopsis

__m256i __lasx_xvrepli_w (imm_n512_511 imm)
#include <lasxintrin.h>
Instruction: xvldi xr, imm
CPU Flags: LASX

Description

Repeat imm as 32-bit elements to fill the whole vector.

Operation

for (int i = 0; i < 8; i++) {
  dst.word[i] = imm;
}

Tested on real machine.

__m256i __lasx_xvrepli_d (imm_n512_511 imm)

Synopsis

__m256i __lasx_xvrepli_d (imm_n512_511 imm)
#include <lasxintrin.h>
Instruction: xvldi xr, imm
CPU Flags: LASX

Description

Repeat imm as 64-bit elements to fill the whole vector.

Operation

for (int i = 0; i < 4; i++) {
  dst.dword[i] = imm;
}

Tested on real machine.

__m256i __lasx_xvreplgr2vr_b (int val)

Synopsis

__m256i __lasx_xvreplgr2vr_b (int val)
#include <lasxintrin.h>
Instruction: xvreplgr2vr.b xr, r
CPU Flags: LASX

Description

Repeat val as 8-bit elements to fill the whole vector.

Operation

for (int i = 0; i < 32; i++) {
  dst.byte[i] = val;
}

Tested on real machine.

Latency and Throughput

CPU	Latency	Throughput (CPI)
3A6000	N/A	1
3C5000	N/A	1

__m256i __lasx_xvreplgr2vr_h (int val)

Synopsis

__m256i __lasx_xvreplgr2vr_h (int val)
#include <lasxintrin.h>
Instruction: xvreplgr2vr.h xr, r
CPU Flags: LASX

Description

Repeat val as 16-bit elements to fill the whole vector.

Operation

for (int i = 0; i < 16; i++) {
  dst.half[i] = val;
}

Tested on real machine.

Latency and Throughput

CPU	Latency	Throughput (CPI)
3A6000	N/A	1
3C5000	N/A	1

__m256i __lasx_xvreplgr2vr_w (int val)

Synopsis

__m256i __lasx_xvreplgr2vr_w (int val)
#include <lasxintrin.h>
Instruction: xvreplgr2vr.w xr, r
CPU Flags: LASX

Description

Repeat val as 32-bit elements to fill the whole vector.

Operation

for (int i = 0; i < 8; i++) {
  dst.word[i] = val;
}

Tested on real machine.

Latency and Throughput

CPU	Latency	Throughput (CPI)
3A6000	N/A	1
3C5000	N/A	1

__m256i __lasx_xvreplgr2vr_d (long int val)

Synopsis

__m256i __lasx_xvreplgr2vr_d (long int val)
#include <lasxintrin.h>
Instruction: xvreplgr2vr.d xr, r
CPU Flags: LASX

Description

Repeat val as 64-bit elements to fill the whole vector.

Operation

for (int i = 0; i < 4; i++) {
  dst.dword[i] = val;
}

Tested on real machine.

Latency and Throughput

CPU	Latency	Throughput (CPI)
3A6000	N/A	1
3C5000	N/A	1

__m256i __lasx_xvreplve_b (__m256i a, int idx)

Synopsis

__m256i __lasx_xvreplve_b (__m256i a, int idx)
#include <lasxintrin.h>
Instruction: xvreplve.b xr, xr, r
CPU Flags: LASX

Description

For each 128-bit half of a, repeat the 8-bit element in lane idx % 16 of that half to fill the corresponding half of dst.

Operation

for (int i = 0; i < 16; i++) {
  dst.byte[i] = a.byte[idx % 16];
}
for (int i = 16; i < 32; i++) {
  dst.byte[i] = a.byte[(idx % 16) + 16];
}

Tested on real machine.

Latency and Throughput

CPU	Latency	Throughput (CPI)
3A6000	1	1
3C5000	1	1

__m256i __lasx_xvreplve_h (__m256i a, int idx)

Synopsis

__m256i __lasx_xvreplve_h (__m256i a, int idx)
#include <lasxintrin.h>
Instruction: xvreplve.h xr, xr, r
CPU Flags: LASX

Description

For each 128-bit half of a, repeat the 16-bit element in lane idx % 8 of that half to fill the corresponding half of dst.

Operation

for (int i = 0; i < 8; i++) {
  dst.half[i] = a.half[idx % 8];
}
for (int i = 8; i < 16; i++) {
  dst.half[i] = a.half[(idx % 8) + 8];
}

Tested on real machine.

Latency and Throughput

CPU	Latency	Throughput (CPI)
3A6000	1	1
3C5000	1	1

__m256i __lasx_xvreplve_w (__m256i a, int idx)

Synopsis

__m256i __lasx_xvreplve_w (__m256i a, int idx)
#include <lasxintrin.h>
Instruction: xvreplve.w xr, xr, r
CPU Flags: LASX

Description

For each 128-bit half of a, repeat the 32-bit element in lane idx % 4 of that half to fill the corresponding half of dst.

Operation

for (int i = 0; i < 4; i++) {
  dst.word[i] = a.word[idx % 4];
}
for (int i = 4; i < 8; i++) {
  dst.word[i] = a.word[(idx % 4) + 4];
}

Tested on real machine.

Latency and Throughput

CPU	Latency	Throughput (CPI)
3A6000	1	1
3C5000	1	1

__m256i __lasx_xvreplve_d (__m256i a, int idx)

Synopsis

__m256i __lasx_xvreplve_d (__m256i a, int idx)
#include <lasxintrin.h>
Instruction: xvreplve.d xr, xr, r
CPU Flags: LASX

Description

For each 128-bit half of a, repeat the 64-bit element in lane idx % 2 of that half to fill the corresponding half of dst.

Operation

for (int i = 0; i < 2; i++) {
  dst.dword[i] = a.dword[idx % 2];
}
for (int i = 2; i < 4; i++) {
  dst.dword[i] = a.dword[(idx % 2) + 2];
}

Tested on real machine.

Latency and Throughput

CPU	Latency	Throughput (CPI)
3A6000	1	1
3C5000	1	1

__m256i __lasx_xvreplve0_b (__m256i a)

Synopsis

__m256i __lasx_xvreplve0_b (__m256i a)
#include <lasxintrin.h>
Instruction: xvreplve0.b xr, xr
CPU Flags: LASX

Description

Repeat the first 8-bit lane from a to all lanes of dst.

Operation

for (int i = 0; i < 32; i++) {
  dst.byte[i] = a.byte[0];
}

Tested on real machine.

Latency and Throughput

CPU	Latency	Throughput (CPI)
3A6000	3	4
3C5000	3	2

__m256i __lasx_xvreplve0_h (__m256i a)

Synopsis

__m256i __lasx_xvreplve0_h (__m256i a)
#include <lasxintrin.h>
Instruction: xvreplve0.h xr, xr
CPU Flags: LASX

Description

Repeat the first 16-bit lane from a to all lanes of dst.

Operation

for (int i = 0; i < 16; i++) {
  dst.half[i] = a.half[0];
}

Tested on real machine.

Latency and Throughput

CPU	Latency	Throughput (CPI)
3A6000	3	4
3C5000	3	2

__m256i __lasx_xvreplve0_w (__m256i a)

Synopsis

__m256i __lasx_xvreplve0_w (__m256i a)
#include <lasxintrin.h>
Instruction: xvreplve0.w xr, xr
CPU Flags: LASX

Description

Repeat the first 32-bit lane from a to all lanes of dst.

Operation

for (int i = 0; i < 8; i++) {
  dst.word[i] = a.word[0];
}

Tested on real machine.

Latency and Throughput

CPU	Latency	Throughput (CPI)
3A6000	3	4
3C5000	3	2

__m256i __lasx_xvreplve0_d (__m256i a)

Synopsis

__m256i __lasx_xvreplve0_d (__m256i a)
#include <lasxintrin.h>
Instruction: xvreplve0.d xr, xr
CPU Flags: LASX

Description

Repeat the first 64-bit lane from a to all lanes of dst.

Operation

for (int i = 0; i < 4; i++) {
  dst.dword[i] = a.dword[0];
}

Tested on real machine.

Latency and Throughput

CPU	Latency	Throughput (CPI)
3A6000	3	4
3C5000	3	2

__m256i __lasx_xvreplve0_q (__m256i a)

Synopsis

__m256i __lasx_xvreplve0_q (__m256i a)
#include <lasxintrin.h>
Instruction: xvreplve0.q xr, xr
CPU Flags: LASX

Description

Repeat the first 128-bit lane from a to all lanes of dst.

Operation

for (int i = 0; i < 2; i++) {
  dst.qword[i] = a.qword[0];
}

Tested on real machine.

Latency and Throughput

CPU	Latency	Throughput (CPI)
3A6000	3	4
3C5000	3	2

__m256i __lasx_xvrepl128vei_b (__m256i a, imm0_15 idx)

Synopsis

__m256i __lasx_xvrepl128vei_b (__m256i a, imm0_15 idx)
#include <lasxintrin.h>
Instruction: xvrepl128vei.b xr, xr, imm
CPU Flags: LASX

Description

For each 128-bit half of a, repeat the 8-bit element in lane idx of that half to fill the corresponding half of dst.

Operation

for (int i = 0; i < 16; i++) {
  dst.byte[i] = a.byte[idx];
}
for (int i = 16; i < 32; i++) {
  dst.byte[i] = a.byte[idx + 16];
}

Tested on real machine.

Latency and Throughput

CPU	Latency	Throughput (CPI)
3A6000	1	4
3C5000	1	2

__m256i __lasx_xvrepl128vei_h (__m256i a, imm0_7 idx)

Synopsis

__m256i __lasx_xvrepl128vei_h (__m256i a, imm0_7 idx)
#include <lasxintrin.h>
Instruction: xvrepl128vei.h xr, xr, imm
CPU Flags: LASX

Description

For each 128-bit half of a, repeat the 16-bit element in lane idx of that half to fill the corresponding half of dst.

Operation

for (int i = 0; i < 8; i++) {
  dst.half[i] = a.half[idx];
}
for (int i = 8; i < 16; i++) {
  dst.half[i] = a.half[idx + 8];
}

Tested on real machine.

Latency and Throughput

CPU	Latency	Throughput (CPI)
3A6000	1	4
3C5000	1	2

__m256i __lasx_xvrepl128vei_w (__m256i a, imm0_3 idx)

Synopsis

__m256i __lasx_xvrepl128vei_w (__m256i a, imm0_3 idx)
#include <lasxintrin.h>
Instruction: xvrepl128vei.w xr, xr, imm
CPU Flags: LASX

Description

For each 128-bit half of a, repeat the 32-bit element in lane idx of that half to fill the corresponding half of dst.

Operation

for (int i = 0; i < 4; i++) {
  dst.word[i] = a.word[idx];
}
for (int i = 4; i < 8; i++) {
  dst.word[i] = a.word[idx + 4];
}

Tested on real machine.

Latency and Throughput

CPU	Latency	Throughput (CPI)
3A6000	1	4
3C5000	1	2

__m256i __lasx_xvrepl128vei_d (__m256i a, imm0_1 idx)

Synopsis

__m256i __lasx_xvrepl128vei_d (__m256i a, imm0_1 idx)
#include <lasxintrin.h>
Instruction: xvrepl128vei.d xr, xr, imm
CPU Flags: LASX

Description

For each 128-bit half of a, repeat the 64-bit element in lane idx of that half to fill the corresponding half of dst.

Operation

for (int i = 0; i < 2; i++) {
  dst.dword[i] = a.dword[idx];
}
for (int i = 2; i < 4; i++) {
  dst.dword[i] = a.dword[idx + 2];
}

Tested on real machine.

Latency and Throughput

CPU	Latency	Throughput (CPI)
3A6000	1	4
3C5000	1	2

__m256i __lasx_xvsat_b (__m256i a, imm0_7 imm)

Synopsis

__m256i __lasx_xvsat_b (__m256i a, imm0_7 imm)
#include <lasxintrin.h>
Instruction: xvsat.b xr, xr, imm
CPU Flags: LASX

Description

Clamp signed 8-bit elements in a to the range [-2^imm, 2^imm - 1] specified by imm.

Operation

for (int i = 0; i < 32; i++) {
  dst.byte[i] = clamp<s8>(a.byte[i], -(1 << imm), (1 << imm) - 1);
}

Tested on real machine.

Latency and Throughput

CPU	Latency	Throughput (CPI)
3A6000	2	2
3C5000	2	2

__m256i __lasx_xvsat_bu (__m256i a, imm0_7 imm)

Synopsis

__m256i __lasx_xvsat_bu (__m256i a, imm0_7 imm)
#include <lasxintrin.h>
Instruction: xvsat.bu xr, xr, imm
CPU Flags: LASX

Description

Clamp unsigned 8-bit elements in a to the range [0, 2^(imm+1) - 1] specified by imm.

Operation

for (int i = 0; i < 32; i++) {
  dst.byte[i] = clamp<u8>(a.byte[i], 0, (1 << (imm + 1)) - 1);
}

Tested on real machine.

Latency and Throughput

CPU	Latency	Throughput (CPI)
3A6000	2	2
3C5000	2	2

__m256i __lasx_xvsat_h (__m256i a, imm0_15 imm)

Synopsis

__m256i __lasx_xvsat_h (__m256i a, imm0_15 imm)
#include <lasxintrin.h>
Instruction: xvsat.h xr, xr, imm
CPU Flags: LASX

Description

Clamp signed 16-bit elements in a to the range [-2^imm, 2^imm - 1] specified by imm.

Operation

for (int i = 0; i < 16; i++) {
  dst.half[i] = clamp<s16>(a.half[i], -(1 << imm), (1 << imm) - 1);
}

Tested on real machine.

Latency and Throughput

CPU	Latency	Throughput (CPI)
3A6000	2	2
3C5000	2	2

__m256i __lasx_xvsat_hu (__m256i a, imm0_15 imm)

Synopsis

__m256i __lasx_xvsat_hu (__m256i a, imm0_15 imm)
#include <lasxintrin.h>
Instruction: xvsat.hu xr, xr, imm
CPU Flags: LASX

Description

Clamp unsigned 16-bit elements in a to the range [0, 2^(imm+1) - 1] specified by imm.

Operation

for (int i = 0; i < 16; i++) {
  dst.half[i] = clamp<u16>(a.half[i], 0, (1 << (imm + 1)) - 1);
}

Tested on real machine.

Latency and Throughput

CPU	Latency	Throughput (CPI)
3A6000	2	2
3C5000	2	2

__m256i __lasx_xvsat_w (__m256i a, imm0_31 imm)

Synopsis

__m256i __lasx_xvsat_w (__m256i a, imm0_31 imm)
#include <lasxintrin.h>
Instruction: xvsat.w xr, xr, imm
CPU Flags: LASX

Description

Clamp signed 32-bit elements in a to the range [-2^imm, 2^imm - 1] specified by imm.

Operation

for (int i = 0; i < 8; i++) {
  dst.word[i] = clamp<s32>(a.word[i], -(1 << imm), (1 << imm) - 1);
}

Tested on real machine.

Latency and Throughput

CPU	Latency	Throughput (CPI)
3A6000	2	2
3C5000	2	2

__m256i __lasx_xvsat_wu (__m256i a, imm0_31 imm)

Synopsis

__m256i __lasx_xvsat_wu (__m256i a, imm0_31 imm)
#include <lasxintrin.h>
Instruction: xvsat.wu xr, xr, imm
CPU Flags: LASX

Description

Clamp unsigned 32-bit elements in a to the range [0, 2^(imm+1) - 1] specified by imm.

Operation

for (int i = 0; i < 8; i++) {
  dst.word[i] = clamp<u32>(a.word[i], 0, (1 << (imm + 1)) - 1);
}

Tested on real machine.

Latency and Throughput

CPU	Latency	Throughput (CPI)
3A6000	2	2
3C5000	2	2

__m256i __lasx_xvsat_d (__m256i a, imm0_63 imm)

Synopsis

__m256i __lasx_xvsat_d (__m256i a, imm0_63 imm)
#include <lasxintrin.h>
Instruction: xvsat.d xr, xr, imm
CPU Flags: LASX

Description

Clamp signed 64-bit elements in a to the range [-2^imm, 2^imm - 1] specified by imm.

Operation

for (int i = 0; i < 4; i++) {
  dst.dword[i] = clamp<s64>(a.dword[i], -(1 << imm), (1 << imm) - 1);
}

Tested on real machine.

Latency and Throughput

CPU	Latency	Throughput (CPI)
3A6000	2	2
3C5000	2	2

__m256i __lasx_xvsat_du (__m256i a, imm0_63 imm)

Synopsis

__m256i __lasx_xvsat_du (__m256i a, imm0_63 imm)
#include <lasxintrin.h>
Instruction: xvsat.du xr, xr, imm
CPU Flags: LASX

Description

Clamp unsigned 64-bit elements in a to the range [0, 2^(imm+1) - 1] specified by imm.

Operation

for (int i = 0; i < 4; i++) {
  dst.dword[i] = clamp<u64>(a.dword[i], 0, (1 << (imm + 1)) - 1);
}

Tested on real machine.

Latency and Throughput

CPU	Latency	Throughput (CPI)
3A6000	2	2
3C5000	2	2

__m256i __lasx_xvsigncov_b (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvsigncov_b (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvsigncov.b xr, xr, xr
CPU Flags: LASX

Description

If the 8-bit element in a equals zero, set the result to zero. If the signed 8-bit element in a is positive, copy the element in b to the result. Otherwise, copy the negated element in b to the result. If a and b are the same vector, this is equivalent to computing the absolute value.

Operation

for (int i = 0; i < 32; i++) {
  dst.byte[i] =
      (a.byte[i] == 0) ? 0 : ((s8)a.byte[i] > 0 ? b.byte[i] : -b.byte[i]);
}

Tested on real machine.

Latency and Throughput

CPU	Latency	Throughput (CPI)
3A6000	1	2
3C5000	1	2

__m256i __lasx_xvsigncov_h (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvsigncov_h (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvsigncov.h xr, xr, xr
CPU Flags: LASX

Description

If the 16-bit element in a equals zero, set the result to zero. If the signed 16-bit element in a is positive, copy the element in b to the result. Otherwise, copy the negated element in b to the result. If a and b are the same vector, this is equivalent to computing the absolute value.

Operation

for (int i = 0; i < 16; i++) {
  dst.half[i] =
      (a.half[i] == 0) ? 0 : ((s16)a.half[i] > 0 ? b.half[i] : -b.half[i]);
}

Tested on real machine.

Latency and Throughput

CPU	Latency	Throughput (CPI)
3A6000	1	2
3C5000	1	2

__m256i __lasx_xvsigncov_w (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvsigncov_w (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvsigncov.w xr, xr, xr
CPU Flags: LASX

Description

If the 32-bit element in a equals zero, set the result to zero. If the signed 32-bit element in a is positive, copy the element in b to the result. Otherwise, copy the negated element in b to the result. If a and b are the same vector, this is equivalent to computing the absolute value.

Operation

for (int i = 0; i < 8; i++) {
  dst.word[i] =
      (a.word[i] == 0) ? 0 : ((s32)a.word[i] > 0 ? b.word[i] : -b.word[i]);
}

Tested on real machine.

Latency and Throughput

CPU	Latency	Throughput (CPI)
3A6000	1	2
3C5000	1	2

__m256i __lasx_xvsigncov_d (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvsigncov_d (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvsigncov.d xr, xr, xr
CPU Flags: LASX

Description

If the 64-bit element in a equals zero, set the result to zero. If the signed 64-bit element in a is positive, copy the element in b to the result. Otherwise, copy the negated element in b to the result. If a and b are the same vector, this is equivalent to computing the absolute value.

Operation

for (int i = 0; i < 4; i++) {
  dst.dword[i] =
      (a.dword[i] == 0) ? 0 : ((s64)a.dword[i] > 0 ? b.dword[i] : -b.dword[i]);
}

Tested on real machine.

Latency and Throughput

CPU	Latency	Throughput (CPI)
3A6000	1	2
3C5000	1	2

__m256i __lasx_xvldi (imm_n1024_1023 imm)

Synopsis

__m256i __lasx_xvldi (imm_n1024_1023 imm)
#include <lasxintrin.h>
Instruction: xvldi xr, imm
CPU Flags: LASX

Description

Initialize dst using predefined patterns:

imm[12:10]=0b000: broadcast imm[7:0] as 8-bit elements to all lanes
imm[12:10]=0b001: broadcast sign-extended imm[9:0] as 16-bit elements to all lanes
imm[12:10]=0b010: broadcast sign-extended imm[9:0] as 32-bit elements to all lanes
imm[12:10]=0b011: broadcast sign-extended imm[9:0] as 64-bit elements to all lanes
imm[12:8]=0b10000: broadcast imm[7:0] as 32-bit elements to all lanes
imm[12:8]=0b10001: broadcast imm[7:0] << 8 as 32-bit elements to all lanes
imm[12:8]=0b10010: broadcast imm[7:0] << 16 as 32-bit elements to all lanes
imm[12:8]=0b10011: broadcast imm[7:0] << 24 as 32-bit elements to all lanes
imm[12:8]=0b10100: broadcast imm[7:0] as 16-bit elements to all lanes
imm[12:8]=0b10101: broadcast imm[7:0] << 8 as 16-bit elements to all lanes
imm[12:8]=0b10110: broadcast (imm[7:0] << 8) | 0xFF as 32-bit elements to all lanes
imm[12:8]=0b10111: broadcast (imm[7:0] << 16) | 0xFFFF as 32-bit elements to all lanes
imm[12:8]=0b11000: broadcast imm[7:0] as 8-bit elements to all lanes
imm[12:8]=0b11001: repeat each bit of imm[7:0] eight times, and broadcast the result as 64-bit elements to all lanes
imm[12:8]=0b11010: broadcast (imm[7] << 31) | ((1-imm[6]) << 30) | ((imm[6] * 0x1F) << 25) | (imm[5:0] << 19) as 32-bit elements to all lanes
imm[12:8]=0b11011: broadcast (imm[7] << 31) | ((1-imm[6]) << 30) | ((imm[6] * 0x1F) << 25) | (imm[5:0] << 19) as 64-bit elements to all lanes
imm[12:8]=0b11100: broadcast (imm[7] << 63) | ((1-imm[6]) << 62) | ((imm[6] * 0xFF) << 54) | (imm[5:0] << 48) as 64-bit elements to all lanes

Operation

u64 imm12_10 = (imm >> 10) & 0b111;
u64 imm12_8 = (imm >> 8) & 0b11111;
u64 imm9_0 = imm & 0x3FF;
s64 simm9_0 = ((s64)imm9_0 << 54) >> 54;
u64 imm7_0 = imm & 0xFF;
u64 imm7 = (imm >> 7) & 0x1;
u64 imm6 = (imm >> 6) & 0x1;
u64 imm5 = (imm >> 5) & 0x1;
u64 imm5_0 = imm & 0x3F;
u64 imm4 = (imm >> 4) & 0x1;
u64 imm3 = (imm >> 3) & 0x1;
u64 imm2 = (imm >> 2) & 0x1;
u64 imm1 = (imm >> 1) & 0x1;
u64 imm0 = imm & 0x1;

u64 broadcast_value;
u64 broadcast_width;
if (imm12_10 == 0b000) {
  broadcast_value = imm7_0;
  broadcast_width = 8;
} else if (imm12_10 == 0b001) {
  broadcast_value = simm9_0;
  broadcast_width = 16;
} else if (imm12_10 == 0b010) {
  broadcast_value = simm9_0;
  broadcast_width = 32;
} else if (imm12_10 == 0b011) {
  broadcast_value = simm9_0;
  broadcast_width = 64;
} else if (imm12_8 == 0b10000) {
  broadcast_value = imm7_0;
  broadcast_width = 32;
} else if (imm12_8 == 0b10001) {
  broadcast_value = imm7_0 << 8;
  broadcast_width = 32;
} else if (imm12_8 == 0b10010) {
  broadcast_value = imm7_0 << 16;
  broadcast_width = 32;
} else if (imm12_8 == 0b10011) {
  broadcast_value = imm7_0 << 24;
  broadcast_width = 32;
} else if (imm12_8 == 0b10100) {
  broadcast_value = imm7_0;
  broadcast_width = 16;
} else if (imm12_8 == 0b10101) {
  broadcast_value = imm7_0 << 8;
  broadcast_width = 16;
} else if (imm12_8 == 0b10110) {
  broadcast_value = (imm7_0 << 8) | 0xFF;
  broadcast_width = 32;
} else if (imm12_8 == 0b10111) {
  broadcast_value = (imm7_0 << 16) | 0xFFFF;
  broadcast_width = 32;
} else if (imm12_8 == 0b11000) {
  broadcast_value = imm7_0;
  broadcast_width = 8;
} else if (imm12_8 == 0b11001) {
  broadcast_value = imm0 * 0xFF + imm1 * 0xFF00 + imm2 * 0xFF0000 +
                    imm3 * 0xFF000000 + imm4 * 0xFF00000000 +
                    imm5 * 0xFF0000000000 + imm6 * 0xFF000000000000 +
                    imm7 * 0xFF00000000000000;
  broadcast_width = 64;
} else if (imm12_8 == 0b11010) {
  broadcast_value = (imm7 << 31) | ((1 - imm6) << 30) | ((imm6 * 0x1F) << 25) |
                    (imm5_0 << 19);
  broadcast_width = 32;
} else if (imm12_8 == 0b11011) {
  broadcast_value = (imm7 << 31) | ((1 - imm6) << 30) | ((imm6 * 0x1F) << 25) |
                    (imm5_0 << 19);
  broadcast_width = 64;
} else if (imm12_8 == 0b11100) {
  broadcast_value = (imm7 << 63) | ((1 - imm6) << 62) | ((imm6 * 0xFF) << 54) |
                    (imm5_0 << 48);
  broadcast_width = 64;
}

if (broadcast_width == 8) {
  for (int i = 0; i < 32; i++) {
    dst.byte[i] = broadcast_value;
  }
} else if (broadcast_width == 16) {
  for (int i = 0; i < 16; i++) {
    dst.half[i] = broadcast_value;
  }
} else if (broadcast_width == 32) {
  for (int i = 0; i < 8; i++) {
    dst.word[i] = broadcast_value;
  }
} else if (broadcast_width == 64) {
  for (int i = 0; i < 4; i++) {
    dst.dword[i] = broadcast_value;
  }
}