Integer Computation

__m256i __lasx_xvadd_b (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvadd_b (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvadd.b xr, xr, xr
CPU Flags: LASX

Description

Add 8-bit elements in a and b, save the result in dst.

Operation

for (int i = 0; i < 32; i++) {
  dst.byte[i] = a.byte[i] + b.byte[i];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 1 4
3C5000 1 2

__m256i __lasx_xvadd_h (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvadd_h (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvadd.h xr, xr, xr
CPU Flags: LASX

Description

Add 16-bit elements in a and b, save the result in dst.

Operation

for (int i = 0; i < 16; i++) {
  dst.half[i] = a.half[i] + b.half[i];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 1 4
3C5000 1 2

__m256i __lasx_xvadd_w (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvadd_w (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvadd.w xr, xr, xr
CPU Flags: LASX

Description

Add 32-bit elements in a and b, save the result in dst.

Operation

for (int i = 0; i < 8; i++) {
  dst.word[i] = a.word[i] + b.word[i];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 1 4
3C5000 1 2

__m256i __lasx_xvadd_d (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvadd_d (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvadd.d xr, xr, xr
CPU Flags: LASX

Description

Add 64-bit elements in a and b, save the result in dst.

Operation

for (int i = 0; i < 4; i++) {
  dst.dword[i] = a.dword[i] + b.dword[i];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 1 4
3C5000 1 2

__m256i __lasx_xvadd_q (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvadd_q (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvadd.q xr, xr, xr
CPU Flags: LASX

Description

Add 128-bit elements in a and b, save the result in dst.

Operation

for (int i = 0; i < 2; i++) {
  dst.qword[i] = a.qword[i] + b.qword[i];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 3 2
3C5000 3 2

__m256i __lasx_xvabsd_b (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvabsd_b (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvabsd.b xr, xr, xr
CPU Flags: LASX

Description

Compute absolute difference of signed 8-bit elements in a and b, save the result in dst.

Operation

for (int i = 0; i < 32; i++) {
  dst.byte[i] = ((s8)a.byte[i] > (s8)b.byte[i]) ? (a.byte[i] - b.byte[i])
                                                : (b.byte[i] - a.byte[i]);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 2 2
3C5000 2 2

__m256i __lasx_xvabsd_bu (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvabsd_bu (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvabsd.bu xr, xr, xr
CPU Flags: LASX

Description

Compute absolute difference of unsigned 8-bit elements in a and b, save the result in dst.

Operation

for (int i = 0; i < 32; i++) {
  dst.byte[i] = ((u8)a.byte[i] > (u8)b.byte[i]) ? (a.byte[i] - b.byte[i])
                                                : (b.byte[i] - a.byte[i]);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 2 2
3C5000 2 2

__m256i __lasx_xvabsd_h (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvabsd_h (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvabsd.h xr, xr, xr
CPU Flags: LASX

Description

Compute absolute difference of signed 16-bit elements in a and b, save the result in dst.

Operation

for (int i = 0; i < 16; i++) {
  dst.half[i] = ((s16)a.half[i] > (s16)b.half[i]) ? (a.half[i] - b.half[i])
                                                  : (b.half[i] - a.half[i]);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 2 2
3C5000 2 2

__m256i __lasx_xvabsd_hu (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvabsd_hu (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvabsd.hu xr, xr, xr
CPU Flags: LASX

Description

Compute absolute difference of unsigned 16-bit elements in a and b, save the result in dst.

Operation

for (int i = 0; i < 16; i++) {
  dst.half[i] = ((u16)a.half[i] > (u16)b.half[i]) ? (a.half[i] - b.half[i])
                                                  : (b.half[i] - a.half[i]);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 2 2
3C5000 2 2

__m256i __lasx_xvabsd_w (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvabsd_w (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvabsd.w xr, xr, xr
CPU Flags: LASX

Description

Compute absolute difference of signed 32-bit elements in a and b, save the result in dst.

Operation

for (int i = 0; i < 8; i++) {
  dst.word[i] = ((s32)a.word[i] > (s32)b.word[i]) ? (a.word[i] - b.word[i])
                                                  : (b.word[i] - a.word[i]);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 2 2
3C5000 2 2

__m256i __lasx_xvabsd_wu (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvabsd_wu (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvabsd.wu xr, xr, xr
CPU Flags: LASX

Description

Compute absolute difference of unsigned 32-bit elements in a and b, save the result in dst.

Operation

for (int i = 0; i < 8; i++) {
  dst.word[i] = ((u32)a.word[i] > (u32)b.word[i]) ? (a.word[i] - b.word[i])
                                                  : (b.word[i] - a.word[i]);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 2 2
3C5000 2 2

__m256i __lasx_xvabsd_d (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvabsd_d (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvabsd.d xr, xr, xr
CPU Flags: LASX

Description

Compute absolute difference of signed 64-bit elements in a and b, save the result in dst.

Operation

for (int i = 0; i < 4; i++) {
  dst.dword[i] = ((s64)a.dword[i] > (s64)b.dword[i])
                     ? (a.dword[i] - b.dword[i])
                     : (b.dword[i] - a.dword[i]);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 2 2
3C5000 2 2

__m256i __lasx_xvabsd_du (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvabsd_du (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvabsd.du xr, xr, xr
CPU Flags: LASX

Description

Compute absolute difference of unsigned 64-bit elements in a and b, save the result in dst.

Operation

for (int i = 0; i < 4; i++) {
  dst.dword[i] = ((u64)a.dword[i] > (u64)b.dword[i])
                     ? (a.dword[i] - b.dword[i])
                     : (b.dword[i] - a.dword[i]);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 2 2
3C5000 2 2

__m256i __lasx_xvadda_b (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvadda_b (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvadda.b xr, xr, xr
CPU Flags: LASX

Description

Add absolute values of signed 8-bit elements in a and b, save the result in dst.

Operation

for (int i = 0; i < 32; i++) {
  dst.byte[i] = abs((s8)a.byte[i]) + abs((s8)b.byte[i]);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 3 2
3C5000 3 2

__m256i __lasx_xvadda_h (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvadda_h (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvadda.h xr, xr, xr
CPU Flags: LASX

Description

Add absolute values of signed 16-bit elements in a and b, save the result in dst.

Operation

for (int i = 0; i < 16; i++) {
  dst.half[i] = abs((s16)a.half[i]) + abs((s16)b.half[i]);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 3 2
3C5000 3 2

__m256i __lasx_xvadda_w (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvadda_w (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvadda.w xr, xr, xr
CPU Flags: LASX

Description

Add absolute values of signed 32-bit elements in a and b, save the result in dst.

Operation

for (int i = 0; i < 8; i++) {
  dst.word[i] = abs((s32)a.word[i]) + abs((s32)b.word[i]);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 3 2
3C5000 3 2

__m256i __lasx_xvadda_d (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvadda_d (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvadda.d xr, xr, xr
CPU Flags: LASX

Description

Add absolute values of signed 64-bit elements in a and b, save the result in dst.

Operation

for (int i = 0; i < 4; i++) {
  dst.dword[i] = abs((s64)a.dword[i]) + abs((s64)b.dword[i]);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 3 2
3C5000 3 2

__m256i __lasx_xvaddi_bu (__m256i a, imm0_31 imm)

Synopsis

__m256i __lasx_xvaddi_bu (__m256i a, imm0_31 imm)
#include <lasxintrin.h>
Instruction: xvaddi.bu xr, xr, imm
CPU Flags: LASX

Description

Add 8-bit elements in a and imm, save the result in dst.

Operation

for (int i = 0; i < 32; i++) {
  dst.byte[i] = a.byte[i] + imm;
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 1 4
3C5000 1 2

__m256i __lasx_xvaddi_hu (__m256i a, imm0_31 imm)

Synopsis

__m256i __lasx_xvaddi_hu (__m256i a, imm0_31 imm)
#include <lasxintrin.h>
Instruction: xvaddi.hu xr, xr, imm
CPU Flags: LASX

Description

Add 16-bit elements in a and imm, save the result in dst.

Operation

for (int i = 0; i < 16; i++) {
  dst.half[i] = a.half[i] + imm;
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 1 4
3C5000 1 2

__m256i __lasx_xvaddi_wu (__m256i a, imm0_31 imm)

Synopsis

__m256i __lasx_xvaddi_wu (__m256i a, imm0_31 imm)
#include <lasxintrin.h>
Instruction: xvaddi.wu xr, xr, imm
CPU Flags: LASX

Description

Add 32-bit elements in a and imm, save the result in dst.

Operation

for (int i = 0; i < 8; i++) {
  dst.word[i] = a.word[i] + imm;
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 1 4
3C5000 1 2

__m256i __lasx_xvaddi_du (__m256i a, imm0_31 imm)

Synopsis

__m256i __lasx_xvaddi_du (__m256i a, imm0_31 imm)
#include <lasxintrin.h>
Instruction: xvaddi.du xr, xr, imm
CPU Flags: LASX

Description

Add 64-bit elements in a and imm, save the result in dst.

Operation

for (int i = 0; i < 4; i++) {
  dst.dword[i] = a.dword[i] + imm;
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 1 4
3C5000 1 2

__m256i __lasx_xvaddwev_h_b (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvaddwev_h_b (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvaddwev.h.b xr, xr, xr
CPU Flags: LASX

Description

Add even-positioned signed 8-bit elements in a and signed elements in b, save the 16-bit result in dst.

Operation

for (int i = 0; i < 16; i++) {
  dst.half[i] = (s16)(s8)a.byte[2 * i] + (s16)(s8)b.byte[2 * i];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 2 2
3C5000 2 2

__m256i __lasx_xvaddwev_h_bu (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvaddwev_h_bu (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvaddwev.h.bu xr, xr, xr
CPU Flags: LASX

Description

Add even-positioned unsigned 8-bit elements in a and unsigned elements in b, save the 16-bit result in dst.

Operation

for (int i = 0; i < 16; i++) {
  dst.half[i] = (u16)(u8)a.byte[2 * i] + (u16)(u8)b.byte[2 * i];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 2 2
3C5000 2 2

__m256i __lasx_xvaddwev_h_bu_b (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvaddwev_h_bu_b (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvaddwev.h.bu.b xr, xr, xr
CPU Flags: LASX

Description

Add even-positioned unsigned 8-bit elements in a and signed elements in b, save the 16-bit result in dst.

Operation

for (int i = 0; i < 16; i++) {
  dst.half[i] = (u16)(u8)a.byte[2 * i] + (s16)(s8)b.byte[2 * i];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 2 2
3C5000 2 2

__m256i __lasx_xvaddwev_w_h (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvaddwev_w_h (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvaddwev.w.h xr, xr, xr
CPU Flags: LASX

Description

Add even-positioned signed 16-bit elements in a and signed elements in b, save the 32-bit result in dst.

Operation

for (int i = 0; i < 8; i++) {
  dst.word[i] = (s32)(s16)a.half[2 * i] + (s32)(s16)b.half[2 * i];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 2 2
3C5000 2 2

__m256i __lasx_xvaddwev_w_hu (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvaddwev_w_hu (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvaddwev.w.hu xr, xr, xr
CPU Flags: LASX

Description

Add even-positioned unsigned 16-bit elements in a and unsigned elements in b, save the 32-bit result in dst.

Operation

for (int i = 0; i < 8; i++) {
  dst.word[i] = (u32)(u16)a.half[2 * i] + (u32)(u16)b.half[2 * i];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 2 2
3C5000 2 2

__m256i __lasx_xvaddwev_w_hu_h (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvaddwev_w_hu_h (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvaddwev.w.hu.h xr, xr, xr
CPU Flags: LASX

Description

Add even-positioned unsigned 16-bit elements in a and signed elements in b, save the 32-bit result in dst.

Operation

for (int i = 0; i < 8; i++) {
  dst.word[i] = (u32)(u16)a.half[2 * i] + (s32)(s16)b.half[2 * i];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 2 2
3C5000 2 2

__m256i __lasx_xvaddwev_d_w (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvaddwev_d_w (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvaddwev.d.w xr, xr, xr
CPU Flags: LASX

Description

Add even-positioned signed 32-bit elements in a and signed elements in b, save the 64-bit result in dst.

Operation

for (int i = 0; i < 4; i++) {
  dst.dword[i] = (s64)(s32)a.word[2 * i] + (s64)(s32)b.word[2 * i];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 2 2
3C5000 2 2

__m256i __lasx_xvaddwev_d_wu (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvaddwev_d_wu (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvaddwev.d.wu xr, xr, xr
CPU Flags: LASX

Description

Add even-positioned unsigned 32-bit elements in a and unsigned elements in b, save the 64-bit result in dst.

Operation

for (int i = 0; i < 4; i++) {
  dst.dword[i] = (u64)(u32)a.word[2 * i] + (u64)(u32)b.word[2 * i];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 2 2
3C5000 2 2

__m256i __lasx_xvaddwev_d_wu_w (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvaddwev_d_wu_w (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvaddwev.d.wu.w xr, xr, xr
CPU Flags: LASX

Description

Add even-positioned unsigned 32-bit elements in a and signed elements in b, save the 64-bit result in dst.

Operation

for (int i = 0; i < 4; i++) {
  dst.dword[i] = (u64)(u32)a.word[2 * i] + (s64)(s32)b.word[2 * i];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 2 2
3C5000 2 2

__m256i __lasx_xvaddwev_q_d (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvaddwev_q_d (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvaddwev.q.d xr, xr, xr
CPU Flags: LASX

Description

Add even-positioned signed 64-bit elements in a and signed elements in b, save the 128-bit result in dst.

Operation

for (int i = 0; i < 2; i++) {
  dst.qword[i] = (s128)(s64)a.dword[2 * i] + (s128)(s64)b.dword[2 * i];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 3 2
3C5000 3 2

__m256i __lasx_xvaddwev_q_du (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvaddwev_q_du (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvaddwev.q.du xr, xr, xr
CPU Flags: LASX

Description

Add even-positioned unsigned 64-bit elements in a and unsigned elements in b, save the 128-bit result in dst.

Operation

for (int i = 0; i < 2; i++) {
  dst.qword[i] = (u128)(u64)a.dword[2 * i] + (u128)(u64)b.dword[2 * i];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 3 2
3C5000 3 2

__m256i __lasx_xvaddwev_q_du_d (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvaddwev_q_du_d (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvaddwev.q.du.d xr, xr, xr
CPU Flags: LASX

Description

Add even-positioned unsigned 64-bit elements in a and signed elements in b, save the 128-bit result in dst.

Operation

for (int i = 0; i < 2; i++) {
  dst.qword[i] = (u128)(u64)a.dword[2 * i] + (s128)(s64)b.dword[2 * i];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 3 2
3C5000 3 2

__m256i __lasx_xvaddwod_h_b (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvaddwod_h_b (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvaddwod.h.b xr, xr, xr
CPU Flags: LASX

Description

Add odd-positioned signed 8-bit elements in a and signed elements in b, save the 16-bit result in dst.

Operation

for (int i = 0; i < 16; i++) {
  dst.half[i] = (s16)(s8)a.byte[2 * i + 1] + (s16)(s8)b.byte[2 * i + 1];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 2 2
3C5000 2 2

__m256i __lasx_xvaddwod_h_bu (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvaddwod_h_bu (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvaddwod.h.bu xr, xr, xr
CPU Flags: LASX

Description

Add odd-positioned unsigned 8-bit elements in a and unsigned elements in b, save the 16-bit result in dst.

Operation

for (int i = 0; i < 16; i++) {
  dst.half[i] = (u16)(u8)a.byte[2 * i + 1] + (u16)(u8)b.byte[2 * i + 1];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 2 2
3C5000 2 2

__m256i __lasx_xvaddwod_h_bu_b (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvaddwod_h_bu_b (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvaddwod.h.bu.b xr, xr, xr
CPU Flags: LASX

Description

Add odd-positioned unsigned 8-bit elements in a and signed elements in b, save the 16-bit result in dst.

Operation

for (int i = 0; i < 16; i++) {
  dst.half[i] = (u16)(u8)a.byte[2 * i + 1] + (s16)(s8)b.byte[2 * i + 1];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 2 2
3C5000 2 2

__m256i __lasx_xvaddwod_w_h (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvaddwod_w_h (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvaddwod.w.h xr, xr, xr
CPU Flags: LASX

Description

Add odd-positioned signed 16-bit elements in a and signed elements in b, save the 32-bit result in dst.

Operation

for (int i = 0; i < 8; i++) {
  dst.word[i] = (s32)(s16)a.half[2 * i + 1] + (s32)(s16)b.half[2 * i + 1];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 2 2
3C5000 2 2

__m256i __lasx_xvaddwod_w_hu (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvaddwod_w_hu (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvaddwod.w.hu xr, xr, xr
CPU Flags: LASX

Description

Add odd-positioned unsigned 16-bit elements in a and unsigned elements in b, save the 32-bit result in dst.

Operation

for (int i = 0; i < 8; i++) {
  dst.word[i] = (u32)(u16)a.half[2 * i + 1] + (u32)(u16)b.half[2 * i + 1];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 2 2
3C5000 2 2

__m256i __lasx_xvaddwod_w_hu_h (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvaddwod_w_hu_h (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvaddwod.w.hu.h xr, xr, xr
CPU Flags: LASX

Description

Add odd-positioned unsigned 16-bit elements in a and signed elements in b, save the 32-bit result in dst.

Operation

for (int i = 0; i < 8; i++) {
  dst.word[i] = (u32)(u16)a.half[2 * i + 1] + (s32)(s16)b.half[2 * i + 1];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 2 2
3C5000 2 2

__m256i __lasx_xvaddwod_d_w (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvaddwod_d_w (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvaddwod.d.w xr, xr, xr
CPU Flags: LASX

Description

Add odd-positioned signed 32-bit elements in a and signed elements in b, save the 64-bit result in dst.

Operation

for (int i = 0; i < 4; i++) {
  dst.dword[i] = (s64)(s32)a.word[2 * i + 1] + (s64)(s32)b.word[2 * i + 1];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 2 2
3C5000 2 2

__m256i __lasx_xvaddwod_d_wu (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvaddwod_d_wu (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvaddwod.d.wu xr, xr, xr
CPU Flags: LASX

Description

Add odd-positioned unsigned 32-bit elements in a and unsigned elements in b, save the 64-bit result in dst.

Operation

for (int i = 0; i < 4; i++) {
  dst.dword[i] = (u64)(u32)a.word[2 * i + 1] + (u64)(u32)b.word[2 * i + 1];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 2 2
3C5000 2 2

__m256i __lasx_xvaddwod_d_wu_w (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvaddwod_d_wu_w (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvaddwod.d.wu.w xr, xr, xr
CPU Flags: LASX

Description

Add odd-positioned unsigned 32-bit elements in a and signed elements in b, save the 64-bit result in dst.

Operation

for (int i = 0; i < 4; i++) {
  dst.dword[i] = (u64)(u32)a.word[2 * i + 1] + (s64)(s32)b.word[2 * i + 1];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 2 2
3C5000 2 2

__m256i __lasx_xvaddwod_q_d (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvaddwod_q_d (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvaddwod.q.d xr, xr, xr
CPU Flags: LASX

Description

Add odd-positioned signed 64-bit elements in a and signed elements in b, save the 128-bit result in dst.

Operation

for (int i = 0; i < 2; i++) {
  dst.qword[i] = (s128)(s64)a.dword[2 * i + 1] + (s128)(s64)b.dword[2 * i + 1];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 3 2
3C5000 3 2

__m256i __lasx_xvaddwod_q_du (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvaddwod_q_du (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvaddwod.q.du xr, xr, xr
CPU Flags: LASX

Description

Add odd-positioned unsigned 64-bit elements in a and unsigned elements in b, save the 128-bit result in dst.

Operation

for (int i = 0; i < 2; i++) {
  dst.qword[i] = (u128)(u64)a.dword[2 * i + 1] + (u128)(u64)b.dword[2 * i + 1];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 3 2
3C5000 3 2

__m256i __lasx_xvaddwod_q_du_d (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvaddwod_q_du_d (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvaddwod.q.du.d xr, xr, xr
CPU Flags: LASX

Description

Add odd-positioned unsigned 64-bit elements in a and signed elements in b, save the 128-bit result in dst.

Operation

for (int i = 0; i < 2; i++) {
  dst.qword[i] = (u128)(u64)a.dword[2 * i + 1] + (s128)(s64)b.dword[2 * i + 1];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 3 2
3C5000 3 2

__m256i __lasx_xvavg_b (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvavg_b (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvavg.b xr, xr, xr
CPU Flags: LASX

Description

Compute the average (rounded towards negative infinity) of signed 8-bit elements in a and b, save the result in dst.

Operation

for (int i = 0; i < 32; i++) {
  dst.byte[i] = ((s8)a.byte[i] >> 1) + ((s8)b.byte[i] >> 1) +
                ((a.byte[i] & b.byte[i]) & 1);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 1 4
3C5000 1 2

__m256i __lasx_xvavg_bu (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvavg_bu (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvavg.bu xr, xr, xr
CPU Flags: LASX

Description

Compute the average (rounded towards negative infinity) of unsigned 8-bit elements in a and b, save the result in dst.

Operation

for (int i = 0; i < 32; i++) {
  dst.byte[i] = ((u8)a.byte[i] >> 1) + ((u8)b.byte[i] >> 1) +
                ((a.byte[i] & b.byte[i]) & 1);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 1 4
3C5000 1 2

__m256i __lasx_xvavg_h (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvavg_h (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvavg.h xr, xr, xr
CPU Flags: LASX

Description

Compute the average (rounded towards negative infinity) of signed 16-bit elements in a and b, save the result in dst.

Operation

for (int i = 0; i < 16; i++) {
  dst.half[i] = ((s16)a.half[i] >> 1) + ((s16)b.half[i] >> 1) +
                ((a.half[i] & b.half[i]) & 1);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 1 4
3C5000 1 2

__m256i __lasx_xvavg_hu (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvavg_hu (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvavg.hu xr, xr, xr
CPU Flags: LASX

Description

Compute the average (rounded towards negative infinity) of unsigned 16-bit elements in a and b, save the result in dst.

Operation

for (int i = 0; i < 16; i++) {
  dst.half[i] = ((u16)a.half[i] >> 1) + ((u16)b.half[i] >> 1) +
                ((a.half[i] & b.half[i]) & 1);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 1 4
3C5000 1 2

__m256i __lasx_xvavg_w (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvavg_w (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvavg.w xr, xr, xr
CPU Flags: LASX

Description

Compute the average (rounded towards negative infinity) of signed 32-bit elements in a and b, save the result in dst.

Operation

for (int i = 0; i < 8; i++) {
  dst.word[i] = ((s32)a.word[i] >> 1) + ((s32)b.word[i] >> 1) +
                ((a.word[i] & b.word[i]) & 1);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 1 4
3C5000 1 2

__m256i __lasx_xvavg_wu (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvavg_wu (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvavg.wu xr, xr, xr
CPU Flags: LASX

Description

Compute the average (rounded towards negative infinity) of unsigned 32-bit elements in a and b, save the result in dst.

Operation

for (int i = 0; i < 8; i++) {
  dst.word[i] = ((u32)a.word[i] >> 1) + ((u32)b.word[i] >> 1) +
                ((a.word[i] & b.word[i]) & 1);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 1 4
3C5000 1 2

__m256i __lasx_xvavg_d (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvavg_d (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvavg.d xr, xr, xr
CPU Flags: LASX

Description

Compute the average (rounded towards negative infinity) of signed 64-bit elements in a and b, save the result in dst.

Operation

for (int i = 0; i < 4; i++) {
  dst.dword[i] = ((s64)a.dword[i] >> 1) + ((s64)b.dword[i] >> 1) +
                 ((a.dword[i] & b.dword[i]) & 1);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 2 4
3C5000 2 2

__m256i __lasx_xvavg_du (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvavg_du (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvavg.du xr, xr, xr
CPU Flags: LASX

Description

Compute the average (rounded towards negative infinity) of unsigned 64-bit elements in a and b, save the result in dst.

Operation

for (int i = 0; i < 4; i++) {
  dst.dword[i] = ((u64)a.dword[i] >> 1) + ((u64)b.dword[i] >> 1) +
                 ((a.dword[i] & b.dword[i]) & 1);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 2 4
3C5000 2 2

__m256i __lasx_xvavgr_b (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvavgr_b (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvavgr.b xr, xr, xr
CPU Flags: LASX

Description

Compute the average (rounded towards positive infinity) of signed 8-bit elements in a and b, save the result in dst.

Operation

for (int i = 0; i < 32; i++) {
  dst.byte[i] = ((s8)a.byte[i] >> 1) + ((s8)b.byte[i] >> 1) +
                ((a.byte[i] | b.byte[i]) & 1);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 1 4
3C5000 1 2

__m256i __lasx_xvavgr_bu (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvavgr_bu (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvavgr.bu xr, xr, xr
CPU Flags: LASX

Description

Compute the average (rounded towards positive infinity) of unsigned 8-bit elements in a and b, save the result in dst.

Operation

for (int i = 0; i < 32; i++) {
  dst.byte[i] = ((u8)a.byte[i] >> 1) + ((u8)b.byte[i] >> 1) +
                ((a.byte[i] | b.byte[i]) & 1);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 1 4
3C5000 1 2

__m256i __lasx_xvavgr_h (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvavgr_h (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvavgr.h xr, xr, xr
CPU Flags: LASX

Description

Compute the average (rounded towards positive infinity) of signed 16-bit elements in a and b, save the result in dst.

Operation

for (int i = 0; i < 16; i++) {
  dst.half[i] = ((s16)a.half[i] >> 1) + ((s16)b.half[i] >> 1) +
                ((a.half[i] | b.half[i]) & 1);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 1 4
3C5000 1 2

__m256i __lasx_xvavgr_hu (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvavgr_hu (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvavgr.hu xr, xr, xr
CPU Flags: LASX

Description

Compute the average (rounded towards positive infinity) of unsigned 16-bit elements in a and b, save the result in dst.

Operation

for (int i = 0; i < 16; i++) {
  dst.half[i] = ((u16)a.half[i] >> 1) + ((u16)b.half[i] >> 1) +
                ((a.half[i] | b.half[i]) & 1);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 1 4
3C5000 1 2

__m256i __lasx_xvavgr_w (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvavgr_w (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvavgr.w xr, xr, xr
CPU Flags: LASX

Description

Compute the average (rounded towards positive infinity) of signed 32-bit elements in a and b, save the result in dst.

Operation

for (int i = 0; i < 8; i++) {
  dst.word[i] = ((s32)a.word[i] >> 1) + ((s32)b.word[i] >> 1) +
                ((a.word[i] | b.word[i]) & 1);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 1 4
3C5000 1 2

__m256i __lasx_xvavgr_wu (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvavgr_wu (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvavgr.wu xr, xr, xr
CPU Flags: LASX

Description

Compute the average (rounded towards positive infinity) of unsigned 32-bit elements in a and b, save the result in dst.

Operation

for (int i = 0; i < 8; i++) {
  dst.word[i] = ((u32)a.word[i] >> 1) + ((u32)b.word[i] >> 1) +
                ((a.word[i] | b.word[i]) & 1);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 1 4
3C5000 1 2

__m256i __lasx_xvavgr_d (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvavgr_d (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvavgr.d xr, xr, xr
CPU Flags: LASX

Description

Compute the average (rounded towards positive infinity) of signed 64-bit elements in a and b, save the result in dst.

Operation

for (int i = 0; i < 4; i++) {
  dst.dword[i] = ((s64)a.dword[i] >> 1) + ((s64)b.dword[i] >> 1) +
                 ((a.dword[i] | b.dword[i]) & 1);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 2 4
3C5000 2 2

__m256i __lasx_xvavgr_du (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvavgr_du (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvavgr.du xr, xr, xr
CPU Flags: LASX

Description

Compute the average (rounded towards positive infinity) of unsigned 64-bit elements in a and b, save the result in dst.

Operation

for (int i = 0; i < 4; i++) {
  dst.dword[i] = ((u64)a.dword[i] >> 1) + ((u64)b.dword[i] >> 1) +
                 ((a.dword[i] | b.dword[i]) & 1);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 2 4
3C5000 2 2

__m256i __lasx_xvdiv_b (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvdiv_b (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvdiv.b xr, xr, xr
CPU Flags: LASX

Description

Divide signed 8-bit elements in a by elements in b.

Operation

for (int i = 0; i < 32; i++) {
  dst.byte[i] = (b.byte[i] == 0) ? 0 : ((s8)a.byte[i] / (s8)b.byte[i]);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 29, 32 0.06(1/15.5)
3C5000 32, 36 0.05(1/20.5)

__m256i __lasx_xvdiv_bu (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvdiv_bu (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvdiv.bu xr, xr, xr
CPU Flags: LASX

Description

Divide unsigned 8-bit elements in a by elements in b.

Operation

for (int i = 0; i < 32; i++) {
  dst.byte[i] = (b.byte[i] == 0) ? 0 : ((u8)a.byte[i] / (u8)b.byte[i]);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 29, 33 0.06(1/16.5)
3C5000 29, 36 0.05(1/20.5)

__m256i __lasx_xvdiv_h (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvdiv_h (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvdiv.h xr, xr, xr
CPU Flags: LASX

Description

Divide signed 16-bit elements in a by elements in b.

Operation

for (int i = 0; i < 16; i++) {
  dst.half[i] = (b.half[i] == 0) ? 0 : ((s16)a.half[i] / (s16)b.half[i]);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 17 0.12(1/8.5)
3C5000 21.5, 22 0.08(1/13)

__m256i __lasx_xvdiv_hu (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvdiv_hu (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvdiv.hu xr, xr, xr
CPU Flags: LASX

Description

Divide unsigned 16-bit elements in a by elements in b.

Operation

for (int i = 0; i < 16; i++) {
  dst.half[i] = (b.half[i] == 0) ? 0 : ((u16)a.half[i] / (u16)b.half[i]);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 17, 22 0.11(1/9)
3C5000 17, 21.5 0.07(1/15)

__m256i __lasx_xvdiv_w (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvdiv_w (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvdiv.w xr, xr, xr
CPU Flags: LASX

Description

Divide signed 32-bit elements in a by elements in b.

Operation

for (int i = 0; i < 8; i++) {
  dst.word[i] = (b.word[i] == 0) ? 0 : ((s32)a.word[i] / (s32)b.word[i]);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 11 0.18(1/5.5)
3C5000 11, 17.5 0.09(1/11.5)

__m256i __lasx_xvdiv_wu (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvdiv_wu (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvdiv.wu xr, xr, xr
CPU Flags: LASX

Description

Divide unsigned 32-bit elements in a by elements in b.

Operation

for (int i = 0; i < 8; i++) {
  dst.word[i] = (b.word[i] == 0) ? 0 : ((u32)a.word[i] / (u32)b.word[i]);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 11 0.18(1/5.5)
3C5000 11, 17.5 0.07(1/15)

__m256i __lasx_xvdiv_d (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvdiv_d (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvdiv.d xr, xr, xr
CPU Flags: LASX

Description

Divide signed 64-bit elements in a by elements in b.

Operation

for (int i = 0; i < 4; i++) {
  dst.dword[i] = (b.dword[i] == 0) ? 0 : ((s64)a.dword[i] / (s64)b.dword[i]);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 8 0.25(1/4)
3C5000 8, 18.5 0.11(1/9)

__m256i __lasx_xvdiv_du (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvdiv_du (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvdiv.du xr, xr, xr
CPU Flags: LASX

Description

Divide unsigned 64-bit elements in a by elements in b.

Operation

for (int i = 0; i < 4; i++) {
  dst.dword[i] = (b.dword[i] == 0) ? 0 : ((u64)a.dword[i] / (u64)b.dword[i]);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 8 0.25(1/4)
3C5000 8, 18.5 0.11(1/9)

__m256i __lasx_xvhaddw_h_b (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvhaddw_h_b (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvhaddw.h.b xr, xr, xr
CPU Flags: LASX

Description

Add odd-positioned signed 8-bit elements in a to even-positioned signed 8-bit elements in b to get 16-bit results, save the result in dst.

Operation

for (int i = 0; i < 16; i++) {
  dst.half[i] = (s16)(s8)a.byte[2 * i + 1] + (s16)(s8)b.byte[2 * i];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 2 2
3C5000 2 2

__m256i __lasx_xvhaddw_hu_bu (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvhaddw_hu_bu (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvhaddw.hu.bu xr, xr, xr
CPU Flags: LASX

Description

Add odd-positioned unsigned 8-bit elements in a to even-positioned unsigned 8-bit elements in b to get 16-bit results, save the result in dst.

Operation

for (int i = 0; i < 16; i++) {
  dst.half[i] = (u16)(u8)a.byte[2 * i + 1] + (u16)(u8)b.byte[2 * i];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 2 2
3C5000 2 2

__m256i __lasx_xvhaddw_w_h (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvhaddw_w_h (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvhaddw.w.h xr, xr, xr
CPU Flags: LASX

Description

Add odd-positioned signed 16-bit elements in a to even-positioned signed 16-bit elements in b to get 32-bit results, save the result in dst.

Operation

for (int i = 0; i < 8; i++) {
  dst.word[i] = (s32)(s16)a.half[2 * i + 1] + (s32)(s16)b.half[2 * i];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 2 2
3C5000 2 2

__m256i __lasx_xvhaddw_wu_hu (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvhaddw_wu_hu (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvhaddw.wu.hu xr, xr, xr
CPU Flags: LASX

Description

Add odd-positioned unsigned 16-bit elements in a to even-positioned unsigned 16-bit elements in b to get 32-bit results, save the result in dst.

Operation

for (int i = 0; i < 8; i++) {
  dst.word[i] = (u32)(u16)a.half[2 * i + 1] + (u32)(u16)b.half[2 * i];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 2 2
3C5000 2 2

__m256i __lasx_xvhaddw_d_w (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvhaddw_d_w (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvhaddw.d.w xr, xr, xr
CPU Flags: LASX

Description

Add odd-positioned signed 32-bit elements in a to even-positioned signed 32-bit elements in b to get 64-bit results, save the result in dst.

Operation

for (int i = 0; i < 4; i++) {
  dst.dword[i] = (s64)(s32)a.word[2 * i + 1] + (s64)(s32)b.word[2 * i];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 2 2
3C5000 2 2

__m256i __lasx_xvhaddw_du_wu (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvhaddw_du_wu (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvhaddw.du.wu xr, xr, xr
CPU Flags: LASX

Description

Add odd-positioned unsigned 32-bit elements in a to even-positioned unsigned 32-bit elements in b to get 64-bit results, save the result in dst.

Operation

for (int i = 0; i < 4; i++) {
  dst.dword[i] = (u64)(u32)a.word[2 * i + 1] + (u64)(u32)b.word[2 * i];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 2 2
3C5000 2 2

__m256i __lasx_xvhaddw_q_d (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvhaddw_q_d (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvhaddw.q.d xr, xr, xr
CPU Flags: LASX

Description

Add odd-positioned signed 64-bit elements in a to even-positioned signed 64-bit elements in b to get 128-bit results, save the result in dst.

Operation

for (int i = 0; i < 2; i++) {
  dst.qword[i] = (s128)(s64)a.dword[2 * i + 1] + (s128)(s64)b.dword[2 * i];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 3 2
3C5000 3 2

__m256i __lasx_xvhaddw_qu_du (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvhaddw_qu_du (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvhaddw.qu.du xr, xr, xr
CPU Flags: LASX

Description

Add odd-positioned unsigned 64-bit elements in a to even-positioned unsigned 64-bit elements in b to get 128-bit results, save the result in dst.

Operation

for (int i = 0; i < 2; i++) {
  dst.qword[i] = (u128)(u64)a.dword[2 * i + 1] + (u128)(u64)b.dword[2 * i];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 3 2
3C5000 3 2

__m256i __lasx_xvhsubw_h_b (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvhsubw_h_b (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvhsubw.h.b xr, xr, xr
CPU Flags: LASX

Description

Subtract even-positioned signed 8-bit elements in b from odd-positioned signed 8-bit elements in a to get 16-bit results, save the result in dst.

Operation

for (int i = 0; i < 16; i++) {
  dst.half[i] = (s16)(s8)a.byte[2 * i + 1] - (s16)(s8)b.byte[2 * i];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 2 2
3C5000 2 2

__m256i __lasx_xvhsubw_hu_bu (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvhsubw_hu_bu (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvhsubw.hu.bu xr, xr, xr
CPU Flags: LASX

Description

Subtract even-positioned unsigned 8-bit elements in b from odd-positioned unsigned 8-bit elements in a to get 16-bit results, save the result in dst.

Operation

for (int i = 0; i < 16; i++) {
  dst.half[i] = (u16)(u8)a.byte[2 * i + 1] - (u16)(u8)b.byte[2 * i];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 2 2
3C5000 2 2

__m256i __lasx_xvhsubw_w_h (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvhsubw_w_h (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvhsubw.w.h xr, xr, xr
CPU Flags: LASX

Description

Subtract even-positioned signed 16-bit elements in b from odd-positioned signed 16-bit elements in a to get 32-bit results, save the result in dst.

Operation

for (int i = 0; i < 8; i++) {
  dst.word[i] = (s32)(s16)a.half[2 * i + 1] - (s32)(s16)b.half[2 * i];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 2 2
3C5000 2 2

__m256i __lasx_xvhsubw_wu_hu (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvhsubw_wu_hu (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvhsubw.wu.hu xr, xr, xr
CPU Flags: LASX

Description

Subtract even-positioned unsigned 16-bit elements in b from odd-positioned unsigned 16-bit elements in a to get 32-bit results, save the result in dst.

Operation

for (int i = 0; i < 8; i++) {
  dst.word[i] = (u32)(u16)a.half[2 * i + 1] - (u32)(u16)b.half[2 * i];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 2 2
3C5000 2 2

__m256i __lasx_xvhsubw_d_w (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvhsubw_d_w (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvhsubw.d.w xr, xr, xr
CPU Flags: LASX

Description

Subtract even-positioned signed 32-bit elements in b from odd-positioned signed 32-bit elements in a to get 64-bit results, save the result in dst.

Operation

for (int i = 0; i < 4; i++) {
  dst.dword[i] = (s64)(s32)a.word[2 * i + 1] - (s64)(s32)b.word[2 * i];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 2 2
3C5000 2 2

__m256i __lasx_xvhsubw_du_wu (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvhsubw_du_wu (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvhsubw.du.wu xr, xr, xr
CPU Flags: LASX

Description

Subtract even-positioned unsigned 32-bit elements in b from odd-positioned unsigned 32-bit elements in a to get 64-bit results, save the result in dst.

Operation

for (int i = 0; i < 4; i++) {
  dst.dword[i] = (u64)(u32)a.word[2 * i + 1] - (u64)(u32)b.word[2 * i];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 2 2
3C5000 2 2

__m256i __lasx_xvhsubw_q_d (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvhsubw_q_d (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvhsubw.q.d xr, xr, xr
CPU Flags: LASX

Description

Subtract even-positioned signed 64-bit elements in b from odd-positioned signed 64-bit elements in a to get 128-bit results, save the result in dst.

Operation

for (int i = 0; i < 2; i++) {
  dst.qword[i] = (s128)(s64)a.dword[2 * i + 1] - (s128)(s64)b.dword[2 * i];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 3 2
3C5000 3 2

__m256i __lasx_xvhsubw_qu_du (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvhsubw_qu_du (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvhsubw.qu.du xr, xr, xr
CPU Flags: LASX

Description

Subtract even-positioned unsigned 64-bit elements in b from odd-positioned unsigned 64-bit elements in a to get 128-bit results, save the result in dst.

Operation

for (int i = 0; i < 2; i++) {
  dst.qword[i] = (u128)(u64)a.dword[2 * i + 1] - (u128)(u64)b.dword[2 * i];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 3 2
3C5000 3 2

__m256i __lasx_xvmadd_b (__m256i a, __m256i b, __m256i c)

Synopsis

__m256i __lasx_xvmadd_b (__m256i a, __m256i b, __m256i c)
#include <lasxintrin.h>
Instruction: xvmadd.b xr, xr, xr
CPU Flags: LASX

Description

Multiply 8-bit elements in b and c, add to elements in a, save the result in dst.

Operation

for (int i = 0; i < 32; i++) {
  dst.byte[i] = b.byte[i] * c.byte[i] + a.byte[i];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 4 2
3C5000 4 2

__m256i __lasx_xvmadd_h (__m256i a, __m256i b, __m256i c)

Synopsis

__m256i __lasx_xvmadd_h (__m256i a, __m256i b, __m256i c)
#include <lasxintrin.h>
Instruction: xvmadd.h xr, xr, xr
CPU Flags: LASX

Description

Multiply 16-bit elements in b and c, add to elements in a, save the result in dst.

Operation

for (int i = 0; i < 16; i++) {
  dst.half[i] = b.half[i] * c.half[i] + a.half[i];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 4 2
3C5000 4 2

__m256i __lasx_xvmadd_w (__m256i a, __m256i b, __m256i c)

Synopsis

__m256i __lasx_xvmadd_w (__m256i a, __m256i b, __m256i c)
#include <lasxintrin.h>
Instruction: xvmadd.w xr, xr, xr
CPU Flags: LASX

Description

Multiply 32-bit elements in b and c, add to elements in a, save the result in dst.

Operation

for (int i = 0; i < 8; i++) {
  dst.word[i] = b.word[i] * c.word[i] + a.word[i];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 4 2
3C5000 4 2

__m256i __lasx_xvmadd_d (__m256i a, __m256i b, __m256i c)

Synopsis

__m256i __lasx_xvmadd_d (__m256i a, __m256i b, __m256i c)
#include <lasxintrin.h>
Instruction: xvmadd.d xr, xr, xr
CPU Flags: LASX

Description

Multiply 64-bit elements in b and c, add to elements in a, save the result in dst.

Operation

for (int i = 0; i < 4; i++) {
  dst.dword[i] = b.dword[i] * c.dword[i] + a.dword[i];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 4 2
3C5000 4 2

__m256i __lasx_xvmaddwev_h_b (__m256i a, __m256i b, __m256i c)

Synopsis

__m256i __lasx_xvmaddwev_h_b (__m256i a, __m256i b, __m256i c)
#include <lasxintrin.h>
Instruction: xvmaddwev.h.b xr, xr, xr
CPU Flags: LASX

Description

Multiply even-positioned signed 8-bit elements in b and signed elements in c, add to 16-bit elements in a.

Operation

for (int i = 0; i < 16; i++) {
  dst.half[i] =
      (s16)(s8)b.byte[2 * i] * (s16)(s8)c.byte[2 * i] + (s16)a.half[i];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 4 2
3C5000 4 2

__m256i __lasx_xvmaddwev_h_bu (__m256i a, __m256i b, __m256i c)

Synopsis

__m256i __lasx_xvmaddwev_h_bu (__m256i a, __m256i b, __m256i c)
#include <lasxintrin.h>
Instruction: xvmaddwev.h.bu xr, xr, xr
CPU Flags: LASX

Description

Multiply even-positioned unsigned 8-bit elements in b and unsigned elements in c, add to 16-bit elements in a.

Operation

for (int i = 0; i < 16; i++) {
  dst.half[i] =
      (u16)(u8)b.byte[2 * i] * (u16)(u8)c.byte[2 * i] + (u16)a.half[i];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 4 2
3C5000 4 2

__m256i __lasx_xvmaddwev_h_bu_b (__m256i a, __m256i b, __m256i c)

Synopsis

__m256i __lasx_xvmaddwev_h_bu_b (__m256i a, __m256i b, __m256i c)
#include <lasxintrin.h>
Instruction: xvmaddwev.h.bu.b xr, xr, xr
CPU Flags: LASX

Description

Multiply even-positioned unsigned 8-bit elements in b and signed elements in c, add to 16-bit elements in a.

Operation

for (int i = 0; i < 16; i++) {
  dst.half[i] =
      (u16)(u8)b.byte[2 * i] * (s16)(s8)c.byte[2 * i] + (s16)a.half[i];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 4 2
3C5000 4 2

__m256i __lasx_xvmaddwev_w_h (__m256i a, __m256i b, __m256i c)

Synopsis

__m256i __lasx_xvmaddwev_w_h (__m256i a, __m256i b, __m256i c)
#include <lasxintrin.h>
Instruction: xvmaddwev.w.h xr, xr, xr
CPU Flags: LASX

Description

Multiply even-positioned signed 16-bit elements in b and signed elements in c, add to 32-bit elements in a.

Operation

for (int i = 0; i < 8; i++) {
  dst.word[i] =
      (s32)(s16)b.half[2 * i] * (s32)(s16)c.half[2 * i] + (s32)a.word[i];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 4 2
3C5000 4 2

__m256i __lasx_xvmaddwev_w_hu (__m256i a, __m256i b, __m256i c)

Synopsis

__m256i __lasx_xvmaddwev_w_hu (__m256i a, __m256i b, __m256i c)
#include <lasxintrin.h>
Instruction: xvmaddwev.w.hu xr, xr, xr
CPU Flags: LASX

Description

Multiply even-positioned unsigned 16-bit elements in b and unsigned elements in c, add to 32-bit elements in a.

Operation

for (int i = 0; i < 8; i++) {
  dst.word[i] =
      (u32)(u16)b.half[2 * i] * (u32)(u16)c.half[2 * i] + (u32)a.word[i];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 4 2
3C5000 4 2

__m256i __lasx_xvmaddwev_w_hu_h (__m256i a, __m256i b, __m256i c)

Synopsis

__m256i __lasx_xvmaddwev_w_hu_h (__m256i a, __m256i b, __m256i c)
#include <lasxintrin.h>
Instruction: xvmaddwev.w.hu.h xr, xr, xr
CPU Flags: LASX

Description

Multiply even-positioned unsigned 16-bit elements in b and signed elements in c, add to 32-bit elements in a.

Operation

for (int i = 0; i < 8; i++) {
  dst.word[i] =
      (u32)(u16)b.half[2 * i] * (s32)(s16)c.half[2 * i] + (s32)a.word[i];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 4 2
3C5000 4 2

__m256i __lasx_xvmaddwev_d_w (__m256i a, __m256i b, __m256i c)

Synopsis

__m256i __lasx_xvmaddwev_d_w (__m256i a, __m256i b, __m256i c)
#include <lasxintrin.h>
Instruction: xvmaddwev.d.w xr, xr, xr
CPU Flags: LASX

Description

Multiply even-positioned signed 32-bit elements in b and signed elements in c, add to 64-bit elements in a.

Operation

for (int i = 0; i < 4; i++) {
  dst.dword[i] =
      (s64)(s32)b.word[2 * i] * (s64)(s32)c.word[2 * i] + (s64)a.dword[i];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 4 2
3C5000 4 2

__m256i __lasx_xvmaddwev_d_wu (__m256i a, __m256i b, __m256i c)

Synopsis

__m256i __lasx_xvmaddwev_d_wu (__m256i a, __m256i b, __m256i c)
#include <lasxintrin.h>
Instruction: xvmaddwev.d.wu xr, xr, xr
CPU Flags: LASX

Description

Multiply even-positioned unsigned 32-bit elements in b and unsigned elements in c, add to 64-bit elements in a.

Operation

for (int i = 0; i < 4; i++) {
  dst.dword[i] =
      (u64)(u32)b.word[2 * i] * (u64)(u32)c.word[2 * i] + (u64)a.dword[i];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 4 2
3C5000 4 2

__m256i __lasx_xvmaddwev_d_wu_w (__m256i a, __m256i b, __m256i c)

Synopsis

__m256i __lasx_xvmaddwev_d_wu_w (__m256i a, __m256i b, __m256i c)
#include <lasxintrin.h>
Instruction: xvmaddwev.d.wu.w xr, xr, xr
CPU Flags: LASX

Description

Multiply even-positioned unsigned 32-bit elements in b and signed elements in c, add to 64-bit elements in a.

Operation

for (int i = 0; i < 4; i++) {
  dst.dword[i] =
      (u64)(u32)b.word[2 * i] * (s64)(s32)c.word[2 * i] + (s64)a.dword[i];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 4 2
3C5000 4 2

__m256i __lasx_xvmaddwev_q_d (__m256i a, __m256i b, __m256i c)

Synopsis

__m256i __lasx_xvmaddwev_q_d (__m256i a, __m256i b, __m256i c)
#include <lasxintrin.h>
Instruction: xvmaddwev.q.d xr, xr, xr
CPU Flags: LASX

Description

Multiply even-positioned signed 64-bit elements in b and signed elements in c, add to 128-bit elements in a.

Operation

for (int i = 0; i < 2; i++) {
  dst.qword[i] =
      (s128)(s64)b.dword[2 * i] * (s128)(s64)c.dword[2 * i] + (s128)a.qword[i];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 7 1.14
3C5000 7 1.14

__m256i __lasx_xvmaddwev_q_du (__m256i a, __m256i b, __m256i c)

Synopsis

__m256i __lasx_xvmaddwev_q_du (__m256i a, __m256i b, __m256i c)
#include <lasxintrin.h>
Instruction: xvmaddwev.q.du xr, xr, xr
CPU Flags: LASX

Description

Multiply even-positioned unsigned 64-bit elements in b and unsigned elements in c, add to 128-bit elements in a.

Operation

for (int i = 0; i < 2; i++) {
  dst.qword[i] =
      (u128)(u64)b.dword[2 * i] * (u128)(u64)c.dword[2 * i] + (u128)a.qword[i];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 7 1.14
3C5000 7 1.14

__m256i __lasx_xvmaddwev_q_du_d (__m256i a, __m256i b, __m256i c)

Synopsis

__m256i __lasx_xvmaddwev_q_du_d (__m256i a, __m256i b, __m256i c)
#include <lasxintrin.h>
Instruction: xvmaddwev.q.du.d xr, xr, xr
CPU Flags: LASX

Description

Multiply even-positioned unsigned 64-bit elements in b and signed elements in c, add to 128-bit elements in a.

Operation

for (int i = 0; i < 2; i++) {
  dst.qword[i] =
      (u128)(u64)b.dword[2 * i] * (s128)(s64)c.dword[2 * i] + (s128)a.qword[i];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 7 1.14
3C5000 7 1.14

__m256i __lasx_xvmaddwod_h_b (__m256i a, __m256i b, __m256i c)

Synopsis

__m256i __lasx_xvmaddwod_h_b (__m256i a, __m256i b, __m256i c)
#include <lasxintrin.h>
Instruction: xvmaddwod.h.b xr, xr, xr
CPU Flags: LASX

Description

Multiply odd-positioned signed 8-bit elements in b and signed elements in c, add to 16-bit elements in a.

Operation

for (int i = 0; i < 16; i++) {
  dst.half[i] =
      (s16)(s8)b.byte[2 * i + 1] * (s16)(s8)c.byte[2 * i + 1] + (s16)a.half[i];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 4 2
3C5000 4 2

__m256i __lasx_xvmaddwod_h_bu (__m256i a, __m256i b, __m256i c)

Synopsis

__m256i __lasx_xvmaddwod_h_bu (__m256i a, __m256i b, __m256i c)
#include <lasxintrin.h>
Instruction: xvmaddwod.h.bu xr, xr, xr
CPU Flags: LASX

Description

Multiply odd-positioned unsigned 8-bit elements in b and unsigned elements in c, add to 16-bit elements in a.

Operation

for (int i = 0; i < 16; i++) {
  dst.half[i] =
      (u16)(u8)b.byte[2 * i + 1] * (u16)(u8)c.byte[2 * i + 1] + (u16)a.half[i];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 4 2
3C5000 4 2

__m256i __lasx_xvmaddwod_h_bu_b (__m256i a, __m256i b, __m256i c)

Synopsis

__m256i __lasx_xvmaddwod_h_bu_b (__m256i a, __m256i b, __m256i c)
#include <lasxintrin.h>
Instruction: xvmaddwod.h.bu.b xr, xr, xr
CPU Flags: LASX

Description

Multiply odd-positioned unsigned 8-bit elements in b and signed elements in c, add to 16-bit elements in a.

Operation

for (int i = 0; i < 16; i++) {
  dst.half[i] =
      (u16)(u8)b.byte[2 * i + 1] * (s16)(s8)c.byte[2 * i + 1] + (s16)a.half[i];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 4 2
3C5000 4 2

__m256i __lasx_xvmaddwod_w_h (__m256i a, __m256i b, __m256i c)

Synopsis

__m256i __lasx_xvmaddwod_w_h (__m256i a, __m256i b, __m256i c)
#include <lasxintrin.h>
Instruction: xvmaddwod.w.h xr, xr, xr
CPU Flags: LASX

Description

Multiply odd-positioned signed 16-bit elements in b and signed elements in c, add to 32-bit elements in a.

Operation

for (int i = 0; i < 8; i++) {
  dst.word[i] = (s32)(s16)b.half[2 * i + 1] * (s32)(s16)c.half[2 * i + 1] +
                (s32)a.word[i];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 4 2
3C5000 4 2

__m256i __lasx_xvmaddwod_w_hu (__m256i a, __m256i b, __m256i c)

Synopsis

__m256i __lasx_xvmaddwod_w_hu (__m256i a, __m256i b, __m256i c)
#include <lasxintrin.h>
Instruction: xvmaddwod.w.hu xr, xr, xr
CPU Flags: LASX

Description

Multiply odd-positioned unsigned 16-bit elements in b and unsigned elements in c, add to 32-bit elements in a.

Operation

for (int i = 0; i < 8; i++) {
  dst.word[i] = (u32)(u16)b.half[2 * i + 1] * (u32)(u16)c.half[2 * i + 1] +
                (u32)a.word[i];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 4 2
3C5000 4 2

__m256i __lasx_xvmaddwod_w_hu_h (__m256i a, __m256i b, __m256i c)

Synopsis

__m256i __lasx_xvmaddwod_w_hu_h (__m256i a, __m256i b, __m256i c)
#include <lasxintrin.h>
Instruction: xvmaddwod.w.hu.h xr, xr, xr
CPU Flags: LASX

Description

Multiply odd-positioned unsigned 16-bit elements in b and signed elements in c, add to 32-bit elements in a.

Operation

for (int i = 0; i < 8; i++) {
  dst.word[i] = (u32)(u16)b.half[2 * i + 1] * (s32)(s16)c.half[2 * i + 1] +
                (s32)a.word[i];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 4 2
3C5000 4 2

__m256i __lasx_xvmaddwod_d_w (__m256i a, __m256i b, __m256i c)

Synopsis

__m256i __lasx_xvmaddwod_d_w (__m256i a, __m256i b, __m256i c)
#include <lasxintrin.h>
Instruction: xvmaddwod.d.w xr, xr, xr
CPU Flags: LASX

Description

Multiply odd-positioned signed 32-bit elements in b and signed elements in c, add to 64-bit elements in a.

Operation

for (int i = 0; i < 4; i++) {
  dst.dword[i] = (s64)(s32)b.word[2 * i + 1] * (s64)(s32)c.word[2 * i + 1] +
                 (s64)a.dword[i];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 4 2
3C5000 4 2

__m256i __lasx_xvmaddwod_d_wu (__m256i a, __m256i b, __m256i c)

Synopsis

__m256i __lasx_xvmaddwod_d_wu (__m256i a, __m256i b, __m256i c)
#include <lasxintrin.h>
Instruction: xvmaddwod.d.wu xr, xr, xr
CPU Flags: LASX

Description

Multiply odd-positioned unsigned 32-bit elements in b and unsigned elements in c, add to 64-bit elements in a.

Operation

for (int i = 0; i < 4; i++) {
  dst.dword[i] = (u64)(u32)b.word[2 * i + 1] * (u64)(u32)c.word[2 * i + 1] +
                 (u64)a.dword[i];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 4 2
3C5000 4 2

__m256i __lasx_xvmaddwod_d_wu_w (__m256i a, __m256i b, __m256i c)

Synopsis

__m256i __lasx_xvmaddwod_d_wu_w (__m256i a, __m256i b, __m256i c)
#include <lasxintrin.h>
Instruction: xvmaddwod.d.wu.w xr, xr, xr
CPU Flags: LASX

Description

Multiply odd-positioned unsigned 32-bit elements in b and signed elements in c, add to 64-bit elements in a.

Operation

for (int i = 0; i < 4; i++) {
  dst.dword[i] = (u64)(u32)b.word[2 * i + 1] * (s64)(s32)c.word[2 * i + 1] +
                 (s64)a.dword[i];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 4 2
3C5000 4 2

__m256i __lasx_xvmaddwod_q_d (__m256i a, __m256i b, __m256i c)

Synopsis

__m256i __lasx_xvmaddwod_q_d (__m256i a, __m256i b, __m256i c)
#include <lasxintrin.h>
Instruction: xvmaddwod.q.d xr, xr, xr
CPU Flags: LASX

Description

Multiply odd-positioned signed 64-bit elements in b and signed elements in c, add to 128-bit elements in a.

Operation

for (int i = 0; i < 2; i++) {
  dst.qword[i] = (s128)(s64)b.dword[2 * i + 1] * (s128)(s64)c.dword[2 * i + 1] +
                 (s128)a.qword[i];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 7 1.14
3C5000 7 1.14

__m256i __lasx_xvmaddwod_q_du (__m256i a, __m256i b, __m256i c)

Synopsis

__m256i __lasx_xvmaddwod_q_du (__m256i a, __m256i b, __m256i c)
#include <lasxintrin.h>
Instruction: xvmaddwod.q.du xr, xr, xr
CPU Flags: LASX

Description

Multiply odd-positioned unsigned 64-bit elements in b and unsigned elements in c, add to 128-bit elements in a.

Operation

for (int i = 0; i < 2; i++) {
  dst.qword[i] = (u128)(u64)b.dword[2 * i + 1] * (u128)(u64)c.dword[2 * i + 1] +
                 (u128)a.qword[i];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 7 1.14
3C5000 7 1.14

__m256i __lasx_xvmaddwod_q_du_d (__m256i a, __m256i b, __m256i c)

Synopsis

__m256i __lasx_xvmaddwod_q_du_d (__m256i a, __m256i b, __m256i c)
#include <lasxintrin.h>
Instruction: xvmaddwod.q.du.d xr, xr, xr
CPU Flags: LASX

Description

Multiply odd-positioned unsigned 64-bit elements in b and signed elements in c, add to 128-bit elements in a.

Operation

for (int i = 0; i < 2; i++) {
  dst.qword[i] = (u128)(u64)b.dword[2 * i + 1] * (s128)(s64)c.dword[2 * i + 1] +
                 (s128)a.qword[i];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 7 1.14
3C5000 7 1.14

__m256i __lasx_xvmax_b (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvmax_b (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvmax.b xr, xr, xr
CPU Flags: LASX

Description

Compute elementwise maximum for signed 8-bit elements in a and b.

Operation

for (int i = 0; i < 32; i++) {
  dst.byte[i] = max((s8)a.byte[i], (s8)b.byte[i]);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 1 4
3C5000 1 2

__m256i __lasx_xvmax_bu (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvmax_bu (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvmax.bu xr, xr, xr
CPU Flags: LASX

Description

Compute elementwise maximum for unsigned 8-bit elements in a and b.

Operation

for (int i = 0; i < 32; i++) {
  dst.byte[i] = max((u8)a.byte[i], (u8)b.byte[i]);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 1 4
3C5000 1 2

__m256i __lasx_xvmax_h (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvmax_h (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvmax.h xr, xr, xr
CPU Flags: LASX

Description

Compute elementwise maximum for signed 16-bit elements in a and b.

Operation

for (int i = 0; i < 16; i++) {
  dst.half[i] = max((s16)a.half[i], (s16)b.half[i]);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 1 4
3C5000 1 2

__m256i __lasx_xvmax_hu (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvmax_hu (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvmax.hu xr, xr, xr
CPU Flags: LASX

Description

Compute elementwise maximum for unsigned 16-bit elements in a and b.

Operation

for (int i = 0; i < 16; i++) {
  dst.half[i] = max((u16)a.half[i], (u16)b.half[i]);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 1 4
3C5000 1 2

__m256i __lasx_xvmax_w (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvmax_w (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvmax.w xr, xr, xr
CPU Flags: LASX

Description

Compute elementwise maximum for signed 32-bit elements in a and b.

Operation

for (int i = 0; i < 8; i++) {
  dst.word[i] = max((s32)a.word[i], (s32)b.word[i]);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 1 4
3C5000 1 2

__m256i __lasx_xvmax_wu (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvmax_wu (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvmax.wu xr, xr, xr
CPU Flags: LASX

Description

Compute elementwise maximum for unsigned 32-bit elements in a and b.

Operation

for (int i = 0; i < 8; i++) {
  dst.word[i] = max((u32)a.word[i], (u32)b.word[i]);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 1 4
3C5000 1 2

__m256i __lasx_xvmax_d (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvmax_d (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvmax.d xr, xr, xr
CPU Flags: LASX

Description

Compute elementwise maximum for signed 64-bit elements in a and b.

Operation

for (int i = 0; i < 4; i++) {
  dst.dword[i] = max((s64)a.dword[i], (s64)b.dword[i]);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 2 4
3C5000 2 2

__m256i __lasx_xvmax_du (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvmax_du (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvmax.du xr, xr, xr
CPU Flags: LASX

Description

Compute elementwise maximum for unsigned 64-bit elements in a and b.

Operation

for (int i = 0; i < 4; i++) {
  dst.dword[i] = max((u64)a.dword[i], (u64)b.dword[i]);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 2 4
3C5000 2 2

__m256i __lasx_xvmaxi_b (__m256i a, imm_n16_15 imm)

Synopsis

__m256i __lasx_xvmaxi_b (__m256i a, imm_n16_15 imm)
#include <lasxintrin.h>
Instruction: xvmaxi.b xr, xr, imm
CPU Flags: LASX

Description

Compute elementwise maximum for signed 8-bit elements in a and imm.

Operation

for (int i = 0; i < 32; i++) {
  dst.byte[i] = max((s8)a.byte[i], (s8)imm);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 1 4
3C5000 1 2

__m256i __lasx_xvmaxi_bu (__m256i a, imm0_31 imm)

Synopsis

__m256i __lasx_xvmaxi_bu (__m256i a, imm0_31 imm)
#include <lasxintrin.h>
Instruction: xvmaxi.bu xr, xr, imm
CPU Flags: LASX

Description

Compute elementwise maximum for unsigned 8-bit elements in a and imm.

Operation

for (int i = 0; i < 32; i++) {
  dst.byte[i] = max((u8)a.byte[i], (u8)imm);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 1 4
3C5000 1 2

__m256i __lasx_xvmaxi_h (__m256i a, imm_n16_15 imm)

Synopsis

__m256i __lasx_xvmaxi_h (__m256i a, imm_n16_15 imm)
#include <lasxintrin.h>
Instruction: xvmaxi.h xr, xr, imm
CPU Flags: LASX

Description

Compute elementwise maximum for signed 16-bit elements in a and imm.

Operation

for (int i = 0; i < 16; i++) {
  dst.half[i] = max((s16)a.half[i], (s16)imm);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 1 4
3C5000 1 2

__m256i __lasx_xvmaxi_hu (__m256i a, imm0_31 imm)

Synopsis

__m256i __lasx_xvmaxi_hu (__m256i a, imm0_31 imm)
#include <lasxintrin.h>
Instruction: xvmaxi.hu xr, xr, imm
CPU Flags: LASX

Description

Compute elementwise maximum for unsigned 16-bit elements in a and imm.

Operation

for (int i = 0; i < 16; i++) {
  dst.half[i] = max((u16)a.half[i], (u16)imm);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 1 4
3C5000 1 2

__m256i __lasx_xvmaxi_w (__m256i a, imm_n16_15 imm)

Synopsis

__m256i __lasx_xvmaxi_w (__m256i a, imm_n16_15 imm)
#include <lasxintrin.h>
Instruction: xvmaxi.w xr, xr, imm
CPU Flags: LASX

Description

Compute elementwise maximum for signed 32-bit elements in a and imm.

Operation

for (int i = 0; i < 8; i++) {
  dst.word[i] = max((s32)a.word[i], (s32)imm);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 1 4
3C5000 1 2

__m256i __lasx_xvmaxi_wu (__m256i a, imm0_31 imm)

Synopsis

__m256i __lasx_xvmaxi_wu (__m256i a, imm0_31 imm)
#include <lasxintrin.h>
Instruction: xvmaxi.wu xr, xr, imm
CPU Flags: LASX

Description

Compute elementwise maximum for unsigned 32-bit elements in a and imm.

Operation

for (int i = 0; i < 8; i++) {
  dst.word[i] = max((u32)a.word[i], (u32)imm);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 1 4
3C5000 1 2

__m256i __lasx_xvmaxi_d (__m256i a, imm_n16_15 imm)

Synopsis

__m256i __lasx_xvmaxi_d (__m256i a, imm_n16_15 imm)
#include <lasxintrin.h>
Instruction: xvmaxi.d xr, xr, imm
CPU Flags: LASX

Description

Compute elementwise maximum for signed 64-bit elements in a and imm.

Operation

for (int i = 0; i < 4; i++) {
  dst.dword[i] = max((s64)a.dword[i], (s64)imm);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 2 4
3C5000 2 2

__m256i __lasx_xvmaxi_du (__m256i a, imm0_31 imm)

Synopsis

__m256i __lasx_xvmaxi_du (__m256i a, imm0_31 imm)
#include <lasxintrin.h>
Instruction: xvmaxi.du xr, xr, imm
CPU Flags: LASX

Description

Compute elementwise maximum for unsigned 64-bit elements in a and imm.

Operation

for (int i = 0; i < 4; i++) {
  dst.dword[i] = max((u64)a.dword[i], (u64)imm);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 2 4
3C5000 2 2

__m256i __lasx_xvmin_b (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvmin_b (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvmin.b xr, xr, xr
CPU Flags: LASX

Description

Compute elementwise minimum for signed 8-bit elements in a and b.

Operation

for (int i = 0; i < 32; i++) {
  dst.byte[i] = min((s8)a.byte[i], (s8)b.byte[i]);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 1 4
3C5000 1 2

__m256i __lasx_xvmin_bu (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvmin_bu (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvmin.bu xr, xr, xr
CPU Flags: LASX

Description

Compute elementwise minimum for unsigned 8-bit elements in a and b.

Operation

for (int i = 0; i < 32; i++) {
  dst.byte[i] = min((u8)a.byte[i], (u8)b.byte[i]);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 1 4
3C5000 1 2

__m256i __lasx_xvmin_h (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvmin_h (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvmin.h xr, xr, xr
CPU Flags: LASX

Description

Compute elementwise minimum for signed 16-bit elements in a and b.

Operation

for (int i = 0; i < 16; i++) {
  dst.half[i] = min((s16)a.half[i], (s16)b.half[i]);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 1 4
3C5000 1 2

__m256i __lasx_xvmin_hu (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvmin_hu (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvmin.hu xr, xr, xr
CPU Flags: LASX

Description

Compute elementwise minimum for unsigned 16-bit elements in a and b.

Operation

for (int i = 0; i < 16; i++) {
  dst.half[i] = min((u16)a.half[i], (u16)b.half[i]);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 1 4
3C5000 1 2

__m256i __lasx_xvmin_w (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvmin_w (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvmin.w xr, xr, xr
CPU Flags: LASX

Description

Compute elementwise minimum for signed 32-bit elements in a and b.

Operation

for (int i = 0; i < 8; i++) {
  dst.word[i] = min((s32)a.word[i], (s32)b.word[i]);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 1 4
3C5000 1 2

__m256i __lasx_xvmin_wu (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvmin_wu (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvmin.wu xr, xr, xr
CPU Flags: LASX

Description

Compute elementwise minimum for unsigned 32-bit elements in a and b.

Operation

for (int i = 0; i < 8; i++) {
  dst.word[i] = min((u32)a.word[i], (u32)b.word[i]);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 1 4
3C5000 1 2

__m256i __lasx_xvmin_d (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvmin_d (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvmin.d xr, xr, xr
CPU Flags: LASX

Description

Compute elementwise minimum for signed 64-bit elements in a and b.

Operation

for (int i = 0; i < 4; i++) {
  dst.dword[i] = min((s64)a.dword[i], (s64)b.dword[i]);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 2 4
3C5000 2 2

__m256i __lasx_xvmin_du (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvmin_du (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvmin.du xr, xr, xr
CPU Flags: LASX

Description

Compute elementwise minimum for unsigned 64-bit elements in a and b.

Operation

for (int i = 0; i < 4; i++) {
  dst.dword[i] = min((u64)a.dword[i], (u64)b.dword[i]);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 2 4
3C5000 2 2

__m256i __lasx_xvmini_b (__m256i a, imm_n16_15 imm)

Synopsis

__m256i __lasx_xvmini_b (__m256i a, imm_n16_15 imm)
#include <lasxintrin.h>
Instruction: xvmini.b xr, xr, imm
CPU Flags: LASX

Description

Compute elementwise minimum for signed 8-bit elements in a and imm.

Operation

for (int i = 0; i < 32; i++) {
  dst.byte[i] = min((s8)a.byte[i], (s8)imm);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 1 4
3C5000 1 2

__m256i __lasx_xvmini_bu (__m256i a, imm0_31 imm)

Synopsis

__m256i __lasx_xvmini_bu (__m256i a, imm0_31 imm)
#include <lasxintrin.h>
Instruction: xvmini.bu xr, xr, imm
CPU Flags: LASX

Description

Compute elementwise minimum for unsigned 8-bit elements in a and imm.

Operation

for (int i = 0; i < 32; i++) {
  dst.byte[i] = min((u8)a.byte[i], (u8)imm);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 1 4
3C5000 1 2

__m256i __lasx_xvmini_h (__m256i a, imm_n16_15 imm)

Synopsis

__m256i __lasx_xvmini_h (__m256i a, imm_n16_15 imm)
#include <lasxintrin.h>
Instruction: xvmini.h xr, xr, imm
CPU Flags: LASX

Description

Compute elementwise minimum for signed 16-bit elements in a and imm.

Operation

for (int i = 0; i < 16; i++) {
  dst.half[i] = min((s16)a.half[i], (s16)imm);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 1 4
3C5000 1 2

__m256i __lasx_xvmini_hu (__m256i a, imm0_31 imm)

Synopsis

__m256i __lasx_xvmini_hu (__m256i a, imm0_31 imm)
#include <lasxintrin.h>
Instruction: xvmini.hu xr, xr, imm
CPU Flags: LASX

Description

Compute elementwise minimum for unsigned 16-bit elements in a and imm.

Operation

for (int i = 0; i < 16; i++) {
  dst.half[i] = min((u16)a.half[i], (u16)imm);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 1 4
3C5000 1 2

__m256i __lasx_xvmini_w (__m256i a, imm_n16_15 imm)

Synopsis

__m256i __lasx_xvmini_w (__m256i a, imm_n16_15 imm)
#include <lasxintrin.h>
Instruction: xvmini.w xr, xr, imm
CPU Flags: LASX

Description

Compute elementwise minimum for signed 32-bit elements in a and imm.

Operation

for (int i = 0; i < 8; i++) {
  dst.word[i] = min((s32)a.word[i], (s32)imm);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 1 4
3C5000 1 2

__m256i __lasx_xvmini_wu (__m256i a, imm0_31 imm)

Synopsis

__m256i __lasx_xvmini_wu (__m256i a, imm0_31 imm)
#include <lasxintrin.h>
Instruction: xvmini.wu xr, xr, imm
CPU Flags: LASX

Description

Compute elementwise minimum for unsigned 32-bit elements in a and imm.

Operation

for (int i = 0; i < 8; i++) {
  dst.word[i] = min((u32)a.word[i], (u32)imm);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 1 4
3C5000 1 2

__m256i __lasx_xvmini_d (__m256i a, imm_n16_15 imm)

Synopsis

__m256i __lasx_xvmini_d (__m256i a, imm_n16_15 imm)
#include <lasxintrin.h>
Instruction: xvmini.d xr, xr, imm
CPU Flags: LASX

Description

Compute elementwise minimum for signed 64-bit elements in a and imm.

Operation

for (int i = 0; i < 4; i++) {
  dst.dword[i] = min((s64)a.dword[i], (s64)imm);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 2 4
3C5000 2 2

__m256i __lasx_xvmini_du (__m256i a, imm0_31 imm)

Synopsis

__m256i __lasx_xvmini_du (__m256i a, imm0_31 imm)
#include <lasxintrin.h>
Instruction: xvmini.du xr, xr, imm
CPU Flags: LASX

Description

Compute elementwise minimum for unsigned 64-bit elements in a and imm.

Operation

for (int i = 0; i < 4; i++) {
  dst.dword[i] = min((u64)a.dword[i], (u64)imm);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 2 4
3C5000 2 2

__m256i __lasx_xvmod_b (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvmod_b (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvmod.b xr, xr, xr
CPU Flags: LASX

Description

Compute the remainder of signed 8-bit elements in a divided by elements in b, save the result in dst.

Operation

for (int i = 0; i < 32; i++) {
  dst.byte[i] = (b.byte[i] == 0) ? 0 : ((s8)a.byte[i] % (s8)b.byte[i]);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 29, 41 0.06(1/15.5)
3C5000 29, 33 0.05(1/21.5)

__m256i __lasx_xvmod_bu (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvmod_bu (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvmod.bu xr, xr, xr
CPU Flags: LASX

Description

Compute the remainder of unsigned 8-bit elements in a divided by elements in b, save the result in dst.

Operation

for (int i = 0; i < 32; i++) {
  dst.byte[i] = (b.byte[i] == 0) ? 0 : ((u8)a.byte[i] % (u8)b.byte[i]);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 29, 37 0.06(1/17.5)
3C5000 29, 37 0.05(1/22)

__m256i __lasx_xvmod_h (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvmod_h (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvmod.h xr, xr, xr
CPU Flags: LASX

Description

Compute the remainder of signed 16-bit elements in a divided by elements in b, save the result in dst.

Operation

for (int i = 0; i < 16; i++) {
  dst.half[i] = (b.half[i] == 0) ? 0 : ((s16)a.half[i] % (s16)b.half[i]);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 17, 21 0.12(1/8.5)
3C5000 17, 21 0.07(1/13.5)

__m256i __lasx_xvmod_hu (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvmod_hu (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvmod.hu xr, xr, xr
CPU Flags: LASX

Description

Compute the remainder of unsigned 16-bit elements in a divided by elements in b, save the result in dst.

Operation

for (int i = 0; i < 16; i++) {
  dst.half[i] = (b.half[i] == 0) ? 0 : ((u16)a.half[i] % (u16)b.half[i]);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 17, 25 0.11(1/9.5)
3C5000 17, 23 0.06(1/16)

__m256i __lasx_xvmod_w (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvmod_w (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvmod.w xr, xr, xr
CPU Flags: LASX

Description

Compute the remainder of signed 32-bit elements in a divided by elements in b, save the result in dst.

Operation

for (int i = 0; i < 8; i++) {
  dst.word[i] = (b.word[i] == 0) ? 0 : ((s32)a.word[i] % (s32)b.word[i]);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 11, 13 0.18(1/5.5)
3C5000 11, 15 0.07(1/13.5)

__m256i __lasx_xvmod_wu (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvmod_wu (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvmod.wu xr, xr, xr
CPU Flags: LASX

Description

Compute the remainder of unsigned 32-bit elements in a divided by elements in b, save the result in dst.

Operation

for (int i = 0; i < 8; i++) {
  dst.word[i] = (b.word[i] == 0) ? 0 : ((u32)a.word[i] % (u32)b.word[i]);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 11, 13 0.18(1/5.5)
3C5000 11, 15 0.06(1/16)

__m256i __lasx_xvmod_d (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvmod_d (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvmod.d xr, xr, xr
CPU Flags: LASX

Description

Compute the remainder of signed 64-bit elements in a divided by elements in b, save the result in dst.

Operation

for (int i = 0; i < 4; i++) {
  dst.dword[i] = (b.dword[i] == 0) ? 0 : ((s64)a.dword[i] % (s64)b.dword[i]);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 8, 10 0.25(1/4)
3C5000 8, 10 0.11(1/9.5)

__m256i __lasx_xvmod_du (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvmod_du (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvmod.du xr, xr, xr
CPU Flags: LASX

Description

Compute the remainder of unsigned 64-bit elements in a divided by elements in b, save the result in dst.

Operation

for (int i = 0; i < 4; i++) {
  dst.dword[i] = (b.dword[i] == 0) ? 0 : ((u64)a.dword[i] % (u64)b.dword[i]);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 8, 10 0.25(1/4)
3C5000 8, 10 0.11(1/9.5)

__m256i __lasx_xvmsub_b (__m256i a, __m256i b, __m256i c)

Synopsis

__m256i __lasx_xvmsub_b (__m256i a, __m256i b, __m256i c)
#include <lasxintrin.h>
Instruction: xvmsub.b xr, xr, xr
CPU Flags: LASX

Description

Multiply 8-bit elements in b and c, negate and add elements in a, save the result in dst.

Operation

for (int i = 0; i < 32; i++) {
  dst.byte[i] = -b.byte[i] * c.byte[i] + a.byte[i];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 4 2
3C5000 4 2

__m256i __lasx_xvmsub_h (__m256i a, __m256i b, __m256i c)

Synopsis

__m256i __lasx_xvmsub_h (__m256i a, __m256i b, __m256i c)
#include <lasxintrin.h>
Instruction: xvmsub.h xr, xr, xr
CPU Flags: LASX

Description

Multiply 16-bit elements in b and c, negate and add elements in a, save the result in dst.

Operation

for (int i = 0; i < 16; i++) {
  dst.half[i] = -b.half[i] * c.half[i] + a.half[i];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 4 2
3C5000 4 2

__m256i __lasx_xvmsub_w (__m256i a, __m256i b, __m256i c)

Synopsis

__m256i __lasx_xvmsub_w (__m256i a, __m256i b, __m256i c)
#include <lasxintrin.h>
Instruction: xvmsub.w xr, xr, xr
CPU Flags: LASX

Description

Multiply 32-bit elements in b and c, negate and add elements in a, save the result in dst.

Operation

for (int i = 0; i < 8; i++) {
  dst.word[i] = -b.word[i] * c.word[i] + a.word[i];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 4 2
3C5000 4 2

__m256i __lasx_xvmsub_d (__m256i a, __m256i b, __m256i c)

Synopsis

__m256i __lasx_xvmsub_d (__m256i a, __m256i b, __m256i c)
#include <lasxintrin.h>
Instruction: xvmsub.d xr, xr, xr
CPU Flags: LASX

Description

Multiply 64-bit elements in b and c, negate and add elements in a, save the result in dst.

Operation

for (int i = 0; i < 4; i++) {
  dst.dword[i] = -b.dword[i] * c.dword[i] + a.dword[i];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 4 2
3C5000 4 2

__m256i __lasx_xvmuh_b (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvmuh_b (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvmuh.b xr, xr, xr
CPU Flags: LASX

Description

Multiply signed 8-bit elements in a and b, save the high 8-bit result in dst.

Operation

for (int i = 0; i < 32; i++) {
  dst.byte[i] = (((s16)(s8)a.byte[i] * (s16)(s8)b.byte[i])) >> 8;
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 4 2
3C5000 4 2

__m256i __lasx_xvmuh_bu (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvmuh_bu (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvmuh.bu xr, xr, xr
CPU Flags: LASX

Description

Multiply unsigned 8-bit elements in a and b, save the high 8-bit result in dst.

Operation

for (int i = 0; i < 32; i++) {
  dst.byte[i] = (((u16)(u8)a.byte[i] * (u16)(u8)b.byte[i])) >> 8;
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 4 2
3C5000 4 2

__m256i __lasx_xvmuh_h (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvmuh_h (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvmuh.h xr, xr, xr
CPU Flags: LASX

Description

Multiply signed 16-bit elements in a and b, save the high 16-bit result in dst.

Operation

for (int i = 0; i < 16; i++) {
  dst.half[i] = (((s32)(s16)a.half[i] * (s32)(s16)b.half[i])) >> 16;
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 4 2
3C5000 4 2

__m256i __lasx_xvmuh_hu (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvmuh_hu (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvmuh.hu xr, xr, xr
CPU Flags: LASX

Description

Multiply unsigned 16-bit elements in a and b, save the high 16-bit result in dst.

Operation

for (int i = 0; i < 16; i++) {
  dst.half[i] = (((u32)(u16)a.half[i] * (u32)(u16)b.half[i])) >> 16;
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 4 2
3C5000 4 2

__m256i __lasx_xvmuh_w (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvmuh_w (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvmuh.w xr, xr, xr
CPU Flags: LASX

Description

Multiply signed 32-bit elements in a and b, save the high 32-bit result in dst.

Operation

for (int i = 0; i < 8; i++) {
  dst.word[i] = (((s64)(s32)a.word[i] * (s64)(s32)b.word[i])) >> 32;
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 4 2
3C5000 4 2

__m256i __lasx_xvmuh_wu (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvmuh_wu (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvmuh.wu xr, xr, xr
CPU Flags: LASX

Description

Multiply unsigned 32-bit elements in a and b, save the high 32-bit result in dst.

Operation

for (int i = 0; i < 8; i++) {
  dst.word[i] = (((u64)(u32)a.word[i] * (u64)(u32)b.word[i])) >> 32;
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 4 2
3C5000 4 2

__m256i __lasx_xvmuh_d (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvmuh_d (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvmuh.d xr, xr, xr
CPU Flags: LASX

Description

Multiply signed 64-bit elements in a and b, save the high 64-bit result in dst.

Operation

for (int i = 0; i < 4; i++) {
  dst.dword[i] = (((s128)(s64)a.dword[i] * (s128)(s64)b.dword[i])) >> 64;
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 4 2
3C5000 4 2

__m256i __lasx_xvmuh_du (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvmuh_du (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvmuh.du xr, xr, xr
CPU Flags: LASX

Description

Multiply unsigned 64-bit elements in a and b, save the high 64-bit result in dst.

Operation

for (int i = 0; i < 4; i++) {
  dst.dword[i] = (((u128)(u64)a.dword[i] * (u128)(u64)b.dword[i])) >> 64;
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 4 2
3C5000 4 2

__m256i __lasx_xvmul_b (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvmul_b (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvmul.b xr, xr, xr
CPU Flags: LASX

Description

Multiply 8-bit elements in a and b, save the result in dst.

Operation

for (int i = 0; i < 32; i++) {
  dst.byte[i] = a.byte[i] * b.byte[i];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 4 2
3C5000 4 2

__m256i __lasx_xvmul_h (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvmul_h (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvmul.h xr, xr, xr
CPU Flags: LASX

Description

Multiply 16-bit elements in a and b, save the result in dst.

Operation

for (int i = 0; i < 16; i++) {
  dst.half[i] = a.half[i] * b.half[i];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 4 2
3C5000 4 2

__m256i __lasx_xvmul_w (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvmul_w (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvmul.w xr, xr, xr
CPU Flags: LASX

Description

Multiply 32-bit elements in a and b, save the result in dst.

Operation

for (int i = 0; i < 8; i++) {
  dst.word[i] = a.word[i] * b.word[i];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 4 2
3C5000 4 2

__m256i __lasx_xvmul_d (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvmul_d (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvmul.d xr, xr, xr
CPU Flags: LASX

Description

Multiply 64-bit elements in a and b, save the result in dst.

Operation

for (int i = 0; i < 4; i++) {
  dst.dword[i] = a.dword[i] * b.dword[i];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 4 2
3C5000 4 2

__m256i __lasx_xvmulwev_h_b (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvmulwev_h_b (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvmulwev.h.b xr, xr, xr
CPU Flags: LASX

Description

Multiply even-positioned signed 8-bit elements in a and signed elements in b, save the 16-bit result in dst.

Operation

for (int i = 0; i < 16; i++) {
  dst.half[i] = (s16)(s8)a.byte[2 * i] * (s16)(s8)b.byte[2 * i];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 4 2
3C5000 4 2

__m256i __lasx_xvmulwev_h_bu (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvmulwev_h_bu (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvmulwev.h.bu xr, xr, xr
CPU Flags: LASX

Description

Multiply even-positioned unsigned 8-bit elements in a and unsigned elements in b, save the 16-bit result in dst.

Operation

for (int i = 0; i < 16; i++) {
  dst.half[i] = (u16)(u8)a.byte[2 * i] * (u16)(u8)b.byte[2 * i];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 4 2
3C5000 4 2

__m256i __lasx_xvmulwev_h_bu_b (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvmulwev_h_bu_b (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvmulwev.h.bu.b xr, xr, xr
CPU Flags: LASX

Description

Multiply even-positioned unsigned 8-bit elements in a and signed elements in b, save the 16-bit result in dst.

Operation

for (int i = 0; i < 16; i++) {
  dst.half[i] = (u16)(u8)a.byte[2 * i] * (s16)(s8)b.byte[2 * i];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 4 2
3C5000 4 2

__m256i __lasx_xvmulwev_w_h (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvmulwev_w_h (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvmulwev.w.h xr, xr, xr
CPU Flags: LASX

Description

Multiply even-positioned signed 16-bit elements in a and signed elements in b, save the 32-bit result in dst.

Operation

for (int i = 0; i < 8; i++) {
  dst.word[i] = (s32)(s16)a.half[2 * i] * (s32)(s16)b.half[2 * i];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 4 2
3C5000 4 2

__m256i __lasx_xvmulwev_w_hu (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvmulwev_w_hu (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvmulwev.w.hu xr, xr, xr
CPU Flags: LASX

Description

Multiply even-positioned unsigned 16-bit elements in a and unsigned elements in b, save the 32-bit result in dst.

Operation

for (int i = 0; i < 8; i++) {
  dst.word[i] = (u32)(u16)a.half[2 * i] * (u32)(u16)b.half[2 * i];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 4 2
3C5000 4 2

__m256i __lasx_xvmulwev_w_hu_h (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvmulwev_w_hu_h (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvmulwev.w.hu.h xr, xr, xr
CPU Flags: LASX

Description

Multiply even-positioned unsigned 16-bit elements in a and signed elements in b, save the 32-bit result in dst.

Operation

for (int i = 0; i < 8; i++) {
  dst.word[i] = (u32)(u16)a.half[2 * i] * (s32)(s16)b.half[2 * i];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 4 2
3C5000 4 2

__m256i __lasx_xvmulwev_d_w (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvmulwev_d_w (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvmulwev.d.w xr, xr, xr
CPU Flags: LASX

Description

Multiply even-positioned signed 32-bit elements in a and signed elements in b, save the 64-bit result in dst.

Operation

for (int i = 0; i < 4; i++) {
  dst.dword[i] = (s64)(s32)a.word[2 * i] * (s64)(s32)b.word[2 * i];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 4 2
3C5000 4 2

__m256i __lasx_xvmulwev_d_wu (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvmulwev_d_wu (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvmulwev.d.wu xr, xr, xr
CPU Flags: LASX

Description

Multiply even-positioned unsigned 32-bit elements in a and unsigned elements in b, save the 64-bit result in dst.

Operation

for (int i = 0; i < 4; i++) {
  dst.dword[i] = (u64)(u32)a.word[2 * i] * (u64)(u32)b.word[2 * i];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 4 2
3C5000 4 2

__m256i __lasx_xvmulwev_d_wu_w (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvmulwev_d_wu_w (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvmulwev.d.wu.w xr, xr, xr
CPU Flags: LASX

Description

Multiply even-positioned unsigned 32-bit elements in a and signed elements in b, save the 64-bit result in dst.

Operation

for (int i = 0; i < 4; i++) {
  dst.dword[i] = (u64)(u32)a.word[2 * i] * (s64)(s32)b.word[2 * i];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 4 2
3C5000 4 2

__m256i __lasx_xvmulwev_q_d (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvmulwev_q_d (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvmulwev.q.d xr, xr, xr
CPU Flags: LASX

Description

Multiply even-positioned signed 64-bit elements in a and signed elements in b, save the 128-bit result in dst.

Operation

for (int i = 0; i < 2; i++) {
  dst.qword[i] = (s128)(s64)a.dword[2 * i] * (s128)(s64)b.dword[2 * i];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 7 2
3C5000 7 2

__m256i __lasx_xvmulwev_q_du (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvmulwev_q_du (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvmulwev.q.du xr, xr, xr
CPU Flags: LASX

Description

Multiply even-positioned unsigned 64-bit elements in a and unsigned elements in b, save the 128-bit result in dst.

Operation

for (int i = 0; i < 2; i++) {
  dst.qword[i] = (u128)(u64)a.dword[2 * i] * (u128)(u64)b.dword[2 * i];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 7 2
3C5000 7 2

__m256i __lasx_xvmulwev_q_du_d (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvmulwev_q_du_d (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvmulwev.q.du.d xr, xr, xr
CPU Flags: LASX

Description

Multiply even-positioned unsigned 64-bit elements in a and signed elements in b, save the 128-bit result in dst.

Operation

for (int i = 0; i < 2; i++) {
  dst.qword[i] = (u128)(u64)a.dword[2 * i] * (s128)(s64)b.dword[2 * i];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 7 2
3C5000 7 2

__m256i __lasx_xvmulwod_h_b (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvmulwod_h_b (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvmulwod.h.b xr, xr, xr
CPU Flags: LASX

Description

Multiply odd-positioned signed 8-bit elements in a and signed elements in b, save the 16-bit result in dst.

Operation

for (int i = 0; i < 16; i++) {
  dst.half[i] = (s16)(s8)a.byte[2 * i + 1] * (s16)(s8)b.byte[2 * i + 1];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 4 2
3C5000 4 2

__m256i __lasx_xvmulwod_h_bu (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvmulwod_h_bu (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvmulwod.h.bu xr, xr, xr
CPU Flags: LASX

Description

Multiply odd-positioned unsigned 8-bit elements in a and unsigned elements in b, save the 16-bit result in dst.

Operation

for (int i = 0; i < 16; i++) {
  dst.half[i] = (u16)(u8)a.byte[2 * i + 1] * (u16)(u8)b.byte[2 * i + 1];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 4 2
3C5000 4 2

__m256i __lasx_xvmulwod_h_bu_b (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvmulwod_h_bu_b (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvmulwod.h.bu.b xr, xr, xr
CPU Flags: LASX

Description

Multiply odd-positioned unsigned 8-bit elements in a and signed elements in b, save the 16-bit result in dst.

Operation

for (int i = 0; i < 16; i++) {
  dst.half[i] = (u16)(u8)a.byte[2 * i + 1] * (s16)(s8)b.byte[2 * i + 1];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 4 2
3C5000 4 2

__m256i __lasx_xvmulwod_w_h (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvmulwod_w_h (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvmulwod.w.h xr, xr, xr
CPU Flags: LASX

Description

Multiply odd-positioned signed 16-bit elements in a and signed elements in b, save the 32-bit result in dst.

Operation

for (int i = 0; i < 8; i++) {
  dst.word[i] = (s32)(s16)a.half[2 * i + 1] * (s32)(s16)b.half[2 * i + 1];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 4 2
3C5000 4 2

__m256i __lasx_xvmulwod_w_hu (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvmulwod_w_hu (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvmulwod.w.hu xr, xr, xr
CPU Flags: LASX

Description

Multiply odd-positioned unsigned 16-bit elements in a and unsigned elements in b, save the 32-bit result in dst.

Operation

for (int i = 0; i < 8; i++) {
  dst.word[i] = (u32)(u16)a.half[2 * i + 1] * (u32)(u16)b.half[2 * i + 1];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 4 2
3C5000 4 2

__m256i __lasx_xvmulwod_w_hu_h (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvmulwod_w_hu_h (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvmulwod.w.hu.h xr, xr, xr
CPU Flags: LASX

Description

Multiply odd-positioned unsigned 16-bit elements in a and signed elements in b, save the 32-bit result in dst.

Operation

for (int i = 0; i < 8; i++) {
  dst.word[i] = (u32)(u16)a.half[2 * i + 1] * (s32)(s16)b.half[2 * i + 1];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 4 2
3C5000 4 2

__m256i __lasx_xvmulwod_d_w (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvmulwod_d_w (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvmulwod.d.w xr, xr, xr
CPU Flags: LASX

Description

Multiply odd-positioned signed 32-bit elements in a and signed elements in b, save the 64-bit result in dst.

Operation

for (int i = 0; i < 4; i++) {
  dst.dword[i] = (s64)(s32)a.word[2 * i + 1] * (s64)(s32)b.word[2 * i + 1];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 4 2
3C5000 4 2

__m256i __lasx_xvmulwod_d_wu (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvmulwod_d_wu (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvmulwod.d.wu xr, xr, xr
CPU Flags: LASX

Description

Multiply odd-positioned unsigned 32-bit elements in a and unsigned elements in b, save the 64-bit result in dst.

Operation

for (int i = 0; i < 4; i++) {
  dst.dword[i] = (u64)(u32)a.word[2 * i + 1] * (u64)(u32)b.word[2 * i + 1];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 4 2
3C5000 4 2

__m256i __lasx_xvmulwod_d_wu_w (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvmulwod_d_wu_w (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvmulwod.d.wu.w xr, xr, xr
CPU Flags: LASX

Description

Multiply odd-positioned unsigned 32-bit elements in a and signed elements in b, save the 64-bit result in dst.

Operation

for (int i = 0; i < 4; i++) {
  dst.dword[i] = (u64)(u32)a.word[2 * i + 1] * (s64)(s32)b.word[2 * i + 1];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 4 2
3C5000 4 2

__m256i __lasx_xvmulwod_q_d (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvmulwod_q_d (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvmulwod.q.d xr, xr, xr
CPU Flags: LASX

Description

Multiply odd-positioned signed 64-bit elements in a and signed elements in b, save the 128-bit result in dst.

Operation

for (int i = 0; i < 2; i++) {
  dst.qword[i] = (s128)(s64)a.dword[2 * i + 1] * (s128)(s64)b.dword[2 * i + 1];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 7 2
3C5000 7 2

__m256i __lasx_xvmulwod_q_du (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvmulwod_q_du (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvmulwod.q.du xr, xr, xr
CPU Flags: LASX

Description

Multiply odd-positioned unsigned 64-bit elements in a and unsigned elements in b, save the 128-bit result in dst.

Operation

for (int i = 0; i < 2; i++) {
  dst.qword[i] = (u128)(u64)a.dword[2 * i + 1] * (u128)(u64)b.dword[2 * i + 1];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 7 2
3C5000 7 2

__m256i __lasx_xvmulwod_q_du_d (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvmulwod_q_du_d (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvmulwod.q.du.d xr, xr, xr
CPU Flags: LASX

Description

Multiply odd-positioned unsigned 64-bit elements in a and signed elements in b, save the 128-bit result in dst.

Operation

for (int i = 0; i < 2; i++) {
  dst.qword[i] = (u128)(u64)a.dword[2 * i + 1] * (s128)(s64)b.dword[2 * i + 1];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 7 2
3C5000 7 2

__m256i __lasx_xvneg_b (__m256i a)

Synopsis

__m256i __lasx_xvneg_b (__m256i a)
#include <lasxintrin.h>
Instruction: xvneg.b xr, xr
CPU Flags: LASX

Description

Negate 8-bit elements in a and save the result in dst.

Operation

for (int i = 0; i < 32; i++) {
  dst.byte[i] = -a.byte[i];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 1 4
3C5000 1 2

__m256i __lasx_xvneg_h (__m256i a)

Synopsis

__m256i __lasx_xvneg_h (__m256i a)
#include <lasxintrin.h>
Instruction: xvneg.h xr, xr
CPU Flags: LASX

Description

Negate 16-bit elements in a and save the result in dst.

Operation

for (int i = 0; i < 16; i++) {
  dst.half[i] = -a.half[i];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 1 4
3C5000 1 2

__m256i __lasx_xvneg_w (__m256i a)

Synopsis

__m256i __lasx_xvneg_w (__m256i a)
#include <lasxintrin.h>
Instruction: xvneg.w xr, xr
CPU Flags: LASX

Description

Negate 32-bit elements in a and save the result in dst.

Operation

for (int i = 0; i < 8; i++) {
  dst.word[i] = -a.word[i];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 1 4
3C5000 1 2

__m256i __lasx_xvneg_d (__m256i a)

Synopsis

__m256i __lasx_xvneg_d (__m256i a)
#include <lasxintrin.h>
Instruction: xvneg.d xr, xr
CPU Flags: LASX

Description

Negate 64-bit elements in a and save the result in dst.

Operation

for (int i = 0; i < 4; i++) {
  dst.dword[i] = -a.dword[i];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 1 4
3C5000 1 2

__m256i __lasx_xvsadd_b (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvsadd_b (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvsadd.b xr, xr, xr
CPU Flags: LASX

Description

Saturating add the signed 8-bit elements in a and b, store the result to dst.

Operation

for (int i = 0; i < 32; i++) {
  dst.byte[i] = (s8)sadd((s8)a.byte[i], (s8)b.byte[i]);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 1 4
3C5000 1 2

__m256i __lasx_xvsadd_bu (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvsadd_bu (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvsadd.bu xr, xr, xr
CPU Flags: LASX

Description

Saturating add the unsigned 8-bit elements in a and b, store the result to dst.

Operation

for (int i = 0; i < 32; i++) {
  dst.byte[i] = (u8)sadd((u8)a.byte[i], (u8)b.byte[i]);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 1 4
3C5000 1 2

__m256i __lasx_xvsadd_h (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvsadd_h (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvsadd.h xr, xr, xr
CPU Flags: LASX

Description

Saturating add the signed 16-bit elements in a and b, store the result to dst.

Operation

for (int i = 0; i < 16; i++) {
  dst.half[i] = (s16)sadd((s16)a.half[i], (s16)b.half[i]);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 1 4
3C5000 1 2

__m256i __lasx_xvsadd_hu (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvsadd_hu (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvsadd.hu xr, xr, xr
CPU Flags: LASX

Description

Saturating add the unsigned 16-bit elements in a and b, store the result to dst.

Operation

for (int i = 0; i < 16; i++) {
  dst.half[i] = (u16)sadd((u16)a.half[i], (u16)b.half[i]);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 1 4
3C5000 1 2

__m256i __lasx_xvsadd_w (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvsadd_w (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvsadd.w xr, xr, xr
CPU Flags: LASX

Description

Saturating add the signed 32-bit elements in a and b, store the result to dst.

Operation

for (int i = 0; i < 8; i++) {
  dst.word[i] = (s32)sadd((s32)a.word[i], (s32)b.word[i]);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 1 4
3C5000 1 2

__m256i __lasx_xvsadd_wu (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvsadd_wu (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvsadd.wu xr, xr, xr
CPU Flags: LASX

Description

Saturating add the unsigned 32-bit elements in a and b, store the result to dst.

Operation

for (int i = 0; i < 8; i++) {
  dst.word[i] = (u32)sadd((u32)a.word[i], (u32)b.word[i]);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 1 4
3C5000 1 2

__m256i __lasx_xvsadd_d (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvsadd_d (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvsadd.d xr, xr, xr
CPU Flags: LASX

Description

Saturating add the signed 64-bit elements in a and b, store the result to dst.

Operation

for (int i = 0; i < 4; i++) {
  dst.dword[i] = (s64)sadd((s64)a.dword[i], (s64)b.dword[i]);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 1 4
3C5000 1 2

__m256i __lasx_xvsadd_du (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvsadd_du (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvsadd.du xr, xr, xr
CPU Flags: LASX

Description

Saturating add the unsigned 64-bit elements in a and b, store the result to dst.

Operation

for (int i = 0; i < 4; i++) {
  dst.dword[i] = (u64)sadd((u64)a.dword[i], (u64)b.dword[i]);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 1 4
3C5000 1 2

__m256i __lasx_xvssub_b (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvssub_b (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvssub.b xr, xr, xr
CPU Flags: LASX

Description

Saturating subtract the signed 8-bit elements in a and b, store the result to dst.

Operation

for (int i = 0; i < 32; i++) {
  dst.byte[i] = (s8)ssub((s8)a.byte[i], (s8)b.byte[i]);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 1 4
3C5000 1 2

__m256i __lasx_xvssub_bu (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvssub_bu (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvssub.bu xr, xr, xr
CPU Flags: LASX

Description

Saturating subtract the unsigned 8-bit elements in a and b, store the result to dst.

Operation

for (int i = 0; i < 32; i++) {
  dst.byte[i] = (u8)ssub((u8)a.byte[i], (u8)b.byte[i]);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 1 4
3C5000 1 2

__m256i __lasx_xvssub_h (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvssub_h (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvssub.h xr, xr, xr
CPU Flags: LASX

Description

Saturating subtract the signed 16-bit elements in a and b, store the result to dst.

Operation

for (int i = 0; i < 16; i++) {
  dst.half[i] = (s16)ssub((s16)a.half[i], (s16)b.half[i]);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 1 4
3C5000 1 2

__m256i __lasx_xvssub_hu (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvssub_hu (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvssub.hu xr, xr, xr
CPU Flags: LASX

Description

Saturating subtract the unsigned 16-bit elements in a and b, store the result to dst.

Operation

for (int i = 0; i < 16; i++) {
  dst.half[i] = (u16)ssub((u16)a.half[i], (u16)b.half[i]);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 1 4
3C5000 1 2

__m256i __lasx_xvssub_w (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvssub_w (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvssub.w xr, xr, xr
CPU Flags: LASX

Description

Saturating subtract the signed 32-bit elements in a and b, store the result to dst.

Operation

for (int i = 0; i < 8; i++) {
  dst.word[i] = (s32)ssub((s32)a.word[i], (s32)b.word[i]);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 1 4
3C5000 1 2

__m256i __lasx_xvssub_wu (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvssub_wu (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvssub.wu xr, xr, xr
CPU Flags: LASX

Description

Saturating subtract the unsigned 32-bit elements in a and b, store the result to dst.

Operation

for (int i = 0; i < 8; i++) {
  dst.word[i] = (u32)ssub((u32)a.word[i], (u32)b.word[i]);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 1 4
3C5000 1 2

__m256i __lasx_xvssub_d (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvssub_d (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvssub.d xr, xr, xr
CPU Flags: LASX

Description

Saturating subtract the signed 64-bit elements in a and b, store the result to dst.

Operation

for (int i = 0; i < 4; i++) {
  dst.dword[i] = (s64)ssub((s64)a.dword[i], (s64)b.dword[i]);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 1 4
3C5000 1 2

__m256i __lasx_xvssub_du (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvssub_du (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvssub.du xr, xr, xr
CPU Flags: LASX

Description

Saturating subtract the unsigned 64-bit elements in a and b, store the result to dst.

Operation

for (int i = 0; i < 4; i++) {
  dst.dword[i] = (u64)ssub((u64)a.dword[i], (u64)b.dword[i]);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 1 4
3C5000 1 2

__m256i __lasx_xvsub_b (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvsub_b (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvsub.b xr, xr, xr
CPU Flags: LASX

Description

Subtract 8-bit elements in a and b, save the result in dst.

Operation

for (int i = 0; i < 32; i++) {
  dst.byte[i] = a.byte[i] - b.byte[i];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 1 4
3C5000 1 2

__m256i __lasx_xvsub_h (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvsub_h (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvsub.h xr, xr, xr
CPU Flags: LASX

Description

Subtract 16-bit elements in a and b, save the result in dst.

Operation

for (int i = 0; i < 16; i++) {
  dst.half[i] = a.half[i] - b.half[i];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 1 4
3C5000 1 2

__m256i __lasx_xvsub_w (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvsub_w (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvsub.w xr, xr, xr
CPU Flags: LASX

Description

Subtract 32-bit elements in a and b, save the result in dst.

Operation

for (int i = 0; i < 8; i++) {
  dst.word[i] = a.word[i] - b.word[i];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 1 4
3C5000 1 2

__m256i __lasx_xvsub_d (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvsub_d (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvsub.d xr, xr, xr
CPU Flags: LASX

Description

Subtract 64-bit elements in a and b, save the result in dst.

Operation

for (int i = 0; i < 4; i++) {
  dst.dword[i] = a.dword[i] - b.dword[i];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 1 4
3C5000 1 2

__m256i __lasx_xvsub_q (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvsub_q (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvsub.q xr, xr, xr
CPU Flags: LASX

Description

Subtract 128-bit elements in a and b, save the result in dst.

Operation

for (int i = 0; i < 2; i++) {
  dst.qword[i] = a.qword[i] - b.qword[i];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 3 2
3C5000 3 2

__m256i __lasx_xvsubi_bu (__m256i a, imm0_31 imm)

Synopsis

__m256i __lasx_xvsubi_bu (__m256i a, imm0_31 imm)
#include <lasxintrin.h>
Instruction: xvsubi.bu xr, xr, imm
CPU Flags: LASX

Description

Subtract imm from 8-bit elements in a, save the result in dst.

Operation

for (int i = 0; i < 32; i++) {
  dst.byte[i] = a.byte[i] - imm;
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 1 4
3C5000 1 2

__m256i __lasx_xvsubi_hu (__m256i a, imm0_31 imm)

Synopsis

__m256i __lasx_xvsubi_hu (__m256i a, imm0_31 imm)
#include <lasxintrin.h>
Instruction: xvsubi.hu xr, xr, imm
CPU Flags: LASX

Description

Subtract imm from 16-bit elements in a, save the result in dst.

Operation

for (int i = 0; i < 16; i++) {
  dst.half[i] = a.half[i] - imm;
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 1 4
3C5000 1 2

__m256i __lasx_xvsubi_wu (__m256i a, imm0_31 imm)

Synopsis

__m256i __lasx_xvsubi_wu (__m256i a, imm0_31 imm)
#include <lasxintrin.h>
Instruction: xvsubi.wu xr, xr, imm
CPU Flags: LASX

Description

Subtract imm from 32-bit elements in a, save the result in dst.

Operation

for (int i = 0; i < 8; i++) {
  dst.word[i] = a.word[i] - imm;
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 1 4
3C5000 1 2

__m256i __lasx_xvsubi_du (__m256i a, imm0_31 imm)

Synopsis

__m256i __lasx_xvsubi_du (__m256i a, imm0_31 imm)
#include <lasxintrin.h>
Instruction: xvsubi.du xr, xr, imm
CPU Flags: LASX

Description

Subtract imm from 64-bit elements in a, save the result in dst.

Operation

for (int i = 0; i < 4; i++) {
  dst.dword[i] = a.dword[i] - imm;
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 1 4
3C5000 1 2

__m256i __lasx_xvsubwev_h_b (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvsubwev_h_b (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvsubwev.h.b xr, xr, xr
CPU Flags: LASX

Description

Subtract even-positioned signed 8-bit elements in a and signed elements in b, save the 16-bit result in dst.

Operation

for (int i = 0; i < 16; i++) {
  dst.half[i] = (s16)(s8)a.byte[2 * i] - (s16)(s8)b.byte[2 * i];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 2 2
3C5000 2 2

__m256i __lasx_xvsubwev_h_bu (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvsubwev_h_bu (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvsubwev.h.bu xr, xr, xr
CPU Flags: LASX

Description

Subtract even-positioned unsigned 8-bit elements in a and unsigned elements in b, save the 16-bit result in dst.

Operation

for (int i = 0; i < 16; i++) {
  dst.half[i] = (u16)(u8)a.byte[2 * i] - (u16)(u8)b.byte[2 * i];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 2 2
3C5000 2 2

__m256i __lasx_xvsubwev_w_h (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvsubwev_w_h (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvsubwev.w.h xr, xr, xr
CPU Flags: LASX

Description

Subtract even-positioned signed 16-bit elements in a and signed elements in b, save the 32-bit result in dst.

Operation

for (int i = 0; i < 8; i++) {
  dst.word[i] = (s32)(s16)a.half[2 * i] - (s32)(s16)b.half[2 * i];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 2 2
3C5000 2 2

__m256i __lasx_xvsubwev_w_hu (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvsubwev_w_hu (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvsubwev.w.hu xr, xr, xr
CPU Flags: LASX

Description

Subtract even-positioned unsigned 16-bit elements in a and unsigned elements in b, save the 32-bit result in dst.

Operation

for (int i = 0; i < 8; i++) {
  dst.word[i] = (u32)(u16)a.half[2 * i] - (u32)(u16)b.half[2 * i];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 2 2
3C5000 2 2

__m256i __lasx_xvsubwev_d_w (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvsubwev_d_w (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvsubwev.d.w xr, xr, xr
CPU Flags: LASX

Description

Subtract even-positioned signed 32-bit elements in a and signed elements in b, save the 64-bit result in dst.

Operation

for (int i = 0; i < 4; i++) {
  dst.dword[i] = (s64)(s32)a.word[2 * i] - (s64)(s32)b.word[2 * i];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 2 2
3C5000 2 2

__m256i __lasx_xvsubwev_d_wu (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvsubwev_d_wu (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvsubwev.d.wu xr, xr, xr
CPU Flags: LASX

Description

Subtract even-positioned unsigned 32-bit elements in a and unsigned elements in b, save the 64-bit result in dst.

Operation

for (int i = 0; i < 4; i++) {
  dst.dword[i] = (u64)(u32)a.word[2 * i] - (u64)(u32)b.word[2 * i];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 2 2
3C5000 2 2

__m256i __lasx_xvsubwev_q_d (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvsubwev_q_d (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvsubwev.q.d xr, xr, xr
CPU Flags: LASX

Description

Subtract even-positioned signed 64-bit elements in a and signed elements in b, save the 128-bit result in dst.

Operation

for (int i = 0; i < 2; i++) {
  dst.qword[i] = (s128)(s64)a.dword[2 * i] - (s128)(s64)b.dword[2 * i];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 3 2
3C5000 3 2

__m256i __lasx_xvsubwev_q_du (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvsubwev_q_du (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvsubwev.q.du xr, xr, xr
CPU Flags: LASX

Description

Subtract even-positioned unsigned 64-bit elements in a and unsigned elements in b, save the 128-bit result in dst.

Operation

for (int i = 0; i < 2; i++) {
  dst.qword[i] = (u128)(u64)a.dword[2 * i] - (u128)(u64)b.dword[2 * i];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 3 2
3C5000 3 2

__m256i __lasx_xvsubwod_h_b (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvsubwod_h_b (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvsubwod.h.b xr, xr, xr
CPU Flags: LASX

Description

Subtract odd-positioned signed 8-bit elements in a and signed elements in b, save the 16-bit result in dst.

Operation

for (int i = 0; i < 16; i++) {
  dst.half[i] = (s16)(s8)a.byte[2 * i + 1] - (s16)(s8)b.byte[2 * i + 1];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 2 2
3C5000 2 2

__m256i __lasx_xvsubwod_h_bu (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvsubwod_h_bu (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvsubwod.h.bu xr, xr, xr
CPU Flags: LASX

Description

Subtract odd-positioned unsigned 8-bit elements in a and unsigned elements in b, save the 16-bit result in dst.

Operation

for (int i = 0; i < 16; i++) {
  dst.half[i] = (u16)(u8)a.byte[2 * i + 1] - (u16)(u8)b.byte[2 * i + 1];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 2 2
3C5000 2 2

__m256i __lasx_xvsubwod_w_h (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvsubwod_w_h (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvsubwod.w.h xr, xr, xr
CPU Flags: LASX

Description

Subtract odd-positioned signed 16-bit elements in a and signed elements in b, save the 32-bit result in dst.

Operation

for (int i = 0; i < 8; i++) {
  dst.word[i] = (s32)(s16)a.half[2 * i + 1] - (s32)(s16)b.half[2 * i + 1];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 2 2
3C5000 2 2

__m256i __lasx_xvsubwod_w_hu (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvsubwod_w_hu (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvsubwod.w.hu xr, xr, xr
CPU Flags: LASX

Description

Subtract odd-positioned unsigned 16-bit elements in a and unsigned elements in b, save the 32-bit result in dst.

Operation

for (int i = 0; i < 8; i++) {
  dst.word[i] = (u32)(u16)a.half[2 * i + 1] - (u32)(u16)b.half[2 * i + 1];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 2 2
3C5000 2 2

__m256i __lasx_xvsubwod_d_w (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvsubwod_d_w (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvsubwod.d.w xr, xr, xr
CPU Flags: LASX

Description

Subtract odd-positioned signed 32-bit elements in a and signed elements in b, save the 64-bit result in dst.

Operation

for (int i = 0; i < 4; i++) {
  dst.dword[i] = (s64)(s32)a.word[2 * i + 1] - (s64)(s32)b.word[2 * i + 1];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 2 2
3C5000 2 2

__m256i __lasx_xvsubwod_d_wu (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvsubwod_d_wu (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvsubwod.d.wu xr, xr, xr
CPU Flags: LASX

Description

Subtract odd-positioned unsigned 32-bit elements in a and unsigned elements in b, save the 64-bit result in dst.

Operation

for (int i = 0; i < 4; i++) {
  dst.dword[i] = (u64)(u32)a.word[2 * i + 1] - (u64)(u32)b.word[2 * i + 1];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 2 2
3C5000 2 2

__m256i __lasx_xvsubwod_q_d (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvsubwod_q_d (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvsubwod.q.d xr, xr, xr
CPU Flags: LASX

Description

Subtract odd-positioned signed 64-bit elements in a and signed elements in b, save the 128-bit result in dst.

Operation

for (int i = 0; i < 2; i++) {
  dst.qword[i] = (s128)(s64)a.dword[2 * i + 1] - (s128)(s64)b.dword[2 * i + 1];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 3 2
3C5000 3 2

__m256i __lasx_xvsubwod_q_du (__m256i a, __m256i b)

Synopsis

__m256i __lasx_xvsubwod_q_du (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvsubwod.q.du xr, xr, xr
CPU Flags: LASX

Description

Subtract odd-positioned unsigned 64-bit elements in a and unsigned elements in b, save the 128-bit result in dst.

Operation

for (int i = 0; i < 2; i++) {
  dst.qword[i] = (u128)(u64)a.dword[2 * i + 1] - (u128)(u64)b.dword[2 * i + 1];
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 3 2
3C5000 3 2