Shift

__m128i __lsx_vbsll_v (__m128i a, imm0_31 imm)

Synopsis

__m128i __lsx_vbsll_v (__m128i a, imm0_31 imm)
#include <lsxintrin.h>
Instruction: vbsll.v vr, vr, imm
CPU Flags: LSX

Description

Compute whole vector a shifted left by imm * 8 bits.

Operation

int shift = (imm * 8) % 128;
dst.qword[0] = (u128)a.qword[0] << shift;

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 1 4
3C5000 1 2

__m128i __lsx_vbsrl_v (__m128i a, imm0_31 imm)

Synopsis

__m128i __lsx_vbsrl_v (__m128i a, imm0_31 imm)
#include <lsxintrin.h>
Instruction: vbsrl.v vr, vr, imm
CPU Flags: LSX

Description

Compute whole vector a shifted right by imm * 8 bits.

Operation

int shift = (imm * 8) % 128;
dst.qword[0] = (u128)a.qword[0] >> shift;

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 1 4
3C5000 1 2

__m128i __lsx_vsll_b (__m128i a, __m128i b)

Synopsis

__m128i __lsx_vsll_b (__m128i a, __m128i b)
#include <lsxintrin.h>
Instruction: vsll.b vr, vr, vr
CPU Flags: LSX

Description

Logical left shift the unsigned 8-bit elements in a by elements in b, store the result to dst.

Operation

for (int i = 0; i < 16; i++) {
  dst.byte[i] = a.byte[i] << (b.byte[i] & 0x7);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 1 4
3C5000 1 2

__m128i __lsx_vsll_h (__m128i a, __m128i b)

Synopsis

__m128i __lsx_vsll_h (__m128i a, __m128i b)
#include <lsxintrin.h>
Instruction: vsll.h vr, vr, vr
CPU Flags: LSX

Description

Logical left shift the unsigned 16-bit elements in a by elements in b, store the result to dst.

Operation

for (int i = 0; i < 8; i++) {
  dst.half[i] = a.half[i] << (b.half[i] & 0xf);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 1 4
3C5000 1 2

__m128i __lsx_vsll_w (__m128i a, __m128i b)

Synopsis

__m128i __lsx_vsll_w (__m128i a, __m128i b)
#include <lsxintrin.h>
Instruction: vsll.w vr, vr, vr
CPU Flags: LSX

Description

Logical left shift the unsigned 32-bit elements in a by elements in b, store the result to dst.

Operation

for (int i = 0; i < 4; i++) {
  dst.word[i] = a.word[i] << (b.word[i] & 0x1f);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 1 4
3C5000 1 2

__m128i __lsx_vsll_d (__m128i a, __m128i b)

Synopsis

__m128i __lsx_vsll_d (__m128i a, __m128i b)
#include <lsxintrin.h>
Instruction: vsll.d vr, vr, vr
CPU Flags: LSX

Description

Logical left shift the unsigned 64-bit elements in a by elements in b, store the result to dst.

Operation

for (int i = 0; i < 2; i++) {
  dst.dword[i] = a.dword[i] << (b.dword[i] & 0x3f);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 1 4
3C5000 1 2

__m128i __lsx_vslli_b (__m128i a, imm0_7 imm)

Synopsis

__m128i __lsx_vslli_b (__m128i a, imm0_7 imm)
#include <lsxintrin.h>
Instruction: vslli.b vr, vr, imm
CPU Flags: LSX

Description

Logical left shift the unsigned 8-bit elements in a by imm, store the result to dst.

Operation

for (int i = 0; i < 16; i++) {
  dst.byte[i] = a.byte[i] << imm;
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 1 4
3C5000 1 2

__m128i __lsx_vslli_h (__m128i a, imm0_15 imm)

Synopsis

__m128i __lsx_vslli_h (__m128i a, imm0_15 imm)
#include <lsxintrin.h>
Instruction: vslli.h vr, vr, imm
CPU Flags: LSX

Description

Logical left shift the unsigned 16-bit elements in a by imm, store the result to dst.

Operation

for (int i = 0; i < 8; i++) {
  dst.half[i] = a.half[i] << imm;
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 1 4
3C5000 1 2

__m128i __lsx_vslli_w (__m128i a, imm0_31 imm)

Synopsis

__m128i __lsx_vslli_w (__m128i a, imm0_31 imm)
#include <lsxintrin.h>
Instruction: vslli.w vr, vr, imm
CPU Flags: LSX

Description

Logical left shift the unsigned 32-bit elements in a by imm, store the result to dst.

Operation

for (int i = 0; i < 4; i++) {
  dst.word[i] = a.word[i] << imm;
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 1 4
3C5000 1 2

__m128i __lsx_vslli_d (__m128i a, imm0_63 imm)

Synopsis

__m128i __lsx_vslli_d (__m128i a, imm0_63 imm)
#include <lsxintrin.h>
Instruction: vslli.d vr, vr, imm
CPU Flags: LSX

Description

Logical left shift the unsigned 64-bit elements in a by imm, store the result to dst.

Operation

for (int i = 0; i < 2; i++) {
  dst.dword[i] = a.dword[i] << imm;
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 1 4
3C5000 1 2

__m128i __lsx_vsllwil_h_b (__m128i a, imm0_7 imm)

Synopsis

__m128i __lsx_vsllwil_h_b (__m128i a, imm0_7 imm)
#include <lsxintrin.h>
Instruction: vsllwil.h.b vr, vr, imm
CPU Flags: LSX

Description

Extend and shift signed 8-bit elements in a by imm to signed 16-bit result.

Operation

for (int i = 0; i < 8; i++) {
  dst.half[i] = (s16)(s8)a.byte[i] << imm;
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 2 2
3C5000 2 1

__m128i __lsx_vsllwil_hu_bu (__m128i a, imm0_7 imm)

Synopsis

__m128i __lsx_vsllwil_hu_bu (__m128i a, imm0_7 imm)
#include <lsxintrin.h>
Instruction: vsllwil.hu.bu vr, vr, imm
CPU Flags: LSX

Description

Extend and shift unsigned 8-bit elements in a by imm to unsigned 16-bit result.

Operation

for (int i = 0; i < 8; i++) {
  dst.half[i] = (u16)(u8)a.byte[i] << imm;
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 2 2
3C5000 2 1

__m128i __lsx_vsllwil_w_h (__m128i a, imm0_15 imm)

Synopsis

__m128i __lsx_vsllwil_w_h (__m128i a, imm0_15 imm)
#include <lsxintrin.h>
Instruction: vsllwil.w.h vr, vr, imm
CPU Flags: LSX

Description

Extend and shift signed 16-bit elements in a by imm to signed 32-bit result.

Operation

for (int i = 0; i < 4; i++) {
  dst.word[i] = (s32)(s16)a.half[i] << imm;
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 2 2
3C5000 2 1

__m128i __lsx_vsllwil_wu_hu (__m128i a, imm0_15 imm)

Synopsis

__m128i __lsx_vsllwil_wu_hu (__m128i a, imm0_15 imm)
#include <lsxintrin.h>
Instruction: vsllwil.wu.hu vr, vr, imm
CPU Flags: LSX

Description

Extend and shift unsigned 16-bit elements in a by imm to unsigned 32-bit result.

Operation

for (int i = 0; i < 4; i++) {
  dst.word[i] = (u32)(u16)a.half[i] << imm;
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 2 2
3C5000 2 1

__m128i __lsx_vsllwil_d_w (__m128i a, imm0_31 imm)

Synopsis

__m128i __lsx_vsllwil_d_w (__m128i a, imm0_31 imm)
#include <lsxintrin.h>
Instruction: vsllwil.d.w vr, vr, imm
CPU Flags: LSX

Description

Extend and shift signed 32-bit elements in a by imm to signed 64-bit result.

Operation

for (int i = 0; i < 2; i++) {
  dst.dword[i] = (s64)(s32)a.word[i] << imm;
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 2 2
3C5000 2 1

__m128i __lsx_vsllwil_du_wu (__m128i a, imm0_31 imm)

Synopsis

__m128i __lsx_vsllwil_du_wu (__m128i a, imm0_31 imm)
#include <lsxintrin.h>
Instruction: vsllwil.du.wu vr, vr, imm
CPU Flags: LSX

Description

Extend and shift unsigned 32-bit elements in a by imm to unsigned 64-bit result.

Operation

for (int i = 0; i < 2; i++) {
  dst.dword[i] = (u64)(u32)a.word[i] << imm;
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 2 2
3C5000 2 1

__m128i __lsx_vsra_b (__m128i a, __m128i b)

Synopsis

__m128i __lsx_vsra_b (__m128i a, __m128i b)
#include <lsxintrin.h>
Instruction: vsra.b vr, vr, vr
CPU Flags: LSX

Description

Arithmetic right shift the signed 8-bit elements in a by elements in b, store the result to dst.

Operation

for (int i = 0; i < 16; i++) {
  dst.byte[i] = ((s8)a.byte[i]) >> (b.byte[i] & 0x7);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 1 4
3C5000 1 2

__m128i __lsx_vsra_h (__m128i a, __m128i b)

Synopsis

__m128i __lsx_vsra_h (__m128i a, __m128i b)
#include <lsxintrin.h>
Instruction: vsra.h vr, vr, vr
CPU Flags: LSX

Description

Arithmetic right shift the signed 16-bit elements in a by elements in b, store the result to dst.

Operation

for (int i = 0; i < 8; i++) {
  dst.half[i] = ((s16)a.half[i]) >> (b.half[i] & 0xf);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 1 4
3C5000 1 2

__m128i __lsx_vsra_w (__m128i a, __m128i b)

Synopsis

__m128i __lsx_vsra_w (__m128i a, __m128i b)
#include <lsxintrin.h>
Instruction: vsra.w vr, vr, vr
CPU Flags: LSX

Description

Arithmetic right shift the signed 32-bit elements in a by elements in b, store the result to dst.

Operation

for (int i = 0; i < 4; i++) {
  dst.word[i] = ((s32)a.word[i]) >> (b.word[i] & 0x1f);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 1 4
3C5000 1 2

__m128i __lsx_vsra_d (__m128i a, __m128i b)

Synopsis

__m128i __lsx_vsra_d (__m128i a, __m128i b)
#include <lsxintrin.h>
Instruction: vsra.d vr, vr, vr
CPU Flags: LSX

Description

Arithmetic right shift the signed 64-bit elements in a by elements in b, store the result to dst.

Operation

for (int i = 0; i < 2; i++) {
  dst.dword[i] = ((s64)a.dword[i]) >> (b.dword[i] & 0x3f);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 1 4
3C5000 1 2

__m128i __lsx_vsrai_b (__m128i a, imm0_7 imm)

Synopsis

__m128i __lsx_vsrai_b (__m128i a, imm0_7 imm)
#include <lsxintrin.h>
Instruction: vsrai.b vr, vr, imm
CPU Flags: LSX

Description

Arithmetic right shift the signed 8-bit elements in a by imm, store the result to dst.

Operation

for (int i = 0; i < 16; i++) {
  dst.byte[i] = ((s8)a.byte[i]) >> imm;
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 1 4
3C5000 1 2

__m128i __lsx_vsrai_h (__m128i a, imm0_15 imm)

Synopsis

__m128i __lsx_vsrai_h (__m128i a, imm0_15 imm)
#include <lsxintrin.h>
Instruction: vsrai.h vr, vr, imm
CPU Flags: LSX

Description

Arithmetic right shift the signed 16-bit elements in a by imm, store the result to dst.

Operation

for (int i = 0; i < 8; i++) {
  dst.half[i] = ((s16)a.half[i]) >> imm;
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 1 4
3C5000 1 2

__m128i __lsx_vsrai_w (__m128i a, imm0_31 imm)

Synopsis

__m128i __lsx_vsrai_w (__m128i a, imm0_31 imm)
#include <lsxintrin.h>
Instruction: vsrai.w vr, vr, imm
CPU Flags: LSX

Description

Arithmetic right shift the signed 32-bit elements in a by imm, store the result to dst.

Operation

for (int i = 0; i < 4; i++) {
  dst.word[i] = ((s32)a.word[i]) >> imm;
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 1 4
3C5000 1 2

__m128i __lsx_vsrai_d (__m128i a, imm0_63 imm)

Synopsis

__m128i __lsx_vsrai_d (__m128i a, imm0_63 imm)
#include <lsxintrin.h>
Instruction: vsrai.d vr, vr, imm
CPU Flags: LSX

Description

Arithmetic right shift the signed 64-bit elements in a by imm, store the result to dst.

Operation

for (int i = 0; i < 2; i++) {
  dst.dword[i] = ((s64)a.dword[i]) >> imm;
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 1 4
3C5000 1 2

__m128i __lsx_vsran_b_h (__m128i a, __m128i b)

Synopsis

__m128i __lsx_vsran_b_h (__m128i a, __m128i b)
#include <lsxintrin.h>
Instruction: vsran.b.h vr, vr, vr
CPU Flags: LSX

Description

Arithmetic right shift the signed 16-bit elements in a by elements in b, truncate to 8-bit and store the result to dst.

Operation

for (int i = 0; i < 16; i++) {
  dst.byte[i] = (i < 8) ? (s8)((s16)a.half[i] >> (b.half[i] & 15)) : 0;
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 2 2
3C5000 2 1

__m128i __lsx_vsran_h_w (__m128i a, __m128i b)

Synopsis

__m128i __lsx_vsran_h_w (__m128i a, __m128i b)
#include <lsxintrin.h>
Instruction: vsran.h.w vr, vr, vr
CPU Flags: LSX

Description

Arithmetic right shift the signed 32-bit elements in a by elements in b, truncate to 16-bit and store the result to dst.

Operation

for (int i = 0; i < 8; i++) {
  dst.half[i] = (i < 4) ? (s16)((s32)a.word[i] >> (b.word[i] & 31)) : 0;
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 2 2
3C5000 2 1

__m128i __lsx_vsran_w_d (__m128i a, __m128i b)

Synopsis

__m128i __lsx_vsran_w_d (__m128i a, __m128i b)
#include <lsxintrin.h>
Instruction: vsran.w.d vr, vr, vr
CPU Flags: LSX

Description

Arithmetic right shift the signed 64-bit elements in a by elements in b, truncate to 32-bit and store the result to dst.

Operation

for (int i = 0; i < 4; i++) {
  dst.word[i] = (i < 2) ? (s32)((s64)a.dword[i] >> (b.dword[i] & 63)) : 0;
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 2 2
3C5000 2 1

__m128i __lsx_vsrani_b_h (__m128i a, __m128i b, imm0_15 imm)

Synopsis

__m128i __lsx_vsrani_b_h (__m128i a, __m128i b, imm0_15 imm)
#include <lsxintrin.h>
Instruction: vsrani.b.h vr, vr, imm
CPU Flags: LSX

Description

Arithmetic right shift the signed 16-bit elements in a and b by imm, truncate to 8-bit and store the result to dst.

Operation

for (int i = 0; i < 16; i++) {
  dst.byte[i] =
      (i < 8) ? (s8)((s16)b.half[i] >> imm) : (s8)((s16)a.half[i - 8] >> imm);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 4 2
3C5000 4 1

__m128i __lsx_vsrani_h_w (__m128i a, __m128i b, imm0_31 imm)

Synopsis

__m128i __lsx_vsrani_h_w (__m128i a, __m128i b, imm0_31 imm)
#include <lsxintrin.h>
Instruction: vsrani.h.w vr, vr, imm
CPU Flags: LSX

Description

Arithmetic right shift the signed 32-bit elements in a and b by imm, truncate to 16-bit and store the result to dst.

Operation

for (int i = 0; i < 8; i++) {
  dst.half[i] =
      (i < 4) ? (s16)((s32)b.word[i] >> imm) : (s16)((s32)a.word[i - 4] >> imm);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 4 2
3C5000 4 1

__m128i __lsx_vsrani_w_d (__m128i a, __m128i b, imm0_63 imm)

Synopsis

__m128i __lsx_vsrani_w_d (__m128i a, __m128i b, imm0_63 imm)
#include <lsxintrin.h>
Instruction: vsrani.w.d vr, vr, imm
CPU Flags: LSX

Description

Arithmetic right shift the signed 64-bit elements in a and b by imm, truncate to 32-bit and store the result to dst.

Operation

for (int i = 0; i < 4; i++) {
  dst.word[i] = (i < 2) ? (s32)((s64)b.dword[i] >> imm)
                        : (s32)((s64)a.dword[i - 2] >> imm);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 4 2
3C5000 4 1

__m128i __lsx_vsrani_d_q (__m128i a, __m128i b, imm0_127 imm)

Synopsis

__m128i __lsx_vsrani_d_q (__m128i a, __m128i b, imm0_127 imm)
#include <lsxintrin.h>
Instruction: vsrani.d.q vr, vr, imm
CPU Flags: LSX

Description

Arithmetic right shift the signed 128-bit elements in a and b by imm, truncate to 64-bit and store the result to dst.

Operation

for (int i = 0; i < 2; i++) {
  dst.dword[i] = (i < 1) ? (s64)((s128)b.qword[i] >> imm)
                         : (s64)((s128)a.qword[i - 1] >> imm);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 3 2
3C5000 3 2

__m128i __lsx_vsrar_b (__m128i a, __m128i b)

Synopsis

__m128i __lsx_vsrar_b (__m128i a, __m128i b)
#include <lsxintrin.h>
Instruction: vsrar.b vr, vr, vr
CPU Flags: LSX

Description

Arithmetic right shift (with rounding) the signed 8-bit elements in a by elements in b, store the result to dst.

Operation

for (int i = 0; i < 16; i++) {
  if ((b.byte[i] & 0x7) == 0) {
    dst.byte[i] = a.byte[i];
  } else {
    dst.byte[i] = ((s8)a.byte[i] >> (b.byte[i] & 0x7)) +
                  (((s8)a.byte[i] >> ((b.byte[i] & 0x7) - 1)) & 0x1);
  }
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 3 2
3C5000 3 2

__m128i __lsx_vsrar_h (__m128i a, __m128i b)

Synopsis

__m128i __lsx_vsrar_h (__m128i a, __m128i b)
#include <lsxintrin.h>
Instruction: vsrar.h vr, vr, vr
CPU Flags: LSX

Description

Arithmetic right shift (with rounding) the signed 16-bit elements in a by elements in b, store the result to dst.

Operation

for (int i = 0; i < 8; i++) {
  if ((b.half[i] & 0xf) == 0) {
    dst.half[i] = a.half[i];
  } else {
    dst.half[i] = ((s16)a.half[i] >> (b.half[i] & 0xf)) +
                  (((s16)a.half[i] >> ((b.half[i] & 0xf) - 1)) & 0x1);
  }
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 3 2
3C5000 3 2

__m128i __lsx_vsrar_w (__m128i a, __m128i b)

Synopsis

__m128i __lsx_vsrar_w (__m128i a, __m128i b)
#include <lsxintrin.h>
Instruction: vsrar.w vr, vr, vr
CPU Flags: LSX

Description

Arithmetic right shift (with rounding) the signed 32-bit elements in a by elements in b, store the result to dst.

Operation

for (int i = 0; i < 4; i++) {
  if ((b.word[i] & 0x1f) == 0) {
    dst.word[i] = a.word[i];
  } else {
    dst.word[i] = ((s32)a.word[i] >> (b.word[i] & 0x1f)) +
                  (((s32)a.word[i] >> ((b.word[i] & 0x1f) - 1)) & 0x1);
  }
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 3 2
3C5000 3 2

__m128i __lsx_vsrar_d (__m128i a, __m128i b)

Synopsis

__m128i __lsx_vsrar_d (__m128i a, __m128i b)
#include <lsxintrin.h>
Instruction: vsrar.d vr, vr, vr
CPU Flags: LSX

Description

Arithmetic right shift (with rounding) the signed 64-bit elements in a by elements in b, store the result to dst.

Operation

for (int i = 0; i < 2; i++) {
  if ((b.dword[i] & 0x3f) == 0) {
    dst.dword[i] = a.dword[i];
  } else {
    dst.dword[i] = ((s64)a.dword[i] >> (b.dword[i] & 0x3f)) +
                   (((s64)a.dword[i] >> ((b.dword[i] & 0x3f) - 1)) & 0x1);
  }
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 3 2
3C5000 3 2

__m128i __lsx_vsrari_b (__m128i a, imm0_7 imm)

Synopsis

__m128i __lsx_vsrari_b (__m128i a, imm0_7 imm)
#include <lsxintrin.h>
Instruction: vsrari.b vr, vr, imm
CPU Flags: LSX

Description

Arithmetic right shift (with rounding) the signed 8-bit elements in a by imm, store the result to dst.

Operation

for (int i = 0; i < 16; i++) {
  if (imm == 0) {
    dst.byte[i] = a.byte[i];
  } else {
    dst.byte[i] = ((s8)a.byte[i] >> imm) + (((s8)a.byte[i] >> (imm - 1)) & 0x1);
  }
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 3 2
3C5000 3 2

__m128i __lsx_vsrari_h (__m128i a, imm0_15 imm)

Synopsis

__m128i __lsx_vsrari_h (__m128i a, imm0_15 imm)
#include <lsxintrin.h>
Instruction: vsrari.h vr, vr, imm
CPU Flags: LSX

Description

Arithmetic right shift (with rounding) the signed 16-bit elements in a by imm, store the result to dst.

Operation

for (int i = 0; i < 8; i++) {
  if (imm == 0) {
    dst.half[i] = a.half[i];
  } else {
    dst.half[i] =
        ((s16)a.half[i] >> imm) + (((s16)a.half[i] >> (imm - 1)) & 0x1);
  }
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 3 2
3C5000 3 2

__m128i __lsx_vsrari_w (__m128i a, imm0_31 imm)

Synopsis

__m128i __lsx_vsrari_w (__m128i a, imm0_31 imm)
#include <lsxintrin.h>
Instruction: vsrari.w vr, vr, imm
CPU Flags: LSX

Description

Arithmetic right shift (with rounding) the signed 32-bit elements in a by imm, store the result to dst.

Operation

for (int i = 0; i < 4; i++) {
  if (imm == 0) {
    dst.word[i] = a.word[i];
  } else {
    dst.word[i] =
        ((s32)a.word[i] >> imm) + (((s32)a.word[i] >> (imm - 1)) & 0x1);
  }
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 3 2
3C5000 3 2

__m128i __lsx_vsrari_d (__m128i a, imm0_63 imm)

Synopsis

__m128i __lsx_vsrari_d (__m128i a, imm0_63 imm)
#include <lsxintrin.h>
Instruction: vsrari.d vr, vr, imm
CPU Flags: LSX

Description

Arithmetic right shift (with rounding) the signed 64-bit elements in a by imm, store the result to dst.

Operation

for (int i = 0; i < 2; i++) {
  if (imm == 0) {
    dst.dword[i] = a.dword[i];
  } else {
    dst.dword[i] =
        ((s64)a.dword[i] >> imm) + (((s64)a.dword[i] >> (imm - 1)) & 0x1);
  }
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 3 2
3C5000 3 2

__m128i __lsx_vsrarn_b_h (__m128i a, __m128i b)

Synopsis

__m128i __lsx_vsrarn_b_h (__m128i a, __m128i b)
#include <lsxintrin.h>
Instruction: vsrarn.b.h vr, vr, vr
CPU Flags: LSX

Description

Arithmetic right shift (with rounding) the signed 16-bit elements in a by elements in b, truncate to 8-bit and store the result to dst.

Operation

for (int i = 0; i < 16; i++) {
  if (i < 8) {
    u8 shift = (b.half[i] & 15);
    if (shift == 0) {
      dst.byte[i] = (s8)(s16)a.half[i];
    } else {
      dst.byte[i] = (s8)(((s16)a.half[i] >> shift) +
                         (((s16)a.half[i] >> (shift - 1)) & 0x1));
    }
  } else {
    dst.byte[i] = 0;
  }
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 4 2
3C5000 4 1

__m128i __lsx_vsrarn_h_w (__m128i a, __m128i b)

Synopsis

__m128i __lsx_vsrarn_h_w (__m128i a, __m128i b)
#include <lsxintrin.h>
Instruction: vsrarn.h.w vr, vr, vr
CPU Flags: LSX

Description

Arithmetic right shift (with rounding) the signed 32-bit elements in a by elements in b, truncate to 16-bit and store the result to dst.

Operation

for (int i = 0; i < 8; i++) {
  if (i < 4) {
    u8 shift = (b.word[i] & 31);
    if (shift == 0) {
      dst.half[i] = (s16)(s32)a.word[i];
    } else {
      dst.half[i] = (s16)(((s32)a.word[i] >> shift) +
                          (((s32)a.word[i] >> (shift - 1)) & 0x1));
    }
  } else {
    dst.half[i] = 0;
  }
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 4 2
3C5000 4 1

__m128i __lsx_vsrarn_w_d (__m128i a, __m128i b)

Synopsis

__m128i __lsx_vsrarn_w_d (__m128i a, __m128i b)
#include <lsxintrin.h>
Instruction: vsrarn.w.d vr, vr, vr
CPU Flags: LSX

Description

Arithmetic right shift (with rounding) the signed 64-bit elements in a by elements in b, truncate to 32-bit and store the result to dst.

Operation

for (int i = 0; i < 4; i++) {
  if (i < 2) {
    u8 shift = (b.dword[i] & 63);
    if (shift == 0) {
      dst.word[i] = (s32)(s64)a.dword[i];
    } else {
      dst.word[i] = (s32)(((s64)a.dword[i] >> shift) +
                          (((s64)a.dword[i] >> (shift - 1)) & 0x1));
    }
  } else {
    dst.word[i] = 0;
  }
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 4 2
3C5000 4 1

__m128i __lsx_vsrarni_b_h (__m128i a, __m128i b, imm0_15 imm)

Synopsis

__m128i __lsx_vsrarni_b_h (__m128i a, __m128i b, imm0_15 imm)
#include <lsxintrin.h>
Instruction: vsrarni.b.h vr, vr, imm
CPU Flags: LSX

Description

Arithmetic right shift (with rounding) the signed 16-bit elements in a and b by imm, truncate to 8-bit and store the result to dst.

Operation

for (int i = 0; i < 16; i++) {
  if (i < 8) {
    if (imm == 0) {
      dst.byte[i] = (s8)(s16)b.half[i];
    } else {
      dst.byte[i] =
          (s8)(((s16)b.half[i] >> imm) + (((s16)b.half[i] >> (imm - 1)) & 0x1));
    }
  } else {
    if (imm == 0) {
      dst.byte[i] = (s8)(s16)a.half[i - 8];
    } else {
      dst.byte[i] = (s8)(((s16)a.half[i - 8] >> imm) +
                         (((s16)a.half[i - 8] >> (imm - 1)) & 0x1));
    }
  }
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 4 2
3C5000 4 1

__m128i __lsx_vsrarni_h_w (__m128i a, __m128i b, imm0_31 imm)

Synopsis

__m128i __lsx_vsrarni_h_w (__m128i a, __m128i b, imm0_31 imm)
#include <lsxintrin.h>
Instruction: vsrarni.h.w vr, vr, imm
CPU Flags: LSX

Description

Arithmetic right shift (with rounding) the signed 32-bit elements in a and b by imm, truncate to 16-bit and store the result to dst.

Operation

for (int i = 0; i < 8; i++) {
  if (i < 4) {
    if (imm == 0) {
      dst.half[i] = (s16)(s32)b.word[i];
    } else {
      dst.half[i] = (s16)(((s32)b.word[i] >> imm) +
                          (((s32)b.word[i] >> (imm - 1)) & 0x1));
    }
  } else {
    if (imm == 0) {
      dst.half[i] = (s16)(s32)a.word[i - 4];
    } else {
      dst.half[i] = (s16)(((s32)a.word[i - 4] >> imm) +
                          (((s32)a.word[i - 4] >> (imm - 1)) & 0x1));
    }
  }
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 4 2
3C5000 4 1

__m128i __lsx_vsrarni_w_d (__m128i a, __m128i b, imm0_63 imm)

Synopsis

__m128i __lsx_vsrarni_w_d (__m128i a, __m128i b, imm0_63 imm)
#include <lsxintrin.h>
Instruction: vsrarni.w.d vr, vr, imm
CPU Flags: LSX

Description

Arithmetic right shift (with rounding) the signed 64-bit elements in a and b by imm, truncate to 32-bit and store the result to dst.

Operation

for (int i = 0; i < 4; i++) {
  if (i < 2) {
    if (imm == 0) {
      dst.word[i] = (s32)(s64)b.dword[i];
    } else {
      dst.word[i] = (s32)(((s64)b.dword[i] >> imm) +
                          (((s64)b.dword[i] >> (imm - 1)) & 0x1));
    }
  } else {
    if (imm == 0) {
      dst.word[i] = (s32)(s64)a.dword[i - 2];
    } else {
      dst.word[i] = (s32)(((s64)a.dword[i - 2] >> imm) +
                          (((s64)a.dword[i - 2] >> (imm - 1)) & 0x1));
    }
  }
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 4 2
3C5000 4 1

__m128i __lsx_vsrarni_d_q (__m128i a, __m128i b, imm0_127 imm)

Synopsis

__m128i __lsx_vsrarni_d_q (__m128i a, __m128i b, imm0_127 imm)
#include <lsxintrin.h>
Instruction: vsrarni.d.q vr, vr, imm
CPU Flags: LSX

Description

Arithmetic right shift (with rounding) the signed 128-bit elements in a and b by imm, truncate to 64-bit and store the result to dst.

Operation

for (int i = 0; i < 2; i++) {
  if (i < 1) {
    if (imm == 0) {
      dst.dword[i] = (s64)(s128)b.qword[i];
    } else {
      dst.dword[i] = (s64)(((s128)b.qword[i] >> imm) +
                           (((s128)b.qword[i] >> (imm - 1)) & 0x1));
    }
  } else {
    if (imm == 0) {
      dst.dword[i] = (s64)(s128)a.qword[i - 1];
    } else {
      dst.dword[i] = (s64)(((s128)a.qword[i - 1] >> imm) +
                           (((s128)a.qword[i - 1] >> (imm - 1)) & 0x1));
    }
  }
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 3 2
3C5000 3 2

__m128i __lsx_vsrl_b (__m128i a, __m128i b)

Synopsis

__m128i __lsx_vsrl_b (__m128i a, __m128i b)
#include <lsxintrin.h>
Instruction: vsrl.b vr, vr, vr
CPU Flags: LSX

Description

Logical right shift the unsigned 8-bit elements in a by elements in b, store the result to dst.

Operation

for (int i = 0; i < 16; i++) {
  dst.byte[i] = a.byte[i] >> (b.byte[i] & 0x7);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 1 4
3C5000 1 2

__m128i __lsx_vsrl_h (__m128i a, __m128i b)

Synopsis

__m128i __lsx_vsrl_h (__m128i a, __m128i b)
#include <lsxintrin.h>
Instruction: vsrl.h vr, vr, vr
CPU Flags: LSX

Description

Logical right shift the unsigned 16-bit elements in a by elements in b, store the result to dst.

Operation

for (int i = 0; i < 8; i++) {
  dst.half[i] = a.half[i] >> (b.half[i] & 0xf);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 1 4
3C5000 1 2

__m128i __lsx_vsrl_w (__m128i a, __m128i b)

Synopsis

__m128i __lsx_vsrl_w (__m128i a, __m128i b)
#include <lsxintrin.h>
Instruction: vsrl.w vr, vr, vr
CPU Flags: LSX

Description

Logical right shift the unsigned 32-bit elements in a by elements in b, store the result to dst.

Operation

for (int i = 0; i < 4; i++) {
  dst.word[i] = a.word[i] >> (b.word[i] & 0x1f);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 1 4
3C5000 1 2

__m128i __lsx_vsrl_d (__m128i a, __m128i b)

Synopsis

__m128i __lsx_vsrl_d (__m128i a, __m128i b)
#include <lsxintrin.h>
Instruction: vsrl.d vr, vr, vr
CPU Flags: LSX

Description

Logical right shift the unsigned 64-bit elements in a by elements in b, store the result to dst.

Operation

for (int i = 0; i < 2; i++) {
  dst.dword[i] = a.dword[i] >> (b.dword[i] & 0x3f);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 1 4
3C5000 1 2

__m128i __lsx_vsrli_b (__m128i a, imm0_7 imm)

Synopsis

__m128i __lsx_vsrli_b (__m128i a, imm0_7 imm)
#include <lsxintrin.h>
Instruction: vsrli.b vr, vr, imm
CPU Flags: LSX

Description

Logical right shift the unsigned 8-bit elements in a by imm, store the result to dst.

Operation

for (int i = 0; i < 16; i++) {
  dst.byte[i] = a.byte[i] >> imm;
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 1 4
3C5000 1 2

__m128i __lsx_vsrli_h (__m128i a, imm0_15 imm)

Synopsis

__m128i __lsx_vsrli_h (__m128i a, imm0_15 imm)
#include <lsxintrin.h>
Instruction: vsrli.h vr, vr, imm
CPU Flags: LSX

Description

Logical right shift the unsigned 16-bit elements in a by imm, store the result to dst.

Operation

for (int i = 0; i < 8; i++) {
  dst.half[i] = a.half[i] >> imm;
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 1 4
3C5000 1 2

__m128i __lsx_vsrli_w (__m128i a, imm0_31 imm)

Synopsis

__m128i __lsx_vsrli_w (__m128i a, imm0_31 imm)
#include <lsxintrin.h>
Instruction: vsrli.w vr, vr, imm
CPU Flags: LSX

Description

Logical right shift the unsigned 32-bit elements in a by imm, store the result to dst.

Operation

for (int i = 0; i < 4; i++) {
  dst.word[i] = a.word[i] >> imm;
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 1 4
3C5000 1 2

__m128i __lsx_vsrli_d (__m128i a, imm0_63 imm)

Synopsis

__m128i __lsx_vsrli_d (__m128i a, imm0_63 imm)
#include <lsxintrin.h>
Instruction: vsrli.d vr, vr, imm
CPU Flags: LSX

Description

Logical right shift the unsigned 64-bit elements in a by imm, store the result to dst.

Operation

for (int i = 0; i < 2; i++) {
  dst.dword[i] = a.dword[i] >> imm;
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 1 4
3C5000 1 2

__m128i __lsx_vsrln_b_h (__m128i a, __m128i b)

Synopsis

__m128i __lsx_vsrln_b_h (__m128i a, __m128i b)
#include <lsxintrin.h>
Instruction: vsrln.b.h vr, vr, vr
CPU Flags: LSX

Description

Logical right shift the unsigned 16-bit elements in a by elements in b, truncate to 8-bit and store the result to dst.

Operation

for (int i = 0; i < 16; i++) {
  dst.byte[i] = (i < 8) ? (u8)((u16)a.half[i] >> (b.half[i] & 15)) : 0;
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 2 2
3C5000 2 1

__m128i __lsx_vsrln_h_w (__m128i a, __m128i b)

Synopsis

__m128i __lsx_vsrln_h_w (__m128i a, __m128i b)
#include <lsxintrin.h>
Instruction: vsrln.h.w vr, vr, vr
CPU Flags: LSX

Description

Logical right shift the unsigned 32-bit elements in a by elements in b, truncate to 16-bit and store the result to dst.

Operation

for (int i = 0; i < 8; i++) {
  dst.half[i] = (i < 4) ? (u16)((u32)a.word[i] >> (b.word[i] & 31)) : 0;
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 2 2
3C5000 2 1

__m128i __lsx_vsrln_w_d (__m128i a, __m128i b)

Synopsis

__m128i __lsx_vsrln_w_d (__m128i a, __m128i b)
#include <lsxintrin.h>
Instruction: vsrln.w.d vr, vr, vr
CPU Flags: LSX

Description

Logical right shift the unsigned 64-bit elements in a by elements in b, truncate to 32-bit and store the result to dst.

Operation

for (int i = 0; i < 4; i++) {
  dst.word[i] = (i < 2) ? (u32)((u64)a.dword[i] >> (b.dword[i] & 63)) : 0;
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 2 2
3C5000 2 1

__m128i __lsx_vsrlni_b_h (__m128i a, __m128i b, imm0_15 imm)

Synopsis

__m128i __lsx_vsrlni_b_h (__m128i a, __m128i b, imm0_15 imm)
#include <lsxintrin.h>
Instruction: vsrlni.b.h vr, vr, imm
CPU Flags: LSX

Description

Logical right shift the unsigned 16-bit elements in a and b by imm, truncate to 8-bit and store the result to dst.

Operation

for (int i = 0; i < 16; i++) {
  dst.byte[i] =
      (i < 8) ? (u8)((u16)b.half[i] >> imm) : (u8)((u16)a.half[i - 8] >> imm);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 4 2
3C5000 4 1

__m128i __lsx_vsrlni_h_w (__m128i a, __m128i b, imm0_31 imm)

Synopsis

__m128i __lsx_vsrlni_h_w (__m128i a, __m128i b, imm0_31 imm)
#include <lsxintrin.h>
Instruction: vsrlni.h.w vr, vr, imm
CPU Flags: LSX

Description

Logical right shift the unsigned 32-bit elements in a and b by imm, truncate to 16-bit and store the result to dst.

Operation

for (int i = 0; i < 8; i++) {
  dst.half[i] =
      (i < 4) ? (u16)((u32)b.word[i] >> imm) : (u16)((u32)a.word[i - 4] >> imm);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 4 2
3C5000 4 1

__m128i __lsx_vsrlni_w_d (__m128i a, __m128i b, imm0_63 imm)

Synopsis

__m128i __lsx_vsrlni_w_d (__m128i a, __m128i b, imm0_63 imm)
#include <lsxintrin.h>
Instruction: vsrlni.w.d vr, vr, imm
CPU Flags: LSX

Description

Logical right shift the unsigned 64-bit elements in a and b by imm, truncate to 32-bit and store the result to dst.

Operation

for (int i = 0; i < 4; i++) {
  dst.word[i] = (i < 2) ? (u32)((u64)b.dword[i] >> imm)
                        : (u32)((u64)a.dword[i - 2] >> imm);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 4 2
3C5000 4 1

__m128i __lsx_vsrlni_d_q (__m128i a, __m128i b, imm0_127 imm)

Synopsis

__m128i __lsx_vsrlni_d_q (__m128i a, __m128i b, imm0_127 imm)
#include <lsxintrin.h>
Instruction: vsrlni.d.q vr, vr, imm
CPU Flags: LSX

Description

Logical right shift the unsigned 128-bit elements in a and b by imm, truncate to 64-bit and store the result to dst.

Operation

for (int i = 0; i < 2; i++) {
  dst.dword[i] = (i < 1) ? (u64)((u128)b.qword[i] >> imm)
                         : (u64)((u128)a.qword[i - 1] >> imm);
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 3 2
3C5000 3 2

__m128i __lsx_vsrlr_b (__m128i a, __m128i b)

Synopsis

__m128i __lsx_vsrlr_b (__m128i a, __m128i b)
#include <lsxintrin.h>
Instruction: vsrlr.b vr, vr, vr
CPU Flags: LSX

Description

Logical right shift (with rounding) the unsigned 8-bit elements in a by elements in b, store the result to dst.

Operation

for (int i = 0; i < 16; i++) {
  if ((b.byte[i] & 0x7) == 0) {
    dst.byte[i] = a.byte[i];
  } else {
    dst.byte[i] = (a.byte[i] >> (b.byte[i] & 0x7)) +
                  ((a.byte[i] >> ((b.byte[i] & 0x7) - 1)) & 0x1);
  }
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 3 2
3C5000 3 2

__m128i __lsx_vsrlr_h (__m128i a, __m128i b)

Synopsis

__m128i __lsx_vsrlr_h (__m128i a, __m128i b)
#include <lsxintrin.h>
Instruction: vsrlr.h vr, vr, vr
CPU Flags: LSX

Description

Logical right shift (with rounding) the unsigned 16-bit elements in a by elements in b, store the result to dst.

Operation

for (int i = 0; i < 8; i++) {
  if ((b.half[i] & 0xf) == 0) {
    dst.half[i] = a.half[i];
  } else {
    dst.half[i] = (a.half[i] >> (b.half[i] & 0xf)) +
                  ((a.half[i] >> ((b.half[i] & 0xf) - 1)) & 0x1);
  }
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 3 2
3C5000 3 2

__m128i __lsx_vsrlr_w (__m128i a, __m128i b)

Synopsis

__m128i __lsx_vsrlr_w (__m128i a, __m128i b)
#include <lsxintrin.h>
Instruction: vsrlr.w vr, vr, vr
CPU Flags: LSX

Description

Logical right shift (with rounding) the unsigned 32-bit elements in a by elements in b, store the result to dst.

Operation

for (int i = 0; i < 4; i++) {
  if ((b.word[i] & 0x1f) == 0) {
    dst.word[i] = a.word[i];
  } else {
    dst.word[i] = (a.word[i] >> (b.word[i] & 0x1f)) +
                  ((a.word[i] >> ((b.word[i] & 0x1f) - 1)) & 0x1);
  }
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 3 2
3C5000 3 2

__m128i __lsx_vsrlr_d (__m128i a, __m128i b)

Synopsis

__m128i __lsx_vsrlr_d (__m128i a, __m128i b)
#include <lsxintrin.h>
Instruction: vsrlr.d vr, vr, vr
CPU Flags: LSX

Description

Logical right shift (with rounding) the unsigned 64-bit elements in a by elements in b, store the result to dst.

Operation

for (int i = 0; i < 2; i++) {
  if ((b.dword[i] & 0x3f) == 0) {
    dst.dword[i] = a.dword[i];
  } else {
    dst.dword[i] = (a.dword[i] >> (b.dword[i] & 0x3f)) +
                   ((a.dword[i] >> ((b.dword[i] & 0x3f) - 1)) & 0x1);
  }
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 3 2
3C5000 3 2

__m128i __lsx_vsrlri_b (__m128i a, imm0_7 imm)

Synopsis

__m128i __lsx_vsrlri_b (__m128i a, imm0_7 imm)
#include <lsxintrin.h>
Instruction: vsrlri.b vr, vr, imm
CPU Flags: LSX

Description

Logical right shift (with rounding) the unsigned 8-bit elements in a by imm, store the result to dst.

Operation

for (int i = 0; i < 16; i++) {
  if (imm == 0) {
    dst.byte[i] = a.byte[i];
  } else {
    dst.byte[i] = (a.byte[i] >> imm) + ((a.byte[i] >> (imm - 1)) & 0x1);
  }
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 3 2
3C5000 3 2

__m128i __lsx_vsrlri_h (__m128i a, imm0_15 imm)

Synopsis

__m128i __lsx_vsrlri_h (__m128i a, imm0_15 imm)
#include <lsxintrin.h>
Instruction: vsrlri.h vr, vr, imm
CPU Flags: LSX

Description

Logical right shift (with rounding) the unsigned 16-bit elements in a by imm, store the result to dst.

Operation

for (int i = 0; i < 8; i++) {
  if (imm == 0) {
    dst.half[i] = a.half[i];
  } else {
    dst.half[i] = (a.half[i] >> imm) + ((a.half[i] >> (imm - 1)) & 0x1);
  }
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 3 2
3C5000 3 2

__m128i __lsx_vsrlri_w (__m128i a, imm0_31 imm)

Synopsis

__m128i __lsx_vsrlri_w (__m128i a, imm0_31 imm)
#include <lsxintrin.h>
Instruction: vsrlri.w vr, vr, imm
CPU Flags: LSX

Description

Logical right shift (with rounding) the unsigned 32-bit elements in a by imm, store the result to dst.

Operation

for (int i = 0; i < 4; i++) {
  if (imm == 0) {
    dst.word[i] = a.word[i];
  } else {
    dst.word[i] = (a.word[i] >> imm) + ((a.word[i] >> (imm - 1)) & 0x1);
  }
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 3 2
3C5000 3 2

__m128i __lsx_vsrlri_d (__m128i a, imm0_63 imm)

Synopsis

__m128i __lsx_vsrlri_d (__m128i a, imm0_63 imm)
#include <lsxintrin.h>
Instruction: vsrlri.d vr, vr, imm
CPU Flags: LSX

Description

Logical right shift (with rounding) the unsigned 64-bit elements in a by imm, store the result to dst.

Operation

for (int i = 0; i < 2; i++) {
  if (imm == 0) {
    dst.dword[i] = a.dword[i];
  } else {
    dst.dword[i] = (a.dword[i] >> imm) + ((a.dword[i] >> (imm - 1)) & 0x1);
  }
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 3 2
3C5000 3 2

__m128i __lsx_vsrlrn_b_h (__m128i a, __m128i b)

Synopsis

__m128i __lsx_vsrlrn_b_h (__m128i a, __m128i b)
#include <lsxintrin.h>
Instruction: vsrlrn.b.h vr, vr, vr
CPU Flags: LSX

Description

Logical right shift (with rounding) the unsigned 16-bit elements in a by elements in b, truncate to 8-bit and store the result to dst.

Operation

for (int i = 0; i < 16; i++) {
  if (i < 8) {
    u8 shift = (b.half[i] & 15);
    if (shift == 0) {
      dst.byte[i] = (u8)(u16)a.half[i];
    } else {
      dst.byte[i] = (u8)(((u16)a.half[i] >> shift) +
                         (((u16)a.half[i] >> (shift - 1)) & 0x1));
    }
  } else {
    dst.byte[i] = 0;
  }
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 4 2
3C5000 4 1

__m128i __lsx_vsrlrn_h_w (__m128i a, __m128i b)

Synopsis

__m128i __lsx_vsrlrn_h_w (__m128i a, __m128i b)
#include <lsxintrin.h>
Instruction: vsrlrn.h.w vr, vr, vr
CPU Flags: LSX

Description

Logical right shift (with rounding) the unsigned 32-bit elements in a by elements in b, truncate to 16-bit and store the result to dst.

Operation

for (int i = 0; i < 8; i++) {
  if (i < 4) {
    u8 shift = (b.word[i] & 31);
    if (shift == 0) {
      dst.half[i] = (u16)(u32)a.word[i];
    } else {
      dst.half[i] = (u16)(((u32)a.word[i] >> shift) +
                          (((u32)a.word[i] >> (shift - 1)) & 0x1));
    }
  } else {
    dst.half[i] = 0;
  }
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 4 2
3C5000 4 1

__m128i __lsx_vsrlrn_w_d (__m128i a, __m128i b)

Synopsis

__m128i __lsx_vsrlrn_w_d (__m128i a, __m128i b)
#include <lsxintrin.h>
Instruction: vsrlrn.w.d vr, vr, vr
CPU Flags: LSX

Description

Logical right shift (with rounding) the unsigned 64-bit elements in a by elements in b, truncate to 32-bit and store the result to dst.

Operation

for (int i = 0; i < 4; i++) {
  if (i < 2) {
    u8 shift = (b.dword[i] & 63);
    if (shift == 0) {
      dst.word[i] = (u32)(u64)a.dword[i];
    } else {
      dst.word[i] = (u32)(((u64)a.dword[i] >> shift) +
                          (((u64)a.dword[i] >> (shift - 1)) & 0x1));
    }
  } else {
    dst.word[i] = 0;
  }
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 4 2
3C5000 4 1

__m128i __lsx_vsrlrni_b_h (__m128i a, __m128i b, imm0_15 imm)

Synopsis

__m128i __lsx_vsrlrni_b_h (__m128i a, __m128i b, imm0_15 imm)
#include <lsxintrin.h>
Instruction: vsrlrni.b.h vr, vr, imm
CPU Flags: LSX

Description

Logical right shift (with rounding) the unsigned 16-bit elements in a and b by imm, truncate to 8-bit and store the result to dst.

Operation

for (int i = 0; i < 16; i++) {
  if (i < 8) {
    if (imm == 0) {
      dst.byte[i] = (u8)(u16)b.half[i];
    } else {
      dst.byte[i] =
          (u8)(((u16)b.half[i] >> imm) + (((u16)b.half[i] >> (imm - 1)) & 0x1));
    }
  } else {
    if (imm == 0) {
      dst.byte[i] = (u8)(u16)a.half[i - 8];
    } else {
      dst.byte[i] = (u8)(((u16)a.half[i - 8] >> imm) +
                         (((u16)a.half[i - 8] >> (imm - 1)) & 0x1));
    }
  }
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 4 2
3C5000 4 1

__m128i __lsx_vsrlrni_h_w (__m128i a, __m128i b, imm0_31 imm)

Synopsis

__m128i __lsx_vsrlrni_h_w (__m128i a, __m128i b, imm0_31 imm)
#include <lsxintrin.h>
Instruction: vsrlrni.h.w vr, vr, imm
CPU Flags: LSX

Description

Logical right shift (with rounding) the unsigned 32-bit elements in a and b by imm, truncate to 16-bit and store the result to dst.

Operation

for (int i = 0; i < 8; i++) {
  if (i < 4) {
    if (imm == 0) {
      dst.half[i] = (u16)(u32)b.word[i];
    } else {
      dst.half[i] = (u16)(((u32)b.word[i] >> imm) +
                          (((u32)b.word[i] >> (imm - 1)) & 0x1));
    }
  } else {
    if (imm == 0) {
      dst.half[i] = (u16)(u32)a.word[i - 4];
    } else {
      dst.half[i] = (u16)(((u32)a.word[i - 4] >> imm) +
                          (((u32)a.word[i - 4] >> (imm - 1)) & 0x1));
    }
  }
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 4 2
3C5000 4 1

__m128i __lsx_vsrlrni_w_d (__m128i a, __m128i b, imm0_63 imm)

Synopsis

__m128i __lsx_vsrlrni_w_d (__m128i a, __m128i b, imm0_63 imm)
#include <lsxintrin.h>
Instruction: vsrlrni.w.d vr, vr, imm
CPU Flags: LSX

Description

Logical right shift (with rounding) the unsigned 64-bit elements in a and b by imm, truncate to 32-bit and store the result to dst.

Operation

for (int i = 0; i < 4; i++) {
  if (i < 2) {
    if (imm == 0) {
      dst.word[i] = (u32)(u64)b.dword[i];
    } else {
      dst.word[i] = (u32)(((u64)b.dword[i] >> imm) +
                          (((u64)b.dword[i] >> (imm - 1)) & 0x1));
    }
  } else {
    if (imm == 0) {
      dst.word[i] = (u32)(u64)a.dword[i - 2];
    } else {
      dst.word[i] = (u32)(((u64)a.dword[i - 2] >> imm) +
                          (((u64)a.dword[i - 2] >> (imm - 1)) & 0x1));
    }
  }
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 4 2
3C5000 4 1

__m128i __lsx_vsrlrni_d_q (__m128i a, __m128i b, imm0_127 imm)

Synopsis

__m128i __lsx_vsrlrni_d_q (__m128i a, __m128i b, imm0_127 imm)
#include <lsxintrin.h>
Instruction: vsrlrni.d.q vr, vr, imm
CPU Flags: LSX

Description

Logical right shift (with rounding) the unsigned 128-bit elements in a and b by imm, truncate to 64-bit and store the result to dst.

Operation

for (int i = 0; i < 2; i++) {
  if (i < 1) {
    if (imm == 0) {
      dst.dword[i] = (u64)(u128)b.qword[i];
    } else {
      dst.dword[i] = (u64)(((u128)b.qword[i] >> imm) +
                           (((u128)b.qword[i] >> (imm - 1)) & 0x1));
    }
  } else {
    if (imm == 0) {
      dst.dword[i] = (u64)(u128)a.qword[i - 1];
    } else {
      dst.dword[i] = (u64)(((u128)a.qword[i - 1] >> imm) +
                           (((u128)a.qword[i - 1] >> (imm - 1)) & 0x1));
    }
  }
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 3 2
3C5000 3 2

__m128i __lsx_vssran_b_h (__m128i a, __m128i b)

Synopsis

__m128i __lsx_vssran_b_h (__m128i a, __m128i b)
#include <lsxintrin.h>
Instruction: vssran.b.h vr, vr, vr
CPU Flags: LSX

Description

Arithmetic right shift the signed 16-bit elements in a by elements in b, clamp to fit in signed 8-bit integer and store the result to dst.

Operation

for (int i = 0; i < 16; i++) {
  if (i < 8) {
    s16 temp = (s16)a.half[i] >> (b.half[i] & 15);
    dst.byte[i] = clamp<s16>(temp, -128, 127);
  } else {
    dst.byte[i] = 0;
  }
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 4 2
3C5000 4 1

__m128i __lsx_vssran_bu_h (__m128i a, __m128i b)

Synopsis

__m128i __lsx_vssran_bu_h (__m128i a, __m128i b)
#include <lsxintrin.h>
Instruction: vssran.bu.h vr, vr, vr
CPU Flags: LSX

Description

Arithmetic right shift the signed 16-bit elements in a by elements in b, clamp to fit in unsigned 8-bit integer and store the result to dst.

Operation

for (int i = 0; i < 16; i++) {
  if (i < 8) {
    s16 temp = (s16)a.half[i] >> (b.half[i] & 15);
    dst.byte[i] = clamp<s16>(temp, 0, 255);
  } else {
    dst.byte[i] = 0;
  }
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 4 2
3C5000 4 1

__m128i __lsx_vssran_h_w (__m128i a, __m128i b)

Synopsis

__m128i __lsx_vssran_h_w (__m128i a, __m128i b)
#include <lsxintrin.h>
Instruction: vssran.h.w vr, vr, vr
CPU Flags: LSX

Description

Arithmetic right shift the signed 32-bit elements in a by elements in b, clamp to fit in signed 16-bit integer and store the result to dst.

Operation

for (int i = 0; i < 8; i++) {
  if (i < 4) {
    s32 temp = (s32)a.word[i] >> (b.word[i] & 31);
    dst.half[i] = clamp<s32>(temp, -32768, 32767);
  } else {
    dst.half[i] = 0;
  }
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 4 2
3C5000 4 1

__m128i __lsx_vssran_hu_w (__m128i a, __m128i b)

Synopsis

__m128i __lsx_vssran_hu_w (__m128i a, __m128i b)
#include <lsxintrin.h>
Instruction: vssran.hu.w vr, vr, vr
CPU Flags: LSX

Description

Arithmetic right shift the signed 32-bit elements in a by elements in b, clamp to fit in unsigned 16-bit integer and store the result to dst.

Operation

for (int i = 0; i < 8; i++) {
  if (i < 4) {
    s32 temp = (s32)a.word[i] >> (b.word[i] & 31);
    dst.half[i] = clamp<s32>(temp, 0, 65535);
  } else {
    dst.half[i] = 0;
  }
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 4 2
3C5000 4 1

__m128i __lsx_vssran_w_d (__m128i a, __m128i b)

Synopsis

__m128i __lsx_vssran_w_d (__m128i a, __m128i b)
#include <lsxintrin.h>
Instruction: vssran.w.d vr, vr, vr
CPU Flags: LSX

Description

Arithmetic right shift the signed 64-bit elements in a by elements in b, clamp to fit in signed 32-bit integer and store the result to dst.

Operation

for (int i = 0; i < 4; i++) {
  if (i < 2) {
    s64 temp = (s64)a.dword[i] >> (b.dword[i] & 63);
    dst.word[i] = clamp<s64>(temp, -2147483648, 2147483647);
  } else {
    dst.word[i] = 0;
  }
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 4 2
3C5000 4 1

__m128i __lsx_vssran_wu_d (__m128i a, __m128i b)

Synopsis

__m128i __lsx_vssran_wu_d (__m128i a, __m128i b)
#include <lsxintrin.h>
Instruction: vssran.wu.d vr, vr, vr
CPU Flags: LSX

Description

Arithmetic right shift the signed 64-bit elements in a by elements in b, clamp to fit in unsigned 32-bit integer and store the result to dst.

Operation

for (int i = 0; i < 4; i++) {
  if (i < 2) {
    s64 temp = (s64)a.dword[i] >> (b.dword[i] & 63);
    dst.word[i] = clamp<s64>(temp, 0, 4294967295);
  } else {
    dst.word[i] = 0;
  }
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 4 2
3C5000 4 1

__m128i __lsx_vssrani_b_h (__m128i a, __m128i b, imm0_15 imm)

Synopsis

__m128i __lsx_vssrani_b_h (__m128i a, __m128i b, imm0_15 imm)
#include <lsxintrin.h>
Instruction: vssrani.b.h vr, vr, imm
CPU Flags: LSX

Description

Arithmetic right shift the signed 16-bit elements in a and b by imm, clamp to fit in signed 8-bit integer and store the result to dst.

Operation

for (int i = 0; i < 16; i++) {
  if (i < 8) {
    s16 temp = (s16)b.half[i] >> imm;
    dst.byte[i] = clamp<s16>(temp, -128, 127);
  } else {
    s16 temp = (s16)a.half[i - 8] >> imm;
    dst.byte[i] = clamp<s16>(temp, -128, 127);
  }
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 4 2
3C5000 4 1

__m128i __lsx_vssrani_bu_h (__m128i a, __m128i b, imm0_15 imm)

Synopsis

__m128i __lsx_vssrani_bu_h (__m128i a, __m128i b, imm0_15 imm)
#include <lsxintrin.h>
Instruction: vssrani.bu.h vr, vr, imm
CPU Flags: LSX

Description

Arithmetic right shift the signed 16-bit elements in a and b by imm, clamp to fit in unsigned 8-bit integer and store the result to dst.

Operation

for (int i = 0; i < 16; i++) {
  if (i < 8) {
    s16 temp = (s16)b.half[i] >> imm;
    dst.byte[i] = clamp<s16>(temp, 0, 255);
  } else {
    s16 temp = (s16)a.half[i - 8] >> imm;
    dst.byte[i] = clamp<s16>(temp, 0, 255);
  }
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 4 2
3C5000 4 1

__m128i __lsx_vssrani_h_w (__m128i a, __m128i b, imm0_31 imm)

Synopsis

__m128i __lsx_vssrani_h_w (__m128i a, __m128i b, imm0_31 imm)
#include <lsxintrin.h>
Instruction: vssrani.h.w vr, vr, imm
CPU Flags: LSX

Description

Arithmetic right shift the signed 32-bit elements in a and b by imm, clamp to fit in signed 16-bit integer and store the result to dst.

Operation

for (int i = 0; i < 8; i++) {
  if (i < 4) {
    s32 temp = (s32)b.word[i] >> imm;
    dst.half[i] = clamp<s32>(temp, -32768, 32767);
  } else {
    s32 temp = (s32)a.word[i - 4] >> imm;
    dst.half[i] = clamp<s32>(temp, -32768, 32767);
  }
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 4 2
3C5000 4 1

__m128i __lsx_vssrani_hu_w (__m128i a, __m128i b, imm0_31 imm)

Synopsis

__m128i __lsx_vssrani_hu_w (__m128i a, __m128i b, imm0_31 imm)
#include <lsxintrin.h>
Instruction: vssrani.hu.w vr, vr, imm
CPU Flags: LSX

Description

Arithmetic right shift the signed 32-bit elements in a and b by imm, clamp to fit in unsigned 16-bit integer and store the result to dst.

Operation

for (int i = 0; i < 8; i++) {
  if (i < 4) {
    s32 temp = (s32)b.word[i] >> imm;
    dst.half[i] = clamp<s32>(temp, 0, 65535);
  } else {
    s32 temp = (s32)a.word[i - 4] >> imm;
    dst.half[i] = clamp<s32>(temp, 0, 65535);
  }
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 4 2
3C5000 4 1

__m128i __lsx_vssrani_w_d (__m128i a, __m128i b, imm0_63 imm)

Synopsis

__m128i __lsx_vssrani_w_d (__m128i a, __m128i b, imm0_63 imm)
#include <lsxintrin.h>
Instruction: vssrani.w.d vr, vr, imm
CPU Flags: LSX

Description

Arithmetic right shift the signed 64-bit elements in a and b by imm, clamp to fit in signed 32-bit integer and store the result to dst.

Operation

for (int i = 0; i < 4; i++) {
  if (i < 2) {
    s64 temp = (s64)b.dword[i] >> imm;
    dst.word[i] = clamp<s64>(temp, -2147483648, 2147483647);
  } else {
    s64 temp = (s64)a.dword[i - 2] >> imm;
    dst.word[i] = clamp<s64>(temp, -2147483648, 2147483647);
  }
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 4 2
3C5000 4 1

__m128i __lsx_vssrani_wu_d (__m128i a, __m128i b, imm0_63 imm)

Synopsis

__m128i __lsx_vssrani_wu_d (__m128i a, __m128i b, imm0_63 imm)
#include <lsxintrin.h>
Instruction: vssrani.wu.d vr, vr, imm
CPU Flags: LSX

Description

Arithmetic right shift the signed 64-bit elements in a and b by imm, clamp to fit in unsigned 32-bit integer and store the result to dst.

Operation

for (int i = 0; i < 4; i++) {
  if (i < 2) {
    s64 temp = (s64)b.dword[i] >> imm;
    dst.word[i] = clamp<s64>(temp, 0, 4294967295);
  } else {
    s64 temp = (s64)a.dword[i - 2] >> imm;
    dst.word[i] = clamp<s64>(temp, 0, 4294967295);
  }
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 4 2
3C5000 4 1

__m128i __lsx_vssrani_d_q (__m128i a, __m128i b, imm0_127 imm)

Synopsis

__m128i __lsx_vssrani_d_q (__m128i a, __m128i b, imm0_127 imm)
#include <lsxintrin.h>
Instruction: vssrani.d.q vr, vr, imm
CPU Flags: LSX

Description

Arithmetic right shift the signed 128-bit elements in a and b by imm, clamp to fit in signed 64-bit integer and store the result to dst.

Operation

for (int i = 0; i < 2; i++) {
  if (i < 1) {
    s128 temp = (s128)b.qword[i] >> imm;
    dst.dword[i] = clamp<s128>(temp, -9223372036854775808, 9223372036854775807);
  } else {
    s128 temp = (s128)a.qword[i - 1] >> imm;
    dst.dword[i] = clamp<s128>(temp, -9223372036854775808, 9223372036854775807);
  }
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 3 2
3C5000 3 2

__m128i __lsx_vssrani_du_q (__m128i a, __m128i b, imm0_127 imm)

Synopsis

__m128i __lsx_vssrani_du_q (__m128i a, __m128i b, imm0_127 imm)
#include <lsxintrin.h>
Instruction: vssrani.du.q vr, vr, imm
CPU Flags: LSX

Description

Arithmetic right shift the signed 128-bit elements in a and b by imm, clamp to fit in unsigned 64-bit integer and store the result to dst.

Operation

for (int i = 0; i < 2; i++) {
  if (i < 1) {
    s128 temp = (s128)b.qword[i] >> imm;
    dst.dword[i] = clamp<s128>(temp, 0, 18446744073709551615);
  } else {
    s128 temp = (s128)a.qword[i - 1] >> imm;
    dst.dword[i] = clamp<s128>(temp, 0, 18446744073709551615);
  }
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 3 2
3C5000 3 2

__m128i __lsx_vssrarn_b_h (__m128i a, __m128i b)

Synopsis

__m128i __lsx_vssrarn_b_h (__m128i a, __m128i b)
#include <lsxintrin.h>
Instruction: vssrarn.b.h vr, vr, vr
CPU Flags: LSX

Description

Arithmetic right shift (with rounding) the signed 16-bit elements in a by elements in b, clamp to fit in signed 8-bit integer and store the result to dst.

Operation

for (int i = 0; i < 16; i++) {
  if (i < 8) {
    s16 temp;
    if ((b.half[i] & 15) == 0) {
      temp = (s16)a.half[i];
    } else {
      temp = ((s16)a.half[i] >> (b.half[i] & 15)) +
             (((s16)a.half[i] >> ((b.half[i] & 15) - 1)) & 1);
    }
    dst.byte[i] = clamp<s16>(temp, -128, 127);
  } else {
    dst.byte[i] = 0;
  }
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 4 2
3C5000 4 1

__m128i __lsx_vssrarn_bu_h (__m128i a, __m128i b)

Synopsis

__m128i __lsx_vssrarn_bu_h (__m128i a, __m128i b)
#include <lsxintrin.h>
Instruction: vssrarn.bu.h vr, vr, vr
CPU Flags: LSX

Description

Arithmetic right shift (with rounding) the signed 16-bit elements in a by elements in b, clamp to fit in unsigned 8-bit integer and store the result to dst.

Operation

for (int i = 0; i < 16; i++) {
  if (i < 8) {
    s16 temp;
    if ((b.half[i] & 15) == 0) {
      temp = (s16)a.half[i];
    } else {
      temp = ((s16)a.half[i] >> (b.half[i] & 15)) +
             (((s16)a.half[i] >> ((b.half[i] & 15) - 1)) & 1);
    }
    dst.byte[i] = clamp<s16>(temp, 0, 255);
  } else {
    dst.byte[i] = 0;
  }
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 4 2
3C5000 4 1

__m128i __lsx_vssrarn_h_w (__m128i a, __m128i b)

Synopsis

__m128i __lsx_vssrarn_h_w (__m128i a, __m128i b)
#include <lsxintrin.h>
Instruction: vssrarn.h.w vr, vr, vr
CPU Flags: LSX

Description

Arithmetic right shift (with rounding) the signed 32-bit elements in a by elements in b, clamp to fit in signed 16-bit integer and store the result to dst.

Operation

for (int i = 0; i < 8; i++) {
  if (i < 4) {
    s32 temp;
    if ((b.word[i] & 31) == 0) {
      temp = (s32)a.word[i];
    } else {
      temp = ((s32)a.word[i] >> (b.word[i] & 31)) +
             (((s32)a.word[i] >> ((b.word[i] & 31) - 1)) & 1);
    }
    dst.half[i] = clamp<s32>(temp, -32768, 32767);
  } else {
    dst.half[i] = 0;
  }
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 4 2
3C5000 4 1

__m128i __lsx_vssrarn_hu_w (__m128i a, __m128i b)

Synopsis

__m128i __lsx_vssrarn_hu_w (__m128i a, __m128i b)
#include <lsxintrin.h>
Instruction: vssrarn.hu.w vr, vr, vr
CPU Flags: LSX

Description

Arithmetic right shift (with rounding) the signed 32-bit elements in a by elements in b, clamp to fit in unsigned 16-bit integer and store the result to dst.

Operation

for (int i = 0; i < 8; i++) {
  if (i < 4) {
    s32 temp;
    if ((b.word[i] & 31) == 0) {
      temp = (s32)a.word[i];
    } else {
      temp = ((s32)a.word[i] >> (b.word[i] & 31)) +
             (((s32)a.word[i] >> ((b.word[i] & 31) - 1)) & 1);
    }
    dst.half[i] = clamp<s32>(temp, 0, 65535);
  } else {
    dst.half[i] = 0;
  }
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 4 2
3C5000 4 1

__m128i __lsx_vssrarn_w_d (__m128i a, __m128i b)

Synopsis

__m128i __lsx_vssrarn_w_d (__m128i a, __m128i b)
#include <lsxintrin.h>
Instruction: vssrarn.w.d vr, vr, vr
CPU Flags: LSX

Description

Arithmetic right shift (with rounding) the signed 64-bit elements in a by elements in b, clamp to fit in signed 32-bit integer and store the result to dst.

Operation

for (int i = 0; i < 4; i++) {
  if (i < 2) {
    s64 temp;
    if ((b.dword[i] & 63) == 0) {
      temp = (s64)a.dword[i];
    } else {
      temp = ((s64)a.dword[i] >> (b.dword[i] & 63)) +
             (((s64)a.dword[i] >> ((b.dword[i] & 63) - 1)) & 1);
    }
    dst.word[i] = clamp<s64>(temp, -2147483648, 2147483647);
  } else {
    dst.word[i] = 0;
  }
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 4 2
3C5000 4 1

__m128i __lsx_vssrarn_wu_d (__m128i a, __m128i b)

Synopsis

__m128i __lsx_vssrarn_wu_d (__m128i a, __m128i b)
#include <lsxintrin.h>
Instruction: vssrarn.wu.d vr, vr, vr
CPU Flags: LSX

Description

Arithmetic right shift (with rounding) the signed 64-bit elements in a by elements in b, clamp to fit in unsigned 32-bit integer and store the result to dst.

Operation

for (int i = 0; i < 4; i++) {
  if (i < 2) {
    s64 temp;
    if ((b.dword[i] & 63) == 0) {
      temp = (s64)a.dword[i];
    } else {
      temp = ((s64)a.dword[i] >> (b.dword[i] & 63)) +
             (((s64)a.dword[i] >> ((b.dword[i] & 63) - 1)) & 1);
    }
    dst.word[i] = clamp<s64>(temp, 0, 4294967295);
  } else {
    dst.word[i] = 0;
  }
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 4 2
3C5000 4 1

__m128i __lsx_vssrarni_b_h (__m128i a, __m128i b, imm0_15 imm)

Synopsis

__m128i __lsx_vssrarni_b_h (__m128i a, __m128i b, imm0_15 imm)
#include <lsxintrin.h>
Instruction: vssrarni.b.h vr, vr, imm
CPU Flags: LSX

Description

Arithmetic right shift (with rounding) the signed 16-bit elements in a and b by imm, clamp to fit in signed 8-bit integer and store the result to dst.

Operation

for (int i = 0; i < 16; i++) {
  if (i < 8) {
    s16 temp;
    if (imm == 0) {
      temp = (s16)b.half[i];
    } else {
      temp = ((s16)b.half[i] >> imm) + (((s16)b.half[i] >> (imm - 1)) & 1);
    }
    dst.byte[i] = clamp<s16>(temp, -128, 127);
  } else {
    s16 temp;
    if (imm == 0) {
      temp = (s16)a.half[i - 8];
    } else {
      temp =
          ((s16)a.half[i - 8] >> imm) + (((s16)a.half[i - 8] >> (imm - 1)) & 1);
    }
    dst.byte[i] = clamp<s16>(temp, -128, 127);
  }
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 4 2
3C5000 4 1

__m128i __lsx_vssrarni_bu_h (__m128i a, __m128i b, imm0_15 imm)

Synopsis

__m128i __lsx_vssrarni_bu_h (__m128i a, __m128i b, imm0_15 imm)
#include <lsxintrin.h>
Instruction: vssrarni.bu.h vr, vr, imm
CPU Flags: LSX

Description

Arithmetic right shift (with rounding) the signed 16-bit elements in a and b by imm, clamp to fit in unsigned 8-bit integer and store the result to dst.

Operation

for (int i = 0; i < 16; i++) {
  if (i < 8) {
    s16 temp;
    if (imm == 0) {
      temp = (s16)b.half[i];
    } else {
      temp = ((s16)b.half[i] >> imm) + (((s16)b.half[i] >> (imm - 1)) & 1);
    }
    dst.byte[i] = clamp<s16>(temp, 0, 255);
  } else {
    s16 temp;
    if (imm == 0) {
      temp = (s16)a.half[i - 8];
    } else {
      temp =
          ((s16)a.half[i - 8] >> imm) + (((s16)a.half[i - 8] >> (imm - 1)) & 1);
    }
    dst.byte[i] = clamp<s16>(temp, 0, 255);
  }
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 4 2
3C5000 4 1

__m128i __lsx_vssrarni_h_w (__m128i a, __m128i b, imm0_31 imm)

Synopsis

__m128i __lsx_vssrarni_h_w (__m128i a, __m128i b, imm0_31 imm)
#include <lsxintrin.h>
Instruction: vssrarni.h.w vr, vr, imm
CPU Flags: LSX

Description

Arithmetic right shift (with rounding) the signed 32-bit elements in a and b by imm, clamp to fit in signed 16-bit integer and store the result to dst.

Operation

for (int i = 0; i < 8; i++) {
  if (i < 4) {
    s32 temp;
    if (imm == 0) {
      temp = (s32)b.word[i];
    } else {
      temp = ((s32)b.word[i] >> imm) + (((s32)b.word[i] >> (imm - 1)) & 1);
    }
    dst.half[i] = clamp<s32>(temp, -32768, 32767);
  } else {
    s32 temp;
    if (imm == 0) {
      temp = (s32)a.word[i - 4];
    } else {
      temp =
          ((s32)a.word[i - 4] >> imm) + (((s32)a.word[i - 4] >> (imm - 1)) & 1);
    }
    dst.half[i] = clamp<s32>(temp, -32768, 32767);
  }
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 4 2
3C5000 4 1

__m128i __lsx_vssrarni_hu_w (__m128i a, __m128i b, imm0_31 imm)

Synopsis

__m128i __lsx_vssrarni_hu_w (__m128i a, __m128i b, imm0_31 imm)
#include <lsxintrin.h>
Instruction: vssrarni.hu.w vr, vr, imm
CPU Flags: LSX

Description

Arithmetic right shift (with rounding) the signed 32-bit elements in a and b by imm, clamp to fit in unsigned 16-bit integer and store the result to dst.

Operation

for (int i = 0; i < 8; i++) {
  if (i < 4) {
    s32 temp;
    if (imm == 0) {
      temp = (s32)b.word[i];
    } else {
      temp = ((s32)b.word[i] >> imm) + (((s32)b.word[i] >> (imm - 1)) & 1);
    }
    dst.half[i] = clamp<s32>(temp, 0, 65535);
  } else {
    s32 temp;
    if (imm == 0) {
      temp = (s32)a.word[i - 4];
    } else {
      temp =
          ((s32)a.word[i - 4] >> imm) + (((s32)a.word[i - 4] >> (imm - 1)) & 1);
    }
    dst.half[i] = clamp<s32>(temp, 0, 65535);
  }
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 4 2
3C5000 4 1

__m128i __lsx_vssrarni_w_d (__m128i a, __m128i b, imm0_63 imm)

Synopsis

__m128i __lsx_vssrarni_w_d (__m128i a, __m128i b, imm0_63 imm)
#include <lsxintrin.h>
Instruction: vssrarni.w.d vr, vr, imm
CPU Flags: LSX

Description

Arithmetic right shift (with rounding) the signed 64-bit elements in a and b by imm, clamp to fit in signed 32-bit integer and store the result to dst.

Operation

for (int i = 0; i < 4; i++) {
  if (i < 2) {
    s64 temp;
    if (imm == 0) {
      temp = (s64)b.dword[i];
    } else {
      temp = ((s64)b.dword[i] >> imm) + (((s64)b.dword[i] >> (imm - 1)) & 1);
    }
    dst.word[i] = clamp<s64>(temp, -2147483648, 2147483647);
  } else {
    s64 temp;
    if (imm == 0) {
      temp = (s64)a.dword[i - 2];
    } else {
      temp = ((s64)a.dword[i - 2] >> imm) +
             (((s64)a.dword[i - 2] >> (imm - 1)) & 1);
    }
    dst.word[i] = clamp<s64>(temp, -2147483648, 2147483647);
  }
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 4 2
3C5000 4 1

__m128i __lsx_vssrarni_wu_d (__m128i a, __m128i b, imm0_63 imm)

Synopsis

__m128i __lsx_vssrarni_wu_d (__m128i a, __m128i b, imm0_63 imm)
#include <lsxintrin.h>
Instruction: vssrarni.wu.d vr, vr, imm
CPU Flags: LSX

Description

Arithmetic right shift (with rounding) the signed 64-bit elements in a and b by imm, clamp to fit in unsigned 32-bit integer and store the result to dst.

Operation

for (int i = 0; i < 4; i++) {
  if (i < 2) {
    s64 temp;
    if (imm == 0) {
      temp = (s64)b.dword[i];
    } else {
      temp = ((s64)b.dword[i] >> imm) + (((s64)b.dword[i] >> (imm - 1)) & 1);
    }
    dst.word[i] = clamp<s64>(temp, 0, 4294967295);
  } else {
    s64 temp;
    if (imm == 0) {
      temp = (s64)a.dword[i - 2];
    } else {
      temp = ((s64)a.dword[i - 2] >> imm) +
             (((s64)a.dword[i - 2] >> (imm - 1)) & 1);
    }
    dst.word[i] = clamp<s64>(temp, 0, 4294967295);
  }
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 4 2
3C5000 4 1

__m128i __lsx_vssrarni_d_q (__m128i a, __m128i b, imm0_127 imm)

Synopsis

__m128i __lsx_vssrarni_d_q (__m128i a, __m128i b, imm0_127 imm)
#include <lsxintrin.h>
Instruction: vssrarni.d.q vr, vr, imm
CPU Flags: LSX

Description

Arithmetic right shift (with rounding) the signed 128-bit elements in a and b by imm, clamp to fit in signed 64-bit integer and store the result to dst.

Operation

for (int i = 0; i < 2; i++) {
  if (i < 1) {
    s128 temp;
    if (imm == 0) {
      temp = (s128)b.qword[i];
    } else {
      temp = ((s128)b.qword[i] >> imm) + (((s128)b.qword[i] >> (imm - 1)) & 1);
    }
    dst.dword[i] = clamp<s128>(temp, -9223372036854775808, 9223372036854775807);
  } else {
    s128 temp;
    if (imm == 0) {
      temp = (s128)a.qword[i - 1];
    } else {
      temp = ((s128)a.qword[i - 1] >> imm) +
             (((s128)a.qword[i - 1] >> (imm - 1)) & 1);
    }
    dst.dword[i] = clamp<s128>(temp, -9223372036854775808, 9223372036854775807);
  }
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 3 2
3C5000 3 2

__m128i __lsx_vssrarni_du_q (__m128i a, __m128i b, imm0_127 imm)

Synopsis

__m128i __lsx_vssrarni_du_q (__m128i a, __m128i b, imm0_127 imm)
#include <lsxintrin.h>
Instruction: vssrarni.du.q vr, vr, imm
CPU Flags: LSX

Description

Arithmetic right shift (with rounding) the signed 128-bit elements in a and b by imm, clamp to fit in unsigned 64-bit integer and store the result to dst.

Operation

for (int i = 0; i < 2; i++) {
  if (i < 1) {
    s128 temp;
    if (imm == 0) {
      temp = (s128)b.qword[i];
    } else {
      temp = ((s128)b.qword[i] >> imm) + (((s128)b.qword[i] >> (imm - 1)) & 1);
    }
    dst.dword[i] = clamp<s128>(temp, 0, 18446744073709551615);
  } else {
    s128 temp;
    if (imm == 0) {
      temp = (s128)a.qword[i - 1];
    } else {
      temp = ((s128)a.qword[i - 1] >> imm) +
             (((s128)a.qword[i - 1] >> (imm - 1)) & 1);
    }
    dst.dword[i] = clamp<s128>(temp, 0, 18446744073709551615);
  }
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 3 2
3C5000 3 2

__m128i __lsx_vssrln_b_h (__m128i a, __m128i b)

Synopsis

__m128i __lsx_vssrln_b_h (__m128i a, __m128i b)
#include <lsxintrin.h>
Instruction: vssrln.b.h vr, vr, vr
CPU Flags: LSX

Description

Logical right shift the unsigned 16-bit elements in a by elements in b, clamp to fit in signed 8-bit integer and store the result to dst.

Operation

for (int i = 0; i < 16; i++) {
  if (i < 8) {
    u16 temp = (u16)a.half[i] >> (b.half[i] & 15);
    dst.byte[i] = clamp<u16>(temp, 0, 127);
  } else {
    dst.byte[i] = 0;
  }
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 4 2
3C5000 4 1

__m128i __lsx_vssrln_bu_h (__m128i a, __m128i b)

Synopsis

__m128i __lsx_vssrln_bu_h (__m128i a, __m128i b)
#include <lsxintrin.h>
Instruction: vssrln.bu.h vr, vr, vr
CPU Flags: LSX

Description

Logical right shift the unsigned 16-bit elements in a by elements in b, clamp to fit in unsigned 8-bit integer and store the result to dst.

Operation

for (int i = 0; i < 16; i++) {
  if (i < 8) {
    u16 temp = (u16)a.half[i] >> (b.half[i] & 15);
    dst.byte[i] = clamp<u16>(temp, 0, 255);
  } else {
    dst.byte[i] = 0;
  }
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 4 2
3C5000 4 1

__m128i __lsx_vssrln_h_w (__m128i a, __m128i b)

Synopsis

__m128i __lsx_vssrln_h_w (__m128i a, __m128i b)
#include <lsxintrin.h>
Instruction: vssrln.h.w vr, vr, vr
CPU Flags: LSX

Description

Logical right shift the unsigned 32-bit elements in a by elements in b, clamp to fit in signed 16-bit integer and store the result to dst.

Operation

for (int i = 0; i < 8; i++) {
  if (i < 4) {
    u32 temp = (u32)a.word[i] >> (b.word[i] & 31);
    dst.half[i] = clamp<u32>(temp, 0, 32767);
  } else {
    dst.half[i] = 0;
  }
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 4 2
3C5000 4 1

__m128i __lsx_vssrln_hu_w (__m128i a, __m128i b)

Synopsis

__m128i __lsx_vssrln_hu_w (__m128i a, __m128i b)
#include <lsxintrin.h>
Instruction: vssrln.hu.w vr, vr, vr
CPU Flags: LSX

Description

Logical right shift the unsigned 32-bit elements in a by elements in b, clamp to fit in unsigned 16-bit integer and store the result to dst.

Operation

for (int i = 0; i < 8; i++) {
  if (i < 4) {
    u32 temp = (u32)a.word[i] >> (b.word[i] & 31);
    dst.half[i] = clamp<u32>(temp, 0, 65535);
  } else {
    dst.half[i] = 0;
  }
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 4 2
3C5000 4 1

__m128i __lsx_vssrln_w_d (__m128i a, __m128i b)

Synopsis

__m128i __lsx_vssrln_w_d (__m128i a, __m128i b)
#include <lsxintrin.h>
Instruction: vssrln.w.d vr, vr, vr
CPU Flags: LSX

Description

Logical right shift the unsigned 64-bit elements in a by elements in b, clamp to fit in signed 32-bit integer and store the result to dst.

Operation

for (int i = 0; i < 4; i++) {
  if (i < 2) {
    u64 temp = (u64)a.dword[i] >> (b.dword[i] & 63);
    dst.word[i] = clamp<u64>(temp, 0, 2147483647);
  } else {
    dst.word[i] = 0;
  }
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 4 2
3C5000 4 1

__m128i __lsx_vssrln_wu_d (__m128i a, __m128i b)

Synopsis

__m128i __lsx_vssrln_wu_d (__m128i a, __m128i b)
#include <lsxintrin.h>
Instruction: vssrln.wu.d vr, vr, vr
CPU Flags: LSX

Description

Logical right shift the unsigned 64-bit elements in a by elements in b, clamp to fit in unsigned 32-bit integer and store the result to dst.

Operation

for (int i = 0; i < 4; i++) {
  if (i < 2) {
    u64 temp = (u64)a.dword[i] >> (b.dword[i] & 63);
    dst.word[i] = clamp<u64>(temp, 0, 4294967295);
  } else {
    dst.word[i] = 0;
  }
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 4 2
3C5000 4 1

__m128i __lsx_vssrlni_b_h (__m128i a, __m128i b, imm0_15 imm)

Synopsis

__m128i __lsx_vssrlni_b_h (__m128i a, __m128i b, imm0_15 imm)
#include <lsxintrin.h>
Instruction: vssrlni.b.h vr, vr, imm
CPU Flags: LSX

Description

Logical right shift the unsigned 16-bit elements in a and b by imm, clamp to fit in signed 8-bit integer and store the result to dst.

Operation

for (int i = 0; i < 16; i++) {
  if (i < 8) {
    u16 temp = (u16)b.half[i] >> imm;
    dst.byte[i] = clamp<u16>(temp, 0, 127);
  } else {
    u16 temp = (u16)a.half[i - 8] >> imm;
    dst.byte[i] = clamp<u16>(temp, 0, 127);
  }
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 4 2
3C5000 4 1

__m128i __lsx_vssrlni_bu_h (__m128i a, __m128i b, imm0_15 imm)

Synopsis

__m128i __lsx_vssrlni_bu_h (__m128i a, __m128i b, imm0_15 imm)
#include <lsxintrin.h>
Instruction: vssrlni.bu.h vr, vr, imm
CPU Flags: LSX

Description

Logical right shift the unsigned 16-bit elements in a and b by imm, clamp to fit in unsigned 8-bit integer and store the result to dst.

Operation

for (int i = 0; i < 16; i++) {
  if (i < 8) {
    u16 temp = (u16)b.half[i] >> imm;
    dst.byte[i] = clamp<u16>(temp, 0, 255);
  } else {
    u16 temp = (u16)a.half[i - 8] >> imm;
    dst.byte[i] = clamp<u16>(temp, 0, 255);
  }
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 4 2
3C5000 4 1

__m128i __lsx_vssrlni_h_w (__m128i a, __m128i b, imm0_31 imm)

Synopsis

__m128i __lsx_vssrlni_h_w (__m128i a, __m128i b, imm0_31 imm)
#include <lsxintrin.h>
Instruction: vssrlni.h.w vr, vr, imm
CPU Flags: LSX

Description

Logical right shift the unsigned 32-bit elements in a and b by imm, clamp to fit in signed 16-bit integer and store the result to dst.

Operation

for (int i = 0; i < 8; i++) {
  if (i < 4) {
    u32 temp = (u32)b.word[i] >> imm;
    dst.half[i] = clamp<u32>(temp, 0, 32767);
  } else {
    u32 temp = (u32)a.word[i - 4] >> imm;
    dst.half[i] = clamp<u32>(temp, 0, 32767);
  }
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 4 2
3C5000 4 1

__m128i __lsx_vssrlni_hu_w (__m128i a, __m128i b, imm0_31 imm)

Synopsis

__m128i __lsx_vssrlni_hu_w (__m128i a, __m128i b, imm0_31 imm)
#include <lsxintrin.h>
Instruction: vssrlni.hu.w vr, vr, imm
CPU Flags: LSX

Description

Logical right shift the unsigned 32-bit elements in a and b by imm, clamp to fit in unsigned 16-bit integer and store the result to dst.

Operation

for (int i = 0; i < 8; i++) {
  if (i < 4) {
    u32 temp = (u32)b.word[i] >> imm;
    dst.half[i] = clamp<u32>(temp, 0, 65535);
  } else {
    u32 temp = (u32)a.word[i - 4] >> imm;
    dst.half[i] = clamp<u32>(temp, 0, 65535);
  }
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 4 2
3C5000 4 1

__m128i __lsx_vssrlni_w_d (__m128i a, __m128i b, imm0_63 imm)

Synopsis

__m128i __lsx_vssrlni_w_d (__m128i a, __m128i b, imm0_63 imm)
#include <lsxintrin.h>
Instruction: vssrlni.w.d vr, vr, imm
CPU Flags: LSX

Description

Logical right shift the unsigned 64-bit elements in a and b by imm, clamp to fit in signed 32-bit integer and store the result to dst.

Operation

for (int i = 0; i < 4; i++) {
  if (i < 2) {
    u64 temp = (u64)b.dword[i] >> imm;
    dst.word[i] = clamp<u64>(temp, 0, 2147483647);
  } else {
    u64 temp = (u64)a.dword[i - 2] >> imm;
    dst.word[i] = clamp<u64>(temp, 0, 2147483647);
  }
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 4 2
3C5000 4 1

__m128i __lsx_vssrlni_wu_d (__m128i a, __m128i b, imm0_63 imm)

Synopsis

__m128i __lsx_vssrlni_wu_d (__m128i a, __m128i b, imm0_63 imm)
#include <lsxintrin.h>
Instruction: vssrlni.wu.d vr, vr, imm
CPU Flags: LSX

Description

Logical right shift the unsigned 64-bit elements in a and b by imm, clamp to fit in unsigned 32-bit integer and store the result to dst.

Operation

for (int i = 0; i < 4; i++) {
  if (i < 2) {
    u64 temp = (u64)b.dword[i] >> imm;
    dst.word[i] = clamp<u64>(temp, 0, 4294967295);
  } else {
    u64 temp = (u64)a.dword[i - 2] >> imm;
    dst.word[i] = clamp<u64>(temp, 0, 4294967295);
  }
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 4 2
3C5000 4 1

__m128i __lsx_vssrlni_d_q (__m128i a, __m128i b, imm0_127 imm)

Synopsis

__m128i __lsx_vssrlni_d_q (__m128i a, __m128i b, imm0_127 imm)
#include <lsxintrin.h>
Instruction: vssrlni.d.q vr, vr, imm
CPU Flags: LSX

Description

Logical right shift the unsigned 128-bit elements in a and b by imm, clamp to fit in signed 64-bit integer and store the result to dst.

Operation

for (int i = 0; i < 2; i++) {
  if (i < 1) {
    u128 temp = (u128)b.qword[i] >> imm;
    dst.dword[i] = clamp<u128>(temp, 0, 9223372036854775807);
  } else {
    u128 temp = (u128)a.qword[i - 1] >> imm;
    dst.dword[i] = clamp<u128>(temp, 0, 9223372036854775807);
  }
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 3 2
3C5000 3 2

__m128i __lsx_vssrlni_du_q (__m128i a, __m128i b, imm0_127 imm)

Synopsis

__m128i __lsx_vssrlni_du_q (__m128i a, __m128i b, imm0_127 imm)
#include <lsxintrin.h>
Instruction: vssrlni.du.q vr, vr, imm
CPU Flags: LSX

Description

Logical right shift the unsigned 128-bit elements in a and b by imm, clamp to fit in unsigned 64-bit integer and store the result to dst.

Operation

for (int i = 0; i < 2; i++) {
  if (i < 1) {
    u128 temp = (u128)b.qword[i] >> imm;
    dst.dword[i] = clamp<u128>(temp, 0, 18446744073709551615);
  } else {
    u128 temp = (u128)a.qword[i - 1] >> imm;
    dst.dword[i] = clamp<u128>(temp, 0, 18446744073709551615);
  }
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 3 2
3C5000 3 2

__m128i __lsx_vssrlrn_b_h (__m128i a, __m128i b)

Synopsis

__m128i __lsx_vssrlrn_b_h (__m128i a, __m128i b)
#include <lsxintrin.h>
Instruction: vssrlrn.b.h vr, vr, vr
CPU Flags: LSX

Description

Logical right shift (with rounding) the unsigned 16-bit elements in a by elements in b, clamp to fit in signed 8-bit integer and store the result to dst.

Operation

for (int i = 0; i < 16; i++) {
  if (i < 8) {
    u16 temp;
    if ((b.half[i] & 15) == 0) {
      temp = (u16)a.half[i];
    } else {
      temp = ((u16)a.half[i] >> (b.half[i] & 15)) +
             (((u16)a.half[i] >> ((b.half[i] & 15) - 1)) & 1);
    }
    dst.byte[i] = clamp<u16>(temp, 0, 127);
  } else {
    dst.byte[i] = 0;
  }
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 4 2
3C5000 4 1

__m128i __lsx_vssrlrn_bu_h (__m128i a, __m128i b)

Synopsis

__m128i __lsx_vssrlrn_bu_h (__m128i a, __m128i b)
#include <lsxintrin.h>
Instruction: vssrlrn.bu.h vr, vr, vr
CPU Flags: LSX

Description

Logical right shift (with rounding) the unsigned 16-bit elements in a by elements in b, clamp to fit in unsigned 8-bit integer and store the result to dst.

Operation

for (int i = 0; i < 16; i++) {
  if (i < 8) {
    u16 temp;
    if ((b.half[i] & 15) == 0) {
      temp = (u16)a.half[i];
    } else {
      temp = ((u16)a.half[i] >> (b.half[i] & 15)) +
             (((u16)a.half[i] >> ((b.half[i] & 15) - 1)) & 1);
    }
    dst.byte[i] = clamp<u16>(temp, 0, 255);
  } else {
    dst.byte[i] = 0;
  }
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 4 2
3C5000 4 1

__m128i __lsx_vssrlrn_h_w (__m128i a, __m128i b)

Synopsis

__m128i __lsx_vssrlrn_h_w (__m128i a, __m128i b)
#include <lsxintrin.h>
Instruction: vssrlrn.h.w vr, vr, vr
CPU Flags: LSX

Description

Logical right shift (with rounding) the unsigned 32-bit elements in a by elements in b, clamp to fit in signed 16-bit integer and store the result to dst.

Operation

for (int i = 0; i < 8; i++) {
  if (i < 4) {
    u32 temp;
    if ((b.word[i] & 31) == 0) {
      temp = (u32)a.word[i];
    } else {
      temp = ((u32)a.word[i] >> (b.word[i] & 31)) +
             (((u32)a.word[i] >> ((b.word[i] & 31) - 1)) & 1);
    }
    dst.half[i] = clamp<u32>(temp, 0, 32767);
  } else {
    dst.half[i] = 0;
  }
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 4 2
3C5000 4 1

__m128i __lsx_vssrlrn_hu_w (__m128i a, __m128i b)

Synopsis

__m128i __lsx_vssrlrn_hu_w (__m128i a, __m128i b)
#include <lsxintrin.h>
Instruction: vssrlrn.hu.w vr, vr, vr
CPU Flags: LSX

Description

Logical right shift (with rounding) the unsigned 32-bit elements in a by elements in b, clamp to fit in unsigned 16-bit integer and store the result to dst.

Operation

for (int i = 0; i < 8; i++) {
  if (i < 4) {
    u32 temp;
    if ((b.word[i] & 31) == 0) {
      temp = (u32)a.word[i];
    } else {
      temp = ((u32)a.word[i] >> (b.word[i] & 31)) +
             (((u32)a.word[i] >> ((b.word[i] & 31) - 1)) & 1);
    }
    dst.half[i] = clamp<u32>(temp, 0, 65535);
  } else {
    dst.half[i] = 0;
  }
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 4 2
3C5000 4 1

__m128i __lsx_vssrlrn_w_d (__m128i a, __m128i b)

Synopsis

__m128i __lsx_vssrlrn_w_d (__m128i a, __m128i b)
#include <lsxintrin.h>
Instruction: vssrlrn.w.d vr, vr, vr
CPU Flags: LSX

Description

Logical right shift (with rounding) the unsigned 64-bit elements in a by elements in b, clamp to fit in signed 32-bit integer and store the result to dst.

Operation

for (int i = 0; i < 4; i++) {
  if (i < 2) {
    u64 temp;
    if ((b.dword[i] & 63) == 0) {
      temp = (u64)a.dword[i];
    } else {
      temp = ((u64)a.dword[i] >> (b.dword[i] & 63)) +
             (((u64)a.dword[i] >> ((b.dword[i] & 63) - 1)) & 1);
    }
    dst.word[i] = clamp<u64>(temp, 0, 2147483647);
  } else {
    dst.word[i] = 0;
  }
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 4 2
3C5000 4 1

__m128i __lsx_vssrlrn_wu_d (__m128i a, __m128i b)

Synopsis

__m128i __lsx_vssrlrn_wu_d (__m128i a, __m128i b)
#include <lsxintrin.h>
Instruction: vssrlrn.wu.d vr, vr, vr
CPU Flags: LSX

Description

Logical right shift (with rounding) the unsigned 64-bit elements in a by elements in b, clamp to fit in unsigned 32-bit integer and store the result to dst.

Operation

for (int i = 0; i < 4; i++) {
  if (i < 2) {
    u64 temp;
    if ((b.dword[i] & 63) == 0) {
      temp = (u64)a.dword[i];
    } else {
      temp = ((u64)a.dword[i] >> (b.dword[i] & 63)) +
             (((u64)a.dword[i] >> ((b.dword[i] & 63) - 1)) & 1);
    }
    dst.word[i] = clamp<u64>(temp, 0, 4294967295);
  } else {
    dst.word[i] = 0;
  }
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 4 2
3C5000 4 1

__m128i __lsx_vssrlrni_b_h (__m128i a, __m128i b, imm0_15 imm)

Synopsis

__m128i __lsx_vssrlrni_b_h (__m128i a, __m128i b, imm0_15 imm)
#include <lsxintrin.h>
Instruction: vssrlrni.b.h vr, vr, imm
CPU Flags: LSX

Description

Logical right shift (with rounding) the unsigned 16-bit elements in a and b by imm, clamp to fit in signed 8-bit integer and store the result to dst.

Operation

for (int i = 0; i < 16; i++) {
  if (i < 8) {
    u16 temp;
    if (imm == 0) {
      temp = (u16)b.half[i];
    } else {
      temp = ((u16)b.half[i] >> imm) + (((u16)b.half[i] >> (imm - 1)) & 1);
    }
    dst.byte[i] = clamp<u16>(temp, 0, 127);
  } else {
    u16 temp;
    if (imm == 0) {
      temp = (u16)a.half[i - 8];
    } else {
      temp =
          ((u16)a.half[i - 8] >> imm) + (((u16)a.half[i - 8] >> (imm - 1)) & 1);
    }
    dst.byte[i] = clamp<u16>(temp, 0, 127);
  }
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 4 2
3C5000 4 1

__m128i __lsx_vssrlrni_bu_h (__m128i a, __m128i b, imm0_15 imm)

Synopsis

__m128i __lsx_vssrlrni_bu_h (__m128i a, __m128i b, imm0_15 imm)
#include <lsxintrin.h>
Instruction: vssrlrni.bu.h vr, vr, imm
CPU Flags: LSX

Description

Logical right shift (with rounding) the unsigned 16-bit elements in a and b by imm, clamp to fit in unsigned 8-bit integer and store the result to dst.

Operation

for (int i = 0; i < 16; i++) {
  if (i < 8) {
    u16 temp;
    if (imm == 0) {
      temp = (u16)b.half[i];
    } else {
      temp = ((u16)b.half[i] >> imm) + (((u16)b.half[i] >> (imm - 1)) & 1);
    }
    dst.byte[i] = clamp<u16>(temp, 0, 255);
  } else {
    u16 temp;
    if (imm == 0) {
      temp = (u16)a.half[i - 8];
    } else {
      temp =
          ((u16)a.half[i - 8] >> imm) + (((u16)a.half[i - 8] >> (imm - 1)) & 1);
    }
    dst.byte[i] = clamp<u16>(temp, 0, 255);
  }
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 4 2
3C5000 4 1

__m128i __lsx_vssrlrni_h_w (__m128i a, __m128i b, imm0_31 imm)

Synopsis

__m128i __lsx_vssrlrni_h_w (__m128i a, __m128i b, imm0_31 imm)
#include <lsxintrin.h>
Instruction: vssrlrni.h.w vr, vr, imm
CPU Flags: LSX

Description

Logical right shift (with rounding) the unsigned 32-bit elements in a and b by imm, clamp to fit in signed 16-bit integer and store the result to dst.

Operation

for (int i = 0; i < 8; i++) {
  if (i < 4) {
    u32 temp;
    if (imm == 0) {
      temp = (u32)b.word[i];
    } else {
      temp = ((u32)b.word[i] >> imm) + (((u32)b.word[i] >> (imm - 1)) & 1);
    }
    dst.half[i] = clamp<u32>(temp, 0, 32767);
  } else {
    u32 temp;
    if (imm == 0) {
      temp = (u32)a.word[i - 4];
    } else {
      temp =
          ((u32)a.word[i - 4] >> imm) + (((u32)a.word[i - 4] >> (imm - 1)) & 1);
    }
    dst.half[i] = clamp<u32>(temp, 0, 32767);
  }
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 4 2
3C5000 4 1

__m128i __lsx_vssrlrni_hu_w (__m128i a, __m128i b, imm0_31 imm)

Synopsis

__m128i __lsx_vssrlrni_hu_w (__m128i a, __m128i b, imm0_31 imm)
#include <lsxintrin.h>
Instruction: vssrlrni.hu.w vr, vr, imm
CPU Flags: LSX

Description

Logical right shift (with rounding) the unsigned 32-bit elements in a and b by imm, clamp to fit in unsigned 16-bit integer and store the result to dst.

Operation

for (int i = 0; i < 8; i++) {
  if (i < 4) {
    u32 temp;
    if (imm == 0) {
      temp = (u32)b.word[i];
    } else {
      temp = ((u32)b.word[i] >> imm) + (((u32)b.word[i] >> (imm - 1)) & 1);
    }
    dst.half[i] = clamp<u32>(temp, 0, 65535);
  } else {
    u32 temp;
    if (imm == 0) {
      temp = (u32)a.word[i - 4];
    } else {
      temp =
          ((u32)a.word[i - 4] >> imm) + (((u32)a.word[i - 4] >> (imm - 1)) & 1);
    }
    dst.half[i] = clamp<u32>(temp, 0, 65535);
  }
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 4 2
3C5000 4 1

__m128i __lsx_vssrlrni_w_d (__m128i a, __m128i b, imm0_63 imm)

Synopsis

__m128i __lsx_vssrlrni_w_d (__m128i a, __m128i b, imm0_63 imm)
#include <lsxintrin.h>
Instruction: vssrlrni.w.d vr, vr, imm
CPU Flags: LSX

Description

Logical right shift (with rounding) the unsigned 64-bit elements in a and b by imm, clamp to fit in signed 32-bit integer and store the result to dst.

Operation

for (int i = 0; i < 4; i++) {
  if (i < 2) {
    u64 temp;
    if (imm == 0) {
      temp = (u64)b.dword[i];
    } else {
      temp = ((u64)b.dword[i] >> imm) + (((u64)b.dword[i] >> (imm - 1)) & 1);
    }
    dst.word[i] = clamp<u64>(temp, 0, 2147483647);
  } else {
    u64 temp;
    if (imm == 0) {
      temp = (u64)a.dword[i - 2];
    } else {
      temp = ((u64)a.dword[i - 2] >> imm) +
             (((u64)a.dword[i - 2] >> (imm - 1)) & 1);
    }
    dst.word[i] = clamp<u64>(temp, 0, 2147483647);
  }
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 4 2
3C5000 4 1

__m128i __lsx_vssrlrni_wu_d (__m128i a, __m128i b, imm0_63 imm)

Synopsis

__m128i __lsx_vssrlrni_wu_d (__m128i a, __m128i b, imm0_63 imm)
#include <lsxintrin.h>
Instruction: vssrlrni.wu.d vr, vr, imm
CPU Flags: LSX

Description

Logical right shift (with rounding) the unsigned 64-bit elements in a and b by imm, clamp to fit in unsigned 32-bit integer and store the result to dst.

Operation

for (int i = 0; i < 4; i++) {
  if (i < 2) {
    u64 temp;
    if (imm == 0) {
      temp = (u64)b.dword[i];
    } else {
      temp = ((u64)b.dword[i] >> imm) + (((u64)b.dword[i] >> (imm - 1)) & 1);
    }
    dst.word[i] = clamp<u64>(temp, 0, 4294967295);
  } else {
    u64 temp;
    if (imm == 0) {
      temp = (u64)a.dword[i - 2];
    } else {
      temp = ((u64)a.dword[i - 2] >> imm) +
             (((u64)a.dword[i - 2] >> (imm - 1)) & 1);
    }
    dst.word[i] = clamp<u64>(temp, 0, 4294967295);
  }
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 4 2
3C5000 4 1

__m128i __lsx_vssrlrni_d_q (__m128i a, __m128i b, imm0_127 imm)

Synopsis

__m128i __lsx_vssrlrni_d_q (__m128i a, __m128i b, imm0_127 imm)
#include <lsxintrin.h>
Instruction: vssrlrni.d.q vr, vr, imm
CPU Flags: LSX

Description

Logical right shift (with rounding) the unsigned 128-bit elements in a and b by imm, clamp to fit in signed 64-bit integer and store the result to dst.

Operation

for (int i = 0; i < 2; i++) {
  if (i < 1) {
    u128 temp;
    if (imm == 0) {
      temp = (u128)b.qword[i];
    } else {
      temp = ((u128)b.qword[i] >> imm) + (((u128)b.qword[i] >> (imm - 1)) & 1);
    }
    dst.dword[i] = clamp<u128>(temp, 0, 9223372036854775807);
  } else {
    u128 temp;
    if (imm == 0) {
      temp = (u128)a.qword[i - 1];
    } else {
      temp = ((u128)a.qword[i - 1] >> imm) +
             (((u128)a.qword[i - 1] >> (imm - 1)) & 1);
    }
    dst.dword[i] = clamp<u128>(temp, 0, 9223372036854775807);
  }
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 3 2
3C5000 3 2

__m128i __lsx_vssrlrni_du_q (__m128i a, __m128i b, imm0_127 imm)

Synopsis

__m128i __lsx_vssrlrni_du_q (__m128i a, __m128i b, imm0_127 imm)
#include <lsxintrin.h>
Instruction: vssrlrni.du.q vr, vr, imm
CPU Flags: LSX

Description

Logical right shift (with rounding) the unsigned 128-bit elements in a and b by imm, clamp to fit in unsigned 64-bit integer and store the result to dst.

Operation

for (int i = 0; i < 2; i++) {
  if (i < 1) {
    u128 temp;
    if (imm == 0) {
      temp = (u128)b.qword[i];
    } else {
      temp = ((u128)b.qword[i] >> imm) + (((u128)b.qword[i] >> (imm - 1)) & 1);
    }
    dst.dword[i] = clamp<u128>(temp, 0, 18446744073709551615);
  } else {
    u128 temp;
    if (imm == 0) {
      temp = (u128)a.qword[i - 1];
    } else {
      temp = ((u128)a.qword[i - 1] >> imm) +
             (((u128)a.qword[i - 1] >> (imm - 1)) & 1);
    }
    dst.dword[i] = clamp<u128>(temp, 0, 18446744073709551615);
  }
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 3 2
3C5000 3 2

__m128i __lsx_vrotr_b (__m128i a, __m128i b)

Synopsis

__m128i __lsx_vrotr_b (__m128i a, __m128i b)
#include <lsxintrin.h>
Instruction: vrotr.b vr, vr, vr
CPU Flags: LSX

Description

Rotate right the unsigned 8-bit elements in a by elements in b, store the result to dst.

Operation

for (int i = 0; i < 16; i++) {
  dst.byte[i] =
      (a.byte[i] >> (b.byte[i] & 0x7)) | (a.byte[i] << (8 - (b.byte[i] & 0x7)));
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 1 4
3C5000 2 2

__m128i __lsx_vrotr_h (__m128i a, __m128i b)

Synopsis

__m128i __lsx_vrotr_h (__m128i a, __m128i b)
#include <lsxintrin.h>
Instruction: vrotr.h vr, vr, vr
CPU Flags: LSX

Description

Rotate right the unsigned 16-bit elements in a by elements in b, store the result to dst.

Operation

for (int i = 0; i < 8; i++) {
  dst.half[i] = (a.half[i] >> (b.half[i] & 0xf)) |
                (a.half[i] << (16 - (b.half[i] & 0xf)));
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 1 4
3C5000 2 2

__m128i __lsx_vrotr_w (__m128i a, __m128i b)

Synopsis

__m128i __lsx_vrotr_w (__m128i a, __m128i b)
#include <lsxintrin.h>
Instruction: vrotr.w vr, vr, vr
CPU Flags: LSX

Description

Rotate right the unsigned 32-bit elements in a by elements in b, store the result to dst.

Operation

for (int i = 0; i < 4; i++) {
  dst.word[i] = (a.word[i] >> (b.word[i] & 0x1f)) |
                (a.word[i] << (32 - (b.word[i] & 0x1f)));
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 1 4
3C5000 2 2

__m128i __lsx_vrotr_d (__m128i a, __m128i b)

Synopsis

__m128i __lsx_vrotr_d (__m128i a, __m128i b)
#include <lsxintrin.h>
Instruction: vrotr.d vr, vr, vr
CPU Flags: LSX

Description

Rotate right the unsigned 64-bit elements in a by elements in b, store the result to dst.

Operation

for (int i = 0; i < 2; i++) {
  dst.dword[i] = (a.dword[i] >> (b.dword[i] & 0x3f)) |
                 (a.dword[i] << (64 - (b.dword[i] & 0x3f)));
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 1 4
3C5000 2 2

__m128i __lsx_vrotri_b (__m128i a, imm0_7 imm)

Synopsis

__m128i __lsx_vrotri_b (__m128i a, imm0_7 imm)
#include <lsxintrin.h>
Instruction: vrotri.b vr, vr, imm
CPU Flags: LSX

Description

Rotate right the unsigned 8-bit elements in a by imm, store the result to dst.

Operation

for (int i = 0; i < 16; i++) {
  dst.byte[i] = (a.byte[i] >> imm) | (a.byte[i] << (8 - imm));
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 1 4
3C5000 2 2

__m128i __lsx_vrotri_h (__m128i a, imm0_15 imm)

Synopsis

__m128i __lsx_vrotri_h (__m128i a, imm0_15 imm)
#include <lsxintrin.h>
Instruction: vrotri.h vr, vr, imm
CPU Flags: LSX

Description

Rotate right the unsigned 16-bit elements in a by imm, store the result to dst.

Operation

for (int i = 0; i < 8; i++) {
  dst.half[i] = (a.half[i] >> imm) | (a.half[i] << (16 - imm));
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 1 4
3C5000 2 2

__m128i __lsx_vrotri_w (__m128i a, imm0_31 imm)

Synopsis

__m128i __lsx_vrotri_w (__m128i a, imm0_31 imm)
#include <lsxintrin.h>
Instruction: vrotri.w vr, vr, imm
CPU Flags: LSX

Description

Rotate right the unsigned 32-bit elements in a by imm, store the result to dst.

Operation

for (int i = 0; i < 4; i++) {
  dst.word[i] = (a.word[i] >> imm) | (a.word[i] << (32 - imm));
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 1 4
3C5000 2 2

__m128i __lsx_vrotri_d (__m128i a, imm0_63 imm)

Synopsis

__m128i __lsx_vrotri_d (__m128i a, imm0_63 imm)
#include <lsxintrin.h>
Instruction: vrotri.d vr, vr, imm
CPU Flags: LSX

Description

Rotate right the unsigned 64-bit elements in a by imm, store the result to dst.

Operation

for (int i = 0; i < 2; i++) {
  dst.dword[i] = (a.dword[i] >> imm) | (a.dword[i] << (64 - imm));
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (CPI)
3A6000 1 4
3C5000 2 2