Floating Point Conversion

__m256d __lasx_xvfcvth_d_s (__m256 a)

Synopsis

__m256d __lasx_xvfcvth_d_s (__m256 a)
#include <lasxintrin.h>
Instruction: xvfcvth.d.s xr, xr
CPU Flags: LASX

Description

Convert single precision floating point elements in the higher half of each 128-bit lane of a to double precision.

Operation

for (int i = 0; i < 2; i++) {
  dst.fp64[i] = a.fp32[2 + i];
}
for (int i = 2; i < 4; i++) {
  dst.fp64[i] = a.fp32[4 + i];
}

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 3 2
3C5000 3 1

__m256d __lasx_xvfcvtl_d_s (__m256 a)

Synopsis

__m256d __lasx_xvfcvtl_d_s (__m256 a)
#include <lasxintrin.h>
Instruction: xvfcvtl.d.s xr, xr
CPU Flags: LASX

Description

Convert single precision floating point elements in the lower half of each 128-bit lane of a to double precision.

Operation

for (int i = 0; i < 2; i++) {
  dst.fp64[i] = a.fp32[i];
}
for (int i = 2; i < 4; i++) {
  dst.fp64[i] = a.fp32[2 + i];
}

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 3 2
3C5000 3 1

__m256 __lasx_xvfcvt_s_d (__m256d a, __m256d b)

Synopsis

__m256 __lasx_xvfcvt_s_d (__m256d a, __m256d b)
#include <lasxintrin.h>
Instruction: xvfcvt.s.d xr, xr, xr
CPU Flags: LASX

Description

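Convert double precision floating point elements in a and b to single precision. Within each 128-bit lane, the lower half of the result comes from b and the higher half from a.

Operation

for (int i = 0; i < 4; i++) {
  dst.fp32[i] = (i < 2) ? b.fp64[i] : a.fp64[i - 2];
}
for (int i = 4; i < 8; i++) {
  dst.fp32[i] = (i < 6) ? b.fp64[i - 2] : a.fp64[i - 4];
}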

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 3 2
3C5000 3 1

__m256 __lasx_xvfcvth_s_h (__m256i a)

Synopsis

__m256 __lasx_xvfcvth_s_h (__m256i a)
#include <lasxintrin.h>
Instruction: xvfcvth.s.h xr, xr
CPU Flags: LASX

Description

Convert half precision floating point elements in the higher half of each 128-bit lane of a to single precision.

Operation

for (int i = 0; i < 4; i++) {
  dst.fp32[i] = a.fp16[4 + i];
}
for (int i = 4; i < 8; i++) {
  dst.fp32[i] = a.fp16[8 + i];
}

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 3 2
3C5000 3 1

__m256 __lasx_xvfcvtl_s_h (__m256i a)

Synopsis

__m256 __lasx_xvfcvtl_s_h (__m256i a)
#include <lasxintrin.h>
Instruction: xvfcvtl.s.h xr, xr
CPU Flags: LASX

Description

Convert half precision floating point elements in the lower half of each 128-bit lane of a to single precision.

Operation

for (int i = 0; i < 4; i++) {
  dst.fp32[i] = a.fp16[i];
}
for (int i = 4; i < 8; i++) {
  dst.fp32[i] = a.fp16[4 + i];
}

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 3 2
3C5000 3 1

__m256i __lasx_xvfcvt_h_s (__m256 a, __m256 b)

Synopsis

__m256i __lasx_xvfcvt_h_s (__m256 a, __m256 b)
#include <lasxintrin.h>
Instruction: xvfcvt.h.s xr, xr, xr
CPU Flags: LASX

Description

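Convert single precision floating point elements in a and b to half precision. Within each 128-bit lane, the lower half of the result comes from b and the higher half from a.

Operation

for (int i = 0; i < 8; i++) {
  dst.fp16[i] = (i < 4) ? b.fp32[i] : a.fp32[i - 4];
}
for (int i = 8; i < 16; i++) {
  dst.fp16[i] = (i < 12) ? b.fp32[i - 4] : a.fp32[i - 8];
}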

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 3 2
3C5000 3 1

__m256d __lasx_xvffinth_d_w (__m256i a)

Synopsis

__m256d __lasx_xvffinth_d_w (__m256i a)
#include <lasxintrin.h>
Instruction: xvffinth.d.w xr, xr
CPU Flags: LASX

Description

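Convert signed 32-bit integer elements in the higher half of each 128-bit lane of a to double precision floating point numbers.

Operation

for (int i = 0; i < 2; i++) {
  dst.fp64[i] = (f64)(s32)a.word[i + 2]; // rounding mode is not expressed in C
}
for (int i = 2; i < 4; i++) {
  dst.fp64[i] = (f64)(s32)a.word[i + 4]; // rounding mode is not expressed in C
}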

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 5 2
3C5000 5 1

__m256d __lasx_xvffintl_d_w (__m256i a)

Synopsis

__m256d __lasx_xvffintl_d_w (__m256i a)
#include <lasxintrin.h>
Instruction: xvffintl.d.w xr, xr
CPU Flags: LASX

Description

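Convert signed 32-bit integer elements in the lower half of each 128-bit lane of a to double precision floating point numbers.

Operation

for (int i = 0; i < 2; i++) {
  dst.fp64[i] = (f64)(s32)a.word[i]; // rounding mode is not expressed in C
}
for (int i = 2; i < 4; i++) {
  dst.fp64[i] = (f64)(s32)a.word[i + 2]; // rounding mode is not expressed in C
}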

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 5 2
3C5000 5 1

__m256d __lasx_xvffint_d_l (__m256i a)

Synopsis

__m256d __lasx_xvffint_d_l (__m256i a)
#include <lasxintrin.h>
Instruction: xvffint.d.l xr, xr
CPU Flags: LASX

Description

Convert signed 64-bit integer elements in a to double-precision floating point numbers.

Operation

for (int i = 0; i < 4; i++) {
  dst.fp64[i] = (f64)(s64)a.dword[i]; // rounding mode is not expressed in C
}

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 4 4
3C5000 4 2

__m256d __lasx_xvffint_d_lu (__m256i a)

Synopsis

__m256d __lasx_xvffint_d_lu (__m256i a)
#include <lasxintrin.h>
Instruction: xvffint.d.lu xr, xr
CPU Flags: LASX

Description

Convert unsigned 64-bit integer elements in a to double-precision floating point numbers.

Operation

for (int i = 0; i < 4; i++) {
  dst.fp64[i] = (f64)(u64)a.dword[i]; // rounding mode is not expressed in C
}

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 4 4
3C5000 4 2

__m256 __lasx_xvffint_s_w (__m256i a)

Synopsis

__m256 __lasx_xvffint_s_w (__m256i a)
#include <lasxintrin.h>
Instruction: xvffint.s.w xr, xr
CPU Flags: LASX

Description

Convert signed 32-bit integer elements in a to single-precision floating point numbers.

Operation

for (int i = 0; i < 8; i++) {
  dst.fp32[i] = (f32)(s32)a.word[i]; // rounding mode is not expressed in C
}

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 4 4
3C5000 4 2

__m256 __lasx_xvffint_s_wu (__m256i a)

Synopsis

__m256 __lasx_xvffint_s_wu (__m256i a)
#include <lasxintrin.h>
Instruction: xvffint.s.wu xr, xr
CPU Flags: LASX

Description

Convert unsigned 32-bit integer elements in a to single-precision floating point numbers.

Operation

for (int i = 0; i < 8; i++) {
  dst.fp32[i] = (f32)(u32)a.word[i]; // rounding mode is not expressed in C
}

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 4 4
3C5000 4 2

__m256 __lasx_xvffint_s_l (__m256i a, __m256i b)

Synopsis

__m256 __lasx_xvffint_s_l (__m256i a, __m256i b)
#include <lasxintrin.h>
Instruction: xvffint.s.l xr, xr, xr
CPU Flags: LASX

Description

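Convert signed 64-bit integer elements in a and b to single-precision floating point numbers. Within each 128-bit lane, the lower half of the result comes from b and the higher half from a.

Operation

for (int i = 0; i < 4; i++) {
  dst.fp32[i] = (i < 2)
                    ? (f32)(s64)b.dword[i]
                    : (f32)(s64)a.dword[i - 2]; // rounding mode is not expressed in C
}
for (int i = 4; i < 8; i++) {
  dst.fp32[i] = (i < 6)
                    ? (f32)(s64)b.dword[i - 2]
                    : (f32)(s64)a.dword[i - 4]; // rounding mode is not expressed in C
}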

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 5 2
3C5000 5 1

__m256i __lasx_xvftintl_l_s (__m256 a)

Synopsis

__m256i __lasx_xvftintl_l_s (__m256 a)
#include <lasxintrin.h>
Instruction: xvftintl.l.s xr, xr
CPU Flags: LASX

Description

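Convert single-precision floating point elements in the lower half of each 128-bit lane of a to signed 64-bit integers, using the current rounding mode specified in fcsr.

Operation

for (int i = 0; i < 2; i++) {
  dst.dword[i] = (s64)a.fp32[i]; // rounding mode is not expressed in C
}
for (int i = 2; i < 4; i++) {
  dst.dword[i] = (s64)a.fp32[i + 2]; // rounding mode is not expressed in C
}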

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 5 2
3C5000 5 1

__m256i __lasx_xvftinth_l_s (__m256 a)

Synopsis

__m256i __lasx_xvftinth_l_s (__m256 a)
#include <lasxintrin.h>
Instruction: xvftinth.l.s xr, xr
CPU Flags: LASX

Description

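Convert single-precision floating point elements in the higher half of each 128-bit lane of a to signed 64-bit integers, using the current rounding mode specified in fcsr.

Operation

for (int i = 0; i < 2; i++) {
  dst.dword[i] = (s64)a.fp32[i + 2]; // rounding mode is not expressed in C
}
for (int i = 2; i < 4; i++) {
  dst.dword[i] = (s64)a.fp32[i + 4]; // rounding mode is not expressed in C
}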

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 5 2
3C5000 5 1

__m256i __lasx_xvftintrml_l_s (__m256 a)

Synopsis

__m256i __lasx_xvftintrml_l_s (__m256 a)
#include <lasxintrin.h>
Instruction: xvftintrml.l.s xr, xr
CPU Flags: LASX

Description

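Convert single-precision floating point elements in the lower half of each 128-bit lane of a to signed 64-bit integers, rounding towards negative infinity.

Operation

for (int i = 0; i < 2; i++) {
  dst.dword[i] = (s64)a.fp32[i]; // rounding mode is not expressed in C
}
for (int i = 2; i < 4; i++) {
  dst.dword[i] = (s64)a.fp32[i + 2]; // rounding mode is not expressed in C
}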

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 5 2
3C5000 5 1

__m256i __lasx_xvftintrmh_l_s (__m256 a)

Synopsis

__m256i __lasx_xvftintrmh_l_s (__m256 a)
#include <lasxintrin.h>
Instruction: xvftintrmh.l.s xr, xr
CPU Flags: LASX

Description

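Convert single-precision floating point elements in the higher half of each 128-bit lane of a to signed 64-bit integers, rounding towards negative infinity.

Operation

for (int i = 0; i < 2; i++) {
  dst.dword[i] = (s64)a.fp32[i + 2]; // rounding mode is not expressed in C
}
for (int i = 2; i < 4; i++) {
  dst.dword[i] = (s64)a.fp32[i + 4]; // rounding mode is not expressed in C
}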

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 5 2
3C5000 5 1

__m256i __lasx_xvftintrpl_l_s (__m256 a)

Synopsis

__m256i __lasx_xvftintrpl_l_s (__m256 a)
#include <lasxintrin.h>
Instruction: xvftintrpl.l.s xr, xr
CPU Flags: LASX

Description

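Convert single-precision floating point elements in the lower half of each 128-bit lane of a to signed 64-bit integers, rounding towards positive infinity.

Operation

for (int i = 0; i < 2; i++) {
  dst.dword[i] = (s64)a.fp32[i]; // rounding mode is not expressed in C
}
for (int i = 2; i < 4; i++) {
  dst.dword[i] = (s64)a.fp32[i + 2]; // rounding mode is not expressed in C
}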

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 5 2
3C5000 5 1

__m256i __lasx_xvftintrph_l_s (__m256 a)

Synopsis

__m256i __lasx_xvftintrph_l_s (__m256 a)
#include <lasxintrin.h>
Instruction: xvftintrph.l.s xr, xr
CPU Flags: LASX

Description

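Convert single-precision floating point elements in the higher half of each 128-bit lane of a to signed 64-bit integers, rounding towards positive infinity.

Operation

for (int i = 0; i < 2; i++) {
  dst.dword[i] = (s64)a.fp32[i + 2]; // rounding mode is not expressed in C
}
for (int i = 2; i < 4; i++) {
  dst.dword[i] = (s64)a.fp32[i + 4]; // rounding mode is not expressed in C
}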

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 5 2
3C5000 5 1

__m256i __lasx_xvftintrzl_l_s (__m256 a)

Synopsis

__m256i __lasx_xvftintrzl_l_s (__m256 a)
#include <lasxintrin.h>
Instruction: xvftintrzl.l.s xr, xr
CPU Flags: LASX

Description

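Convert single-precision floating point elements in the lower half of each 128-bit lane of a to signed 64-bit integers, rounding towards zero.

Operation

for (int i = 0; i < 2; i++) {
  dst.dword[i] = (s64)a.fp32[i]; // rounding mode is not expressed in C
}
for (int i = 2; i < 4; i++) {
  dst.dword[i] = (s64)a.fp32[i + 2]; // rounding mode is not expressed in C
}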

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 5 2
3C5000 5 1

__m256i __lasx_xvftintrzh_l_s (__m256 a)

Synopsis

__m256i __lasx_xvftintrzh_l_s (__m256 a)
#include <lasxintrin.h>
Instruction: xvftintrzh.l.s xr, xr
CPU Flags: LASX

Description

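Convert single-precision floating point elements in the higher half of each 128-bit lane of a to signed 64-bit integers, rounding towards zero.

Operation

for (int i = 0; i < 2; i++) {
  dst.dword[i] = (s64)a.fp32[i + 2]; // rounding mode is not expressed in C
}
for (int i = 2; i < 4; i++) {
  dst.dword[i] = (s64)a.fp32[i + 4]; // rounding mode is not expressed in C
}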

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 5 2
3C5000 5 1

__m256i __lasx_xvftintrnel_l_s (__m256 a)

Synopsis

__m256i __lasx_xvftintrnel_l_s (__m256 a)
#include <lasxintrin.h>
Instruction: xvftintrnel.l.s xr, xr
CPU Flags: LASX

Description

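Convert single-precision floating point elements in the lower half of each 128-bit lane of a to signed 64-bit integers, rounding towards nearest even.

Operation

for (int i = 0; i < 2; i++) {
  dst.dword[i] = (s64)a.fp32[i]; // rounding mode is not expressed in C
}
for (int i = 2; i < 4; i++) {
  dst.dword[i] = (s64)a.fp32[i + 2]; // rounding mode is not expressed in C
}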

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 5 2
3C5000 5 1

__m256i __lasx_xvftintrneh_l_s (__m256 a)

Synopsis

__m256i __lasx_xvftintrneh_l_s (__m256 a)
#include <lasxintrin.h>
Instruction: xvftintrneh.l.s xr, xr
CPU Flags: LASX

Description

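Convert single-precision floating point elements in the higher half of each 128-bit lane of a to signed 64-bit integers, rounding towards nearest even.

Operation

for (int i = 0; i < 2; i++) {
  dst.dword[i] = (s64)a.fp32[i + 2]; // rounding mode is not expressed in C
}
for (int i = 2; i < 4; i++) {
  dst.dword[i] = (s64)a.fp32[i + 4]; // rounding mode is not expressed in C
}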

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 5 2
3C5000 5 1

__m256i __lasx_xvftint_l_d (__m256d a)

Synopsis

__m256i __lasx_xvftint_l_d (__m256d a)
#include <lasxintrin.h>
Instruction: xvftint.l.d xr, xr
CPU Flags: LASX

Description

Convert double-precision floating point elements in a to signed 64-bit integer, using current rounding mode specified in fcsr.

Operation

for (int i = 0; i < 4; i++) {
  dst.dword[i] = (s64)a.fp64[i]; // rounding mode is not expressed in C
}

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 4 4
3C5000 4 2

__m256i __lasx_xvftint_w_s (__m256 a)

Synopsis

__m256i __lasx_xvftint_w_s (__m256 a)
#include <lasxintrin.h>
Instruction: xvftint.w.s xr, xr
CPU Flags: LASX

Description

Convert single-precision floating point elements in a to signed 32-bit integer, using current rounding mode specified in fcsr.

Operation

for (int i = 0; i < 8; i++) {
  dst.word[i] = (s32)a.fp32[i]; // rounding mode is not expressed in C
}

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 4 4
3C5000 4 2

__m256i __lasx_xvftintrm_l_d (__m256d a)

Synopsis

__m256i __lasx_xvftintrm_l_d (__m256d a)
#include <lasxintrin.h>
Instruction: xvftintrm.l.d xr, xr
CPU Flags: LASX

Description

Convert double-precision floating point elements in a to signed 64-bit integer, rounding towards negative infinity.

Operation

for (int i = 0; i < 4; i++) {
  dst.dword[i] = (s64)a.fp64[i]; // rounding mode is not expressed in C
}

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 4 4
3C5000 4 2

__m256i __lasx_xvftintrm_w_s (__m256 a)

Synopsis

__m256i __lasx_xvftintrm_w_s (__m256 a)
#include <lasxintrin.h>
Instruction: xvftintrm.w.s xr, xr
CPU Flags: LASX

Description

Convert single-precision floating point elements in a to signed 32-bit integer, rounding towards negative infinity.

Operation

for (int i = 0; i < 8; i++) {
  dst.word[i] = (s32)a.fp32[i]; // rounding mode is not expressed in C
}

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 4 4
3C5000 4 2

__m256i __lasx_xvftintrp_l_d (__m256d a)

Synopsis

__m256i __lasx_xvftintrp_l_d (__m256d a)
#include <lasxintrin.h>
Instruction: xvftintrp.l.d xr, xr
CPU Flags: LASX

Description

Convert double-precision floating point elements in a to signed 64-bit integer, rounding towards positive infinity.

Operation

for (int i = 0; i < 4; i++) {
  dst.dword[i] = (s64)a.fp64[i]; // rounding mode is not expressed in C
}

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 4 4
3C5000 4 2

__m256i __lasx_xvftintrp_w_s (__m256 a)

Synopsis

__m256i __lasx_xvftintrp_w_s (__m256 a)
#include <lasxintrin.h>
Instruction: xvftintrp.w.s xr, xr
CPU Flags: LASX

Description

Convert single-precision floating point elements in a to signed 32-bit integer, rounding towards positive infinity.

Operation

for (int i = 0; i < 8; i++) {
  dst.word[i] = (s32)a.fp32[i]; // rounding mode is not expressed in C
}

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 4 4
3C5000 4 2

__m256i __lasx_xvftintrz_l_d (__m256d a)

Synopsis

__m256i __lasx_xvftintrz_l_d (__m256d a)
#include <lasxintrin.h>
Instruction: xvftintrz.l.d xr, xr
CPU Flags: LASX

Description

Convert double-precision floating point elements in a to signed 64-bit integer, rounding towards zero.

Operation

for (int i = 0; i < 4; i++) {
  dst.dword[i] = (s64)a.fp64[i]; // rounding mode is not expressed in C
}

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 4 4
3C5000 4 2

__m256i __lasx_xvftintrz_w_s (__m256 a)

Synopsis

__m256i __lasx_xvftintrz_w_s (__m256 a)
#include <lasxintrin.h>
Instruction: xvftintrz.w.s xr, xr
CPU Flags: LASX

Description

Convert single-precision floating point elements in a to signed 32-bit integer, rounding towards zero.

Operation

for (int i = 0; i < 8; i++) {
  dst.word[i] = (s32)a.fp32[i]; // rounding mode is not expressed in C
}

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 4 4
3C5000 4 2

__m256i __lasx_xvftintrne_l_d (__m256d a)

Synopsis

__m256i __lasx_xvftintrne_l_d (__m256d a)
#include <lasxintrin.h>
Instruction: xvftintrne.l.d xr, xr
CPU Flags: LASX

Description

Convert double-precision floating point elements in a to signed 64-bit integer, rounding towards nearest even.

Operation

for (int i = 0; i < 4; i++) {
  dst.dword[i] = (s64)a.fp64[i]; // rounding mode is not expressed in C
}

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 4 4
3C5000 4 2

__m256i __lasx_xvftintrne_w_s (__m256 a)

Synopsis

__m256i __lasx_xvftintrne_w_s (__m256 a)
#include <lasxintrin.h>
Instruction: xvftintrne.w.s xr, xr
CPU Flags: LASX

Description

Convert single-precision floating point elements in a to signed 32-bit integer, rounding towards nearest even.

Operation

for (int i = 0; i < 8; i++) {
  dst.word[i] = (s32)a.fp32[i]; // rounding mode is not expressed in C
}

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 4 4
3C5000 4 2

__m256i __lasx_xvftint_lu_d (__m256d a)

Synopsis

__m256i __lasx_xvftint_lu_d (__m256d a)
#include <lasxintrin.h>
Instruction: xvftint.lu.d xr, xr
CPU Flags: LASX

Description

Convert double-precision floating point elements in a to unsigned 64-bit integer, using current rounding mode specified in fcsr.

Operation

for (int i = 0; i < 4; i++) {
  dst.dword[i] = (u64)a.fp64[i]; // rounding mode is not expressed in C
}

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 4 4
3C5000 4 2

__m256i __lasx_xvftint_wu_s (__m256 a)

Synopsis

__m256i __lasx_xvftint_wu_s (__m256 a)
#include <lasxintrin.h>
Instruction: xvftint.wu.s xr, xr
CPU Flags: LASX

Description

Convert single-precision floating point elements in a to unsigned 32-bit integer, using current rounding mode specified in fcsr.

Operation

for (int i = 0; i < 8; i++) {
  dst.word[i] = (u32)a.fp32[i]; // rounding mode is not expressed in C
}

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 4 4
3C5000 4 2

__m256i __lasx_xvftintrz_lu_d (__m256d a)

Synopsis

__m256i __lasx_xvftintrz_lu_d (__m256d a)
#include <lasxintrin.h>
Instruction: xvftintrz.lu.d xr, xr
CPU Flags: LASX

Description

Convert double-precision floating point elements in a to unsigned 64-bit integer, rounding towards zero.

Operation

for (int i = 0; i < 4; i++) {
  dst.dword[i] = (u64)a.fp64[i]; // rounding mode is not expressed in C
}

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 4 4
3C5000 4 2

__m256i __lasx_xvftintrz_wu_s (__m256 a)

Synopsis

__m256i __lasx_xvftintrz_wu_s (__m256 a)
#include <lasxintrin.h>
Instruction: xvftintrz.wu.s xr, xr
CPU Flags: LASX

Description

Convert single-precision floating point elements in a to unsigned 32-bit integer, rounding towards zero.

Operation

for (int i = 0; i < 8; i++) {
  dst.word[i] = (u32)a.fp32[i]; // rounding mode is not expressed in C
}

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 4 4
3C5000 4 2

__m256i __lasx_xvftint_w_d (__m256d a, __m256d b)

Synopsis

__m256i __lasx_xvftint_w_d (__m256d a, __m256d b)
#include <lasxintrin.h>
Instruction: xvftint.w.d xr, xr, xr
CPU Flags: LASX

Description

Convert double-precision floating point elements in a and b to signed 32-bit integer, using current rounding mode specified in fcsr.

Operation

for (int i = 0; i < 4; i++) {
  dst.word[i] = (i < 2)
                    ? (s32)b.fp64[i]
                    : (s32)a.fp64[i - 2]; // rounding mode is not expressed in C
}
for (int i = 4; i < 8; i++) {
  dst.word[i] = (i < 6)
                    ? (s32)b.fp64[i - 2]
                    : (s32)a.fp64[i - 4]; // rounding mode is not expressed in C
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 5 2
3C5000 5 1

__m256i __lasx_xvftintrm_w_d (__m256d a, __m256d b)

Synopsis

__m256i __lasx_xvftintrm_w_d (__m256d a, __m256d b)
#include <lasxintrin.h>
Instruction: xvftintrm.w.d xr, xr, xr
CPU Flags: LASX

Description

Convert double-precision floating point elements in a and b to 32-bit integer, rounding towards negative infinity.

Operation

for (int i = 0; i < 4; i++) {
  dst.word[i] = (i < 2)
                    ? (s32)b.fp64[i]
                    : (s32)a.fp64[i - 2]; // rounding mode is not expressed in C
}
for (int i = 4; i < 8; i++) {
  dst.word[i] = (i < 6)
                    ? (s32)b.fp64[i - 2]
                    : (s32)a.fp64[i - 4]; // rounding mode is not expressed in C
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 5 2
3C5000 5 1

__m256i __lasx_xvftintrp_w_d (__m256d a, __m256d b)

Synopsis

__m256i __lasx_xvftintrp_w_d (__m256d a, __m256d b)
#include <lasxintrin.h>
Instruction: xvftintrp.w.d xr, xr, xr
CPU Flags: LASX

Description

Convert double-precision floating point elements in a and b to 32-bit integer, rounding towards positive infinity.

Operation

for (int i = 0; i < 4; i++) {
  dst.word[i] = (i < 2)
                    ? (s32)b.fp64[i]
                    : (s32)a.fp64[i - 2]; // rounding mode is not expressed in C
}
for (int i = 4; i < 8; i++) {
  dst.word[i] = (i < 6)
                    ? (s32)b.fp64[i - 2]
                    : (s32)a.fp64[i - 4]; // rounding mode is not expressed in C
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 5 2
3C5000 5 1

__m256i __lasx_xvftintrz_w_d (__m256d a, __m256d b)

Synopsis

__m256i __lasx_xvftintrz_w_d (__m256d a, __m256d b)
#include <lasxintrin.h>
Instruction: xvftintrz.w.d xr, xr, xr
CPU Flags: LASX

Description

Convert double-precision floating point elements in a and b to 32-bit integer, rounding towards zero.

Operation

for (int i = 0; i < 4; i++) {
  dst.word[i] = (i < 2)
                    ? (s32)b.fp64[i]
                    : (s32)a.fp64[i - 2]; // rounding mode is not expressed in C
}
for (int i = 4; i < 8; i++) {
  dst.word[i] = (i < 6)
                    ? (s32)b.fp64[i - 2]
                    : (s32)a.fp64[i - 4]; // rounding mode is not expressed in C
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 5 2
3C5000 5 1

__m256i __lasx_xvftintrne_w_d (__m256d a, __m256d b)

Synopsis

__m256i __lasx_xvftintrne_w_d (__m256d a, __m256d b)
#include <lasxintrin.h>
Instruction: xvftintrne.w.d xr, xr, xr
CPU Flags: LASX

Description

Convert double-precision floating point elements in a and b to 32-bit integer, rounding towards nearest even.

Operation

for (int i = 0; i < 4; i++) {
  dst.word[i] = (i < 2)
                    ? (s32)b.fp64[i]
                    : (s32)a.fp64[i - 2]; // rounding mode is not expressed in C
}
for (int i = 4; i < 8; i++) {
  dst.word[i] = (i < 6)
                    ? (s32)b.fp64[i - 2]
                    : (s32)a.fp64[i - 4]; // rounding mode is not expressed in C
}

Tested on real machine.

Latency and Throughput

CPU Latency Throughput (IPC)
3A6000 5 2
3C5000 5 1