Migrating from SSE to LSX
SSE is a 128-bit SIMD extension to x86. Existing SSE code can be migrated to the LoongArch LSX extension by rewriting the intrinsics or instructions manually, or by using a tool such as SIMD Everywhere, which implements SSE intrinsics in terms of their LSX counterparts. To unleash the full performance, however, you may want to port your code to LSX by hand.
Thankfully, LSX intrinsics use the same vector types as SSE, so the following familiar types are available:
- __m128: 128-bit vector of single-precision floating-point numbers
- __m128d: 128-bit vector of double-precision floating-point numbers
- __m128i: 128-bit vector of integers, of any element width
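To see how directly code carries over, here is a minimal sketch of a four-lane float addition ported from SSE (the helper name add4_lsx is made up for illustration; the code assumes a GCC or Clang toolchain targeting LoongArch with -mlsx, and reinterprets the __m128i values used by the LSX load/store intrinsics with plain vector casts):

```c
#include <lsxintrin.h>

/* SSE original, for reference:
 *   __m128 va = _mm_loadu_ps(a);
 *   __m128 vb = _mm_loadu_ps(b);
 *   _mm_storeu_ps(dst, _mm_add_ps(va, vb));
 */
void add4_lsx(float *dst, float *a, float *b)
{
    __m128 va = (__m128)__lsx_vld(a, 0);  /* _mm_loadu_ps  -> __lsx_vld     */
    __m128 vb = (__m128)__lsx_vld(b, 0);
    __m128 vr = __lsx_vfadd_s(va, vb);    /* _mm_add_ps    -> __lsx_vfadd_s */
    __lsx_vst((__m128i)vr, dst, 0);       /* _mm_storeu_ps -> __lsx_vst     */
}
```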
Here is a table mapping SSE intrinsics to their LSX counterparts (WIP); an empty LSX cell means the mapping has not been filled in yet:
SSE | LSX |
---|---|
_mm_abs_epi16 | __lsx_vsigncov_h |
_mm_abs_epi32 | __lsx_vsigncov_w |
_mm_abs_epi8 | __lsx_vsigncov_b |
_mm_add_epi16 | __lsx_vadd_h |
_mm_add_epi32 | __lsx_vadd_w |
_mm_add_epi64 | __lsx_vadd_d |
_mm_add_epi8 | __lsx_vadd_b |
_mm_add_pd | __lsx_vfadd_d |
_mm_add_ps | __lsx_vfadd_s |
_mm_add_sd | __lsx_vfadd_d + __lsx_vextrins_d |
_mm_add_ss | __lsx_vfadd_s + __lsx_vextrins_w |
_mm_adds_epi16 | __lsx_vsadd_h |
_mm_adds_epi8 | __lsx_vsadd_b |
_mm_adds_epu16 | __lsx_vsadd_hu |
_mm_adds_epu8 | __lsx_vsadd_bu |
_mm_addsub_pd | |
_mm_addsub_ps | |
_mm_alignr_epi8 | |
_mm_and_pd | __lsx_vand_v |
_mm_and_ps | __lsx_vand_v |
_mm_and_si128 | __lsx_vand_v |
_mm_andnot_pd | __lsx_vandn_v |
_mm_andnot_ps | __lsx_vandn_v |
_mm_andnot_si128 | __lsx_vandn_v |
_mm_avg_epu16 | __lsx_vavgr_hu |
_mm_avg_epu8 | __lsx_vavgr_bu |
_mm_blend_epi16 | |
_mm_blend_pd | |
_mm_blend_ps | |
_mm_blendv_epi8 | |
_mm_blendv_pd | |
_mm_blendv_ps | |
_mm_bslli_si128 | __lsx_vbsll_v |
_mm_bsrli_si128 | __lsx_vbsrl_v |
_mm_castpd_ps | reinterpret cast only, no instruction |
_mm_castpd_si128 | reinterpret cast only, no instruction |
_mm_castps_pd | reinterpret cast only, no instruction |
_mm_castps_si128 | reinterpret cast only, no instruction |
_mm_castsi128_pd | reinterpret cast only, no instruction |
_mm_castsi128_ps | reinterpret cast only, no instruction |
_mm_ceil_pd | __lsx_vfrintrp_d |
_mm_ceil_ps | __lsx_vfrintrp_s |
_mm_ceil_sd | __lsx_vfrintrp_d + __lsx_vextrins_d |
_mm_ceil_ss | __lsx_vfrintrp_s + __lsx_vextrins_w |
_mm_cmpeq_epi16 | __lsx_vseq_h |
_mm_cmpeq_epi32 | __lsx_vseq_w |
_mm_cmpeq_epi64 | __lsx_vseq_d |
_mm_cmpeq_epi8 | __lsx_vseq_b |
_mm_cmpeq_pd | __lsx_vfcmp_ceq_d |
_mm_cmpeq_ps | __lsx_vfcmp_ceq_s |
_mm_cmpeq_sd | __lsx_vfcmp_ceq_d + __lsx_vextrins_d |
_mm_cmpeq_ss | __lsx_vfcmp_ceq_s + __lsx_vextrins_w |
_mm_cmpestra | |
_mm_cmpestrc | |
_mm_cmpestri | |
_mm_cmpestrm | |
_mm_cmpestro | |
_mm_cmpestrs | |
_mm_cmpestrz | |
_mm_cmpge_pd | __lsx_vfcmp_cle_d |
_mm_cmpge_ps | __lsx_vfcmp_cle_s |
_mm_cmpge_sd | __lsx_vfcmp_cle_d + __lsx_vextrins_d |
_mm_cmpge_ss | __lsx_vfcmp_cle_s + __lsx_vextrins_w |
_mm_cmpgt_epi16 | __lsx_vslt_h |
_mm_cmpgt_epi32 | __lsx_vslt_w |
_mm_cmpgt_epi64 | __lsx_vslt_d |
_mm_cmpgt_epi8 | __lsx_vslt_b |
_mm_cmpgt_pd | __lsx_vfcmp_clt_d |
_mm_cmpgt_ps | __lsx_vfcmp_clt_s |
_mm_cmpgt_sd | __lsx_vfcmp_clt_d + __lsx_vextrins_d |
_mm_cmpgt_ss | __lsx_vfcmp_clt_s + __lsx_vextrins_w |
_mm_cmpistra | |
_mm_cmpistrc | |
_mm_cmpistri | |
_mm_cmpistrm | |
_mm_cmpistro | |
_mm_cmpistrs | |
_mm_cmpistrz | |
_mm_cmple_pd | __lsx_vfcmp_cle_d |
_mm_cmple_ps | __lsx_vfcmp_cle_s |
_mm_cmple_sd | __lsx_vfcmp_cle_d + __lsx_vextrins_d |
_mm_cmple_ss | __lsx_vfcmp_cle_s + __lsx_vextrins_w |
_mm_cmplt_epi16 | __lsx_vslt_h |
_mm_cmplt_epi32 | __lsx_vslt_w |
_mm_cmplt_epi8 | __lsx_vslt_b |
_mm_cmplt_pd | __lsx_vfcmp_clt_d |
_mm_cmplt_ps | __lsx_vfcmp_clt_s |
_mm_cmplt_sd | __lsx_vfcmp_clt_d + __lsx_vextrins_d |
_mm_cmplt_ss | __lsx_vfcmp_clt_s + __lsx_vextrins_w |
_mm_cmpneq_pd | __lsx_vfcmp_cune_d |
_mm_cmpneq_ps | __lsx_vfcmp_cune_s |
_mm_cmpneq_sd | __lsx_vfcmp_cune_d + __lsx_vextrins_d |
_mm_cmpneq_ss | __lsx_vfcmp_cune_s + __lsx_vextrins_w |
_mm_cmpnge_pd | __lsx_vfcmp_cult_d |
_mm_cmpnge_ps | __lsx_vfcmp_cult_s |
_mm_cmpnge_sd | __lsx_vfcmp_cult_d + __lsx_vextrins_d |
_mm_cmpnge_ss | __lsx_vfcmp_cult_s + __lsx_vextrins_w |
_mm_cmpngt_pd | __lsx_vfcmp_cule_d |
_mm_cmpngt_ps | __lsx_vfcmp_cule_s |
_mm_cmpngt_sd | __lsx_vfcmp_cule_d + __lsx_vextrins_d |
_mm_cmpngt_ss | __lsx_vfcmp_cule_s + __lsx_vextrins_w |
_mm_cmpnle_pd | __lsx_vfcmp_cult_d |
_mm_cmpnle_ps | __lsx_vfcmp_cult_s |
_mm_cmpnle_sd | __lsx_vfcmp_cult_d + __lsx_vextrins_d |
_mm_cmpnle_ss | __lsx_vfcmp_cult_s + __lsx_vextrins_w |
_mm_cmpnlt_pd | __lsx_vfcmp_cule_d |
_mm_cmpnlt_ps | __lsx_vfcmp_cule_s |
_mm_cmpnlt_sd | __lsx_vfcmp_cule_d + __lsx_vextrins_d |
_mm_cmpnlt_ss | __lsx_vfcmp_cule_s + __lsx_vextrins_w |
_mm_cmpord_pd | __lsx_vfcmp_cor_d |
_mm_cmpord_ps | __lsx_vfcmp_cor_s |
_mm_cmpord_sd | __lsx_vfcmp_cor_d + __lsx_vextrins_d |
_mm_cmpord_ss | __lsx_vfcmp_cor_s + __lsx_vextrins_w |
_mm_cmpunord_pd | __lsx_vfcmp_cun_d |
_mm_cmpunord_ps | __lsx_vfcmp_cun_s |
_mm_cmpunord_sd | __lsx_vfcmp_cun_d + __lsx_vextrins_d |
_mm_cmpunord_ss | __lsx_vfcmp_cun_s + __lsx_vextrins_w |
_mm_comieq_sd | |
_mm_comieq_ss | |
_mm_comige_sd | |
_mm_comige_ss | |
_mm_comigt_sd | |
_mm_comigt_ss | |
_mm_comile_sd | |
_mm_comile_ss | |
_mm_comilt_sd | |
_mm_comilt_ss | |
_mm_comineq_sd | |
_mm_comineq_ss | |
_mm_cvt_pi2ps | |
_mm_cvt_ps2pi | |
_mm_cvt_si2ss | |
_mm_cvt_ss2si | |
_mm_cvtepi16_epi32 | __lsx_vsllwil_w_h |
_mm_cvtepi16_epi64 | |
_mm_cvtepi32_epi64 | __lsx_vsllwil_d_w |
_mm_cvtepi32_pd | __lsx_vffintl_d_w |
_mm_cvtepi32_ps | __lsx_vffint_s_w |
_mm_cvtepi8_epi16 | __lsx_vsllwil_h_b |
_mm_cvtepi8_epi32 | |
_mm_cvtepi8_epi64 | |
_mm_cvtepu16_epi32 | __lsx_vsllwil_wu_hu |
_mm_cvtepu16_epi64 | |
_mm_cvtepu32_epi64 | __lsx_vsllwil_du_wu |
_mm_cvtepu8_epi16 | __lsx_vsllwil_hu_bu |
_mm_cvtepu8_epi32 | |
_mm_cvtepu8_epi64 | |
_mm_cvtpd_epi32 | __lsx_vftint_w_d |
_mm_cvtpd_pi32 | |
_mm_cvtpd_ps | __lsx_vfcvt_s_d |
_mm_cvtpi16_ps | |
_mm_cvtpi32_pd | |
_mm_cvtpi32_ps | |
_mm_cvtpi32x2_ps | |
_mm_cvtpi8_ps | |
_mm_cvtps_epi32 | __lsx_vftint_w_s |
_mm_cvtps_pd | __lsx_vfcvtl_d_s |
_mm_cvtps_pi16 | |
_mm_cvtps_pi32 | |
_mm_cvtps_pi8 | |
_mm_cvtpu16_ps | |
_mm_cvtpu8_ps | |
_mm_cvtsd_f64 | |
_mm_cvtsd_si32 | |
_mm_cvtsd_si64 | |
_mm_cvtsd_si64x | |
_mm_cvtsd_ss | |
_mm_cvtsi128_si32 | __lsx_vpickve2gr_w |
_mm_cvtsi128_si64 | __lsx_vpickve2gr_d |
_mm_cvtsi128_si64x | __lsx_vpickve2gr_d |
_mm_cvtsi32_sd | |
_mm_cvtsi32_si128 | |
_mm_cvtsi32_ss | |
_mm_cvtsi64_sd | |
_mm_cvtsi64_si128 | |
_mm_cvtsi64_ss | |
_mm_cvtsi64x_sd | |
_mm_cvtsi64x_si128 | |
_mm_cvtss_f32 | |
_mm_cvtss_sd | |
_mm_cvtss_si32 | |
_mm_cvtss_si64 | |
_mm_cvtt_ps2pi | |
_mm_cvtt_ss2si | |
_mm_cvttpd_epi32 | __lsx_vftintrz_w_d |
_mm_cvttpd_pi32 | |
_mm_cvttps_epi32 | __lsx_vftintrz_w_s |
_mm_cvttps_pi32 | |
_mm_cvttsd_si32 | |
_mm_cvttsd_si64 | |
_mm_cvttsd_si64x | |
_mm_cvttss_si32 | |
_mm_cvttss_si64 | |
_mm_div_pd | __lsx_vfdiv_d |
_mm_div_ps | __lsx_vfdiv_s |
_mm_div_sd | __lsx_vfdiv_d + __lsx_vextrins_d |
_mm_div_ss | __lsx_vfdiv_s + __lsx_vextrins_w |
_mm_dp_pd | |
_mm_dp_ps | |
_mm_extract_epi16 | __lsx_vpickve2gr_h |
_mm_extract_epi32 | __lsx_vpickve2gr_w |
_mm_extract_epi64 | __lsx_vpickve2gr_d |
_mm_extract_epi8 | __lsx_vpickve2gr_b |
_mm_extract_ps | __lsx_vpickve2gr_w |
_mm_floor_pd | __lsx_vfrintrm_d |
_mm_floor_ps | __lsx_vfrintrm_s |
_mm_floor_sd | __lsx_vfrintrm_d + __lsx_vextrins_d |
_mm_floor_ss | __lsx_vfrintrm_s + __lsx_vextrins_w |
_mm_hadd_epi16 | |
_mm_hadd_epi32 | |
_mm_hadd_pd | |
_mm_hadd_ps | |
_mm_hadds_epi16 | |
_mm_hsub_epi16 | |
_mm_hsub_epi32 | |
_mm_hsub_pd | |
_mm_hsub_ps | |
_mm_hsubs_epi16 | |
_mm_insert_epi16 | __lsx_vinsgr2vr_h |
_mm_insert_epi32 | __lsx_vinsgr2vr_w |
_mm_insert_epi64 | __lsx_vinsgr2vr_d |
_mm_insert_epi8 | __lsx_vinsgr2vr_b |
_mm_insert_ps | __lsx_vinsgr2vr_w |
_mm_lddqu_si128 | |
_mm_load_pd | __lsx_vld |
_mm_load_pd1 | __lsx_vldrepl_d |
_mm_load_ps | __lsx_vld |
_mm_load_ps1 | __lsx_vldrepl_w |
_mm_load_sd | |
_mm_load_si128 | |
_mm_load_ss | |
_mm_load1_pd | __lsx_vldrepl_d |
_mm_load1_ps | __lsx_vldrepl_w |
_mm_loaddup_pd | |
_mm_loadh_pd | |
_mm_loadh_pi | |
_mm_loadl_epi64 | |
_mm_loadl_pd | |
_mm_loadl_pi | |
_mm_loadr_pd | __lsx_vld + __lsx_vshuf4i_w |
_mm_loadr_ps | __lsx_vld + __lsx_vshuf4i_w |
_mm_loadu_pd | __lsx_vld |
_mm_loadu_ps | __lsx_vld |
_mm_loadu_si128 | __lsx_vld |
_mm_loadu_si16 | |
_mm_loadu_si32 | |
_mm_loadu_si64 | |
_mm_madd_epi16 | |
_mm_maddubs_epi16 | |
_mm_maskmoveu_si128 | |
_mm_max_epi16 | __lsx_vmax_h |
_mm_max_epi32 | __lsx_vmax_w |
_mm_max_epi8 | __lsx_vmax_b |
_mm_max_epu16 | __lsx_vmax_hu |
_mm_max_epu32 | __lsx_vmax_wu |
_mm_max_epu8 | __lsx_vmax_bu |
_mm_max_pd | __lsx_vfmax_d |
_mm_max_ps | __lsx_vfmax_s |
_mm_max_sd | __lsx_vfmax_d + __lsx_vextrins_d |
_mm_max_ss | __lsx_vfmax_s + __lsx_vextrins_w |
_mm_min_epi16 | __lsx_vmin_h |
_mm_min_epi32 | __lsx_vmin_w |
_mm_min_epi8 | __lsx_vmin_b |
_mm_min_epu16 | __lsx_vmin_hu |
_mm_min_epu32 | __lsx_vmin_wu |
_mm_min_epu8 | __lsx_vmin_bu |
_mm_min_pd | __lsx_vfmin_d |
_mm_min_ps | __lsx_vfmin_s |
_mm_min_sd | __lsx_vfmin_d + __lsx_vextrins_d |
_mm_min_ss | __lsx_vfmin_s + __lsx_vextrins_w |
_mm_minpos_epu16 | |
_mm_move_epi64 | |
_mm_move_sd | __lsx_vextrins_d |
_mm_move_ss | __lsx_vextrins_w |
_mm_movedup_pd | |
_mm_movehdup_ps | |
_mm_movehl_ps | __lsx_vilvh_d |
_mm_moveldup_ps | |
_mm_movelh_ps | __lsx_vilvl_d |
_mm_movemask_epi8 | |
_mm_movemask_pd | |
_mm_movemask_ps | __lsx_vmskltz_w + __lsx_vpickve2gr_wu |
_mm_movepi64_pi64 | |
_mm_movpi64_epi64 | |
_mm_mpsadbw_epu8 | |
_mm_mul_epi32 | |
_mm_mul_epu32 | |
_mm_mul_pd | __lsx_vfmul_d |
_mm_mul_ps | __lsx_vfmul_s |
_mm_mul_sd | |
_mm_mul_ss | |
_mm_mulhi_epi16 | |
_mm_mulhi_epu16 | |
_mm_mulhrs_epi16 | |
_mm_mullo_epi16 | |
_mm_mullo_epi32 | |
_mm_or_pd | __lsx_vor_v |
_mm_or_ps | __lsx_vor_v |
_mm_or_si128 | __lsx_vor_v |
_mm_packs_epi16 | |
_mm_packs_epi32 | |
_mm_packus_epi16 | |
_mm_packus_epi32 | |
_mm_rcp_ps | __lsx_vfrecip_s |
_mm_rcp_ss | |
_mm_round_pd | __lsx_vfrintr{ne,z,p,m}_d (by rounding mode) |
_mm_round_ps | __lsx_vfrintr{ne,z,p,m}_s (by rounding mode) |
_mm_round_sd | |
_mm_round_ss | |
_mm_rsqrt_ps | __lsx_vfrsqrt_s |
_mm_rsqrt_ss | |
_mm_sad_epu8 | |
_mm_set_epi16 | |
_mm_set_epi32 | |
_mm_set_epi64 | |
_mm_set_epi64x | |
_mm_set_epi8 | |
_mm_set_pd | |
_mm_set_pd1 | __lsx_vldrepl_d / __lsx_vreplgr2vr_d |
_mm_set_ps | |
_mm_set_ps1 | __lsx_vldrepl_w / __lsx_vreplgr2vr_w |
_mm_set_sd | |
_mm_set_ss | |
_mm_set1_epi16 | __lsx_vreplgr2vr_h |
_mm_set1_epi32 | __lsx_vreplgr2vr_w |
_mm_set1_epi64 | __lsx_vreplgr2vr_d |
_mm_set1_epi64x | |
_mm_set1_epi8 | __lsx_vreplgr2vr_b |
_mm_set1_pd | |
_mm_set1_ps | |
_mm_setr_epi16 | construct with the v8i16 type from lsxintrin.h |
_mm_setr_epi32 | construct with the v4i32 type from lsxintrin.h (see the sketch below the table) |
_mm_setr_epi64 | construct with the v2i64 type from lsxintrin.h |
_mm_setr_epi8 | construct with the v16i8 type from lsxintrin.h |
_mm_setr_pd | construct with the v2f64 type from lsxintrin.h |
_mm_setr_ps | construct with the v4f32 type from lsxintrin.h |
_mm_setzero_pd | (__m128d)__lsx_vldi(0) |
_mm_setzero_ps | (__m128)__lsx_vldi(0) |
_mm_setzero_si128 | __lsx_vldi(0) |
_mm_shuffle_epi32 | |
_mm_shuffle_epi8 | |
_mm_shuffle_pd | |
_mm_shuffle_ps | |
_mm_shufflehi_epi16 | |
_mm_shufflelo_epi16 | |
_mm_sign_epi16 | |
_mm_sign_epi32 | |
_mm_sign_epi8 | |
_mm_sll_epi16 | __lsx_vsll_h |
_mm_sll_epi32 | __lsx_vsll_w |
_mm_sll_epi64 | __lsx_vsll_d |
_mm_slli_epi16 | __lsx_vslli_h |
_mm_slli_epi32 | __lsx_vslli_w |
_mm_slli_epi64 | __lsx_vslli_d |
_mm_slli_si128 | |
_mm_sqrt_pd | __lsx_vfsqrt_d |
_mm_sqrt_ps | __lsx_vfsqrt_s |
_mm_sqrt_sd | |
_mm_sqrt_ss | |
_mm_sra_epi16 | __lsx_vsra_h |
_mm_sra_epi32 | __lsx_vsra_w |
_mm_srai_epi16 | __lsx_vsrai_h |
_mm_srai_epi32 | __lsx_vsrai_w |
_mm_srl_epi16 | __lsx_vsrl_h |
_mm_srl_epi32 | __lsx_vsrl_w |
_mm_srl_epi64 | __lsx_vsrl_d |
_mm_srli_epi16 | __lsx_vsrli_h |
_mm_srli_epi32 | __lsx_vsrli_w |
_mm_srli_epi64 | __lsx_vsrli_d |
_mm_srli_si128 | |
_mm_store_pd | __lsx_vst |
_mm_store_pd1 | |
_mm_store_ps | __lsx_vst |
_mm_store_ps1 | |
_mm_store_sd | |
_mm_store_si128 | |
_mm_store_ss | __lsx_vstelm_w |
_mm_store1_pd | __lsx_vreplvei_d + __lsx_vst |
_mm_store1_ps | __lsx_vreplvei_w + __lsx_vst |
_mm_storeh_pd | |
_mm_storeh_pi | |
_mm_storel_epi64 | |
_mm_storel_pd | |
_mm_storel_pi | |
_mm_storer_pd | |
_mm_storer_ps | __lsx_vshuf4i_w + __lsx_vst |
_mm_storeu_pd | __lsx_vst |
_mm_storeu_ps | __lsx_vst |
_mm_storeu_si128 | |
_mm_storeu_si16 | |
_mm_storeu_si32 | |
_mm_storeu_si64 | |
_mm_stream_load_si128 | |
_mm_stream_pd | |
_mm_stream_ps | |
_mm_stream_si128 | |
_mm_sub_epi16 | __lsx_vsub_h |
_mm_sub_epi32 | __lsx_vsub_w |
_mm_sub_epi64 | __lsx_vsub_d |
_mm_sub_epi8 | __lsx_vsub_b |
_mm_sub_pd | __lsx_vfsub_d |
_mm_sub_ps | __lsx_vfsub_s |
_mm_sub_sd | |
_mm_sub_ss | |
_mm_subs_epi16 | |
_mm_subs_epi8 | |
_mm_subs_epu16 | |
_mm_subs_epu8 | |
_mm_test_all_ones | |
_mm_test_all_zeros | |
_mm_test_mix_ones_zeros | |
_mm_testc_si128 | |
_mm_testnzc_si128 | |
_mm_testz_si128 | |
_MM_TRANSPOSE4_PS | |
_mm_ucomieq_sd | |
_mm_ucomieq_ss | |
_mm_ucomige_sd | |
_mm_ucomige_ss | |
_mm_ucomigt_sd | |
_mm_ucomigt_ss | |
_mm_ucomile_sd | |
_mm_ucomile_ss | |
_mm_ucomilt_sd | |
_mm_ucomilt_ss | |
_mm_ucomineq_sd | |
_mm_ucomineq_ss | |
_mm_undefined_pd | |
_mm_undefined_ps | |
_mm_undefined_si128 | |
_mm_unpackhi_epi16 | __lsx_vilvh_h |
_mm_unpackhi_epi32 | __lsx_vilvh_w |
_mm_unpackhi_epi64 | __lsx_vilvh_d |
_mm_unpackhi_epi8 | __lsx_vilvh_b |
_mm_unpackhi_pd | __lsx_vilvh_d |
_mm_unpackhi_ps | __lsx_vilvh_w |
_mm_unpacklo_epi16 | __lsx_vilvl_h |
_mm_unpacklo_epi32 | __lsx_vilvl_w |
_mm_unpacklo_epi64 | __lsx_vilvl_d |
_mm_unpacklo_epi8 | __lsx_vilvl_b |
_mm_unpacklo_pd | __lsx_vilvl_d |
_mm_unpacklo_ps | __lsx_vilvl_w |
_mm_xor_pd | __lsx_vxor_v |
_mm_xor_ps | __lsx_vxor_v |
_mm_xor_si128 | __lsx_vxor_v |
The list of SSE intrinsics is taken from the Intel Intrinsics Guide.
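Rows that join two LSX intrinsics with a "+" mean the SSE operation has no single-instruction equivalent: the usual pattern is a full-vector LSX operation followed by __lsx_vextrins_* to write the computed low lane back while keeping the upper lanes untouched, which is what the scalar _sd/_ss intrinsics require. Below is a minimal sketch for _mm_add_sd; it assumes the __lsx_vextrins_d immediate encodes the destination lane in its upper four bits and the source lane in its lower four bits, so treat it as illustrative rather than canonical.

```c
#include <lsxintrin.h>

/* Sketch of _mm_add_sd(a, b): add the low doubles, keep the high double of a. */
static inline __m128d add_sd_lsx(__m128d a, __m128d b)
{
    __m128d sum = __lsx_vfadd_d(a, b);              /* both lanes added          */
    return (__m128d)__lsx_vextrins_d((__m128i)a,    /* keep a's upper lane       */
                                     (__m128i)sum,  /* take sum's lane 0         */
                                     0x00);         /* dest lane 0 <- src lane 0 */
}
```

Similarly, the _mm_setr_* rows can be covered by constructing the vector through the plain vector types (v16i8, v8i16, v4i32, v2i64, v4f32, v2f64) defined in lsxintrin.h: a brace initializer fills lanes from the lowest upward, which matches the argument order of _mm_setr_*. A sketch for the 32-bit integer case (the helper name is again made up):

```c
#include <lsxintrin.h>

/* Sketch of _mm_setr_epi32(e0, e1, e2, e3): e0 ends up in the lowest lane. */
static inline __m128i setr_epi32_lsx(int e0, int e1, int e2, int e3)
{
    return (__m128i)(v4i32){ e0, e1, e2, e3 };
}
```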