/*
 * xmmintrin.h
 *
 * This file is part of the ReactOS CRT package.
 *
 * Contributors:
 *   Timo Kreuzer (timo.kreuzer@reactos.org)
 *
 * THIS SOFTWARE IS NOT COPYRIGHTED
 *
 * This source code is offered for use in the public domain. You may
 * use, modify or distribute it freely.
 *
 * This code is distributed in the hope that it will be useful but
 * WITHOUT ANY WARRANTY. ALL WARRANTIES, EXPRESS OR IMPLIED ARE HEREBY
 * DISCLAIMED. This includes but is not limited to warranties of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 *
 */

#pragma once
#ifndef _INCLUDED_MM2
#define _INCLUDED_MM2

#include <mmintrin.h>

#if defined(_MM2_FUNCTIONALITY) && !defined(_MM_FUNCTIONALITY)
#define _MM_FUNCTIONALITY
#endif

#if !defined _VCRT_BUILD && !defined _INC_MALLOC
#include <malloc.h> // For _mm_malloc() and _mm_free()
#endif

#ifdef __cplusplus
extern "C" {
#endif

#if defined(_MSC_VER) && !defined(__clang__)

typedef union _DECLSPEC_INTRIN_TYPE _CRT_ALIGN(16) __m128
{
    float m128_f32[4];
    unsigned __int64 m128_u64[2];
    __int8 m128_i8[16];
    __int16 m128_i16[8];
    __int32 m128_i32[4];
    __int64 m128_i64[2];
    unsigned __int8 m128_u8[16];
    unsigned __int16 m128_u16[8];
    unsigned __int32 m128_u32[4];
} __m128;

#define __ATTRIBUTE_SSE__

#else /* _MSC_VER */

typedef float __v4sf __attribute__((__vector_size__(16)));
typedef signed int __v4si __attribute__((__vector_size__(16)));
typedef unsigned int __v4su __attribute__((__vector_size__(16)));
typedef float __m128_u __attribute__((__vector_size__(16), __aligned__(1)));

typedef float __m128 __attribute__((__vector_size__(16), __aligned__(16)));

#ifdef __clang__
#define __ATTRIBUTE_SSE__ __attribute__((__target__("sse"),__min_vector_width__(128)))
#else
#define __ATTRIBUTE_SSE__ __attribute__((__target__("sse")))
#endif
#define __INTRIN_INLINE_SSE __INTRIN_INLINE __ATTRIBUTE_SSE__

#endif /* _MSC_VER */

#define _MM_ALIGN16 _VCRT_ALIGN(16)

/* Constants for use with _mm_prefetch. */
#define _MM_HINT_NTA  0
#define _MM_HINT_T0   1
#define _MM_HINT_T1   2
#define _MM_HINT_T2   3
#define _MM_HINT_ENTA 4
#if 0 // Not supported yet
#define _MM_HINT_ET0  5
#define _MM_HINT_ET1  6
#define _MM_HINT_ET2  7
#endif

/* Create a selector for use with the SHUFPS instruction. */
#define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \
    (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0))
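/* Example (illustrative): _MM_SHUFFLE(3, 2, 1, 0) == 0xE4 is the identity
   selector, while _MM_SHUFFLE(0, 1, 2, 3) == 0x1B reverses all four elements
   when passed as the imm8 of _mm_shuffle_ps(a, a, imm8). */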
/* Bits in the MXCSR. */
#define _MM_EXCEPT_MASK       0x003f
#define _MM_EXCEPT_INVALID    0x0001
#define _MM_EXCEPT_DENORM     0x0002
#define _MM_EXCEPT_DIV_ZERO   0x0004
#define _MM_EXCEPT_OVERFLOW   0x0008
#define _MM_EXCEPT_UNDERFLOW  0x0010
#define _MM_EXCEPT_INEXACT    0x0020
#define _MM_MASK_MASK         0x1f80
#define _MM_MASK_INVALID      0x0080
#define _MM_MASK_DENORM       0x0100
#define _MM_MASK_DIV_ZERO     0x0200
#define _MM_MASK_OVERFLOW     0x0400
#define _MM_MASK_UNDERFLOW    0x0800
#define _MM_MASK_INEXACT      0x1000
#define _MM_ROUND_MASK        0x6000
#define _MM_ROUND_NEAREST     0x0000
#define _MM_ROUND_DOWN        0x2000
#define _MM_ROUND_UP          0x4000
#define _MM_ROUND_TOWARD_ZERO 0x6000
#define _MM_FLUSH_ZERO_MASK   0x8000
#define _MM_FLUSH_ZERO_ON     0x8000
#define _MM_FLUSH_ZERO_OFF    0x0000

#ifdef __ICL
void* __cdecl _mm_malloc(size_t Size, size_t Al);
void __cdecl _mm_free(void* P);
#endif

void _mm_prefetch(_In_ char const* p, _In_ int i);
__m128 _mm_setzero_ps(void);
__m128 _mm_add_ss(__m128 a, __m128 b);
__m128 _mm_sub_ss(__m128 a, __m128 b);
__m128 _mm_mul_ss(__m128 a, __m128 b);
__m128 _mm_div_ss(__m128 a, __m128 b);
__m128 _mm_sqrt_ss(__m128 a);
__m128 _mm_rcp_ss(__m128 a);
__m128 _mm_rsqrt_ss(__m128 a);
__m128 _mm_min_ss(__m128 a, __m128 b);
__m128 _mm_max_ss(__m128 a, __m128 b);
__m128 _mm_add_ps(__m128 a, __m128 b);
__m128 _mm_sub_ps(__m128 a, __m128 b);
__m128 _mm_mul_ps(__m128 a, __m128 b);
__m128 _mm_div_ps(__m128 a, __m128 b);
__m128 _mm_sqrt_ps(__m128 a);
__m128 _mm_rcp_ps(__m128 a);
__m128 _mm_rsqrt_ps(__m128 a);
__m128 _mm_min_ps(__m128 a, __m128 b);
__m128 _mm_max_ps(__m128 a, __m128 b);
__m128 _mm_and_ps(__m128 a, __m128 b);
__m128 _mm_andnot_ps(__m128 a, __m128 b);
__m128 _mm_or_ps(__m128 a, __m128 b);
__m128 _mm_xor_ps(__m128 a, __m128 b);
__m128 _mm_cmpeq_ss(__m128 a, __m128 b);
__m128 _mm_cmplt_ss(__m128 a, __m128 b);
__m128 _mm_cmple_ss(__m128 a, __m128 b);
__m128 _mm_cmpgt_ss(__m128 a, __m128 b);
__m128 _mm_cmpge_ss(__m128 a, __m128 b);
__m128 _mm_cmpneq_ss(__m128 a, __m128 b);
__m128 _mm_cmpnlt_ss(__m128 a, __m128 b);
__m128 _mm_cmpnle_ss(__m128 a, __m128 b);
__m128 _mm_cmpngt_ss(__m128 a, __m128 b);
__m128 _mm_cmpnge_ss(__m128 a, __m128 b);
__m128 _mm_cmpord_ss(__m128 a, __m128 b);
__m128 _mm_cmpunord_ss(__m128 a, __m128 b);
__m128 _mm_cmpeq_ps(__m128 a, __m128 b);
__m128 _mm_cmplt_ps(__m128 a, __m128 b);
__m128 _mm_cmple_ps(__m128 a, __m128 b);
__m128 _mm_cmpgt_ps(__m128 a, __m128 b);
__m128 _mm_cmpge_ps(__m128 a, __m128 b);
__m128 _mm_cmpneq_ps(__m128 a, __m128 b);
__m128 _mm_cmpnlt_ps(__m128 a, __m128 b);
__m128 _mm_cmpnle_ps(__m128 a, __m128 b);
__m128 _mm_cmpngt_ps(__m128 a, __m128 b);
__m128 _mm_cmpnge_ps(__m128 a, __m128 b);
__m128 _mm_cmpord_ps(__m128 a, __m128 b);
__m128 _mm_cmpunord_ps(__m128 a, __m128 b);
int _mm_comieq_ss(__m128 a, __m128 b);
int _mm_comilt_ss(__m128 a, __m128 b);
int _mm_comile_ss(__m128 a, __m128 b);
int _mm_comigt_ss(__m128 a, __m128 b);
int _mm_comige_ss(__m128 a, __m128 b);
int _mm_comineq_ss(__m128 a, __m128 b);
int _mm_ucomieq_ss(__m128 a, __m128 b);
int _mm_ucomilt_ss(__m128 a, __m128 b);
int _mm_ucomile_ss(__m128 a, __m128 b);
int _mm_ucomigt_ss(__m128 a, __m128 b);
int _mm_ucomige_ss(__m128 a, __m128 b);
int _mm_ucomineq_ss(__m128 a, __m128 b);
int _mm_cvt_ss2si(__m128 a);
int _mm_cvtt_ss2si(__m128 a);
__m128 _mm_cvt_si2ss(__m128 a, int b);
#ifdef _M_IX86
__m64 _mm_cvt_ps2pi(__m128 a);
__m64 _mm_cvtt_ps2pi(__m128 a);
__m128 _mm_cvt_pi2ps(__m128 a, __m64 b);
#endif
__m128 _mm_shuffle_ps(__m128 a, __m128 b, unsigned int imm8);
__m128 _mm_unpackhi_ps(__m128 a, __m128 b);
__m128 _mm_unpacklo_ps(__m128 a, __m128 b);
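/* Usage sketch (illustrative), given the declarations above:

   __m128 v = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);            // lanes {1,2,3,4}
   __m128 y = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)); // lanes {3,3,3,3}
*/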
__m128 _mm_loadh_pi(__m128 a, __m64 const* p);
void _mm_storeh_pi(__m64* p, __m128 a);
__m128 _mm_movehl_ps(__m128 a, __m128 b);
__m128 _mm_movelh_ps(__m128 a, __m128 b);
__m128 _mm_loadl_pi(__m128 a, __m64 const* p);
void _mm_storel_pi(__m64* p, __m128 a);
int _mm_movemask_ps(__m128 a);
unsigned int _mm_getcsr(void);
void _mm_setcsr(unsigned int a);
__m128 _mm_set_ss(float a);
__m128 _mm_set_ps1(float a);
__m128 _mm_load_ss(float const* p);
__m128 _mm_load_ps1(float const* p);
__m128 _mm_load_ps(float const* p);
__m128 _mm_loadu_ps(float const* p);
__m128 _mm_loadr_ps(float const* p);
__m128 _mm_set_ps(float e3, float e2, float e1, float e0);
__m128 _mm_setr_ps(float e3, float e2, float e1, float e0);
void _mm_store_ss(float* p, __m128 a);
float _mm_cvtss_f32(__m128 a);
void _mm_store_ps(float* p, __m128 a);
void _mm_storeu_ps(float* p, __m128 a);
void _mm_store_ps1(float* p, __m128 a);
void _mm_storer_ps(float* p, __m128 a);
__m128 _mm_move_ss(__m128 a, __m128 b);

#ifdef _M_IX86
int _m_pextrw(__m64 a, int imm8);
__m64 _m_pinsrw(__m64 a, int i, int imm8);
__m64 _m_pmaxsw(__m64 a, __m64 b);
__m64 _m_pmaxub(__m64 a, __m64 b);
__m64 _m_pminsw(__m64 a, __m64 b);
__m64 _m_pminub(__m64 a, __m64 b);
int _m_pmovmskb(__m64 a);
__m64 _m_pmulhuw(__m64 a, __m64 b);
__m64 _m_pshufw(__m64 a, int imm8);
void _m_maskmovq(__m64 a, __m64 b, char*);
__m64 _m_pavgb(__m64 a, __m64 b);
__m64 _m_pavgw(__m64 a, __m64 b);
__m64 _m_psadbw(__m64 a, __m64 b);
void _mm_stream_pi(__m64* p, __m64 a);
#endif

void _mm_stream_ps(float* p, __m128 a);
void _mm_sfence(void);

#ifdef _M_AMD64
__int64 _mm_cvtss_si64(__m128 a);
__int64 _mm_cvttss_si64(__m128 a);
__m128 _mm_cvtsi64_ss(__m128 a, __int64 b);
#endif

/* Alternate names */
#define _mm_cvtss_si32 _mm_cvt_ss2si
#define _mm_cvttss_si32 _mm_cvtt_ss2si
#define _mm_cvtsi32_ss _mm_cvt_si2ss
#define _mm_set1_ps _mm_set_ps1
#define _mm_load1_ps _mm_load_ps1
#define _mm_store1_ps _mm_store_ps1
#define _mm_cvtps_pi32 _mm_cvt_ps2pi
#define _mm_cvttps_pi32 _mm_cvtt_ps2pi
#define _mm_cvtpi32_ps _mm_cvt_pi2ps
#define _mm_extract_pi16 _m_pextrw
#define _mm_insert_pi16 _m_pinsrw
#define _mm_max_pi16 _m_pmaxsw
#define _mm_max_pu8 _m_pmaxub
#define _mm_min_pi16 _m_pminsw
#define _mm_min_pu8 _m_pminub
#define _mm_movemask_pi8 _m_pmovmskb
#define _mm_mulhi_pu16 _m_pmulhuw
#define _mm_shuffle_pi16 _m_pshufw
#define _mm_maskmove_si64 _m_maskmovq
#define _mm_avg_pu8 _m_pavgb
#define _mm_avg_pu16 _m_pavgw
#define _mm_sad_pu8 _m_psadbw

#ifdef _M_IX86

/* Inline functions from Clang: https://github.com/llvm/llvm-project/blob/main/clang/lib/Headers/xmmintrin.h */

__ATTRIBUTE_SSE__ static __inline __m128 _mm_cvtpi16_ps(__m64 __a)
{
    __m64 __b, __c;
    __m128 __r;
    __b = _mm_setzero_si64();
    __b = _mm_cmpgt_pi16(__b, __a);
    __c = _mm_unpackhi_pi16(__a, __b);
    __r = _mm_setzero_ps();
    __r = _mm_cvtpi32_ps(__r, __c);
    __r = _mm_movelh_ps(__r, __r);
    __c = _mm_unpacklo_pi16(__a, __b);
    __r = _mm_cvtpi32_ps(__r, __c);
    return __r;
}

__ATTRIBUTE_SSE__ static __inline __m128 _mm_cvtpu16_ps(__m64 __a)
{
    __m64 __b, __c;
    __m128 __r;
    __b = _mm_setzero_si64();
    __c = _mm_unpackhi_pi16(__a, __b);
    __r = _mm_setzero_ps();
    __r = _mm_cvtpi32_ps(__r, __c);
    __r = _mm_movelh_ps(__r, __r);
    __c = _mm_unpacklo_pi16(__a, __b);
    __r = _mm_cvtpi32_ps(__r, __c);
    return __r;
}

__ATTRIBUTE_SSE__ static __inline __m128 _mm_cvtpi8_ps(__m64 __a)
{
    __m64 __b;
    __b = _mm_setzero_si64();
    __b = _mm_cmpgt_pi8(__b, __a);
    __b = _mm_unpacklo_pi8(__a, __b);
    return _mm_cvtpi16_ps(__b);
}
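/* Note: _mm_cvtpi16_ps and _mm_cvtpi8_ps above sign-extend by unpacking with
   a _mm_cmpgt_* mask (all-ones exactly in the negative lanes); the unsigned
   variants (_mm_cvtpu16_ps above, _mm_cvtpu8_ps below) zero-extend by
   unpacking with a zero vector instead. */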
__ATTRIBUTE_SSE__ static __inline __m128 _mm_cvtpu8_ps(__m64 __a)
{
    __m64 __b;
    __b = _mm_setzero_si64();
    __b = _mm_unpacklo_pi8(__a, __b);
    return _mm_cvtpi16_ps(__b);
}

__ATTRIBUTE_SSE__ static __inline __m128 _mm_cvtpi32x2_ps(__m64 __a, __m64 __b)
{
    __m128 __c;
    __c = _mm_setzero_ps();
    __c = _mm_cvtpi32_ps(__c, __b);
    __c = _mm_movelh_ps(__c, __c);
    return _mm_cvtpi32_ps(__c, __a);
}

__ATTRIBUTE_SSE__ static __inline __m64 _mm_cvtps_pi16(__m128 __a)
{
    __m64 __b, __c;
    __b = _mm_cvtps_pi32(__a);
    __a = _mm_movehl_ps(__a, __a);
    __c = _mm_cvtps_pi32(__a);
    return _mm_packs_pi32(__b, __c);
}

__ATTRIBUTE_SSE__ static __inline __m64 _mm_cvtps_pi8(__m128 __a)
{
    __m64 __b, __c;
    __b = _mm_cvtps_pi16(__a);
    __c = _mm_setzero_si64();
    return _mm_packs_pi16(__b, __c);
}

#endif /* _M_IX86 */

/* Transpose the 4x4 matrix composed of row[0-3]. */
#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
do { \
    __m128 t0 = _mm_unpacklo_ps(row0, row1); \
    __m128 t1 = _mm_unpacklo_ps(row2, row3); \
    __m128 t2 = _mm_unpackhi_ps(row0, row1); \
    __m128 t3 = _mm_unpackhi_ps(row2, row3); \
    (row0) = _mm_movelh_ps(t0, t1); \
    (row1) = _mm_movehl_ps(t1, t0); \
    (row2) = _mm_movelh_ps(t2, t3); \
    (row3) = _mm_movehl_ps(t3, t2); \
} while (0)

#define _MM_GET_EXCEPTION_STATE() \
    (_mm_getcsr() & _MM_EXCEPT_MASK)
#define _MM_GET_EXCEPTION_MASK() \
    (_mm_getcsr() & _MM_MASK_MASK)
#define _MM_GET_ROUNDING_MODE() \
    (_mm_getcsr() & _MM_ROUND_MASK)
#define _MM_GET_FLUSH_ZERO_MODE() \
    (_mm_getcsr() & _MM_FLUSH_ZERO_MASK)
#define _MM_SET_EXCEPTION_STATE(__mask) \
    _mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | (__mask))
#define _MM_SET_EXCEPTION_MASK(__mask) \
    _mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | (__mask))
#define _MM_SET_ROUNDING_MODE(__mode) \
    _mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | (__mode))
#define _MM_SET_FLUSH_ZERO_MODE(__mode) \
    _mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | (__mode))
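/* Usage sketch (illustrative): temporarily forcing truncation, then restoring
   the previous rounding mode:

   unsigned int mode = _MM_GET_ROUNDING_MODE();
   _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
   int i = _mm_cvtss_si32(_mm_set_ss(1.7f)); // i == 1 under truncation
   _MM_SET_ROUNDING_MODE(mode);
*/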
/* Use intrinsics on MSVC */
#if defined(_MSC_VER) && !defined(__clang__)
#pragma intrinsic(_mm_prefetch)
#pragma intrinsic(_mm_setzero_ps)
#pragma intrinsic(_mm_add_ss)
#pragma intrinsic(_mm_sub_ss)
#pragma intrinsic(_mm_mul_ss)
#pragma intrinsic(_mm_div_ss)
#pragma intrinsic(_mm_sqrt_ss)
#pragma intrinsic(_mm_rcp_ss)
#pragma intrinsic(_mm_rsqrt_ss)
#pragma intrinsic(_mm_min_ss)
#pragma intrinsic(_mm_max_ss)
#pragma intrinsic(_mm_add_ps)
#pragma intrinsic(_mm_sub_ps)
#pragma intrinsic(_mm_mul_ps)
#pragma intrinsic(_mm_div_ps)
#pragma intrinsic(_mm_sqrt_ps)
#pragma intrinsic(_mm_rcp_ps)
#pragma intrinsic(_mm_rsqrt_ps)
#pragma intrinsic(_mm_min_ps)
#pragma intrinsic(_mm_max_ps)
#pragma intrinsic(_mm_and_ps)
#pragma intrinsic(_mm_andnot_ps)
#pragma intrinsic(_mm_or_ps)
#pragma intrinsic(_mm_xor_ps)
#pragma intrinsic(_mm_cmpeq_ss)
#pragma intrinsic(_mm_cmplt_ss)
#pragma intrinsic(_mm_cmple_ss)
#pragma intrinsic(_mm_cmpgt_ss)
#pragma intrinsic(_mm_cmpge_ss)
#pragma intrinsic(_mm_cmpneq_ss)
#pragma intrinsic(_mm_cmpnlt_ss)
#pragma intrinsic(_mm_cmpnle_ss)
#pragma intrinsic(_mm_cmpngt_ss)
#pragma intrinsic(_mm_cmpnge_ss)
#pragma intrinsic(_mm_cmpord_ss)
#pragma intrinsic(_mm_cmpunord_ss)
#pragma intrinsic(_mm_cmpeq_ps)
#pragma intrinsic(_mm_cmplt_ps)
#pragma intrinsic(_mm_cmple_ps)
#pragma intrinsic(_mm_cmpgt_ps)
#pragma intrinsic(_mm_cmpge_ps)
#pragma intrinsic(_mm_cmpneq_ps)
#pragma intrinsic(_mm_cmpnlt_ps)
#pragma intrinsic(_mm_cmpnle_ps)
#pragma intrinsic(_mm_cmpngt_ps)
#pragma intrinsic(_mm_cmpnge_ps)
#pragma intrinsic(_mm_cmpord_ps)
#pragma intrinsic(_mm_cmpunord_ps)
#pragma intrinsic(_mm_comieq_ss)
#pragma intrinsic(_mm_comilt_ss)
#pragma intrinsic(_mm_comile_ss)
#pragma intrinsic(_mm_comigt_ss)
#pragma intrinsic(_mm_comige_ss)
#pragma intrinsic(_mm_comineq_ss)
#pragma intrinsic(_mm_ucomieq_ss)
#pragma intrinsic(_mm_ucomilt_ss)
#pragma intrinsic(_mm_ucomile_ss)
#pragma intrinsic(_mm_ucomigt_ss)
#pragma intrinsic(_mm_ucomige_ss)
#pragma intrinsic(_mm_ucomineq_ss)
#pragma intrinsic(_mm_cvt_ss2si)
#pragma intrinsic(_mm_cvtt_ss2si)
#pragma intrinsic(_mm_cvt_si2ss)
#ifdef _M_IX86
#pragma intrinsic(_mm_cvt_ps2pi)
#pragma intrinsic(_mm_cvtt_ps2pi)
#pragma intrinsic(_mm_cvt_pi2ps)
#endif // _M_IX86
#pragma intrinsic(_mm_shuffle_ps)
#pragma intrinsic(_mm_unpackhi_ps)
#pragma intrinsic(_mm_unpacklo_ps)
#pragma intrinsic(_mm_loadh_pi)
#pragma intrinsic(_mm_storeh_pi)
#pragma intrinsic(_mm_movehl_ps)
#pragma intrinsic(_mm_movelh_ps)
#pragma intrinsic(_mm_loadl_pi)
#pragma intrinsic(_mm_storel_pi)
#pragma intrinsic(_mm_movemask_ps)
#pragma intrinsic(_mm_getcsr)
#pragma intrinsic(_mm_setcsr)
#pragma intrinsic(_mm_set_ss)
#pragma intrinsic(_mm_set_ps1)
#pragma intrinsic(_mm_load_ss)
#pragma intrinsic(_mm_load_ps1)
#pragma intrinsic(_mm_load_ps)
#pragma intrinsic(_mm_loadu_ps)
#pragma intrinsic(_mm_loadr_ps)
#pragma intrinsic(_mm_set_ps)
#pragma intrinsic(_mm_setr_ps)
#pragma intrinsic(_mm_store_ss)
#pragma intrinsic(_mm_cvtss_f32)
#pragma intrinsic(_mm_store_ps)
#pragma intrinsic(_mm_storeu_ps)
#pragma intrinsic(_mm_store_ps1)
#pragma intrinsic(_mm_storer_ps)
#pragma intrinsic(_mm_move_ss)
#ifdef _M_IX86
#pragma intrinsic(_m_pextrw)
#pragma intrinsic(_m_pinsrw)
#pragma intrinsic(_m_pmaxsw)
#pragma intrinsic(_m_pmaxub)
#pragma intrinsic(_m_pminsw)
#pragma intrinsic(_m_pminub)
#pragma intrinsic(_m_pmovmskb)
#pragma intrinsic(_m_pmulhuw)
#pragma intrinsic(_m_pshufw)
#pragma intrinsic(_m_maskmovq)
#pragma intrinsic(_m_pavgb)
#pragma intrinsic(_m_pavgw)
#pragma intrinsic(_m_psadbw)
#pragma intrinsic(_mm_stream_pi)
#endif // _M_IX86
#pragma intrinsic(_mm_stream_ps)
#pragma intrinsic(_mm_sfence)
#ifdef _M_AMD64
#pragma intrinsic(_mm_cvtss_si64)
#pragma intrinsic(_mm_cvttss_si64)
#pragma intrinsic(_mm_cvtsi64_ss)
#endif // _M_AMD64

#else /* _MSC_VER */

/*
  GCC: https://github.com/gcc-mirror/gcc/blob/master/gcc/config/i386/xmmintrin.h
  Clang: https://github.com/llvm/llvm-project/blob/main/clang/lib/Headers/xmmintrin.h
*/

/* Use inline functions on GCC/Clang */

#if !HAS_BUILTIN(_mm_getcsr)
__INTRIN_INLINE_SSE unsigned int _mm_getcsr(void)
{
    return __builtin_ia32_stmxcsr();
}
#endif

#if !HAS_BUILTIN(_mm_setcsr)
__INTRIN_INLINE_SSE void _mm_setcsr(unsigned int a)
{
    __builtin_ia32_ldmxcsr(a);
}
#endif

__INTRIN_INLINE_SSE __m128 _mm_add_ss(__m128 __a, __m128 __b)
{
    __a[0] += __b[0];
    return __a;
}

__INTRIN_INLINE_SSE __m128 _mm_add_ps(__m128 __a, __m128 __b)
{
    return (__m128)((__v4sf)__a + (__v4sf)__b);
}

__INTRIN_INLINE_SSE __m128 _mm_sub_ss(__m128 __a, __m128 __b)
{
    __a[0] -= __b[0];
    return __a;
}

__INTRIN_INLINE_SSE __m128 _mm_sub_ps(__m128 __a, __m128 __b)
{
    return (__m128)((__v4sf)__a - (__v4sf)__b);
}

__INTRIN_INLINE_SSE __m128 _mm_mul_ss(__m128 __a, __m128 __b)
{
    __a[0] *= __b[0];
    return __a;
}

__INTRIN_INLINE_SSE __m128 _mm_mul_ps(__m128 __a, __m128 __b)
{
    return (__m128)((__v4sf)__a * (__v4sf)__b);
}

__INTRIN_INLINE_SSE __m128 _mm_div_ss(__m128 __a, __m128 __b)
{
    __a[0] /= __b[0];
    return __a;
}

__INTRIN_INLINE_SSE __m128 _mm_div_ps(__m128 __a, __m128 __b)
{
    return (__m128)((__v4sf)__a / (__v4sf)__b);
}

__INTRIN_INLINE_SSE __m128 _mm_sqrt_ss(__m128 __a)
{
    return (__m128)__builtin_ia32_sqrtss((__v4sf)__a);
}

__INTRIN_INLINE_SSE __m128 _mm_sqrt_ps(__m128 __a)
{
    return __builtin_ia32_sqrtps((__v4sf)__a);
}
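/* Note: _mm_rcp_ss/_mm_rcp_ps and _mm_rsqrt_ss/_mm_rsqrt_ps below are
   hardware approximations (roughly 12 bits of precision); callers needing
   full accuracy typically refine the result with a Newton-Raphson step. */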
__INTRIN_INLINE_SSE __m128 _mm_rcp_ss(__m128 __a)
{
    return (__m128)__builtin_ia32_rcpss((__v4sf)__a);
}

__INTRIN_INLINE_SSE __m128 _mm_rcp_ps(__m128 __a)
{
    return (__m128)__builtin_ia32_rcpps((__v4sf)__a);
}

__INTRIN_INLINE_SSE __m128 _mm_rsqrt_ss(__m128 __a)
{
    return __builtin_ia32_rsqrtss((__v4sf)__a);
}

__INTRIN_INLINE_SSE __m128 _mm_rsqrt_ps(__m128 __a)
{
    return __builtin_ia32_rsqrtps((__v4sf)__a);
}

__INTRIN_INLINE_SSE __m128 _mm_min_ss(__m128 __a, __m128 __b)
{
    return __builtin_ia32_minss((__v4sf)__a, (__v4sf)__b);
}

__INTRIN_INLINE_SSE __m128 _mm_min_ps(__m128 __a, __m128 __b)
{
    return __builtin_ia32_minps((__v4sf)__a, (__v4sf)__b);
}

__INTRIN_INLINE_SSE __m128 _mm_max_ss(__m128 __a, __m128 __b)
{
    return __builtin_ia32_maxss((__v4sf)__a, (__v4sf)__b);
}

__INTRIN_INLINE_SSE __m128 _mm_max_ps(__m128 __a, __m128 __b)
{
    return __builtin_ia32_maxps((__v4sf)__a, (__v4sf)__b);
}

__INTRIN_INLINE_SSE __m128 _mm_and_ps(__m128 __a, __m128 __b)
{
    return (__m128)((__v4su)__a & (__v4su)__b);
}

__INTRIN_INLINE_SSE __m128 _mm_andnot_ps(__m128 __a, __m128 __b)
{
    return (__m128)(~(__v4su)__a & (__v4su)__b);
}

__INTRIN_INLINE_SSE __m128 _mm_or_ps(__m128 __a, __m128 __b)
{
    return (__m128)((__v4su)__a | (__v4su)__b);
}

__INTRIN_INLINE_SSE __m128 _mm_xor_ps(__m128 __a, __m128 __b)
{
    return (__m128)((__v4su)__a ^ (__v4su)__b);
}

__INTRIN_INLINE_SSE __m128 _mm_cmpeq_ss(__m128 __a, __m128 __b)
{
    return (__m128)__builtin_ia32_cmpeqss((__v4sf)__a, (__v4sf)__b);
}

__INTRIN_INLINE_SSE __m128 _mm_cmpeq_ps(__m128 __a, __m128 __b)
{
    return (__m128)__builtin_ia32_cmpeqps((__v4sf)__a, (__v4sf)__b);
}

__INTRIN_INLINE_SSE __m128 _mm_cmplt_ss(__m128 __a, __m128 __b)
{
    return (__m128)__builtin_ia32_cmpltss((__v4sf)__a, (__v4sf)__b);
}

__INTRIN_INLINE_SSE __m128 _mm_cmplt_ps(__m128 __a, __m128 __b)
{
    return (__m128)__builtin_ia32_cmpltps((__v4sf)__a, (__v4sf)__b);
}

__INTRIN_INLINE_SSE __m128 _mm_cmple_ss(__m128 __a, __m128 __b)
{
    return (__m128)__builtin_ia32_cmpless((__v4sf)__a, (__v4sf)__b);
}

__INTRIN_INLINE_SSE __m128 _mm_cmple_ps(__m128 __a, __m128 __b)
{
    return (__m128)__builtin_ia32_cmpleps((__v4sf)__a, (__v4sf)__b);
}

__INTRIN_INLINE_SSE __m128 _mm_cmpgt_ss(__m128 __a, __m128 __b)
{
    __v4sf temp = __builtin_ia32_cmpltss((__v4sf)__b, (__v4sf)__a);
#ifdef __clang__
    return (__m128)__builtin_shufflevector((__v4sf)__a, temp, 4, 1, 2, 3);
#else
    return (__m128)__builtin_ia32_movss((__v4sf)__a, temp);
#endif
}

__INTRIN_INLINE_SSE __m128 _mm_cmpgt_ps(__m128 __a, __m128 __b)
{
    return (__m128)__builtin_ia32_cmpltps((__v4sf)__b, (__v4sf)__a);
}

__INTRIN_INLINE_SSE __m128 _mm_cmpge_ss(__m128 __a, __m128 __b)
{
    __v4sf temp = __builtin_ia32_cmpless((__v4sf)__b, (__v4sf)__a);
#ifdef __clang__
    return (__m128)__builtin_shufflevector((__v4sf)__a, temp, 4, 1, 2, 3);
#else
    return (__m128)__builtin_ia32_movss((__v4sf)__a, temp);
#endif
}

__INTRIN_INLINE_SSE __m128 _mm_cmpge_ps(__m128 __a, __m128 __b)
{
    return (__m128)__builtin_ia32_cmpleps((__v4sf)__b, (__v4sf)__a);
}

__INTRIN_INLINE_SSE __m128 _mm_cmpneq_ss(__m128 __a, __m128 __b)
{
    return (__m128)__builtin_ia32_cmpneqss((__v4sf)__a, (__v4sf)__b);
}

__INTRIN_INLINE_SSE __m128 _mm_cmpneq_ps(__m128 __a, __m128 __b)
{
    return (__m128)__builtin_ia32_cmpneqps((__v4sf)__a, (__v4sf)__b);
}

__INTRIN_INLINE_SSE __m128 _mm_cmpnlt_ss(__m128 __a, __m128 __b)
{
    return (__m128)__builtin_ia32_cmpnltss((__v4sf)__a, (__v4sf)__b);
}

__INTRIN_INLINE_SSE __m128 _mm_cmpnlt_ps(__m128 __a, __m128 __b)
{
    return (__m128)__builtin_ia32_cmpnltps((__v4sf)__a, (__v4sf)__b);
}
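/* Usage sketch (illustrative): the compare and bitwise operations above
   compose into a branchless per-lane select, r[i] = a[i] < b[i] ? x[i] : y[i],
   for __m128 values a, b, x, y:

   __m128 mask = _mm_cmplt_ps(a, b);
   __m128 r = _mm_or_ps(_mm_and_ps(mask, x), _mm_andnot_ps(mask, y));
*/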
__INTRIN_INLINE_SSE __m128 _mm_cmpnle_ss(__m128 __a, __m128 __b)
{
    return (__m128)__builtin_ia32_cmpnless((__v4sf)__a, (__v4sf)__b);
}

__INTRIN_INLINE_SSE __m128 _mm_cmpnle_ps(__m128 __a, __m128 __b)
{
    return (__m128)__builtin_ia32_cmpnleps((__v4sf)__a, (__v4sf)__b);
}

__INTRIN_INLINE_SSE __m128 _mm_cmpngt_ss(__m128 __a, __m128 __b)
{
    __v4sf temp = __builtin_ia32_cmpnltss((__v4sf)__b, (__v4sf)__a);
#ifdef __clang__
    return (__m128)__builtin_shufflevector((__v4sf)__a, temp, 4, 1, 2, 3);
#else
    return (__m128)__builtin_ia32_movss((__v4sf)__a, temp);
#endif
}

__INTRIN_INLINE_SSE __m128 _mm_cmpngt_ps(__m128 __a, __m128 __b)
{
    return (__m128)__builtin_ia32_cmpnltps((__v4sf)__b, (__v4sf)__a);
}

__INTRIN_INLINE_SSE __m128 _mm_cmpnge_ss(__m128 __a, __m128 __b)
{
    __v4sf temp = (__v4sf)__builtin_ia32_cmpnless((__v4sf)__b, (__v4sf)__a);
#ifdef __clang__
    return (__m128)__builtin_shufflevector((__v4sf)__a, temp, 4, 1, 2, 3);
#else
    return (__m128)__builtin_ia32_movss((__v4sf)__a, temp);
#endif
}

__INTRIN_INLINE_SSE __m128 _mm_cmpnge_ps(__m128 __a, __m128 __b)
{
    return (__m128)__builtin_ia32_cmpnleps((__v4sf)__b, (__v4sf)__a);
}

__INTRIN_INLINE_SSE __m128 _mm_cmpord_ss(__m128 __a, __m128 __b)
{
    return (__m128)__builtin_ia32_cmpordss((__v4sf)__a, (__v4sf)__b);
}

__INTRIN_INLINE_SSE __m128 _mm_cmpord_ps(__m128 __a, __m128 __b)
{
    return (__m128)__builtin_ia32_cmpordps((__v4sf)__a, (__v4sf)__b);
}

__INTRIN_INLINE_SSE __m128 _mm_cmpunord_ss(__m128 __a, __m128 __b)
{
    return (__m128)__builtin_ia32_cmpunordss((__v4sf)__a, (__v4sf)__b);
}

__INTRIN_INLINE_SSE __m128 _mm_cmpunord_ps(__m128 __a, __m128 __b)
{
    return (__m128)__builtin_ia32_cmpunordps((__v4sf)__a, (__v4sf)__b);
}

__INTRIN_INLINE_SSE int _mm_comieq_ss(__m128 __a, __m128 __b)
{
    return __builtin_ia32_comieq((__v4sf)__a, (__v4sf)__b);
}

__INTRIN_INLINE_SSE int _mm_comilt_ss(__m128 __a, __m128 __b)
{
    return __builtin_ia32_comilt((__v4sf)__a, (__v4sf)__b);
}

__INTRIN_INLINE_SSE int _mm_comile_ss(__m128 __a, __m128 __b)
{
    return __builtin_ia32_comile((__v4sf)__a, (__v4sf)__b);
}

__INTRIN_INLINE_SSE int _mm_comigt_ss(__m128 __a, __m128 __b)
{
    return __builtin_ia32_comigt((__v4sf)__a, (__v4sf)__b);
}

__INTRIN_INLINE_SSE int _mm_comige_ss(__m128 __a, __m128 __b)
{
    return __builtin_ia32_comige((__v4sf)__a, (__v4sf)__b);
}

__INTRIN_INLINE_SSE int _mm_comineq_ss(__m128 __a, __m128 __b)
{
    return __builtin_ia32_comineq((__v4sf)__a, (__v4sf)__b);
}

__INTRIN_INLINE_SSE int _mm_ucomieq_ss(__m128 __a, __m128 __b)
{
    return __builtin_ia32_ucomieq((__v4sf)__a, (__v4sf)__b);
}

__INTRIN_INLINE_SSE int _mm_ucomilt_ss(__m128 __a, __m128 __b)
{
    return __builtin_ia32_ucomilt((__v4sf)__a, (__v4sf)__b);
}

__INTRIN_INLINE_SSE int _mm_ucomile_ss(__m128 __a, __m128 __b)
{
    return __builtin_ia32_ucomile((__v4sf)__a, (__v4sf)__b);
}

__INTRIN_INLINE_SSE int _mm_ucomigt_ss(__m128 __a, __m128 __b)
{
    return __builtin_ia32_ucomigt((__v4sf)__a, (__v4sf)__b);
}

__INTRIN_INLINE_SSE int _mm_ucomige_ss(__m128 __a, __m128 __b)
{
    return __builtin_ia32_ucomige((__v4sf)__a, (__v4sf)__b);
}

__INTRIN_INLINE_SSE int _mm_ucomineq_ss(__m128 __a, __m128 __b)
{
    return __builtin_ia32_ucomineq((__v4sf)__a, (__v4sf)__b);
}

// _mm_cvt_ss2si
__INTRIN_INLINE_SSE int _mm_cvtss_si32(__m128 __a)
{
    return __builtin_ia32_cvtss2si((__v4sf)__a);
}

#ifdef _M_AMD64
__INTRIN_INLINE_SSE long long _mm_cvtss_si64(__m128 __a)
{
    return __builtin_ia32_cvtss2si64((__v4sf)__a);
}
#endif

// _mm_cvt_ps2pi
__INTRIN_INLINE_SSE __m64 _mm_cvtps_pi32(__m128 __a)
{
    return (__m64)__builtin_ia32_cvtps2pi((__v4sf)__a);
}
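/* Note: the _mm_cvt* conversions above round according to the current MXCSR
   rounding mode; the _mm_cvtt* variants below always truncate toward zero. */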
// _mm_cvtt_ss2si
__INTRIN_INLINE_SSE int _mm_cvttss_si32(__m128 __a)
{
    return __builtin_ia32_cvttss2si((__v4sf)__a);
}

#ifdef _M_AMD64
__INTRIN_INLINE_SSE long long _mm_cvttss_si64(__m128 __a)
{
    return __builtin_ia32_cvttss2si64((__v4sf)__a);
}
#endif

// _mm_cvtt_ps2pi
__INTRIN_INLINE_SSE __m64 _mm_cvttps_pi32(__m128 __a)
{
    return (__m64)__builtin_ia32_cvttps2pi((__v4sf)__a);
}

// _mm_cvt_si2ss
__INTRIN_INLINE_SSE __m128 _mm_cvtsi32_ss(__m128 __a, int __b)
{
    __a[0] = __b;
    return __a;
}

#ifdef _M_AMD64
__INTRIN_INLINE_SSE __m128 _mm_cvtsi64_ss(__m128 __a, long long __b)
{
    __a[0] = __b;
    return __a;
}
#endif

// _mm_cvt_pi2ps
__INTRIN_INLINE_SSE __m128 _mm_cvtpi32_ps(__m128 __a, __m64 __b)
{
    return __builtin_ia32_cvtpi2ps((__v4sf)__a, (__v2si)__b);
}

__INTRIN_INLINE_SSE float _mm_cvtss_f32(__m128 __a)
{
    return __a[0];
}

__INTRIN_INLINE_SSE __m128 _mm_loadh_pi(__m128 __a, const __m64 *__p)
{
#ifdef __clang__
    typedef float __mm_loadh_pi_v2f32 __attribute__((__vector_size__(8)));
    struct __mm_loadh_pi_struct {
        __mm_loadh_pi_v2f32 __u;
    } __attribute__((__packed__, __may_alias__));
    __mm_loadh_pi_v2f32 __b = ((const struct __mm_loadh_pi_struct*)__p)->__u;
    __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
    return __builtin_shufflevector(__a, __bb, 0, 1, 4, 5);
#else
    return (__m128)__builtin_ia32_loadhps(__a, __p);
#endif
}

__INTRIN_INLINE_SSE __m128 _mm_loadl_pi(__m128 __a, const __m64 *__p)
{
#ifdef __clang__
    typedef float __mm_loadl_pi_v2f32 __attribute__((__vector_size__(8)));
    struct __mm_loadl_pi_struct {
        __mm_loadl_pi_v2f32 __u;
    } __attribute__((__packed__, __may_alias__));
    __mm_loadl_pi_v2f32 __b = ((const struct __mm_loadl_pi_struct*)__p)->__u;
    __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
    return __builtin_shufflevector(__a, __bb, 4, 5, 2, 3);
#else
    return (__m128)__builtin_ia32_loadlps(__a, __p);
#endif
}

__INTRIN_INLINE_SSE __m128 _mm_load_ss(const float *__p)
{
    return _mm_set_ss(*__p);
}

// _mm_load_ps1
__INTRIN_INLINE_SSE __m128 _mm_load1_ps(const float *__p)
{
    return _mm_set1_ps(*__p);
}

__INTRIN_INLINE_SSE __m128 _mm_load_ps(const float *__p)
{
    return *(const __m128*)__p;
}

__INTRIN_INLINE_SSE __m128 _mm_loadu_ps(const float *__p)
{
    struct __loadu_ps {
        __m128_u __v;
    } __attribute__((__packed__, __may_alias__));
    return ((const struct __loadu_ps*)__p)->__v;
}

__INTRIN_INLINE_SSE __m128 _mm_loadr_ps(const float *__p)
{
    __m128 __a = _mm_load_ps(__p);
#ifdef __clang__
    return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0);
#else
    return (__m128)__builtin_ia32_shufps(__a, __a, _MM_SHUFFLE(0,1,2,3));
#endif
}

__INTRIN_INLINE_SSE __m128 _mm_undefined_ps(void)
{
#ifdef __clang__
    return (__m128)__builtin_ia32_undef128();
#else
    __m128 undef = undef;
    return undef;
#endif
}

__INTRIN_INLINE_SSE __m128 _mm_set_ss(float __w)
{
    return __extension__ (__m128){ __w, 0, 0, 0 };
}

// _mm_set_ps1
__INTRIN_INLINE_SSE __m128 _mm_set1_ps(float __w)
{
    return __extension__ (__m128){ __w, __w, __w, __w };
}

__INTRIN_INLINE_SSE __m128 _mm_set_ps(float __z, float __y, float __x, float __w)
{
    return __extension__ (__m128){ __w, __x, __y, __z };
}

__INTRIN_INLINE_SSE __m128 _mm_setr_ps(float __z, float __y, float __x, float __w)
{
    return __extension__ (__m128){ __z, __y, __x, __w };
}

__INTRIN_INLINE_SSE __m128 _mm_setzero_ps(void)
{
    return __extension__ (__m128){ 0, 0, 0, 0 };
}
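/* Note: _mm_load_ps and _mm_store_ps require 16-byte-aligned addresses
   (misaligned use is undefined behavior and typically faults), while
   _mm_loadu_ps/_mm_storeu_ps accept any alignment. */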
__INTRIN_INLINE_SSE void _mm_storeh_pi(__m64 *__p, __m128 __a)
{
#ifdef __clang__
    typedef float __mm_storeh_pi_v2f32 __attribute__((__vector_size__(8)));
    struct __mm_storeh_pi_struct {
        __mm_storeh_pi_v2f32 __u;
    } __attribute__((__packed__, __may_alias__));
    ((struct __mm_storeh_pi_struct*)__p)->__u = __builtin_shufflevector(__a, __a, 2, 3);
#else
    __builtin_ia32_storehps(__p, __a);
#endif
}

__INTRIN_INLINE_SSE void _mm_storel_pi(__m64 *__p, __m128 __a)
{
#ifdef __clang__
    typedef float __mm_storeh_pi_v2f32 __attribute__((__vector_size__(8)));
    struct __mm_storeh_pi_struct {
        __mm_storeh_pi_v2f32 __u;
    } __attribute__((__packed__, __may_alias__));
    ((struct __mm_storeh_pi_struct*)__p)->__u = __builtin_shufflevector(__a, __a, 0, 1);
#else
    __builtin_ia32_storelps(__p, __a);
#endif
}

__INTRIN_INLINE_SSE void _mm_store_ss(float *__p, __m128 __a)
{
    *__p = ((__v4sf)__a)[0];
}

__INTRIN_INLINE_SSE void _mm_storeu_ps(float *__p, __m128 __a)
{
    *(__m128_u *)__p = __a;
}

__INTRIN_INLINE_SSE void _mm_store_ps(float *__p, __m128 __a)
{
    *(__m128*)__p = __a;
}

// _mm_store_ps1
__INTRIN_INLINE_SSE void _mm_store1_ps(float *__p, __m128 __a)
{
    // FIXME: Should we use a temp instead?
#ifdef __clang__
    __a = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 0, 0, 0);
#else
    __a = __builtin_ia32_shufps(__a, __a, _MM_SHUFFLE(0,0,0,0));
#endif
    _mm_store_ps(__p, __a);
}

__INTRIN_INLINE_SSE void _mm_storer_ps(float *__p, __m128 __a)
{
#ifdef __clang__
    __m128 __tmp = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0);
#else
    __m128 __tmp = __builtin_ia32_shufps(__a, __a, _MM_SHUFFLE(0,1,2,3));
#endif
    _mm_store_ps(__p, __tmp);
}

/* GCC / Clang specific constants */
#define _MM_HINT_NTA_ALT  0
#define _MM_HINT_T0_ALT   3
#define _MM_HINT_T1_ALT   2
#define _MM_HINT_T2_ALT   1
#define _MM_HINT_ENTA_ALT 4
// These are not supported yet
//#define _MM_HINT_ET0_ALT 7
//#define _MM_HINT_ET1_ALT 6
//#define _MM_HINT_ET2_ALT 5

#define _MM_HINT_MS_TO_ALT(sel) \
    (((sel) == _MM_HINT_NTA) ? _MM_HINT_NTA_ALT : \
     ((sel) == _MM_HINT_T0) ? _MM_HINT_T0_ALT : \
     ((sel) == _MM_HINT_T1) ? _MM_HINT_T1_ALT : \
     ((sel) == _MM_HINT_T2) ? _MM_HINT_T2_ALT : \
     ((sel) == _MM_HINT_ENTA) ? _MM_HINT_ENTA_ALT : 0)
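/* Note: _MM_HINT_MS_TO_ALT translates the MSVC hint encoding above into the
   GCC/Clang encoding, e.g. _MM_HINT_T0 (1) maps to _MM_HINT_T0_ALT (3); the
   non-MSVC _mm_prefetch below splits that value into the rw and locality
   arguments of __builtin_prefetch. */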
#ifdef _MSC_VER
/* On clang-cl we have an intrinsic, but the constants are different */
#pragma intrinsic(_mm_prefetch)
#define _mm_prefetch(p, sel) _mm_prefetch(p, _MM_HINT_MS_TO_ALT(sel))
#else /* _MSC_VER */
#define _mm_prefetch(p, sel) \
    __builtin_prefetch((const void *)(p), (_MM_HINT_MS_TO_ALT(sel) >> 2) & 1, _MM_HINT_MS_TO_ALT(sel) & 0x3)
#endif /* _MSC_VER */

__INTRIN_INLINE_SSE void _mm_stream_pi(__m64 *__p, __m64 __a)
{
#ifdef __clang__
    __builtin_ia32_movntq((__v1di*)__p, __a);
#else
    __builtin_ia32_movntq((long long unsigned int *)__p, (long long unsigned int)__a);
#endif
}

__INTRIN_INLINE_SSE void _mm_stream_ps(float *__p, __m128 __a)
{
#ifdef __clang__
    __builtin_nontemporal_store((__v4sf)__a, (__v4sf*)__p);
#else
    __builtin_ia32_movntps(__p, (__v4sf)__a);
#endif
}

#if !HAS_BUILTIN(_mm_sfence)
__INTRIN_INLINE_SSE void _mm_sfence(void)
{
    __builtin_ia32_sfence();
}
#endif

#ifdef __clang__
#define _m_pextrw(a, n) \
    ((int)__builtin_ia32_vec_ext_v4hi((__v4hi)a, (int)n))
#define _m_pinsrw(a, d, n) \
    ((__m64)__builtin_ia32_vec_set_v4hi((__v4hi)a, (int)d, (int)n))
#else
// _m_pextrw
__INTRIN_INLINE_SSE int _mm_extract_pi16(__m64 const __a, int const __n)
{
    return (unsigned short)__builtin_ia32_vec_ext_v4hi((__v4hi)__a, __n);
}
// _m_pinsrw
__INTRIN_INLINE_SSE __m64 _mm_insert_pi16(__m64 const __a, int const __d, int const __n)
{
    return (__m64)__builtin_ia32_vec_set_v4hi((__v4hi)__a, __d, __n);
}
#endif

// _m_pmaxsw
__INTRIN_INLINE_SSE __m64 _mm_max_pi16(__m64 __a, __m64 __b)
{
    return (__m64)__builtin_ia32_pmaxsw((__v4hi)__a, (__v4hi)__b);
}

// _m_pmaxub
__INTRIN_INLINE_SSE __m64 _mm_max_pu8(__m64 __a, __m64 __b)
{
    return (__m64)__builtin_ia32_pmaxub((__v8qi)__a, (__v8qi)__b);
}

// _m_pminsw
__INTRIN_INLINE_SSE __m64 _mm_min_pi16(__m64 __a, __m64 __b)
{
    return (__m64)__builtin_ia32_pminsw((__v4hi)__a, (__v4hi)__b);
}

// _m_pminub
__INTRIN_INLINE_SSE __m64 _mm_min_pu8(__m64 __a, __m64 __b)
{
    return (__m64)__builtin_ia32_pminub((__v8qi)__a, (__v8qi)__b);
}

// _m_pmovmskb
__INTRIN_INLINE_SSE int _mm_movemask_pi8(__m64 __a)
{
    return __builtin_ia32_pmovmskb((__v8qi)__a);
}

// _m_pmulhuw
__INTRIN_INLINE_SSE __m64 _mm_mulhi_pu16(__m64 __a, __m64 __b)
{
    return (__m64)__builtin_ia32_pmulhuw((__v4hi)__a, (__v4hi)__b);
}

#ifdef __clang__
#define _m_pshufw(a, n) \
    ((__m64)__builtin_ia32_pshufw((__v4hi)(__m64)(a), (n)))
#else
// _m_pshufw
__INTRIN_INLINE_MMX __m64 _mm_shuffle_pi16(__m64 __a, int const __n)
{
    return (__m64)__builtin_ia32_pshufw((__v4hi)__a, __n);
}
#endif

// _m_maskmovq
__INTRIN_INLINE_SSE void _mm_maskmove_si64(__m64 __d, __m64 __n, char *__p)
{
    __builtin_ia32_maskmovq((__v8qi)__d, (__v8qi)__n, __p);
}

// _m_pavgb
__INTRIN_INLINE_SSE __m64 _mm_avg_pu8(__m64 __a, __m64 __b)
{
    return (__m64)__builtin_ia32_pavgb((__v8qi)__a, (__v8qi)__b);
}

// _m_pavgw
__INTRIN_INLINE_SSE __m64 _mm_avg_pu16(__m64 __a, __m64 __b)
{
    return (__m64)__builtin_ia32_pavgw((__v4hi)__a, (__v4hi)__b);
}

// _m_psadbw
__INTRIN_INLINE_SSE __m64 _mm_sad_pu8(__m64 __a, __m64 __b)
{
    return (__m64)__builtin_ia32_psadbw((__v8qi)__a, (__v8qi)__b);
}

#endif /* _MSC_VER */

#ifdef __cplusplus
} // extern "C"
#endif // __cplusplus

#endif /* _INCLUDED_MM2 */