From 318d696f0e2ba61fa4662029fdf400180e98a804 Mon Sep 17 00:00:00 2001 From: Timo Kreuzer Date: Sat, 4 Jun 2022 11:14:42 +0200 Subject: [PATCH] [CRT] Implement xmmintrin.h This includes inline functions for GCC/Clang. Note: the previous GCC definition of __m128 was broken and didn't work. --- sdk/include/crt/mingw32/intrin_x86.h | 9 - sdk/include/crt/xmmintrin.h | 1271 ++++++++++++++++++++++++-- 2 files changed, 1193 insertions(+), 87 deletions(-) diff --git a/sdk/include/crt/mingw32/intrin_x86.h b/sdk/include/crt/mingw32/intrin_x86.h index 888bf795313..1bfae865d77 100644 --- a/sdk/include/crt/mingw32/intrin_x86.h +++ b/sdk/include/crt/mingw32/intrin_x86.h @@ -111,15 +111,6 @@ __INTRIN_INLINE void _mm_lfence(void) } #endif -#if !HAS_BUILTIN(_mm_sfence) -__INTRIN_INLINE void _mm_sfence(void) -{ - _WriteBarrier(); - __asm__ __volatile__("sfence"); - _WriteBarrier(); -} -#endif - #if defined(__x86_64__) && !HAS_BUILTIN(__faststorefence) __INTRIN_INLINE void __faststorefence(void) { diff --git a/sdk/include/crt/xmmintrin.h b/sdk/include/crt/xmmintrin.h index 61ed2c9dcf6..a539f09be3b 100644 --- a/sdk/include/crt/xmmintrin.h +++ b/sdk/include/crt/xmmintrin.h @@ -1,25 +1,42 @@ -/** - * This file has no copyright assigned and is placed in the Public Domain. - * This file is part of the w64 mingw-runtime package. - * No warranty is given; refer to the file DISCLAIMER within this package. +/* + * xmmintrin.h + * + * This file is part of the ReactOS CRT package. + * + * Contributors: + * Timo Kreuzer (timo.kreuzer@reactos.org) + * + * THIS SOFTWARE IS NOT COPYRIGHTED + * + * This source code is offered for use in the public domain. You may + * use, modify or distribute it freely. + * + * This code is distributed in the hope that it will be useful but + * WITHOUT ANY WARRANTY. ALL WARRANTIES, EXPRESS OR IMPLIED ARE HEREBY + * DISCLAIMED. This includes but is not limited to warranties of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+ * */ #pragma once #ifndef _INCLUDED_MM2 #define _INCLUDED_MM2 -#include #include -#ifdef __clang__ +#if defined(_MM2_FUNCTIONALITY) && !defined(_MM_FUNCTIONALITY) +#define _MM_FUNCTIONALITY +#endif -typedef float __v4sf __attribute__((__vector_size__(16))); -typedef signed int __v4si __attribute__((__vector_size__(16))); -typedef unsigned int __v4su __attribute__((__vector_size__(16))); +#if !defined _VCRT_BUILD && !defined _INC_MALLOC +//#include // FIXME: This breaks build +#endif -typedef float __m128 __attribute__((__vector_size__(16), __aligned__(16))); +#ifdef __cplusplus +extern "C" { +#endif -#else /* __clang__ */ +#if defined(_MSC_VER) && !defined(__clang__) typedef union _DECLSPEC_INTRIN_TYPE _CRT_ALIGN(16) __m128 { @@ -34,96 +51,1194 @@ typedef union _DECLSPEC_INTRIN_TYPE _CRT_ALIGN(16) __m128 unsigned __int32 m128_u32[4]; } __m128; -#endif /* __clang__ */ - - -#ifdef __cplusplus -extern "C" { -#endif - -extern __m128 _mm_load_ss(float const*); -extern int _mm_cvt_ss2si(__m128); -__m128 _mm_xor_ps(__m128 a, __m128 b); -__m128 _mm_div_ps(__m128 a, __m128 b); - -#ifdef _MSC_VER -unsigned int _mm_getcsr(void); -#pragma intrinsic(_mm_getcsr) -void _mm_setcsr(unsigned int); -#pragma intrinsic(_mm_setcsr) - -#ifndef __clang__ -#pragma intrinsic(_mm_xor_ps) -#pragma intrinsic(_mm_div_ps) -#else - -/* - * Clang implements these as inline functions in the header instead of real builtins - */ -__forceinline __m128 _mm_xor_ps(__m128 a, __m128 b) -{ - return (__m128)((__v4su)a ^ (__v4su)b); -} - -__forceinline __m128 _mm_div_ps(__m128 a, __m128 b) -{ - return (__m128)((__v4sf)a / (__v4sf)b); -} -#endif /* __clang__ */ +#define __ATTRIBUTE_SSE__ #else /* _MSC_VER */ -#if !defined(__INTRIN_INLINE) -# ifdef __clang__ -# define __ATTRIBUTE_ARTIFICIAL -# else -# define __ATTRIBUTE_ARTIFICIAL __attribute__((artificial)) -# endif -# define __INTRIN_INLINE extern __inline__ __attribute__((__always_inline__,__gnu_inline__)) __ATTRIBUTE_ARTIFICIAL -#endif /* !__INTRIN_INLINE */ + typedef float __v4sf __attribute__((__vector_size__(16))); + typedef signed int __v4si __attribute__((__vector_size__(16))); + typedef unsigned int __v4su __attribute__((__vector_size__(16))); + typedef float __m128_u __attribute__((__vector_size__(16), __aligned__(1))); + + typedef float __m128 __attribute__((__vector_size__(16), __aligned__(16))); -#ifndef HAS_BUILTIN #ifdef __clang__ -#define HAS_BUILTIN(x) __has_builtin(x) +#define __ATTRIBUTE_SSE__ __attribute__((__target__("sse"),__min_vector_width__(128))) #else -#define HAS_BUILTIN(x) 0 +#define __ATTRIBUTE_SSE__ __attribute__((__target__("sse"))) #endif +#define __INTRIN_INLINE_SSE __INTRIN_INLINE __ATTRIBUTE_SSE__ + +#endif /* _MSC_VER */ + +#define _MM_ALIGN16 _VCRT_ALIGN(16) + +/* Constants for use with _mm_prefetch. */ +#define _MM_HINT_NTA 0 +#define _MM_HINT_T0 1 +#define _MM_HINT_T1 2 +#define _MM_HINT_T2 3 +#define _MM_HINT_ENTA 4 +#if 0 // Not supported yet +#define _MM_HINT_ET0 5 +#define _MM_HINT_ET1 6 +#define _MM_HINT_ET2 7 #endif +/* Create a selector for use with the SHUFPS instruction. */ +#define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \ + (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0)) + +/* Bits in the MXCSR. 
*/ +#define _MM_EXCEPT_MASK 0x003f +#define _MM_EXCEPT_INVALID 0x0001 +#define _MM_EXCEPT_DENORM 0x0002 +#define _MM_EXCEPT_DIV_ZERO 0x0004 +#define _MM_EXCEPT_OVERFLOW 0x0008 +#define _MM_EXCEPT_UNDERFLOW 0x0010 +#define _MM_EXCEPT_INEXACT 0x0020 + +#define _MM_MASK_MASK 0x1f80 +#define _MM_MASK_INVALID 0x0080 +#define _MM_MASK_DENORM 0x0100 +#define _MM_MASK_DIV_ZERO 0x0200 +#define _MM_MASK_OVERFLOW 0x0400 +#define _MM_MASK_UNDERFLOW 0x0800 +#define _MM_MASK_INEXACT 0x1000 + +#define _MM_ROUND_MASK 0x6000 +#define _MM_ROUND_NEAREST 0x0000 +#define _MM_ROUND_DOWN 0x2000 +#define _MM_ROUND_UP 0x4000 +#define _MM_ROUND_TOWARD_ZERO 0x6000 + +#define _MM_FLUSH_ZERO_MASK 0x8000 +#define _MM_FLUSH_ZERO_ON 0x8000 +#define _MM_FLUSH_ZERO_OFF 0x0000 + +#ifdef __ICL +void* __cdecl _mm_malloc(size_t Size, size_t Al); +void __cdecl _mm_free(void* P); +#endif + +void _mm_prefetch(_In_ char const* p, _In_ int i); +__m128 _mm_setzero_ps(void); +__m128 _mm_add_ss(__m128 a, __m128 b); +__m128 _mm_sub_ss(__m128 a, __m128 b); +__m128 _mm_mul_ss(__m128 a, __m128 b); +__m128 _mm_div_ss(__m128 a, __m128 b); +__m128 _mm_sqrt_ss(__m128 a); +__m128 _mm_rcp_ss(__m128 a); +__m128 _mm_rsqrt_ss(__m128 a); +__m128 _mm_min_ss(__m128 a, __m128 b); +__m128 _mm_max_ss(__m128 a, __m128 b); +__m128 _mm_add_ps(__m128 a, __m128 b); +__m128 _mm_sub_ps(__m128 a, __m128 b); +__m128 _mm_mul_ps(__m128 a, __m128 b); +__m128 _mm_div_ps(__m128 a, __m128 b); +__m128 _mm_sqrt_ps(__m128 a); +__m128 _mm_rcp_ps(__m128 a); +__m128 _mm_rsqrt_ps(__m128 a); +__m128 _mm_min_ps(__m128 a, __m128 b); +__m128 _mm_max_ps(__m128 a, __m128 b); +__m128 _mm_and_ps(__m128 a, __m128 b); +__m128 _mm_andnot_ps(__m128 a, __m128 b); +__m128 _mm_or_ps(__m128 a, __m128 b); +__m128 _mm_xor_ps(__m128 a, __m128 b); +__m128 _mm_cmpeq_ss(__m128 a, __m128 b); +__m128 _mm_cmplt_ss(__m128 a, __m128 b); +__m128 _mm_cmple_ss(__m128 a, __m128 b); +__m128 _mm_cmpgt_ss(__m128 a, __m128 b); +__m128 _mm_cmpge_ss(__m128 a, __m128 b); +__m128 _mm_cmpneq_ss(__m128 a, __m128 b); +__m128 _mm_cmpnlt_ss(__m128 a, __m128 b); +__m128 _mm_cmpnle_ss(__m128 a, __m128 b); +__m128 _mm_cmpngt_ss(__m128 a, __m128 b); +__m128 _mm_cmpnge_ss(__m128 a, __m128 b); +__m128 _mm_cmpord_ss(__m128 a, __m128 b); +__m128 _mm_cmpunord_ss(__m128 a, __m128 b); +__m128 _mm_cmpeq_ps(__m128 a, __m128 b); +__m128 _mm_cmplt_ps(__m128 a, __m128 b); +__m128 _mm_cmple_ps(__m128 a, __m128 b); +__m128 _mm_cmpgt_ps(__m128 a, __m128 b); +__m128 _mm_cmpge_ps(__m128 a, __m128 b); +__m128 _mm_cmpneq_ps(__m128 a, __m128 b); +__m128 _mm_cmpnlt_ps(__m128 a, __m128 b); +__m128 _mm_cmpnle_ps(__m128 a, __m128 b); +__m128 _mm_cmpngt_ps(__m128 a, __m128 b); +__m128 _mm_cmpnge_ps(__m128 a, __m128 b); +__m128 _mm_cmpord_ps(__m128 a, __m128 b); +__m128 _mm_cmpunord_ps(__m128 a, __m128 b); +int _mm_comieq_ss(__m128 a, __m128 b); +int _mm_comilt_ss(__m128 a, __m128 b); +int _mm_comile_ss(__m128 a, __m128 b); +int _mm_comigt_ss(__m128 a, __m128 b); +int _mm_comige_ss(__m128 a, __m128 b); +int _mm_comineq_ss(__m128 a, __m128 b); +int _mm_ucomieq_ss(__m128 a, __m128 b); +int _mm_ucomilt_ss(__m128 a, __m128 b); +int _mm_ucomile_ss(__m128 a, __m128 b); +int _mm_ucomigt_ss(__m128 a, __m128 b); +int _mm_ucomige_ss(__m128 a, __m128 b); +int _mm_ucomineq_ss(__m128 a, __m128 b); +int _mm_cvt_ss2si(__m128 a); +int _mm_cvtt_ss2si(__m128 a); +__m128 _mm_cvt_si2ss(__m128 a, int b); +#ifdef _M_IX86 +__m64 _mm_cvt_ps2pi(__m128 a); +__m64 _mm_cvtt_ps2pi(__m128 a); +__m128 _mm_cvt_pi2ps(__m128 a, __m64 b); +#endif +__m128 _mm_shuffle_ps(__m128 a, 
__m128 b, unsigned int imm8);
+__m128 _mm_unpackhi_ps(__m128 a, __m128 b);
+__m128 _mm_unpacklo_ps(__m128 a, __m128 b);
+__m128 _mm_loadh_pi(__m128 a, __m64 const* p);
+void _mm_storeh_pi(__m64* p, __m128 a);
+__m128 _mm_movehl_ps(__m128 a, __m128 b);
+__m128 _mm_movelh_ps(__m128 a, __m128 b);
+__m128 _mm_loadl_pi(__m128 a, __m64 const* p);
+void _mm_storel_pi(__m64* p, __m128 a);
+int _mm_movemask_ps(__m128 a);
+unsigned int _mm_getcsr(void);
+void _mm_setcsr(unsigned int a);
+__m128 _mm_set_ss(float a);
+__m128 _mm_set_ps1(float a);
+__m128 _mm_load_ss(float const* p);
+__m128 _mm_load_ps1(float const* p);
+__m128 _mm_load_ps(float const* p);
+__m128 _mm_loadu_ps(float const* p);
+__m128 _mm_loadr_ps(float const* p);
+__m128 _mm_set_ps(float e3, float e2, float e1, float e0);
+__m128 _mm_setr_ps(float e3, float e2, float e1, float e0);
+void _mm_store_ss(float* p, __m128 a);
+float _mm_cvtss_f32(__m128 a);
+void _mm_store_ps(float* p, __m128 a);
+void _mm_storeu_ps(float* p, __m128 a);
+void _mm_store_ps1(float* p, __m128 a);
+void _mm_storer_ps(float* p, __m128 a);
+__m128 _mm_move_ss(__m128 a, __m128 b);
+#ifdef _M_IX86
+int _m_pextrw(__m64 a, int imm8);
+__m64 _m_pinsrw(__m64 a, int i, int imm8);
+__m64 _m_pmaxsw(__m64 a, __m64 b);
+__m64 _m_pmaxub(__m64 a, __m64 b);
+__m64 _m_pminsw(__m64 a, __m64 b);
+__m64 _m_pminub(__m64 a, __m64 b);
+int _m_pmovmskb(__m64 a);
+__m64 _m_pmulhuw(__m64 a, __m64 b);
+__m64 _m_pshufw(__m64 a, int imm8);
+void _m_maskmovq(__m64 a, __m64 b, char*);
+__m64 _m_pavgb(__m64 a, __m64 b);
+__m64 _m_pavgw(__m64 a, __m64 b);
+__m64 _m_psadbw(__m64 a, __m64 b);
+void _mm_stream_pi(__m64* p, __m64 a);
+#endif
+void _mm_stream_ps(float* p, __m128 a);
+void _mm_sfence(void);
+#ifdef _M_AMD64
+__int64 _mm_cvtss_si64(__m128 a);
+__int64 _mm_cvttss_si64(__m128 a);
+__m128 _mm_cvtsi64_ss(__m128 a, __int64 b);
+#endif
+
+/* Alternate names */
+#define _mm_cvtss_si32 _mm_cvt_ss2si
+#define _mm_cvttss_si32 _mm_cvtt_ss2si
+#define _mm_cvtsi32_ss _mm_cvt_si2ss
+#define _mm_set1_ps _mm_set_ps1
+#define _mm_load1_ps _mm_load_ps1
+#define _mm_store1_ps _mm_store_ps1
+#define _mm_cvtps_pi32 _mm_cvt_ps2pi
+#define _mm_cvttps_pi32 _mm_cvtt_ps2pi
+#define _mm_cvtpi32_ps _mm_cvt_pi2ps
+#define _mm_extract_pi16 _m_pextrw
+#define _mm_insert_pi16 _m_pinsrw
+#define _mm_max_pi16 _m_pmaxsw
+#define _mm_max_pu8 _m_pmaxub
+#define _mm_min_pi16 _m_pminsw
+#define _mm_min_pu8 _m_pminub
+#define _mm_movemask_pi8 _m_pmovmskb
+#define _mm_mulhi_pu16 _m_pmulhuw
+#define _mm_shuffle_pi16 _m_pshufw
+#define _mm_maskmove_si64 _m_maskmovq
+#define _mm_avg_pu8 _m_pavgb
+#define _mm_avg_pu16 _m_pavgw
+#define _mm_sad_pu8 _m_psadbw
+
+#ifdef _M_IX86
+/* Inline functions from Clang: https://github.com/llvm/llvm-project/blob/main/clang/lib/Headers/xmmintrin.h */
+
+__ATTRIBUTE_SSE__
+static __inline __m128 _mm_cvtpi16_ps(__m64 __a)
+{
+    __m64 __b, __c;
+    __m128 __r;
+
+    __b = _mm_setzero_si64();
+    __b = _mm_cmpgt_pi16(__b, __a);
+    __c = _mm_unpackhi_pi16(__a, __b);
+    __r = _mm_setzero_ps();
+    __r = _mm_cvtpi32_ps(__r, __c);
+    __r = _mm_movelh_ps(__r, __r);
+    __c = _mm_unpacklo_pi16(__a, __b);
+    __r = _mm_cvtpi32_ps(__r, __c);
+
+    return __r;
+}
+
+__ATTRIBUTE_SSE__
+static __inline __m128 _mm_cvtpu16_ps(__m64 __a)
+{
+    __m64 __b, __c;
+    __m128 __r;
+
+    __b = _mm_setzero_si64();
+    __c = _mm_unpackhi_pi16(__a, __b);
+    __r = _mm_setzero_ps();
+    __r = _mm_cvtpi32_ps(__r, __c);
+    __r = _mm_movelh_ps(__r, __r);
+    __c = _mm_unpacklo_pi16(__a, __b);
+    __r = _mm_cvtpi32_ps(__r, __c);
+
+    return __r;
+}
+
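+/*
+ * Note on the conversion helpers above and below (same pattern as the Clang
+ * header they are taken from): _mm_cmpgt_pi16(0, a) builds a mask of the sign
+ * bits (all zero for the unsigned variants), which is interleaved with the
+ * source so that each 16-bit lane is sign- or zero-extended to 32 bits before
+ * _mm_cvtpi32_ps converts two lanes at a time into the result vector.
+ */
+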
+__ATTRIBUTE_SSE__ +static __inline __m128 _mm_cvtpi8_ps(__m64 __a) +{ + __m64 __b; + + __b = _mm_setzero_si64(); + __b = _mm_cmpgt_pi8(__b, __a); + __b = _mm_unpacklo_pi8(__a, __b); + + return _mm_cvtpi16_ps(__b); +} + +__ATTRIBUTE_SSE__ +static __inline __m128 _mm_cvtpu8_ps(__m64 __a) +{ + __m64 __b; + + __b = _mm_setzero_si64(); + __b = _mm_unpacklo_pi8(__a, __b); + + return _mm_cvtpi16_ps(__b); +} + +__ATTRIBUTE_SSE__ +static __inline __m128 _mm_cvtpi32x2_ps(__m64 __a, __m64 __b) +{ + __m128 __c; + + __c = _mm_setzero_ps(); + __c = _mm_cvtpi32_ps(__c, __b); + __c = _mm_movelh_ps(__c, __c); + + return _mm_cvtpi32_ps(__c, __a); +} + +__ATTRIBUTE_SSE__ +static __inline __m64 _mm_cvtps_pi16(__m128 __a) +{ + __m64 __b, __c; + + __b = _mm_cvtps_pi32(__a); + __a = _mm_movehl_ps(__a, __a); + __c = _mm_cvtps_pi32(__a); + + return _mm_packs_pi32(__b, __c); +} + +__ATTRIBUTE_SSE__ +static __inline __m64 _mm_cvtps_pi8(__m128 __a) +{ + __m64 __b, __c; + + __b = _mm_cvtps_pi16(__a); + __c = _mm_setzero_si64(); + + return _mm_packs_pi16(__b, __c); +} + +#endif /* _M_IX86 */ + +/* Transpose the 4x4 matrix composed of row[0-3]. */ +#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \ +do { \ + __m128 t0 = _mm_unpacklo_ps(row0, row1); \ + __m128 t1 = _mm_unpacklo_ps(row2, row3); \ + __m128 t2 = _mm_unpackhi_ps(row0, row1); \ + __m128 t3 = _mm_unpackhi_ps(row2, row3); \ + (row0) = _mm_movelh_ps(t0, t1); \ + (row1) = _mm_movehl_ps(t1, t0); \ + (row2) = _mm_movelh_ps(t2, t3); \ + (row3) = _mm_movehl_ps(t3, t2); \ +} while (0) + +#define _MM_GET_EXCEPTION_STATE() \ + (_mm_getcsr() & _MM_EXCEPT_MASK) + +#define _MM_GET_EXCEPTION_MASK() \ + (_mm_getcsr() & _MM_MASK_MASK) + +#define _MM_GET_ROUNDING_MODE() \ + (_mm_getcsr() & _MM_ROUND_MASK) + +#define _MM_GET_FLUSH_ZERO_MODE() \ + (_mm_getcsr() & _MM_FLUSH_ZERO_MASK) + +#define _MM_SET_EXCEPTION_STATE(__mask) \ + _mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | (__mask)) + +#define _MM_SET_EXCEPTION_MASK(__mask) \ + _mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | (__mask)) + +#define _MM_SET_ROUNDING_MODE(__mode) \ + _mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | (__mode)) + +#define _MM_SET_FLUSH_ZERO_MODE(__mode) \ + _mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | (__mode)) + +/* Use intrinsics on MSVC */ +#if defined(_MSC_VER) && !defined(__clang__) +#pragma intrinsic(_mm_prefetch) +#pragma intrinsic(_mm_setzero_ps) +#pragma intrinsic(_mm_add_ss) +#pragma intrinsic(_mm_sub_ss) +#pragma intrinsic(_mm_mul_ss) +#pragma intrinsic(_mm_div_ss) +#pragma intrinsic(_mm_sqrt_ss) +#pragma intrinsic(_mm_rcp_ss) +#pragma intrinsic(_mm_rsqrt_ss) +#pragma intrinsic(_mm_min_ss) +#pragma intrinsic(_mm_max_ss) +#pragma intrinsic(_mm_add_ps) +#pragma intrinsic(_mm_sub_ps) +#pragma intrinsic(_mm_mul_ps) +#pragma intrinsic(_mm_div_ps) +#pragma intrinsic(_mm_sqrt_ps) +#pragma intrinsic(_mm_rcp_ps) +#pragma intrinsic(_mm_rsqrt_ps) +#pragma intrinsic(_mm_min_ps) +#pragma intrinsic(_mm_max_ps) +#pragma intrinsic(_mm_and_ps) +#pragma intrinsic(_mm_andnot_ps) +#pragma intrinsic(_mm_or_ps) +#pragma intrinsic(_mm_xor_ps) +#pragma intrinsic(_mm_cmpeq_ss) +#pragma intrinsic(_mm_cmplt_ss) +#pragma intrinsic(_mm_cmple_ss) +#pragma intrinsic(_mm_cmpgt_ss) +#pragma intrinsic(_mm_cmpge_ss) +#pragma intrinsic(_mm_cmpneq_ss) +#pragma intrinsic(_mm_cmpnlt_ss) +#pragma intrinsic(_mm_cmpnle_ss) +#pragma intrinsic(_mm_cmpngt_ss) +#pragma intrinsic(_mm_cmpnge_ss) +#pragma intrinsic(_mm_cmpord_ss) +#pragma intrinsic(_mm_cmpunord_ss) +#pragma intrinsic(_mm_cmpeq_ps) +#pragma intrinsic(_mm_cmplt_ps) 
+#pragma intrinsic(_mm_cmple_ps) +#pragma intrinsic(_mm_cmpgt_ps) +#pragma intrinsic(_mm_cmpge_ps) +#pragma intrinsic(_mm_cmpneq_ps) +#pragma intrinsic(_mm_cmpnlt_ps) +#pragma intrinsic(_mm_cmpnle_ps) +#pragma intrinsic(_mm_cmpngt_ps) +#pragma intrinsic(_mm_cmpnge_ps) +#pragma intrinsic(_mm_cmpord_ps) +#pragma intrinsic(_mm_cmpunord_ps) +#pragma intrinsic(_mm_comieq_ss) +#pragma intrinsic(_mm_comilt_ss) +#pragma intrinsic(_mm_comile_ss) +#pragma intrinsic(_mm_comigt_ss) +#pragma intrinsic(_mm_comige_ss) +#pragma intrinsic(_mm_comineq_ss) +#pragma intrinsic(_mm_ucomieq_ss) +#pragma intrinsic(_mm_ucomilt_ss) +#pragma intrinsic(_mm_ucomile_ss) +#pragma intrinsic(_mm_ucomigt_ss) +#pragma intrinsic(_mm_ucomige_ss) +#pragma intrinsic(_mm_ucomineq_ss) +#pragma intrinsic(_mm_cvt_ss2si) +#pragma intrinsic(_mm_cvtt_ss2si) +#pragma intrinsic(_mm_cvt_si2ss) +#ifdef _M_IX86 +#pragma intrinsic(_mm_cvt_ps2pi) +#pragma intrinsic(_mm_cvtt_ps2pi) +#pragma intrinsic(_mm_cvt_pi2ps) +#endif // _M_IX86 +#pragma intrinsic(_mm_shuffle_ps) +#pragma intrinsic(_mm_unpackhi_ps) +#pragma intrinsic(_mm_unpacklo_ps) +#pragma intrinsic(_mm_loadh_pi) +#pragma intrinsic(_mm_storeh_pi) +#pragma intrinsic(_mm_movehl_ps) +#pragma intrinsic(_mm_movelh_ps) +#pragma intrinsic(_mm_loadl_pi) +#pragma intrinsic(_mm_storel_pi) +#pragma intrinsic(_mm_movemask_ps) +#pragma intrinsic(_mm_getcsr) +#pragma intrinsic(_mm_setcsr) +#pragma intrinsic(_mm_set_ss) +#pragma intrinsic(_mm_set_ps1) +#pragma intrinsic(_mm_load_ss) +#pragma intrinsic(_mm_load_ps1) +#pragma intrinsic(_mm_load_ps) +#pragma intrinsic(_mm_loadu_ps) +#pragma intrinsic(_mm_loadr_ps) +#pragma intrinsic(_mm_set_ps) +#pragma intrinsic(_mm_setr_ps) +#pragma intrinsic(_mm_store_ss) +#pragma intrinsic(_mm_cvtss_f32) +#pragma intrinsic(_mm_store_ps) +#pragma intrinsic(_mm_storeu_ps) +#pragma intrinsic(_mm_store_ps1) +#pragma intrinsic(_mm_storer_ps) +#pragma intrinsic(_mm_move_ss) +#ifdef _M_IX86 +#pragma intrinsic(_m_pextrw) +#pragma intrinsic(_m_pinsrw) +#pragma intrinsic(_m_pmaxsw) +#pragma intrinsic(_m_pmaxub) +#pragma intrinsic(_m_pminsw) +#pragma intrinsic(_m_pminub) +#pragma intrinsic(_m_pmovmskb) +#pragma intrinsic(_m_pmulhuw) +#pragma intrinsic(_m_pshufw) +#pragma intrinsic(_m_maskmovq) +#pragma intrinsic(_m_pavgb) +#pragma intrinsic(_m_pavgw) +#pragma intrinsic(_m_psadbw) +#pragma intrinsic(_mm_stream_pi) +#endif // _M_IX86 +#pragma intrinsic(_mm_stream_ps) +#pragma intrinsic(_mm_sfence) +#ifdef _M_AMD64 +#pragma intrinsic(_mm_cvtss_si64) +#pragma intrinsic(_mm_cvttss_si64) +#pragma intrinsic(_mm_cvtsi64_ss) +#endif // _M_AMD64 + +#else /* _MSC_VER */ + /* - * We can't use __builtin_ia32_* functions, - * are they are only available with the -msse2 compiler switch - */ + GCC: https://github.com/gcc-mirror/gcc/blob/master/gcc/config/i386/xmmintrin.h + Clang: https://github.com/llvm/llvm-project/blob/main/clang/lib/Headers/xmmintrin.h +*/ + +/* Use inline functions on GCC/Clang */ + #if !HAS_BUILTIN(_mm_getcsr) -__INTRIN_INLINE unsigned int _mm_getcsr(void) +__INTRIN_INLINE_SSE unsigned int _mm_getcsr(void) { - unsigned int retval; - __asm__ __volatile__("stmxcsr %0" : "=m"(retval)); - return retval; + return __builtin_ia32_stmxcsr(); } #endif #if !HAS_BUILTIN(_mm_setcsr) -__INTRIN_INLINE void _mm_setcsr(unsigned int val) +__INTRIN_INLINE_SSE void _mm_setcsr(unsigned int a) { - __asm__ __volatile__("ldmxcsr %0" : : "m"(val)); + __builtin_ia32_ldmxcsr(a); } #endif + +__INTRIN_INLINE_SSE __m128 _mm_add_ss(__m128 __a, __m128 __b) +{ + __a[0] += __b[0]; + return __a; +} + 
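+/*
+ * The scalar (_ss) helpers such as _mm_add_ss above operate on element 0 only,
+ * using the GCC/Clang vector subscript extension, while the packed (_ps)
+ * variants below use whole-vector arithmetic on __v4sf. Both forms compile to
+ * the corresponding SSE instructions because of the __ATTRIBUTE_SSE__ target
+ * attribute, even when the translation unit is not built with -msse.
+ */
+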
+__INTRIN_INLINE_SSE __m128 _mm_add_ps(__m128 __a, __m128 __b) +{ + return (__m128)((__v4sf)__a + (__v4sf)__b); +} + +__INTRIN_INLINE_SSE __m128 _mm_sub_ss(__m128 __a, __m128 __b) +{ + __a[0] -= __b[0]; + return __a; +} + +__INTRIN_INLINE_SSE __m128 _mm_sub_ps(__m128 __a, __m128 __b) +{ + return (__m128)((__v4sf)__a - (__v4sf)__b); +} + +__INTRIN_INLINE_SSE __m128 _mm_mul_ss(__m128 __a, __m128 __b) +{ + __a[0] *= __b[0]; + return __a; +} + +__INTRIN_INLINE_SSE __m128 _mm_mul_ps(__m128 __a, __m128 __b) +{ + return (__m128)((__v4sf)__a * (__v4sf)__b); +} + +__INTRIN_INLINE_SSE __m128 _mm_div_ss(__m128 __a, __m128 __b) +{ + __a[0] /= __b[0]; + return __a; +} + +__INTRIN_INLINE_SSE __m128 _mm_div_ps(__m128 __a, __m128 __b) +{ + return (__m128)((__v4sf)__a / (__v4sf)__b); +} + +__INTRIN_INLINE_SSE __m128 _mm_sqrt_ss(__m128 __a) +{ + return (__m128)__builtin_ia32_sqrtss((__v4sf)__a); +} + +__INTRIN_INLINE_SSE __m128 _mm_sqrt_ps(__m128 __a) +{ + return __builtin_ia32_sqrtps((__v4sf)__a); +} + +__INTRIN_INLINE_SSE __m128 _mm_rcp_ss(__m128 __a) +{ + return (__m128)__builtin_ia32_rcpss((__v4sf)__a); +} + +__INTRIN_INLINE_SSE __m128 _mm_rcp_ps(__m128 __a) +{ + return (__m128)__builtin_ia32_rcpps((__v4sf)__a); +} + +__INTRIN_INLINE_SSE __m128 _mm_rsqrt_ss(__m128 __a) +{ + return __builtin_ia32_rsqrtss((__v4sf)__a); +} + +__INTRIN_INLINE_SSE __m128 _mm_rsqrt_ps(__m128 __a) +{ + return __builtin_ia32_rsqrtps((__v4sf)__a); +} + +__INTRIN_INLINE_SSE __m128 _mm_min_ss(__m128 __a, __m128 __b) +{ + return __builtin_ia32_minss((__v4sf)__a, (__v4sf)__b); +} + +__INTRIN_INLINE_SSE __m128 _mm_min_ps(__m128 __a, __m128 __b) +{ + return __builtin_ia32_minps((__v4sf)__a, (__v4sf)__b); +} + +__INTRIN_INLINE_SSE __m128 _mm_max_ss(__m128 __a, __m128 __b) +{ + return __builtin_ia32_maxss((__v4sf)__a, (__v4sf)__b); +} + +__INTRIN_INLINE_SSE __m128 _mm_max_ps(__m128 __a, __m128 __b) +{ + return __builtin_ia32_maxps((__v4sf)__a, (__v4sf)__b); +} + +__INTRIN_INLINE_SSE __m128 _mm_and_ps(__m128 __a, __m128 __b) +{ + return (__m128)((__v4su)__a & (__v4su)__b); +} + +__INTRIN_INLINE_SSE __m128 _mm_andnot_ps(__m128 __a, __m128 __b) +{ + return (__m128)(~(__v4su)__a & (__v4su)__b); +} + +__INTRIN_INLINE_SSE __m128 _mm_or_ps(__m128 __a, __m128 __b) +{ + return (__m128)((__v4su)__a | (__v4su)__b); +} + +__INTRIN_INLINE_SSE __m128 _mm_xor_ps(__m128 __a, __m128 __b) +{ + return (__m128)((__v4su)__a ^ (__v4su)__b); +} + +__INTRIN_INLINE_SSE __m128 _mm_cmpeq_ss(__m128 __a, __m128 __b) +{ + return (__m128)__builtin_ia32_cmpeqss((__v4sf)__a, (__v4sf)__b); +} + +__INTRIN_INLINE_SSE __m128 _mm_cmpeq_ps(__m128 __a, __m128 __b) +{ + return (__m128)__builtin_ia32_cmpeqps((__v4sf)__a, (__v4sf)__b); +} + +__INTRIN_INLINE_SSE __m128 _mm_cmplt_ss(__m128 __a, __m128 __b) +{ + return (__m128)__builtin_ia32_cmpltss((__v4sf)__a, (__v4sf)__b); +} + +__INTRIN_INLINE_SSE __m128 _mm_cmplt_ps(__m128 __a, __m128 __b) +{ + return (__m128)__builtin_ia32_cmpltps((__v4sf)__a, (__v4sf)__b); +} + +__INTRIN_INLINE_SSE __m128 _mm_cmple_ss(__m128 __a, __m128 __b) +{ + return (__m128)__builtin_ia32_cmpless((__v4sf)__a, (__v4sf)__b); +} + +__INTRIN_INLINE_SSE __m128 _mm_cmple_ps(__m128 __a, __m128 __b) +{ + return (__m128)__builtin_ia32_cmpleps((__v4sf)__a, (__v4sf)__b); +} + +__INTRIN_INLINE_SSE __m128 _mm_cmpgt_ss(__m128 __a, __m128 __b) +{ + __v4sf temp = __builtin_ia32_cmpltss((__v4sf)__b, (__v4sf)__a); +#ifdef __clang__ + return (__m128)__builtin_shufflevector((__v4sf)__a, temp, 4, 1, 2, 3); +#else + return (__m128)__builtin_ia32_movss((__v4sf)__a, temp); 
+#endif +} + +__INTRIN_INLINE_SSE __m128 _mm_cmpgt_ps(__m128 __a, __m128 __b) +{ + return (__m128)__builtin_ia32_cmpltps((__v4sf)__b, (__v4sf)__a); +} + +__INTRIN_INLINE_SSE __m128 _mm_cmpge_ss(__m128 __a, __m128 __b) +{ + __v4sf temp = __builtin_ia32_cmpless((__v4sf)__b, (__v4sf)__a); +#ifdef __clang__ + return (__m128)__builtin_shufflevector((__v4sf)__a, temp, 4, 1, 2, 3); +#else + return (__m128)__builtin_ia32_movss((__v4sf)__a, temp); +#endif +} + +__INTRIN_INLINE_SSE __m128 _mm_cmpge_ps(__m128 __a, __m128 __b) +{ + return (__m128)__builtin_ia32_cmpleps((__v4sf)__b, (__v4sf)__a); +} + +__INTRIN_INLINE_SSE __m128 _mm_cmpneq_ss(__m128 __a, __m128 __b) +{ + return (__m128)__builtin_ia32_cmpneqss((__v4sf)__a, (__v4sf)__b); +} + +__INTRIN_INLINE_SSE __m128 _mm_cmpneq_ps(__m128 __a, __m128 __b) +{ + return (__m128)__builtin_ia32_cmpneqps((__v4sf)__a, (__v4sf)__b); +} + +__INTRIN_INLINE_SSE __m128 _mm_cmpnlt_ss(__m128 __a, __m128 __b) +{ + return (__m128)__builtin_ia32_cmpnltss((__v4sf)__a, (__v4sf)__b); +} + +__INTRIN_INLINE_SSE __m128 _mm_cmpnlt_ps(__m128 __a, __m128 __b) +{ + return (__m128)__builtin_ia32_cmpnltps((__v4sf)__a, (__v4sf)__b); +} + +__INTRIN_INLINE_SSE __m128 _mm_cmpnle_ss(__m128 __a, __m128 __b) +{ + return (__m128)__builtin_ia32_cmpnless((__v4sf)__a, (__v4sf)__b); +} + +__INTRIN_INLINE_SSE __m128 _mm_cmpnle_ps(__m128 __a, __m128 __b) +{ + return (__m128)__builtin_ia32_cmpnleps((__v4sf)__a, (__v4sf)__b); +} + +__INTRIN_INLINE_SSE __m128 _mm_cmpngt_ss(__m128 __a, __m128 __b) +{ + __v4sf temp = __builtin_ia32_cmpnltss((__v4sf)__b, (__v4sf)__a); +#ifdef __clang__ + return (__m128)__builtin_shufflevector((__v4sf)__a, temp, 4, 1, 2, 3); +#else + return (__m128)__builtin_ia32_movss((__v4sf)__a, temp); +#endif +} + +__INTRIN_INLINE_SSE __m128 _mm_cmpngt_ps(__m128 __a, __m128 __b) +{ + return (__m128)__builtin_ia32_cmpnltps((__v4sf)__b, (__v4sf)__a); +} + +__INTRIN_INLINE_SSE __m128 _mm_cmpnge_ss(__m128 __a, __m128 __b) +{ + __v4sf temp = (__v4sf)__builtin_ia32_cmpnless((__v4sf)__b, (__v4sf)__a); +#ifdef __clang__ + return (__m128)__builtin_shufflevector((__v4sf)__a, temp, 4, 1, 2, 3); +#else + return (__m128)__builtin_ia32_movss((__v4sf)__a, temp); +#endif +} + +__INTRIN_INLINE_SSE __m128 _mm_cmpnge_ps(__m128 __a, __m128 __b) +{ + return (__m128)__builtin_ia32_cmpnleps((__v4sf)__b, (__v4sf)__a); +} + +__INTRIN_INLINE_SSE __m128 _mm_cmpord_ss(__m128 __a, __m128 __b) +{ + return (__m128)__builtin_ia32_cmpordss((__v4sf)__a, (__v4sf)__b); +} + +__INTRIN_INLINE_SSE __m128 _mm_cmpord_ps(__m128 __a, __m128 __b) +{ + return (__m128)__builtin_ia32_cmpordps((__v4sf)__a, (__v4sf)__b); +} + +__INTRIN_INLINE_SSE __m128 _mm_cmpunord_ss(__m128 __a, __m128 __b) +{ + return (__m128)__builtin_ia32_cmpunordss((__v4sf)__a, (__v4sf)__b); +} + +__INTRIN_INLINE_SSE __m128 _mm_cmpunord_ps(__m128 __a, __m128 __b) +{ + return (__m128)__builtin_ia32_cmpunordps((__v4sf)__a, (__v4sf)__b); +} + +__INTRIN_INLINE_SSE int _mm_comieq_ss(__m128 __a, __m128 __b) +{ + return __builtin_ia32_comieq((__v4sf)__a, (__v4sf)__b); +} + +__INTRIN_INLINE_SSE int _mm_comilt_ss(__m128 __a, __m128 __b) +{ + return __builtin_ia32_comilt((__v4sf)__a, (__v4sf)__b); +} + +__INTRIN_INLINE_SSE int _mm_comile_ss(__m128 __a, __m128 __b) +{ + return __builtin_ia32_comile((__v4sf)__a, (__v4sf)__b); +} + +__INTRIN_INLINE_SSE int _mm_comigt_ss(__m128 __a, __m128 __b) +{ + return __builtin_ia32_comigt((__v4sf)__a, (__v4sf)__b); +} + +__INTRIN_INLINE_SSE int _mm_comige_ss(__m128 __a, __m128 __b) +{ + return __builtin_ia32_comige((__v4sf)__a, 
(__v4sf)__b); +} + +__INTRIN_INLINE_SSE int _mm_comineq_ss(__m128 __a, __m128 __b) +{ + return __builtin_ia32_comineq((__v4sf)__a, (__v4sf)__b); +} + +__INTRIN_INLINE_SSE int _mm_ucomieq_ss(__m128 __a, __m128 __b) +{ + return __builtin_ia32_ucomieq((__v4sf)__a, (__v4sf)__b); +} + +__INTRIN_INLINE_SSE int _mm_ucomilt_ss(__m128 __a, __m128 __b) +{ + return __builtin_ia32_ucomilt((__v4sf)__a, (__v4sf)__b); +} + +__INTRIN_INLINE_SSE int _mm_ucomile_ss(__m128 __a, __m128 __b) +{ + return __builtin_ia32_ucomile((__v4sf)__a, (__v4sf)__b); +} + +__INTRIN_INLINE_SSE int _mm_ucomigt_ss(__m128 __a, __m128 __b) +{ + return __builtin_ia32_ucomigt((__v4sf)__a, (__v4sf)__b); +} + +__INTRIN_INLINE_SSE int _mm_ucomige_ss(__m128 __a, __m128 __b) +{ + return __builtin_ia32_ucomige((__v4sf)__a, (__v4sf)__b); +} + +__INTRIN_INLINE_SSE int _mm_ucomineq_ss(__m128 __a, __m128 __b) +{ + return __builtin_ia32_ucomineq((__v4sf)__a, (__v4sf)__b); +} + +// _mm_cvt_ss2si +__INTRIN_INLINE_SSE int _mm_cvtss_si32(__m128 __a) +{ + return __builtin_ia32_cvtss2si((__v4sf)__a); +} + +#ifdef _M_AMD64 +__INTRIN_INLINE_SSE long long _mm_cvtss_si64(__m128 __a) +{ + return __builtin_ia32_cvtss2si64((__v4sf)__a); +} +#endif + +// _mm_cvt_ps2pi +__INTRIN_INLINE_SSE __m64 _mm_cvtps_pi32(__m128 __a) +{ + return (__m64)__builtin_ia32_cvtps2pi((__v4sf)__a); +} + +// _mm_cvtt_ss2si +__INTRIN_INLINE_SSE int _mm_cvttss_si32(__m128 __a) +{ + return __builtin_ia32_cvttss2si((__v4sf)__a); +} + +#ifdef _M_AMD64 +__INTRIN_INLINE_SSE long long _mm_cvttss_si64(__m128 __a) +{ + return __builtin_ia32_cvttss2si64((__v4sf)__a); +} +#endif + +// _mm_cvtt_ps2pi +__INTRIN_INLINE_SSE __m64 _mm_cvttps_pi32(__m128 __a) +{ + return (__m64)__builtin_ia32_cvttps2pi((__v4sf)__a); +} + +// _mm_cvt_si2ss +__INTRIN_INLINE_SSE __m128 _mm_cvtsi32_ss(__m128 __a, int __b) +{ + __a[0] = __b; + return __a; +} + +#ifdef _M_AMD64 +__INTRIN_INLINE_SSE __m128 _mm_cvtsi64_ss(__m128 __a, long long __b) +{ + __a[0] = __b; + return __a; +} +#endif + +// _mm_cvt_pi2ps +__INTRIN_INLINE_SSE __m128 _mm_cvtpi32_ps(__m128 __a, __m64 __b) +{ + return __builtin_ia32_cvtpi2ps((__v4sf)__a, (__v2si)__b); +} + +__INTRIN_INLINE_SSE float _mm_cvtss_f32(__m128 __a) +{ + return __a[0]; +} + +__INTRIN_INLINE_SSE __m128 _mm_loadh_pi(__m128 __a, const __m64 *__p) +{ +#ifdef __clang__ + typedef float __mm_loadh_pi_v2f32 __attribute__((__vector_size__(8))); + struct __mm_loadh_pi_struct { + __mm_loadh_pi_v2f32 __u; + } __attribute__((__packed__, __may_alias__)); + __mm_loadh_pi_v2f32 __b = ((const struct __mm_loadh_pi_struct*)__p)->__u; + __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1); + return __builtin_shufflevector(__a, __bb, 0, 1, 4, 5); +#else + return (__m128)__builtin_ia32_loadhps(__a, __p); +#endif +} + +__INTRIN_INLINE_SSE __m128 _mm_loadl_pi(__m128 __a, const __m64 *__p) +{ +#ifdef __clang__ + typedef float __mm_loadl_pi_v2f32 __attribute__((__vector_size__(8))); + struct __mm_loadl_pi_struct { + __mm_loadl_pi_v2f32 __u; + } __attribute__((__packed__, __may_alias__)); + __mm_loadl_pi_v2f32 __b = ((const struct __mm_loadl_pi_struct*)__p)->__u; + __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1); + return __builtin_shufflevector(__a, __bb, 4, 5, 2, 3); +#else + return (__m128)__builtin_ia32_loadlps(__a, __p); +#endif +} + +__INTRIN_INLINE_SSE __m128 _mm_load_ss(const float *__p) +{ + return _mm_set_ss(*__p); +} + +// _mm_load_ps1 +__INTRIN_INLINE_SSE __m128 _mm_load1_ps(const float *__p) +{ + return _mm_set1_ps(*__p); +} + +__INTRIN_INLINE_SSE __m128 
_mm_load_ps(const float *__p) +{ + return *(const __m128*)__p; +} + +__INTRIN_INLINE_SSE __m128 _mm_loadu_ps(const float *__p) +{ + struct __loadu_ps { + __m128_u __v; + } __attribute__((__packed__, __may_alias__)); + return ((const struct __loadu_ps*)__p)->__v; +} + +__INTRIN_INLINE_SSE __m128 _mm_loadr_ps(const float *__p) +{ + __m128 __a = _mm_load_ps(__p); +#ifdef __clang__ + return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0); +#else + return (__m128)__builtin_ia32_shufps(__a, __a, _MM_SHUFFLE(0,1,2,3)); +#endif +} + +__INTRIN_INLINE_SSE __m128 _mm_undefined_ps(void) +{ +#ifdef __clang__ + return (__m128)__builtin_ia32_undef128(); +#else + __m128 undef = undef; + return undef; +#endif +} + +__INTRIN_INLINE_SSE __m128 _mm_set_ss(float __w) +{ + return __extension__ (__m128){ __w, 0, 0, 0 }; +} + +// _mm_set_ps1 +__INTRIN_INLINE_SSE __m128 _mm_set1_ps(float __w) +{ + return __extension__ (__m128){ __w, __w, __w, __w }; +} + +__INTRIN_INLINE_SSE __m128 _mm_set_ps(float __z, float __y, float __x, float __w) +{ + return __extension__ (__m128){ __w, __x, __y, __z }; +} + +__INTRIN_INLINE_SSE __m128 _mm_setr_ps(float __z, float __y, float __x, float __w) +{ + return __extension__ (__m128){ __z, __y, __x, __w }; +} + +__INTRIN_INLINE_SSE __m128 _mm_setzero_ps(void) +{ + return __extension__ (__m128){ 0, 0, 0, 0 }; +} + +__INTRIN_INLINE_SSE void _mm_storeh_pi(__m64 *__p, __m128 __a) +{ +#ifdef __clang__ + typedef float __mm_storeh_pi_v2f32 __attribute__((__vector_size__(8))); + struct __mm_storeh_pi_struct { + __mm_storeh_pi_v2f32 __u; + } __attribute__((__packed__, __may_alias__)); + ((struct __mm_storeh_pi_struct*)__p)->__u = __builtin_shufflevector(__a, __a, 2, 3); +#else + __builtin_ia32_storehps(__p, __a); +#endif +} + +__INTRIN_INLINE_SSE void _mm_storel_pi(__m64 *__p, __m128 __a) +{ +#ifdef __clang__ + typedef float __mm_storeh_pi_v2f32 __attribute__((__vector_size__(8))); + struct __mm_storeh_pi_struct { + __mm_storeh_pi_v2f32 __u; + } __attribute__((__packed__, __may_alias__)); + ((struct __mm_storeh_pi_struct*)__p)->__u = __builtin_shufflevector(__a, __a, 0, 1); +#else + __builtin_ia32_storelps(__p, __a); +#endif +} + +__INTRIN_INLINE_SSE void _mm_store_ss(float *__p, __m128 __a) +{ + *__p = ((__v4sf)__a)[0]; +} + +__INTRIN_INLINE_SSE void _mm_storeu_ps(float *__p, __m128 __a) +{ + *(__m128_u *)__p = __a; +} + +__INTRIN_INLINE_SSE void _mm_store_ps(float *__p, __m128 __a) +{ + *(__m128*)__p = __a; +} + +// _mm_store_ps1 +__INTRIN_INLINE_SSE void _mm_store1_ps(float *__p, __m128 __a) +{ + // FIXME: Should we use a temp instead? +#ifdef __clang__ + __a = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 0, 0, 0); +#else + __a = __builtin_ia32_shufps(__a, __a, _MM_SHUFFLE(0,0,0,0)); +#endif + _mm_store_ps(__p, __a); +} + +__INTRIN_INLINE_SSE void _mm_storer_ps(float *__p, __m128 __a) +{ +#ifdef __clang__ + __m128 __tmp = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0); +#else + __m128 __tmp = __builtin_ia32_shufps(__a, __a, _MM_SHUFFLE(0,1,2,3)); +#endif + _mm_store_ps(__p, __tmp); +} + +/* GCC / Clang specific consants */ +#define _MM_HINT_NTA_ALT 0 +#define _MM_HINT_T0_ALT 3 +#define _MM_HINT_T1_ALT 2 +#define _MM_HINT_T2_ALT 1 +#define _MM_HINT_ENTA_ALT 4 + +// These are not supported yet +//#define _MM_HINT_ET0_ALT 7 +//#define _MM_HINT_ET1_ALT 6 +//#define _MM_HINT_ET2_ALT 5 + +#define _MM_HINT_MS_TO_ALT(sel) \ + (((sel) == _MM_HINT_NTA) ? _MM_HINT_NTA_ALT : \ + ((sel) == _MM_HINT_T0) ? _MM_HINT_T0_ALT : \ + ((sel) == _MM_HINT_T1) ? 
_MM_HINT_T1_ALT : \ + ((sel) == _MM_HINT_T2) ? _MM_HINT_T2_ALT : \ + ((sel) == _MM_HINT_ENTA) ? _MM_HINT_ENTA_ALT : 0) + +#ifdef _MSC_VER1 + +/* On clang-cl we have an intrinsic, but the constants are different */ +#pragma intrinsic(_mm_prefetch) +#define _mm_prefetch(p, sel) _mm_prefetch(p, _MM_HINT_MS_TO_ALT(sel)) + +#else /* _MSC_VER */ + +#define _mm_prefetch(p, sel) \ + __builtin_prefetch((const void *)(p), (_MM_HINT_MS_TO_ALT(sel) >> 2) & 1, _MM_HINT_MS_TO_ALT(sel) & 0x3) + #endif /* _MSC_VER */ -/* Alternate names */ -#define _mm_cvtss_si32 _mm_cvt_ss2si +__INTRIN_INLINE_SSE void _mm_stream_pi(__m64 *__p, __m64 __a) +{ +#ifdef __clang__ + __builtin_ia32_movntq((__v1di*)__p, __a); +#else + __builtin_ia32_movntq((long long unsigned int *)__p, (long long unsigned int)__a); +#endif +} + +__INTRIN_INLINE_SSE void _mm_stream_ps(float *__p, __m128 __a) +{ +#ifdef __clang__ + __builtin_nontemporal_store((__v4sf)__a, (__v4sf*)__p); +#else + __builtin_ia32_movntps(__p, (__v4sf)__a); +#endif +} + +#if !HAS_BUILTIN(_mm_sfence) +__INTRIN_INLINE_SSE void _mm_sfence(void) +{ + __builtin_ia32_sfence(); +} +#endif + +#ifdef __clang__ +#define _m_pextrw(a, n) \ + ((int)__builtin_ia32_vec_ext_v4hi((__v4hi)a, (int)n)) + +#define _m_pinsrw(a, d, n) \ + ((__m64)__builtin_ia32_vec_set_v4hi((__v4hi)a, (int)d, (int)n)) +#else +// _m_pextrw +__INTRIN_INLINE_SSE int _mm_extract_pi16(__m64 const __a, int const __n) +{ + return (unsigned short)__builtin_ia32_vec_ext_v4hi((__v4hi)__a, __n); +} + +// _m_pinsrw +__INTRIN_INLINE_SSE __m64 _mm_insert_pi16 (__m64 const __a, int const __d, int const __n) +{ + return (__m64)__builtin_ia32_vec_set_v4hi ((__v4hi)__a, __d, __n); +} + +#endif + +// _m_pmaxsw +__INTRIN_INLINE_SSE __m64 _mm_max_pi16(__m64 __a, __m64 __b) +{ + return (__m64)__builtin_ia32_pmaxsw((__v4hi)__a, (__v4hi)__b); +} + +// _m_pmaxub +__INTRIN_INLINE_SSE __m64 _mm_max_pu8(__m64 __a, __m64 __b) +{ + return (__m64)__builtin_ia32_pmaxub((__v8qi)__a, (__v8qi)__b); +} + +// _m_pminsw +__INTRIN_INLINE_SSE __m64 _mm_min_pi16(__m64 __a, __m64 __b) +{ + return (__m64)__builtin_ia32_pminsw((__v4hi)__a, (__v4hi)__b); +} + +// _m_pminub +__INTRIN_INLINE_SSE __m64 _mm_min_pu8(__m64 __a, __m64 __b) +{ + return (__m64)__builtin_ia32_pminub((__v8qi)__a, (__v8qi)__b); +} + +// _m_pmovmskb +__INTRIN_INLINE_SSE int _mm_movemask_pi8(__m64 __a) +{ + return __builtin_ia32_pmovmskb((__v8qi)__a); +} + +// _m_pmulhuw +__INTRIN_INLINE_SSE __m64 _mm_mulhi_pu16(__m64 __a, __m64 __b) +{ + return (__m64)__builtin_ia32_pmulhuw((__v4hi)__a, (__v4hi)__b); +} + +#ifdef __clang__ +#define _m_pshufw(a, n) \ + ((__m64)__builtin_ia32_pshufw((__v4hi)(__m64)(a), (n))) +#else +// _m_pshufw +__INTRIN_INLINE_MMX __m64 _mm_shuffle_pi16 (__m64 __a, int const __n) +{ + return (__m64) __builtin_ia32_pshufw ((__v4hi)__a, __n); +} +#endif + +// _m_maskmovq +__INTRIN_INLINE_SSE void _mm_maskmove_si64(__m64 __d, __m64 __n, char *__p) +{ + __builtin_ia32_maskmovq((__v8qi)__d, (__v8qi)__n, __p); +} + +// _m_pavgb +__INTRIN_INLINE_SSE __m64 _mm_avg_pu8(__m64 __a, __m64 __b) +{ + return (__m64)__builtin_ia32_pavgb((__v8qi)__a, (__v8qi)__b); +} + +// _m_pavgw +__INTRIN_INLINE_SSE __m64 _mm_avg_pu16(__m64 __a, __m64 __b) +{ + return (__m64)__builtin_ia32_pavgw((__v4hi)__a, (__v4hi)__b); +} + +// _m_psadbw +__INTRIN_INLINE_SSE __m64 _mm_sad_pu8(__m64 __a, __m64 __b) +{ + return (__m64)__builtin_ia32_psadbw((__v8qi)__a, (__v8qi)__b); +} + +#endif // __GNUC__ #ifdef __cplusplus } -#endif - -/* _mm_prefetch constants */ -#define _MM_HINT_T0 1 -#define 
_MM_HINT_T1 2 -#define _MM_HINT_T2 3 -#define _MM_HINT_NTA 0 -#define _MM_HINT_ET1 6 - +#endif // __cplusplus #endif /* _INCLUDED_MM2 */
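For reference, a minimal consumer of the new header could look like the sketch
below. It is illustrative only: the values are made up, and it simply exercises
_mm_set_ps, _mm_add_ps, _mm_storeu_ps and the MXCSR helper macros declared above.

    #include <xmmintrin.h>
    #include <stdio.h>

    int main(void)
    {
        /* _mm_set_ps takes elements high-to-low, so memory order is {1,2,3,4}. */
        __m128 a = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);
        __m128 b = _mm_set1_ps(0.5f);
        __m128 sum = _mm_add_ps(a, b);

        float out[4];
        _mm_storeu_ps(out, sum);
        printf("%f %f %f %f\n", out[0], out[1], out[2], out[3]); /* 1.5 2.5 3.5 4.5 */

        /* Switch the MXCSR rounding mode, convert the lowest element, restore it. */
        _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
        printf("%d\n", _mm_cvtss_si32(sum)); /* 1.5 truncates to 1 */
        _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
        return 0;
    }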