diff --git a/dll/win32/ucrtbase/ucrtbase.spec b/dll/win32/ucrtbase/ucrtbase.spec
index 28dd4ad59d6..1c40d48a557 100644
--- a/dll/win32/ucrtbase/ucrtbase.spec
+++ b/dll/win32/ucrtbase/ucrtbase.spec
@@ -133,27 +133,27 @@
 @ cdecl __iswcsym(long)
 @ cdecl __iswcsymf(long)
 @ stdcall -arch=arm __jump_unwind(ptr ptr) ntdll.__jump_unwind
-@ cdecl -stub -arch=i386 -norelay __libm_sse2_acos()
-@ cdecl -stub -arch=i386 -norelay __libm_sse2_acosf()
-@ cdecl -stub -arch=i386 -norelay __libm_sse2_asin()
-@ cdecl -stub -arch=i386 -norelay __libm_sse2_asinf()
-@ cdecl -stub -arch=i386 -norelay __libm_sse2_atan()
-@ cdecl -stub -arch=i386 -norelay __libm_sse2_atan2()
-@ cdecl -stub -arch=i386 -norelay __libm_sse2_atanf()
-@ cdecl -stub -arch=i386 -norelay __libm_sse2_cos()
-@ cdecl -stub -arch=i386 -norelay __libm_sse2_cosf()
-@ cdecl -stub -arch=i386 -norelay __libm_sse2_exp()
-@ cdecl -stub -arch=i386 -norelay __libm_sse2_expf()
-@ cdecl -stub -arch=i386 -norelay __libm_sse2_log()
-@ cdecl -stub -arch=i386 -norelay __libm_sse2_log10()
-@ cdecl -stub -arch=i386 -norelay __libm_sse2_log10f()
-@ cdecl -stub -arch=i386 -norelay __libm_sse2_logf()
-@ cdecl -stub -arch=i386 -norelay __libm_sse2_pow()
-@ cdecl -stub -arch=i386 -norelay __libm_sse2_powf()
-@ cdecl -stub -arch=i386 -norelay __libm_sse2_sin()
-@ cdecl -stub -arch=i386 -norelay __libm_sse2_sinf()
-@ cdecl -stub -arch=i386 -norelay __libm_sse2_tan()
-@ cdecl -stub -arch=i386 -norelay __libm_sse2_tanf()
+@ cdecl -arch=i386 -norelay __libm_sse2_acos()
+@ cdecl -arch=i386 -norelay __libm_sse2_acosf()
+@ cdecl -arch=i386 -norelay __libm_sse2_asin()
+@ cdecl -arch=i386 -norelay __libm_sse2_asinf()
+@ cdecl -arch=i386 -norelay __libm_sse2_atan()
+@ cdecl -arch=i386 -norelay __libm_sse2_atan2()
+@ cdecl -arch=i386 -norelay __libm_sse2_atanf()
+@ cdecl -arch=i386 -norelay __libm_sse2_cos()
+@ cdecl -arch=i386 -norelay __libm_sse2_cosf()
+@ cdecl -arch=i386 -norelay __libm_sse2_exp()
+@ cdecl -arch=i386 -norelay __libm_sse2_expf()
+@ cdecl -arch=i386 -norelay __libm_sse2_log()
+@ cdecl -arch=i386 -norelay __libm_sse2_log10()
+@ cdecl -arch=i386 -norelay __libm_sse2_log10f()
+@ cdecl -arch=i386 -norelay __libm_sse2_logf()
+@ cdecl -arch=i386 -norelay __libm_sse2_pow()
+@ cdecl -arch=i386 -norelay __libm_sse2_powf()
+@ cdecl -arch=i386 -norelay __libm_sse2_sin()
+@ cdecl -arch=i386 -norelay __libm_sse2_sinf()
+@ cdecl -arch=i386 -norelay __libm_sse2_tan()
+@ cdecl -arch=i386 -norelay __libm_sse2_tanf()
 @ cdecl __p___argc()
 @ cdecl __p___argv()
 @ cdecl __p___wargv()
@@ -602,17 +602,17 @@
 @ cdecl _ldunscale(ptr ptr) _dunscale
 @ cdecl _lfind(ptr ptr ptr long ptr)
 @ cdecl _lfind_s(ptr ptr ptr long ptr ptr)
-@ cdecl -stub -arch=i386 -norelay _libm_sse2_acos_precise() #__libm_sse2_acos
-@ cdecl -stub -arch=i386 -norelay _libm_sse2_asin_precise() #__libm_sse2_asin
-@ cdecl -stub -arch=i386 -norelay _libm_sse2_atan_precise() #__libm_sse2_atan
-@ cdecl -stub -arch=i386 -norelay _libm_sse2_cos_precise() #__libm_sse2_cos
-@ cdecl -stub -arch=i386 -norelay _libm_sse2_exp_precise() #__libm_sse2_exp
-@ cdecl -stub -arch=i386 -norelay _libm_sse2_log10_precise() #__libm_sse2_log10
-@ cdecl -stub -arch=i386 -norelay _libm_sse2_log_precise() #__libm_sse2_log
-@ cdecl -stub -arch=i386 -norelay _libm_sse2_pow_precise() #__libm_sse2_pow
-@ cdecl -stub -arch=i386 -norelay _libm_sse2_sin_precise() #__libm_sse2_sin
-@ cdecl -stub -arch=i386 -norelay _libm_sse2_sqrt_precise() #__libm_sse2_sqrt
-@ cdecl -stub -arch=i386 -norelay _libm_sse2_tan_precise() #__libm_sse2_tan
+@ cdecl -arch=i386 -norelay _libm_sse2_acos_precise()
+@ cdecl -arch=i386 -norelay _libm_sse2_asin_precise()
+@ cdecl -arch=i386 -norelay _libm_sse2_atan_precise()
+@ cdecl -arch=i386 -norelay _libm_sse2_cos_precise()
+@ cdecl -arch=i386 -norelay _libm_sse2_exp_precise()
+@ cdecl -arch=i386 -norelay _libm_sse2_log10_precise()
+@ cdecl -arch=i386 -norelay _libm_sse2_log_precise()
+@ cdecl -arch=i386 -norelay _libm_sse2_pow_precise()
+@ cdecl -arch=i386 -norelay _libm_sse2_sin_precise()
+@ cdecl -arch=i386 -norelay _libm_sse2_sqrt_precise()
+@ cdecl -arch=i386 -norelay _libm_sse2_tan_precise()
 @ cdecl _loaddll(str)
 @ cdecl -arch=win64 _local_unwind(ptr ptr) ntdll._local_unwind
 @ cdecl -arch=i386 _local_unwind2(ptr long)
diff --git a/sdk/lib/crt/math/i386/libm_sse2.c b/sdk/lib/crt/math/i386/libm_sse2.c
new file mode 100644
index 00000000000..39b51379417
--- /dev/null
+++ b/sdk/lib/crt/math/i386/libm_sse2.c
@@ -0,0 +1,267 @@
+/*
+ * PROJECT:     ReactOS CRT
+ * LICENSE:     MIT (https://spdx.org/licenses/MIT)
+ * PURPOSE:     Simplified implementation of __libm_sse2_*
+ * COPYRIGHT:   Copyright 2025 Timo Kreuzer
+ */
+
+#include <emmintrin.h>
+#include <math.h>
+
+#if defined(_MSC_VER) && !defined(__clang__)
+#pragma function(acos,asin,atan,atan2,cos)
+#pragma function(exp,log,log10,pow,sin,tan)
+#define __ATTRIBUTE_SSE2__
+#else
+#define __ATTRIBUTE_SSE2__ __attribute__((__target__("sse2")))
+#endif
+
+/* Each wrapper reads the low lane of its vector argument(s), calls the C math
+ * function and returns the result in the low lane with the other lanes zeroed.
+ * Float inputs are widened via _mm_cvtss_sd() seeded with _mm_setzero_pd(). */
+
+__ATTRIBUTE_SSE2__ __m128d __libm_sse2_acos(__m128d Xmm0)
+{
+    double x = _mm_cvtsd_f64(Xmm0);
+    double result = acos(x);
+    return _mm_set_sd(result);
+}
+
+__ATTRIBUTE_SSE2__ __m128 __libm_sse2_acosf(__m128 Xmm0)
+{
+    __m128d Xmm0d = _mm_cvtss_sd(_mm_setzero_pd(), Xmm0);
+    double x = _mm_cvtsd_f64(Xmm0d);
+    double result = acos(x);
+    __m128d result128 = _mm_set_sd(result);
+    return _mm_cvtpd_ps(result128);
+}
+
+__ATTRIBUTE_SSE2__ __m128d __libm_sse2_asin(__m128d Xmm0)
+{
+    double x = _mm_cvtsd_f64(Xmm0);
+    double result = asin(x);
+    return _mm_set_sd(result);
+}
+
+__ATTRIBUTE_SSE2__ __m128 __libm_sse2_asinf(__m128 Xmm0)
+{
+    __m128d Xmm0d = _mm_cvtss_sd(_mm_setzero_pd(), Xmm0);
+    double x = _mm_cvtsd_f64(Xmm0d);
+    double result = asin(x);
+    __m128d result128 = _mm_set_sd(result);
+    return _mm_cvtpd_ps(result128);
+}
+
+__ATTRIBUTE_SSE2__ __m128d __libm_sse2_atan(__m128d Xmm0)
+{
+    double x = _mm_cvtsd_f64(Xmm0);
+    double result = atan(x);
+    return _mm_set_sd(result);
+}
+
+__ATTRIBUTE_SSE2__ __m128 __libm_sse2_atanf(__m128 Xmm0)
+{
+    __m128d Xmm0d = _mm_cvtss_sd(_mm_setzero_pd(), Xmm0);
+    double x = _mm_cvtsd_f64(Xmm0d);
+    double result = atan(x);
+    __m128d result128 = _mm_set_sd(result);
+    return _mm_cvtpd_ps(result128);
+}
+
+__ATTRIBUTE_SSE2__ __m128d __libm_sse2_atan2(__m128d Xmm0, __m128d Xmm1)
+{
+    double y = _mm_cvtsd_f64(Xmm0); /* first atan2() argument */
+    double x = _mm_cvtsd_f64(Xmm1); /* second atan2() argument */
+    double result = atan2(y, x);
+    return _mm_set_sd(result);
+}
+
+__ATTRIBUTE_SSE2__ __m128d __libm_sse2_cos(__m128d Xmm0)
+{
+    double x = _mm_cvtsd_f64(Xmm0);
+    double result = cos(x);
+    return _mm_set_sd(result);
+}
+
+__ATTRIBUTE_SSE2__ __m128 __libm_sse2_cosf(__m128 Xmm0)
+{
+    __m128d Xmm0d = _mm_cvtss_sd(_mm_setzero_pd(), Xmm0);
+    double x = _mm_cvtsd_f64(Xmm0d);
+    double result = cos(x);
+    __m128d result128 = _mm_set_sd(result);
+    return _mm_cvtpd_ps(result128);
+}
+
+__ATTRIBUTE_SSE2__ __m128d __libm_sse2_exp(__m128d Xmm0)
+{
+    double x = _mm_cvtsd_f64(Xmm0);
+    double result = exp(x);
+    return _mm_set_sd(result);
+}
+
+__ATTRIBUTE_SSE2__ __m128 __libm_sse2_expf(__m128 Xmm0)
+{
+    __m128d Xmm0d = _mm_cvtss_sd(_mm_setzero_pd(), Xmm0);
+    double x = _mm_cvtsd_f64(Xmm0d);
+    double result = exp(x);
+    __m128d result128 = _mm_set_sd(result);
+    return _mm_cvtpd_ps(result128);
+}
+
+__ATTRIBUTE_SSE2__ __m128d __libm_sse2_log(__m128d Xmm0)
+{
+    double x = _mm_cvtsd_f64(Xmm0);
+    double result = log(x);
+    return _mm_set_sd(result);
+}
+
+__ATTRIBUTE_SSE2__ __m128 __libm_sse2_logf(__m128 Xmm0)
+{
+    __m128d Xmm0d = _mm_cvtss_sd(_mm_setzero_pd(), Xmm0);
+    double x = _mm_cvtsd_f64(Xmm0d);
+    double result = log(x);
+    __m128d result128 = _mm_set_sd(result);
+    return _mm_cvtpd_ps(result128);
+}
+
+__ATTRIBUTE_SSE2__ __m128d __libm_sse2_log10(__m128d Xmm0)
+{
+    double x = _mm_cvtsd_f64(Xmm0);
+    double result = log10(x);
+    return _mm_set_sd(result);
+}
+
+__ATTRIBUTE_SSE2__ __m128 __libm_sse2_log10f(__m128 Xmm0)
+{
+    __m128d Xmm0d = _mm_cvtss_sd(_mm_setzero_pd(), Xmm0);
+    double x = _mm_cvtsd_f64(Xmm0d);
+    double result = log10(x);
+    __m128d result128 = _mm_set_sd(result);
+    return _mm_cvtpd_ps(result128);
+}
+
+__ATTRIBUTE_SSE2__ __m128d __libm_sse2_pow(__m128d Xmm0, __m128d Xmm1)
+{
+    double x = _mm_cvtsd_f64(Xmm0);
+    double y = _mm_cvtsd_f64(Xmm1);
+    double result = pow(x, y);
+    return _mm_set_sd(result);
+}
+
+__ATTRIBUTE_SSE2__ __m128 __libm_sse2_powf(__m128 Xmm0, __m128 Xmm1)
+{
+    float x = _mm_cvtss_f32(Xmm0);
+    float y = _mm_cvtss_f32(Xmm1);
+    float result = powf(x, y);
+    return _mm_set_ss(result);
+}
+
+__ATTRIBUTE_SSE2__ __m128d __libm_sse2_sin(__m128d Xmm0)
+{
+    double x = _mm_cvtsd_f64(Xmm0);
+    double result = sin(x);
+    return _mm_set_sd(result);
+}
+
+__ATTRIBUTE_SSE2__ __m128 __libm_sse2_sinf(__m128 Xmm0)
+{
+    __m128d Xmm0d = _mm_cvtss_sd(_mm_setzero_pd(), Xmm0);
+    double x = _mm_cvtsd_f64(Xmm0d);
+    double result = sin(x);
+    __m128d result128 = _mm_set_sd(result);
+    return _mm_cvtpd_ps(result128);
+}
+
+__ATTRIBUTE_SSE2__ __m128d __libm_sse2_tan(__m128d Xmm0)
+{
+    double x = _mm_cvtsd_f64(Xmm0);
+    double result = tan(x);
+    return _mm_set_sd(result);
+}
+
+__ATTRIBUTE_SSE2__ __m128 __libm_sse2_tanf(__m128 Xmm0)
+{
+    __m128d Xmm0d = _mm_cvtss_sd(_mm_setzero_pd(), Xmm0);
+    double x = _mm_cvtsd_f64(Xmm0d);
+    double result = tan(x);
+    __m128d result128 = _mm_set_sd(result);
+    return _mm_cvtpd_ps(result128);
+}
+
+__ATTRIBUTE_SSE2__ __m128d _libm_sse2_acos_precise(__m128d Xmm0)
+{
+    double x = _mm_cvtsd_f64(Xmm0);
+    double result = acos(x);
+    return _mm_set_sd(result);
+}
+
+__ATTRIBUTE_SSE2__ __m128d _libm_sse2_asin_precise(__m128d Xmm0)
+{
+    double x = _mm_cvtsd_f64(Xmm0);
+    double result = asin(x);
+    return _mm_set_sd(result);
+}
+
+__ATTRIBUTE_SSE2__ __m128d _libm_sse2_atan_precise(__m128d Xmm0)
+{
+    double x = _mm_cvtsd_f64(Xmm0);
+    double result = atan(x);
+    return _mm_set_sd(result);
+}
+
+__ATTRIBUTE_SSE2__ __m128d _libm_sse2_cos_precise(__m128d Xmm0)
+{
+    double x = _mm_cvtsd_f64(Xmm0);
+    double result = cos(x);
+    return _mm_set_sd(result);
+}
+
+__ATTRIBUTE_SSE2__ __m128d _libm_sse2_exp_precise(__m128d Xmm0)
+{
+    double x = _mm_cvtsd_f64(Xmm0);
+    double result = exp(x);
+    return _mm_set_sd(result);
+}
+
+__ATTRIBUTE_SSE2__ __m128d _libm_sse2_log_precise(__m128d Xmm0)
+{
+    double x = _mm_cvtsd_f64(Xmm0);
+    double result = log(x);
+    return _mm_set_sd(result);
+}
+
+__ATTRIBUTE_SSE2__ __m128d _libm_sse2_log10_precise(__m128d Xmm0)
+{
+    double x = _mm_cvtsd_f64(Xmm0);
+    double result = log10(x);
+    return _mm_set_sd(result);
+}
+
+__ATTRIBUTE_SSE2__ __m128d _libm_sse2_pow_precise(__m128d Xmm0, __m128d Xmm1)
+{
+    double x = _mm_cvtsd_f64(Xmm0);
+    double y = _mm_cvtsd_f64(Xmm1);
+    double result = pow(x, y);
+    return _mm_set_sd(result);
+}
+
+__ATTRIBUTE_SSE2__ __m128d _libm_sse2_sin_precise(__m128d Xmm0)
+{
+    double x = _mm_cvtsd_f64(Xmm0);
+    double result = sin(x);
+    return _mm_set_sd(result);
+}
+
+__ATTRIBUTE_SSE2__ __m128d _libm_sse2_sqrt_precise(__m128d Xmm0)
+{
+    double x = _mm_cvtsd_f64(Xmm0);
+    double result = sqrt(x);
+    return _mm_set_sd(result);
+}
+
+__ATTRIBUTE_SSE2__ __m128d _libm_sse2_tan_precise(__m128d Xmm0)
+{
+    double x = _mm_cvtsd_f64(Xmm0);
+    double result = tan(x);
+    return _mm_set_sd(result);
+}
diff --git a/sdk/lib/crt/math/math.cmake b/sdk/lib/crt/math/math.cmake
index 285ea431b4c..7a00709d8a5 100644
--- a/sdk/lib/crt/math/math.cmake
+++ b/sdk/lib/crt/math/math.cmake
@@ -24,6 +24,7 @@ if(ARCH STREQUAL "i386")
         math/i386/cisin.c
         math/i386/cisqrt.c
         math/i386/ldexp.c
+        math/i386/libm_sse2.c
     )
     list(APPEND LIBCNTPR_MATH_ASM_SOURCE
         math/i386/alldiv_asm.s