; ; MIT License ; ----------- ; ; Copyright (c) 2002-2019 Advanced Micro Devices, Inc. ; ; Permission is hereby granted, free of charge, to any person obtaining a copy ; of this Software and associated documentaon files (the "Software"), to deal ; in the Software without restriction, including without limitation the rights ; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell ; copies of the Software, and to permit persons to whom the Software is ; furnished to do so, subject to the following conditions: ; ; The above copyright notice and this permission notice shall be included in ; all copies or substantial portions of the Software. ; ; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, ; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN ; THE SOFTWARE. ; ; An implementation of the cosf function. ; ; Prototype: ; ; float cosf(float x); ; ; Computes cosf(x). ; Based on the NAG C implementation. ; It will provide proper C99 return values, ; but may not raise floating point status bits properly. ; Original Author: Harsha Jagasia .const ALIGN 16 L_real_one DQ 03ff0000000000000h ; 1.0 DQ 0 ; for alignment L_one_half DQ 03fe0000000000000h ; 0.5 DQ 0 L_2bypi DQ 03fe45f306dc9c883h ; 2./pi DQ 0 L_one_sixth DQ 03fc5555555555555h ; 0.166666666666 DQ 0 L_piby2 DQ 03fe921fb54442d18h DQ 0 L_piby2_1 DQ 03ff921fb54400000h ; piby2_1 DQ 0 L_piby2_1tail DQ 03dd0b4611a626331h ; piby2_1tail DQ 0 L_piby2_2 DQ 03dd0b4611a600000h ; piby2_2 DQ 0 L_piby2_2tail DQ 03ba3198a2e037073h ; piby2_2tail DQ 0 L_large_x_sse2 DQ 0411E848000000000h ; 5e5 DQ 0 L_large_x_fma3 DQ 041E921FB60000000h ; 3.37325952e9 DQ 0 L_sign_mask DQ 07FFFFFFFFFFFFFFFh DQ 07FFFFFFFFFFFFFFFh L__int_three DQ 00000000000000003h DQ 00000000000000003h L__min_norm_double DQ 00010000000000000h DQ 00010000000000000h L_two_to_neg_7 DQ 03f80000000000000h DQ 0 L_two_to_neg_13 DQ 03f20000000000000h DQ 0 L_inf_mask_32 DD 07F800000h DQ 0 fname TEXTEQU fname_special TEXTEQU <_cosf_special> ;Define name and any external functions being called EXTERN __remainder_piby2d2f_forAsm : PROC ; NEAR EXTERN __remainder_piby2_fma3_bdl : PROC ; NEAR EXTERN __remainder_piby2_fma3 : PROC ; NEAR EXTERN fname_special : PROC EXTERN _set_statfp : PROC EXTRN __Lcosfarray:QWORD EXTRN __Lsinfarray:QWORD EXTRN __use_fma3_lib:DWORD ; define local variable storage offsets p_temp equ 020h ; temporary for get/put bits operation p_temp1 equ 030h ; temporary for get/put bits operation dummy_space EQU 040h stack_size EQU 068h include fm.inc .code ALIGN 16 PUBLIC fname fname PROC FRAME StackAllocate stack_size .ENDPROLOG cmp DWORD PTR __use_fma3_lib, 0 jne Lcosf_fma3 Lcosf_sse2: xorpd xmm2, xmm2 ; zeroed out for later use ;; if NaN or inf movd edx, xmm0 mov eax, 07f800000h mov r10d, eax and r10d, edx cmp r10d, eax jz Lcosf_sse2_naninf cvtss2sd xmm0, xmm0 movd rdx, xmm0 ; ax = (ux & ~SIGNBIT_DP64); mov r10, rdx btr r10, 63 ; r10 <-- |x| mov r8d, 1 ; for determining region later on movapd xmm1, xmm0 ; xmm1 <-- copy of x ;; if (ax <= 3fe921fb54442d18h) /* abs(x) <= pi/4 */ mov rax, 03fe921fb54442d18h cmp r10, rax jg Lcosf_sse2_absx_gt_piby4 ; *c = cos_piby4(x, 0.0); movapd xmm2, xmm0 mulsd xmm2, xmm2 ;x^2 xor eax, eax mov rdx, r10 movsd xmm5, QWORD PTR L_one_half jmp Lcosf_sse2_calc_sincosf_piby4 ; done ALIGN 16 Lcosf_sse2_absx_gt_piby4: ; reduce the argument to be in a range from -pi/4 to +pi/4 ; by subtracting multiples of pi/2 ; xneg = (ax != ux); movd xmm0, r10 ; xmm0 <-- |x| cmp r10, QWORD PTR L_large_x_sse2 jae Lcosf_sse2_reduce_precise ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; xmm0=abs(x), xmm1=x ;/* How many pi/2 is x a multiple of? */ movapd xmm2, xmm0 movsd xmm3, QWORD PTR L_2bypi movapd xmm4, xmm0 movsd xmm5, QWORD PTR L_one_half mulsd xmm2, xmm3 ; movsd xmm5, QWORD PTR L_one_half ; movapd xmm2, xmm0 ; mulsd xmm2, QWORD PTR L_2bypi ; movapd xmm4, xmm0 mov r9, r10 shr r9, 52 ; r9 <-- biased exponent of x ; npi2 = (int)(x * twobypi + 0.5); addsd xmm2, xmm5 ; npi2 movsd xmm3, QWORD PTR L_piby2_1 ; piby2_1 cvttpd2dq xmm0, xmm2 ; xmm0 <-- npi2 movsd xmm1, QWORD PTR L_piby2_1tail ; piby2_1tail cvtdq2pd xmm2, xmm0 ; xmm2 <-- (double)npi2 ; Subtract the multiple from x to get an extra-precision remainder ; rhead = x - npi2 * piby2_1; mulsd xmm3, xmm2 ; use piby2_1 subsd xmm4, xmm3 ; rhead ; rtail = npi2 * piby2_1tail; mulsd xmm1, xmm2 ; rtail movd eax, xmm0 ; GET_BITS_DP64(rhead-rtail, uy); ; originally only rhead movapd xmm0, xmm4 subsd xmm0, xmm1 movsd xmm3, QWORD PTR L_piby2_2 ; piby2_2 movd rcx, xmm0 ; rcx <-- rhead-rtail movsd xmm5, QWORD PTR L_piby2_2tail ; piby2_2tail ; region = npi2 & 3; ; and eax, 3 ; expdiff = xexp - ((uy & EXPBITS_DP64) >> EXPSHIFTBITS_DP64); shl rcx, 1 ; strip any sign bit shr rcx, 53 ; >> EXPSHIFTBITS_DP64 +1 sub r9, rcx ; expdiff ;; if (expdiff > 15) cmp r9, 15 jle Lcosf_sse2_expdiff_le_15 ; The remainder is pretty small compared with x, which ; implies that x is a near multiple of pi/2 ; (x matches the multiple to at least 15 bits) ; t = rhead; movapd xmm1, xmm4 ; rtail = npi2 * piby2_2; mulsd xmm3, xmm2 ; rhead = t - rtail; mulsd xmm5, xmm2 ; npi2 * piby2_2tail subsd xmm4, xmm3 ; rhead ; rtail = npi2 * piby2_2tail - ((t - rhead) - rtail); subsd xmm1, xmm4 ; t - rhead subsd xmm1, xmm3 ; -rtail subsd xmm5, xmm1 ; rtail ; r = rhead - rtail; movapd xmm0, xmm4 ;HARSHA ;xmm1=rtail movapd xmm1, xmm5 subsd xmm0, xmm5 ; xmm0=r, xmm4=rhead, xmm1=rtail ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Lcosf_sse2_expdiff_le_15: cmp rcx, 03f2h ; is r < 2^-13 ? jge Lcosf_sse2_calc_sincosf_piby4 ; use taylor series if not cmp rcx, 03deh ; is r < 2^-33 ? jle Lcosf_sse2_r_very_small ; then cosf(r) ~ 1 or r movapd xmm2, xmm0 mulsd xmm2, xmm0 ; xmm2 <-- x^2 ;; if region is 1 or 3 do a sinf calc. and r8d, eax jz Lcosf_sse2_r_small_calc_sin Lcosf_sse2_r_small_calc_cos: ; region 1 or 3 ; use simply polynomial ; *s = x - x*x*x*0.166666666666666666; movsd xmm3, QWORD PTR L_one_sixth mulsd xmm3, xmm0 ; * x mulsd xmm3, xmm2 ; * x^2 subsd xmm0, xmm3 ; xs jmp Lcosf_sse2_adjust_region ALIGN 16 Lcosf_sse2_r_small_calc_sin: ; region 0 or 2 ; cos = 1.0 - x*x*0.5; movsd xmm0, QWORD PTR L_real_one ; 1.0 mulsd xmm2, QWORD PTR L_one_half ; 0.5 *x^2 subsd xmm0, xmm2 jmp Lcosf_sse2_adjust_region ALIGN 16 Lcosf_sse2_r_very_small: ; then sin(r) = r ; if region is 1 or 3 do a sin calc. and r8d, eax jnz Lcosf_sse2_adjust_region movsd xmm0, QWORD PTR L_real_one ; cosf(r) is a 1 ; By this point, calculations should already have set inexact jmp Lcosf_sse2_adjust_region ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ALIGN 16 Lcosf_sse2_reduce_precise: ; Reduce abs(x) into range [-pi/4, pi/4] ; remainder_piby2d2f(ax, &r, ®ion); mov QWORD PTR p_temp[rsp], rdx ; save ux for use later mov QWORD PTR p_temp1[rsp], r10 ; save ax for use later call __remainder_piby2d2f_forAsm mov rdx, QWORD PTR p_temp[rsp] ; restore ux for use later mov r10, QWORD PTR p_temp1[rsp] ; restore ax for use later mov r8d, 1 ; for determining region later ; Reduced argument is in xmm0. No second word; after all, we started in ; single precision. Region is in rax. movapd xmm1, xmm0 movsd xmm5, QWORD PTR L_one_half jmp Lcosf_sse2_calc_sincosf_piby4 ; done with reducing the argument. Now perform the sin/cos calculations. ALIGN 16 Lcosf_sse2_calc_sincosf_piby4: movapd xmm2, xmm0 mulsd xmm2, xmm0 ; x^2 ;; if region is 0 or 2, do a cosf calc and r8d, eax jz Lcosf_sse2_do_cosf_calc ; region is 1 or 3: do a sinf calc. Lcosf_sse2_do_sinf_calc: movsd xmm1, QWORD PTR __Lsinfarray+18h ; s4 mulsd xmm1, xmm2 ; s4x2 movsd xmm4, xmm2 ; move for x4 mulsd xmm4, xmm2 ; x4 movsd xmm5, QWORD PTR __Lsinfarray+8h ; s2 mulsd xmm5, xmm2 ; s2x2 movsd xmm3, xmm0 ; move for x3 mulsd xmm3, xmm2 ; x3 addsd xmm1, QWORD PTR __Lsinfarray+10h ; s3+s4x2 mulsd xmm1, xmm4 ; s3x4+s4x6 addsd xmm5, QWORD PTR __Lsinfarray ; s1+s2x2 addsd xmm1, xmm5 ; s1+s2x2+s3x4+s4x6 mulsd xmm1, xmm3 ; x3(s1+s2x2+s3x4+s4x6) addsd xmm0, xmm1 ; x + x3(s1+s2x2+s3x4+s4x6) jmp Lcosf_sse2_adjust_region ALIGN 16 Lcosf_sse2_do_cosf_calc: ; region 0 or 2 - do a cos calculation ; zc = 1-0.5*x2+ c1*x4 +c2*x6 +c3*x8; ; zc = 1-0.5*x2+ c1*x4 +c2*x6 +c3*x8 + c4*x10 for a higher precision movsd xmm1, QWORD PTR __Lcosfarray+20h ; c4 movsd xmm4, xmm2 ; move for x4 mulsd xmm1, xmm2 ; c4x2 movsd xmm3, QWORD PTR __Lcosfarray+10h ; c2 mulsd xmm4, xmm2 ; x4 movsd xmm0, QWORD PTR __Lcosfarray ; c0 mulsd xmm3, xmm2 ; c2x2 mulsd xmm0, xmm2 ; c0x2 (=-0.5x2) addsd xmm1, QWORD PTR __Lcosfarray+18h ; c3+c4x2 mulsd xmm1, xmm4 ; c3x4 + c4x6 addsd xmm3, QWORD PTR __Lcosfarray+8h ; c1+c2x2 addsd xmm1, xmm3 ; c1 + c2x2 + c3x4 + c4x6 mulsd xmm1, xmm4 ; c1x4 + c2x6 + c3x8 + c4x10 addsd xmm0, QWORD PTR L_real_one ; 1 - 0.5x2 addsd xmm0, xmm1 ; 1 - 0.5x2 + c1x4 + c2x6 + c3x8 + c4x10 Lcosf_sse2_adjust_region: ; xmm1 is cos or sin, relies on previous sections to ; switch (region) add eax, 1 and eax, 2 jz Lcosf_sse2_cleanup ;; if region 1 or 2 then we negate the result. xorpd xmm2, xmm2 subsd xmm2, xmm0 movapd xmm0, xmm2 ALIGN 16 Lcosf_sse2_cleanup: cvtsd2ss xmm0, xmm0 StackDeallocate stack_size ret Lcosf_sse2_naninf: call fname_special StackDeallocate stack_size ret ALIGN 16 Lcosf_fma3: vmovd eax,xmm0 mov r8d,L_inf_mask_32 and eax,r8d cmp eax, r8d jz Lcosf_fma3_naninf vcvtss2sd xmm5,xmm0,xmm0 vmovq r9,xmm5 btr r9,63 ;clear sign cmp r9,L_piby2 jg Lcosf_fma3_range_reduce cmp r9,L_two_to_neg_7 jge Lcosf_fma3_compute_cosf_piby_4 cmp r9,L_two_to_neg_13 jge Lcosf_fma3_compute_1_xx_5 vmovq xmm0,QWORD PTR L_real_one ; Here we need to set inexact vaddsd xmm0,xmm0,L__min_norm_double ; this will set inexact jmp Lcosf_fma3_return ALIGN 16 Lcosf_fma3_compute_1_xx_5: vmulsd xmm0,xmm5,QWORD PTR L_one_half vfnmadd213sd xmm0,xmm5,L_real_one ; xmm9 1.0 - x*x*(double2)0.5 jmp Lcosf_fma3_return ALIGN 16 Lcosf_fma3_compute_cosf_piby_4: movsd xmm0,xmm5 vmovapd xmm2,L_real_one vmulsd xmm3,xmm0,xmm0 vmulsd xmm1,xmm3,L_one_half ; xmm1 <-- r vsubsd xmm2,xmm2,xmm1 vmovsd xmm1,__Lcosfarray+018h vfmadd231sd xmm1,xmm3,__Lcosfarray+020h vfmadd213sd xmm1,xmm3,__Lcosfarray+010h vfmadd213sd xmm1,xmm3,__Lcosfarray+008h vmulsd xmm3,xmm3,xmm3 ; xmm3 <-- x^4 vmovdqa xmm0,xmm2 vfmadd231sd xmm0,xmm1,xmm3 jmp Lcosf_fma3_return ALIGN 16 Lcosf_fma3_range_reduce: vmovq xmm0,r9 ; xmm0 <-- |x| cmp r9,L_large_x_fma3 jge Lcosf_reduce_precise ;cosff_range_e_5_s: vandpd xmm1,xmm0,L_sign_mask vmovapd xmm2,L_2bypi vfmadd213sd xmm2,xmm1,L_one_half vcvttpd2dq xmm2,xmm2 vpmovsxdq xmm1,xmm2 vandpd xmm4,xmm1,L__int_three ; region xmm4 vshufps xmm1 ,xmm1,xmm1,8 vcvtdq2pd xmm1,xmm1 vmovdqa xmm2,xmm0 vfnmadd231sd xmm2,xmm1,L_piby2_1 ; xmm2 rhead vmulsd xmm3,xmm1,L_piby2_1tail ; xmm3 rtail vsubsd xmm0,xmm2,xmm3 ; r_1 xmm0 vsubsd xmm2,xmm2,xmm0 vsubsd xmm1,xmm2,xmm3 vmovq rax,xmm4 jmp Lcosf_exit_s ALIGN 16 Lcosf_reduce_precise: vmovq xmm0,r9 ; r9 <-- |x| cmp r9,L_large_x_fma3 jge Lcos_remainder_piby2 ; __remainder_piby2_fma3 and __remainder_piby2_fma3_bdl ; have the following conventions: ; on input ; x is in xmm0 ; on output ; r is in xmm0 ; rr is in xmm1 ; region is in rax ; The _bdl routine is guaranteed not to touch r10 Lcos_remainder_piby2_small: ;; unused label ; Boldo-Daumas-Li reduction for reasonably small |x| call __remainder_piby2_fma3_bdl jmp Lcosf_exit_s ALIGN 16 Lcos_remainder_piby2: ; argument reduction for general x call __remainder_piby2_fma3 Lcosf_exit_s: bt rax,0 jnc Lcosf_piby4_compute ;sinf_piby4_compute: ; vmovapd xmm1,__Lsinfarray+010h vmovsd xmm1,__Lsinfarray+010h vmulsd xmm3,xmm0,xmm0 vfmadd231sd xmm1,xmm3,__Lsinfarray+018h vfmadd213sd xmm1,xmm3,__Lsinfarray+008h vfmadd213sd xmm1,xmm3,__Lsinfarray vmulsd xmm3,xmm0,xmm3 ; xmm3 <-- x^3 vfmadd231sd xmm0,xmm1,xmm3 jmp Lcosf_fma3_adjust_sign ALIGN 16 Lcosf_piby4_compute: vmovapd xmm2,L_real_one vmulsd xmm3,xmm0,xmm0 vmulsd xmm1,xmm3,L_one_half ; xmm1 <-- r vsubsd xmm2,xmm2,xmm1 vmovsd xmm1,__Lcosfarray+018h vfmadd231sd xmm1 ,xmm3,__Lcosfarray+020h vfmadd213sd xmm1 ,xmm3,__Lcosfarray+010h vfmadd213sd xmm1 ,xmm3,__Lcosfarray+008h vmulsd xmm3,xmm3,xmm3 ; xmm3 <-- x^4 vmovdqa xmm0, xmm2 vfmadd231sd xmm0 ,xmm1,xmm3 Lcosf_fma3_adjust_sign: ; assuming FMA3 ==> AVX ==> SSE4.1 ; vpcmpeqq xmm1,xmm4,XMMWORD PTR L_int_one ; vpcmpeqq xmm2,xmm4,XMMWORD PTR L_int_two ; vorpd xmm3,xmm2,xmm1 ; vandpd xmm3,xmm3,L_signbit add rax,1 ; 1,2 --> 2,3 shr rax,1 ; 2,3 --> 1 shl rax,63 ; 1 --> sign bit vmovq xmm3,rax vxorpd xmm0,xmm0,xmm3 Lcosf_fma3_return: vcvtsd2ss xmm0,xmm0,xmm0 StackDeallocate stack_size ret Lcosf_fma3_naninf: call fname_special StackDeallocate stack_size ret fname endp END