; ; MIT License ; ----------- ; ; Copyright (c) 2002-2019 Advanced Micro Devices, Inc. ; ; Permission is hereby granted, free of charge, to any person obtaining a copy ; of this Software and associated documentaon files (the "Software"), to deal ; in the Software without restriction, including without limitation the rights ; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell ; copies of the Software, and to permit persons to whom the Software is ; furnished to do so, subject to the following conditions: ; ; The above copyright notice and this permission notice shall be included in ; all copies or substantial portions of the Software. ; ; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, ; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN ; THE SOFTWARE. ; ; ; An implementation of the sin function. ; ; Prototype: ; ; double sin(double x); ; ; Computes sin(x). ; It will provide proper C99 return values, ; but may not raise floating point status bits properly. ; Based on the NAG C implementation. ; ; If FMA3 hardware is available, an FMA3 implementation of sin will be used. .const ALIGN 16 L_real_piby2_1 DQ 03ff921fb54400000h ; piby2_1 DQ 0 L_real_piby2_1tail DQ 03dd0b4611a626331h ; piby2_1tail DQ 0 L_real_piby2_2 DQ 03dd0b4611a600000h ; piby2_2 DQ 0 L_real_piby2_2tail DQ 03ba3198a2e037073h ; piby2_2tail DQ 0 ALIGN 16 L_one DQ 03FF0000000000000h, 03FF0000000000000h L_signbit DQ 08000000000000000h, 00000000000000000h L_int_one DQ 00000000000000001h, 00000000000000000h L_int_two DQ 00000000000000002h, 00000000000000000h L_int_three DQ 00000000000000003h, 00000000000000000h L_2_by_pi DQ 03fe45f306dc9c883h ; 2/pi L_one_half DQ 03FE0000000000000h ; .5 L_one_sixth DQ 03FC5555555555555h ; .1666... L_two_to_neg_27 DQ 03e40000000000000h ; 2^-27 L_two_to_neg_13 DQ 03f20000000000000h ; 2^-13 L_piby4 DQ 03FE921FB54442D18h ; pi/4 L_small_arg_cw DQ 0411E848000000000h ; 5.e5, appropriate for CW L_small_arg_bdl DQ 0417312D000000000h ; 2e7, works for BDL L__inf_mask_64 DQ 07FF0000000000000h ; +Inf EXTRN __Lcosarray:QWORD EXTRN __Lsinarray:QWORD EXTRN __use_fma3_lib:DWORD ; define local variable storage offsets p_temp EQU 030h p_temp1 EQU 040h save_r10 EQU 050h dummy_space EQU 060h stack_size EQU 078h include fm.inc fname TEXTEQU fname_special TEXTEQU <_sin_special> ;Define name and any external functions being called EXTERN __remainder_piby2_forAsm : PROC EXTERN __remainder_piby2_fma3 : PROC EXTERN __remainder_piby2_fma3_bdl : PROC EXTERN fname_special : PROC .code PUBLIC fname fname PROC FRAME StackAllocate stack_size .ENDPROLOG cmp DWORD PTR __use_fma3_lib, 0 jne Lsin_fma3 Lsin_sse2: movd rdx, xmm0 xorpd xmm2, xmm2 ; zeroed out for later use mov r10,rdx mov r8d, 1 ; for determining region later on btr r10,63 ; r10 <-- |x| cmp r10,L_piby4 jb Lsin_sse2_absx_lt_piby4 Lsin_sse2_absx_nlt_piby4: ; common case mov r11,rdx shr r11,63 movd xmm0,r10 ; xmm0 <-- |x| cmp r10, QWORD PTR L_small_arg_cw jae Lsin_reduce_precise ; Note NaN/Inf will branch ; At this point we have |x| < L_small_arg_cw, which is currently 500000. ; Note that if |x| were too large, conversion of npi2 to integer would fail. ; We reduce the argument to be in a range from -pi/4 to +pi/4 ; by subtracting multiples of pi/2 movapd xmm2, xmm0 mulsd xmm2, L_2_by_pi movapd xmm4, xmm0 ; xexp = ax >> EXPSHIFTBITS_DP64; mov r9, r10 shr r9, 52 ; >>EXPSHIFTBITS_DP64 ; How many pi/2 is |x| a multiple of? ; npi2 = (int)(x * twobypi + 0.5); addsd xmm2, L_one_half ; npi2 movsd xmm3, L_real_piby2_1 cvttpd2dq xmm0, xmm2 ; convert npi2 to integer movsd xmm1, L_real_piby2_1tail cvtdq2pd xmm2, xmm0 ; npi2 back to double ; Subtract the multiple from x to get an extra-precision remainder ; rhead = x - npi2 * piby2_1; mulsd xmm3, xmm2 subsd xmm4, xmm3 ; rhead ; rtail = npi2 * piby2_1tail; mulsd xmm1, xmm2 ; rtail movd eax, xmm0 ; eax <-- npi2 ; GET_BITS_DP64(rhead-rtail, uy); ; originally only rhead movapd xmm0, xmm4 subsd xmm0, xmm1 movsd xmm3, L_real_piby2_2 movd rcx, xmm0 ; rcx <-- rhead - rtail movsd xmm5, L_real_piby2_2tail ; piby2_2tail ; xmm0=r, xmm1=rtail, xmm2=npi2, xmm3=temp for calc, ; xmm4=rhead, xmm5= temp for calc ; expdiff = xexp - ((uy & EXPBITS_DP64) >> EXPSHIFTBITS_DP64); ; expdiff measures how close rhead - rtail is to |x| ; (larger expdiff ==> more cancellation in |x| - (rhead-rtail) ==> closer) shl rcx, 1 ; strip any sign bit shr rcx, 53 ; >> EXPSHIFTBITS_DP64 +1 sub r9, rcx ; expdiff ;; if (expdiff > 15) cmp r9, 15 jle Lsin_sse2_cw_reduction_done ; Here the remainder is pretty small compared with x, which ; implies that x is a near multiple of pi/2 ; (x matches the multiple to at least 15 bits) ; So we do another stage of argument reduction. ; t = rhead; movapd xmm1, xmm4 ; rtail = npi2 * piby2_2; mulsd xmm3, xmm2 ; rhead = t - rtail; mulsd xmm5, xmm2 ; npi2 * piby2_2tail subsd xmm4, xmm3 ; rhead ; rtail = npi2 * piby2_2tail - ((t - rhead) - rtail); subsd xmm1, xmm4 ; t - rhead subsd xmm1, xmm3 ; -rtail subsd xmm5, xmm1 ; rtail ; r = rhead - rtail; movapd xmm0, xmm4 ;HARSHA ;xmm1=rtail movapd xmm1, xmm5 ; xmm1 <-- copy of rtail subsd xmm0, xmm5 ; xmm0=r, xmm4=rhead, xmm1=rtail Lsin_sse2_cw_reduction_done: ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; if the input was close to a pi/2 multiple ; The original NAG code missed this trick. ; If the input is very close to n*pi/2 after reduction, so r < 2^-27, ; then the sin is either ~ 1.0 or ~r, to within 53 bits. ; Note: Unfortunately this introduces two jcc instructions close to each ; other and to other branches. As r < 2^-13 should be rather uncommon, it ; almost certainly costs more than it saves. - WAT ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; region = npi2 & 3; subsd xmm4, xmm0 ; rhead-r subsd xmm4, xmm1 ; rr = (rhead-r) - rtail Lsin_piby4: ; perform taylor series to calc sinx, sinx for |x| <= pi/4 ; x2 = r * r; ;xmm4 = a part of rr for the sin path, xmm4 is overwritten in the sin path ;instead use xmm3 because that was freed up in the sin path, xmm3 is overwritten in sin path movapd xmm3, xmm0 movapd xmm2, xmm0 mulsd xmm2, xmm0 ;x2 bt eax,0 jc Lsin_sse2_calc_cos ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; region 0 or 2 do a sin calculation movsd xmm3, __Lsinarray+50h ; s6 mulsd xmm3, xmm2 ; x2s6 movsd xmm5, __Lsinarray+20h ; s3 movsd QWORD PTR p_temp[rsp], xmm4 ; store xx movapd xmm1, xmm2 ; move for x4 mulsd xmm1, xmm2 ; x4 movsd QWORD PTR p_temp1[rsp], xmm0 ; store x mulsd xmm5, xmm2 ; x2s3 movapd xmm4, xmm0 ; move for x3 addsd xmm3, __Lsinarray+40h ; s5+x2s6 mulsd xmm1, xmm2 ; x6 mulsd xmm3, xmm2 ; x2(s5+x2s6) mulsd xmm4, xmm2 ; x3 addsd xmm5, __Lsinarray+10h ; s2+x2s3 mulsd xmm5, xmm2 ; x2(s2+x2s3) addsd xmm3, __Lsinarray+30h ; s4 + x2(s5+x2s6) mulsd xmm2, L_one_half ; 0.5 *x2 movsd xmm0, QWORD PTR p_temp[rsp] ; load xx mulsd xmm3, xmm1 ; x6(s4 + x2(s5+x2s6)) addsd xmm5, __Lsinarray ; s1+x2(s2+x2s3) mulsd xmm2, xmm0 ; 0.5 * x2 *xx addsd xmm3, xmm5 ; zs mulsd xmm4, xmm3 ; *x3 subsd xmm4, xmm2 ; x3*zs - 0.5 * x2 *xx addsd xmm0, xmm4 ; +xx addsd xmm0, QWORD PTR p_temp1[rsp] ; +x jmp Lsin_sse2_adjust_region ALIGN 16 Lsin_sse2_calc_cos: ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; region 1 or 3 - do a cos calculation ; zc = (c2 + x2 * (c3 + x2 * (c4 + x2 * (c5 + x2 * c6)))); mulsd xmm4, xmm0 ; x*xx movsd xmm5, L_one_half movsd xmm1, __Lcosarray+50h ; c6 movsd xmm0, __Lcosarray+20h ; c3 mulsd xmm5, xmm2 ; r = 0.5 *x2 movapd xmm3, xmm2 ; copy of x2 movsd QWORD PTR p_temp[rsp], xmm4 ; store x*xx mulsd xmm1, xmm2 ; c6*x2 mulsd xmm0, xmm2 ; c3*x2 subsd xmm5, L_one ; -t=r-1.0, trash r mulsd xmm3, xmm2 ; x4 addsd xmm1, __Lcosarray+40h ; c5+x2c6 addsd xmm0, __Lcosarray+10h ; c2+x2C3 addsd xmm5, L_one ; 1 + (-t), trash t mulsd xmm3, xmm2 ; x6 mulsd xmm1, xmm2 ; x2(c5+x2c6) mulsd xmm0, xmm2 ; x2(c2+x2C3) movapd xmm4, xmm2 ; copy of x2 mulsd xmm4, L_one_half ; r recalculate addsd xmm1, __Lcosarray+30h ; c4 + x2(c5+x2c6) addsd xmm0, __Lcosarray ; c1+x2(c2+x2C3) mulsd xmm2, xmm2 ; x4 recalculate subsd xmm5, xmm4 ; (1 + (-t)) - r mulsd xmm1, xmm3 ; x6(c4 + x2(c5+x2c6)) addsd xmm0, xmm1 ; zc subsd xmm4, L_one ; t relaculate subsd xmm5, QWORD PTR p_temp[rsp] ; ((1 + (-t)) - r) - x*xx mulsd xmm0, xmm2 ; x4 * zc addsd xmm0, xmm5 ; x4 * zc + ((1 + (-t)) - r -x*xx) subsd xmm0, xmm4 ; result - (-t) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Lsin_sse2_adjust_region: ; positive or negative ; switch (region) shr eax, 1 mov ecx, eax and eax, r11d not ecx not r11d and ecx, r11d or eax, ecx and eax, 1 jnz Lsin_sse2_cleanup ;; if the original region 0, 1 and arg is negative, then we negate the result. ;; if the original region 2, 3 and arg is positive, then we negate the result. movapd xmm2, xmm0 xorpd xmm0, xmm0 subsd xmm0, xmm2 ALIGN 16 Lsin_sse2_cleanup: StackDeallocate stack_size ret ALIGN 16 Lsin_sse2_absx_lt_piby4: ; sin = sin_piby4(x, 0.0); ; x2 = r * r; movapd xmm2, xmm0 mulsd xmm2, xmm0 ; x2 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; region 0 - do a sin calculation ; zs = (s2 + x2 * (s3 + x2 * (s4 + x2 * (s5 + x2 * s6)))); movsd xmm3, __Lsinarray+50h ; s6 mulsd xmm3, xmm2 ; x2s6 movsd xmm5, __Lsinarray+20h ; s3 movapd xmm1, xmm2 ; move for x4 mulsd xmm1, xmm2 ; x4 mulsd xmm5, xmm2 ; x2s3 movapd xmm4, xmm0 ; move for x3 addsd xmm3, __Lsinarray+40h ; s5+x2s6 mulsd xmm1, xmm2 ; x6 mulsd xmm3, xmm2 ; x2(s5+x2s6) mulsd xmm4, xmm2 ; x3 addsd xmm5, __Lsinarray+10h ; s2+x2s3 mulsd xmm5, xmm2 ; x2(s2+x2s3) addsd xmm3, __Lsinarray+30h ; s4 + x2(s5+x2s6) mulsd xmm3, xmm1 ; x6(s4 + x2(s5+x2s6)) addsd xmm5, __Lsinarray ; s1+x2(s2+x2s3) addsd xmm3, xmm5 ; zs mulsd xmm4, xmm3 ; *x3 addsd xmm0, xmm4 ; +x jmp Lsin_sse2_cleanup ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ALIGN 16 Lsin_reduce_precise: ; Reduce x into range [-pi/4, pih/4] cmp r10,L__inf_mask_64 jae Lsin_x_naninf mov QWORD PTR p_temp[rsp], r11 call __remainder_piby2_forAsm mov r11, QWORD PTR p_temp[rsp] ; At this point xmm0 has r, xmm1 has rr, rax has region movapd xmm4, xmm1 ; xmm4 <-- rr jmp Lsin_piby4 ; xmm0 = x, xmm4 = xx, eax= region ALIGN 16 Lsin_x_naninf: call fname_special StackDeallocate stack_size ret ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; From this point we assume that FMA3 and AVX hardware are present. ALIGN 16 Lsin_fma3: vmovq r9,xmm0 mov r10,r9 ; save x to get sign later btr r9,63 ; r9 <-- |x| cmp r9,L_piby4 jae Lsin_fma3_absx_nlt_piby4 ; Note that NaN will branch cmp r9,L_two_to_neg_13 jae Lsin_fma3_calc_sin_for_absx_lt_piby4 cmp r9,L_two_to_neg_27 jae Lsin_fma3_compute_x_xxx_0_1666 StackDeallocate stack_size ret ; sin x ~= x for |x| < 2^-27 ALIGN 16 Lsin_fma3_compute_x_xxx_0_1666: ; |x| in [2^-27,2^-13] vmulsd xmm1,xmm0,xmm0 ; xmm1l <-- x*x vmulsd xmm1,xmm1,xmm0 ; xmm1l <-- x*x*x vfnmadd231sd xmm0,xmm1,L_one_sixth ; xmm0l <-- x - x*x*x*(1/6) StackDeallocate stack_size ret ALIGN 16 Lsin_fma3_calc_sin_for_absx_lt_piby4: ; |x| in [2^-13,pi/4] vmovsd xmm5,__Lsinarray+050h vmulsd xmm3,xmm0,xmm0 ; xmm3l <-- x^2 vfmadd213sd xmm5,xmm3,__Lsinarray+040h vfmadd213sd xmm5,xmm3,__Lsinarray+030h vfmadd213sd xmm5,xmm3,__Lsinarray+020h vfmadd213sd xmm5,xmm3,__Lsinarray+010h vmulsd xmm4,xmm0,xmm3 ; xmm4l <-- x^3 vfmadd213sd xmm5,xmm3,__Lsinarray vfmadd231sd xmm0,xmm4,xmm5 ; xmm0l <-- x + x^3 p(x^2) StackDeallocate stack_size ret ALIGN 16 Lsin_fma3_absx_nlt_piby4: ; !(|x| < pi/4) ; here r9 has |x| cmp r9,L__inf_mask_64 jae Lsin_x_naninf ;Lrange_reduce: ;; unused label vmovq xmm0,r9 ; xmm0 <-- |x| cmp r9,L_small_arg_bdl jae Lsin_fma3_do_general_arg_reduction ; Note that __remainder_piby2_fma3 conventions are ; on input ; |x| is in xmm0 ; on output ; r is in xmm0 ; rr is in xmm1 ; region of |x| is in rax ; Boldo-Daumas-Li reduction for reasonably small |x| call __remainder_piby2_fma3_bdl Lsin_fma3_exit_s: bt rax,0 vmulsd xmm3,xmm0,xmm0 ; xmm3 <-- x2 = x * x jc Lsin_fma3_calc_cos Lsin_fma3_calc_sin: ;; unused label ; region 0 or 2 ; compute the sine of r+rr, where this sum is in [-pi/4,pi/4] vmovsd xmm5,__Lsinarray+050h vfmadd213sd xmm5,xmm3,__Lsinarray+040h vfmadd213sd xmm5,xmm3,__Lsinarray+030h vfmadd213sd xmm5,xmm3,__Lsinarray+020h vfmadd213sd xmm5,xmm3,__Lsinarray+010h ; xmm5 <-- r vmulsd xmm4,xmm0,xmm3 ; xmm4 <-- x3 = x*x*x vmulsd xmm2,xmm4,xmm5 ; xmm2 <-- x*x*x * r vmulsd xmm5,xmm1,L_one_half ; xmm5 <-- .5*x*x vsubsd xmm2,xmm5,xmm2 ; xmm2 <-- .5*x*x - x*x*x*r vmulsd xmm2,xmm3,xmm2 vsubsd xmm2,xmm2,xmm1 vfnmadd231sd xmm2, xmm4,__Lsinarray vsubsd xmm0,xmm0,xmm2 jmp Lsin_fma3_exit_s_1 ALIGN 16 Lsin_fma3_calc_cos: ; region 1 or 3 ; compute the cosine of r+rr, where this sum is in [-pi/4,pi/4] vmovapd xmm2,L_one vmulsd xmm5,xmm3,L_one_half ; xmm5 <-- x*x*.5 == r vsubsd xmm4,xmm2,xmm5 ; xmm4 <-- t = 1. - x*x*.5 vsubsd xmm2,xmm2,xmm4 ; 1-t vsubsd xmm2,xmm2,xmm5 ; xmm2 <-- (1-t) - r vmovsd xmm5,__Lcosarray+050h vfnmadd231sd xmm2,xmm0,xmm1 ; (1.0 - t) - r) - x * xx) xmm2 vmulsd xmm1,xmm3,xmm3 ; x2 * x2 xmm1 vfmadd213sd xmm5,xmm3,__Lcosarray+040h vfmadd213sd xmm5,xmm3,__Lcosarray+030h vfmadd213sd xmm5,xmm3,__Lcosarray+020h vfmadd213sd xmm5,xmm3,__Lcosarray+010h vfmadd213sd xmm5,xmm3,__Lcosarray vfmadd213sd xmm5,xmm1,xmm2 vaddsd xmm0,xmm5,xmm4 Lsin_fma3_exit_s_1: xor r8,r8 ; prepare r8 for cmov and r10,L_signbit ; isolate original sign of x bt eax,1 cmovc r8,L_signbit xor r8,r10 vmovq xmm3,r8 vxorpd xmm0,xmm0,xmm3 StackDeallocate stack_size ret ALIGN 16 Lsin_fma3_do_general_arg_reduction: ; argument reduction for general x ; NOTE: the BDL argument reduction routine does not touch r10, ; but the general-purpose reduction does. mov QWORD PTR [save_r10+rsp], r10 call __remainder_piby2_fma3 mov r10, QWORD PTR [save_r10+rsp] jmp Lsin_fma3_exit_s fname endp END