;
; MIT License
; -----------
; 
; Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
; 
; Permission is hereby granted, free of charge, to any person obtaining a copy
; of this Software and associated documentaon files (the "Software"), to deal
; in the Software without restriction, including without limitation the rights
; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
; copies of the Software, and to permit persons to whom the Software is
; furnished to do so, subject to the following conditions:
; 
; The above copyright notice and this permission notice shall be included in
; all copies or substantial portions of the Software.
; 
; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
; THE SOFTWARE.
;
; exp.asm
;
; An implementation of the exp libm function.
;
; Prototype:
;
;     double exp(double x);
;

;
;   Algorithm:
;
;   e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64)
;
;   x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer
;   n = 64*m + j,   0 <= j < 64
;
;   e^x = 2^((64*m + j + f)/64)
;       = (2^m) * (2^(j/64)) * 2^(f/64)
;       = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64))
;
;   f = x*(64/ln(2)) - n
;   r = f*(ln(2)/64) = x - n*(ln(2)/64)
;
;   e^x = (2^m) * (2^(j/64)) * e^r
;
;   (2^(j/64)) is precomputed
;
;   e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5! + (r^5)/5!
;   e^r = 1 + q
;
;   q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5! + (r^5)/5!
;

.const
ALIGN 16
; these codes and the ones in the corresponding .c file have to match
__flag_x_nan            DD 00000001
__flag_y_zero           DD 00000002
__flag_y_inf            DD 00000003

ALIGN 16

L__real_1_by_720              DQ 03f56c16c16c16c17h
                              DQ 03f56c16c16c16c17h   ; 1/720
L__real_1_by_120              DQ 03f81111111111111h
                              DQ 03f81111111111111h   ; 1/120
L__real_1_by_6                DQ 03fc5555555555555h
                              DQ 03fc5555555555555h   ; 1/6
L__real_1_by_2                DQ 03fe0000000000000h
                              DQ 03fe0000000000000h   ; 1/2
L__real_1_by_24               DQ 03fa5555555555555h
                              DQ 03fa5555555555555h   ; 1/24

ALIGN 16
L__log2_by_64_mtail_mhead     DQ 0bf862e42fefa0000h, 0bd1cf79abc9e3b39h
L__ln_of_smallest_normal      DQ 0C086232BDD7ABCD2h
L__zero                       DQ 00000000000000000h
L__max_exp_arg                DQ 040862e42fefa39efh   ;  709.78271289338397
L__denormal_tiny_threshold    DQ 0c0874046dfefd9d0h   ; -744.03460681327306
L__min_exp_arg                DQ 0c0874910d52d3051h   ; -745.13321910194111
L__real_64_by_log2            DQ 040571547652b82feh   ; 64/ln(2)
L__positive_infinity          DQ 07ff0000000000000h
L__negative_infinity          DQ 0fff0000000000000h
L__real_qnanbit               DQ 0008000000000000h    ; qnan set bit
L__real_x_near0_threshold     DQ 3c00000000000000h
L__log2_by_64_mhead           DQ 0bf862e42fefa0000h
L__log2_by_64_mtail           DQ 0bd1cf79abc9e3b39h
L__real_smallest_denormal     DQ 00000000000000001h
L__real_one                   DQ 03ff0000000000000h
L__2_to_neg_26                DQ 03E50000000000000h   ; 2^-26
L__min_normal                 DQ 00010000000000000h   ; smallest normal


EXTRN __two_to_jby64_table:QWORD
EXTRN __two_to_jby64_head_table:QWORD
EXTRN __two_to_jby64_tail_table:QWORD
EXTRN __use_fma3_lib:DWORD

; make room for fname_special to save things
dummy_space     EQU    020h
stack_size      EQU    038h

include fm.inc

fname           TEXTEQU <exp>
fname_special   TEXTEQU <_exp_special>

;Define name and any external functions being called
EXTERN       fname_special      : PROC

.code
PUBLIC fname
fname PROC FRAME
    StackAllocate stack_size
    .ENDPROLOG

    ; We need to avoid unwanted exceptions from a NaN argument.
    ; It could be argued that a signaling NaN should raise an exception,
    ; but the existing library doesn't.  At any rate, the comparison operations
    ; don't seem to like quiet NaN either, so...
    movd         rdx, xmm0
    btr          rdx, 63
    cmp          rdx, L__positive_infinity
    jge          Lexp_x_is_nan_or_inf

    cmp          DWORD PTR __use_fma3_lib, 0
    jne          Lexp_fma3

    movapd       xmm2, xmm0
    movapd       xmm3, xmm0

    ; Some hardware has problems with too many branches in a single
    ; 16- or 32-byte window, so let's peel off the common case into
    ; a single branch.
    cmplesd      xmm2, L__max_exp_arg  ; xmm2 <-- 0xFFFFFFFF is x is not too big positive
    cmpnltsd     xmm3, L__denormal_tiny_threshold ; xmm3 <-- 0xFFFFFFFF if x is not too big negative
    andps        xmm2, xmm3     ; xmm2 <-- 0xFFFFFFFF if x is in range, 0 otherwise
    ucomisd      xmm2, xmm2   ; note that FFF... is NaN, so this comparison should set PF for in-range x
    jp           Lexp_y_is_finite

    ucomisd      xmm0,   L__max_exp_arg
    ja           Lexp_y_is_inf
    ; Since we peeled off the cases with normal result,
    ; there is only one possibility remaining:
    jmp          Lexp_y_is_denormal_or_zero

ALIGN 16
Lexp_y_is_finite:
    ; x * (64/ln(2))
    movapd       xmm1,   xmm0
    btr          rdx, 63                  ; rdx <-- |x|
    cmp          rdx, L__2_to_neg_26
    jbe          Lexp_return_1_plus_x
    mulsd        xmm1,   L__real_64_by_log2

    ; n = int( x * (64/ln(2)) )
    cvttpd2dq    xmm2, xmm1               ; xmm2 = (int)n
    cvtdq2pd     xmm1, xmm2               ; xmm1 = (double)n
    movd         ecx, xmm2
    movapd       xmm2, xmm1
    
    ; r1 = x - n * ln(2)/64 head
    mulsd        xmm1, L__log2_by_64_mhead

    ; j = n & 0x3f
    mov          rax, 03fh
    and          eax, ecx                 ; eax = j
    ; m = (n - j) / 64
    sar          ecx,    6                ; ecx = m


    ; r2 = - n * ln(2)/64 tail
    mulsd        xmm2, L__log2_by_64_mtail
    addsd        xmm0, xmm1               ; xmm0 = r1

    ; r1+r2
    addsd        xmm2, xmm0               ; xmm2 = r

    ; q = r + r^2*1/2 + r^3*1/6 + r^4 *1/24 + r^5*1/120 + r^6*1/720
    ; q = r + r*r*(1/2 + r*(1/6+ r*(1/24 + r*(1/120 + r*(1/720)))))
    movapd       xmm3, L__real_1_by_720   ; xmm3 = 1/720
    mulsd        xmm3, xmm2               ; xmm3 = r*1/720
    movapd       xmm0, L__real_1_by_6     ; xmm0 = 1/6
    movapd       xmm1, xmm2               ; xmm1 = r
    mulsd        xmm0, xmm2               ; xmm0 = r*1/6
    addsd        xmm3, L__real_1_by_120   ; xmm3 = 1/120 + (r*1/720)
    mulsd        xmm1, xmm2               ; xmm1 = r*r
    addsd        xmm0, L__real_1_by_2     ; xmm0 = 1/2 + (r*1/6)
    movapd       xmm4, xmm1               ; xmm4 = r*r
    mulsd        xmm4, xmm1               ; xmm4 = (r*r) * (r*r)
    mulsd        xmm3, xmm2               ; xmm3 = r * (1/120 + (r*1/720))
    mulsd        xmm0, xmm1               ; xmm0 = (r*r)*(1/2 + (r*1/6))
    addsd        xmm3, L__real_1_by_24    ; xmm3 = 1/24 + (r * (1/120 + (r*1/720)))
    addsd        xmm0, xmm2               ; xmm0 = r + ((r*r)*(1/2 + (r*1/6)))
    mulsd        xmm3, xmm4               ; xmm3 = ((r*r) * (r*r)) * (1/24 + (r * (1/120 + (r*1/720))))
    addsd        xmm0, xmm3               ; xmm0 = r + ((r*r)*(1/2 + (r*1/6))) + ((r*r) * (r*r)) * (1/24 + (r * (1/120 + (r*1/720))))

    ;(f)*(q) + f2 + f1
    cmp          ecx, 0fffffc02h          ; -1022
    lea          rdx,  __two_to_jby64_table
    lea          r11,  __two_to_jby64_tail_table
    lea          r10,  __two_to_jby64_head_table
    mulsd        xmm0, QWORD PTR [rdx+rax * 8 ]
    addsd        xmm0, QWORD PTR [r11+rax * 8 ]
    addsd        xmm0, QWORD PTR [r10+rax * 8 ]

    jle          Lexp_process_denormal
Lexp_process_normal:
    shl          rcx,    52
    movd         xmm2,   rcx
    paddq        xmm0,   xmm2
    StackDeallocate stack_size
    ret

ALIGN 16
Lexp_process_denormal:
    jl           Lexp_process_true_denormal
    ucomisd      xmm0,   L__real_one
    jae          Lexp_process_normal
Lexp_process_true_denormal:
    ; here ( e^r < 1 and m = -1022 ) or m <= -1023
    add          ecx, 1074
    mov          rax, 1
    shl          rax, cl
    movd         xmm2, rax
    mulsd        xmm0, xmm2
    jmp          Lexp_finish

Lexp_y_is_one:
    movsd        xmm0, L__real_one
    jmp          Lexp_finish

ALIGN 16
Lexp_x_is_nan_or_inf:
    movd         rax, xmm0
    cmp          rax, L__positive_infinity
    je           Lexp_finish
    cmp          rax, L__negative_infinity
    je           Lexp_return_zero_without_exception
    or           rax, L__real_qnanbit
    movd         xmm1, rax
    mov          r8d, __flag_x_nan
    call         fname_special
    jmp          Lexp_finish

ALIGN 16
Lexp_y_is_inf:
    mov          rax, 07ff0000000000000h
    movd         xmm1, rax
    mov          r8d, __flag_y_inf
    call         fname_special
    jmp          Lexp_finish

ALIGN 16
Lexp_y_is_denormal_or_zero:
    ucomisd      xmm0, L__min_exp_arg
    jbe          Lexp_y_is_zero
    movapd       xmm0, L__real_smallest_denormal
    jmp          Lexp_finish

ALIGN 16
Lexp_y_is_zero:
    pxor         xmm1, xmm1
    mov          r8d, __flag_y_zero
    call         fname_special
    jmp          Lexp_finish

ALIGN 16
Lexp_return_1_plus_x:
    cmp          rdx, L__min_normal
    jbe          Lexp_return_1_plus_eps
    addsd        xmm0, L__real_one
    StackDeallocate stack_size
    ret          0

; Some hardware really does not like subnormals.  Try to avoid them.
ALIGN 16
Lexp_return_1_plus_eps:
    movsd        xmm0, L__real_one
    addsd        xmm0, L__min_normal         ; make sure inexact is set
    StackDeallocate stack_size
    ret          0

ALIGN 16
Lexp_return_zero_without_exception:
    pxor         xmm0,xmm0
    StackDeallocate stack_size
    ret          0


ALIGN 16
Lexp_finish:
    StackDeallocate stack_size
    ret          0

ALIGN 16
Lexp_fma3:
    ; Some hardware has problems with too many branches in a single
    ; 16- or 32-byte window, so let's peel off the common case into
    ; a single branch.
    vcmplesd     xmm2, xmm0, L__max_exp_arg  ; xmm2 <-- 0xFFFFFFFF is x is not too big positive
    vcmpnltsd    xmm3, xmm0, L__denormal_tiny_threshold ; xmm3 <-- 0xFFFFFFFF if x is not too big negative
    vandps       xmm2, xmm3, xmm2  ; xmm2 <-- 0xFFFFFFFF if x is in range, 0 otherwise
    vucomisd     xmm2, xmm2   ; note that FFF... is NaN, so this comparison should set PF for in-range x
    jp           Lexp_fma3_y_is_finite

    vucomisd     xmm0,L__max_exp_arg
    ja           Lexp_fma3_y_is_inf
    ; Since we peeled off the cases with normal result,
    ; there is only one possibility remaining:
    jmp          Lexp_fma3_y_is_zero

;   vpsllq       xmm1, xmm0, 1
;   vpsrlq       xmm1, xmm1, 1
;   vucomisd     xmm1, L__real_x_near0_threshold   ; 2^-63
;   jb           Lexp_fma3_y_is_one

ALIGN 16
Lexp_fma3_y_is_finite:
    vmovq        rdx, xmm0
    btr          rdx, 63                  ; rdx <-- |x|
    cmp          rdx, L__2_to_neg_26
    jbe          Lexp_fma3_return_1_plus_x

    ; x * (64/ln(2))
    vmulsd       xmm1,xmm0,L__real_64_by_log2

    ; n = int( x * (64/ln(2)) )
    vcvttpd2dq   xmm2,xmm1 ;xmm2 = (int)n
    vcvtdq2pd    xmm1,xmm2 ;xmm1 = (double)n ;can use round
    vmovd        ecx,xmm2

    ; r1 = x - n * ln(2)/64 head
    ; r2 = - n * ln(2)/64 tail
    ; r = r1+r2
    vmovlhps     xmm1,xmm1,xmm1 ;xmm1 = (double (double)n,)n
    vmovq        xmm0,xmm0 ;xmm0 = 0,x ;zero out the upper part
    vfmadd132pd  xmm1,xmm0,L__log2_by_64_mtail_mhead
    vhaddpd      xmm2,xmm1,xmm1 ;xmm2 = r,r

    ;j = n & 03fh
    mov          rax,03fh
    and          eax,ecx ;eax = j
    ; m = (n - j) / 64
    sar          ecx,6 ;ecx = m

    ; q = r + r^2*1/2 + r^3*1/6 + r^4 *1/24 + r^5*1/120 + r^6*1/720
    ; q = r + r*r*(1/2 + r*(1/6+ r*(1/24 + r*(1/120 + r*(1/720)))))
    vmovapd      xmm3,L__real_1_by_720
    vfmadd213sd  xmm3,xmm2,L__real_1_by_120
    vfmadd213sd  xmm3,xmm2,L__real_1_by_24
    vfmadd213sd  xmm3,xmm2,L__real_1_by_6
    vfmadd213sd  xmm3,xmm2,L__real_1_by_2
    vmulsd       xmm0,xmm2,xmm2
    vfmadd213sd  xmm0,xmm3,xmm2

    ; (f)*(q) + f2 + f1
    cmp          ecx,0fffffc02h ; -1022
    lea          rdx,__two_to_jby64_table
    lea          r11,__two_to_jby64_tail_table
    lea          r10,__two_to_jby64_head_table
    vmulsd       xmm2,xmm0,QWORD PTR[rdx + rax * 8]
    vaddsd       xmm1,xmm2,QWORD PTR[r11 + rax * 8]
    vaddsd       xmm0,xmm1,QWORD PTR[r10 + rax * 8]

    jle          Lexp_fma3_process_denormal
Lexp_fma3_process_normal:
    shl          rcx,52
    vmovq        xmm2,rcx
    vpaddq       xmm0,xmm0,xmm2
    StackDeallocate stack_size
    ret

ALIGN 16
Lexp_fma3_process_denormal:
    jl           Lexp_fma3_process_true_denormal
    vucomisd     xmm0,L__real_one
    jae          Lexp_fma3_process_normal
Lexp_fma3_process_true_denormal:
    ; here ( e^r < 1 and m = -1022 ) or m <= -1023
    add          ecx,1074
    mov          rax,1
    shl          rax,cl
    vmovq        xmm2,rax
    vmulsd       xmm0,xmm0,xmm2
    jmp          Lexp_fma3_finish

Lexp_fma3_y_is_one:
    vmovsd       xmm0, L__real_one
    jmp          Lexp_fma3_finish


ALIGN 16
Lexp_fma3_y_is_inf:
    mov          rax,07ff0000000000000h
    vmovq        xmm1,rax
    mov          r8d,__flag_y_inf
    call         fname_special
    jmp          Lexp_fma3_finish

ALIGN 16
Lexp_fma3_return_1_plus_x:
    cmp          rdx, L__min_normal
    jbe          Lexp_fma3_return_1_plus_eps
    vaddsd       xmm0, xmm0, L__real_one
    StackDeallocate stack_size
    ret          0

; Some hardware really does not like subnormals.  Try to avoid them.
ALIGN 16
Lexp_fma3_return_1_plus_eps:
    vmovsd       xmm0, L__real_one
    vaddsd       xmm0, xmm0, L__min_normal         ; make sure inexact is set
    StackDeallocate stack_size
    ret          0

ALIGN 16
Lexp_fma3_y_is_zero:
    vpxor        xmm1,xmm1,xmm1
    mov          r8d,__flag_y_zero
    call         fname_special
    jmp          Lexp_fma3_finish

ALIGN 16
Lexp_fma3_return_zero_without_exception:
    vpxor        xmm0,xmm0,xmm0

ALIGN 16
Lexp_fma3_finish:
    StackDeallocate stack_size
    ret

fname            endp
END