;
; MIT License
; -----------
;
; Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
;
; Permission is hereby granted, free of charge, to any person obtaining a copy
; of this Software and associated documentation files (the "Software"), to deal
; in the Software without restriction, including without limitation the rights
; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
; copies of the Software, and to permit persons to whom the Software is
; furnished to do so, subject to the following conditions:
;
; The above copyright notice and this permission notice shall be included in
; all copies or substantial portions of the Software.
;
; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
; THE SOFTWARE.
;
; log.asm
;
; An implementation of the log libm function.
;
; Prototype:
;
;     double log(double x);
;
;
; Algorithm:
;
;   Based on:
;   Ping-Tak Peter Tang
;   "Table-driven implementation of the logarithm function in IEEE
;   floating-point arithmetic"
;   ACM Transactions on Mathematical Software (TOMS)
;   Volume 16, Issue 4 (December 1990)
;
;   x very close to 1.0 is handled differently; for x everywhere else
;   a brief explanation is given below.
;
;   x = (2^m)*A
;   x = (2^m)*(G+g) with (1 <= G < 2) and (g <= 2^(-9))
;   x = (2^m)*2*(G/2+g/2)
;   x = (2^m)*2*(F+f) with (0.5 <= F < 1) and (f <= 2^(-10))
;
;   Y = (2^(-1))*(2^(-m))*(2^m)*A
;   Now, range of Y is: 0.5 <= Y < 1
;
;   F = 0x100 + (first 8 mantissa bits) + (9th mantissa bit)
;   Now, range of F is: 256 <= F <= 512
;   F = F / 512
;   Now, range of F is: 0.5 <= F <= 1
;
;   f = -(Y-F), with (f <= 2^(-10))
;
;   log(x) = m*log(2) + log(2) + log(F-f)
;   log(x) = m*log(2) + log(2) + log(F) + log(1-(f/F))
;   log(x) = m*log(2) + log(2*F) + log(1-r)
;
;   r = (f/F), with (r <= 2^(-9))
;   r = f*(1/F) with (1/F) precomputed to avoid division
;
;   log(x) = m*log(2) + log(G) - poly
;
;   log(G) is precomputed
;   poly = r + (r^2)/2 + (r^3)/3 + (r^4)/4 + (r^5)/5 + (r^6)/6
;
;   log(2) and log(G) need to be maintained in extra precision
;   to avoid losing precision in the calculations.
;

.const

ALIGN 16
; Each scalar constant is followed by a zero qword so that every label is
; 16-byte aligned and usable either as a QWORD PTR scalar operand or as a
; full XMMWORD operand (the integer mask constants are used both ways).
__real_ninf         DQ 0fff0000000000000h   ; -inf
                    DQ 0000000000000000h
__real_inf          DQ 7ff0000000000000h    ; +inf
                    DQ 0000000000000000h
__real_neg_qnan     DQ 0fff8000000000000h   ; negative qNaN (result for x < 0)
                    DQ 0000000000000000h
__real_qnanbit      DQ 0008000000000000h    ; quiet bit, ORed in to quiet an sNaN
                    DQ 0000000000000000h
__real_min_norm     DQ 0010000000000000h    ; smallest normal double (not used in
                    DQ 0000000000000000h    ; the code paths visible in this file)
__real_mant         DQ 000FFFFFFFFFFFFFh    ; mantissa bits
                    DQ 0000000000000000h
__mask_1023         DQ 00000000000003ffh    ; IEEE754 double exponent bias
                    DQ 0000000000000000h
__mask_001          DQ 0000000000000001h    ; (not used in the visible code paths)
                    DQ 0000000000000000h
__mask_mant_all8    DQ 000ff00000000000h    ; top 8 mantissa bits -> table index
                    DQ 0000000000000000h
__mask_mant9        DQ 0000080000000000h    ; 9th mantissa bit, rounds the index
                    DQ 0000000000000000h
__real_two          DQ 4000000000000000h    ; 2
                    DQ 0000000000000000h
__real_one          DQ 3ff0000000000000h    ; 1
                    DQ 0000000000000000h
__real_near_one_lt  DQ 3fee000000000000h    ; .9375
                    DQ 0000000000000000h
__real_near_one_gt  DQ 3ff1000000000000h    ; 1.0625
                    DQ 0000000000000000h
__real_half         DQ 3fe0000000000000h    ; 1/2
                    DQ 0000000000000000h
__mask_100          DQ 0000000000000100h    ; (not used in the visible code paths)
                    DQ 0000000000000000h
__real_1_over_512   DQ 3f60000000000000h    ; 1/512
                    DQ 0000000000000000h
__real_1_over_2     DQ 3fe0000000000000h    ; 1/2
                    DQ 0000000000000000h
__real_1_over_3     DQ 3fd5555555555555h    ; 1/3
                    DQ 0000000000000000h
__real_1_over_4     DQ 3fd0000000000000h    ; 1/4
                    DQ 0000000000000000h
__real_1_over_5     DQ 3fc999999999999ah    ; 1/5
                    DQ 0000000000000000h
__real_1_over_6     DQ 3fc5555555555555h    ; 1/6
                    DQ 0000000000000000h
__mask_1023_f       DQ 0c08ff80000000000h   ; -1023.0; xexp compares equal => denormal
                    DQ 0000000000000000h
__mask_2045         DQ 00000000000007fdh    ; 2045 = 1023 + 1022, exponent re-bias
                    DQ 0000000000000000h    ; used by the denormal adjustment
__real_threshold    DQ 3fb0000000000000h    ; .0625, |x-1| cutoff for near-one path
                    DQ 0000000000000000h
__real_notsign      DQ 7ffFFFFFFFFFFFFFh    ; mask that clears the sign bit
                    DQ 0000000000000000h
; minimax coefficients for the near-one series
__real_ca1          DQ 3fb55555555554e6h    ; 8.33333333333317923934e-02
                    DQ 0000000000000000h
__real_ca2          DQ 3f89999999bac6d4h    ; 1.25000000037717509602e-02
                    DQ 0000000000000000h
__real_ca3          DQ 3f62492307f1519fh    ; 2.23213998791944806202e-03
                    DQ 0000000000000000h
__real_ca4          DQ 3f3c8034c85dfff0h    ; 4.34887777707614552256e-04
                    DQ 0000000000000000h
; log(2) split into a high (lead) part and a low (tail) part so that
; m*log(2) can be accumulated in extra precision
__real_log2_lead    DQ 03fe62e42e0000000h   ; 6.93147122859954833984e-01
                    DQ 00000000000000000h
__real_log2_tail    DQ 03e6efa39ef35793ch   ; 5.76999904754328540596e-08
                    DQ 00000000000000000h

; these codes and the ones in the corresponding .c file have to match
__flag_x_zero       DD 00000001
__flag_x_neg        DD 00000002
__flag_x_nan        DD 00000003

; tables and the runtime CPU-dispatch flag, defined elsewhere in the library
EXTRN __log_256_lead:QWORD
EXTRN __log_256_tail:QWORD
EXTRN __log_F_inv_qword:QWORD
EXTRN __use_fma3_lib:DWORD

; Public name of this routine and name of the C special-case handler.
; FIX: the source text read "fname TEXTEQU" with no operand (the <log>
; text equate had been stripped), which does not assemble and leaves
; PUBLIC fname / fname PROC without an expansion. Restored.
fname           TEXTEQU <log>
fname_special   TEXTEQU <_log_special>

; define local variable storage offsets
save_xmm6       EQU 20h                     ; home for callee-saved xmm6 (Win x64 ABI)
dummy_space     EQU 40h
stack_size      EQU 58h                     ; keeps rsp 16-aligned after the prologue

include fm.inc

; external function
EXTERN fname_special:PROC

.code

ALIGN 16
PUBLIC fname
fname PROC FRAME
    ; Windows x64 prologue: allocate locals and preserve xmm6 (callee-saved),
    ; with unwind info emitted by the StackAllocate/SaveXmm macros (fm.inc).
    StackAllocate stack_size
    SaveXmm xmm6, save_xmm6
    .ENDPROLOG

    ; runtime dispatch: take the FMA3 path when the CPU supports it
    cmp DWORD PTR __use_fma3_lib, 0
    jne Llog_fma3

Llog_sse2:
        ; ---- SSE2 path ----------------------------------------------------
        ; compute exponent part: shift the biased exponent of x into the low
        ; qword and subtract the bias 1023 to obtain m.
        movdqa xmm3, xmm0
        movapd xmm4, xmm0
        psrlq xmm3, 52
        movd rax, xmm0                          ; rax <-- raw IEEE754 bits of x
        psubq xmm3, XMMWORD PTR __mask_1023     ; xmm3 <-- unbiased exponent m

        ; NaN or inf: with the sign bit cleared, any encoding >= +inf is
        ; special (inf or NaN)
        mov rcx, rax
        btr rcx, 63                             ; clear sign bit
        cmp rcx, QWORD PTR __real_inf
        jae __x_is_inf_or_nan

        movdqa xmm2, xmm0
        cvtdq2pd xmm6, xmm3                     ; xmm6 <-- xexp = (double)m
        pand xmm2, XMMWORD PTR __real_mant      ; xmm2 <-- mantissa bits of x
        subsd xmm4, QWORD PTR __real_one        ; xmm4 <-- x - 1 (for near-one test)
        comisd xmm6, QWORD PTR __mask_1023_f    ; xexp == -1023.0 => x is denormal
        je __denormal_adjust

__continue_common:
        andpd xmm4, XMMWORD PTR __real_notsign  ; xmm4 <-- |x - 1|

        ; compute index into the log tables: top 8 mantissa bits, rounded
        ; by the 9th bit (see F construction in the header comment)
        mov r9, rax
        and rax, QWORD PTR __mask_mant_all8
        and r9, QWORD PTR __mask_mant9
        shl r9, 1
        add rax, r9
        movd xmm1, rax

        ; near one codepath: |x - 1| < 0.0625 uses a dedicated series
        comisd xmm4, QWORD PTR __real_threshold
        jb __near_one

        ; F, Y
        shr rax, 44                             ; rax <-- table index (0..256)
        por xmm2, XMMWORD PTR __real_half       ; xmm2 <-- Y in [0.5, 1)
        por xmm1, XMMWORD PTR __real_half       ; xmm1 <-- F (rounded Y)
        lea r9, __log_F_inv_qword

        ; check for negative numbers or zero (flags are consumed again at
        ; __x_is_zero_or_neg to distinguish the two cases)
        xorpd xmm5, xmm5
        comisd xmm0, xmm5
        jbe __x_is_zero_or_neg

        ; f = F - Y, r = f * inv
        subsd xmm1, xmm2                        ; xmm1 <-- f = F - Y
        mulsd xmm1, QWORD PTR [r9+rax*8]        ; xmm1 <-- r = f * (1/F)
        movapd xmm2, xmm1                       ; xmm2 <-- copy of r
        movapd xmm0, xmm1                       ; xmm0 <-- copy of r
        lea r9, QWORD PTR __log_256_lead

        ; poly = r + r^2/2 + r^3/3 + r^4/4 + r^5/5 + r^6/6, split into two
        ; partial sums to shorten the dependency chain
        movsd xmm3, QWORD PTR __real_1_over_6
        movsd xmm1, QWORD PTR __real_1_over_3
        mulsd xmm3, xmm2                        ; xmm3 <-- r/6
        mulsd xmm1, xmm2                        ; xmm1 <-- r/3
        mulsd xmm0, xmm2                        ; xmm0 <-- r*r
        movapd xmm4, xmm0                       ; xmm4 <-- copy of r*r
        addsd xmm3, QWORD PTR __real_1_over_5   ; xmm3 <-- r/6 + 1/5
        addsd xmm1, QWORD PTR __real_1_over_2   ; xmm1 <-- r/3 + 1/2
        mulsd xmm4, xmm0                        ; xmm4 <-- r^4
        mulsd xmm3, xmm2                        ; xmm3 <-- (r/6 + 1/5)*r
        mulsd xmm1, xmm0                        ; xmm1 <-- (r/3 + 1/2)*r^2
        addsd xmm3, QWORD PTR __real_1_over_4   ; xmm3 <-- (r/6 + 1/5)*r + 1/4
        addsd xmm1, xmm2                        ; xmm1 <-- (r/3 + 1/2)*r^2 + r
        mulsd xmm3, xmm4                        ; xmm3 <-- ((r/6+1/5)*r+1/4)*r^4
        addsd xmm1, xmm3                        ; xmm1 <-- poly

        ; result = (m*log2_lead + log(G)_lead)
        ;        + (m*log2_tail - poly + log(G)_tail)
        ; lead/tail kept separate for extra precision (see header comment)
        movsd xmm5, QWORD PTR __real_log2_tail
        mulsd xmm5, xmm6                        ; xmm5 <-- m*log2_tail
        subsd xmm5, xmm1                        ; xmm5 <-- m*log2_tail - poly
        movsd xmm0, QWORD PTR [r9+rax*8]        ; xmm0 <-- log(G)_lead
        lea rdx, QWORD PTR __log_256_tail
        movsd xmm2, QWORD PTR [rdx+rax*8]       ; xmm2 <-- log(G)_tail
        addsd xmm2, xmm5                        ; xmm2 <-- tail sum
        movsd xmm4, QWORD PTR __real_log2_lead
        mulsd xmm4, xmm6                        ; xmm4 <-- m*log2_lead
        addsd xmm0, xmm4                        ; xmm0 <-- lead sum
        addsd xmm0, xmm2                        ; xmm0 <-- log(x)
        RestoreXmm xmm6, save_xmm6
        StackDeallocate stack_size
        ret

ALIGN 16
__near_one:
        ; |x-1| < 0.0625: evaluate log(1+r) with u = 2r/(2+r) and the ca1..ca4
        ; coefficient series, subtracting a correction term for extra accuracy.
        ; r = x - 1.0
        movsd xmm2, QWORD PTR __real_two
        subsd xmm0, QWORD PTR __real_one        ; xmm0 <-- r
        addsd xmm2, xmm0                        ; xmm2 <-- 2 + r
        movsd xmm1, xmm0
        divsd xmm1, xmm2                        ; xmm1 <-- r/(2+r) = u/2
        movsd xmm4, QWORD PTR __real_ca2
        movsd xmm5, QWORD PTR __real_ca4
        movsd xmm6, xmm0
        mulsd xmm6, xmm1                        ; xmm6 <-- correction = r*u/2
        addsd xmm1, xmm1                        ; xmm1 <-- u
        movsd xmm2, xmm1
        mulsd xmm2, xmm1                        ; xmm2 <-- u^2
        mulsd xmm4, xmm2
        mulsd xmm5, xmm2
        addsd xmm4, __real_ca1                  ; xmm4 <-- ca1 + ca2*u^2
        addsd xmm5, __real_ca3                  ; xmm5 <-- ca3 + ca4*u^2
        mulsd xmm2, xmm1                        ; xmm2 <-- u^3
        mulsd xmm4, xmm2                        ; xmm4 <-- (ca1 + ca2*u^2)*u^3
        mulsd xmm2, xmm2
        mulsd xmm2, xmm1                        ; xmm2 <-- u^7
        mulsd xmm5, xmm2                        ; xmm5 <-- (ca3 + ca4*u^2)*u^7
        addsd xmm4, xmm5
        subsd xmm4, xmm6                        ; subtract correction
        addsd xmm0, xmm4                        ; xmm0 <-- r + (series - correction)
        RestoreXmm xmm6, save_xmm6
        StackDeallocate stack_size
        ret

ALIGN 16
__denormal_adjust:
        ; Denormal x: renormalize via (x | 1.0) - 1.0, then extract the new
        ; mantissa bits and the corrected exponent (re-biased by 2045).
        por xmm2, XMMWORD PTR __real_one
        subsd xmm2, QWORD PTR __real_one
        movsd xmm5, xmm2
        pand xmm2, XMMWORD PTR __real_mant
        movd rax, xmm2                          ; rax <-- renormalized mantissa bits
        psrlq xmm5, 52
        psubd xmm5, XMMWORD PTR __mask_2045     ; exponent - 2045 => true exponent
        cvtdq2pd xmm6, xmm5                     ; xmm6 <-- corrected xexp
        jmp __continue_common

ALIGN 16
__x_is_zero_or_neg:
        ; flags still hold the comisd(x, 0.0) result: ZF set => x == 0
        jne __x_is_neg
        movsd xmm1, QWORD PTR __real_ninf       ; log(0) -> -inf
        mov r8d, DWORD PTR __flag_x_zero        ; error code for the C handler
        call fname_special                      ; handler defined in the matching .c file
        jmp __finish

ALIGN 16
__x_is_neg:
        movsd xmm1, QWORD PTR __real_neg_qnan   ; log(x<0) -> qNaN
        mov r8d, DWORD PTR __flag_x_neg
        call fname_special
        jmp __finish

ALIGN 16
__x_is_inf_or_nan:
        cmp rax, QWORD PTR __real_inf
        je __finish                             ; log(+inf) = +inf, returned as-is
        cmp rax, QWORD PTR __real_ninf
        je __x_is_neg                           ; -inf treated as a negative input
        or rax, QWORD PTR __real_qnanbit        ; quiet the NaN before reporting
        movd xmm1, rax
        mov r8d, DWORD PTR __flag_x_nan
        call fname_special
        jmp __finish

ALIGN 16
__finish:
        ; common epilogue for all special-case paths
        RestoreXmm xmm6, save_xmm6
        StackDeallocate stack_size
        ret

ALIGN 16
Llog_fma3:
        ; ---- AVX/FMA3 path: same algorithm as the SSE2 path above ---------
        ; compute exponent part
        xor rax,rax
        vpsrlq xmm3,xmm0,52
        vmovq rax,xmm0                          ; rax <-- raw IEEE754 bits of x
        vpsubq xmm3,xmm3,XMMWORD PTR __mask_1023
        vcvtdq2pd xmm6,xmm3                     ; xmm6 <-- xexp
        ; NaN or inf: isolate the exponent field and compare against +inf
        vpand xmm5,xmm0,XMMWORD PTR __real_inf
        vcomisd xmm5,QWORD PTR __real_inf
        je Llog_fma3_x_is_inf_or_nan

        ; check for negative numbers or zero (flags consumed again at
        ; Llog_fma3_x_is_zero_or_neg to distinguish the two cases)
        vpxor xmm5,xmm5,xmm5
        vcomisd xmm0,xmm5
        jbe Llog_fma3_x_is_zero_or_neg

        vpand xmm2,xmm0,XMMWORD PTR __real_mant ; xmm2 <-- mantissa bits of x
        vsubsd xmm4,xmm0,QWORD PTR __real_one   ; xmm4 <-- x - 1
        vcomisd xmm6,QWORD PTR __mask_1023_f    ; xexp == -1023.0 => denormal
        je Llog_fma3_denormal_adjust

Llog_fma3_continue_common:
        ; compute index into the log tables (top 8 mantissa bits rounded
        ; by the 9th bit)
        vpand xmm1,xmm0,XMMWORD PTR __mask_mant_all8
        vpand xmm3,xmm0,XMMWORD PTR __mask_mant9
        vpsllq xmm3,xmm3,1
        vpaddq xmm1,xmm3,xmm1
        vmovq rax,xmm1

        ; near one codepath
        vpand xmm4,xmm4,XMMWORD PTR __real_notsign ; xmm4 <-- |x - 1|
        vcomisd xmm4,QWORD PTR __real_threshold
        jb Llog_fma3_near_one

        ; F,Y
        shr rax,44                              ; rax <-- table index
        vpor xmm2,xmm2,XMMWORD PTR __real_half  ; xmm2 <-- Y in [0.5, 1)
        vpor xmm1,xmm1,XMMWORD PTR __real_half  ; xmm1 <-- F
        lea r9,QWORD PTR __log_F_inv_qword

        ; f = F - Y, r = f * inv
        vsubsd xmm1,xmm1,xmm2
        vmulsd xmm1,xmm1,QWORD PTR[r9 + rax * 8] ; xmm1 <-- r = f * (1/F)
        lea r9,QWORD PTR __log_256_lead

        ; poly: same series as the SSE2 path, contracted with FMA
        vmulsd xmm0,xmm1,xmm1                   ; xmm0 <-- r*r
        vmovsd xmm3,QWORD PTR __real_1_over_6
        vmovsd xmm5,QWORD PTR __real_1_over_3
        vfmadd213sd xmm3,xmm1,QWORD PTR __real_1_over_5 ; r*1/6 + 1/5
        vfmadd213sd xmm5,xmm1,QWORD PTR __real_1_over_2 ; r*1/3 + 1/2
        vmovsd xmm4,xmm0,xmm0                   ; copy of r*r (overwritten just below)
        vfmadd213sd xmm3,xmm1,QWORD PTR __real_1_over_4 ; (r/6+1/5)*r + 1/4
        vmulsd xmm4,xmm0,xmm0                   ; xmm4 <-- r^4
        vfmadd231sd xmm1,xmm5,xmm0              ; xmm1 <-- r + r^2*(1/2 + r/3)
        vfmadd231sd xmm1,xmm3,xmm4              ; xmm1 <-- poly

        ; (m*log2_lead + log(G)_lead) + (m*log2_tail - poly + log(G)_tail)
        vmovsd xmm5,QWORD PTR __real_log2_tail
        vfmsub213sd xmm5,xmm6,xmm1              ; xmm5 <-- m*log2_tail - poly
        vmovsd xmm0,QWORD PTR[r9 + rax * 8]     ; xmm0 <-- log(G)_lead
        lea rdx,QWORD PTR __log_256_tail
        vmovsd xmm1,QWORD PTR[rdx + rax * 8]    ; xmm1 <-- log(G)_tail
        vaddsd xmm1,xmm1,xmm5                   ; tail sum
        vfmadd231sd xmm0,xmm6,QWORD PTR __real_log2_lead ; lead sum
        vaddsd xmm0,xmm0,xmm1                   ; xmm0 <-- log(x)
        AVXRestoreXmm xmm6, save_xmm6
        StackDeallocate stack_size
        ret

ALIGN 16
Llog_fma3_near_one:
        ; |x-1| < 0.0625: same series as SSE2 __near_one, FMA-contracted
        ; r = x - 1.0
        vmovsd xmm3,QWORD PTR __real_two
        vsubsd xmm0,xmm0,QWORD PTR __real_one   ; xmm0 <-- r
        vaddsd xmm3,xmm3,xmm0                   ; xmm3 <-- 2 + r
        vdivsd xmm1,xmm0,xmm3                   ; xmm1 <-- r/(2+r) = u/2
        vmovsd xmm4,QWORD PTR __real_ca2
        vmovsd xmm5,QWORD PTR __real_ca4
        vmulsd xmm3,xmm0,xmm1                   ; xmm3 <-- correction = r*u/2
        vaddsd xmm1,xmm1,xmm1                   ; xmm1 <-- u
        vmulsd xmm2,xmm1,xmm1                   ; xmm2 <-- u^2
        vfmadd213sd xmm4,xmm2,QWORD PTR __real_ca1 ; ca1 + ca2*u^2
        vfmadd213sd xmm5,xmm2,QWORD PTR __real_ca3 ; ca3 + ca4*u^2
        vmulsd xmm2,xmm2,xmm1                   ; xmm2 <-- u^3
        vmulsd xmm4,xmm4,xmm2                   ; (ca1 + ca2*u^2)*u^3
        vmulsd xmm2,xmm2,xmm2
        vmulsd xmm2,xmm2,xmm1                   ; xmm2 <-- u^7
        vfmadd231sd xmm4,xmm5,xmm2              ; + (ca3 + ca4*u^2)*u^7
        vsubsd xmm4,xmm4,xmm3                   ; subtract correction
        vaddsd xmm0,xmm0,xmm4                   ; xmm0 <-- result
        AVXRestoreXmm xmm6, save_xmm6
        StackDeallocate stack_size
        ret

Llog_fma3_denormal_adjust:
        ; Denormal x: renormalize via (x | 1.0) - 1.0, recompute mantissa
        ; bits (left in xmm0 for the table indexing above) and the corrected
        ; exponent (re-biased by 2045).
        vpor xmm2,xmm2,XMMWORD PTR __real_one
        vsubsd xmm2,xmm2,QWORD PTR __real_one
        vpsrlq xmm5,xmm2,52
        vpand xmm2,xmm2,XMMWORD PTR __real_mant
        vmovapd xmm0,xmm2                       ; xmm0 <-- renormalized mantissa bits
        vpsubd xmm5,xmm5,XMMWORD PTR __mask_2045
        vcvtdq2pd xmm6,xmm5                     ; xmm6 <-- corrected xexp
        jmp Llog_fma3_continue_common

ALIGN 16
Llog_fma3_x_is_zero_or_neg:
        ; flags still hold the vcomisd(x, 0.0) result: ZF set => x == 0
        jne Llog_fma3_x_is_neg
        vmovsd xmm1,QWORD PTR __real_ninf       ; log(0) -> -inf
        mov r8d,DWORD PTR __flag_x_zero
        call fname_special                      ; handler defined in the matching .c file
        AVXRestoreXmm xmm6, save_xmm6
        StackDeallocate stack_size
        ret

ALIGN 16
Llog_fma3_x_is_neg:
        vmovsd xmm1,QWORD PTR __real_neg_qnan   ; log(x<0) -> qNaN
        mov r8d,DWORD PTR __flag_x_neg
        call fname_special
        AVXRestoreXmm xmm6, save_xmm6
        StackDeallocate stack_size
        ret

ALIGN 16
Llog_fma3_x_is_inf_or_nan:
        cmp rax,QWORD PTR __real_inf
        je Llog_fma3_finish                     ; log(+inf) = +inf, returned as-is
        cmp rax,QWORD PTR __real_ninf
        je Llog_fma3_x_is_neg                   ; -inf treated as a negative input
        or rax,QWORD PTR __real_qnanbit         ; quiet the NaN before reporting
        vmovq xmm1,rax
        mov r8d,DWORD PTR __flag_x_nan
        call fname_special
        ; falls through (the ALIGN emits padding) into the common epilogue
ALIGN 16
Llog_fma3_finish:
        AVXRestoreXmm xmm6, save_xmm6
        StackDeallocate stack_size
        ret
fname endp

END