reactos/sdk/lib/crt/math/libm_sse2/log.asm

;
; MIT License
; -----------
; 
; Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
; 
; Permission is hereby granted, free of charge, to any person obtaining a copy
; of this Software and associated documentation files (the "Software"), to deal
; in the Software without restriction, including without limitation the rights
; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
; copies of the Software, and to permit persons to whom the Software is
; furnished to do so, subject to the following conditions:
; 
; The above copyright notice and this permission notice shall be included in
; all copies or substantial portions of the Software.
; 
; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
; THE SOFTWARE.
;
; log.asm
;
; An implementation of the log libm function.
;
; Prototype:
;
;     double log(double x);
;

;
;   Algorithm:
;
;   Based on:
;   Ping-Tak Peter Tang
;   "Table-driven implementation of the logarithm function in IEEE
;   floating-point arithmetic"
;   ACM Transactions on Mathematical Software (TOMS)
;   Volume 16, Issue 4 (December 1990)
;
;
;   x very close to 1.0 is handled differently, for x everywhere else
;   a brief explanation is given below
;
;   x = (2^m)*A
;   x = (2^m)*(G+g) with (1 <= G < 2) and (g <= 2^(-9))
;   x = (2^m)*2*(G/2+g/2)
;   x = (2^m)*2*(F+f) with (0.5 <= F < 1) and (f <= 2^(-10))
;
;   Y = (2^(-1))*(2^(-m))*(2^m)*A
;   Now, range of Y is: 0.5 <= Y < 1
;
;   F = 0x100 + (first 8 mantissa bits) + (9th mantissa bit)
;   Now, range of F is: 256 <= F <= 512
;   F = F / 512
;   Now, range of F is: 0.5 <= F <= 1
;
;   f = -(Y-F), with (f <= 2^(-10))
;
;   log(x) = m*log(2) + log(2) + log(F-f)
;   log(x) = m*log(2) + log(2) + log(F) + log(1-(f/F))
;   log(x) = m*log(2) + log(2*F) + log(1-r)
;
;   r = (f/F), with (r <= 2^(-9))
;   r = f*(1/F) with (1/F) precomputed to avoid division
;
;   log(x) = m*log(2) + log(G) - poly
;
;   log(G) is precomputed
;   poly = r + (r^2)/2 + (r^3)/3 + (r^4)/4 + (r^5)/5 + (r^6)/6
;
;   log(2) and log(G) need to be maintained in extra precision
;   to avoid losing precision in the calculations
;

.const
ALIGN 16

; Read-only constant pool for log().
; Every 8-byte constant is followed by a zero quadword so that it occupies a
; full 16-byte slot; the code reads some of them with XMMWORD PTR operands
; (pand/por/psubq) and others with QWORD PTR operands (scalar loads).

__real_ninf         DQ 0fff0000000000000h   ; -inf
                    DQ 0000000000000000h
__real_inf          DQ 7ff0000000000000h    ; +inf
                    DQ 0000000000000000h
__real_neg_qnan     DQ 0fff8000000000000h   ; neg qNaN
                    DQ 0000000000000000h
__real_qnanbit      DQ 0008000000000000h    ; top mantissa bit; OR'ed into a NaN input before the special-case call
                    DQ 0000000000000000h
__real_min_norm     DQ 0010000000000000h    ; bit pattern with only bit 52 set; not referenced in this file
                    DQ 0000000000000000h
__real_mant         DQ 000FFFFFFFFFFFFFh    ; mantissa bits
                    DQ 0000000000000000h
__mask_1023         DQ 00000000000003ffh    ; integer 1023, subtracted from the shifted exponent field
                    DQ 0000000000000000h
__mask_001          DQ 0000000000000001h    ; not referenced in this file
                    DQ 0000000000000000h

__mask_mant_all8    DQ 000ff00000000000h    ; mantissa bits 51..44 (first 8 mantissa bits)
                    DQ 0000000000000000h
__mask_mant9        DQ 0000080000000000h    ; mantissa bit 43 (9th mantissa bit), used to round the table index
                    DQ 0000000000000000h

__real_two          DQ 4000000000000000h ; 2
                    DQ 0000000000000000h

__real_one          DQ 3ff0000000000000h ; 1
                    DQ 0000000000000000h

__real_near_one_lt  DQ 3fee000000000000h ; .9375 (not referenced in this file)
                    DQ 0000000000000000h

__real_near_one_gt  DQ 3ff1000000000000h ; 1.0625 (not referenced in this file)
                    DQ 0000000000000000h

__real_half         DQ 3fe0000000000000h ; 1/2
                    DQ 0000000000000000h

__mask_100          DQ 0000000000000100h ; not referenced in this file
                    DQ 0000000000000000h

__real_1_over_512   DQ 3f60000000000000h ; 2^-9 (not referenced in this file)
                    DQ 0000000000000000h

; coefficients 1/2 .. 1/6 of the table-path polynomial in r
__real_1_over_2     DQ 3fe0000000000000h
                    DQ 0000000000000000h
__real_1_over_3     DQ 3fd5555555555555h
                    DQ 0000000000000000h
__real_1_over_4     DQ 3fd0000000000000h
                    DQ 0000000000000000h
__real_1_over_5     DQ 3fc999999999999ah
                    DQ 0000000000000000h
__real_1_over_6     DQ 3fc5555555555555h
                    DQ 0000000000000000h

__mask_1023_f       DQ 0c08ff80000000000h ; -1023.0 as a double; xexp equals this when the exponent field of x is zero
                    DQ 0000000000000000h

__mask_2045         DQ 00000000000007fdh ; integer 2045 (= 1023 + 1022), exponent rebias used in the denormal path
                    DQ 0000000000000000h

__real_threshold    DQ 3fb0000000000000h ; .0625
                    DQ 0000000000000000h

__real_notsign      DQ 7ffFFFFFFFFFFFFFh ; ^sign bit
                    DQ 0000000000000000h

; coefficients of the near-one series: they multiply u^3, u^5, u^7 and u^9
__real_ca1          DQ 3fb55555555554e6h ; 8.33333333333317923934e-02
                    DQ 0000000000000000h
__real_ca2          DQ 3f89999999bac6d4h ; 1.25000000037717509602e-02
                    DQ 0000000000000000h
__real_ca3          DQ 3f62492307f1519fh ; 2.23213998791944806202e-03
                    DQ 0000000000000000h
__real_ca4          DQ 3f3c8034c85dfff0h ; 4.34887777707614552256e-04
                    DQ 0000000000000000h
; log(2) split into a leading part and a small tail part (lead + tail ~= log(2))
__real_log2_lead    DQ 03fe62e42e0000000h ; 6.93147122859954833984e-01
                    DQ 00000000000000000h
__real_log2_tail    DQ 03e6efa39ef35793ch ; 5.76999904754328540596e-08
                    DQ 00000000000000000h

; these codes and the ones in the corresponding .c file have to match
; (each one is loaded into r8d right before the call to the special-case handler)
__flag_x_zero          DD 00000001
__flag_x_neg           DD 00000002
__flag_x_nan           DD 00000003


; Lookup tables defined outside this file. All three are indexed as
; [base + index*8], where index is the rounded top-9-bit mantissa value.
EXTRN __log_256_lead:QWORD
EXTRN __log_256_tail:QWORD
EXTRN __log_F_inv_qword:QWORD
; Run-time switch: a non-zero value selects the FMA3 code path.
EXTRN __use_fma3_lib:DWORD


; Text macros so the body below can be written against generic names.
fname           TEXTEQU <log>
fname_special   TEXTEQU <_log_special>

; define local variable storage offsets

save_xmm6       EQU     20h     ; frame offset handed to SaveXmm / RestoreXmm / AVXRestoreXmm
dummy_space     EQU     40h     ; not referenced in this file

stack_size      EQU     58h     ; frame size handed to StackAllocate / StackDeallocate

; NOTE(review): fm.inc is presumably where StackAllocate, StackDeallocate,
; SaveXmm, RestoreXmm and AVXRestoreXmm are defined - confirm.
include fm.inc

; external function
; Called for x == 0, x < 0 (including -inf) and NaN inputs.
EXTERN fname_special:PROC

.code
ALIGN 16
PUBLIC fname
fname PROC FRAME
    ;-----------------------------------------------------------------------
    ; double log(double x)
    ;
    ; In:    xmm0 = x
    ; Out:   xmm0 = log(x)
    ; Uses:  rax, rcx, rdx, r9, r8d, xmm1-xmm5 as scratch;
    ;        xmm6 is saved in the prologue and restored on every exit.
    ;
    ; Two equivalent implementations are provided and selected at run time
    ; through __use_fma3_lib: a plain SSE2 path (Llog_sse2) and an FMA3 path
    ; (Llog_fma3).  Each one has three sub-paths:
    ;   * table path      : log(x) = m*log(2) + log(G) - poly(r)
    ;   * near-one path   : |x - 1| < 0.0625, series in u = 2r/(2+r)
    ;   * special inputs  : 0, negative, inf, NaN -> fname_special
    ;
    ; fname_special is called with xmm0 still holding x, xmm1 holding the
    ; value loaded just before the call, and r8d holding one of the
    ; __flag_x_* codes; whatever it leaves in xmm0 is returned.
    ;-----------------------------------------------------------------------
    StackAllocate stack_size
    SaveXmm      xmm6, save_xmm6
    .ENDPROLOG

    cmp          DWORD PTR __use_fma3_lib, 0
    jne          Llog_fma3

Llog_sse2:

    ; compute exponent part
    movdqa      xmm3, xmm0
    movapd      xmm4, xmm0
    psrlq       xmm3, 52                         ; xmm3 <-- sign and exponent field of x
    movd        rax, xmm0                        ; rax  <-- raw bits of x
    psubq       xmm3, XMMWORD PTR __mask_1023    ; xmm3 <-- exponent field - 1023

    ;  NaN or inf
    mov         rcx, rax
    btr         rcx, 63                          ; rcx <-- bits of |x|
    cmp         rcx, QWORD PTR __real_inf
    jae         __x_is_inf_or_nan                ; |x| bits >= +inf bits: inf or NaN

    movdqa      xmm2, xmm0
    cvtdq2pd    xmm6, xmm3 ; xexp


    pand        xmm2, XMMWORD PTR __real_mant    ; xmm2 <-- mantissa bits of x
    subsd       xmm4, QWORD PTR __real_one       ; xmm4 <-- x - 1.0

    comisd      xmm6, QWORD PTR __mask_1023_f    ; xexp == -1023.0: exponent field is zero
    je          __denormal_adjust

__continue_common:    

    andpd       xmm4, XMMWORD PTR __real_notsign ; xmm4 <-- |x - 1.0|
    ; compute index into the log tables
    ; index = first 8 mantissa bits, rounded up by the 9th mantissa bit
    mov         r9, rax
    and         rax, QWORD PTR __mask_mant_all8
    and         r9, QWORD PTR __mask_mant9
    shl         r9, 1
    add         rax, r9
    movd        xmm1, rax                        ; xmm1 <-- rounded mantissa bits (becomes F below)

    ; near one codepath
    comisd      xmm4, QWORD PTR __real_threshold
    jb          __near_one

    ; F, Y
    shr         rax, 44                          ; rax  <-- table index
    por         xmm2, XMMWORD PTR __real_half    ; xmm2 <-- Y = mantissa of x with the exponent of 0.5
    por         xmm1, XMMWORD PTR __real_half    ; xmm1 <-- F = rounded mantissa with the exponent of 0.5
    lea         r9, QWORD PTR __log_F_inv_qword

    ; check for negative numbers or zero
    ; (the flags set here are re-tested by the jne at __x_is_zero_or_neg)
    xorpd       xmm5, xmm5
    comisd      xmm0, xmm5
    jbe         __x_is_zero_or_neg

    ; f = F - Y, r = f * inv
    subsd       xmm1, xmm2                       ; xmm1 <-- f = F - Y
    mulsd       xmm1, QWORD PTR [r9+rax*8]       ; xmm1 <-- r = f * inv

    movapd      xmm2, xmm1                       ; xmm2 <-- copy of r
    movapd      xmm0, xmm1                       ; xmm0 <-- copy of r
    lea         r9, QWORD PTR __log_256_lead

    ; poly = r + r^2/2 + r^3/3 + r^4/4 + r^5/5 + r^6/6
    movsd       xmm3, QWORD PTR __real_1_over_6
    movsd       xmm1, QWORD PTR __real_1_over_3
    mulsd       xmm3, xmm2                      ; xmm3 <-- r/6
    mulsd       xmm1, xmm2                      ; xmm1 <-- r/3
    mulsd       xmm0, xmm2                      ; xmm0 <-- r*r
    movapd      xmm4, xmm0                      ; xmm4 <-- copy of r*r
    addsd       xmm3, QWORD PTR __real_1_over_5 ; xmm3 <-- r/6 + 1/5
    addsd       xmm1, QWORD PTR __real_1_over_2 ; xmm1 <-- r/3 + 1/2
    mulsd       xmm4, xmm0                      ; xmm4 <-- r^4
    mulsd       xmm3, xmm2                      ; xmm3 <-- (r/6 + 1/5)*r
    mulsd       xmm1, xmm0                      ; xmm1 <-- (r/3 + 1/2)*r^2
    addsd       xmm3, QWORD PTR __real_1_over_4 ; xmm3 <-- (r/6 + 1/5)*r + 1/4
    addsd       xmm1, xmm2                      ; xmm1 <-- (r/3 + 1/2)*r^2 + r
    mulsd       xmm3, xmm4                      ; xmm3 <-- ((r/6+1/5)*r+1/4)*r^4
    addsd       xmm1, xmm3                      ; xmm1 <-- poly

    ; m*log(2)_tail + log(G)_tail - poly
    movsd       xmm5, QWORD PTR __real_log2_tail
    mulsd       xmm5, xmm6                      ; xmm5 <-- m*log2_tail
    subsd       xmm5, xmm1                      ; xmm5 <-- m*log2_tail - poly

    movsd       xmm0, QWORD PTR [r9+rax*8]      ; xmm0 <-- log(G)_lead
    lea         rdx, QWORD PTR __log_256_tail
    movsd       xmm2, QWORD PTR [rdx+rax*8]     ; xmm2 <-- log(G)_tail
    addsd       xmm2, xmm5                      ; xmm2 <-- (m*log2_tail - poly) + log(G)_tail

    movsd       xmm4, QWORD PTR __real_log2_lead
    mulsd       xmm4, xmm6                      ; xmm4 <-- m*log2_lead
    addsd       xmm0, xmm4                      ; xmm0 <-- m*log2_lead + log(G)_lead

    ; the small (tail) sum is added to the large (lead) sum last
    addsd       xmm0, xmm2                      ; xmm0 <-- log(x) = lead sum + tail sum

    RestoreXmm  xmm6, save_xmm6
    StackDeallocate stack_size
    ret

ALIGN 16
__near_one:
    ; |x - 1| < 0.0625.  With r = x - 1 and u = 2r/(2+r):
    ;   log(x) = r - r*(u/2) + (ca1 + ca2*u^2)*u^3 + (ca3 + ca4*u^2)*u^7
    ; xmm6 is used as a scratch register here (it is restored on exit).

    ; r = x - 1.0
    movsd       xmm2, QWORD PTR __real_two
    subsd       xmm0, QWORD PTR __real_one ; r

    addsd       xmm2, xmm0                 ; 2 + r
    movsd       xmm1, xmm0
    divsd       xmm1, xmm2 ; r/(2+r) = u/2

    movsd       xmm4, QWORD PTR __real_ca2
    movsd       xmm5, QWORD PTR __real_ca4

    movsd       xmm6, xmm0
    mulsd       xmm6, xmm1 ; correction

    addsd       xmm1, xmm1 ; u
    movsd       xmm2, xmm1

    mulsd       xmm2, xmm1 ; u^2

    mulsd       xmm4, xmm2                 ; ca2*u^2
    mulsd       xmm5, xmm2                 ; ca4*u^2

    addsd       xmm4, QWORD PTR __real_ca1 ; ca1 + ca2*u^2
    addsd       xmm5, QWORD PTR __real_ca3 ; ca3 + ca4*u^2

    mulsd       xmm2, xmm1 ; u^3
    mulsd       xmm4, xmm2                 ; (ca1 + ca2*u^2)*u^3

    mulsd       xmm2, xmm2                 ; u^6
    mulsd       xmm2, xmm1 ; u^7
    mulsd       xmm5, xmm2                 ; (ca3 + ca4*u^2)*u^7

    addsd       xmm4, xmm5
    subsd       xmm4, xmm6                 ; series - correction
    addsd       xmm0, xmm4                 ; r + (series - correction)

    RestoreXmm  xmm6, save_xmm6
    StackDeallocate stack_size
    ret

ALIGN 16
__denormal_adjust:
    ; Exponent field of x is zero (denormal or zero).  Renormalise:
    ; (1.0 | mantissa) - 1.0 is an exactly representable normal number, so
    ; its mantissa and exponent field can be used in place of those of x.
    por         xmm2, XMMWORD PTR __real_one
    subsd       xmm2, QWORD PTR __real_one
    movsd       xmm5, xmm2
    pand        xmm2, XMMWORD PTR __real_mant   ; xmm2 <-- normalised mantissa bits
    movd        rax, xmm2                       ; rax  <-- same bits, for the index computation
    psrlq       xmm5, 52                        ; xmm5 <-- exponent field of the normalised value
    psubd       xmm5, XMMWORD PTR __mask_2045   ; rebias by 1023 + 1022
    cvtdq2pd    xmm6, xmm5                      ; xmm6 <-- adjusted xexp
    jmp         __continue_common

ALIGN 16
__x_is_zero_or_neg:
    ; flags still come from "comisd xmm0, 0.0": not-equal here means x < 0
    jne         __x_is_neg

    ; x == 0: hand -inf and the "zero" code to the special-case handler
    movsd       xmm1, QWORD PTR __real_ninf
    mov         r8d, DWORD PTR __flag_x_zero
    call        fname_special
    jmp         __finish

ALIGN 16
__x_is_neg:

    ; x < 0 or x == -inf: hand a negative qNaN and the "neg" code to the handler
    movsd       xmm1, QWORD PTR __real_neg_qnan
    mov         r8d, DWORD PTR __flag_x_neg
    call        fname_special
    jmp         __finish

ALIGN 16
__x_is_inf_or_nan:

    ; +inf is returned unchanged (xmm0 still holds x)
    cmp         rax, QWORD PTR __real_inf
    je          __finish

    cmp         rax, QWORD PTR __real_ninf
    je          __x_is_neg

    ; NaN: set the quiet bit and pass the result to the handler
    or          rax, QWORD PTR __real_qnanbit
    movd        xmm1, rax
    mov         r8d, DWORD PTR __flag_x_nan
    call        fname_special
    jmp         __finish

ALIGN 16
__finish:
    RestoreXmm  xmm6, save_xmm6
    StackDeallocate stack_size
    ret

ALIGN 16
Llog_fma3:
    ; compute exponent part
    vpsrlq       xmm3,xmm0,52                    ; sign and exponent field of x
    vmovq        rax,xmm0                        ; rax <-- raw bits of x
    vpsubq       xmm3,xmm3,XMMWORD PTR __mask_1023
    vcvtdq2pd    xmm6,xmm3 ; xexp

    ;  NaN or inf
    ; (exponent field all ones: the masked value then equals +inf)
    vpand        xmm5,xmm0,XMMWORD PTR __real_inf
    vcomisd      xmm5,QWORD PTR __real_inf
    je           Llog_fma3_x_is_inf_or_nan

    ; check for negative numbers or zero
    ; (the flags set here are re-tested by the jne at Llog_fma3_x_is_zero_or_neg)
    vpxor        xmm5,xmm5,xmm5
    vcomisd      xmm0,xmm5
    jbe          Llog_fma3_x_is_zero_or_neg

    vpand        xmm2,xmm0,XMMWORD PTR __real_mant   ; xmm2 <-- mantissa bits of x
    vsubsd       xmm4,xmm0,QWORD PTR __real_one      ; xmm4 <-- x - 1.0

    vcomisd      xmm6,QWORD PTR __mask_1023_f        ; xexp == -1023.0: exponent field is zero
    je           Llog_fma3_denormal_adjust

Llog_fma3_continue_common:
    ; compute index into the log tables
    ; index = first 8 mantissa bits, rounded up by the 9th mantissa bit
    vpand        xmm1,xmm0,XMMWORD PTR __mask_mant_all8
    vpand        xmm3,xmm0,XMMWORD PTR __mask_mant9
    vpsllq       xmm3,xmm3,1
    vpaddq       xmm1,xmm3,xmm1
    vmovq        rax,xmm1

    ; near one codepath
    vpand        xmm4,xmm4,XMMWORD PTR __real_notsign    ; |x - 1.0|
    vcomisd      xmm4,QWORD PTR __real_threshold
    jb           Llog_fma3_near_one

    ; F,Y
    shr          rax,44                                  ; rax  <-- table index
    vpor         xmm2,xmm2,XMMWORD PTR __real_half       ; xmm2 <-- Y
    vpor         xmm1,xmm1,XMMWORD PTR __real_half       ; xmm1 <-- F
    lea          r9,QWORD PTR __log_F_inv_qword

    ; f = F - Y,r = f * inv
    vsubsd       xmm1,xmm1,xmm2
    vmulsd       xmm1,xmm1,QWORD PTR[r9 + rax * 8]

    lea          r9,QWORD PTR __log_256_lead

    ; poly = r + r^2/2 + r^3/3 + r^4/4 + r^5/5 + r^6/6
    vmulsd       xmm0,xmm1,xmm1           ; r*r
    vmovsd       xmm3,QWORD PTR __real_1_over_6
    vmovsd       xmm5,QWORD PTR __real_1_over_3
    vfmadd213sd  xmm3,xmm1,QWORD PTR __real_1_over_5 ; r*1/6 + 1/5
    vfmadd213sd  xmm5,xmm1,QWORD PTR __real_1_over_2 ; 1/2+r*1/3
    vfmadd213sd  xmm3,xmm1,QWORD PTR __real_1_over_4 ; 1/4+(1/5*r+r*r*1/6)

    vmulsd       xmm4,xmm0,xmm0           ; r*r*r*r
    vfmadd231sd  xmm1,xmm5,xmm0           ; r*r*(1/2+r*1/3) + r
    vfmadd231sd  xmm1,xmm3,xmm4           ; xmm1 <-- poly

    ; m*log(2) + log(G) - poly
    vmovsd       xmm5,QWORD PTR __real_log2_tail
    vfmsub213sd  xmm5,xmm6,xmm1           ; m*log2_tail - poly

    vmovsd       xmm0,QWORD PTR[r9 + rax * 8]    ; log(G)_lead
    lea          rdx,QWORD PTR __log_256_tail
    vmovsd       xmm1,QWORD PTR[rdx + rax * 8]   ; log(G)_tail
    vaddsd       xmm1,xmm1,xmm5           ; tail sum

    vfmadd231sd  xmm0,xmm6,QWORD PTR __real_log2_lead ; m*log2_lead + log(G)_lead

    vaddsd       xmm0,xmm0,xmm1           ; log(x) = lead sum + tail sum
    AVXRestoreXmm   xmm6, save_xmm6
    StackDeallocate stack_size
    ret


ALIGN  16
Llog_fma3_near_one:
    ; |x - 1| < 0.0625.  With r = x - 1 and u = 2r/(2+r):
    ;   log(x) = r - r*(u/2) + (ca1 + ca2*u^2)*u^3 + (ca3 + ca4*u^2)*u^7

    ; r = x - 1.0
    vmovsd       xmm3,QWORD PTR __real_two
    vsubsd       xmm0,xmm0,QWORD PTR __real_one ; r

    vaddsd       xmm3,xmm3,xmm0           ; 2 + r
    vdivsd       xmm1,xmm0,xmm3           ; r/(2+r) = u/2

    vmovsd       xmm4,QWORD PTR __real_ca2
    vmovsd       xmm5,QWORD PTR __real_ca4

    vmulsd       xmm3,xmm0,xmm1           ; correction
    vaddsd       xmm1,xmm1,xmm1           ; u

    vmulsd       xmm2,xmm1,xmm1           ; u^2
    vfmadd213sd  xmm4,xmm2,QWORD PTR __real_ca1 ; ca1 + ca2*u^2
    vfmadd213sd  xmm5,xmm2,QWORD PTR __real_ca3 ; ca3 + ca4*u^2

    vmulsd       xmm2,xmm2,xmm1           ; u^3
    vmulsd       xmm4,xmm4,xmm2           ; (ca1 + ca2*u^2)*u^3

    vmulsd       xmm2,xmm2,xmm2           ; u^6
    vmulsd       xmm2,xmm2,xmm1           ; u^7

    vfmadd231sd  xmm4,xmm5,xmm2           ; + (ca3 + ca4*u^2)*u^7
    vsubsd       xmm4,xmm4,xmm3           ; series - correction
    vaddsd       xmm0,xmm0,xmm4           ; r + (series - correction)

    AVXRestoreXmm   xmm6, save_xmm6
    StackDeallocate stack_size
    ret


Llog_fma3_denormal_adjust:
    ; Exponent field of x is zero and x > 0 (denormal).  Renormalise:
    ; (1.0 | mantissa) - 1.0 is an exactly representable normal number, so
    ; its mantissa and exponent field can be used in place of those of x.
    vpor         xmm2,xmm2,XMMWORD PTR __real_one
    vsubsd       xmm2,xmm2,QWORD PTR __real_one
    vpsrlq       xmm5,xmm2,52                        ; exponent field of the normalised value
    vpand        xmm2,xmm2,XMMWORD PTR __real_mant   ; normalised mantissa bits
    vmovapd      xmm0,xmm2                           ; the index computation reads xmm0
    vpsubd       xmm5,xmm5,XMMWORD PTR __mask_2045   ; rebias by 1023 + 1022
    vcvtdq2pd    xmm6,xmm5                           ; xmm6 <-- adjusted xexp
    jmp          Llog_fma3_continue_common

ALIGN  16
Llog_fma3_x_is_zero_or_neg:
    ; flags still come from "vcomisd xmm0, 0.0": not-equal here means x < 0
    jne          Llog_fma3_x_is_neg
    ; x == 0: hand -inf and the "zero" code to the special-case handler
    vmovsd       xmm1,QWORD PTR __real_ninf
    mov          r8d,DWORD PTR __flag_x_zero
    call         fname_special

    AVXRestoreXmm   xmm6, save_xmm6
    StackDeallocate stack_size
    ret

ALIGN  16
Llog_fma3_x_is_neg:

    ; x < 0 or x == -inf: hand a negative qNaN and the "neg" code to the handler
    vmovsd       xmm1,QWORD PTR __real_neg_qnan
    mov          r8d,DWORD PTR __flag_x_neg
    call         fname_special

    AVXRestoreXmm   xmm6, save_xmm6
    StackDeallocate stack_size
    ret

ALIGN  16
Llog_fma3_x_is_inf_or_nan:

    ; +inf is returned unchanged (xmm0 still holds x)
    cmp          rax,QWORD PTR __real_inf
    je           Llog_fma3_finish

    cmp          rax,QWORD PTR __real_ninf
    je           Llog_fma3_x_is_neg

    ; NaN: set the quiet bit and pass the result to the handler
    or           rax,QWORD PTR __real_qnanbit
    vmovq        xmm1,rax
    mov          r8d,DWORD PTR __flag_x_nan
    call         fname_special
    ; execution continues into Llog_fma3_finish below

ALIGN  16
Llog_fma3_finish:
    AVXRestoreXmm   xmm6, save_xmm6
    StackDeallocate stack_size
    ret
fname       endp

END
[LIBM] Import win-libm from AMD Source: https://github.com/amd/win-libm 2022-06-12 12:02:01 +02:00			`;`
			`; MIT License`
			`; -----------`
			`;`
			`; Copyright (c) 2002-2019 Advanced Micro Devices, Inc.`
			`;`
			`; Permission is hereby granted, free of charge, to any person obtaining a copy`
			`; of this Software and associated documentaon files (the "Software"), to deal`
			`; in the Software without restriction, including without limitation the rights`
			`; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell`
			`; copies of the Software, and to permit persons to whom the Software is`
			`; furnished to do so, subject to the following conditions:`
			`;`
			`; The above copyright notice and this permission notice shall be included in`
			`; all copies or substantial portions of the Software.`
			`;`
			`; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR`
			`; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,`
			`; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE`
			`; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER`
			`; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,`
			`; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN`
			`; THE SOFTWARE.`
			`;`
			`; log.asm`
			`;`
			`; An implementation of the log libm function.`
			`;`
			`; Prototype:`
			`;`
			`; double log(double x);`
			`;`

			`;`
			`; Algorithm:`
			`;`
			`; Based on:`
			`; Ping-Tak Peter Tang`
			`; "Table-driven implementation of the logarithm function in IEEE`
			`; floating-point arithmetic"`
			`; ACM Transactions on Mathematical Software (TOMS)`
			`; Volume 16, Issue 4 (December 1990)`
			`;`
			`;`
			`; x very close to 1.0 is handled differently, for x everywhere else`
			`; a brief explanation is given below`
			`;`
			`; x = (2^m)*A`
			`; x = (2^m)*(G+g) with (1 <= G < 2) and (g <= 2^(-9))`
			`; x = (2^m)2(G/2+g/2)`
			`; x = (2^m)2(F+f) with (0.5 <= F < 1) and (f <= 2^(-10))`
			`;`
			`; Y = (2^(-1))(2^(-m))(2^m)*A`
			`; Now, range of Y is: 0.5 <= Y < 1`
			`;`
			`; F = 0x100 + (first 8 mantissa bits) + (9th mantissa bit)`
			`; Now, range of F is: 256 <= F <= 512`
			`; F = F / 512`
			`; Now, range of F is: 0.5 <= F <= 1`
			`;`
			`; f = -(Y-F), with (f <= 2^(-10))`
			`;`
			`; log(x) = m*log(2) + log(2) + log(F-f)`
			`; log(x) = m*log(2) + log(2) + log(F) + log(1-(f/F))`
			`; log(x) = mlog(2) + log(2F) + log(1-r)`
			`;`
			`; r = (f/F), with (r <= 2^(-9))`
			`; r = f*(1/F) with (1/F) precomputed to avoid division`
			`;`
			`; log(x) = m*log(2) + log(G) - poly`
			`;`
			`; log(G) is precomputed`
			`; poly = (r + (r^2)/2 + (r^3)/3 + (r^4)/4) + (r^5)/5) + (r^6)/6))`
			`;`
			`; log(2) and log(G) need to be maintained in extra precision`
			`; to avoid losing precision in the calculations`
			`;`

			`.const`
			`ALIGN 16`

			`__real_ninf DQ 0fff0000000000000h ; -inf`
			`DQ 0000000000000000h`
			`__real_inf DQ 7ff0000000000000h ; +inf`
			`DQ 0000000000000000h`
			`__real_neg_qnan DQ 0fff8000000000000h ; neg qNaN`
			`DQ 0000000000000000h`
			`__real_qnanbit DQ 0008000000000000h`
			`DQ 0000000000000000h`
			`__real_min_norm DQ 0010000000000000h`
			`DQ 0000000000000000h`
			`__real_mant DQ 000FFFFFFFFFFFFFh ; mantissa bits`
			`DQ 0000000000000000h`
			`__mask_1023 DQ 00000000000003ffh`
			`DQ 0000000000000000h`
			`__mask_001 DQ 0000000000000001h`
			`DQ 0000000000000000h`

			`__mask_mant_all8 DQ 000ff00000000000h`
			`DQ 0000000000000000h`
			`__mask_mant9 DQ 0000080000000000h`
			`DQ 0000000000000000h`

			`__real_two DQ 4000000000000000h ; 2`
			`DQ 0000000000000000h`

			`__real_one DQ 3ff0000000000000h ; 1`
			`DQ 0000000000000000h`

			`__real_near_one_lt DQ 3fee000000000000h ; .9375`
			`DQ 0000000000000000h`

			`__real_near_one_gt DQ 3ff1000000000000h ; 1.0625`
			`DQ 0000000000000000h`

			`__real_half DQ 3fe0000000000000h ; 1/2`
			`DQ 0000000000000000h`

			`__mask_100 DQ 0000000000000100h`
			`DQ 0000000000000000h`

			`__real_1_over_512 DQ 3f60000000000000h`
			`DQ 0000000000000000h`

			`__real_1_over_2 DQ 3fe0000000000000h`
			`DQ 0000000000000000h`
			`__real_1_over_3 DQ 3fd5555555555555h`
			`DQ 0000000000000000h`
			`__real_1_over_4 DQ 3fd0000000000000h`
			`DQ 0000000000000000h`
			`__real_1_over_5 DQ 3fc999999999999ah`
			`DQ 0000000000000000h`
			`__real_1_over_6 DQ 3fc5555555555555h`
			`DQ 0000000000000000h`

			`__mask_1023_f DQ 0c08ff80000000000h`
			`DQ 0000000000000000h`

			`__mask_2045 DQ 00000000000007fdh`
			`DQ 0000000000000000h`

			`__real_threshold DQ 3fb0000000000000h ; .0625`
			`DQ 0000000000000000h`

			`__real_notsign DQ 7ffFFFFFFFFFFFFFh ; ^sign bit`
			`DQ 0000000000000000h`

			`__real_ca1 DQ 3fb55555555554e6h ; 8.33333333333317923934e-02`
			`DQ 0000000000000000h`
			`__real_ca2 DQ 3f89999999bac6d4h ; 1.25000000037717509602e-02`
			`DQ 0000000000000000h`
			`__real_ca3 DQ 3f62492307f1519fh ; 2.23213998791944806202e-03`
			`DQ 0000000000000000h`
			`__real_ca4 DQ 3f3c8034c85dfff0h ; 4.34887777707614552256e-04`
			`DQ 0000000000000000h`
			`__real_log2_lead DQ 03fe62e42e0000000h ; 6.93147122859954833984e-01`
			`DQ 00000000000000000h`
			`__real_log2_tail DQ 03e6efa39ef35793ch ; 5.76999904754328540596e-08`
			`DQ 00000000000000000h`

			`; these codes and the ones in the corresponding .c file have to match`
			`__flag_x_zero DD 00000001`
			`__flag_x_neg DD 00000002`
			`__flag_x_nan DD 00000003`


			`EXTRN __log_256_lead:QWORD`
			`EXTRN __log_256_tail:QWORD`
			`EXTRN __log_F_inv_qword:QWORD`
			`EXTRN __use_fma3_lib:DWORD`


			`fname TEXTEQU <log>`
			`fname_special TEXTEQU <_log_special>`

			`; define local variable storage offsets`

			`save_xmm6 EQU 20h`
			`dummy_space EQU 40h`

			`stack_size EQU 58h`

			`include fm.inc`

			`; external function`
			`EXTERN fname_special:PROC`

			`.code`
			`ALIGN 16`
			`PUBLIC fname`
			`fname PROC FRAME`
			`StackAllocate stack_size`
			`SaveXmm xmm6, save_xmm6`
			`.ENDPROLOG`

			`cmp DWORD PTR __use_fma3_lib, 0`
			`jne Llog_fma3`

			`Llog_sse2:`

			`; compute exponent part`
			`movdqa xmm3, xmm0`
			`movapd xmm4, xmm0`
			`psrlq xmm3, 52`
			`movd rax, xmm0`
			`psubq xmm3, XMMWORD PTR __mask_1023`

			`; NaN or inf`
			`mov rcx, rax`
			`btr rcx, 63`
			`cmp rcx, QWORD PTR __real_inf`
			`jae __x_is_inf_or_nan`

			`movdqa xmm2, xmm0`
			`cvtdq2pd xmm6, xmm3 ; xexp`


			`pand xmm2, XMMWORD PTR __real_mant`
			`subsd xmm4, QWORD PTR __real_one`

			`comisd xmm6, QWORD PTR __mask_1023_f`
			`je __denormal_adjust`

			`__continue_common:`

			`andpd xmm4, XMMWORD PTR __real_notsign`
			`; compute index into the log tables`
			`mov r9, rax`
			`and rax, QWORD PTR __mask_mant_all8`
			`and r9, QWORD PTR __mask_mant9`
			`shl r9, 1`
			`add rax, r9`
			`movd xmm1, rax`

			`; near one codepath`
			`comisd xmm4, QWORD PTR __real_threshold`
			`jb __near_one`

			`; F, Y`
			`shr rax, 44`
			`por xmm2, XMMWORD PTR __real_half`
			`por xmm1, XMMWORD PTR __real_half`
			`lea r9, __log_F_inv_qword`

			`; check for negative numbers or zero`
			`xorpd xmm5, xmm5`
			`comisd xmm0, xmm5`
			`jbe __x_is_zero_or_neg`

			`; f = F - Y, r = f * inv`
			`subsd xmm1, xmm2 ; xmm1 <-- f = F - Y`
			`mulsd xmm1, QWORD PTR [r9+rax8] ; xmm1 <-- r = f inv`

			`movapd xmm2, xmm1 ; xmm2 <-- copy of r`
			`movapd xmm0, xmm1 ; xmm0 <-- copy of r`
			`lea r9, QWORD PTR __log_256_lead`

			`; poly`
			`movsd xmm3, QWORD PTR __real_1_over_6`
			`movsd xmm1, QWORD PTR __real_1_over_3`
			`mulsd xmm3, xmm2 ; xmm3 <-- r/6`
			`mulsd xmm1, xmm2 ; xmm1 <-- r/3`
			`mulsd xmm0, xmm2 ; xmm0 <-- r*r`
			`movapd xmm4, xmm0 ; xmm4 <-- copy of r*r`
			`addsd xmm3, QWORD PTR __real_1_over_5 ; xmm3 <-- r/6 + 1/5`
			`addsd xmm1, QWORD PTR __real_1_over_2 ; xmm1 <-- r/3 + 1/2`
			`mulsd xmm4, xmm0 ; xmm4 <-- r^4`
			`mulsd xmm3, xmm2 ; xmm3 <-- (r/6 + 1/5)*r`
			`mulsd xmm1, xmm0 ; xmm1 <-- (r/3 + 1/2)*r^2`
			`addsd xmm3, QWORD PTR __real_1_over_4 ; xmm3 <-- (r/6 + 1/5)*r + 1/4`
			`addsd xmm1, xmm2 ; xmm1 <-- (r/3 + 1/2)*r^2 + r`
			`mulsd xmm3, xmm4 ; xmm3 <-- ((r/6+1/5)r+1/4)r^4`
			`addsd xmm1, xmm3 ; xmm1 <-- poly`

			`; m*log(2)_tail + log(G)_tail - poly`
			`movsd xmm5, QWORD PTR __real_log2_tail`
			`mulsd xmm5, xmm6 ; xmm5 <-- m*log2_tail`
			`subsd xmm5, xmm1 ; xmm5 <-- m*log2_tail - poly`

			`movsd xmm0, QWORD PTR [r9+rax*8] ; xmm0 <-- log(G)_lead`
			`lea rdx, QWORD PTR __log_256_tail`
			`movsd xmm2, QWORD PTR [rdx+rax*8] ; xmm2 <-- log(G)_tail`
			`addsd xmm2, xmm5 ; xmm2 <-- (m*log2_tail - poly) + log(G)_tail`

			`movsd xmm4, QWORD PTR __real_log2_lead`
			`mulsd xmm4, xmm6 ; xmm4 <-- m*log2_lead`
			`addsd xmm0, xmm4 ; xmm0 <-- m*log2_lead + log(G)_lead`

			`addsd xmm0, xmm2 ; xmm0 <-- m*log(2)_tail + log(G)_tail - poly`

			`RestoreXmm xmm6, save_xmm6`
			`StackDeallocate stack_size`
			`ret`

			`ALIGN 16`
			`__near_one:`

			`; r = x - 1.0`
			`movsd xmm2, QWORD PTR __real_two`
			`subsd xmm0, QWORD PTR __real_one ; r`

			`addsd xmm2, xmm0`
			`movsd xmm1, xmm0`
			`divsd xmm1, xmm2 ; r/(2+r) = u/2`

			`movsd xmm4, QWORD PTR __real_ca2`
			`movsd xmm5, QWORD PTR __real_ca4`

			`movsd xmm6, xmm0`
			`mulsd xmm6, xmm1 ; correction`

			`addsd xmm1, xmm1 ; u`
			`movsd xmm2, xmm1`

			`mulsd xmm2, xmm1 ; u^2`

			`mulsd xmm4, xmm2`
			`mulsd xmm5, xmm2`

			`addsd xmm4, __real_ca1`
			`addsd xmm5, __real_ca3`

			`mulsd xmm2, xmm1 ; u^3`
			`mulsd xmm4, xmm2`

			`mulsd xmm2, xmm2`
			`mulsd xmm2, xmm1 ; u^7`
			`mulsd xmm5, xmm2`

			`addsd xmm4, xmm5`
			`subsd xmm4, xmm6`
			`addsd xmm0, xmm4`

			`RestoreXmm xmm6, save_xmm6`
			`StackDeallocate stack_size`
			`ret`

			`ALIGN 16`
			`__denormal_adjust:`
			`por xmm2, XMMWORD PTR __real_one`
			`subsd xmm2, QWORD PTR __real_one`
			`movsd xmm5, xmm2`
			`pand xmm2, XMMWORD PTR __real_mant`
			`movd rax, xmm2`
			`psrlq xmm5, 52`
			`psubd xmm5, XMMWORD PTR __mask_2045`
			`cvtdq2pd xmm6, xmm5`
			`jmp __continue_common`

			`ALIGN 16`
			`__x_is_zero_or_neg:`
			`jne __x_is_neg`

			`movsd xmm1, QWORD PTR __real_ninf`
			`mov r8d, DWORD PTR __flag_x_zero`
			`call fname_special`
			`jmp __finish`

			`ALIGN 16`
			`__x_is_neg:`

			`movsd xmm1, QWORD PTR __real_neg_qnan`
			`mov r8d, DWORD PTR __flag_x_neg`
			`call fname_special`
			`jmp __finish`

			`ALIGN 16`
			`__x_is_inf_or_nan:`

			`cmp rax, QWORD PTR __real_inf`
			`je __finish`

			`cmp rax, QWORD PTR __real_ninf`
			`je __x_is_neg`

			`or rax, QWORD PTR __real_qnanbit`
			`movd xmm1, rax`
			`mov r8d, DWORD PTR __flag_x_nan`
			`call fname_special`
			`jmp __finish`

			`ALIGN 16`
			`__finish:`
			`RestoreXmm xmm6, save_xmm6`
			`StackDeallocate stack_size`
			`ret`

			`ALIGN 16`
			`Llog_fma3:`
			`; compute exponent part`
			`xor rax,rax`
			`vpsrlq xmm3,xmm0,52`
			`vmovq rax,xmm0`
[LIBM] Fix up some asm files This allows to compile them with GAS after translation 2022-06-25 13:00:01 +03:00			`vpsubq xmm3,xmm3,XMMWORD PTR __mask_1023`
[LIBM] Import win-libm from AMD Source: https://github.com/amd/win-libm 2022-06-12 12:02:01 +02:00			`vcvtdq2pd xmm6,xmm3 ; xexp`

			`; NaN or inf`
[LIBM] Fix up some asm files This allows to compile them with GAS after translation 2022-06-25 13:00:01 +03:00			`vpand xmm5,xmm0,XMMWORD PTR __real_inf`
[LIBM] Import win-libm from AMD Source: https://github.com/amd/win-libm 2022-06-12 12:02:01 +02:00			`vcomisd xmm5,QWORD PTR __real_inf`
			`je Llog_fma3_x_is_inf_or_nan`

			`; check for negative numbers or zero`
			`vpxor xmm5,xmm5,xmm5`
			`vcomisd xmm0,xmm5`
			`jbe Llog_fma3_x_is_zero_or_neg`

[LIBM] Fix up some asm files This allows to compile them with GAS after translation 2022-06-25 13:00:01 +03:00			`vpand xmm2,xmm0,XMMWORD PTR __real_mant`
[LIBM] Import win-libm from AMD Source: https://github.com/amd/win-libm 2022-06-12 12:02:01 +02:00			`vsubsd xmm4,xmm0,QWORD PTR __real_one`

			`vcomisd xmm6,QWORD PTR __mask_1023_f`
			`je Llog_fma3_denormal_adjust`

			`Llog_fma3_continue_common:`
			`; compute index into the log tables`
[LIBM] Fix up some asm files This allows to compile them with GAS after translation 2022-06-25 13:00:01 +03:00			`vpand xmm1,xmm0,XMMWORD PTR __mask_mant_all8`
			`vpand xmm3,xmm0,XMMWORD PTR __mask_mant9`
[LIBM] Import win-libm from AMD Source: https://github.com/amd/win-libm 2022-06-12 12:02:01 +02:00			`vpsllq xmm3,xmm3,1`
			`vpaddq xmm1,xmm3,xmm1`
			`vmovq rax,xmm1`

			`; near one codepath`
[LIBM] Fix up some asm files This allows to compile them with GAS after translation 2022-06-25 13:00:01 +03:00			`vpand xmm4,xmm4,XMMWORD PTR __real_notsign`
[LIBM] Import win-libm from AMD Source: https://github.com/amd/win-libm 2022-06-12 12:02:01 +02:00			`vcomisd xmm4,QWORD PTR __real_threshold`
			`jb Llog_fma3_near_one`

			`; F,Y`
			`shr rax,44`
[LIBM] Fix up some asm files This allows to compile them with GAS after translation 2022-06-25 13:00:01 +03:00			`vpor xmm2,xmm2,XMMWORD PTR __real_half`
			`vpor xmm1,xmm1,XMMWORD PTR __real_half`
[LIBM] Import win-libm from AMD Source: https://github.com/amd/win-libm 2022-06-12 12:02:01 +02:00			`lea r9,QWORD PTR __log_F_inv_qword`

			`; f = F - Y,r = f * inv`
			`vsubsd xmm1,xmm1,xmm2`
			`vmulsd xmm1,xmm1,QWORD PTR[r9 + rax * 8]`

			`lea r9,QWORD PTR __log_256_lead`

			`; poly`
			`vmulsd xmm0,xmm1,xmm1 ; r*r`
			`vmovsd xmm3,QWORD PTR __real_1_over_6`
			`vmovsd xmm5,QWORD PTR __real_1_over_3`
			`vfmadd213sd xmm3,xmm1,QWORD PTR __real_1_over_5 ; r*1/6 + 1/5`
			`vfmadd213sd xmm5,xmm1,QWORD PTR __real_1_over_2 ; 1/2+r*1/3`
			`vmovsd xmm4,xmm0,xmm0`
			`vfmadd213sd xmm3,xmm1,QWORD PTR __real_1_over_4 ; 1/4+(1/5r+rr*1/6)`

			`vmulsd xmm4,xmm0,xmm0 ; rrr*r`
			`vfmadd231sd xmm1,xmm5,xmm0 ; rr(1/2+r*1/3) + r`
			`vfmadd231sd xmm1,xmm3,xmm4`

			`; m*log(2) + log(G) - poly`
			`vmovsd xmm5,QWORD PTR __real_log2_tail`
			`vfmsub213sd xmm5,xmm6,xmm1`

			`vmovsd xmm0,QWORD PTR[r9 + rax * 8]`
			`lea rdx,QWORD PTR __log_256_tail`
			`vmovsd xmm1,QWORD PTR[rdx + rax * 8]`
			`vaddsd xmm1,xmm1,xmm5`

			`vfmadd231sd xmm0,xmm6,QWORD PTR __real_log2_lead`

			`vaddsd xmm0,xmm0,xmm1`
			`AVXRestoreXmm xmm6, save_xmm6`
			`StackDeallocate stack_size`
			`ret`


			`ALIGN 16`
			`Llog_fma3_near_one:`

			`; r = x - 1.0`
			`vmovsd xmm3,QWORD PTR __real_two`
			`vsubsd xmm0,xmm0,QWORD PTR __real_one ; r`

			`vaddsd xmm3,xmm3,xmm0`
			`vdivsd xmm1,xmm0,xmm3 ; r/(2+r) = u/2`

			`vmovsd xmm4,QWORD PTR __real_ca2`
			`vmovsd xmm5,QWORD PTR __real_ca4`

			`vmulsd xmm3,xmm0,xmm1 ; correction`
			`vaddsd xmm1,xmm1,xmm1 ; u`

			`vmulsd xmm2,xmm1,xmm1 ; u^2`
			`vfmadd213sd xmm4,xmm2,QWORD PTR __real_ca1`
			`vfmadd213sd xmm5,xmm2,QWORD PTR __real_ca3`

			`vmulsd xmm2,xmm2,xmm1 ; u^3`
			`vmulsd xmm4,xmm4,xmm2`

			`vmulsd xmm2,xmm2,xmm2`
			`vmulsd xmm2,xmm2,xmm1 ; u^7`

			`vfmadd231sd xmm4,xmm5,xmm2`
			`vsubsd xmm4,xmm4,xmm3`
			`vaddsd xmm0,xmm0,xmm4`

			`AVXRestoreXmm xmm6, save_xmm6`
			`StackDeallocate stack_size`
			`ret`


			`Llog_fma3_denormal_adjust:`
[LIBM] Fix up some asm files This allows to compile them with GAS after translation 2022-06-25 13:00:01 +03:00			`vpor xmm2,xmm2,XMMWORD PTR __real_one`
[LIBM] Import win-libm from AMD Source: https://github.com/amd/win-libm 2022-06-12 12:02:01 +02:00			`vsubsd xmm2,xmm2,QWORD PTR __real_one`
			`vpsrlq xmm5,xmm2,52`
[LIBM] Fix up some asm files This allows to compile them with GAS after translation 2022-06-25 13:00:01 +03:00			`vpand xmm2,xmm2,XMMWORD PTR __real_mant`
[LIBM] Import win-libm from AMD Source: https://github.com/amd/win-libm 2022-06-12 12:02:01 +02:00			`vmovapd xmm0,xmm2`
			`vpsubd xmm5,xmm5,XMMWORD PTR __mask_2045`
			`vcvtdq2pd xmm6,xmm5`
			`jmp Llog_fma3_continue_common`

			`ALIGN 16`
			`Llog_fma3_x_is_zero_or_neg:`
			`jne Llog_fma3_x_is_neg`
			`vmovsd xmm1,QWORD PTR __real_ninf`
			`mov r8d,DWORD PTR __flag_x_zero`
			`call fname_special`

			`AVXRestoreXmm xmm6, save_xmm6`
			`StackDeallocate stack_size`
			`ret`

			`ALIGN 16`
			`Llog_fma3_x_is_neg:`

			`vmovsd xmm1,QWORD PTR __real_neg_qnan`
			`mov r8d,DWORD PTR __flag_x_neg`
			`call fname_special`

			`AVXRestoreXmm xmm6, save_xmm6`
			`StackDeallocate stack_size`
			`ret`

			`ALIGN 16`
			`Llog_fma3_x_is_inf_or_nan:`

			`cmp rax,QWORD PTR __real_inf`
			`je Llog_fma3_finish`

			`cmp rax,QWORD PTR __real_ninf`
			`je Llog_fma3_x_is_neg`

			`or rax,QWORD PTR __real_qnanbit`
			`vmovq xmm1,rax`
			`mov r8d,DWORD PTR __flag_x_nan`
			`call fname_special`

			`ALIGN 16`
			`Llog_fma3_finish:`
			`AVXRestoreXmm xmm6, save_xmm6`
			`StackDeallocate stack_size`
			`ret`
			`fname endp`

			`END`