reactos/sdk/lib/crt/math/libm_sse2/logf.asm

;
; MIT License
; -----------
; 
; Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
; 
; Permission is hereby granted, free of charge, to any person obtaining a copy
; of this Software and associated documentaon files (the "Software"), to deal
; in the Software without restriction, including without limitation the rights
; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
; copies of the Software, and to permit persons to whom the Software is
; furnished to do so, subject to the following conditions:
; 
; The above copyright notice and this permission notice shall be included in
; all copies or substantial portions of the Software.
; 
; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
; THE SOFTWARE.
;
;
; logf.asm
;
; An implementation of the logf libm function.
;
; Prototype:
;
;     float logf(float x);
;

;
;   Algorithm:
;       Similar to one presnted in log.asm
;
.const


ALIGN 16

L_real_one              DQ 0000000003f800000h   ; 1.0
                        DQ 0000000000000000h
L_real_two              DQ 00000000040000000h   ; 1.0
                        DQ 00000000000000000h
L_real_ninf             DQ 000000000ff800000h   ; -inf
                        DQ 0000000000000000h
L_real_inf              DQ 0000000007f800000h   ; +inf
                        DQ 0000000000000000h
L_real_nan              DQ 0000000007fc00000h   ; NaN
                        DQ 0000000000000000h
L_real_neg_qnan         DQ 000000000ffc00000h
                        DQ 0000000000000000h
L_real_notsign          DQ 0000000007ffFFFFFh   ; ^sign bit
                        DQ 0000000000000000h
L_real_mant             DQ 0007FFFFF007FFFFFh   ; mantissa bits
                        DQ 0007FFFFF007FFFFFh
L_mask_127              DQ 00000007f0000007fh   ; 
                        DQ 00000007f0000007fh
L_mask_253              DQ 000000000000000fdh
                        DQ 00000000000000000h
L_mask_mant_all7        DQ 00000000007f0000h
                        DQ 00000000007f0000h
L_mask_mant8            DQ 0000000000008000h
                        DQ 0000000000000000h
L_real_ca1              DQ 0000000003DAAAAABh   ; 8.33333333333317923934e-02
                        DQ 00000000000000000h
L_real_ca2              DQ 0000000003C4CCCCDh   ; 1.25000000037717509602e-02
                        DQ 00000000000000000h
L_real_log2_lead        DQ 03F3170003F317000h   ; 0.693115234375
                        DQ 00000000000000000h
L_real_log2_tail        DQ 0000000003805FDF4h   ; 0.000031946183
                        DQ 00000000000000000h
L_real_half             DQ 0000000003f000000h   ; 1/2
                        DQ 00000000000000000h
L_real_1_over_3         DQ 0000000003eaaaaabh
                        DQ 00000000000000000h

L_real_1_over_2         DD 03f000000h
L_real_neg127           DD 0c2fe0000h
L_real_qnanbit          DD 000400000h   ; quiet nan bit
L_real_threshold        DD 03d800000h

; these codes and the ones in the corresponding .c file have to match
L_flag_x_zero           DD 00000001
L_flag_x_neg            DD 00000002
L_flag_x_nan            DD 00000003

EXTRN __log_128_lead:DWORD
EXTRN __log_128_tail:DWORD
EXTRN __log_F_inv_dword:DWORD
EXTRN __use_fma3_lib:DWORD

fname           TEXTEQU <logf>
fname_special   TEXTEQU <_logf_special>

; define local variable storage offsets

dummy_space     EQU     020h
stack_size      EQU     038h

include fm.inc

; external function
EXTERN fname_special:PROC

.code

PUBLIC fname
fname PROC FRAME
    StackAllocate stack_size
    .ENDPROLOG
    cmp         DWORD PTR __use_fma3_lib, 0
    jne         Llogf_fma3

    ; Some of the placement of instructions below iwll be odd.
    ; We are attempting to have no more than one branch per 32-byte block.
Llogf_sse2:
    ; Zero the high bits of rax because it will be used as an index later.
    xor         rax, rax
    movdqa      xmm3, xmm0
    movaps      xmm4, xmm0

    ; This computation of the expoonent of x will produce nonsenes if x <= 0.,
    ; but those cases are eliminated below, so it does no harm.
    psrld       xmm3, 23                         ; xmm3 <-- biased exp if x > 0.

    ; Is x Inf or NaN?
    movd        eax, xmm0                        ; eax <-- x
    mov         ecx, eax
    btr         ecx, 31                          ; ecx <-- |x|
    cmp         ecx, DWORD PTR L_real_inf
    jae         Llogf_sse2_x_is_inf_or_nan

    ; Finish computing exponent.
    psubd       xmm3, XMMWORD PTR L_mask_127     ; xmm3 <-- xexp (unbiased)
    movdqa      xmm2, xmm0
    cvtdq2ps    xmm5, xmm3                       ; (float)xexp, unless x <= 0.

    ; Is x negative or zero?
    xorps       xmm1, xmm1
    comiss      xmm0, xmm1
    jbe         Llogf_sse2_x_is_zero_or_neg

    pand        xmm2, XMMWORD PTR L_real_mant    ; xmm2 <-- x mantissa for later
    subss       xmm4, DWORD PTR L_real_one       ; xmm4 <-- x - 1. for later

    comiss      xmm5, DWORD PTR L_real_neg127    ; x!=0, xexp==0 ==> subnormal
    je          Llogf_sse2_subnormal_adjust

Llogf_sse2_continue_common:
    ; At this point we need |x| (possibly adjusted) in eax
    ; and m = xexpx (possibly adjusted) in xmm5
    ; We also need the value of x - 1. computed above.

    ; compute the index into the log tables
    mov         r9d, eax
    and         eax, DWORD PTR L_mask_mant_all7  ; eax <-- 7 bits of x mantissa
    and         r9d, DWORD PTR L_mask_mant8      ; r9d <-- 8th bit
    shl         r9d, 1
    add         eax, r9d                         ; use 8th bit to round up
    movd        xmm1, eax

    ; Is x near 1.0 ?
    ; Note that if x is subnormal it is perforce not near one.
    andps       xmm4, XMMWORD PTR L_real_notsign ; xmm4 <-- |x-1|
    comiss      xmm4, DWORD PTR L_real_threshold ; is |x-1| < 1/16?
    jb          Llogf_sse2_near_one              ; if so, handle elsewhere

    ; F, Y
    ; F is a number in [.5,1) scaled from the rounded mantissa bits computed
    ; above by oring in the exponent of .5.
    ; Y is all of the mantissa bits of X scaled to [.5,1.) similarly
    shr         eax, 16                          ; shift eax to use as index
    por         xmm2, XMMWORD PTR L_real_half    ; xmm2 <-- Y
    por         xmm1, XMMWORD PTR L_real_half    ; xmm2 <-- F
    lea         r9, QWORD PTR __log_F_inv_dword


    ; f = F - Y, r = f * inv
    subss       xmm1, xmm2                       ; xmm1 <-- f
    mulss       xmm1, DWORD PTR [r9+rax*4]       ; xmm1 <-- r = f*inv (tabled)

    movaps      xmm2, xmm1
    movaps      xmm0, xmm1

    ; poly
    mulss       xmm2, DWORD PTR L_real_1_over_3  ; xmm2 <-- r/3
    mulss       xmm0, xmm1                       ; xmm0 <-- r^2
    addss       xmm2, DWORD PTR L_real_1_over_2
    movaps      xmm3, XMMWORD PTR L_real_log2_tail

    lea         r9, QWORD PTR __log_128_tail
    lea         r10, QWORD PTR __log_128_lead

    mulss       xmm2, xmm0                       ; xmm2 <-- r^2 * (r/3 + 1/2)
    mulss       xmm3, xmm5                       ; xmm3 <-- (m=xexp)*log2_tail
    addss       xmm1, xmm2                       ; xmm1 <-- poly

    ; m*log(2) + log(G) - poly, where G is just 2*F
    ; log(G) is precomputed to extra precision.
    ; small pieces and large pieces are separated until the final add,
    ; to preserve accuracy
    movaps      xmm0, XMMWORD PTR L_real_log2_lead
    subss       xmm3, xmm1                       ; xmm3 <-- m*log2_tail - poly
    mulss       xmm0, xmm5                       ; xmm0 <-- m*log1_lead
    addss       xmm3, DWORD PTR [r9+rax*4]       ; xmm3 += log(G) tail
    addss       xmm0, DWORD PTR [r10+rax*4]      ; xmm0 += log(G) lead

    addss       xmm0, xmm3                       ; xmm0 <-- m*log(2)+log(G)-poly

    StackDeallocate stack_size
    ret

ALIGN 16
Llogf_sse2_near_one:
    ; Computation of the log for x near one requires special techniques.
    movaps      xmm2, DWORD PTR L_real_two
    subss       xmm0, DWORD PTR L_real_one       ; xmm0 <-- r = x - 1.0
    addss       xmm2, xmm0
    movaps      xmm1, xmm0
    divss       xmm1, xmm2                       ; xmm1 <-- u = r/(2.0+r)
    movaps      xmm4, xmm0
    mulss       xmm4, xmm1                       ; xmm4 <-- correction = r*u
    addss       xmm1, xmm1                       ; xmm1 <-- u = 2.*u
    movaps      xmm2, xmm1
    mulss       xmm2, xmm2                       ; xmm2 <-- u^2

    ; r2 = (u^3 * (ca_1 + u^2 * ca_2) - correction)
    movaps      xmm3, xmm1
    mulss       xmm3, xmm2                       ; xmm3 <-- u^3
    mulss       xmm2, DWORD PTR L_real_ca2       ; xmm2 <-- ca2*u^2
    addss       xmm2, DWORD PTR L_real_ca1       ; xmm2 <-- ca2*u^2 + ca1
    mulss       xmm2, xmm3                       ; xmm2 <-- u^3*(ca1+u^2*ca2)
    subss       xmm2, xmm4                       ; xmm2 <-- r2

    ; return r + r2
    addss       xmm0, xmm2
    StackDeallocate stack_size
    ret

ALIGN 16
Llogf_sse2_subnormal_adjust:
    ; This code adjusts eax and xmm5.
    ; It must preserve xmm4.
    por         xmm2, XMMWORD PTR L_real_one
    subss       xmm2, DWORD PTR L_real_one
    movdqa      xmm5, xmm2
    pand        xmm2, XMMWORD PTR L_real_mant
    movd        eax, xmm2
    psrld       xmm5, 23
    psubd       xmm5, XMMWORD PTR L_mask_253
    cvtdq2ps    xmm5, xmm5
    jmp         Llogf_sse2_continue_common

; Until we get to the FMA3 code, the rest of this is special case handling.
Llogf_sse2_x_is_zero_or_neg:
    jne         Llogf_sse2_x_is_neg

    movaps      xmm1, XMMWORD PTR L_real_ninf
    mov         r8d, DWORD PTR L_flag_x_zero
    call        fname_special
    jmp         Llogf_sse2_finish

Llogf_sse2_x_is_neg:

    movaps      xmm1, XMMWORD PTR L_real_neg_qnan
    mov         r8d, DWORD PTR L_flag_x_neg
    call        fname_special
    jmp         Llogf_sse2_finish

Llogf_sse2_x_is_inf_or_nan:

    cmp         eax, DWORD PTR L_real_inf
    je          Llogf_sse2_finish

    cmp         eax, DWORD PTR L_real_ninf
    je          Llogf_sse2_x_is_neg

    or          eax, DWORD PTR L_real_qnanbit
    movd        xmm1, eax
    mov         r8d, DWORD PTR L_flag_x_nan
    call        fname_special
    jmp         Llogf_sse2_finish    

Llogf_sse2_finish:
    StackDeallocate stack_size
    ret

ALIGN 16
Llogf_fma3:
    ; compute exponent part
    vmovaps      xmm4,XMMWORD PTR L_real_inf ; preload for inf/nan test
    xor          rax,rax
    vpsrld       xmm3,xmm0,23             ; xmm3 <-- (ux>>23)
    vmovd        eax,xmm0  ;eax = x
    vpsubd       xmm3,xmm3,DWORD PTR L_mask_127 ; xmm3 <-- (ux>>23) - 127
    vcvtdq2ps    xmm5,xmm3                ; xmm5 <-- float((ux>>23)-127) = xexp

    ;  NaN or inf
    vpand        xmm1,xmm0,xmm4           ; xmm1 <-- (ux & 07f800000h)
    vcomiss      xmm1,xmm4
    je           Llogf_fma3_x_is_inf_or_nan

    ; check for negative numbers or zero
    vpxor        xmm1,xmm1,xmm1
    vcomiss      xmm0,xmm1
    jbe          Llogf_fma3_x_is_zero_or_neg

    vpand        xmm2,xmm0,DWORD PTR L_real_mant  ; xmm2 <-- ux & 0007FFFFFh
    vsubss       xmm4,xmm0,DWORD PTR L_real_one   ; xmm4 <-- x - 1.0

    vcomiss      xmm5,DWORD PTR L_real_neg127
    je           Llogf_fma3_subnormal_adjust

Llogf_fma3_continue_common:

    ; compute the index into the log tables
    vpand        xmm1,xmm0,DWORD PTR L_mask_mant_all7 ; xmm1 = ux & 0007f0000h 
    vpand        xmm3,xmm0,DWORD PTR L_mask_mant8     ; xmm3 = ux & 000008000h
    vpslld       xmm3,xmm3,1              ; xmm3  = (ux & 000008000h) << 1
    vpaddd       xmm1,xmm3,xmm1
    ; eax = (ux & 0007f0000h) + ((ux & 000008000h) << 1)
    ; eax <-- x/127., rounded to nearest
    vmovd        eax,xmm1

    ; near one codepath
    vandps       xmm4,xmm4,DWORD PTR L_real_notsign   ; xmm4 <-- fabs (x - 1.0)
    vcomiss      xmm4,DWORD PTR L_real_threshold
    jb           Llogf_fma3_near_one

    ; F,Y
    shr          eax,16
    vpor         xmm2,xmm2,DWORD PTR L_real_half ; xmm2 <-- Y
    vpor         xmm1,xmm1,DWORD PTR L_real_half ; xmm1 <-- F
    lea          r9,QWORD PTR __log_F_inv_dword

    ; f = F - Y
    vsubss       xmm1,xmm1,xmm2           ; f = F - Y
    ; r = f * log_F_inv_dword[index]
    vmulss       xmm1,xmm1,DWORD PTR [r9 + rax * 4]

    ; poly
    vmovaps      xmm2,XMMWORD PTR L_real_1_over_3
    vfmadd213ss  xmm2,xmm1,DWORD PTR L_real_1_over_2 ; 1/3*r + 1/2
    vmulss       xmm0,xmm1,xmm1           ; r*r
    vmovaps      xmm3,DWORD PTR L_real_log2_tail;

    lea          r9,DWORD PTR __log_128_tail
    lea          r10,DWORD PTR __log_128_lead

    vfmadd231ss  xmm1,xmm2,xmm0           ; poly = r + 1/2*r*r + 1/3*r*r*r 
    vfmsub213ss  xmm3,xmm5,xmm1           ; (xexp * log2_tail) - poly

    ; m*log(2) + log(G) - poly
    vmovaps      xmm0,DWORD PTR L_real_log2_lead
    vfmadd213ss  xmm0,xmm5,[r10 + rax * 4]
    ; z2 = (xexp * log2_tail) - poly + log_128_tail[index]
    vaddss       xmm3,xmm3,DWORD PTR [r9 + rax * 4]
    vaddss       xmm0,xmm0,xmm3           ; return z1 + z2
    
    StackDeallocate stack_size
    ret

ALIGN  16
Llogf_fma3_near_one:
    ; r = x - 1.0;
    vmovaps      xmm2,DWORD PTR L_real_two
    vsubss       xmm0,xmm0,DWORD PTR L_real_one  ; xmm0 = r = = x - 1.0

    ; u = r / (2.0 + r)
    vaddss       xmm2,xmm2,xmm0           ; (r+2.0)
    vdivss       xmm1,xmm0,xmm2           ; u = r / (2.0 + r)

    ; correction = r * u
    vmulss       xmm4,xmm0,xmm1           ; correction = u*r

    ; u = u + u;
    vaddss       xmm1,xmm1,xmm1           ; u = u+u 
    vmulss       xmm2,xmm1,xmm1           ; v = u^2

    ; r2 = (u * v * (ca_1 + v * ca_2) - correction)
    vmulss       xmm3,xmm1,xmm2           ; u^3
    vmovaps      xmm5,DWORD PTR L_real_ca2
    vfmadd213ss  xmm2,xmm5,DWORD PTR L_real_ca1
    vfmsub213ss  xmm2,xmm3,xmm4       ; r2 = (ca1 + ca2 * v) * u^3 - correction

    ; r + r2
    vaddss       xmm0,xmm0,xmm2
    StackDeallocate stack_size
    ret


ALIGN  16
Llogf_fma3_subnormal_adjust:
    vmovaps      xmm3,DWORD PTR L_real_one
    vpor         xmm2,xmm2,xmm3  ; xmm2 = temp = ((ux &0007FFFFFh) | 03f800000h)
    vsubss       xmm2,xmm2,xmm3  ; xmm2 = temp -1.0
    vpsrld       xmm5,xmm2,23                 ; xmm5 = (utemp >> 23)
    vpand        xmm2,xmm2,DWORD PTR L_real_mant ; xmm2 = (utemp & 0007FFFFFh)
    vmovaps      xmm0,xmm2
    vpsubd       xmm5,xmm5,DWORD PTR L_mask_253  ; xmm5 = (utemp >> 23) - 253
    vcvtdq2ps    xmm5,xmm5               ; xmm5 = (float) ((utemp >> 23) - 253)
    jmp          Llogf_fma3_continue_common

Llogf_fma3_x_is_zero_or_neg:
    jne          Llogf_fma3_x_is_neg

    vmovaps      xmm1,DWORD PTR L_real_ninf
    mov          r8d,DWORD PTR L_flag_x_zero
    call         fname_special
    
    StackDeallocate stack_size
    ret
 

Llogf_fma3_x_is_neg:

    vmovaps      xmm1,DWORD PTR L_real_neg_qnan
    mov          r8d,DWORD PTR L_flag_x_neg
    call         fname_special
    
    StackDeallocate stack_size
    ret
     
Llogf_fma3_x_is_inf_or_nan:

    cmp          eax,DWORD PTR L_real_inf
    je           Llogf_fma3_finish

    cmp          eax,DWORD PTR L_real_ninf
    je           Llogf_fma3_x_is_neg

    or           eax,DWORD PTR L_real_qnanbit
    vmovd        xmm1,eax
    mov          r8d,DWORD PTR L_flag_x_nan
    call         fname_special
    
    StackDeallocate stack_size
    ret

Llogf_fma3_finish:
    
    StackDeallocate stack_size
    ret


fname endp
END
[LIBM] Import win-libm from AMD Source: https://github.com/amd/win-libm 2022-06-12 10:02:01 +00:00			`;`
			`; MIT License`
			`; -----------`
			`;`
			`; Copyright (c) 2002-2019 Advanced Micro Devices, Inc.`
			`;`
			`; Permission is hereby granted, free of charge, to any person obtaining a copy`
			`; of this Software and associated documentaon files (the "Software"), to deal`
			`; in the Software without restriction, including without limitation the rights`
			`; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell`
			`; copies of the Software, and to permit persons to whom the Software is`
			`; furnished to do so, subject to the following conditions:`
			`;`
			`; The above copyright notice and this permission notice shall be included in`
			`; all copies or substantial portions of the Software.`
			`;`
			`; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR`
			`; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,`
			`; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE`
			`; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER`
			`; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,`
			`; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN`
			`; THE SOFTWARE.`
			`;`
			`;`
			`; logf.asm`
			`;`
			`; An implementation of the logf libm function.`
			`;`
			`; Prototype:`
			`;`
			`; float logf(float x);`
			`;`

			`;`
			`; Algorithm:`
			`; Similar to one presnted in log.asm`
			`;`
			`.const`


			`ALIGN 16`

			`L_real_one DQ 0000000003f800000h ; 1.0`
			`DQ 0000000000000000h`
			`L_real_two DQ 00000000040000000h ; 1.0`
			`DQ 00000000000000000h`
			`L_real_ninf DQ 000000000ff800000h ; -inf`
			`DQ 0000000000000000h`
			`L_real_inf DQ 0000000007f800000h ; +inf`
			`DQ 0000000000000000h`
			`L_real_nan DQ 0000000007fc00000h ; NaN`
			`DQ 0000000000000000h`
			`L_real_neg_qnan DQ 000000000ffc00000h`
			`DQ 0000000000000000h`
			`L_real_notsign DQ 0000000007ffFFFFFh ; ^sign bit`
			`DQ 0000000000000000h`
			`L_real_mant DQ 0007FFFFF007FFFFFh ; mantissa bits`
			`DQ 0007FFFFF007FFFFFh`
			`L_mask_127 DQ 00000007f0000007fh ;`
			`DQ 00000007f0000007fh`
			`L_mask_253 DQ 000000000000000fdh`
			`DQ 00000000000000000h`
			`L_mask_mant_all7 DQ 00000000007f0000h`
			`DQ 00000000007f0000h`
			`L_mask_mant8 DQ 0000000000008000h`
			`DQ 0000000000000000h`
			`L_real_ca1 DQ 0000000003DAAAAABh ; 8.33333333333317923934e-02`
			`DQ 00000000000000000h`
			`L_real_ca2 DQ 0000000003C4CCCCDh ; 1.25000000037717509602e-02`
			`DQ 00000000000000000h`
			`L_real_log2_lead DQ 03F3170003F317000h ; 0.693115234375`
			`DQ 00000000000000000h`
			`L_real_log2_tail DQ 0000000003805FDF4h ; 0.000031946183`
			`DQ 00000000000000000h`
			`L_real_half DQ 0000000003f000000h ; 1/2`
			`DQ 00000000000000000h`
			`L_real_1_over_3 DQ 0000000003eaaaaabh`
			`DQ 00000000000000000h`

			`L_real_1_over_2 DD 03f000000h`
			`L_real_neg127 DD 0c2fe0000h`
			`L_real_qnanbit DD 000400000h ; quiet nan bit`
			`L_real_threshold DD 03d800000h`

			`; these codes and the ones in the corresponding .c file have to match`
			`L_flag_x_zero DD 00000001`
			`L_flag_x_neg DD 00000002`
			`L_flag_x_nan DD 00000003`

			`EXTRN __log_128_lead:DWORD`
			`EXTRN __log_128_tail:DWORD`
			`EXTRN __log_F_inv_dword:DWORD`
			`EXTRN __use_fma3_lib:DWORD`

			`fname TEXTEQU <logf>`
			`fname_special TEXTEQU <_logf_special>`

			`; define local variable storage offsets`

			`dummy_space EQU 020h`
			`stack_size EQU 038h`

			`include fm.inc`

			`; external function`
			`EXTERN fname_special:PROC`

			`.code`

			`PUBLIC fname`
			`fname PROC FRAME`
			`StackAllocate stack_size`
			`.ENDPROLOG`
			`cmp DWORD PTR __use_fma3_lib, 0`
			`jne Llogf_fma3`

			`; Some of the placement of instructions below iwll be odd.`
			`; We are attempting to have no more than one branch per 32-byte block.`
			`Llogf_sse2:`
			`; Zero the high bits of rax because it will be used as an index later.`
			`xor rax, rax`
			`movdqa xmm3, xmm0`
			`movaps xmm4, xmm0`

			`; This computation of the expoonent of x will produce nonsenes if x <= 0.,`
			`; but those cases are eliminated below, so it does no harm.`
			`psrld xmm3, 23 ; xmm3 <-- biased exp if x > 0.`

			`; Is x Inf or NaN?`
			`movd eax, xmm0 ; eax <-- x`
			`mov ecx, eax`
			`btr ecx, 31 ; ecx <-- \|x\|`
			`cmp ecx, DWORD PTR L_real_inf`
			`jae Llogf_sse2_x_is_inf_or_nan`

			`; Finish computing exponent.`
			`psubd xmm3, XMMWORD PTR L_mask_127 ; xmm3 <-- xexp (unbiased)`
			`movdqa xmm2, xmm0`
			`cvtdq2ps xmm5, xmm3 ; (float)xexp, unless x <= 0.`

			`; Is x negative or zero?`
			`xorps xmm1, xmm1`
			`comiss xmm0, xmm1`
			`jbe Llogf_sse2_x_is_zero_or_neg`

			`pand xmm2, XMMWORD PTR L_real_mant ; xmm2 <-- x mantissa for later`
			`subss xmm4, DWORD PTR L_real_one ; xmm4 <-- x - 1. for later`

			`comiss xmm5, DWORD PTR L_real_neg127 ; x!=0, xexp==0 ==> subnormal`
			`je Llogf_sse2_subnormal_adjust`

			`Llogf_sse2_continue_common:`
			`; At this point we need \|x\| (possibly adjusted) in eax`
			`; and m = xexpx (possibly adjusted) in xmm5`
			`; We also need the value of x - 1. computed above.`

			`; compute the index into the log tables`
			`mov r9d, eax`
			`and eax, DWORD PTR L_mask_mant_all7 ; eax <-- 7 bits of x mantissa`
			`and r9d, DWORD PTR L_mask_mant8 ; r9d <-- 8th bit`
			`shl r9d, 1`
			`add eax, r9d ; use 8th bit to round up`
			`movd xmm1, eax`

			`; Is x near 1.0 ?`
			`; Note that if x is subnormal it is perforce not near one.`
			`andps xmm4, XMMWORD PTR L_real_notsign ; xmm4 <-- \|x-1\|`
			`comiss xmm4, DWORD PTR L_real_threshold ; is \|x-1\| < 1/16?`
			`jb Llogf_sse2_near_one ; if so, handle elsewhere`

			`; F, Y`
			`; F is a number in [.5,1) scaled from the rounded mantissa bits computed`
			`; above by oring in the exponent of .5.`
			`; Y is all of the mantissa bits of X scaled to [.5,1.) similarly`
			`shr eax, 16 ; shift eax to use as index`
			`por xmm2, XMMWORD PTR L_real_half ; xmm2 <-- Y`
			`por xmm1, XMMWORD PTR L_real_half ; xmm2 <-- F`
			`lea r9, QWORD PTR __log_F_inv_dword`


			`; f = F - Y, r = f * inv`
			`subss xmm1, xmm2 ; xmm1 <-- f`
			`mulss xmm1, DWORD PTR [r9+rax4] ; xmm1 <-- r = finv (tabled)`

			`movaps xmm2, xmm1`
			`movaps xmm0, xmm1`

			`; poly`
			`mulss xmm2, DWORD PTR L_real_1_over_3 ; xmm2 <-- r/3`
			`mulss xmm0, xmm1 ; xmm0 <-- r^2`
			`addss xmm2, DWORD PTR L_real_1_over_2`
			`movaps xmm3, XMMWORD PTR L_real_log2_tail`

			`lea r9, QWORD PTR __log_128_tail`
			`lea r10, QWORD PTR __log_128_lead`

			`mulss xmm2, xmm0 ; xmm2 <-- r^2 * (r/3 + 1/2)`
			`mulss xmm3, xmm5 ; xmm3 <-- (m=xexp)*log2_tail`
			`addss xmm1, xmm2 ; xmm1 <-- poly`

			`; mlog(2) + log(G) - poly, where G is just 2F`
			`; log(G) is precomputed to extra precision.`
			`; small pieces and large pieces are separated until the final add,`
			`; to preserve accuracy`
			`movaps xmm0, XMMWORD PTR L_real_log2_lead`
			`subss xmm3, xmm1 ; xmm3 <-- m*log2_tail - poly`
			`mulss xmm0, xmm5 ; xmm0 <-- m*log1_lead`
			`addss xmm3, DWORD PTR [r9+rax*4] ; xmm3 += log(G) tail`
			`addss xmm0, DWORD PTR [r10+rax*4] ; xmm0 += log(G) lead`

			`addss xmm0, xmm3 ; xmm0 <-- m*log(2)+log(G)-poly`

			`StackDeallocate stack_size`
			`ret`

			`ALIGN 16`
			`Llogf_sse2_near_one:`
			`; Computation of the log for x near one requires special techniques.`
			`movaps xmm2, DWORD PTR L_real_two`
			`subss xmm0, DWORD PTR L_real_one ; xmm0 <-- r = x - 1.0`
			`addss xmm2, xmm0`
			`movaps xmm1, xmm0`
			`divss xmm1, xmm2 ; xmm1 <-- u = r/(2.0+r)`
			`movaps xmm4, xmm0`
			`mulss xmm4, xmm1 ; xmm4 <-- correction = r*u`
			`addss xmm1, xmm1 ; xmm1 <-- u = 2.*u`
			`movaps xmm2, xmm1`
			`mulss xmm2, xmm2 ; xmm2 <-- u^2`

			`; r2 = (u^3 * (ca_1 + u^2 * ca_2) - correction)`
			`movaps xmm3, xmm1`
			`mulss xmm3, xmm2 ; xmm3 <-- u^3`
			`mulss xmm2, DWORD PTR L_real_ca2 ; xmm2 <-- ca2*u^2`
			`addss xmm2, DWORD PTR L_real_ca1 ; xmm2 <-- ca2*u^2 + ca1`
			`mulss xmm2, xmm3 ; xmm2 <-- u^3(ca1+u^2ca2)`
			`subss xmm2, xmm4 ; xmm2 <-- r2`

			`; return r + r2`
			`addss xmm0, xmm2`
			`StackDeallocate stack_size`
			`ret`

			`ALIGN 16`
			`Llogf_sse2_subnormal_adjust:`
			`; This code adjusts eax and xmm5.`
			`; It must preserve xmm4.`
			`por xmm2, XMMWORD PTR L_real_one`
			`subss xmm2, DWORD PTR L_real_one`
			`movdqa xmm5, xmm2`
			`pand xmm2, XMMWORD PTR L_real_mant`
			`movd eax, xmm2`
			`psrld xmm5, 23`
			`psubd xmm5, XMMWORD PTR L_mask_253`
			`cvtdq2ps xmm5, xmm5`
			`jmp Llogf_sse2_continue_common`

			`; Until we get to the FMA3 code, the rest of this is special case handling.`
			`Llogf_sse2_x_is_zero_or_neg:`
			`jne Llogf_sse2_x_is_neg`

			`movaps xmm1, XMMWORD PTR L_real_ninf`
			`mov r8d, DWORD PTR L_flag_x_zero`
			`call fname_special`
			`jmp Llogf_sse2_finish`

			`Llogf_sse2_x_is_neg:`

			`movaps xmm1, XMMWORD PTR L_real_neg_qnan`
			`mov r8d, DWORD PTR L_flag_x_neg`
			`call fname_special`
			`jmp Llogf_sse2_finish`

			`Llogf_sse2_x_is_inf_or_nan:`

			`cmp eax, DWORD PTR L_real_inf`
			`je Llogf_sse2_finish`

			`cmp eax, DWORD PTR L_real_ninf`
			`je Llogf_sse2_x_is_neg`

			`or eax, DWORD PTR L_real_qnanbit`
			`movd xmm1, eax`
			`mov r8d, DWORD PTR L_flag_x_nan`
			`call fname_special`
			`jmp Llogf_sse2_finish`

			`Llogf_sse2_finish:`
			`StackDeallocate stack_size`
			`ret`

			`ALIGN 16`
			`Llogf_fma3:`
			`; compute exponent part`
			`vmovaps xmm4,XMMWORD PTR L_real_inf ; preload for inf/nan test`
			`xor rax,rax`
			`vpsrld xmm3,xmm0,23 ; xmm3 <-- (ux>>23)`
			`vmovd eax,xmm0 ;eax = x`
			`vpsubd xmm3,xmm3,DWORD PTR L_mask_127 ; xmm3 <-- (ux>>23) - 127`
			`vcvtdq2ps xmm5,xmm3 ; xmm5 <-- float((ux>>23)-127) = xexp`

			`; NaN or inf`
			`vpand xmm1,xmm0,xmm4 ; xmm1 <-- (ux & 07f800000h)`
			`vcomiss xmm1,xmm4`
			`je Llogf_fma3_x_is_inf_or_nan`

			`; check for negative numbers or zero`
			`vpxor xmm1,xmm1,xmm1`
			`vcomiss xmm0,xmm1`
			`jbe Llogf_fma3_x_is_zero_or_neg`

			`vpand xmm2,xmm0,DWORD PTR L_real_mant ; xmm2 <-- ux & 0007FFFFFh`
			`vsubss xmm4,xmm0,DWORD PTR L_real_one ; xmm4 <-- x - 1.0`

			`vcomiss xmm5,DWORD PTR L_real_neg127`
			`je Llogf_fma3_subnormal_adjust`

			`Llogf_fma3_continue_common:`

			`; compute the index into the log tables`
			`vpand xmm1,xmm0,DWORD PTR L_mask_mant_all7 ; xmm1 = ux & 0007f0000h`
			`vpand xmm3,xmm0,DWORD PTR L_mask_mant8 ; xmm3 = ux & 000008000h`
			`vpslld xmm3,xmm3,1 ; xmm3 = (ux & 000008000h) << 1`
			`vpaddd xmm1,xmm3,xmm1`
			`; eax = (ux & 0007f0000h) + ((ux & 000008000h) << 1)`
			`; eax <-- x/127., rounded to nearest`
			`vmovd eax,xmm1`

			`; near one codepath`
			`vandps xmm4,xmm4,DWORD PTR L_real_notsign ; xmm4 <-- fabs (x - 1.0)`
			`vcomiss xmm4,DWORD PTR L_real_threshold`
			`jb Llogf_fma3_near_one`

			`; F,Y`
			`shr eax,16`
			`vpor xmm2,xmm2,DWORD PTR L_real_half ; xmm2 <-- Y`
			`vpor xmm1,xmm1,DWORD PTR L_real_half ; xmm1 <-- F`
			`lea r9,QWORD PTR __log_F_inv_dword`

			`; f = F - Y`
			`vsubss xmm1,xmm1,xmm2 ; f = F - Y`
			`; r = f * log_F_inv_dword[index]`
			`vmulss xmm1,xmm1,DWORD PTR [r9 + rax * 4]`

			`; poly`
			`vmovaps xmm2,XMMWORD PTR L_real_1_over_3`
			`vfmadd213ss xmm2,xmm1,DWORD PTR L_real_1_over_2 ; 1/3*r + 1/2`
			`vmulss xmm0,xmm1,xmm1 ; r*r`
			`vmovaps xmm3,DWORD PTR L_real_log2_tail;`

			`lea r9,DWORD PTR __log_128_tail`
			`lea r10,DWORD PTR __log_128_lead`

			`vfmadd231ss xmm1,xmm2,xmm0 ; poly = r + 1/2rr + 1/3rr*r`
			`vfmsub213ss xmm3,xmm5,xmm1 ; (xexp * log2_tail) - poly`

			`; m*log(2) + log(G) - poly`
			`vmovaps xmm0,DWORD PTR L_real_log2_lead`
			`vfmadd213ss xmm0,xmm5,[r10 + rax * 4]`
			`; z2 = (xexp * log2_tail) - poly + log_128_tail[index]`
			`vaddss xmm3,xmm3,DWORD PTR [r9 + rax * 4]`
			`vaddss xmm0,xmm0,xmm3 ; return z1 + z2`

			`StackDeallocate stack_size`
			`ret`

			`ALIGN 16`
			`Llogf_fma3_near_one:`
			`; r = x - 1.0;`
			`vmovaps xmm2,DWORD PTR L_real_two`
			`vsubss xmm0,xmm0,DWORD PTR L_real_one ; xmm0 = r = = x - 1.0`

			`; u = r / (2.0 + r)`
			`vaddss xmm2,xmm2,xmm0 ; (r+2.0)`
			`vdivss xmm1,xmm0,xmm2 ; u = r / (2.0 + r)`

			`; correction = r * u`
			`vmulss xmm4,xmm0,xmm1 ; correction = u*r`

			`; u = u + u;`
			`vaddss xmm1,xmm1,xmm1 ; u = u+u`
			`vmulss xmm2,xmm1,xmm1 ; v = u^2`

			`; r2 = (u * v * (ca_1 + v * ca_2) - correction)`
			`vmulss xmm3,xmm1,xmm2 ; u^3`
			`vmovaps xmm5,DWORD PTR L_real_ca2`
			`vfmadd213ss xmm2,xmm5,DWORD PTR L_real_ca1`
			`vfmsub213ss xmm2,xmm3,xmm4 ; r2 = (ca1 + ca2 * v) * u^3 - correction`

			`; r + r2`
			`vaddss xmm0,xmm0,xmm2`
			`StackDeallocate stack_size`
			`ret`


			`ALIGN 16`
			`Llogf_fma3_subnormal_adjust:`
			`vmovaps xmm3,DWORD PTR L_real_one`
			`vpor xmm2,xmm2,xmm3 ; xmm2 = temp = ((ux &0007FFFFFh) \| 03f800000h)`
			`vsubss xmm2,xmm2,xmm3 ; xmm2 = temp -1.0`
			`vpsrld xmm5,xmm2,23 ; xmm5 = (utemp >> 23)`
			`vpand xmm2,xmm2,DWORD PTR L_real_mant ; xmm2 = (utemp & 0007FFFFFh)`
			`vmovaps xmm0,xmm2`
			`vpsubd xmm5,xmm5,DWORD PTR L_mask_253 ; xmm5 = (utemp >> 23) - 253`
			`vcvtdq2ps xmm5,xmm5 ; xmm5 = (float) ((utemp >> 23) - 253)`
			`jmp Llogf_fma3_continue_common`

			`Llogf_fma3_x_is_zero_or_neg:`
			`jne Llogf_fma3_x_is_neg`

			`vmovaps xmm1,DWORD PTR L_real_ninf`
			`mov r8d,DWORD PTR L_flag_x_zero`
			`call fname_special`

			`StackDeallocate stack_size`
			`ret`


			`Llogf_fma3_x_is_neg:`

			`vmovaps xmm1,DWORD PTR L_real_neg_qnan`
			`mov r8d,DWORD PTR L_flag_x_neg`
			`call fname_special`

			`StackDeallocate stack_size`
			`ret`

			`Llogf_fma3_x_is_inf_or_nan:`

			`cmp eax,DWORD PTR L_real_inf`
			`je Llogf_fma3_finish`

			`cmp eax,DWORD PTR L_real_ninf`
			`je Llogf_fma3_x_is_neg`

			`or eax,DWORD PTR L_real_qnanbit`
			`vmovd xmm1,eax`
			`mov r8d,DWORD PTR L_flag_x_nan`
			`call fname_special`

			`StackDeallocate stack_size`
			`ret`

			`Llogf_fma3_finish:`

			`StackDeallocate stack_size`
			`ret`


			`fname endp`
			`END`