[RTL/x64] Improve RtlCaptureContext

Use movaps instead of movdqa; it does the same thing, but its encoding is one byte shorter.
Shuffle instructions around a bit to maximize parallel execution.
This commit is contained in:
Timo Kreuzer 2018-03-02 08:02:13 +01:00
parent 3831c0ca31
commit abb338b13d

View file

@@ -16,9 +16,9 @@
.code64 .code64
/* /*
* VOID NTAPI * VOID
* RtlCaptureContext( * RtlCaptureContext(
* PCONTEXT ContextRecord); <rcx> * _Out_ PCONTEXT ContextRecord@<rcx>);
*/ */
PUBLIC RtlCaptureContext PUBLIC RtlCaptureContext
.PROC RtlCaptureContext .PROC RtlCaptureContext
@@ -28,70 +28,78 @@ PUBLIC RtlCaptureContext
.ALLOCSTACK 8 .ALLOCSTACK 8
.ENDPROLOG .ENDPROLOG
/* Save the basic register context */ /* Save rax first, we use it later to copy some data */
mov [rcx + CONTEXT_Rax], rax mov [rcx + CxRax], rax
mov [rcx + CONTEXT_Rcx], rcx
mov [rcx + CONTEXT_Rdx], rdx
/* Load rflags into rax */ /* Set ContextFlags */
mov rax, [rsp] mov dword ptr [rcx + CxContextFlags], (CONTEXT_FULL or CONTEXT_SEGMENTS)
mov [rcx + CONTEXT_Rbx], rbx /* Store the basic register context */
mov [rcx + CONTEXT_Rsi], rsi mov [rcx + CxRcx], rcx
mov [rcx + CONTEXT_Rdi], rdi mov [rcx + CxRdx], rdx
mov [rcx + CxRbx], rbx
/* Store rflags */ mov [rcx + CxRsi], rsi
mov [rcx + CONTEXT_EFlags], rax
mov [rcx + CONTEXT_Rbp], rbp
mov [rcx + CONTEXT_R8], r8
mov [rcx + CONTEXT_R9], r9
/* Load former stack pointer in rax */
lea rax, [rsp + 16]
mov [rcx + CONTEXT_R10], r10
mov [rcx + CONTEXT_R11], r11
mov [rcx + CONTEXT_R12], r12
/* Store stack pointer */
mov [rcx + CONTEXT_Rsp], rax
mov [rcx + CONTEXT_R13], r13
mov [rcx + CONTEXT_R14], r14
mov [rcx + CONTEXT_R15], r15
/* Load return address in rax */ /* Load return address in rax */
mov rax, [rsp + 8] mov rax, [rsp + 8]
/* Safe segment selectors */ mov [rcx + CxRdi], rdi
mov [rcx + CONTEXT_SegCs], cs mov [rcx + CxRbp], rbp
mov [rcx + CONTEXT_SegDs], ds mov [rcx + CxR8], r8
mov [rcx + CONTEXT_SegEs], es mov [rcx + CxR9], r9
mov [rcx + CONTEXT_SegFs], fs mov [rcx + CxR10], r10
mov [rcx + CONTEXT_SegGs], gs
mov [rcx + CONTEXT_SegSs], ss
/* Store return address */ /* Store the return address */
mov [rcx + CONTEXT_Rip], rax mov [rcx + CxRip], rax
/* Safe xmm registers */ mov [rcx + CxR11], r11
movdqa [rcx + CONTEXT_Xmm0], xmm0 mov [rcx + CxR12], r12
movdqa [rcx + CONTEXT_Xmm1], xmm1 mov [rcx + CxR13], r13
movdqa [rcx + CONTEXT_Xmm2], xmm2 mov [rcx + CxR14], r14
movdqa [rcx + CONTEXT_Xmm3], xmm3 mov [rcx + CxR15], r15
movdqa [rcx + CONTEXT_Xmm4], xmm4
movdqa [rcx + CONTEXT_Xmm5], xmm5 /* Load former stack pointer in rax */
movdqa [rcx + CONTEXT_Xmm6], xmm6 lea rax, [rsp + 16]
movdqa [rcx + CONTEXT_Xmm7], xmm7
movdqa [rcx + CONTEXT_Xmm8], xmm8 /* Store segment selectors */
movdqa [rcx + CONTEXT_Xmm9], xmm9 mov [rcx + CxSegCs], cs
movdqa [rcx + CONTEXT_Xmm10], xmm10 mov [rcx + CxSegDs], ds
movdqa [rcx + CONTEXT_Xmm11], xmm11 mov [rcx + CxSegEs], es
movdqa [rcx + CONTEXT_Xmm12], xmm12 mov [rcx + CxSegFs], fs
movdqa [rcx + CONTEXT_Xmm13], xmm13 mov [rcx + CxSegGs], gs
movdqa [rcx + CONTEXT_Xmm14], xmm14 mov [rcx + CxSegSs], ss
movdqa [rcx + CONTEXT_Xmm15], xmm15
/* Store stack pointer */
mov [rcx + CxRsp], rax
/* Store xmm registers */
movaps [rcx + CxXmm0], xmm0
movaps [rcx + CxXmm1], xmm1
movaps [rcx + CxXmm2], xmm2
movaps [rcx + CxXmm3], xmm3
movaps [rcx + CxXmm4], xmm4
movaps [rcx + CxXmm5], xmm5
movaps [rcx + CxXmm6], xmm6
movaps [rcx + CxXmm7], xmm7
/* Load rflags into eax */
mov eax, [rsp]
movaps [rcx + CxXmm8], xmm8
movaps [rcx + CxXmm9], xmm9
movaps [rcx + CxXmm10], xmm10
movaps [rcx + CxXmm11], xmm11
movaps [rcx + CxXmm12], xmm12
movaps [rcx + CxXmm13], xmm13
movaps [rcx + CxXmm14], xmm14
movaps [rcx + CxXmm15], xmm15
/* Store legacy floating point registers */
fxsave [rcx + CxFltSave]
stmxcsr [rcx + CxMxCsr]
/* Store rflags */
mov [rcx + CxEFlags], eax
/* Cleanup stack and return */ /* Cleanup stack and return */
add rsp, 8 add rsp, 8