diff --git a/dll/win32/rpcrt4/msvc.S b/dll/win32/rpcrt4/msvc.S
index fca5ee5e612..a1da254285b 100644
--- a/dll/win32/rpcrt4/msvc.S
+++ b/dll/win32/rpcrt4/msvc.S
@@ -62,9 +62,9 @@ FUNC call_stubless_func
     add rdx, [rcx + 8]       /* info->ProcFormatString + offset */
     mov rcx, [rcx]           /* info->pStubDesc */
 
-    movaps [rsp + 20h], xmm1
-    movaps [rsp + 28h], xmm2
-    movaps [rsp + 30h], xmm3
+    movsd qword ptr [rsp + 20h], xmm1
+    movsd qword ptr [rsp + 28h], xmm2
+    movsd qword ptr [rsp + 30h], xmm3
     lea r9, [rsp + 18h]      /* fpu_args */
     call ndr_client_call
     add rsp, 38h
@@ -97,6 +97,14 @@ FUNC call_server_func
     mov rdx, [rsp + 8]
     mov r8, [rsp + 16]
     mov r9, [rsp + 24]
+
+    /* The 64-bit SSE2 form of movd is usually spelled movq, as in GCC code
+       (see https://www.felixcloutier.com/x86/movd:movq). However, there is a
+       second movq with a different encoding that does not accept an integer
+       register as source (see https://www.felixcloutier.com/x86/movq). Older
+       versions of ML64 confuse the two and reject movq with integer registers,
+       but they assemble movd as the 64-bit move when the source is a 64-bit
+       register, so we use movd here. */
     movd xmm0, rcx
     movd xmm1, rdx
     movd xmm2, r8