- Make memcpy a duplicate of memmove (confirmed by testing); there is a dependency on this behaviour. One less "msvcrt_winetest.exe string" failure.

svn path=/trunk/; revision=41126
2025-07-31 19:31:45 +00:00 · 2009-05-26 16:35:01 +00:00 · 2009-05-26 16:35:01 +00:00 · d41ad9a51b
commit d41ad9a51b
parent 878b219c0c
5 changed files with 112 additions and 86 deletions
--- a/reactos/lib/sdk/crt/mem/i386/memcpy_asm.s
+++ b/reactos/lib/sdk/crt/mem/i386/memcpy_asm.s
@ -1,7 +1,7 @@
 /*
 * void *memcpy (void *to, const void *from, size_t count)
 *
- * Some optimization research can be found in media/doc/memcpy_optimize.txt
+ * NOTE: This code is a duplicate of memmove function from memmove_asm.s
 */

 .globl	_memcpy
@ -9,26 +9,39 @@
 _memcpy:
 	push	%ebp
 	mov	%esp,%ebp
+	
 	push	%esi
 	push	%edi
-	mov	0x8(%ebp),%edi
-	mov	0xc(%ebp),%esi
-	mov	0x10(%ebp),%ecx
+	
+	mov	8(%ebp),%edi
+	mov	12(%ebp),%esi
+	mov	16(%ebp),%ecx
+	
+	cmp	%esi,%edi
+	jbe	.CopyUp
+	mov	%ecx,%eax
+	add	%esi,%eax
+	cmp	%eax,%edi
+	jb	.CopyDown
+	
+.CopyUp:	
 	cld
+	
 	cmp	$16,%ecx
 	jb	.L1
 	mov	%ecx,%edx
 	test	$3,%edi
 	je	.L2
 /*
- *  Make the destination dword aligned
+ * Make the destination dword aligned
 */
-	mov	%edi,%ecx
-	neg %ecx
-	and	$3,%ecx
-	sub	%ecx,%edx
-	rep	movsb
-	mov	%edx,%ecx	
+        mov	%edi,%ecx
+        and	$3,%ecx
+        sub	$5,%ecx
+        not	%ecx
+        sub	%ecx,%edx
+        rep	movsb
+        mov	%edx,%ecx	
 .L2:
 	shr	$2,%ecx
 	rep	movsl
@ -39,9 +52,63 @@ _memcpy:
 	je	.L3
 	rep	movsb
 .L3:
+	mov	8(%ebp),%eax
 	pop	%edi
 	pop	%esi
-	mov	0x8(%ebp),%eax
 	leave
 	ret

+.CopyDown:
+        std
+        
+	add	%ecx,%edi
+	add	%ecx,%esi
+	
+	cmp	$16,%ecx
+	jb	.L4
+        mov	%ecx,%edx
+	test	$3,%edi
+	je	.L5
+	
+/*
+ * Make the destination dword aligned
+ */
+	mov	%edi,%ecx
+	and	$3,%ecx
+	sub	%ecx,%edx
+	dec	%esi
+	dec	%edi
+	rep	movsb
+	mov	%edx,%ecx
+	
+	sub	$3,%esi
+	sub	$3,%edi
+.L6:	
+	shr	$2,%ecx
+	rep	movsl
+	mov	%edx,%ecx
+	and	$3,%ecx
+	je	.L7
+	add	$3,%esi
+	add	$3,%edi
+.L8:	
+	rep	movsb
+.L7:
+	cld
+	mov	8(%ebp),%eax
+	pop	%edi
+	pop	%esi
+	leave
+	ret
+.L5:
+	sub	$4,%edi
+	sub	$4,%esi
+	jmp	.L6
+		
+.L4:
+	test	%ecx,%ecx
+	je	.L7	
+	dec	%esi
+	dec	%edi
+	jmp	.L8
+
--- a/reactos/lib/sdk/crt/mem/i386/memmove_asm.s
+++ b/reactos/lib/sdk/crt/mem/i386/memmove_asm.s
@ -1,9 +1,7 @@
-/* 
- * $Id$
- */
-
 /*
 * void *memmove (void *to, const void *from, size_t count)
+ *
+ * NOTE: This code is duplicated in memcpy_asm.s
 */

 .globl	_memmove
--- a/reactos/lib/sdk/crt/mem/memcpy.c
+++ b/reactos/lib/sdk/crt/mem/memcpy.c
@ -1,16 +1,36 @@
-/*
- * $Id$
- */
-
 #include <string.h>

+/* NOTE: This code is a duplicate of memmove implementation! */
 void* memcpy(void* to, const void* from, size_t count)
 {
-  register char *f = (char *)from;
-  register char *t = (char *)to;
-  register int i = count;
+    char *char_dest = (char *)dest;
+    char *char_src = (char *)src;

-  while (i-- > 0)
-    *t++ = *f++;
-  return to;
+    if ((char_dest <= char_src) || (char_dest >= (char_src+count)))
+    {
+        /*  non-overlapping buffers */
+        while(count > 0)
+	{
+            *char_dest = *char_src;
+            char_dest++;
+            char_src++;
+            count--;
+	}
+    }
+    else
+    {
+        /* overlapping buffers */
+        char_dest = (char *)dest + count - 1;
+        char_src = (char *)src + count - 1;
+
+        while(count > 0)
+	{
+           *char_dest = *char_src;
+           char_dest--;
+           char_src--;
+           count--;
+	}
+    }
+
+    return dest;
 }
--- a/reactos/lib/sdk/crt/mem/memmove.c
+++ b/reactos/lib/sdk/crt/mem/memmove.c
@ -1,10 +1,6 @@
-/*
- * $Id$
- */
-
 #include <string.h>

-
+/* NOTE: This code is duplicated in memcpy function */
 void * memmove(void *dest,const void *src,size_t count)
 {
    char *char_dest = (char *)dest;
--- a/reactos/media/doc/memcpy_optimize.txt
+++ b/reactos/media/doc/memcpy_optimize.txt
@ -1,55 +0,0 @@
-Surfing the Internet, I stumbled upon http://www.sciencemark.org where you
-can download a benchmark program that (amongst others) can benchmark different
-x86 memcpy implementations. Running that benchmark on my machine revealed that
-the fastest implementation was roughly twice as fast as the "rep movsl"
-implementation (lib/string/i386/memcpy_asm.s) that ReactOS uses.
-To test the alternate implementations in a ReactOS setting, I first
-instrumented the existing memcpy implementation to log with which arguments
-it was being called. I then booted ReactOS, started a background compile in it
-(to generate some I/O) and played a game of Solitaire (to generate graphics
-operations). After losing the game, I shut down ReactOS. I then extracted
-the memcpy calls roughly between the start of Explorer (to get rid of one time
-startup effects) and shutdown. The resulting call profile is attached below.
-I then used that profile to make calls to the existing memcpy and an alternate
-implementation (I selected the "MMX registry copy with SSE prefetching"),
-taking care to use different source and destination regions to remove caching
-effects. The profile consisted of roughly 250000 calls to memcpy, I found
-that I had to execute the profile 10000 times to get "reasonable" time values.
-To compensate for the overhead of the test program, I also ran a test where
-the whole memcpy routine consisted of a single instruction: "ret". The test
-results, after applying a correction for the overhead:
-
-rep movl 70.5 sec
-mmx registers 58.3 sec
-Speed increase: 17%
-
-(Test machine: AMD Athlon MP 2800+ running Linux).
-Although the relative speed increase is nice (17%), we also have to look at the
-absolute speed increase. Remember that the 70.5 sec for the "rep movl" case
-was obtained by running the whole profile 10000 times. This means that all the
-memcpy's executed during the profiling run of ReactOS together took only
-0.00705 seconds. So the conclusion has to be that we're simply not spending
-a significant amount of time in memcpy (BTW, our memcpy implementation is
-shared between kernel and user mode, of the total of 250000 memcpy calls about
-90% were made from kernel mode and 10% from user mode), so optimizing memcpy
-(although possible) will not result in significantly better performance of
-ReactOS as a whole.
-Just for fun, I then used only the part of the profile where the memory area
-was larger than 128 bytes. The MMX implementation actually only runs for sizes
-over 128 bytes, for smaller sizes it deferred to the "rep movl" implementation.
-According to the profile, the vast majority of memcpy calls is made with a
-size smaller than 128 bytes (96.8%).
-
-rep movl 52.9 sec
-mmx registers 27.1 sec
-Speed increase 48%
-
-This is more or less in line with the results I got from the membench benchmark
-from http://www.sciencemark.org.
-
-Final conclusion: Although optimizing memcpy is useful (and feasible) for
-transfer of large blocks, the usage pattern in ReactOS consists mostly of
-small blocks. The resulting absolute speed increase doesn't justify the
-increased code complexity.
-
-2005/12/03 GvG