mirror of
https://github.com/reactos/reactos.git
synced 2025-07-31 19:31:45 +00:00
- Make memcpy a duplicate of memmove (confirmed by testing); there is a dependency on this behaviour. One less "msvcrt_winetest.exe string" failure.
svn path=/trunk/; revision=41126
This commit is contained in:
parent
878b219c0c
commit
d41ad9a51b
5 changed files with 112 additions and 86 deletions
|
@ -1,7 +1,7 @@
|
|||
/*
|
||||
* void *memcpy (void *to, const void *from, size_t count)
|
||||
*
|
||||
* Some optimization research can be found in media/doc/memcpy_optimize.txt
|
||||
* NOTE: This code is a duplicate of memmove function from memmove_asm.s
|
||||
*/
|
||||
|
||||
.globl _memcpy
|
||||
|
@ -9,26 +9,39 @@
|
|||
_memcpy:
|
||||
push %ebp
|
||||
mov %esp,%ebp
|
||||
|
||||
push %esi
|
||||
push %edi
|
||||
mov 0x8(%ebp),%edi
|
||||
mov 0xc(%ebp),%esi
|
||||
mov 0x10(%ebp),%ecx
|
||||
|
||||
mov 8(%ebp),%edi
|
||||
mov 12(%ebp),%esi
|
||||
mov 16(%ebp),%ecx
|
||||
|
||||
cmp %esi,%edi
|
||||
jbe .CopyUp
|
||||
mov %ecx,%eax
|
||||
add %esi,%eax
|
||||
cmp %eax,%edi
|
||||
jb .CopyDown
|
||||
|
||||
.CopyUp:
|
||||
cld
|
||||
|
||||
cmp $16,%ecx
|
||||
jb .L1
|
||||
mov %ecx,%edx
|
||||
test $3,%edi
|
||||
je .L2
|
||||
/*
|
||||
* Make the destination dword aligned
|
||||
* Make the destination dword aligned
|
||||
*/
|
||||
mov %edi,%ecx
|
||||
neg %ecx
|
||||
and $3,%ecx
|
||||
sub %ecx,%edx
|
||||
rep movsb
|
||||
mov %edx,%ecx
|
||||
mov %edi,%ecx
|
||||
and $3,%ecx
|
||||
sub $5,%ecx
|
||||
not %ecx
|
||||
sub %ecx,%edx
|
||||
rep movsb
|
||||
mov %edx,%ecx
|
||||
.L2:
|
||||
shr $2,%ecx
|
||||
rep movsl
|
||||
|
@ -39,9 +52,63 @@ _memcpy:
|
|||
je .L3
|
||||
rep movsb
|
||||
.L3:
|
||||
mov 8(%ebp),%eax
|
||||
pop %edi
|
||||
pop %esi
|
||||
mov 0x8(%ebp),%eax
|
||||
leave
|
||||
ret
|
||||
|
||||
.CopyDown:
|
||||
std
|
||||
|
||||
add %ecx,%edi
|
||||
add %ecx,%esi
|
||||
|
||||
cmp $16,%ecx
|
||||
jb .L4
|
||||
mov %ecx,%edx
|
||||
test $3,%edi
|
||||
je .L5
|
||||
|
||||
/*
|
||||
* Make the destination dword aligned
|
||||
*/
|
||||
mov %edi,%ecx
|
||||
and $3,%ecx
|
||||
sub %ecx,%edx
|
||||
dec %esi
|
||||
dec %edi
|
||||
rep movsb
|
||||
mov %edx,%ecx
|
||||
|
||||
sub $3,%esi
|
||||
sub $3,%edi
|
||||
.L6:
|
||||
shr $2,%ecx
|
||||
rep movsl
|
||||
mov %edx,%ecx
|
||||
and $3,%ecx
|
||||
je .L7
|
||||
add $3,%esi
|
||||
add $3,%edi
|
||||
.L8:
|
||||
rep movsb
|
||||
.L7:
|
||||
cld
|
||||
mov 8(%ebp),%eax
|
||||
pop %edi
|
||||
pop %esi
|
||||
leave
|
||||
ret
|
||||
.L5:
|
||||
sub $4,%edi
|
||||
sub $4,%esi
|
||||
jmp .L6
|
||||
|
||||
.L4:
|
||||
test %ecx,%ecx
|
||||
je .L7
|
||||
dec %esi
|
||||
dec %edi
|
||||
jmp .L8
|
||||
|
||||
|
|
|
@ -1,9 +1,7 @@
|
|||
/*
|
||||
* $Id$
|
||||
*/
|
||||
|
||||
/*
|
||||
* void *memmove (void *to, const void *from, size_t count)
|
||||
*
|
||||
* NOTE: This code is duplicated in memcpy_asm.s
|
||||
*/
|
||||
|
||||
.globl _memmove
|
||||
|
|
|
@ -1,16 +1,36 @@
|
|||
/*
|
||||
* $Id$
|
||||
*/
|
||||
|
||||
#include <string.h>
|
||||
|
||||
/* NOTE: This code is a duplicate of memmove implementation! */
|
||||
void* memcpy(void* to, const void* from, size_t count)
|
||||
{
|
||||
register char *f = (char *)from;
|
||||
register char *t = (char *)to;
|
||||
register int i = count;
|
||||
char *char_dest = (char *)dest;
|
||||
char *char_src = (char *)src;
|
||||
|
||||
while (i-- > 0)
|
||||
*t++ = *f++;
|
||||
return to;
|
||||
if ((char_dest <= char_src) || (char_dest >= (char_src+count)))
|
||||
{
|
||||
/* non-overlapping buffers */
|
||||
while(count > 0)
|
||||
{
|
||||
*char_dest = *char_src;
|
||||
char_dest++;
|
||||
char_src++;
|
||||
count--;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
/* overlapping buffers */
|
||||
char_dest = (char *)dest + count - 1;
|
||||
char_src = (char *)src + count - 1;
|
||||
|
||||
while(count > 0)
|
||||
{
|
||||
*char_dest = *char_src;
|
||||
char_dest--;
|
||||
char_src--;
|
||||
count--;
|
||||
}
|
||||
}
|
||||
|
||||
return dest;
|
||||
}
|
||||
|
|
|
@ -1,10 +1,6 @@
|
|||
/*
|
||||
* $Id$
|
||||
*/
|
||||
|
||||
#include <string.h>
|
||||
|
||||
|
||||
/* NOTE: This code is duplicated in memcpy function */
|
||||
void * memmove(void *dest,const void *src,size_t count)
|
||||
{
|
||||
char *char_dest = (char *)dest;
|
||||
|
|
|
@ -1,55 +0,0 @@
|
|||
Surfing the Internet, I stumbled upon http://www.sciencemark.org where you
|
||||
can download a benchmark program that (amongst others) can benchmark different
|
||||
x86 memcpy implementations. Running that benchmark on my machine revealed that
|
||||
the fastest implementation was roughly twice as fast as the "rep movsl"
|
||||
implementation (lib/string/i386/memcpy_asm.s) that ReactOS uses.
|
||||
To test the alternate implementations in a ReactOS setting, I first
|
||||
instrumented the existing memcpy implementation to log with which arguments
|
||||
it was being called. I then booted ReactOS, started a background compile in it
|
||||
(to generate some I/O) and played a game of Solitaire (to generate graphics
|
||||
operations). After losing the game, I shut down ReactOS. I then extracted
|
||||
the memcpy calls roughly between the start of Explorer (to get rid of one-time
|
||||
startup effects) and shutdown. The resulting call profile is attached below.
|
||||
I then used that profile to make calls to the existing memcpy and an alternate
|
||||
implementation (I selected the "MMX register copy with SSE prefetching"),
|
||||
taking care to use different source and destination regions to remove caching
|
||||
effects. The profile consisted of roughly 250000 calls to memcpy; I found
|
||||
that I had to execute the profile 10000 times to get "reasonable" time values.
|
||||
To compensate for the overhead of the test program, I also ran a test where
|
||||
the whole memcpy routine consisted of a single instruction: "ret". The test
|
||||
results, after applying a correction for the overhead:
|
||||
|
||||
rep movsl 70.5 sec
|
||||
mmx registers 58.3 sec
|
||||
Speed increase: 17%
|
||||
|
||||
(Test machine: AMD Athlon MP 2800+ running Linux).
|
||||
Although the relative speed increase is nice (17%), we also have to look at the
|
||||
absolute speed increase. Remember that the 70.5 sec for the "rep movsl" case
|
||||
was obtained by running the whole profile 10000 times. This means that all the
|
||||
memcpy's executed during the profiling run of ReactOS together took only
|
||||
0.00705 seconds. So the conclusion has to be that we're simply not spending
|
||||
a significant amount of time in memcpy (BTW, our memcpy implementation is
|
||||
shared between kernel and user mode, of the total of 250000 memcpy calls about
|
||||
90% were made from kernel mode and 10% from user mode), so optimizing memcpy
|
||||
(although possible) will not result in a significantly better performance of
|
||||
ReactOS as a whole.
|
||||
Just for fun, I then used only the part of the profile where the memory area
|
||||
was larger than 128 bytes. The MMX implementation actually only runs for sizes
|
||||
over 128 bytes; for smaller sizes it defers to the "rep movsl" implementation.
|
||||
According to the profile, the vast majority of memcpy calls are made with a
|
||||
size smaller than 128 bytes (96.8%).
|
||||
|
||||
rep movsl 52.9 sec
|
||||
mmx registers 27.1 sec
|
||||
Speed increase: 48%
|
||||
|
||||
This is more or less in line with the results I got from the membench benchmark
|
||||
from http://www.sciencemark.org.
|
||||
|
||||
Final conclusion: Although optimizing memcpy is useful (and feasible) for
|
||||
transfer of large blocks, the usage pattern in ReactOS consists mostly of
|
||||
small blocks. The resulting absolute speed increase doesn't justify the
|
||||
increased code complexity.
|
||||
|
||||
2005/12/03 GvG
|
Loading…
Add table
Add a link
Reference in a new issue