Speed up 16bpp fills by a factor 2

svn path=/trunk/; revision=13053
This commit is contained in:
Gé van Geldorp 2005-01-15 09:46:55 +00:00
parent cc5f2d10ce
commit 8d58a5f254

View file

@ -40,15 +40,52 @@ DIB_16BPP_GetPixel(SURFOBJ *SurfObj, LONG x, LONG y)
/*
 * Fill a horizontal run of 16bpp pixels on scanline y of SurfObj.
 *
 * SurfObj - surface to draw on; the scanline address is computed as
 *           pvScan0 + y * lDelta.
 * x1      - first pixel to fill (inclusive).
 * x2      - end of the run (exclusive); nothing is drawn when x2 <= x1.
 * y       - scanline index.
 * c       - pixel value; only the low 16 bits are used.
 *
 * Two adjacent pixels are written per 32-bit store, with a single 16-bit
 * store at either end of the run where needed.
 */
VOID
DIB_16BPP_HLine(SURFOBJ *SurfObj, LONG x1, LONG x2, LONG y, ULONG c)
{
  PDWORD addr = (PDWORD)((PWORD)((PBYTE)SurfObj->pvScan0 + y * SurfObj->lDelta) + x1);
  LONG Count = x2 - x1;

  /* An empty or reversed span must not touch memory. Without this guard the
     leading-pixel store below would still happen, and the assembly path would
     decrement Count below zero and hand a huge count to "rep stosl". */
  if (Count <= 0)
    {
      return;
    }

#ifdef _M_IX86
  /* This is about 10% faster than the generic C code below */
  __asm__ __volatile__(
    "  cld\n"
    "  andl $0xffff, %0\n"  /* If the pixel value is "abcd", put "abcdabcd" in %eax */
    "  mov  %0, %%eax\n"
    "  shl  $16, %%eax\n"
    "  or   %0, %%eax\n"
    "  test $0x02, %2\n"    /* Pixels are 2 bytes wide: bit 1 set means not on a fullword boundary */
    "  jz   1f\n"
    "  stosw\n"
    "  dec  %1\n"
    "  jz   2f\n"
    "1:\n"
    "  mov  %1, %%ecx\n"    /* Setup count of fullwords to fill */
    "  shr  $1, %%ecx\n"
    "  rep stosl\n"         /* The actual fill */
    "  test $0x01, %1\n"    /* One left to do at the right side? */
    "  jz   2f\n"
    "  stosw\n"
    "2:\n"
    /* All three operands are modified by the code above (andl, dec, stos),
       so they have to be declared read/write rather than input-only. */
    : "+r"(c), "+r"(Count), "+D"(addr)
    : /* no pure inputs */
    : "%eax", "%ecx", "memory", "cc");
#else /* _M_IX86 */
  {
    LONG cx = x1;
    WORD Pixel = (WORD)c;
    DWORD cc = ((DWORD)Pixel << 16) | Pixel;

    /* NOTE(review): alignment is derived from the parity of x1, which assumes
       every scanline starts on a 4-byte boundary - confirm for all surfaces. */
    if (0 != (cx & 0x01))
      {
        *((PWORD) addr) = Pixel;
        cx++;
        addr = (PDWORD)((PWORD)(addr) + 1);
      }

    /* Two pixels per store for the bulk of the run */
    while (cx + 1 < x2)
      {
        *addr++ = cc;
        cx += 2;
      }

    /* One pixel left over on the right side? */
    if (cx < x2)
      {
        *((PWORD) addr) = Pixel;
      }
  }
#endif /* _M_IX86 */
}
VOID