for 24bpp

1. Remove the inline asm for HLine and implement a standalone asm version of it in Intel syntax.
   (This version could be optimized further, but it will do for now.)
2. Set the eol-style property on a file I forgot.



svn path=/trunk/; revision=23732
This commit is contained in:
Magnus Olsen 2006-08-26 23:26:39 +00:00
parent c5edc7e6d3
commit 6518b5361d
6 changed files with 466 additions and 310 deletions

View file

@ -38,105 +38,7 @@ DIB_24BPP_GetPixel(SURFOBJ *SurfObj, LONG x, LONG y)
return *(PUSHORT)(addr) + (*(addr + 2) << 16);
}
/*
 * Fill a horizontal run of pixels on a 24bpp surface with colour c.
 *
 * SurfObj  surface to draw on; pvScan0 and lDelta are used to locate the row
 * x1       first column written
 * x2       column one past the last one written (x2 - x1 pixels are filled)
 * y        row to draw on
 * c        colour; only the low 24 bits are stored (low word first, then
 *          bits 16..23)
 *
 * The pixel count is computed as an unsigned x2 - x1, so x2 < x1 wraps to a
 * huge count; callers are expected to pass x1 <= x2.
 */
VOID
DIB_24BPP_HLine(SURFOBJ *SurfObj, LONG x1, LONG x2, LONG y, ULONG c)
{
  /* Three bytes per pixel: byte offset of x1 is x1 * 3 */
  PBYTE addr = (PBYTE)SurfObj->pvScan0 + y * SurfObj->lDelta + (x1 << 1) + x1;
  ULONG Count = x2 - x1;
#if !defined(_M_IX86) || defined(_MSC_VER)
  ULONG MultiCount;
  ULONG Fill[3];
#endif

  if (Count < 8)
    {
      /* For small fills, don't bother doing anything fancy */
      while (Count--)
        {
          *(PUSHORT)(addr) = c;
          addr += 2;
          *(addr) = c >> 16;
          addr += 1;
        }
    }
  else
    {
      /* Align to 4-byte address. Each pixel advances addr by 3, so this
       * takes at most 3 pixels and leaves at least 5 for the code below. */
      while (0 != ((ULONG_PTR) addr & 0x3))
        {
          *(PUSHORT)(addr) = c;
          addr += 2;
          *(addr) = c >> 16;
          addr += 1;
          Count--;
        }

      /* If the color we need to fill with is 0ABC, then the final mem pattern
       * (note little-endianness) would be:
       *
       * |C.B.A|C.B.A|C.B.A|C.B.A|   <- pixel borders
       * |C.B.A.C|B.A.C.B|A.C.B.A|   <- ULONG borders
       *
       * So, taking endianness into account again, we need to fill with these
       * ULONGs: CABC BCAB ABCA */
#if defined(_M_IX86) && !defined(_MSC_VER)
      /* This is about 30% faster than the generic C code below.
       *
       * The asm writes the surface through %edi, which is not one of the
       * declared operands, so it must carry a "memory" clobber; otherwise
       * the compiler is free to reorder or cache the pixel stores around it.
       * "cc" is listed because shifts, or and dec all change the flags. */
      __asm__ __volatile__ (
        "  movl %1, %%ecx\n"
        "  andl $0xffffff, %%ecx\n"   /* 0ABC */
        "  movl %%ecx, %%ebx\n"       /* Construct BCAB in ebx */
        "  shrl $8, %%ebx\n"
        "  movl %%ecx, %%eax\n"
        "  shll $16, %%eax\n"
        "  orl  %%eax, %%ebx\n"
        "  movl %%ecx, %%edx\n"       /* Construct ABCA in edx */
        "  shll $8, %%edx\n"
        "  movl %%ecx, %%eax\n"
        "  shrl $16, %%eax\n"
        "  orl  %%eax, %%edx\n"
        "  movl %%ecx, %%eax\n"       /* Construct CABC in eax */
        "  shll $24, %%eax\n"
        "  orl  %%ecx, %%eax\n"
        "  movl %2, %%ecx\n"          /* Load count */
        "  shr  $2, %%ecx\n"
        "  movl %3, %%edi\n"          /* Load dest */
        "0:\n"
        "  movl %%eax, (%%edi)\n"     /* Store 4 pixels, 12 bytes */
        "  movl %%ebx, 4(%%edi)\n"
        "  movl %%edx, 8(%%edi)\n"
        "  addl $12, %%edi\n"
        "  dec  %%ecx\n"
        "  jnz  0b\n"
        "  movl %%edi, %0\n"
        : "=m"(addr)
        : "m"(c), "m"(Count), "m"(addr)
        : "%eax", "%ebx", "%ecx", "%edx", "%edi", "memory", "cc");
#else
      c = c & 0xffffff;                /* 0ABC */
      Fill[0] = c | (c << 24);         /* CABC */
      Fill[1] = (c >> 8) | (c << 16);  /* BCAB */
      Fill[2] = (c << 8) | (c >> 16);  /* ABCA */
      MultiCount = Count / 4;
      do
        {
          *(PULONG)addr = Fill[0];
          addr += 4;
          *(PULONG)addr = Fill[1];
          addr += 4;
          *(PULONG)addr = Fill[2];
          addr += 4;
        }
      while (0 != --MultiCount);
#endif

      /* Up to three pixels are left over after the 4-pixel groups */
      Count = Count & 0x03;
      while (0 != Count--)
        {
          *(PUSHORT)(addr) = c;
          addr += 2;
          *(addr) = c >> 16;
          addr += 1;
        }
    }
}
VOID
DIB_24BPP_VLine(SURFOBJ *SurfObj, LONG x, LONG y1, LONG y2, ULONG c)

View file

@ -0,0 +1,89 @@
/*
* ReactOS W32 Subsystem
* Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003 ReactOS Team
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
/* $Id$ */
#include <w32k.h>
#define NDEBUG
#include <debug.h>
/*
 * Fill a horizontal run of pixels on a 24bpp surface with colour c
 * (portable C implementation).
 *
 * SurfObj  surface to draw on; pvScan0 and lDelta are used to locate the row
 * x1       first column written
 * x2       column one past the last one written (x2 - x1 pixels are filled)
 * y        row to draw on
 * c        colour; only the low 24 bits are stored (low word first, then
 *          bits 16..23)
 *
 * The pixel count is computed as an unsigned x2 - x1, so x2 < x1 wraps to a
 * huge count; callers are expected to pass x1 <= x2.
 */
VOID
DIB_24BPP_HLine(SURFOBJ *SurfObj, LONG x1, LONG x2, LONG y, ULONG c)
{
  /* Three bytes per pixel: byte offset of x1 is x1 * 3 */
  PBYTE addr = (PBYTE)SurfObj->pvScan0 + y * SurfObj->lDelta + (x1 << 1) + x1;
  ULONG Count = x2 - x1;
  ULONG MultiCount;   /* number of 4-pixel (12-byte) groups to store */
  ULONG Fill[3];      /* the three ULONGs that make up one 4-pixel group */

  if (Count < 8)
    {
      /* For small fills, don't bother doing anything fancy */
      while (Count--)
        {
          *(PUSHORT)(addr) = c;
          addr += 2;
          *(addr) = c >> 16;
          addr += 1;
        }
    }
  else
    {
      /* Align to 4-byte address. Each pixel advances addr by 3, so this
       * takes at most 3 pixels; with Count >= 8 on entry at least 5 remain,
       * which keeps MultiCount below from being zero. */
      while (0 != ((ULONG_PTR) addr & 0x3))
        {
          *(PUSHORT)(addr) = c;
          addr += 2;
          *(addr) = c >> 16;
          addr += 1;
          Count--;
        }

      /* If the color we need to fill with is 0ABC, then the final mem pattern
       * (note little-endianness) would be:
       *
       * |C.B.A|C.B.A|C.B.A|C.B.A|   <- pixel borders
       * |C.B.A.C|B.A.C.B|A.C.B.A|   <- ULONG borders
       *
       * So, taking endianness into account again, we need to fill with these
       * ULONGs: CABC BCAB ABCA */
      c = c & 0xffffff;                /* 0ABC */
      Fill[0] = c | (c << 24);         /* CABC */
      Fill[1] = (c >> 8) | (c << 16);  /* BCAB */
      Fill[2] = (c << 8) | (c >> 16);  /* ABCA */
      MultiCount = Count / 4;
      do
        {
          *(PULONG)addr = Fill[0];
          addr += 4;
          *(PULONG)addr = Fill[1];
          addr += 4;
          *(PULONG)addr = Fill[2];
          addr += 4;
        }
      while (0 != --MultiCount);

      /* Up to three pixels are left over after the 4-pixel groups */
      Count = Count & 0x03;
      while (0 != Count--)
        {
          *(PUSHORT)(addr) = c;
          addr += 2;
          *(addr) = c >> 16;
          addr += 1;
        }
    }
}

View file

@ -0,0 +1,165 @@
/*
* ReactOS W32 Subsystem
* Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003 2004, 2005, 2006 ReactOS Team
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
/* $Id$ */
/*
 * VOID DIB_24BPP_HLine(SURFOBJ *SurfObj, LONG x1, LONG x2, LONG y, ULONG c)
 *
 * Fills x2 - x1 pixels of row y, starting at column x1, with the low 24
 * bits of c on a 3-bytes-per-pixel surface.
 *
 * Syntax:  GNU as, Intel syntax, 32-bit x86.
 * In:      all arguments on the stack; the plain `ret` leaves them for the
 *          caller to remove.
 * Out:     nothing.
 * Saved:   ebx, esi, edi (pushed/popped); ebp is not touched.
 * Clobb:   eax, ecx, edx, flags.
 *
 * Register roles:
 *   edi = current destination address
 *   esi = pixels still to be written
 *   ax  = low 16 bits of c, dl = bits 16..23 of c (per-pixel loops)
 *   eax / ebx / edx = CABC / BCAB / ABCA patterns (4-pixel loop only)
 *
 * The count is compared unsigned, so x2 < x1 is treated as a huge count.
 */
.intel_syntax noprefix

/* Offsets into SURFOBJ of the fields the C version calls pvScan0 / lDelta */
.equ SURFOBJ_PVSCAN0, 32
.equ SURFOBJ_LDELTA,  36

/* Argument offsets from esp once edi, esi and ebx have been pushed
 * (3 saved registers + return address = 16 bytes) */
.equ ARG_SURFOBJ, 16
.equ ARG_X1,      20
.equ ARG_X2,      24
.equ ARG_Y,       28
.equ ARG_C,       32

.globl _DIB_24BPP_HLine
.def _DIB_24BPP_HLine;
.scl 2;
.type 32;
.endef
_DIB_24BPP_HLine:
        push    edi
        push    esi
        push    ebx

        mov     ebx, [esp+ARG_SURFOBJ]
        mov     ecx, [esp+ARG_X1]

        mov     esi, [esp+ARG_X2]
        sub     esi, ecx                        /* esi = Count = x2 - x1 */

        mov     edi, [ebx+SURFOBJ_LDELTA]
        imul    edi, dword ptr [esp+ARG_Y]      /* y * lDelta */
        add     edi, [ebx+SURFOBJ_PVSCAN0]
        lea     ecx, [ecx+ecx*2]                /* x1 * 3 bytes per pixel */
        add     edi, ecx                        /* edi = address of pixel x1 */

        mov     eax, [esp+ARG_C]                /* ax = low word of c */
        mov     edx, eax
        shr     edx, 16                         /* dl = third byte of c */

        /* For small fills (Count < 8), don't bother doing anything fancy */
        cmp     esi, 7
        jbe     hline24_pixels

        /* Align to 4-byte address, one pixel at a time */
        test    edi, 3
        jz      hline24_build_pattern
hline24_align_loop:
        mov     [edi], ax
        mov     [edi+2], dl
        add     edi, 3
        dec     esi
        test    edi, 3
        jnz     hline24_align_loop

        /*
         * If the colour is 0ABC, four pixels occupy twelve bytes
         *   |C.B.A|C.B.A|C.B.A|C.B.A|   <- pixel borders
         *   |C.B.A.C|B.A.C.B|A.C.B.A|   <- dword borders
         * so, little-endian, the three dwords to store are CABC BCAB ABCA.
         */
hline24_build_pattern:
        mov     ecx, eax
        and     ecx, 0x00FFFFFF                 /* ecx = 0ABC */

        mov     ebx, ecx                        /* ebx = BCAB */
        shr     ebx, 8
        mov     eax, ecx
        shl     eax, 16
        or      ebx, eax

        mov     edx, ecx                        /* edx = ABCA */
        shl     edx, 8
        mov     eax, ecx
        shr     eax, 16
        or      edx, eax

        mov     eax, ecx                        /* eax = CABC */
        shl     eax, 24
        or      eax, ecx

        mov     ecx, esi
        shr     ecx, 2                          /* ecx = Count / 4 groups */
hline24_quad_loop:
        mov     [edi], eax                      /* store 4 pixels, 12 bytes */
        mov     [edi+4], ebx
        mov     [edi+8], edx
        add     edi, 12
        dec     ecx
        jnz     hline24_quad_loop

        /* Count = Count & 0x03; the pattern registers are no longer needed,
         * so reload the colour for the per-pixel loop */
        and     esi, 3
        mov     eax, [esp+ARG_C]
        mov     edx, eax
        shr     edx, 16

        /* Write esi single pixels at edi (small fill or leftover pixels) */
hline24_pixels:
        test    esi, esi
        jz      hline24_done
hline24_pixel_loop:
        mov     [edi], ax
        mov     [edi+2], dl
        add     edi, 3
        dec     esi
        jnz     hline24_pixel_loop

hline24_done:
        pop     ebx
        pop     esi
        pop     edi
        ret