/*
* (C) Copyright IBM Corporation 2004
* All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* on the rights to use, copy, modify, merge, publish, distribute, sub
* license, and/or sell copies of the Software, and to permit persons to whom
* the Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
* IBM AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
* USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
/**
* \file read_rgba_span_x86.S
* Optimized routines to transfer pixel data from the framebuffer to a
* buffer in main memory.
*
* \author Ian Romanick <idr@us.ibm.com>
*/
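/* For orientation: every BGRA8888_REV -> RGBA routine below performs the
 * same per-pixel transform, which in rough C would read (an illustrative
 * sketch only -- the names are hypothetical and this is not part of the
 * build):
 *
 *	void read_span_c(const uint32_t *src, uint32_t *dst, int n)
 *	{
 *	    int i;
 *	    for (i = 0; i < n; i++) {
 *	        uint32_t p = src[i];                 // 0xAARRGGBB
 *	        dst[i] = (p & 0xff00ff00)            // G and A stay put
 *	               | ((p >> 16) & 0x000000ff)    // R moves to byte 0
 *	               | ((p << 16) & 0x00ff0000);   // B moves to byte 2
 *	    }
 *	}
 */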
.file "read_rgba_span_x86.S"
#if !defined(__DJGPP__) && !defined(__MINGW32__) && !defined(__APPLE__) /* this one cries for assyntax.h */
/* Kevin F. Quinn 2nd July 2006
* Replaced data segment constants with text-segment instructions.
*/
#define LOAD_MASK(mvins,m1,m2) \
pushl $0xff00ff00 ;\
pushl $0xff00ff00 ;\
pushl $0xff00ff00 ;\
pushl $0xff00ff00 ;\
mvins (%esp), m1 ;\
pushl $0x00ff0000 ;\
pushl $0x00ff0000 ;\
pushl $0x00ff0000 ;\
pushl $0x00ff0000 ;\
mvins (%esp), m2 ;\
addl $32, %esp
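/* After LOAD_MASK, m1 holds 0xff00ff00 in every dword (the G and A bytes,
 * which stay in place during the R/B swap) and m2 holds 0x00ff0000 (the
 * byte lane that R and B pass through). Pushing the constants and loading
 * them back keeps everything in the text segment.
 */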
/* I implemented these as macros because they appear in several places,
* and I've tweaked them a number of times. I got tired of changing every
* place they appear. :)
*/
#define DO_ONE_PIXEL() \
movl (%ebx), %eax ; \
addl $4, %ebx ; \
bswap %eax /* ARGB -> BGRA */ ; \
rorl $8, %eax /* BGRA -> ABGR */ ; \
movl %eax, (%ecx) /* ABGR -> R, G, B, A */ ; \
addl $4, %ecx
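/* The bswap/rorl pair is a two-instruction byte shuffle: from
 * p = 0xAARRGGBB, bswap yields 0xBBGGRRAA and rorl $8 yields 0xAABBGGRR,
 * i.e. R, G, B, A in memory order. Roughly, in C (illustrative only,
 * with a hypothetical bswap32() helper):
 *
 *	dst = (bswap32(p) >> 8) | (bswap32(p) << 24);
 */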
#define DO_ONE_LAST_PIXEL() \
movl (%ebx), %eax ; \
bswap %eax /* ARGB -> BGRA */ ; \
rorl $8, %eax /* BGRA -> ABGR */ ; \
movl %eax, (%ecx) /* ABGR -> R, G, B, A */ ;
/**
* MMX optimized version of the BGRA8888_REV to RGBA copy routine.
*
* \warning
* This function assumes that the caller will issue the EMMS instruction
* at the correct places.
*/
.globl _generic_read_RGBA_span_BGRA8888_REV_MMX
#ifndef USE_DRICORE
.hidden _generic_read_RGBA_span_BGRA8888_REV_MMX
#endif
.type _generic_read_RGBA_span_BGRA8888_REV_MMX, @function
_generic_read_RGBA_span_BGRA8888_REV_MMX:
pushl %ebx
#ifdef USE_INNER_EMMS
emms
#endif
LOAD_MASK(movq,%mm1,%mm2)
movl 8(%esp), %ebx /* source pointer */
movl 16(%esp), %edx /* number of pixels to copy */
movl 12(%esp), %ecx /* destination pointer */
testl %edx, %edx
jle .L20 /* Bail if there's nothing to do. */
movl %ebx, %eax
negl %eax
sarl $2, %eax
andl $1, %eax
je .L17
subl %eax, %edx
DO_ONE_PIXEL()
.L17:
/* Would it be faster to unroll this loop once and process 4 pixels
* per pass, instead of just two?
*/
movl %edx, %eax
shrl %eax
jmp .L18
.L19:
movq (%ebx), %mm0
addl $8, %ebx
/* These 9 instructions do what PSHUFB (if there were such an
* instruction) could do in 1. :(
*/
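/* Per 32-bit pixel, the net effect is (illustrative C):
 *
 *	out = (p & 0xff00ff00)           // pand %mm1: G, A stay put
 *	    | ((p & 0x00ff0000) >> 16)   // pand %mm2 + psrlq: R to byte 0
 *	    | ((p << 16) & 0x00ff0000);  // psllq + pand %mm2: B to byte 2
 */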
movq %mm0, %mm3
movq %mm0, %mm4
pand %mm2, %mm3
psllq $16, %mm4
psrlq $16, %mm3
pand %mm2, %mm4
pand %mm1, %mm0
por %mm4, %mm3
por %mm3, %mm0
movq %mm0, (%ecx)
addl $8, %ecx
subl $1, %eax
.L18:
jne .L19
#ifdef USE_INNER_EMMS
emms
#endif
/* At this point there are either 1 or 0 pixels remaining to be
* converted. Convert the last pixel, if needed.
*/
testl $1, %edx
je .L20
DO_ONE_LAST_PIXEL()
.L20:
popl %ebx
ret
.size _generic_read_RGBA_span_BGRA8888_REV_MMX, .-_generic_read_RGBA_span_BGRA8888_REV_MMX
/**
* SSE optimized version of the BGRA8888_REV to RGBA copy routine. SSE
* instructions are only actually used to read data from the framebuffer.
* In practice, the speed-up is pretty small.
*
* \todo
* Do some more testing and determine if there's any reason to have this
* function in addition to the MMX version.
*
* \warning
* This function assumes that the caller will issue the EMMS instruction
* at the correct places.
*/
.globl _generic_read_RGBA_span_BGRA8888_REV_SSE
#ifndef USE_DRICORE
.hidden _generic_read_RGBA_span_BGRA8888_REV_SSE
#endif
.type _generic_read_RGBA_span_BGRA8888_REV_SSE, @function
_generic_read_RGBA_span_BGRA8888_REV_SSE:
pushl %esi
pushl %ebx
pushl %ebp
#ifdef USE_INNER_EMMS
emms
#endif
LOAD_MASK(movq,%mm1,%mm2)
movl 16(%esp), %ebx /* source pointer */
movl 24(%esp), %edx /* number of pixels to copy */
movl 20(%esp), %ecx /* destination pointer */
testl %edx, %edx
jle .L35 /* Bail if there's nothing to do. */
movl %esp, %ebp
subl $16, %esp
andl $0xfffffff0, %esp
movl %ebx, %eax
movl %edx, %esi
negl %eax
andl $15, %eax
sarl $2, %eax
cmpl %edx, %eax
cmovle %eax, %esi
subl %esi, %edx
testl $1, %esi
je .L32
DO_ONE_PIXEL()
.L32:
testl $2, %esi
je .L31
movq (%ebx), %mm0
addl $8, %ebx
movq %mm0, %mm3
movq %mm0, %mm4
pand %mm2, %mm3
psllq $16, %mm4
psrlq $16, %mm3
pand %mm2, %mm4
pand %mm1, %mm0
por %mm4, %mm3
por %mm3, %mm0
movq %mm0, (%ecx)
addl $8, %ecx
.L31:
movl %edx, %eax
shrl $2, %eax
jmp .L33
.L34:
movaps (%ebx), %xmm0
addl $16, %ebx
/* This would be so much better if we could just move directly from
* an SSE register to an MMX register. Unfortunately, that
* functionality wasn't introduced until SSE2 with the MOVDQ2Q
* instruction.
*/
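/* Instead, bounce the 16 bytes through the 16-byte-aligned scratch area
 * carved out of the stack above: movaps spills the XMM register, and the
 * two movq loads pick the halves back up into MMX registers.
 */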
movaps %xmm0, (%esp)
movq (%esp), %mm0
movq 8(%esp), %mm5
movq %mm0, %mm3
movq %mm0, %mm4
movq %mm5, %mm6
movq %mm5, %mm7
pand %mm2, %mm3
pand %mm2, %mm6
psllq $16, %mm4
psllq $16, %mm7
psrlq $16, %mm3
psrlq $16, %mm6
pand %mm2, %mm4
pand %mm2, %mm7
pand %mm1, %mm0
pand %mm1, %mm5
por %mm4, %mm3
por %mm7, %mm6
por %mm3, %mm0
por %mm6, %mm5
movq %mm0, (%ecx)
movq %mm5, 8(%ecx)
addl $16, %ecx
subl $1, %eax
.L33:
jne .L34
#ifdef USE_INNER_EMMS
emms
#endif
movl %ebp, %esp
/* At this point there are either [0, 3] pixels remaining to be
* converted.
*/
testl $2, %edx
je .L36
movq (%ebx), %mm0
addl $8, %ebx
movq %mm0, %mm3
movq %mm0, %mm4
pand %mm2, %mm3
psllq $16, %mm4
psrlq $16, %mm3
pand %mm2, %mm4
pand %mm1, %mm0
por %mm4, %mm3
por %mm3, %mm0
movq %mm0, (%ecx)
addl $8, %ecx
.L36:
testl $1, %edx
je .L35
DO_ONE_LAST_PIXEL()
.L35:
popl %ebp
popl %ebx
popl %esi
ret
.size _generic_read_RGBA_span_BGRA8888_REV_SSE, .-_generic_read_RGBA_span_BGRA8888_REV_SSE
/**
* SSE2 optimized version of the BGRA8888_REV to RGBA copy routine.
*/
.text
.globl _generic_read_RGBA_span_BGRA8888_REV_SSE2
#ifndef USE_DRICORE
.hidden _generic_read_RGBA_span_BGRA8888_REV_SSE2
#endif
.type _generic_read_RGBA_span_BGRA8888_REV_SSE2, @function
_generic_read_RGBA_span_BGRA8888_REV_SSE2:
pushl %esi
pushl %ebx
LOAD_MASK(movdqu,%xmm1,%xmm2)
movl 12(%esp), %ebx /* source pointer */
movl 20(%esp), %edx /* number of pixels to copy */
movl 16(%esp), %ecx /* destination pointer */
movl %ebx, %eax
movl %edx, %esi
testl %edx, %edx
jle .L46 /* Bail if there's nothing to do. */
/* If the source pointer isn't a multiple of 16 we have to process
* a few pixels the "slow" way to get the address aligned for
* the SSE fetch instructions.
*/
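/* In C terms, the next few instructions compute (illustrative only):
 *
 *	pixels_to_align = ((-(uintptr_t) src) & 15) >> 2;
 *	n_unaligned = MIN(pixels_to_align, n);
 *
 * i.e. the number of 4-byte pixels between src and the next 16-byte
 * boundary, clamped to the total count.
 */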
negl %eax
andl $15, %eax
sarl $2, %eax
cmpl %edx, %eax
cmovbe %eax, %esi
subl %esi, %edx
testl $1, %esi
je .L41
DO_ONE_PIXEL()
.L41:
testl $2, %esi
je .L40
movq (%ebx), %xmm0
addl $8, %ebx
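/* This is the same R/B swap as the MMX loops, done in XMM registers:
 * pslldq/psrldq shift by bytes, so the $2 below is the 16-bit shift the
 * MMX code expresses as psllq/psrlq $16.
 */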
movdqa %xmm0, %xmm3
movdqa %xmm0, %xmm4
andps %xmm1, %xmm0
andps %xmm2, %xmm3
pslldq $2, %xmm4
psrldq $2, %xmm3
andps %xmm2, %xmm4
orps %xmm4, %xmm3
orps %xmm3, %xmm0
movq %xmm0, (%ecx)
addl $8, %ecx
.L40:
/* Would it be worth having a specialized version of this loop for
* the case where the destination is 16-byte aligned? That version
* would be identical except that it could use movdqa instead of
* movdqu.
*/
movl %edx, %eax
shrl $2, %eax
jmp .L42
.L43:
movdqa (%ebx), %xmm0
addl $16, %ebx
movdqa %xmm0, %xmm3
movdqa %xmm0, %xmm4
andps %xmm1, %xmm0
andps %xmm2, %xmm3
pslldq $2, %xmm4
psrldq $2, %xmm3
andps %xmm2, %xmm4
orps %xmm4, %xmm3
orps %xmm3, %xmm0
movdqu %xmm0, (%ecx)
addl $16, %ecx
subl $1, %eax
.L42:
jne .L43
/* There may be up to 3 pixels remaining to be copied. Take care
* of them now. We do the 2 pixel case first because the data
* will be aligned.
*/
testl $2, %edx
je .L47
movq (%ebx), %xmm0
addl $8, %ebx
movdqa %xmm0, %xmm3
movdqa %xmm0, %xmm4
andps %xmm1, %xmm0
andps %xmm2, %xmm3
pslldq $2, %xmm4
psrldq $2, %xmm3
andps %xmm2, %xmm4
orps %xmm4, %xmm3
orps %xmm3, %xmm0
movq %xmm0, (%ecx)
addl $8, %ecx
.L47:
testl $1, %edx
je .L46
DO_ONE_LAST_PIXEL()
.L46:
popl %ebx
popl %esi
ret
.size _generic_read_RGBA_span_BGRA8888_REV_SSE2, .-_generic_read_RGBA_span_BGRA8888_REV_SSE2
#define MASK_565_L 0x07e0f800
#define MASK_565_H 0x0000001f
/* Setting SCALE_ADJUST to 5 gives a perfect match with the
* classic C implementation in Mesa. Setting SCALE_ADJUST
* to 0 is slightly faster but at a small cost to accuracy.
*/
#define SCALE_ADJUST 5
#if SCALE_ADJUST == 5
#define PRESCALE_L 0x00100001
#define PRESCALE_H 0x00000200
#define SCALE_L 0x40C620E8
#define SCALE_H 0x0000839d
#elif SCALE_ADJUST == 0
#define PRESCALE_L 0x00200001
#define PRESCALE_H 0x00000800
#define SCALE_L 0x01040108
#define SCALE_H 0x00000108
#else
#error SCALE_ADJUST must either be 5 or 0.
#endif
#define ALPHA_L 0x00000000
#define ALPHA_H 0x00ff0000
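/* Worked example for SCALE_ADJUST == 5, red channel (word 0 of each MMX
 * register): after masking, a 5-bit red value r sits at r << 11. The
 * pmullw by PRESCALE (1 for red) and psrlw $5 leave r << 6, then pmulhuw
 * by SCALE (0x20E8) computes ((r << 6) * 0x20E8) >> 16. For r = 31 that
 * is (1984 * 8424) >> 16 = 255, so the 5-bit range maps onto [0, 255].
 */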
/**
* MMX optimized version of the RGB565 to RGBA copy routine.
*/
.text
.globl _generic_read_RGBA_span_RGB565_MMX
#ifndef USE_DRICORE
.hidden _generic_read_RGBA_span_RGB565_MMX
#endif
.type _generic_read_RGBA_span_RGB565_MMX, @function
_generic_read_RGBA_span_RGB565_MMX:
#ifdef USE_INNER_EMMS
emms
#endif
movl 4(%esp), %eax /* source pointer */
movl 8(%esp), %edx /* destination pointer */
movl 12(%esp), %ecx /* number of pixels to copy */
pushl $MASK_565_H
pushl $MASK_565_L
movq (%esp), %mm5
pushl $PRESCALE_H
pushl $PRESCALE_L
movq (%esp), %mm6
pushl $SCALE_H
pushl $SCALE_L
movq (%esp), %mm7
pushl $ALPHA_H
pushl $ALPHA_L
movq (%esp), %mm3
addl $32,%esp
sarl $2, %ecx
jl .L01 /* Bail early if the count is negative. */
jmp .L02
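/* The loop is entered at .L02 so that the flags still set by the sarl
 * above decide whether the body runs at all; each pass through .L03
 * converts four pixels.
 */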
.L03:
/* Fetch 4 RGB565 pixels into %mm4. Distribute the first and
* second pixels into the four words of %mm0 and %mm2.
*/
movq (%eax), %mm4
addl $8, %eax
pshufw $0x00, %mm4, %mm0
pshufw $0x55, %mm4, %mm2
/* Mask the pixels so that each word of each register contains only
* one color component.
*/
pand %mm5, %mm0
pand %mm5, %mm2
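/* Each register now holds one pixel with one component per 16-bit word:
 * word 0 = red (bits 15-11), word 1 = green (bits 10-5), word 2 = blue
 * (bits 4-0), word 3 = 0 (filled with 0xff alpha later via %mm3).
 */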
/* Adjust the component values so that they are as small as possible,
* but large enough so that we can multiply them by an unsigned 16-bit
* number and get a value as large as 0x00ff0000.
*/
pmullw %mm6, %mm0
pmullw %mm6, %mm2
#if SCALE_ADJUST > 0
psrlw $SCALE_ADJUST, %mm0
psrlw $SCALE_ADJUST, %mm2
#endif
/* Scale the input component values to be on the range
* [0, 0x00ff0000]. This is the real magic of the whole routine.
*/
pmulhuw %mm7, %mm0
pmulhuw %mm7, %mm2
/* Always set the alpha value to 0xff.
*/
por %mm3, %mm0
por %mm3, %mm2
/* Pack the 16-bit values to 8-bit values and store the converted
* pixel data.
*/
packuswb %mm2, %mm0
movq %mm0, (%edx)
addl $8, %edx
pshufw $0xaa, %mm4, %mm0
pshufw $0xff, %mm4, %mm2
pand %mm5, %mm0
pand %mm5, %mm2
pmullw %mm6, %mm0
pmullw %mm6, %mm2
#if SCALE_ADJUST > 0
psrlw $SCALE_ADJUST, %mm0
psrlw $SCALE_ADJUST, %mm2
#endif
pmulhuw %mm7, %mm0
pmulhuw %mm7, %mm2
por %mm3, %mm0
por %mm3, %mm2
packuswb %mm2, %mm0
movq %mm0, (%edx)
addl $8, %edx
subl $1, %ecx
.L02:
jne .L03
/* At this point there can be at most 3 pixels left to process. If
* there is either 2 or 3 left, process 2.
*/
movl 12(%esp), %ecx
testl $0x02, %ecx
je .L04
movd (%eax), %mm4
addl $4, %eax
pshufw $0x00, %mm4, %mm0
pshufw $0x55, %mm4, %mm2
pand %mm5, %mm0
pand %mm5, %mm2
pmullw %mm6, %mm0
pmullw %mm6, %mm2
#if SCALE_ADJUST > 0
psrlw $SCALE_ADJUST, %mm0
psrlw $SCALE_ADJUST, %mm2
#endif
pmulhuw %mm7, %mm0
pmulhuw %mm7, %mm2
por %mm3, %mm0
por %mm3, %mm2
packuswb %mm2, %mm0
movq %mm0, (%edx)
addl $8, %edx
.L04:
/* At this point there can be at most 1 pixel left to process.
* Process it if needed.
*/
testl $0x01, %ecx
je .L01
movzwl (%eax), %ecx
movd %ecx, %mm4
pshufw $0x00, %mm4, %mm0
pand %mm5, %mm0
pmullw %mm6, %mm0
#if SCALE_ADJUST > 0
psrlw $SCALE_ADJUST, %mm0
#endif
pmulhuw %mm7, %mm0
por %mm3, %mm0
packuswb %mm0, %mm0
movd %mm0, (%edx)
.L01:
#ifdef USE_INNER_EMMS
emms
#endif
ret
.size _generic_read_RGBA_span_RGB565_MMX, .-_generic_read_RGBA_span_RGB565_MMX
#endif /* !defined(__DJGPP__) && !defined(__MINGW32__) && !defined(__APPLE__) */
#if defined (__ELF__) && defined (__linux__)
.section .note.GNU-stack,"",%progbits
#endif