mirror of
https://github.com/reactos/reactos.git
synced 2024-11-04 05:43:30 +00:00
527f2f9057
* Create a branch for some evul shell experiments. svn path=/branches/shell-experiments/; revision=61927
483 lines
14 KiB
ArmAsm
483 lines
14 KiB
ArmAsm
/*
|
|
* Mesa 3-D graphics library
|
|
* Version: 7.1
|
|
*
|
|
* Copyright (C) 1999-2007 Brian Paul All Rights Reserved.
|
|
*
|
|
* Permission is hereby granted, free of charge, to any person obtaining a
|
|
* copy of this software and associated documentation files (the "Software"),
|
|
* to deal in the Software without restriction, including without limitation
|
|
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
|
* and/or sell copies of the Software, and to permit persons to whom the
|
|
* Software is furnished to do so, subject to the following conditions:
|
|
*
|
|
* The above copyright notice and this permission notice shall be included
|
|
* in all copies or substantial portions of the Software.
|
|
*
|
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
|
|
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
|
* BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
|
|
* AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
|
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
|
*/
|
|
|
|
#ifdef USE_X86_64_ASM
|
|
|
|
#include "matypes.h"
|
|
|
|
.text
|
|
|
|
/*
 * _mesa_x86_64_cpuid
 *
 * In:  rdi = pointer to four consecutive 32-bit words.
 *            Word 0 is loaded into EAX and word 2 into ECX before the
 *            CPUID instruction is executed.
 * Out: the four words are overwritten with EAX, EBX, ECX and EDX
 *      (in that order) as left by CPUID.
 *
 * CPUID overwrites EBX, so RBX is pushed on entry and popped before
 * returning; the caller sees it unchanged.
 */
	.align 16
	.globl _mesa_x86_64_cpuid
	.hidden _mesa_x86_64_cpuid
_mesa_x86_64_cpuid:
	pushq	%rbx

	/* Inputs for CPUID come from the caller's buffer. */
	movl	8(%rdi), %ecx
	movl	(%rdi), %eax

	cpuid

	/* Hand all four result registers back through the same buffer. */
	movl	%eax, (%rdi)
	movl	%ebx, 4(%rdi)
	movl	%ecx, 8(%rdi)
	movl	%edx, 12(%rdi)

	popq	%rbx
	ret
|
|
|
|
/*
 * _mesa_x86_64_transform_points4_general
 *
 * In:  rdi = destination vector descriptor
 *      rsi = matrix: 16 floats m0..m15, loaded with movaps and therefore
 *            required to be 16-byte aligned
 *      rdx = source vector descriptor
 *
 * The destination's count is copied from the source, its size is set
 * to 4 and VEC_SIZE_4 is OR-ed into its flags.  Then, for every source
 * vertex (x0, x1, x2, x3), the routine stores
 *
 *      r[i] = x0*m[i] + x1*m[4+i] + x2*m[8+i] + x3*m[12+i]    (i = 0..3)
 *
 * into the next 16-byte destination slot.  Source vertices are read
 * with movups and are 'stride' bytes apart; destination vertices are
 * written with movaps and are 16 bytes apart.
 *
 * Clobbers: rax, rcx, rdx, rdi, xmm0-xmm8, flags.
 */
	.align 16
	.globl _mesa_x86_64_transform_points4_general
	.hidden _mesa_x86_64_transform_points4_general
_mesa_x86_64_transform_points4_general:
	movl	V4F_COUNT(%rdx), %ecx		/* ecx = vertices to process */
	movzbl	V4F_STRIDE(%rdx), %eax		/* rax = source stride (one byte, zero-extended) */

	movl	%ecx, V4F_COUNT(%rdi)		/* dest count = source count */
	movl	$4, V4F_SIZE(%rdi)		/* dest size = 4 */
	.byte	0x66, 0x66, 0x66, 0x90		/* 4 bytes of padding (prefixed nop) */
	orl	$VEC_SIZE_4, V4F_FLAGS(%rdi)	/* mark dest flags */

	testl	%ecx, %ecx			/* nothing to do for an empty source */
	prefetchnta 64(%rsi)			/* does not touch the flags set above */
	jz	.Lp4_general_done

	movq	V4F_START(%rdi), %rdi		/* rdi = first destination vertex */
	movq	V4F_START(%rdx), %rdx		/* rdx = first source vertex */

	prefetch 16(%rdx)

	/* Keep the whole matrix in registers for the duration of the loop. */
	movaps	0(%rsi), %xmm4			/* xmm4 = m0  m1  m2  m3  (low to high) */
	movaps	16(%rsi), %xmm5			/* xmm5 = m4  m5  m6  m7  */
	.byte	0x66, 0x66, 0x90		/* 3 bytes of padding (prefixed nop) */
	movaps	32(%rsi), %xmm6			/* xmm6 = m8  m9  m10 m11 */
	movaps	48(%rsi), %xmm7			/* xmm7 = m12 m13 m14 m15 */

.Lp4_general_loop:

	movups	(%rdx), %xmm8			/* xmm8 = x0 x1 x2 x3 (low to high) */
	prefetchw 16(%rdi)

	pshufd	$0x00, %xmm8, %xmm0		/* xmm0 = x0 in every lane */
	addq	%rax, %rdx			/* step to the next source vertex */
	pshufd	$0x55, %xmm8, %xmm1		/* xmm1 = x1 in every lane */
	mulps	%xmm4, %xmm0			/* xmm0 = x0 * (m0..m3) */
	pshufd	$0xAA, %xmm8, %xmm2		/* xmm2 = x2 in every lane */
	mulps	%xmm5, %xmm1			/* xmm1 = x1 * (m4..m7) */
	pshufd	$0xFF, %xmm8, %xmm3		/* xmm3 = x3 in every lane */
	mulps	%xmm6, %xmm2			/* xmm2 = x2 * (m8..m11) */
	addps	%xmm1, %xmm0			/* accumulate the x1 term */
	mulps	%xmm7, %xmm3			/* xmm3 = x3 * (m12..m15) */
	addps	%xmm2, %xmm0			/* accumulate the x2 term */
	prefetch 16(%rdx)
	addps	%xmm3, %xmm0			/* accumulate the x3 term: xmm0 = r0..r3 */

	movaps	%xmm0, (%rdi)			/* store the transformed vertex */
	addq	$16, %rdi

	decl	%ecx
	jnz	.Lp4_general_loop

.Lp4_general_done:
	.byte	0xf3				/* together with the ret below: 'rep ret' */
	ret
|
|
|
|
/*
 * p4_constants: two 16-byte constants, 16-byte aligned so that they can
 * be loaded with movaps.
 *
 *   +0   bit mask: all ones in the three low dwords, zero in the top one
 *   +16  the floats 0.0, 0.0, 0.0, 1.0 (low to high)
 */
	.section .rodata

	.align 16
p4_constants:
	.long	0xffffffff, 0xffffffff, 0xffffffff, 0x00000000

	.long	0x00000000, 0x00000000, 0x00000000
	.float	1.0
|
|
|
|
/*
 * _mesa_x86_64_transform_points4_3d
 *
 * In:  rdi = destination vector descriptor
 *      rsi = matrix: 16 floats m0..m15, loaded with movaps (16-byte aligned)
 *      rdx = source vector descriptor
 *
 * Same computation as the general 4x4 routine, except that the matrix
 * copy held in registers is first patched using p4_constants: m3, m7
 * and m11 are forced to 0.0 and m15 to 1.0.  The matrix in memory is
 * not modified.  With the patched matrix each vertex (x0, x1, x2, x3)
 * yields
 *
 *      r[i] = x0*m[i] + x1*m[4+i] + x2*m[8+i] + x3*m[12+i]    (i = 0..2)
 *      r[3] = x0*0.0  + x1*0.0    + x2*0.0    + x3*1.0
 *
 * The original author notes that this is slower than
 * _mesa_x86_64_transform_points4_general because of that extra patching.
 *
 * Clobbers: rax, rcx, rdx, rdi, xmm0-xmm10, flags.
 */
	.text
	.align 16
	.globl _mesa_x86_64_transform_points4_3d
	.hidden _mesa_x86_64_transform_points4_3d
_mesa_x86_64_transform_points4_3d:

	leaq	p4_constants(%rip), %rax	/* rax is free until the stride is loaded */

	prefetchnta 64(%rsi)

	movaps	(%rax), %xmm9			/* xmm9  = mask keeping lanes 0-2 */
	movaps	16(%rax), %xmm10		/* xmm10 = 0.0 0.0 0.0 1.0 */

	movl	V4F_COUNT(%rdx), %ecx		/* ecx = vertices to process */
	movzbl	V4F_STRIDE(%rdx), %eax		/* rax = source stride (one byte, zero-extended) */

	movl	%ecx, V4F_COUNT(%rdi)		/* dest count = source count */
	movl	$4, V4F_SIZE(%rdi)		/* dest size = 4 */
	orl	$VEC_SIZE_4, V4F_FLAGS(%rdi)	/* mark dest flags */

	testl	%ecx, %ecx			/* nothing to do for an empty source */
	jz	.Lp4_3d_done

	movq	V4F_START(%rdi), %rdi		/* rdi = first destination vertex */
	movq	V4F_START(%rdx), %rdx		/* rdx = first source vertex */

	prefetch 16(%rdx)

	/* Load the matrix and clear the top lane of each 4-float group. */
	movaps	0(%rsi), %xmm4			/* xmm4 = m0  m1  m2  m3  */
	movaps	16(%rsi), %xmm5			/* xmm5 = m4  m5  m6  m7  */
	andps	%xmm9, %xmm4			/* xmm4 = m0  m1  m2  0.0 */
	movaps	32(%rsi), %xmm6			/* xmm6 = m8  m9  m10 m11 */
	andps	%xmm9, %xmm5			/* xmm5 = m4  m5  m6  0.0 */
	movaps	48(%rsi), %xmm7			/* xmm7 = m12 m13 m14 m15 */
	andps	%xmm9, %xmm6			/* xmm6 = m8  m9  m10 0.0 */
	andps	%xmm9, %xmm7			/* xmm7 = m12 m13 m14 0.0 */
	.byte	0x66, 0x66, 0x90		/* 3 bytes of padding (prefixed nop) */
	orps	%xmm10, %xmm7			/* xmm7 = m12 m13 m14 1.0 */

.Lp4_3d_loop:

	movups	(%rdx), %xmm8			/* xmm8 = x0 x1 x2 x3 (low to high) */
	prefetchw 16(%rdi)

	pshufd	$0x00, %xmm8, %xmm0		/* xmm0 = x0 in every lane */
	addq	%rax, %rdx			/* step to the next source vertex */
	pshufd	$0x55, %xmm8, %xmm1		/* xmm1 = x1 in every lane */
	mulps	%xmm4, %xmm0			/* xmm0 = x0 * first group */
	pshufd	$0xAA, %xmm8, %xmm2		/* xmm2 = x2 in every lane */
	mulps	%xmm5, %xmm1			/* xmm1 = x1 * second group */
	pshufd	$0xFF, %xmm8, %xmm3		/* xmm3 = x3 in every lane */
	mulps	%xmm6, %xmm2			/* xmm2 = x2 * third group */
	addps	%xmm1, %xmm0			/* accumulate the x1 term */
	mulps	%xmm7, %xmm3			/* xmm3 = x3 * fourth group */
	addps	%xmm2, %xmm0			/* accumulate the x2 term */
	prefetch 16(%rdx)
	addps	%xmm3, %xmm0			/* accumulate the x3 term: xmm0 = r0..r3 */

	movaps	%xmm0, (%rdi)			/* store the transformed vertex */
	addq	$16, %rdi

	decl	%ecx
	jnz	.Lp4_3d_loop

.Lp4_3d_done:
	.byte	0xf3				/* together with the ret below: 'rep ret' */
	ret
|
|
|
|
|
|
/*
 * _mesa_x86_64_transform_points4_identity
 *
 * In:  rdi = destination vector descriptor
 *      rsi = matrix (not read; the register is reused as the source cursor)
 *      rdx = source vector descriptor
 *
 * The destination's count is copied from the source, its size is set
 * to 4 and VEC_SIZE_4 is OR-ed into its flags.  Every 16-byte source
 * vertex is then copied unchanged to the next 16-byte destination slot.
 *
 * Source vertices are 'stride' bytes apart, exactly as in the other
 * transform routines in this file, so the copy is done one vertex at a
 * time.  A single block copy of count*16 bytes would only be correct
 * for a tightly packed source (stride == 16).
 *
 * Neither pointer needs to be 16-byte aligned: both the load and the
 * store use movups.
 *
 * Clobbers: rax, rcx, rsi, rdi, xmm0, flags.
 */
	.align 16
	.globl _mesa_x86_64_transform_points4_identity
	.hidden _mesa_x86_64_transform_points4_identity
_mesa_x86_64_transform_points4_identity:

	movl	V4F_COUNT(%rdx), %ecx		/* ecx = vertices to copy */
	movzbl	V4F_STRIDE(%rdx), %eax		/* rax = source stride (one byte, zero-extended) */

	movl	%ecx, V4F_COUNT(%rdi)		/* dest count = source count */
	movl	$4, V4F_SIZE(%rdi)		/* dest size = 4 */
	orl	$VEC_SIZE_4, V4F_FLAGS(%rdi)	/* mark dest flags */

	testl	%ecx, %ecx			/* nothing to do for an empty source */
	jz	p4_identity_done

	movq	V4F_START(%rdx), %rsi		/* rsi = first source vertex */
	movq	V4F_START(%rdi), %rdi		/* rdi = first destination vertex */
	prefetch 64(%rsi)
	prefetchw 64(%rdi)

p4_identity_loop:

	movups	(%rsi), %xmm0			/* fetch one vertex (4 floats) */
	addq	%rax, %rsi			/* advance source by its stride */
	movups	%xmm0, (%rdi)			/* destination is tightly packed */
	addq	$16, %rdi

	decl	%ecx
	jnz	p4_identity_loop

p4_identity_done:
	.byte	0xf3				/* together with the ret below: 'rep ret' */
	ret
|
|
|
|
|
|
/*
 * _mesa_3dnow_transform_points4_3d_no_rot
 *
 * In:  rdi = destination vector descriptor
 *      rsi = matrix of floats; below, mRC names the float at byte
 *            offset 16*R + 4*C
 *      rdx = source vector descriptor
 *
 * The destination's count is copied from the source, its size is set
 * to 4 and VEC_SIZE_4 is OR-ed into its flags.  For every source vertex
 * (x0, x1, x2, x3) the routine stores
 *
 *      r0 = x0*m00 + x3*m30
 *      r1 = x1*m11 + x3*m31
 *      r2 = x2*m22 + x3*m32
 *      r3 = x3
 *
 * Source vertices are 'stride' bytes apart, destination vertices 16.
 * Uses MMX/3DNow! registers; femms is executed before returning, also
 * on the empty-input path.
 *
 * Clobbers: rax, rcx, rdx, rdi, mm0-mm2, mm4-mm7, flags.
 */
	.align 16
	.globl _mesa_3dnow_transform_points4_3d_no_rot
	.hidden _mesa_3dnow_transform_points4_3d_no_rot
_mesa_3dnow_transform_points4_3d_no_rot:

	movl	V4F_COUNT(%rdx), %ecx		/* ecx = vertices to process */
	movzbl	V4F_STRIDE(%rdx), %eax		/* rax = source stride (one byte, zero-extended) */

	movl	%ecx, V4F_COUNT(%rdi)		/* dest count = source count */
	movl	$4, V4F_SIZE(%rdi)		/* dest size = 4 */
	.byte	0x66, 0x66, 0x90		/* 3 bytes of padding (prefixed nop) */
	orl	$VEC_SIZE_4, V4F_FLAGS(%rdi)	/* mark dest flags */

	testl	%ecx, %ecx			/* nothing to do for an empty source */
	.byte	0x66, 0x66, 0x90		/* 3 bytes of padding; leaves flags alone */
	jz	.Lp4_3d_no_rot_done

	movq	V4F_START(%rdi), %rdi		/* rdi = first destination vertex */
	movq	V4F_START(%rdx), %rdx		/* rdx = first source vertex */

	prefetch (%rdx)

	/* Gather the six matrix entries used; each register is (low, high). */
	movd	(%rsi), %mm0			/* mm0 = (m00, 0)   */
	.byte	0x66, 0x66, 0x90		/* 3 bytes of padding (prefixed nop) */
	punpckldq 20(%rsi), %mm0		/* mm0 = (m00, m11) */

	movd	40(%rsi), %mm2			/* mm2 = (m22, 0)   */
	movq	48(%rsi), %mm1			/* mm1 = (m30, m31) */

	punpckldq 56(%rsi), %mm2		/* mm2 = (m22, m32) */

.Lp4_3d_no_rot_loop:

	prefetchw 32(%rdi)

	movq	(%rdx), %mm4			/* mm4 = (x0, x1) */
	movq	8(%rdx), %mm5			/* mm5 = (x2, x3) */
	movd	12(%rdx), %mm7			/* mm7 = (x3, 0)  */

	movq	%mm5, %mm6			/* mm6 = (x2, x3) */
	pfmul	%mm0, %mm4			/* mm4 = (x0*m00, x1*m11) */

	punpckhdq %mm6, %mm6			/* mm6 = (x3, x3) */
	pfmul	%mm2, %mm5			/* mm5 = (x2*m22, x3*m32) */

	pfmul	%mm1, %mm6			/* mm6 = (x3*m30, x3*m31) */
	pfacc	%mm7, %mm5			/* mm5 = (x2*m22 + x3*m32, x3 + 0) */

	pfadd	%mm6, %mm4			/* mm4 = (x0*m00 + x3*m30, x1*m11 + x3*m31) */

	addq	%rax, %rdx			/* step to the next source vertex */
	movq	%mm4, (%rdi)			/* store r0, r1 */
	movq	%mm5, 8(%rdi)			/* store r2, r3 */

	addq	$16, %rdi

	decl	%ecx
	prefetch 32(%rdx)			/* does not disturb the flags from decl */
	jnz	.Lp4_3d_no_rot_loop

.Lp4_3d_no_rot_done:
	femms					/* leave MMX/3DNow! state */
	ret
|
|
|
|
|
|
/*
 * _mesa_3dnow_transform_points4_perspective
 *
 * In:  rdi = destination vector descriptor
 *      rsi = matrix of floats; below, mRC names the float at byte
 *            offset 16*R + 4*C
 *      rdx = source vector descriptor
 *
 * The destination's count is copied from the source, its size is set
 * to 4 and VEC_SIZE_4 is OR-ed into its flags.  For every source vertex
 * (x0, x1, x2, x3) the routine stores
 *
 *      r0 = x0*m00 + x2*m20
 *      r1 = x1*m11 + x2*m21
 *      r2 = x2*m22 + x3*m32
 *      r3 = -x2
 *
 * Source vertices are 'stride' bytes apart, destination vertices 16.
 * Uses MMX/3DNow! registers; femms is executed before returning, also
 * on the empty-input path.
 *
 * Clobbers: rax, rcx, rdx, rdi, mm0-mm7, flags.
 */
	.align 16
	.globl _mesa_3dnow_transform_points4_perspective
	.hidden _mesa_3dnow_transform_points4_perspective
_mesa_3dnow_transform_points4_perspective:

	movl	V4F_COUNT(%rdx), %ecx		/* ecx = vertices to process */
	movzbl	V4F_STRIDE(%rdx), %eax		/* rax = source stride (one byte, zero-extended) */

	movl	%ecx, V4F_COUNT(%rdi)		/* dest count = source count */
	movl	$4, V4F_SIZE(%rdi)		/* dest size = 4 */
	orl	$VEC_SIZE_4, V4F_FLAGS(%rdi)	/* mark dest flags */

	testl	%ecx, %ecx			/* nothing to do for an empty source */
	.byte	0x66, 0x66, 0x90		/* 3 bytes of padding; leaves flags alone */
	jz	.Lp4_perspective_done

	movq	V4F_START(%rdi), %rdi		/* rdi = first destination vertex */
	movq	V4F_START(%rdx), %rdx		/* rdx = first source vertex */

	/* Gather the matrix entries used; each register is (low, high). */
	movd	(%rsi), %mm0			/* mm0 = (m00, 0)   */
	pxor	%mm7, %mm7			/* mm7 = (0, 0), used to negate x2 */
	punpckldq 20(%rsi), %mm0		/* mm0 = (m00, m11) */

	movq	32(%rsi), %mm2			/* mm2 = (m20, m21) */
	prefetch (%rdx)

	movd	40(%rsi), %mm1			/* mm1 = (m22, 0)   */

	.byte	0x66, 0x66, 0x90		/* 3 bytes of padding (prefixed nop) */
	punpckldq 56(%rsi), %mm1		/* mm1 = (m22, m32) */

.Lp4_perspective_loop:

	prefetchw 32(%rdi)			/* two destination vertices ahead */

	movq	(%rdx), %mm4			/* mm4 = (x0, x1) */
	movq	8(%rdx), %mm5			/* mm5 = (x2, x3) */
	movd	8(%rdx), %mm3			/* mm3 = (x2, 0)  */

	movq	%mm5, %mm6			/* mm6 = (x2, x3) */
	pfmul	%mm0, %mm4			/* mm4 = (x0*m00, x1*m11) */

	punpckldq %mm5, %mm5			/* mm5 = (x2, x2) */

	pfmul	%mm2, %mm5			/* mm5 = (x2*m20, x2*m21) */
	pfsubr	%mm7, %mm3			/* mm3 = mm7 - mm3 = (-x2, 0) */

	pfmul	%mm1, %mm6			/* mm6 = (x2*m22, x3*m32) */
	pfadd	%mm4, %mm5			/* mm5 = (x0*m00 + x2*m20, x1*m11 + x2*m21) */

	pfacc	%mm3, %mm6			/* mm6 = (x2*m22 + x3*m32, -x2 + 0) */

	movq	%mm5, (%rdi)			/* store r0, r1 */
	addq	%rax, %rdx			/* step to the next source vertex */
	movq	%mm6, 8(%rdi)			/* store r2, r3 */

	addq	$16, %rdi

	decl	%ecx
	prefetch 32(%rdx)			/* does not disturb the flags from decl */
	jnz	.Lp4_perspective_loop

.Lp4_perspective_done:
	femms					/* leave MMX/3DNow! state */
	ret
|
|
|
|
/*
 * _mesa_3dnow_transform_points4_2d_no_rot
 *
 * In:  rdi = destination vector descriptor
 *      rsi = matrix of floats; below, mRC names the float at byte
 *            offset 16*R + 4*C
 *      rdx = source vector descriptor
 *
 * The destination's count is copied from the source, its size is set
 * to 4 and VEC_SIZE_4 is OR-ed into its flags.  For every source vertex
 * (x0, x1, x2, x3) the routine stores
 *
 *      r0 = x0*m00 + x3*m30
 *      r1 = x1*m11 + x3*m31
 *      r2 = x2
 *      r3 = x3
 *
 * Source vertices are 'stride' bytes apart, destination vertices 16.
 * Uses MMX/3DNow! registers; femms is executed before returning, also
 * on the empty-input path.
 *
 * Clobbers: rax, rcx, rdx, rdi, mm0, mm1, mm4-mm6, flags.
 */
	.align 16
	.globl _mesa_3dnow_transform_points4_2d_no_rot
	.hidden _mesa_3dnow_transform_points4_2d_no_rot
_mesa_3dnow_transform_points4_2d_no_rot:

	movl	V4F_COUNT(%rdx), %ecx		/* ecx = vertices to process */
	movzbl	V4F_STRIDE(%rdx), %eax		/* rax = source stride (one byte, zero-extended) */

	movl	%ecx, V4F_COUNT(%rdi)		/* dest count = source count */
	movl	$4, V4F_SIZE(%rdi)		/* dest size = 4 */
	orl	$VEC_SIZE_4, V4F_FLAGS(%rdi)	/* mark dest flags */

	testl	%ecx, %ecx			/* nothing to do for an empty source */
	.byte	0x90				/* 1 byte of padding (nop); leaves flags alone */
	jz	.Lp4_2d_no_rot_done

	movq	V4F_START(%rdi), %rdi		/* rdi = first destination vertex */
	movq	V4F_START(%rdx), %rdx		/* rdx = first source vertex */

	/* Gather the matrix entries used; each register is (low, high). */
	movd	(%rsi), %mm0			/* mm0 = (m00, 0)   */
	prefetch (%rdx)
	punpckldq 20(%rsi), %mm0		/* mm0 = (m00, m11) */

	movq	48(%rsi), %mm1			/* mm1 = (m30, m31) */

.Lp4_2d_no_rot_loop:

	prefetchw 32(%rdi)			/* two destination vertices ahead */

	movq	(%rdx), %mm4			/* mm4 = (x0, x1) */
	movq	8(%rdx), %mm5			/* mm5 = (x2, x3), passed through as r2, r3 */

	pfmul	%mm0, %mm4			/* mm4 = (x0*m00, x1*m11) */
	movq	%mm5, %mm6			/* mm6 = (x2, x3) */

	punpckhdq %mm6, %mm6			/* mm6 = (x3, x3) */

	addq	%rax, %rdx			/* step to the next source vertex */
	pfmul	%mm1, %mm6			/* mm6 = (x3*m30, x3*m31) */

	prefetch 32(%rdx)
	pfadd	%mm4, %mm6			/* mm6 = (x0*m00 + x3*m30, x1*m11 + x3*m31) */

	movq	%mm6, (%rdi)			/* store r0, r1 */
	movq	%mm5, 8(%rdi)			/* store r2, r3 */

	addq	$16, %rdi

	decl	%ecx
	jnz	.Lp4_2d_no_rot_loop

.Lp4_2d_no_rot_done:
	femms					/* leave MMX/3DNow! state */
	ret
|
|
|
|
|
|
/*
 * _mesa_3dnow_transform_points4_2d
 *
 * In:  rdi = destination vector descriptor
 *      rsi = matrix of floats; below, mRC names the float at byte
 *            offset 16*R + 4*C
 *      rdx = source vector descriptor
 *
 * The destination's count is copied from the source, its size is set
 * to 4 and VEC_SIZE_4 is OR-ed into its flags.  For every source vertex
 * (x0, x1, x2, x3) the routine stores
 *
 *      r0 = x0*m00 + x1*m10 + x3*m30
 *      r1 = x0*m01 + x1*m11 + x3*m31
 *      r2 = x2
 *      r3 = x3
 *
 * Source vertices are 'stride' bytes apart, destination vertices 16.
 * Uses MMX/3DNow! registers; femms is executed before returning, also
 * on the empty-input path.
 *
 * Clobbers: rax, rcx, rdx, rdi, mm0-mm6, flags.
 */
	.align 16
	.globl _mesa_3dnow_transform_points4_2d
	.hidden _mesa_3dnow_transform_points4_2d
_mesa_3dnow_transform_points4_2d:

	movl	V4F_COUNT(%rdx), %ecx		/* ecx = vertices to process */
	movzbl	V4F_STRIDE(%rdx), %eax		/* rax = source stride (one byte, zero-extended) */

	movl	%ecx, V4F_COUNT(%rdi)		/* dest count = source count */
	movl	$4, V4F_SIZE(%rdi)		/* dest size = 4 */
	.byte	0x66, 0x66, 0x90		/* 3 bytes of padding (prefixed nop) */
	orl	$VEC_SIZE_4, V4F_FLAGS(%rdi)	/* mark dest flags */

	testl	%ecx, %ecx			/* nothing to do for an empty source */
	.byte	0x66, 0x66, 0x90		/* 3 bytes of padding; leaves flags alone */
	jz	.Lp4_2d_done

	movq	V4F_START(%rdi), %rdi		/* rdi = first destination vertex */
	movq	V4F_START(%rdx), %rdx		/* rdx = first source vertex */

	/* Gather the matrix entries used; each register is (low, high). */
	movd	(%rsi), %mm0			/* mm0 = (m00, 0)   */
	movd	4(%rsi), %mm1			/* mm1 = (m01, 0)   */

	prefetch (%rdx)

	punpckldq 16(%rsi), %mm0		/* mm0 = (m00, m10) */
	.byte	0x66, 0x66, 0x90		/* 3 bytes of padding (prefixed nop) */
	punpckldq 20(%rsi), %mm1		/* mm1 = (m01, m11) */

	movq	48(%rsi), %mm2			/* mm2 = (m30, m31) */

.Lp4_2d_loop:

	prefetchw 32(%rdi)			/* two destination vertices ahead */

	movq	(%rdx), %mm3			/* mm3 = (x0, x1) */
	movq	8(%rdx), %mm5			/* mm5 = (x2, x3), passed through as r2, r3 */

	movq	%mm3, %mm4			/* mm4 = (x0, x1) */
	movq	%mm5, %mm6			/* mm6 = (x2, x3) */

	pfmul	%mm1, %mm4			/* mm4 = (x0*m01, x1*m11) */
	punpckhdq %mm6, %mm6			/* mm6 = (x3, x3) */

	pfmul	%mm0, %mm3			/* mm3 = (x0*m00, x1*m10) */

	addq	%rax, %rdx			/* step to the next source vertex */
	pfacc	%mm4, %mm3			/* mm3 = (x0*m00 + x1*m10, x0*m01 + x1*m11) */

	pfmul	%mm2, %mm6			/* mm6 = (x3*m30, x3*m31) */
	prefetch 32(%rdx)

	pfadd	%mm6, %mm3			/* mm3 = (r0, r1) */

	movq	%mm3, (%rdi)			/* store r0, r1 */
	movq	%mm5, 8(%rdi)			/* store r2, r3 */

	addq	$16, %rdi

	decl	%ecx
	jnz	.Lp4_2d_loop

.Lp4_2d_done:
	femms					/* leave MMX/3DNow! state */
	ret
|
|
|
|
#endif
|
|
|
|
#if defined (__ELF__) && defined (__linux__)
|
|
.section .note.GNU-stack,"",%progbits
|
|
#endif
|