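/*
 * reactos/dll/opengl/mesa/asm-386.S
 *
 * (Repository web-viewer banner, kept inside a comment so that it is not
 * parsed as assembly: 1645 lines, 33 KiB, ArmAsm, Raw / Normal View / History)
 */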

/* $Id: asm-386.S,v 1.8 1997/12/17 00:50:51 brianp Exp $ */
/*
* asm-386.S - special (hopefully faster) transformation functions for x86
*
* by Josh Vanderhoof
*
* This file is in the public domain.
*/
/*
* $Log: asm-386.S,v $
* Revision 1.8 1997/12/17 00:50:51 brianp
* applied Josh's patch to fix texture coordinate transformation bugs
*
* Revision 1.7 1997/12/17 00:27:11 brianp
* applied Josh's patch to fix bfris
*
* Revision 1.6 1997/12/01 01:02:41 brianp
* added FreeBSD patches (Daniel J. O'Connor)
*
* Revision 1.5 1997/11/19 23:52:17 brianp
* added missing "cld" instruction in asm_transform_points4_identity()
*
* Revision 1.4 1997/11/11 02:22:41 brianp
* small change per Josh to ensure U/V pairing
*
* Revision 1.3 1997/11/07 03:37:24 brianp
* added missing line from Stephane Rehel
*
* Revision 1.2 1997/11/07 03:30:37 brianp
* added Josh's 11-5-97 patches
*
* Revision 1.1 1997/10/30 06:00:33 brianp
* Initial revision
*/
#include <asm.inc>
/*
 * Operand shorthands used by the routines in this file (Intel syntax):
 *   S(x)    - x-th 32-bit element of the source vector addressed by esi
 *   D(x)    - x-th 32-bit element of the destination vector addressed by edi
 *   M(x, y) - 32-bit matrix element at byte offset 16*x + 4*y from edx
 * The arguments are substituted without parentheses, so only plain
 * integer literals should be passed.
 */
#define S(x) dword ptr [esi + 4*x]
#define D(x) dword ptr [edi + 4*x]
#define M(x, y) dword ptr [edx + 16*x + 4*y]
/* Everything up to the clip table below is code. */
.code
/*
 * void asm_transform_points3_general( GLuint n, GLfloat d[][4],
 *                                     GLfloat m[16], GLfloat s[][4] );
 *
 * For each of the n points computes, for c = 0..3:
 *     d[i][c] = s[i][0]*M(0,c) + s[i][1]*M(1,c) + s[i][2]*M(2,c) + M(3,c)
 * s[i][3] is never read; row 3 of the matrix is added unscaled.
 *
 * Arguments are read from the stack.  esi and edi are saved and restored,
 * ecx is used as the loop counter.  Up to eight x87 registers are live at
 * once, so the x87 stack has to be empty on entry; it is empty again on
 * return.  Nothing is done when n == 0.
 *
 * The entry label follows .align so that the exported symbol itself is
 * aligned and callers do not execute the alignment padding.
 */
PUBLIC _asm_transform_points3_general
.align 4
_asm_transform_points3_general:
    push esi
    push edi
    mov ecx, [esp + 12]         /* ecx = n */
    mov edi, [esp + 16]         /* edi = d */
    mov edx, [esp + 20]         /* edx = m */
    mov esi, [esp + 24]         /* esi = s */
    test ecx, ecx
    jz _asm_transform_points3_general_end
.align 4
_asm_transform_points3_general_loop:
    fld S(0)
    fmul M(0, 0)
    fld S(0)
    fmul M(0, 1)
    fld S(0)
    fmul M(0, 2)
    fld S(0)
    fmul M(0, 3)
    fld S(1)
    fmul M(1, 0)
    fld S(1)
    fmul M(1, 1)
    fld S(1)
    fmul M(1, 2)
    fld S(1)
    fmul M(1, 3)
/*
 * The FPU stack should now look like this:
 *
 * st(7) = S(0) * M(0, 0)
 * st(6) = S(0) * M(0, 1)
 * st(5) = S(0) * M(0, 2)
 * st(4) = S(0) * M(0, 3)
 * st(3) = S(1) * M(1, 0)
 * st(2) = S(1) * M(1, 1)
 * st(1) = S(1) * M(1, 2)
 * st(0) = S(1) * M(1, 3)
 *
 * The digits in the comments below name, from st(0) upwards, which of
 * the slots listed above each register currently holds.
 */
    fxch st(3)                  /* 3 1 2 0 4 5 6 7 */
    faddp st(7), st             /* 1 2 0 4 5 6 7 */
    fxch st(1)                  /* 2 1 0 4 5 6 7 */
    faddp st(5), st             /* 1 0 4 5 6 7 */
    faddp st(3), st             /* 0 4 5 6 7 */
    faddp st(1), st             /* 4 5 6 7 */
/*
 * st(3) = S(0) * M(0, 0) + S(1) * M(1, 0)
 * st(2) = S(0) * M(0, 1) + S(1) * M(1, 1)
 * st(1) = S(0) * M(0, 2) + S(1) * M(1, 2)
 * st(0) = S(0) * M(0, 3) + S(1) * M(1, 3)
 */
    fld S(2)
    fmul M(2, 0)
    fld S(2)
    fmul M(2, 1)
    fld S(2)
    fmul M(2, 2)
    fld S(2)
    fmul M(2, 3)
/*
 * st(7) = S(0) * M(0, 0) + S(1) * M(1, 0)
 * st(6) = S(0) * M(0, 1) + S(1) * M(1, 1)
 * st(5) = S(0) * M(0, 2) + S(1) * M(1, 2)
 * st(4) = S(0) * M(0, 3) + S(1) * M(1, 3)
 * st(3) = S(2) * M(2, 0)
 * st(2) = S(2) * M(2, 1)
 * st(1) = S(2) * M(2, 2)
 * st(0) = S(2) * M(2, 3)
 */
    fxch st(3)                  /* 3 1 2 0 4 5 6 7 */
    faddp st(7), st             /* 1 2 0 4 5 6 7 */
    fxch st(1)                  /* 2 1 0 4 5 6 7 */
    faddp st(5), st             /* 1 0 4 5 6 7 */
    faddp st(3), st             /* 0 4 5 6 7 */
    faddp st(1), st             /* 4 5 6 7 */
/*
 * st(3) = S(0) * M(0, 0) + S(1) * M(1, 0) + S(2) * M(2, 0)
 * st(2) = S(0) * M(0, 1) + S(1) * M(1, 1) + S(2) * M(2, 1)
 * st(1) = S(0) * M(0, 2) + S(1) * M(1, 2) + S(2) * M(2, 2)
 * st(0) = S(0) * M(0, 3) + S(1) * M(1, 3) + S(2) * M(2, 3)
 */
    fxch st(3)                  /* 3 1 2 0 */
    fadd M(3, 0)
    fxch st(2)                  /* 2 1 3 0 */
    fadd M(3, 1)
    fxch st(1)                  /* 1 2 3 0 */
    fadd M(3, 2)
    fxch st(3)                  /* 0 2 3 1 */
    fadd M(3, 3)
/*
 * st(3) = S(0) * M(0, 2) + S(1) * M(1, 2) + S(2) * M(2, 2) + M(3, 2)
 * st(2) = S(0) * M(0, 0) + S(1) * M(1, 0) + S(2) * M(2, 0) + M(3, 0)
 * st(1) = S(0) * M(0, 1) + S(1) * M(1, 1) + S(2) * M(2, 1) + M(3, 1)
 * st(0) = S(0) * M(0, 3) + S(1) * M(1, 3) + S(2) * M(2, 3) + M(3, 3)
 */
    fxch st(3)                  /* 3 1 2 0 */
    fstp D(2)                   /* 1 2 0 */
    fxch st(1)                  /* 2 1 0 */
    fstp D(0)                   /* 1 0 */
    lea esi, S(4)               /* next source point */
    fstp D(1)                   /* 0 */
    dec ecx                     /* sets ZF for the jnz below */
    fstp D(3)                   /* */
    lea edi, D(4)               /* next destination point; lea keeps the flags */
    jnz _asm_transform_points3_general_loop
_asm_transform_points3_general_end:
    pop edi
    pop esi
    ret
/*
 * void asm_transform_points3_identity( GLuint n, GLfloat d[][4],
 *                                      GLfloat s[][4] );
 *
 * Copies s[i][0..2] to d[i][0..2] and stores 0x3f800000 (the bit pattern
 * of 1.0f) in d[i][3] for each of the n points, using integer moves only.
 *
 * esi, edi, ebx and ebp are saved and restored; eax, ecx and edx are
 * overwritten.  Nothing is copied when n == 0.
 *
 * The entry label follows .align so that the exported symbol itself is
 * aligned and callers do not execute the alignment padding.
 */
PUBLIC _asm_transform_points3_identity
.align 4
_asm_transform_points3_identity:
    push esi
    push edi
    mov ecx, [esp + 12]         /* ecx = n */
    mov edi, [esp + 16]         /* edi = d */
    mov esi, [esp + 20]         /* esi = s */
    push ebx                    /* pushed after the argument loads, so the */
    push ebp                    /* offsets above only account for esi/edi  */
    test ecx, ecx
    jz _asm_transform_points3_identity_end
    mov ebp, HEX(3f800000)      /* ebp = 1.0f */
.align 4
_asm_transform_points3_identity_loop:
    mov eax, S(0)
    mov edx, S(1)
    mov ebx, S(2)
    lea esi, S(4)               /* next source point */
    mov D(0), eax
    mov D(1), edx
    mov D(2), ebx
    mov D(3), ebp               /* w = 1.0f */
    dec ecx
    lea edi, D(4)               /* next destination point; lea keeps the flags */
    jnz _asm_transform_points3_identity_loop
_asm_transform_points3_identity_end:
    pop ebp
    pop ebx
    pop edi
    pop esi
    ret
/*
 * void asm_transform_points3_2d( GLuint n, GLfloat d[][4], GLfloat m[16],
 *                                GLfloat s[][4] );
 *
 * For each of the n points:
 *     d[i][0] = s[i][0]*M(0,0) + s[i][1]*M(1,0) + M(3,0)
 *     d[i][1] = s[i][0]*M(0,1) + s[i][1]*M(1,1) + M(3,1)
 *     d[i][2] = s[i][2]                (copied as raw bits)
 *     d[i][3] = 0x3f800000             (1.0f)
 *
 * When n is odd one point is processed up front; the main loop then
 * handles two points per iteration.  esi, edi and ebp are saved and
 * restored; eax and ecx are overwritten.  Up to eight x87 registers are
 * live at once and the x87 stack is empty again on return.
 *
 * The entry label follows .align so that the exported symbol itself is
 * aligned and callers do not execute the alignment padding.
 */
PUBLIC _asm_transform_points3_2d
.align 4
_asm_transform_points3_2d:
    push esi
    push edi
    mov ecx, [esp + 12]         /* ecx = n */
    mov edi, [esp + 16]         /* edi = d */
    mov edx, [esp + 20]         /* edx = m */
    mov esi, [esp + 24]         /* esi = s */
    push ebp
    mov ebp, HEX(3f800000)      /* ebp = 1.0f */
    test cl, DEC(1)             /* odd count? handle one point first */
    jz _asm_transform_points3_2d_step
    dec ecx
    fld S(0)
    fmul M(0, 0)
    fld S(0)
    fmul M(0, 1)
    fld S(1)
    fmul M(1, 0)
    fld S(1)
    fmul M(1, 1)
/*
 * st(3) = S(0) * M(0, 0)
 * st(2) = S(0) * M(0, 1)
 * st(1) = S(1) * M(1, 0)
 * st(0) = S(1) * M(1, 1)
 */
    fxch st(1)                  /* 1 0 2 3 */
    fadd M(3, 0)
    fxch st(1)                  /* 0 1 2 3 */
    fadd M(3, 1)
    fxch st(1)                  /* 1 0 2 3 */
    faddp st(3), st             /* 0 2 3 */
    faddp st(1), st             /* 2 3 */
    fstp D(1)                   /* 3 */
    fstp D(0)                   /* */
    mov eax, S(2)
    lea esi, S(4)
    mov D(3), ebp
    mov D(2), eax
    lea edi, D(4)
_asm_transform_points3_2d_step:
    test ecx, ecx               /* remaining count is even from here on */
    jz _asm_transform_points3_2d_end
.align 4
_asm_transform_points3_2d_loop:
    /* S(0..3) is the first point of the pair, S(4..7) the second */
    fld S(0)
    fmul M(0, 0)
    fld S(0)
    fmul M(0, 1)
    fld S(4)
    fmul M(0, 0)
    fld S(4)
    fmul M(0, 1)
    fld S(1)
    fmul M(1, 0)
    fld S(1)
    fmul M(1, 1)
    fld S(5)
    fmul M(1, 0)
    fld S(5)
    fmul M(1, 1)
/*
 * st(7) = S(0) * M(0, 0)
 * st(6) = S(0) * M(0, 1)
 * st(5) = S(4) * M(0, 0)
 * st(4) = S(4) * M(0, 1)
 * st(3) = S(1) * M(1, 0)
 * st(2) = S(1) * M(1, 1)
 * st(1) = S(5) * M(1, 0)
 * st(0) = S(5) * M(1, 1)
 */
    fxch st(7)                  /* 7 1 2 3 4 5 6 0 */
    fadd M(3, 0)
    fxch st(6)                  /* 6 1 2 3 4 5 7 0 */
    fadd M(3, 1)
    fxch st(5)                  /* 5 1 2 3 4 6 7 0 */
    fadd M(3, 0)
    fxch st(4)                  /* 4 1 2 3 5 6 7 0 */
    fadd M(3, 1)
    mov eax, S(2)
    mov D(3), ebp
    mov D(2), eax
    mov eax, S(6)
    mov D(7), ebp
    mov D(6), eax
    lea esi, S(8)               /* advance the source by two points */
    sub ecx, DEC(2)             /* ZF is consumed by the jnz at the bottom */
/*
 * st(7) = S(5) * M(1, 1)
 * st(6) = S(0) * M(0, 0) + M(3, 0)
 * st(5) = S(0) * M(0, 1) + M(3, 1)
 * st(4) = S(4) * M(0, 0) + M(3, 0)
 * st(3) = S(1) * M(1, 0)
 * st(2) = S(1) * M(1, 1)
 * st(1) = S(5) * M(1, 0)
 * st(0) = S(4) * M(0, 1) + M(3, 1)
 */
    faddp st(7), st             /* 1 2 3 4 5 6 7 */
    faddp st(3), st             /* 2 3 4 5 6 7 */
    faddp st(3), st             /* 3 4 5 6 7 */
    faddp st(3), st             /* 4 5 6 7 */
    fxch st(3)                  /* 7 5 6 4 */
    fstp D(5)                   /* 5 6 4 */
    fstp D(1)                   /* 6 4 */
    fstp D(0)                   /* 4 */
    fstp D(4)                   /* */
    lea edi, D(8)               /* advance the destination by two points */
    jnz _asm_transform_points3_2d_loop
_asm_transform_points3_2d_end:
    pop ebp
    pop edi
    pop esi
    ret
/*
 * void asm_transform_points3_2d_no_rot( GLuint n, GLfloat d[][4],
 *                                       GLfloat m[16], GLfloat s[][4] );
 *
 * For each of the n points:
 *     d[i][0] = s[i][0]*M(0,0) + M(3,0)
 *     d[i][1] = s[i][1]*M(1,1) + M(3,1)
 *     d[i][2] = s[i][2]                (copied as raw bits)
 *     d[i][3] = 0x3f800000             (1.0f)
 *
 * esi, edi and ebp are saved and restored; eax and ecx are overwritten.
 * Nothing is done when n == 0.
 *
 * The entry label follows .align so that the exported symbol itself is
 * aligned and callers do not execute the alignment padding.
 */
PUBLIC _asm_transform_points3_2d_no_rot
.align 4
_asm_transform_points3_2d_no_rot:
    push esi
    push edi
    mov ecx, [esp + 12]         /* ecx = n */
    mov edi, [esp + 16]         /* edi = d */
    mov edx, [esp + 20]         /* edx = m */
    mov esi, [esp + 24]         /* esi = s */
    push ebp
    test ecx, ecx
    jz _asm_transform_points3_2d_no_rot_end
    mov ebp, HEX(3f800000)      /* ebp = 1.0f */
.align 4
_asm_transform_points3_2d_no_rot_loop:
    fld S(0)
    fmul M(0, 0)
    fld S(1)
    fmul M(1, 1)                /* st(1) = x product, st(0) = y product */
    fxch st(1)
    fadd M(3, 0)
    fxch st(1)
    fadd M(3, 1)
    fxch st(1)                  /* st(0) = x result, st(1) = y result */
    fstp D(0)
    fstp D(1)
    mov eax, S(2)               /* cycle 1: U pipe */
    mov D(3), ebp               /* V pipe */
    mov D(2), eax               /* cycle 2: U pipe */
    dec ecx
    lea esi, S(4)               /* lea keeps the flags set by dec */
    lea edi, D(4)
    jnz _asm_transform_points3_2d_no_rot_loop
_asm_transform_points3_2d_no_rot_end:
    pop ebp
    pop edi
    pop esi
    ret
/*
 * void asm_transform_points3_3d( GLuint n, GLfloat d[][4], GLfloat m[16],
 *                                GLfloat s[][4] );
 *
 * For each of the n points computes, for c = 0..2:
 *     d[i][c] = s[i][0]*M(0,c) + s[i][1]*M(1,c) + s[i][2]*M(2,c) + M(3,c)
 * and stores 0x3f800000 (1.0f) in d[i][3].
 *
 * esi and edi are saved and restored; eax (holds 1.0f) and ecx are
 * overwritten.  Six x87 registers are used and the x87 stack is empty
 * again on return.  Nothing is done when n == 0.
 *
 * The entry label follows .align so that the exported symbol itself is
 * aligned and callers do not execute the alignment padding.
 */
PUBLIC _asm_transform_points3_3d
.align 4
_asm_transform_points3_3d:
    push esi
    push edi
    mov ecx, [esp + 12]         /* ecx = n */
    mov edi, [esp + 16]         /* edi = d */
    mov edx, [esp + 20]         /* edx = m */
    mov esi, [esp + 24]         /* esi = s */
    test ecx, ecx
    jz _asm_transform_points3_3d_end
    mov eax, HEX(3f800000)      /* eax = 1.0f */
.align 4
_asm_transform_points3_3d_loop:
    fld S(0)
    fmul M(0, 0)
    fld S(0)
    fmul M(0, 1)
    fld S(0)
    fmul M(0, 2)
    fld S(1)
    fmul M(1, 0)
    fld S(1)
    fmul M(1, 1)
    fld S(1)
    fmul M(1, 2)
/*
 * st(5) = S(0) * M(0, 0)
 * st(4) = S(0) * M(0, 1)
 * st(3) = S(0) * M(0, 2)
 * st(2) = S(1) * M(1, 0)
 * st(1) = S(1) * M(1, 1)
 * st(0) = S(1) * M(1, 2)
 */
    fxch st(2)                  /* 2 1 0 3 4 5 */
    faddp st(5), st             /* 1 0 3 4 5 */
    faddp st(3), st             /* 0 3 4 5 */
    faddp st(1), st             /* 3 4 5 */
/*
 * st(2) = S(0) * M(0, 0) + S(1) * M(1, 0)
 * st(1) = S(0) * M(0, 1) + S(1) * M(1, 1)
 * st(0) = S(0) * M(0, 2) + S(1) * M(1, 2)
 */
    fld S(2)
    fmul M(2, 0)
    fld S(2)
    fmul M(2, 1)
    fld S(2)
    fmul M(2, 2)
/*
 * st(5) = S(0) * M(0, 0) + S(1) * M(1, 0)
 * st(4) = S(0) * M(0, 1) + S(1) * M(1, 1)
 * st(3) = S(0) * M(0, 2) + S(1) * M(1, 2)
 * st(2) = S(2) * M(2, 0)
 * st(1) = S(2) * M(2, 1)
 * st(0) = S(2) * M(2, 2)
 */
    fxch st(2)                  /* 2 1 0 3 4 5 */
    faddp st(5), st             /* 1 0 3 4 5 */
    faddp st(3), st             /* 0 3 4 5 */
    faddp st(1), st             /* 3 4 5 */
/*
 * st(2) = S(0) * M(0, 0) + S(1) * M(1, 0) + S(2) * M(2, 0)
 * st(1) = S(0) * M(0, 1) + S(1) * M(1, 1) + S(2) * M(2, 1)
 * st(0) = S(0) * M(0, 2) + S(1) * M(1, 2) + S(2) * M(2, 2)
 */
    fxch st(2)                  /* 2 1 0 */
    fadd M(3, 0)
    fxch st(1)                  /* 1 2 0 */
    fadd M(3, 1)
    fxch st(2)                  /* 0 2 1 */
    fadd M(3, 2)
    fxch st(1)                  /* 2 0 1 */
    fstp D(0)                   /* 0 1 */
    fstp D(2)                   /* 1 */
    fstp D(1)                   /* */
    mov D(3), eax               /* w = 1.0f */
    lea esi, S(4)
    dec ecx
    lea edi, D(4)               /* lea keeps the flags set by dec */
    jnz _asm_transform_points3_3d_loop
_asm_transform_points3_3d_end:
    pop edi
    pop esi
    ret
/*
 * void asm_transform_points4_general( GLuint n, GLfloat d[][4],
 *                                     GLfloat m[16], GLfloat s[][4] );
 *
 * Full 4x4 transform.  For each of the n points computes, for c = 0..3:
 *     d[i][c] = s[i][0]*M(0,c) + s[i][1]*M(1,c)
 *             + s[i][2]*M(2,c) + s[i][3]*M(3,c)
 *
 * esi and edi are saved and restored; ecx is the loop counter.  Up to
 * eight x87 registers are live at once, so the x87 stack has to be empty
 * on entry; it is empty again on return.  Nothing is done when n == 0.
 *
 * The entry label follows .align so that the exported symbol itself is
 * aligned and callers do not execute the alignment padding.
 */
PUBLIC _asm_transform_points4_general
.align 4
_asm_transform_points4_general:
    push esi
    push edi
    mov ecx, [esp + 12]         /* ecx = n */
    mov edi, [esp + 16]         /* edi = d */
    mov edx, [esp + 20]         /* edx = m */
    mov esi, [esp + 24]         /* esi = s */
    test ecx, ecx
    jz _asm_transform_points4_general_end
.align 4
_asm_transform_points4_general_loop:
    fld S(0)
    fmul M(0, 0)
    fld S(0)
    fmul M(0, 1)
    fld S(0)
    fmul M(0, 2)
    fld S(0)
    fmul M(0, 3)
    fld S(1)
    fmul M(1, 0)
    fld S(1)
    fmul M(1, 1)
    fld S(1)
    fmul M(1, 2)
    fld S(1)
    fmul M(1, 3)
/*
 * st(7) = S(0) * M(0, 0)
 * st(6) = S(0) * M(0, 1)
 * st(5) = S(0) * M(0, 2)
 * st(4) = S(0) * M(0, 3)
 * st(3) = S(1) * M(1, 0)
 * st(2) = S(1) * M(1, 1)
 * st(1) = S(1) * M(1, 2)
 * st(0) = S(1) * M(1, 3)
 */
    fxch st(3)                  /* 3 1 2 0 4 5 6 7 */
    faddp st(7), st             /* 1 2 0 4 5 6 7 */
    fxch st(1)                  /* 2 1 0 4 5 6 7 */
    faddp st(5), st             /* 1 0 4 5 6 7 */
    faddp st(3), st             /* 0 4 5 6 7 */
    faddp st(1), st             /* 4 5 6 7 */
/*
 * st(3) = S(0) * M(0, 0) + S(1) * M(1, 0)
 * st(2) = S(0) * M(0, 1) + S(1) * M(1, 1)
 * st(1) = S(0) * M(0, 2) + S(1) * M(1, 2)
 * st(0) = S(0) * M(0, 3) + S(1) * M(1, 3)
 */
    fld S(2)
    fmul M(2, 0)
    fld S(2)
    fmul M(2, 1)
    fld S(2)
    fmul M(2, 2)
    fld S(2)
    fmul M(2, 3)
/*
 * st(7) = S(0) * M(0, 0) + S(1) * M(1, 0)
 * st(6) = S(0) * M(0, 1) + S(1) * M(1, 1)
 * st(5) = S(0) * M(0, 2) + S(1) * M(1, 2)
 * st(4) = S(0) * M(0, 3) + S(1) * M(1, 3)
 * st(3) = S(2) * M(2, 0)
 * st(2) = S(2) * M(2, 1)
 * st(1) = S(2) * M(2, 2)
 * st(0) = S(2) * M(2, 3)
 */
    fxch st(3)                  /* 3 1 2 0 4 5 6 7 */
    faddp st(7), st             /* 1 2 0 4 5 6 7 */
    fxch st(1)                  /* 2 1 0 4 5 6 7 */
    faddp st(5), st             /* 1 0 4 5 6 7 */
    faddp st(3), st             /* 0 4 5 6 7 */
    faddp st(1), st             /* 4 5 6 7 */
/*
 * st(3) = S(0) * M(0, 0) + S(1) * M(1, 0) + S(2) * M(2, 0)
 * st(2) = S(0) * M(0, 1) + S(1) * M(1, 1) + S(2) * M(2, 1)
 * st(1) = S(0) * M(0, 2) + S(1) * M(1, 2) + S(2) * M(2, 2)
 * st(0) = S(0) * M(0, 3) + S(1) * M(1, 3) + S(2) * M(2, 3)
 */
    fld S(3)
    fmul M(3, 0)
    fld S(3)
    fmul M(3, 1)
    fld S(3)
    fmul M(3, 2)
    fld S(3)
    fmul M(3, 3)
/*
 * st(7) = S(0) * M(0, 0) + S(1) * M(1, 0) + S(2) * M(2, 0)
 * st(6) = S(0) * M(0, 1) + S(1) * M(1, 1) + S(2) * M(2, 1)
 * st(5) = S(0) * M(0, 2) + S(1) * M(1, 2) + S(2) * M(2, 2)
 * st(4) = S(0) * M(0, 3) + S(1) * M(1, 3) + S(2) * M(2, 3)
 * st(3) = S(3) * M(3, 0)
 * st(2) = S(3) * M(3, 1)
 * st(1) = S(3) * M(3, 2)
 * st(0) = S(3) * M(3, 3)
 */
    fxch st(3)                  /* 3 1 2 0 4 5 6 7 */
    faddp st(7), st             /* 1 2 0 4 5 6 7 */
    fxch st(1)                  /* 2 1 0 4 5 6 7 */
    faddp st(5), st             /* 1 0 4 5 6 7 */
    faddp st(3), st             /* 0 4 5 6 7 */
    lea esi, S(4)               /* all source loads are done: advance s */
    dec ecx                     /* ZF is consumed by the jnz at the bottom */
    faddp st(1), st             /* 4 5 6 7 */
/*
 * st(3) = S(0) * M(0, 0) + S(1) * M(1, 0) + S(2) * M(2, 0) + S(3) * M(3, 0)
 * st(2) = S(0) * M(0, 1) + S(1) * M(1, 1) + S(2) * M(2, 1) + S(3) * M(3, 1)
 * st(1) = S(0) * M(0, 2) + S(1) * M(1, 2) + S(2) * M(2, 2) + S(3) * M(3, 2)
 * st(0) = S(0) * M(0, 3) + S(1) * M(1, 3) + S(2) * M(2, 3) + S(3) * M(3, 3)
 */
    fxch st(3)                  /* 3 1 2 0 */
    fstp D(0)                   /* 1 2 0 */
    fxch st(1)                  /* 2 1 0 */
    fstp D(1)                   /* 1 0 */
    fstp D(2)                   /* 0 */
    fstp D(3)                   /* */
    lea edi, D(4)
    jnz _asm_transform_points4_general_loop
_asm_transform_points4_general_end:
    pop edi
    pop esi
    ret
/*
 * void asm_transform_points4_identity( GLuint n, GLfloat d[][4],
 *                                      GLfloat s[][4] );
 *
 * Copies n points of four dwords each from s to d with a forward
 * "rep movsd" (4*n dwords in total).  With n == 0 nothing is copied.
 *
 * esi and edi are saved and restored; ecx ends up as 0 and the direction
 * flag is left cleared by the cld.
 *
 * The entry label follows .align so that the exported symbol itself is
 * aligned and callers do not execute the alignment padding.
 */
PUBLIC _asm_transform_points4_identity
.align 4
_asm_transform_points4_identity:
    push esi
    push edi
    mov ecx, [esp + 12]         /* ecx = n */
    mov edi, [esp + 16]         /* edi = d */
    mov esi, [esp + 20]         /* esi = s */
    lea ecx, [ecx * 4]          /* ecx = number of dwords to copy */
    cld                         /* make sure movsd walks upwards */
    rep movsd
    pop edi
    pop esi
    ret
/*
 * void asm_transform_points4_2d( GLuint n, GLfloat d[][4], GLfloat m[16],
 *                                GLfloat s[][4] );
 *
 * For each of the n points:
 *     d[i][0] = s[i][0]*M(0,0) + s[i][1]*M(1,0) + s[i][3]*M(3,0)
 *     d[i][1] = s[i][0]*M(0,1) + s[i][1]*M(1,1) + s[i][3]*M(3,1)
 *     d[i][2] = s[i][2]                (copied as raw bits)
 *     d[i][3] = s[i][3]                (copied as raw bits)
 *
 * esi, edi and ebx are saved and restored (ebx only when n != 0); eax and
 * ecx are overwritten.  Six x87 registers are used and the x87 stack is
 * empty again on return.
 *
 * The entry label follows .align so that the exported symbol itself is
 * aligned and callers do not execute the alignment padding.
 */
PUBLIC _asm_transform_points4_2d
.align 4
_asm_transform_points4_2d:
    push esi
    push edi
    mov ecx, [esp + 12]         /* ecx = n */
    mov edi, [esp + 16]         /* edi = d */
    mov edx, [esp + 20]         /* edx = m */
    mov esi, [esp + 24]         /* esi = s */
    test ecx, ecx
    jz _asm_transform_points4_2d_end
    push ebx                    /* matched by the pop right after the loop */
.align 4
_asm_transform_points4_2d_loop:
    fld S(0)
    fmul M(0, 0)
    fld S(0)
    fmul M(0, 1)
    fld S(1)
    fmul M(1, 0)
    fld S(1)
    fmul M(1, 1)
    fld S(3)
    fmul M(3, 0)
    fld S(3)
    fmul M(3, 1)
/*
 * st(5) = S(0) * M(0, 0)
 * st(4) = S(0) * M(0, 1)
 * st(3) = S(1) * M(1, 0)
 * st(2) = S(1) * M(1, 1)
 * st(1) = S(3) * M(3, 0)
 * st(0) = S(3) * M(3, 1)
 */
    mov eax, S(2)
    mov ebx, S(3)
    lea esi, S(4)
    dec ecx                     /* ZF is consumed by the jnz at the bottom */
    mov D(2), eax
    mov D(3), ebx
    faddp st(4), st             /* y: S(0)*M(0,1) += S(3)*M(3,1) */
    faddp st(4), st             /* x: S(0)*M(0,0) += S(3)*M(3,0) */
    faddp st(2), st             /* y += S(1)*M(1,1) */
    faddp st(2), st             /* x += S(1)*M(1,0); st(0) = y, st(1) = x */
    fstp D(1)
    fstp D(0)
    lea edi, D(4)
    jnz _asm_transform_points4_2d_loop
    pop ebx
_asm_transform_points4_2d_end:
    pop edi
    pop esi
    ret
/*
 * void asm_transform_points4_2d_no_rot( GLuint n, GLfloat d[][4],
 *                                       GLfloat m[16], GLfloat s[][4] );
 *
 * For each of the n points:
 *     d[i][0] = s[i][0]*M(0,0) + s[i][3]*M(3,0)
 *     d[i][1] = s[i][1]*M(1,1) + s[i][3]*M(3,1)
 *     d[i][2] = s[i][2]                (copied as raw bits)
 *     d[i][3] = s[i][3]                (copied as raw bits)
 *
 * esi, edi and ebx are saved and restored (ebx only when n != 0); eax and
 * ecx are overwritten.  The x87 stack is empty again on return.
 *
 * The entry label follows .align so that the exported symbol itself is
 * aligned and callers do not execute the alignment padding.
 */
PUBLIC _asm_transform_points4_2d_no_rot
.align 4
_asm_transform_points4_2d_no_rot:
    push esi
    push edi
    mov ecx, [esp + 12]         /* ecx = n */
    mov edi, [esp + 16]         /* edi = d */
    mov edx, [esp + 20]         /* edx = m */
    mov esi, [esp + 24]         /* esi = s */
    test ecx, ecx
    jz _asm_transform_points4_2d_no_rot_end
    push ebx                    /* matched by the pop right after the loop */
.align 4
_asm_transform_points4_2d_no_rot_loop:
    fld S(0)
    fmul M(0, 0)
    fld S(1)
    fmul M(1, 1)
    fld S(3)
    fmul M(3, 0)
    fld S(3)
    fmul M(3, 1)
/*
 * st(3) = S(0) * M(0, 0)
 * st(2) = S(1) * M(1, 1)
 * st(1) = S(3) * M(3, 0)
 * st(0) = S(3) * M(3, 1)
 */
    mov eax, S(2)
    mov ebx, S(3)
    lea esi, S(4)
    dec ecx                     /* ZF is consumed by the jnz at the bottom */
    mov D(2), eax
    mov D(3), ebx
    faddp st(2), st             /* y = S(1)*M(1,1) + S(3)*M(3,1) */
    faddp st(2), st             /* x = S(0)*M(0,0) + S(3)*M(3,0) */
    fstp D(1)
    fstp D(0)
    lea edi, D(4)
    jnz _asm_transform_points4_2d_no_rot_loop
    pop ebx
_asm_transform_points4_2d_no_rot_end:
    pop edi
    pop esi
    ret
/*
 * void asm_transform_points4_3d( GLuint n, GLfloat d[][4], GLfloat m[16],
 *                                GLfloat s[][4] );
 *
 * For each of the n points computes, for c = 0..2:
 *     d[i][c] = s[i][0]*M(0,c) + s[i][1]*M(1,c)
 *             + s[i][2]*M(2,c) + s[i][3]*M(3,c)
 * and passes s[i][3] through to d[i][3] (via the x87 stack).
 *
 * esi and edi are saved and restored; ecx is the loop counter.  Up to
 * seven x87 registers are live at once and the x87 stack is empty again
 * on return.  Nothing is done when n == 0.
 *
 * The entry label follows .align so that the exported symbol itself is
 * aligned and callers do not execute the alignment padding.
 */
PUBLIC _asm_transform_points4_3d
.align 4
_asm_transform_points4_3d:
    push esi
    push edi
    mov ecx, [esp + 12]         /* ecx = n */
    mov edi, [esp + 16]         /* edi = d */
    mov edx, [esp + 20]         /* edx = m */
    mov esi, [esp + 24]         /* esi = s */
    test ecx, ecx
    jz _asm_transform_points4_3d_end
.align 4
_asm_transform_points4_3d_loop:
    fld S(3)                    /* kept at the bottom until stored to D(3) */
    fld S(0)
    fmul M(0, 0)
    fld S(0)
    fmul M(0, 1)
    fld S(0)
    fmul M(0, 2)
    fld S(1)
    fmul M(1, 0)
    fld S(1)
    fmul M(1, 1)
    fld S(1)
    fmul M(1, 2)
/*
 * (the stack pictures below leave out the S(3) copy sitting underneath)
 *
 * st(5) = S(0) * M(0, 0)
 * st(4) = S(0) * M(0, 1)
 * st(3) = S(0) * M(0, 2)
 * st(2) = S(1) * M(1, 0)
 * st(1) = S(1) * M(1, 1)
 * st(0) = S(1) * M(1, 2)
 */
    fxch st(2)                  /* 2 1 0 3 4 5 */
    faddp st(5), st             /* 1 0 3 4 5 */
    faddp st(3), st             /* 0 3 4 5 */
    faddp st(1), st             /* 3 4 5 */
/*
 * st(2) = S(0) * M(0, 0) + S(1) * M(1, 0)
 * st(1) = S(0) * M(0, 1) + S(1) * M(1, 1)
 * st(0) = S(0) * M(0, 2) + S(1) * M(1, 2)
 */
    fld S(2)
    fmul M(2, 0)
    fld S(2)
    fmul M(2, 1)
    fld S(2)
    fmul M(2, 2)
/*
 * st(5) = S(0) * M(0, 0) + S(1) * M(1, 0)
 * st(4) = S(0) * M(0, 1) + S(1) * M(1, 1)
 * st(3) = S(0) * M(0, 2) + S(1) * M(1, 2)
 * st(2) = S(2) * M(2, 0)
 * st(1) = S(2) * M(2, 1)
 * st(0) = S(2) * M(2, 2)
 */
    fxch st(2)                  /* 2 1 0 3 4 5 */
    faddp st(5), st             /* 1 0 3 4 5 */
    faddp st(3), st             /* 0 3 4 5 */
    faddp st(1), st             /* 3 4 5 */
/*
 * st(2) = S(0) * M(0, 0) + S(1) * M(1, 0) + S(2) * M(2, 0)
 * st(1) = S(0) * M(0, 1) + S(1) * M(1, 1) + S(2) * M(2, 1)
 * st(0) = S(0) * M(0, 2) + S(1) * M(1, 2) + S(2) * M(2, 2)
 */
    fld S(3)
    fmul M(3, 0)
    fld S(3)
    fmul M(3, 1)
    fld S(3)
    fmul M(3, 2)
/*
 * st(5) = S(0) * M(0, 0) + S(1) * M(1, 0) + S(2) * M(2, 0)
 * st(4) = S(0) * M(0, 1) + S(1) * M(1, 1) + S(2) * M(2, 1)
 * st(3) = S(0) * M(0, 2) + S(1) * M(1, 2) + S(2) * M(2, 2)
 * st(2) = S(3) * M(3, 0)
 * st(1) = S(3) * M(3, 1)
 * st(0) = S(3) * M(3, 2)
 */
    fxch st(2)                  /* 2 1 0 3 4 5 */
    faddp st(5), st             /* 1 0 3 4 5 */
    faddp st(3), st             /* 0 3 4 5 */
    lea esi, S(4)               /* all source loads are done: advance s */
    dec ecx                     /* ZF is consumed by the jnz at the bottom */
    faddp st(1), st             /* 3 4 5 */
/*
 * st(2) = S(0) * M(0, 0) + S(1) * M(1, 0) + S(2) * M(2, 0) + S(3) * M(3, 0)
 * st(1) = S(0) * M(0, 1) + S(1) * M(1, 1) + S(2) * M(2, 1) + S(3) * M(3, 1)
 * st(0) = S(0) * M(0, 2) + S(1) * M(1, 2) + S(2) * M(2, 2) + S(3) * M(3, 2)
 */
    fxch st(2)                  /* 2 1 0 */
    fstp D(0)                   /* 1 0 */
    fstp D(1)                   /* 0 */
    fstp D(2)                   /* only the saved S(3) is left */
    fstp D(3)                   /* w passes through unchanged */
    lea edi, D(4)
    jnz _asm_transform_points4_3d_loop
_asm_transform_points4_3d_end:
    pop edi
    pop esi
    ret
/*
 * void asm_transform_points4_ortho( GLuint n, GLfloat d[][4],
 *                                   GLfloat m[16], GLfloat s[][4] );
 *
 * For each of the n points:
 *     d[i][0] = s[i][0]*M(0,0) + s[i][3]*M(3,0)
 *     d[i][1] = s[i][1]*M(1,1) + s[i][3]*M(3,1)
 *     d[i][2] = s[i][2]*M(2,2) + s[i][3]*M(3,2)
 *     d[i][3] = s[i][3]                (copied as raw bits)
 *
 * esi and edi are saved and restored; eax and ecx are overwritten.  Six
 * x87 registers are used and the x87 stack is empty again on return.
 * Nothing is done when n == 0.
 *
 * The entry label follows .align so that the exported symbol itself is
 * aligned and callers do not execute the alignment padding.
 */
PUBLIC _asm_transform_points4_ortho
.align 4
_asm_transform_points4_ortho:
    push esi
    push edi
    mov ecx, [esp + 12]         /* ecx = n */
    mov edi, [esp + 16]         /* edi = d */
    mov edx, [esp + 20]         /* edx = m */
    mov esi, [esp + 24]         /* esi = s */
    test ecx, ecx
    jz _asm_transform_points4_ortho_end
.align 4
_asm_transform_points4_ortho_loop:
    fld S(0)
    fmul M(0, 0)
    fld S(1)
    fmul M(1, 1)
    fld S(2)
    fmul M(2, 2)
    fld S(3)
    fmul M(3, 0)
    fld S(3)
    fmul M(3, 1)
    fld S(3)
    fmul M(3, 2)
/*
 * st(5) = S(0) * M(0, 0)
 * st(4) = S(1) * M(1, 1)
 * st(3) = S(2) * M(2, 2)
 * st(2) = S(3) * M(3, 0)
 * st(1) = S(3) * M(3, 1)
 * st(0) = S(3) * M(3, 2)
 */
    mov eax, S(3)
    lea esi, S(4)
    dec ecx                     /* ZF is consumed by the jnz at the bottom */
    mov D(3), eax
    faddp st(3), st             /* z = S(2)*M(2,2) + S(3)*M(3,2) */
    faddp st(3), st             /* y = S(1)*M(1,1) + S(3)*M(3,1) */
    faddp st(3), st             /* x = S(0)*M(0,0) + S(3)*M(3,0) */
    fstp D(2)
    fstp D(1)
    fstp D(0)
    lea edi, D(4)
    jnz _asm_transform_points4_ortho_loop
_asm_transform_points4_ortho_end:
    pop edi
    pop esi
    ret
/*
 * void asm_transform_points4_perspective( GLuint n, GLfloat d[][4],
 *                                         GLfloat m[16], GLfloat s[][4] );
 *
 * For each of the n points:
 *     d[i][0] = s[i][0]*M(0,0) + s[i][2]*M(2,0)
 *     d[i][1] = s[i][1]*M(1,1) + s[i][2]*M(2,1)
 *     d[i][2] = s[i][2]*M(2,2) + s[i][3]*M(3,2)
 *     d[i][3] = -s[i][2]               (sign bit flipped with an integer xor)
 *
 * esi and edi are saved and restored; eax and ecx are overwritten.  Six
 * x87 registers are used and the x87 stack is empty again on return.
 * Nothing is done when n == 0.
 *
 * The entry label follows .align so that the exported symbol itself is
 * aligned and callers do not execute the alignment padding.
 */
PUBLIC _asm_transform_points4_perspective
.align 4
_asm_transform_points4_perspective:
    push esi
    push edi
    mov ecx, [esp + 12]         /* ecx = n */
    mov edi, [esp + 16]         /* edi = d */
    mov edx, [esp + 20]         /* edx = m */
    mov esi, [esp + 24]         /* esi = s */
    test ecx, ecx
    jz _asm_transform_points4_perspective_end
.align 4
_asm_transform_points4_perspective_loop:
    fld S(0)
    fmul M(0, 0)
    fld S(1)
    fmul M(1, 1)
    fld S(2)
    fmul M(2, 2)
    fld S(2)
    fmul M(2, 0)
    fld S(2)
    fmul M(2, 1)
    fld S(3)
    fmul M(3, 2)
/*
 * st(5) = S(0) * M(0, 0)
 * st(4) = S(1) * M(1, 1)
 * st(3) = S(2) * M(2, 2)
 * st(2) = S(2) * M(2, 0)
 * st(1) = S(2) * M(2, 1)
 * st(0) = S(3) * M(3, 2)
 */
    mov eax, S(2)
    lea esi, S(4)
    xor eax, HEX(80000000)      /* eax = -S(2): toggle the IEEE sign bit */
    dec ecx                     /* after the xor, so ZF reflects the counter */
    faddp st(3), st             /* z = S(2)*M(2,2) + S(3)*M(3,2) */
    faddp st(3), st             /* y = S(1)*M(1,1) + S(2)*M(2,1) */
    faddp st(3), st             /* x = S(0)*M(0,0) + S(2)*M(2,0) */
    fstp D(2)
    fstp D(1)
    fstp D(0)
    mov D(3), eax
    lea edi, D(4)
    jnz _asm_transform_points4_perspective_loop
_asm_transform_points4_perspective_end:
    pop edi
    pop esi
    ret
/*
 * Table for clip test.
 *
 * cliptest (below) assembles a 7-bit index for every vertex and reads the
 * matching clip-flag byte from this table.  The index bits are:
 *
 * bit6 = S(3) < 0
 * bit5 = S(2) < 0
 * bit4 = abs(S(2)) > abs(S(3))
 * bit3 = S(1) < 0
 * bit2 = abs(S(1)) > abs(S(3))
 * bit1 = S(0) < 0
 * bit0 = abs(S(0)) > abs(S(3))
 */
/* Vertex buffer clipping flags (from vb.h) */
/*
 * The "#if 0" branch is disabled; it holds preprocessor macros that
 * describe each entry as a function of the index bits.  The "#else"
 * branch is what gets assembled: 16 rows of 8 bytes = 128 literal
 * entries (presumably the expanded output of those macros).
 */
#if 0
#define CLIP_RIGHT_BIT 0x01
#define CLIP_LEFT_BIT 0x02
#define CLIP_TOP_BIT 0x04
#define CLIP_BOTTOM_BIT 0x08
#define CLIP_NEAR_BIT 0x10
#define CLIP_FAR_BIT 0x20
#define CLIP_USER_BIT 0x40
#define CLIP_ALL_BITS 0x3f
#define MAGN_X(i) (~(((i) & 1) - 1))
#define SIGN_X(i) (~((((i) >> 1) & 1) - 1))
#define MAGN_Y(i) (~((((i) >> 2) & 1) - 1))
#define SIGN_Y(i) (~((((i) >> 3) & 1) - 1))
#define MAGN_Z(i) (~((((i) >> 4) & 1) - 1))
#define SIGN_Z(i) (~((((i) >> 5) & 1) - 1))
#define SIGN_W(i) (~((((i) >> 6) & 1) - 1))
#define CLIP_VALUE(i) \
(CLIP_RIGHT_BIT \
& ((~SIGN_X(i) & SIGN_W(i)) \
| (~SIGN_X(i) & ~SIGN_W(i) & MAGN_X(i)) \
| (SIGN_X(i) & SIGN_W(i) & ~MAGN_X(i)))) \
| (CLIP_LEFT_BIT \
& ((SIGN_X(i) & SIGN_W(i)) \
| (~SIGN_X(i) & SIGN_W(i) & ~MAGN_X(i)) \
| (SIGN_X(i) & ~SIGN_W(i) & MAGN_X(i)))) \
| (CLIP_TOP_BIT \
& ((~SIGN_Y(i) & SIGN_W(i)) \
| (~SIGN_Y(i) & ~SIGN_W(i) & MAGN_Y(i)) \
| (SIGN_Y(i) & SIGN_W(i) & ~MAGN_Y(i)))) \
| (CLIP_BOTTOM_BIT \
& ((SIGN_Y(i) & SIGN_W(i)) \
| (~SIGN_Y(i) & SIGN_W(i) & ~MAGN_Y(i)) \
| (SIGN_Y(i) & ~SIGN_W(i) & MAGN_Y(i)))) \
| (CLIP_FAR_BIT \
& ((~SIGN_Z(i) & SIGN_W(i)) \
| (~SIGN_Z(i) & ~SIGN_W(i) & MAGN_Z(i)) \
| (SIGN_Z(i) & SIGN_W(i) & ~MAGN_Z(i)))) \
| (CLIP_NEAR_BIT \
& ((SIGN_Z(i) & SIGN_W(i)) \
| (~SIGN_Z(i) & SIGN_W(i) & ~MAGN_Z(i)) \
| (SIGN_Z(i) & ~SIGN_W(i) & MAGN_Z(i))))
#define CLIP_VALUE8(i) \
CLIP_VALUE(i + 0), CLIP_VALUE(i + 1), CLIP_VALUE(i + 2), CLIP_VALUE(i + 3), \
CLIP_VALUE(i + 4), CLIP_VALUE(i + 5), CLIP_VALUE(i + 6), CLIP_VALUE(i + 7)
.rodata
clip_table:
.byte CLIP_VALUE8(0x00)
.byte CLIP_VALUE8(0x08)
.byte CLIP_VALUE8(0x10)
.byte CLIP_VALUE8(0x18)
.byte CLIP_VALUE8(0x20)
.byte CLIP_VALUE8(0x28)
.byte CLIP_VALUE8(0x30)
.byte CLIP_VALUE8(0x38)
.byte CLIP_VALUE8(0x40)
.byte CLIP_VALUE8(0x48)
.byte CLIP_VALUE8(0x50)
.byte CLIP_VALUE8(0x58)
.byte CLIP_VALUE8(0x60)
.byte CLIP_VALUE8(0x68)
.byte CLIP_VALUE8(0x70)
.byte CLIP_VALUE8(0x78)
#else
.const
ASSUME NOTHING
/* Each row below covers eight consecutive index values. */
clip_table:
.byte HEX(0), HEX(1), HEX(0), HEX(2), HEX(4), HEX(5), HEX(4), HEX(6)
.byte HEX(0), HEX(1), HEX(0), HEX(2), HEX(8), HEX(9), HEX(8), HEX(a)
.byte HEX(20), HEX(21), HEX(20), HEX(22), HEX(24), HEX(25), HEX(24), HEX(26)
.byte HEX(20), HEX(21), HEX(20), HEX(22), HEX(28), HEX(29), HEX(28), HEX(2a)
.byte HEX(0), HEX(1), HEX(0), HEX(2), HEX(4), HEX(5), HEX(4), HEX(6)
.byte HEX(0), HEX(1), HEX(0), HEX(2), HEX(8), HEX(9), HEX(8), HEX(a)
.byte HEX(10), HEX(11), HEX(10), HEX(12), HEX(14), HEX(15), HEX(14), HEX(16)
.byte HEX(10), HEX(11), HEX(10), HEX(12), HEX(18), HEX(19), HEX(18), HEX(1a)
.byte HEX(3f), HEX(3d), HEX(3f), HEX(3e), HEX(37), HEX(35), HEX(37), HEX(36)
.byte HEX(3f), HEX(3d), HEX(3f), HEX(3e), HEX(3b), HEX(39), HEX(3b), HEX(3a)
.byte HEX(2f), HEX(2d), HEX(2f), HEX(2e), HEX(27), HEX(25), HEX(27), HEX(26)
.byte HEX(2f), HEX(2d), HEX(2f), HEX(2e), HEX(2b), HEX(29), HEX(2b), HEX(2a)
.byte HEX(3f), HEX(3d), HEX(3f), HEX(3e), HEX(37), HEX(35), HEX(37), HEX(36)
.byte HEX(3f), HEX(3d), HEX(3f), HEX(3e), HEX(3b), HEX(39), HEX(3b), HEX(3a)
.byte HEX(1f), HEX(1d), HEX(1f), HEX(1e), HEX(17), HEX(15), HEX(17), HEX(16)
.byte HEX(1f), HEX(1d), HEX(1f), HEX(1e), HEX(1b), HEX(19), HEX(1b), HEX(1a)
#endif
/* Back to code for the routines that follow. */
.code
/*
 * cliptest - accumulate clip flags for an array of 4-float vertices
 *
 * inputs:
 * ecx = # points
 * esi = points
 * edi = clipmask[]
 *
 * inputs/outputs:
 * al = ormask
 * ah = andmask
 *
 * For every vertex a 7-bit index into clip_table is shifted together in
 * edx, one bit per "adc edx, edx", from the carries produced by doubling
 * each component (sign bit) and by unsigned compares of the doubled
 * values (magnitude test).  The table byte is OR-ed into the vertex's
 * clipmask byte and into al, and AND-ed into ah.
 *
 * Register-based helper without a PUBLIC declaration.  ebx and ebp are
 * preserved; ecx, edx, esi and edi are modified.  Returns immediately
 * when ecx == 0.
 */
cliptest:
test ecx, ecx
jz cliptest_end
push ebp
push ebx
.align 4
cliptest_loop:
mov ebp, S(3)
mov ebx, S(2)
xor edx, edx /* start with an empty index */
add ebp, ebp /* ebp = abs(S(3))*2 ; carry = sign of S(3) */
adc edx, edx
add ebx, ebx /* ebx = abs(S(2))*2 ; carry = sign of S(2) */
adc edx, edx
cmp ebp, ebx /* carry = abs(S(2))*2 > abs(S(3))*2 */
adc edx, edx
mov ebx, S(1)
add ebx, ebx /* ebx = abs(S(1))*2 ; carry = sign of S(1) */
adc edx, edx
cmp ebp, ebx /* carry = abs(S(1))*2 > abs(S(3))*2 */
adc edx, edx
mov ebx, S(0)
add ebx, ebx /* ebx = abs(S(0))*2 ; carry = sign of S(0) */
adc edx, edx
cmp ebp, ebx /* carry = abs(S(0))*2 > abs(S(3))*2 */
adc edx, edx
lea esi, S(4) /* next vertex */
mov bl, byte ptr [edi] /* current clipmask byte */
mov dl, byte ptr [clip_table + edx] /* flags for this vertex */
or bl, dl
or al, dl /* ormask |= flags */
and ah, dl /* andmask &= flags */
mov [edi], bl
inc edi
dec ecx
jnz cliptest_loop
pop ebx
pop ebp
cliptest_end:
ret
/*
 * void asm_project_and_cliptest_general( GLuint n, GLfloat d[][4], GLfloat m[16],
 *                                        GLfloat s[][4], GLubyte clipmask[],
 *                                        GLubyte *ormask, GLubyte *andmask );
 *
 * Calls _asm_transform_points4_general(n, d, m, s) and then runs cliptest
 * over the n transformed points in d.  *ormask and *andmask are loaded
 * into al/ah before cliptest and written back afterwards; clipmask[] is
 * updated by cliptest.
 *
 * esi and edi are saved and restored; eax, ecx and edx are overwritten.
 *
 * The entry label follows .align so that the exported symbol itself is
 * aligned and callers do not execute the alignment padding.
 */
PUBLIC _asm_project_and_cliptest_general
.align 4
_asm_project_and_cliptest_general:
    push esi
    push edi
    mov ecx, [esp + 12]         /* ecx = n */
    mov edi, [esp + 16]         /* edi = d */
    mov edx, [esp + 20]         /* edx = m */
    mov esi, [esp + 24]         /* esi = s */
    push esi                    /* arguments pushed right to left */
    push edx
    push edi
    push ecx
    call _asm_transform_points4_general
    add esp, DEC(16)            /* drop the four arguments */
    mov edi, [esp + 32]         /* ormask */
    mov esi, [esp + 36]         /* andmask */
    mov al, [edi]
    mov ah, [esi]
    mov ecx, [esp + 12]         /* ecx = n */
    mov edi, [esp + 28]         /* edi = clipmask */
    mov esi, [esp + 16]         /* esi = d */
    call cliptest
    mov edi, [esp + 32]         /* ormask */
    mov esi, [esp + 36]         /* andmask */
    mov [edi], al
    mov [esi], ah
    pop edi
    pop esi
    ret
/*
 * void asm_project_and_cliptest_identity( GLuint n, GLfloat d[][4],
 *                                         GLfloat s[][4], GLubyte clipmask[],
 *                                         GLubyte *ormask, GLubyte *andmask );
 *
 * Calls _asm_transform_points4_identity(n, d, s) and then runs cliptest
 * over the n points in d.  *ormask and *andmask are loaded into al/ah
 * before cliptest and written back afterwards; clipmask[] is updated by
 * cliptest.  There is no matrix argument, so the stack offsets are four
 * bytes lower than in the sibling wrappers.
 *
 * esi and edi are saved and restored; eax, ecx and edx are overwritten.
 *
 * The entry label follows .align so that the exported symbol itself is
 * aligned and callers do not execute the alignment padding.
 */
PUBLIC _asm_project_and_cliptest_identity
.align 4
_asm_project_and_cliptest_identity:
    push esi
    push edi
    mov ecx, [esp + 12]         /* ecx = n */
    mov edi, [esp + 16]         /* edi = d */
    mov esi, [esp + 20]         /* esi = s */
    push esi                    /* arguments pushed right to left */
    push edi
    push ecx
    call _asm_transform_points4_identity
    add esp, DEC(12)            /* drop the three arguments */
    mov edi, [esp + 28]         /* ormask */
    mov esi, [esp + 32]         /* andmask */
    mov al, [edi]
    mov ah, [esi]
    mov ecx, [esp + 12]         /* ecx = n */
    mov edi, [esp + 24]         /* edi = clipmask */
    mov esi, [esp + 16]         /* esi = d */
    call cliptest
    mov edi, [esp + 28]         /* ormask */
    mov esi, [esp + 32]         /* andmask */
    mov [edi], al
    mov [esi], ah
    pop edi
    pop esi
    ret
/*
 * void asm_project_and_cliptest_ortho( GLuint n, GLfloat d[][4], GLfloat m[16],
 *                                      GLfloat s[][4], GLubyte clipmask[],
 *                                      GLubyte *ormask, GLubyte *andmask );
 *
 * Calls _asm_transform_points4_ortho(n, d, m, s) and then runs cliptest
 * over the n transformed points in d.  *ormask and *andmask are loaded
 * into al/ah before cliptest and written back afterwards; clipmask[] is
 * updated by cliptest.
 *
 * esi and edi are saved and restored; eax, ecx and edx are overwritten.
 *
 * The entry label follows .align so that the exported symbol itself is
 * aligned and callers do not execute the alignment padding.
 */
PUBLIC _asm_project_and_cliptest_ortho
.align 4
_asm_project_and_cliptest_ortho:
    push esi
    push edi
    mov ecx, [esp + 12]         /* ecx = n */
    mov edi, [esp + 16]         /* edi = d */
    mov edx, [esp + 20]         /* edx = m */
    mov esi, [esp + 24]         /* esi = s */
    push esi                    /* arguments pushed right to left */
    push edx
    push edi
    push ecx
    call _asm_transform_points4_ortho
    add esp, DEC(16)            /* drop the four arguments */
    mov edi, [esp + 32]         /* ormask */
    mov esi, [esp + 36]         /* andmask */
    mov al, [edi]
    mov ah, [esi]
    mov ecx, [esp + 12]         /* ecx = n */
    mov edi, [esp + 28]         /* edi = clipmask */
    mov esi, [esp + 16]         /* esi = d */
    call cliptest
    mov edi, [esp + 32]         /* ormask */
    mov esi, [esp + 36]         /* andmask */
    mov [edi], al
    mov [esi], ah
    pop edi
    pop esi
    ret
/*
 * void asm_project_and_cliptest_perspective( GLuint n, GLfloat d[][4], GLfloat m[16],
 *                                            GLfloat s[][4], GLubyte clipmask[],
 *                                            GLubyte *ormask, GLubyte *andmask );
 *
 * Calls _asm_transform_points4_perspective(n, d, m, s) and then runs
 * cliptest over the n transformed points in d.  *ormask and *andmask are
 * loaded into al/ah before cliptest and written back afterwards;
 * clipmask[] is updated by cliptest.
 *
 * esi and edi are saved and restored; eax, ecx and edx are overwritten.
 *
 * The entry label follows .align so that the exported symbol itself is
 * aligned and callers do not execute the alignment padding.
 */
PUBLIC _asm_project_and_cliptest_perspective
.align 4
_asm_project_and_cliptest_perspective:
    push esi
    push edi
    mov ecx, [esp + 12]         /* ecx = n */
    mov edi, [esp + 16]         /* edi = d */
    mov edx, [esp + 20]         /* edx = m */
    mov esi, [esp + 24]         /* esi = s */
    push esi                    /* arguments pushed right to left */
    push edx
    push edi
    push ecx
    call _asm_transform_points4_perspective
    add esp, DEC(16)            /* drop the four arguments */
    mov edi, [esp + 32]         /* ormask */
    mov esi, [esp + 36]         /* andmask */
    mov al, [edi]
    mov ah, [esi]
    mov ecx, [esp + 12]         /* ecx = n */
    mov edi, [esp + 28]         /* edi = clipmask */
    mov esi, [esp + 16]         /* esi = d */
    call cliptest
    mov edi, [esp + 32]         /* ormask */
    mov esi, [esp + 36]         /* andmask */
    mov byte ptr [edi], al
    mov byte ptr [esi], ah
    pop edi
    pop esi
    ret
/*
 * unsigned int inverse_nofp( float f );
 *
 * Calculate the inverse of a float without using the FPU.
 * This function returns a float in eax, so its return
 * type should be 'int' when called from C (and converted
 * to float with pointer/union abuse).
 *
 * The argument is read as raw IEEE single-precision bits from [esp + 4].
 * eax, ecx and edx are overwritten; no other register is touched.
 * A zero exponent field is not special-cased: the mantissa is always
 * given the implicit leading one, so the division cannot fault.
 */
.align 4
inverse_nofp:
/* get mantissa in ecx */
mov ecx, [esp + 4]
and ecx, HEX(7fffff)
/* set implicit integer */
or ecx, HEX(800000)
/* div 0x10000:0x00000000 by mantissa (divisor >= 0x800000, so the quotient fits in eax) */
xor eax, eax
mov edx, HEX(10000)
div ecx
/* round result: drop the lowest bit and add it back in as a carry */
shr eax, DEC(1)
adc eax, DEC(0)
/* get exponent in ecx */
mov ecx, HEX(7f800000)
mov edx, [esp + 4]
and ecx, edx
/* negate exponent and decrement it: edx = (253 << 23) - exponent field */
mov edx, HEX(7E800000)
sub edx, ecx
/* if bit 24 is set, shift and adjust exponent */
test eax, HEX(1000000)
jz inverse_nofp_combine
shr eax, HEX(1)
add edx, HEX(800000)
/* combine mantissa and exponent, then set sign */
inverse_nofp_combine:
and eax, HEX(7fffff) /* strip the implicit leading one */
mov ecx, [esp + 4]
or eax, edx
and ecx, HEX(80000000) /* keep only the sign of the input */
or eax, ecx
ret
/*
 * void gl_xform_normals_3fv( GLuint n, GLfloat d[][3], GLfloat m[16],
 *                            GLfloat s[][3], GLboolean normalize );
 *
 * Both arrays are walked with a stride of three floats (12 bytes).
 *
 * Pass 1, for each of the n normals and r = 0..2:
 *     d[i][r] = s[i][0]*M(r,0) + s[i][1]*M(r,1) + s[i][2]*M(r,2)
 *
 * Pass 2, only when normalize is non-zero: every d[i] is scaled by the
 * reciprocal of its length.  The length comes from fsqrt; the reciprocal
 * is computed by inverse_nofp with integer instructions while the fsqrt
 * for the following normal is still in flight.
 *
 * normalize is tested as a single byte: GLboolean is an 8-bit type and
 * the upper 24 bits of its stack slot are not guaranteed to be cleared
 * by every caller.
 *
 * esi and edi are saved and restored; eax, ecx and edx are overwritten.
 * Up to six x87 registers are used and the x87 stack is empty again on
 * return.  Nothing is done when n == 0.
 *
 * The entry label follows .align so that the exported symbol itself is
 * aligned and callers do not execute the alignment padding.
 */
PUBLIC _gl_xform_normals_3fv
.align 4
_gl_xform_normals_3fv:
    push esi
    push edi
    mov ecx, [esp + 12]         /* ecx = n */
    mov edi, [esp + 16]         /* edi = d */
    mov edx, [esp + 20]         /* edx = m */
    mov esi, [esp + 24]         /* esi = s */
    test ecx, ecx
    jz _gl_xform_normals_3fv_end
.align 4
_gl_xform_normals_3fv_loop:
    fld S(0)
    fmul M(0, 0)
    fld S(0)
    fmul M(1, 0)
    fld S(0)
    fmul M(2, 0)
    fld S(1)
    fmul M(0, 1)
    fld S(1)
    fmul M(1, 1)
    fld S(1)
    fmul M(2, 1)
/*
 * st(5) = S(0) * M(0, 0)
 * st(4) = S(0) * M(1, 0)
 * st(3) = S(0) * M(2, 0)
 * st(2) = S(1) * M(0, 1)
 * st(1) = S(1) * M(1, 1)
 * st(0) = S(1) * M(2, 1)
 */
    fxch st(2)                  /* 2 1 0 3 4 5 */
    faddp st(5), st             /* 1 0 3 4 5 */
    faddp st(3), st             /* 0 3 4 5 */
    faddp st(1), st             /* 3 4 5 */
/*
 * st(2) = S(0) * M(0, 0) + S(1) * M(0, 1)
 * st(1) = S(0) * M(1, 0) + S(1) * M(1, 1)
 * st(0) = S(0) * M(2, 0) + S(1) * M(2, 1)
 */
    fld S(2)
    fmul M(0, 2)
    fld S(2)
    fmul M(1, 2)
    fld S(2)
    fmul M(2, 2)
/*
 * st(5) = S(0) * M(0, 0) + S(1) * M(0, 1)
 * st(4) = S(0) * M(1, 0) + S(1) * M(1, 1)
 * st(3) = S(0) * M(2, 0) + S(1) * M(2, 1)
 * st(2) = S(2) * M(0, 2)
 * st(1) = S(2) * M(1, 2)
 * st(0) = S(2) * M(2, 2)
 */
    fxch st(2)                  /* 2 1 0 3 4 5 */
    faddp st(5), st             /* 1 0 3 4 5 */
    faddp st(3), st             /* 0 3 4 5 */
    faddp st(1), st             /* 3 4 5 */
/*
 * st(2) = S(0) * M(0, 0) + S(1) * M(0, 1) + S(2) * M(0, 2)
 * st(1) = S(0) * M(1, 0) + S(1) * M(1, 1) + S(2) * M(1, 2)
 * st(0) = S(0) * M(2, 0) + S(1) * M(2, 1) + S(2) * M(2, 2)
 */
    fxch st(2)                  /* 2 1 0 */
    fstp D(0)                   /* 1 0 */
    fstp D(1)                   /* 0 */
    fstp D(2)                   /* */
    lea esi, S(3)               /* 3-float stride */
    dec ecx
    lea edi, D(3)               /* lea keeps the flags set by dec */
    jnz _gl_xform_normals_3fv_loop
/*
 * Skip normalize if it isn't needed
 */
    cmp byte ptr [esp + 28], DEC(0) /* GLboolean: look at the low byte only */
    jz _gl_xform_normals_3fv_end
/* Normalize required */
    mov esi, [esp + 12]         /* esi = n */
    mov edi, [esp + 16]         /* edi = d */
    sub esp, DEC(4)             /* temp var for 1.0 / len */
/*
 * [esp] = length of first normal
 */
    fld D(0)
    fmul D(0)
    fld D(1)
    fmul D(1)
    fld D(2)
    fmul D(2)
    fxch st(2)
    faddp st(1), st
    faddp st(1), st
    fsqrt
    fstp dword ptr [esp]
    jmp _gl_xform_normals_3fv_loop2_end
.align 4
_gl_xform_normals_3fv_loop2:
/* st(0) = length of next normal */
    fld D(3)
    fmul D(3)
    fld D(4)
    fmul D(4)
    fld D(5)
    fmul D(5)
    fxch st(2)
    faddp st(1), st
    faddp st(1), st
    fsqrt
/*
 * inverse the length of the current normal, which is
 * already at [esp]. This should overlap the prev
 * fsqrt nicely.
 *
 * The call pushes a return address, so inverse_nofp finds the length
 * at its own [esp + 4], exactly where it expects its argument.
 */
    call inverse_nofp
    mov [esp], eax
/* multiply normal by 1/len */
    fld D(0)
    fmul dword ptr [esp]
    fld D(1)
    fmul dword ptr [esp]
    fld D(2)
    fmul dword ptr [esp]
    fxch st(3)                  /* bring the next normal's length to the top */
    fstp dword ptr [esp]        /* store length of next normal */
    fstp D(1)
    fstp D(0)
    fstp D(2)
    lea edi, D(3)
_gl_xform_normals_3fv_loop2_end:
    dec esi                     /* loop runs n - 1 times; the last normal is done below */
    jnz _gl_xform_normals_3fv_loop2
/* finish up the last normal */
    call inverse_nofp
    mov [esp], eax
    fld D(0)
    fmul dword ptr [esp]
    fld D(1)
    fmul dword ptr [esp]
    fld D(2)
    fmul dword ptr [esp]
    fxch st(2)
    fstp D(0)
    fstp D(1)
    fstp D(2)
    add esp, DEC(4)             /* release the temp var */
_gl_xform_normals_3fv_end:
    pop edi
    pop esi
    ret
END