/* $Id: asm-386.S,v 1.8 1997/12/17 00:50:51 brianp Exp $ */

/*
 * asm-386.S - special (hopefully faster) transformation functions for x86
 *
 * by Josh Vanderhoof
 *
 * This file is in the public domain.
 */

/*
 * $Log: asm-386.S,v $
 * Revision 1.8  1997/12/17 00:50:51  brianp
 * applied Josh's patch to fix texture coordinate transformation bugs
 *
 * Revision 1.7  1997/12/17 00:27:11  brianp
 * applied Josh's patch to fix bfris
 *
 * Revision 1.6  1997/12/01 01:02:41  brianp
 * added FreeBSD patches (Daniel J. O'Connor)
 *
 * Revision 1.5  1997/11/19 23:52:17  brianp
 * added missing "cld" instruction in asm_transform_points4_identity()
 *
 * Revision 1.4  1997/11/11 02:22:41  brianp
 * small change per Josh to ensure U/V pairing
 *
 * Revision 1.3  1997/11/07 03:37:24  brianp
 * added missing line from Stephane Rehel
 *
 * Revision 1.2  1997/11/07 03:30:37  brianp
 * added Josh's 11-5-97 patches
 *
 * Revision 1.1  1997/10/30 06:00:33  brianp
 * Initial revision
 */

/*
 * Conventions used throughout this file
 *
 *   - Intel operand order (op dst, src), one instruction per line.
 *   - Every exported routine takes its arguments on the stack and pushes
 *     esi and edi first, so the first argument lives at [esp + 12].
 *   - esi = source array, edi = destination array, edx = matrix,
 *     ecx = number of elements still to process.
 *   - The loops rely on EFLAGS surviving the x87, mov and lea instructions
 *     placed between "dec/sub ecx" and the closing "jnz"; do not insert
 *     flag-modifying instructions there.
 *   - Alignment directives are placed BEFORE the label they align so that
 *     the symbol itself is aligned and no padding is executed on entry.
 *
 * NOTE(review): the include target was missing in the copy this file was
 * restored from.  <asm.inc> is assumed because the code relies on the
 * PUBLIC, HEX(), DEC(), .code and END helpers, none of which are defined
 * here.  Confirm against the build tree.
 */
#include <asm.inc>

/* S(x) / D(x): x-th float of the current source / destination element.   */
/* M(x, y):     matrix float at byte offset 16*x + 4*y from edx.           */
#define S(x) dword ptr [esi + 4*x]
#define D(x) dword ptr [edi + 4*x]
#define M(x, y) dword ptr [edx + 16*x + 4*y]

.code

/*
 * void asm_transform_points3_general( GLuint n, GLfloat d[][4],
 *                                     GLfloat m[16], GLfloat s[][4] );
 *
 * For each of the n points computes, for j = 0..3:
 *     d[j] = s[0]*M(0,j) + s[1]*M(1,j) + s[2]*M(2,j) + M(3,j)
 * s[3] is never read.
 *
 * Preserves esi, edi.  Clobbers ecx and flags; uses up to eight x87
 * stack slots and pops everything it pushed.
 */
.align 4
PUBLIC _asm_transform_points3_general
_asm_transform_points3_general:
        push esi
        push edi

        mov ecx, [esp + 12]             /* ecx = n */
        mov edi, [esp + 16]             /* edi = d */
        mov edx, [esp + 20]             /* edx = m */
        mov esi, [esp + 24]             /* esi = s */

        test ecx, ecx
        jz _asm_transform_points3_general_end

.align 4
_asm_transform_points3_general_loop:
        fld S(0)
        fmul M(0, 0)
        fld S(0)
        fmul M(0, 1)
        fld S(0)
        fmul M(0, 2)
        fld S(0)
        fmul M(0, 3)

        fld S(1)
        fmul M(1, 0)
        fld S(1)
        fmul M(1, 1)
        fld S(1)
        fmul M(1, 2)
        fld S(1)
        fmul M(1, 3)

        /*
         * The FPU stack should now look like this:
         *
         * st(7) = S(0) * M(0, 0)
         * st(6) = S(0) * M(0, 1)
         * st(5) = S(0) * M(0, 2)
         * st(4) = S(0) * M(0, 3)
         * st(3) = S(1) * M(1, 0)
         * st(2) = S(1) * M(1, 1)
         * st(1) = S(1) * M(1, 2)
         * st(0) = S(1) * M(1, 3)
         *
         * The digit lists on the right give, for each current stack slot
         * starting at st(0), the slot number the value held above.
         */

        fxch st(3)                      /* 3 1 2 0 4 5 6 7 */
        faddp st(7), st                 /* 1 2 0 4 5 6 7 */
        fxch st(1)                      /* 2 1 0 4 5 6 7 */
        faddp st(5), st                 /* 1 0 4 5 6 7 */
        faddp st(3), st                 /* 0 4 5 6 7 */
        faddp st(1), st                 /* 4 5 6 7 */

        /*
         * st(3) = S(0) * M(0, 0) + S(1) * M(1, 0)
         * st(2) = S(0) * M(0, 1) + S(1) * M(1, 1)
         * st(1) = S(0) * M(0, 2) + S(1) * M(1, 2)
         * st(0) = S(0) * M(0, 3) + S(1) * M(1, 3)
         */

        fld S(2)
        fmul M(2, 0)
        fld S(2)
        fmul M(2, 1)
        fld S(2)
        fmul M(2, 2)
        fld S(2)
        fmul M(2, 3)

        /*
         * st(7) = S(0) * M(0, 0) + S(1) * M(1, 0)
         * st(6) = S(0) * M(0, 1) + S(1) * M(1, 1)
         * st(5) = S(0) * M(0, 2) + S(1) * M(1, 2)
         * st(4) = S(0) * M(0, 3) + S(1) * M(1, 3)
         * st(3) = S(2) * M(2, 0)
         * st(2) = S(2) * M(2, 1)
         * st(1) = S(2) * M(2, 2)
         * st(0) = S(2) * M(2, 3)
         */

        fxch st(3)                      /* 3 1 2 0 4 5 6 7 */
        faddp st(7), st                 /* 1 2 0 4 5 6 7 */
        fxch st(1)                      /* 2 1 0 4 5 6 7 */
        faddp st(5), st                 /* 1 0 4 5 6 7 */
        faddp st(3), st                 /* 0 4 5 6 7 */
        faddp st(1), st                 /* 4 5 6 7 */

        /*
         * st(3) = S(0) * M(0, 0) + S(1) * M(1, 0) + S(2) * M(2, 0)
         * st(2) = S(0) * M(0, 1) + S(1) * M(1, 1) + S(2) * M(2, 1)
         * st(1) = S(0) * M(0, 2) + S(1) * M(1, 2) + S(2) * M(2, 2)
         * st(0) = S(0) * M(0, 3) + S(1) * M(1, 3) + S(2) * M(2, 3)
         */

        fxch st(3)                      /* 3 1 2 0 */
        fadd M(3, 0)
        fxch st(2)                      /* 2 1 3 0 */
        fadd M(3, 1)
        fxch st(1)                      /* 1 2 3 0 */
        fadd M(3, 2)
        fxch st(3)                      /* 0 2 3 1 */
        fadd M(3, 3)

        /*
         * st(3) = S(0) * M(0, 2) + S(1) * M(1, 2) + S(2) * M(2, 2) + M(3, 2)
         * st(2) = S(0) * M(0, 0) + S(1) * M(1, 0) + S(2) * M(2, 0) + M(3, 0)
         * st(1) = S(0) * M(0, 1) + S(1) * M(1, 1) + S(2) * M(2, 1) + M(3, 1)
         * st(0) = S(0) * M(0, 3) + S(1) * M(1, 3) + S(2) * M(2, 3) + M(3, 3)
         */

        fxch st(3)                      /* 3 1 2 0 */
        fstp D(2)                       /* 1 2 0 */
        fxch st(1)                      /* 2 1 0 */
        fstp D(0)                       /* 1 0 */
        lea esi, S(4)                   /* next source point */
        fstp D(1)                       /* 0 */
        dec ecx                         /* flags consumed by jnz below */
        fstp D(3)                       /* */
        lea edi, D(4)                   /* next destination point */

        jnz _asm_transform_points3_general_loop

_asm_transform_points3_general_end:
        pop edi
        pop esi
        ret

/*
 * void asm_transform_points3_identity( GLuint n, GLfloat d[][4],
 *                                      GLfloat s[][4] );
 *
 * Copies s[0..2] to d[0..2] for each point and stores the bit pattern
 * 0x3f800000 (1.0f) into d[3].  The copy is done with integer moves.
 *
 * Preserves esi, edi, ebx, ebp.  Clobbers eax, ecx, edx and flags.
 */
.align 4
PUBLIC _asm_transform_points3_identity
_asm_transform_points3_identity:
        push esi
        push edi

        mov ecx, [esp + 12]             /* ecx = n */
        mov edi, [esp + 16]             /* edi = d */
        mov esi, [esp + 20]             /* esi = s */

        push ebx
        push ebp

        test ecx, ecx
        jz _asm_transform_points3_identity_end

        mov ebp, HEX(3f800000)          /* ebp = 1.0f */

.align 4
_asm_transform_points3_identity_loop:
        mov eax, S(0)
        mov edx, S(1)
        mov ebx, S(2)
        lea esi, S(4)
        mov D(0), eax
        mov D(1), edx
        mov D(2), ebx
        mov D(3), ebp
        dec ecx
        lea edi, D(4)                   /* lea leaves the dec flags intact */

        jnz _asm_transform_points3_identity_loop

_asm_transform_points3_identity_end:
        pop ebp
        pop ebx
        pop edi
        pop esi
        ret

/*
 * void asm_transform_points3_2d( GLuint n, GLfloat d[][4], GLfloat m[16],
 *                                GLfloat s[][4] );
 *
 * For each point:
 *     d[0] = s[0]*M(0,0) + s[1]*M(1,0) + M(3,0)
 *     d[1] = s[0]*M(0,1) + s[1]*M(1,1) + M(3,1)
 *     d[2] = s[2]
 *     d[3] = 1.0f
 *
 * If n is odd a single point is handled first; the main loop then
 * processes two points per iteration.
 *
 * Preserves esi, edi, ebp.  Clobbers eax, ecx and flags.
 */
.align 4
PUBLIC _asm_transform_points3_2d
_asm_transform_points3_2d:
        push esi
        push edi

        mov ecx, [esp + 12]             /* ecx = n */
        mov edi, [esp + 16]             /* edi = d */
        mov edx, [esp + 20]             /* edx = m */
        mov esi, [esp + 24]             /* esi = s */

        push ebp

        mov ebp, HEX(3f800000)          /* ebp = 1.0f */

        test cl, DEC(1)                 /* odd count? */
        jz _asm_transform_points3_2d_step

        /* Peel off one point so the remaining count is even. */
        dec ecx

        fld S(0)
        fmul M(0, 0)
        fld S(0)
        fmul M(0, 1)
        fld S(1)
        fmul M(1, 0)
        fld S(1)
        fmul M(1, 1)

        /*
         * st(3) = S(0) * M(0, 0)
         * st(2) = S(0) * M(0, 1)
         * st(1) = S(1) * M(1, 0)
         * st(0) = S(1) * M(1, 1)
         */

        fxch st(1)                      /* 1 0 2 3 */
        fadd M(3, 0)
        fxch st(1)                      /* 0 1 2 3 */
        fadd M(3, 1)
        fxch st(1)                      /* 1 0 2 3 */
        faddp st(3), st                 /* 0 2 3 */
        faddp st(1), st                 /* 2 3 */

        fstp D(1)                       /* 3 */
        fstp D(0)                       /* */

        mov eax, S(2)
        lea esi, S(4)
        mov D(3), ebp
        mov D(2), eax
        lea edi, D(4)

_asm_transform_points3_2d_step:
        test ecx, ecx
        jz _asm_transform_points3_2d_end

.align 4
_asm_transform_points3_2d_loop:
        /* Two points per iteration: S(0..3) and S(4..7). */
        fld S(0)
        fmul M(0, 0)
        fld S(0)
        fmul M(0, 1)
        fld S(4)
        fmul M(0, 0)
        fld S(4)
        fmul M(0, 1)

        fld S(1)
        fmul M(1, 0)
        fld S(1)
        fmul M(1, 1)
        fld S(5)
        fmul M(1, 0)
        fld S(5)
        fmul M(1, 1)

        /*
         * st(7) = S(0) * M(0, 0)
         * st(6) = S(0) * M(0, 1)
         * st(5) = S(4) * M(0, 0)
         * st(4) = S(4) * M(0, 1)
         * st(3) = S(1) * M(1, 0)
         * st(2) = S(1) * M(1, 1)
         * st(1) = S(5) * M(1, 0)
         * st(0) = S(5) * M(1, 1)
         */

        fxch st(7)                      /* 7 1 2 3 4 5 6 0 */
        fadd M(3, 0)
        fxch st(6)                      /* 6 1 2 3 4 5 7 0 */
        fadd M(3, 1)
        fxch st(5)                      /* 5 1 2 3 4 6 7 0 */
        fadd M(3, 0)
        fxch st(4)                      /* 4 1 2 3 5 6 7 0 */
        fadd M(3, 1)

        /* Integer work interleaved with the pending FP adds. */
        mov eax, S(2)
        mov D(3), ebp
        mov D(2), eax
        mov eax, S(6)
        mov D(7), ebp
        mov D(6), eax
        lea esi, S(8)
        sub ecx, DEC(2)                 /* flags consumed by jnz below */

        /*
         * st(7) = S(5) * M(1, 1)
         * st(6) = S(0) * M(0, 0) + M(3, 0)
         * st(5) = S(0) * M(0, 1) + M(3, 1)
         * st(4) = S(4) * M(0, 0) + M(3, 0)
         * st(3) = S(1) * M(1, 0)
         * st(2) = S(1) * M(1, 1)
         * st(1) = S(5) * M(1, 0)
         * st(0) = S(4) * M(0, 1) + M(3, 1)
         */

        faddp st(7), st                 /* 1 2 3 4 5 6 7 */
        faddp st(3), st                 /* 2 3 4 5 6 7 */
        faddp st(3), st                 /* 3 4 5 6 7 */
        faddp st(3), st                 /* 4 5 6 7 */

        fxch st(3)                      /* 7 5 6 4 */
        fstp D(5)                       /* 5 6 4 */
        fstp D(1)                       /* 6 4 */
        fstp D(0)                       /* 4 */
        fstp D(4)                       /* */
        lea edi, D(8)

        jnz _asm_transform_points3_2d_loop

_asm_transform_points3_2d_end:
        pop ebp
        pop edi
        pop esi
        ret

/*
 * void asm_transform_points3_2d_no_rot( GLuint n, GLfloat d[][4],
 *                                       GLfloat m[16], GLfloat s[][4] );
 *
 * For each point:
 *     d[0] = s[0]*M(0,0) + M(3,0)
 *     d[1] = s[1]*M(1,1) + M(3,1)
 *     d[2] = s[2]
 *     d[3] = 1.0f
 *
 * Preserves esi, edi, ebp.  Clobbers eax, ecx and flags.
 */
.align 4
PUBLIC _asm_transform_points3_2d_no_rot
_asm_transform_points3_2d_no_rot:
        push esi
        push edi

        mov ecx, [esp + 12]             /* ecx = n */
        mov edi, [esp + 16]             /* edi = d */
        mov edx, [esp + 20]             /* edx = m */
        mov esi, [esp + 24]             /* esi = s */

        push ebp

        test ecx, ecx
        jz _asm_transform_points3_2d_no_rot_end

        mov ebp, HEX(3f800000)          /* ebp = 1.0f */

.align 4
_asm_transform_points3_2d_no_rot_loop:
        fld S(0)
        fmul M(0, 0)
        fld S(1)
        fmul M(1, 1)

        fxch st(1)
        fadd M(3, 0)
        fxch st(1)
        fadd M(3, 1)

        fxch st(1)
        fstp D(0)
        fstp D(1)

        mov eax, S(2)                   /* cycle 1: U pipe */
        mov D(3), ebp                   /*          V pipe */
        mov D(2), eax                   /* cycle 2: U pipe */
        dec ecx
        lea esi, S(4)
        lea edi, D(4)

        jnz _asm_transform_points3_2d_no_rot_loop

_asm_transform_points3_2d_no_rot_end:
        pop ebp
        pop edi
        pop esi
        ret

/*
 * void asm_transform_points3_3d( GLuint n, GLfloat d[][4], GLfloat m[16],
 *                                GLfloat s[][4] );
 *
 * For each point, for j = 0..2:
 *     d[j] = s[0]*M(0,j) + s[1]*M(1,j) + s[2]*M(2,j) + M(3,j)
 * and d[3] = 1.0f.
 *
 * Preserves esi, edi.  Clobbers eax, ecx and flags.
 */
.align 4
PUBLIC _asm_transform_points3_3d
_asm_transform_points3_3d:
        push esi
        push edi

        mov ecx, [esp + 12]             /* ecx = n */
        mov edi, [esp + 16]             /* edi = d */
        mov edx, [esp + 20]             /* edx = m */
        mov esi, [esp + 24]             /* esi = s */

        test ecx, ecx
        jz _asm_transform_points3_3d_end

        mov eax, HEX(3f800000)          /* eax = 1.0f */

.align 4
_asm_transform_points3_3d_loop:
        fld S(0)
        fmul M(0, 0)
        fld S(0)
        fmul M(0, 1)
        fld S(0)
        fmul M(0, 2)

        fld S(1)
        fmul M(1, 0)
        fld S(1)
        fmul M(1, 1)
        fld S(1)
        fmul M(1, 2)

        /*
         * st(5) = S(0) * M(0, 0)
         * st(4) = S(0) * M(0, 1)
         * st(3) = S(0) * M(0, 2)
         * st(2) = S(1) * M(1, 0)
         * st(1) = S(1) * M(1, 1)
         * st(0) = S(1) * M(1, 2)
         */

        fxch st(2)                      /* 2 1 0 3 4 5 */
        faddp st(5), st                 /* 1 0 3 4 5 */
        faddp st(3), st                 /* 0 3 4 5 */
        faddp st(1), st                 /* 3 4 5 */

        /*
         * st(2) = S(0) * M(0, 0) + S(1) * M(1, 0)
         * st(1) = S(0) * M(0, 1) + S(1) * M(1, 1)
         * st(0) = S(0) * M(0, 2) + S(1) * M(1, 2)
         */

        fld S(2)
        fmul M(2, 0)
        fld S(2)
        fmul M(2, 1)
        fld S(2)
        fmul M(2, 2)

        /*
         * st(5) = S(0) * M(0, 0) + S(1) * M(1, 0)
         * st(4) = S(0) * M(0, 1) + S(1) * M(1, 1)
         * st(3) = S(0) * M(0, 2) + S(1) * M(1, 2)
         * st(2) = S(2) * M(2, 0)
         * st(1) = S(2) * M(2, 1)
         * st(0) = S(2) * M(2, 2)
         */

        fxch st(2)                      /* 2 1 0 3 4 5 */
        faddp st(5), st                 /* 1 0 3 4 5 */
        faddp st(3), st                 /* 0 3 4 5 */
        faddp st(1), st                 /* 3 4 5 */

        /*
         * st(2) = S(0) * M(0, 0) + S(1) * M(1, 0) + S(2) * M(2, 0)
         * st(1) = S(0) * M(0, 1) + S(1) * M(1, 1) + S(2) * M(2, 1)
         * st(0) = S(0) * M(0, 2) + S(1) * M(1, 2) + S(2) * M(2, 2)
         */

        fxch st(2)                      /* 2 1 0 */
        fadd M(3, 0)
        fxch st(1)                      /* 1 2 0 */
        fadd M(3, 1)
        fxch st(2)                      /* 0 2 1 */
        fadd M(3, 2)

        fxch st(1)                      /* 2 0 1 */
        fstp D(0)                       /* 0 1 */
        fstp D(2)                       /* 1 */
        fstp D(1)                       /* */
        mov D(3), eax

        lea esi, S(4)
        dec ecx
        lea edi, D(4)

        jnz _asm_transform_points3_3d_loop

_asm_transform_points3_3d_end:
        pop edi
        pop esi
        ret

/*
 * void asm_transform_points4_general( GLuint n, GLfloat d[][4],
 *                                     GLfloat m[16], GLfloat s[][4] );
 *
 * For each point, for j = 0..3:
 *     d[j] = s[0]*M(0,j) + s[1]*M(1,j) + s[2]*M(2,j) + s[3]*M(3,j)
 *
 * Preserves esi, edi.  Clobbers ecx and flags; uses up to eight x87
 * stack slots.
 */
.align 4
PUBLIC _asm_transform_points4_general
_asm_transform_points4_general:
        push esi
        push edi

        mov ecx, [esp + 12]             /* ecx = n */
        mov edi, [esp + 16]             /* edi = d */
        mov edx, [esp + 20]             /* edx = m */
        mov esi, [esp + 24]             /* esi = s */

        test ecx, ecx
        jz _asm_transform_points4_general_end

.align 4
_asm_transform_points4_general_loop:
        fld S(0)
        fmul M(0, 0)
        fld S(0)
        fmul M(0, 1)
        fld S(0)
        fmul M(0, 2)
        fld S(0)
        fmul M(0, 3)

        fld S(1)
        fmul M(1, 0)
        fld S(1)
        fmul M(1, 1)
        fld S(1)
        fmul M(1, 2)
        fld S(1)
        fmul M(1, 3)

        /*
         * st(7) = S(0) * M(0, 0)
         * st(6) = S(0) * M(0, 1)
         * st(5) = S(0) * M(0, 2)
         * st(4) = S(0) * M(0, 3)
         * st(3) = S(1) * M(1, 0)
         * st(2) = S(1) * M(1, 1)
         * st(1) = S(1) * M(1, 2)
         * st(0) = S(1) * M(1, 3)
         */

        fxch st(3)                      /* 3 1 2 0 4 5 6 7 */
        faddp st(7), st                 /* 1 2 0 4 5 6 7 */
        fxch st(1)                      /* 2 1 0 4 5 6 7 */
        faddp st(5), st                 /* 1 0 4 5 6 7 */
        faddp st(3), st                 /* 0 4 5 6 7 */
        faddp st(1), st                 /* 4 5 6 7 */

        /*
         * st(3) = S(0) * M(0, 0) + S(1) * M(1, 0)
         * st(2) = S(0) * M(0, 1) + S(1) * M(1, 1)
         * st(1) = S(0) * M(0, 2) + S(1) * M(1, 2)
         * st(0) = S(0) * M(0, 3) + S(1) * M(1, 3)
         */

        fld S(2)
        fmul M(2, 0)
        fld S(2)
        fmul M(2, 1)
        fld S(2)
        fmul M(2, 2)
        fld S(2)
        fmul M(2, 3)

        /*
         * st(7) = S(0) * M(0, 0) + S(1) * M(1, 0)
         * st(6) = S(0) * M(0, 1) + S(1) * M(1, 1)
         * st(5) = S(0) * M(0, 2) + S(1) * M(1, 2)
         * st(4) = S(0) * M(0, 3) + S(1) * M(1, 3)
         * st(3) = S(2) * M(2, 0)
         * st(2) = S(2) * M(2, 1)
         * st(1) = S(2) * M(2, 2)
         * st(0) = S(2) * M(2, 3)
         */

        fxch st(3)                      /* 3 1 2 0 4 5 6 7 */
        faddp st(7), st                 /* 1 2 0 4 5 6 7 */
        fxch st(1)                      /* 2 1 0 4 5 6 7 */
        faddp st(5), st                 /* 1 0 4 5 6 7 */
        faddp st(3), st                 /* 0 4 5 6 7 */
        faddp st(1), st                 /* 4 5 6 7 */

        /*
         * st(3) = S(0) * M(0, 0) + S(1) * M(1, 0) + S(2) * M(2, 0)
         * st(2) = S(0) * M(0, 1) + S(1) * M(1, 1) + S(2) * M(2, 1)
         * st(1) = S(0) * M(0, 2) + S(1) * M(1, 2) + S(2) * M(2, 2)
         * st(0) = S(0) * M(0, 3) + S(1) * M(1, 3) + S(2) * M(2, 3)
         */

        fld S(3)
        fmul M(3, 0)
        fld S(3)
        fmul M(3, 1)
        fld S(3)
        fmul M(3, 2)
        fld S(3)
        fmul M(3, 3)

        /*
         * st(7) = S(0) * M(0, 0) + S(1) * M(1, 0) + S(2) * M(2, 0)
         * st(6) = S(0) * M(0, 1) + S(1) * M(1, 1) + S(2) * M(2, 1)
         * st(5) = S(0) * M(0, 2) + S(1) * M(1, 2) + S(2) * M(2, 2)
         * st(4) = S(0) * M(0, 3) + S(1) * M(1, 3) + S(2) * M(2, 3)
         * st(3) = S(3) * M(3, 0)
         * st(2) = S(3) * M(3, 1)
         * st(1) = S(3) * M(3, 2)
         * st(0) = S(3) * M(3, 3)
         */

        fxch st(3)                      /* 3 1 2 0 4 5 6 7 */
        faddp st(7), st                 /* 1 2 0 4 5 6 7 */
        fxch st(1)                      /* 2 1 0 4 5 6 7 */
        faddp st(5), st                 /* 1 0 4 5 6 7 */
        faddp st(3), st                 /* 0 4 5 6 7 */
        lea esi, S(4)
        dec ecx                         /* flags consumed by jnz below */
        faddp st(1), st                 /* 4 5 6 7 */

        /*
         * st(3) = S(0) * M(0, 0) + S(1) * M(1, 0) + S(2) * M(2, 0) + S(3) * M(3, 0)
         * st(2) = S(0) * M(0, 1) + S(1) * M(1, 1) + S(2) * M(2, 1) + S(3) * M(3, 1)
         * st(1) = S(0) * M(0, 2) + S(1) * M(1, 2) + S(2) * M(2, 2) + S(3) * M(3, 2)
         * st(0) = S(0) * M(0, 3) + S(1) * M(1, 3) + S(2) * M(2, 3) + S(3) * M(3, 3)
         */

        fxch st(3)                      /* 3 1 2 0 */
        fstp D(0)                       /* 1 2 0 */
        fxch st(1)                      /* 2 1 0 */
        fstp D(1)                       /* 1 0 */
        fstp D(2)                       /* 0 */
        fstp D(3)                       /* */
        lea edi, D(4)

        jnz _asm_transform_points4_general_loop

_asm_transform_points4_general_end:
        pop edi
        pop esi
        ret

/*
 * void asm_transform_points4_identity( GLuint n, GLfloat d[][4],
 *                                      GLfloat s[][4] );
 *
 * Copies n * 4 dwords from s to d with a forward "rep movsd".
 *
 * Preserves esi, edi.  Clobbers ecx; clears the direction flag.
 */
.align 4
PUBLIC _asm_transform_points4_identity
_asm_transform_points4_identity:
        push esi
        push edi

        mov ecx, [esp + 12]             /* ecx = n */
        mov edi, [esp + 16]             /* edi = d */
        mov esi, [esp + 20]             /* esi = s */

        lea ecx, [ecx * 4]              /* ecx = number of dwords */

        cld                             /* copy upwards */
        rep movsd

        pop edi
        pop esi
        ret

/*
 * void asm_transform_points4_2d( GLuint n, GLfloat d[][4], GLfloat m[16],
 *                                GLfloat s[][4] );
 *
 * For each point:
 *     d[0] = s[0]*M(0,0) + s[1]*M(1,0) + s[3]*M(3,0)
 *     d[1] = s[0]*M(0,1) + s[1]*M(1,1) + s[3]*M(3,1)
 *     d[2] = s[2]
 *     d[3] = s[3]
 *
 * ebx is saved only on the n != 0 path and restored before the shared
 * exit label, so the stack stays balanced on both paths.
 *
 * Preserves esi, edi, ebx.  Clobbers eax, ecx and flags.
 */
.align 4
PUBLIC _asm_transform_points4_2d
_asm_transform_points4_2d:
        push esi
        push edi

        mov ecx, [esp + 12]             /* ecx = n */
        mov edi, [esp + 16]             /* edi = d */
        mov edx, [esp + 20]             /* edx = m */
        mov esi, [esp + 24]             /* esi = s */

        test ecx, ecx
        jz _asm_transform_points4_2d_end

        push ebx

.align 4
_asm_transform_points4_2d_loop:
        fld S(0)
        fmul M(0, 0)
        fld S(0)
        fmul M(0, 1)
        fld S(1)
        fmul M(1, 0)
        fld S(1)
        fmul M(1, 1)

        fld S(3)
        fmul M(3, 0)
        fld S(3)
        fmul M(3, 1)

        /*
         * st(5) = S(0) * M(0, 0)
         * st(4) = S(0) * M(0, 1)
         * st(3) = S(1) * M(1, 0)
         * st(2) = S(1) * M(1, 1)
         * st(1) = S(3) * M(3, 0)
         * st(0) = S(3) * M(3, 1)
         */

        mov eax, S(2)
        mov ebx, S(3)
        lea esi, S(4)
        dec ecx                         /* flags consumed by jnz below */
        mov D(2), eax
        mov D(3), ebx

        faddp st(4), st
        faddp st(4), st
        faddp st(2), st
        faddp st(2), st

        fstp D(1)
        fstp D(0)

        lea edi, D(4)

        jnz _asm_transform_points4_2d_loop

        pop ebx

_asm_transform_points4_2d_end:
        pop edi
        pop esi
        ret

/*
 * void asm_transform_points4_2d_no_rot( GLuint n, GLfloat d[][4],
 *                                       GLfloat m[16], GLfloat s[][4] );
 *
 * For each point:
 *     d[0] = s[0]*M(0,0) + s[3]*M(3,0)
 *     d[1] = s[1]*M(1,1) + s[3]*M(3,1)
 *     d[2] = s[2]
 *     d[3] = s[3]
 *
 * ebx is saved only on the n != 0 path (see asm_transform_points4_2d).
 *
 * Preserves esi, edi, ebx.  Clobbers eax, ecx and flags.
 */
.align 4
PUBLIC _asm_transform_points4_2d_no_rot
_asm_transform_points4_2d_no_rot:
        push esi
        push edi

        mov ecx, [esp + 12]             /* ecx = n */
        mov edi, [esp + 16]             /* edi = d */
        mov edx, [esp + 20]             /* edx = m */
        mov esi, [esp + 24]             /* esi = s */

        test ecx, ecx
        jz _asm_transform_points4_2d_no_rot_end

        push ebx

.align 4
_asm_transform_points4_2d_no_rot_loop:
        fld S(0)
        fmul M(0, 0)
        fld S(1)
        fmul M(1, 1)

        fld S(3)
        fmul M(3, 0)
        fld S(3)
        fmul M(3, 1)

        mov eax, S(2)
        mov ebx, S(3)
        lea esi, S(4)
        dec ecx                         /* flags consumed by jnz below */
        mov D(2), eax
        mov D(3), ebx

        faddp st(2), st
        faddp st(2), st

        fstp D(1)
        fstp D(0)

        lea edi, D(4)

        jnz _asm_transform_points4_2d_no_rot_loop

        pop ebx

_asm_transform_points4_2d_no_rot_end:
        pop edi
        pop esi
        ret

/*
 * void asm_transform_points4_3d( GLuint n, GLfloat d[][4], GLfloat m[16],
 *                                GLfloat s[][4] );
 *
 * For each point, for j = 0..2:
 *     d[j] = s[0]*M(0,j) + s[1]*M(1,j) + s[2]*M(2,j) + s[3]*M(3,j)
 * and d[3] = s[3].  s[3] is parked at the bottom of the x87 stack for the
 * whole iteration and stored last.
 *
 * Preserves esi, edi.  Clobbers ecx and flags; uses up to seven x87
 * stack slots.
 */
.align 4
PUBLIC _asm_transform_points4_3d
_asm_transform_points4_3d:
        push esi
        push edi

        mov ecx, [esp + 12]             /* ecx = n */
        mov edi, [esp + 16]             /* edi = d */
        mov edx, [esp + 20]             /* edx = m */
        mov esi, [esp + 24]             /* esi = s */

        test ecx, ecx
        jz _asm_transform_points4_3d_end

.align 4
_asm_transform_points4_3d_loop:
        fld S(3)                        /* kept below the working values */

        fld S(0)
        fmul M(0, 0)
        fld S(0)
        fmul M(0, 1)
        fld S(0)
        fmul M(0, 2)

        fld S(1)
        fmul M(1, 0)
        fld S(1)
        fmul M(1, 1)
        fld S(1)
        fmul M(1, 2)

        /*
         * st(5) = S(0) * M(0, 0)
         * st(4) = S(0) * M(0, 1)
         * st(3) = S(0) * M(0, 2)
         * st(2) = S(1) * M(1, 0)
         * st(1) = S(1) * M(1, 1)
         * st(0) = S(1) * M(1, 2)
         */

        fxch st(2)                      /* 2 1 0 3 4 5 */
        faddp st(5), st                 /* 1 0 3 4 5 */
        faddp st(3), st                 /* 0 3 4 5 */
        faddp st(1), st                 /* 3 4 5 */

        /*
         * st(2) = S(0) * M(0, 0) + S(1) * M(1, 0)
         * st(1) = S(0) * M(0, 1) + S(1) * M(1, 1)
         * st(0) = S(0) * M(0, 2) + S(1) * M(1, 2)
         */

        fld S(2)
        fmul M(2, 0)
        fld S(2)
        fmul M(2, 1)
        fld S(2)
        fmul M(2, 2)

        /*
         * st(5) = S(0) * M(0, 0) + S(1) * M(1, 0)
         * st(4) = S(0) * M(0, 1) + S(1) * M(1, 1)
         * st(3) = S(0) * M(0, 2) + S(1) * M(1, 2)
         * st(2) = S(2) * M(2, 0)
         * st(1) = S(2) * M(2, 1)
         * st(0) = S(2) * M(2, 2)
         */

        fxch st(2)                      /* 2 1 0 3 4 5 */
        faddp st(5), st                 /* 1 0 3 4 5 */
        faddp st(3), st                 /* 0 3 4 5 */
        faddp st(1), st                 /* 3 4 5 */

        /*
         * st(2) = S(0) * M(0, 0) + S(1) * M(1, 0) + S(2) * M(2, 0)
         * st(1) = S(0) * M(0, 1) + S(1) * M(1, 1) + S(2) * M(2, 1)
         * st(0) = S(0) * M(0, 2) + S(1) * M(1, 2) + S(2) * M(2, 2)
         */

        fld S(3)
        fmul M(3, 0)
        fld S(3)
        fmul M(3, 1)
        fld S(3)
        fmul M(3, 2)

        /*
         * st(5) = S(0) * M(0, 0) + S(1) * M(1, 0) + S(2) * M(2, 0)
         * st(4) = S(0) * M(0, 1) + S(1) * M(1, 1) + S(2) * M(2, 1)
         * st(3) = S(0) * M(0, 2) + S(1) * M(1, 2) + S(2) * M(2, 2)
         * st(2) = S(3) * M(3, 0)
         * st(1) = S(3) * M(3, 1)
         * st(0) = S(3) * M(3, 2)
         */

        fxch st(2)                      /* 2 1 0 3 4 5 */
        faddp st(5), st                 /* 1 0 3 4 5 */
        faddp st(3), st                 /* 0 3 4 5 */
        lea esi, S(4)
        dec ecx                         /* flags consumed by jnz below */
        faddp st(1), st                 /* 3 4 5 */

        /*
         * st(2) = S(0) * M(0, 0) + S(1) * M(1, 0) + S(2) * M(2, 0) + S(3) * M(3, 0)
         * st(1) = S(0) * M(0, 1) + S(1) * M(1, 1) + S(2) * M(2, 1) + S(3) * M(3, 1)
         * st(0) = S(0) * M(0, 2) + S(1) * M(1, 2) + S(2) * M(2, 2) + S(3) * M(3, 2)
         */

        fxch st(2)                      /* 2 1 0 */
        fstp D(0)                       /* 1 0 */
        fstp D(1)                       /* 0 */
        fstp D(2)                       /* */
        fstp D(3)                       /* the S(3) loaded at loop top */
        lea edi, D(4)

        jnz _asm_transform_points4_3d_loop

_asm_transform_points4_3d_end:
        pop edi
        pop esi
        ret

/*
 * void asm_transform_points4_ortho( GLuint n, GLfloat d[][4],
 *                                   GLfloat m[16], GLfloat s[][4] );
 *
 * For each point:
 *     d[0] = s[0]*M(0,0) + s[3]*M(3,0)
 *     d[1] = s[1]*M(1,1) + s[3]*M(3,1)
 *     d[2] = s[2]*M(2,2) + s[3]*M(3,2)
 *     d[3] = s[3]
 *
 * Preserves esi, edi.  Clobbers eax, ecx and flags.
 */
.align 4
PUBLIC _asm_transform_points4_ortho
_asm_transform_points4_ortho:
        push esi
        push edi

        mov ecx, [esp + 12]             /* ecx = n */
        mov edi, [esp + 16]             /* edi = d */
        mov edx, [esp + 20]             /* edx = m */
        mov esi, [esp + 24]             /* esi = s */

        test ecx, ecx
        jz _asm_transform_points4_ortho_end

.align 4
_asm_transform_points4_ortho_loop:
        fld S(0)
        fmul M(0, 0)
        fld S(1)
        fmul M(1, 1)
        fld S(2)
        fmul M(2, 2)

        fld S(3)
        fmul M(3, 0)
        fld S(3)
        fmul M(3, 1)
        fld S(3)
        fmul M(3, 2)

        mov eax, S(3)
        lea esi, S(4)
        dec ecx                         /* flags consumed by jnz below */
        mov D(3), eax

        faddp st(3), st
        faddp st(3), st
        faddp st(3), st

        fstp D(2)
        fstp D(1)
        fstp D(0)

        lea edi, D(4)

        jnz _asm_transform_points4_ortho_loop

_asm_transform_points4_ortho_end:
        pop edi
        pop esi
        ret
/*
 * void asm_transform_points4_perspective( GLuint n, GLfloat d[][4],
 *                                         GLfloat m[16], GLfloat s[][4] );
 *
 * For each point:
 *     d[0] = s[0]*M(0,0) + s[2]*M(2,0)
 *     d[1] = s[1]*M(1,1) + s[2]*M(2,1)
 *     d[2] = s[2]*M(2,2) + s[3]*M(3,2)
 *     d[3] = -s[2]        (sign bit of the raw float is flipped)
 *
 * Preserves esi, edi.  Clobbers eax, ecx and flags.
 */
.align 4
PUBLIC _asm_transform_points4_perspective
_asm_transform_points4_perspective:
        push esi
        push edi

        mov ecx, [esp + 12]             /* ecx = n */
        mov edi, [esp + 16]             /* edi = d */
        mov edx, [esp + 20]             /* edx = m */
        mov esi, [esp + 24]             /* esi = s */

        test ecx, ecx
        jz _asm_transform_points4_perspective_end

.align 4
_asm_transform_points4_perspective_loop:
        fld S(0)
        fmul M(0, 0)
        fld S(1)
        fmul M(1, 1)
        fld S(2)
        fmul M(2, 2)

        fld S(2)
        fmul M(2, 0)
        fld S(2)
        fmul M(2, 1)
        fld S(3)
        fmul M(3, 2)

        mov eax, S(2)
        lea esi, S(4)
        xor eax, HEX(80000000)          /* eax = -S(2); must precede dec */
        dec ecx                         /* flags consumed by jnz below */

        faddp st(3), st
        faddp st(3), st
        faddp st(3), st

        fstp D(2)
        fstp D(1)
        fstp D(0)
        mov D(3), eax

        lea edi, D(4)

        jnz _asm_transform_points4_perspective_loop

_asm_transform_points4_perspective_end:
        pop edi
        pop esi
        ret

/*
 * Table for clip test.
 *
 * The table is indexed by a 7-bit value built by cliptest:
 *
 * bit6 = S(3) < 0
 * bit5 = S(2) < 0
 * bit4 = abs(S(2)) > abs(S(3))
 * bit3 = S(1) < 0
 * bit2 = abs(S(1)) > abs(S(3))
 * bit1 = S(0) < 0
 * bit0 = abs(S(0)) > abs(S(3))
 *
 * The "#if 0" branch below is disabled; it shows how the literal table in
 * the "#else" branch can be derived from the vertex buffer clip flags.
 */

/* Vertex buffer clipping flags (from vb.h) */
#if 0
#define CLIP_RIGHT_BIT   0x01
#define CLIP_LEFT_BIT    0x02
#define CLIP_TOP_BIT     0x04
#define CLIP_BOTTOM_BIT  0x08
#define CLIP_NEAR_BIT    0x10
#define CLIP_FAR_BIT     0x20
#define CLIP_USER_BIT    0x40
#define CLIP_ALL_BITS    0x3f

#define MAGN_X(i) (~(((i) & 1) - 1))
#define SIGN_X(i) (~((((i) >> 1) & 1) - 1))
#define MAGN_Y(i) (~((((i) >> 2) & 1) - 1))
#define SIGN_Y(i) (~((((i) >> 3) & 1) - 1))
#define MAGN_Z(i) (~((((i) >> 4) & 1) - 1))
#define SIGN_Z(i) (~((((i) >> 5) & 1) - 1))
#define SIGN_W(i) (~((((i) >> 6) & 1) - 1))

#define CLIP_VALUE(i) \
        (CLIP_RIGHT_BIT \
         & ((~SIGN_X(i) & SIGN_W(i)) \
            | (~SIGN_X(i) & ~SIGN_W(i) & MAGN_X(i)) \
            | (SIGN_X(i) & SIGN_W(i) & ~MAGN_X(i)))) \
        | (CLIP_LEFT_BIT \
           & ((SIGN_X(i) & SIGN_W(i)) \
              | (~SIGN_X(i) & SIGN_W(i) & ~MAGN_X(i)) \
              | (SIGN_X(i) & ~SIGN_W(i) & MAGN_X(i)))) \
        | (CLIP_TOP_BIT \
           & ((~SIGN_Y(i) & SIGN_W(i)) \
              | (~SIGN_Y(i) & ~SIGN_W(i) & MAGN_Y(i)) \
              | (SIGN_Y(i) & SIGN_W(i) & ~MAGN_Y(i)))) \
        | (CLIP_BOTTOM_BIT \
           & ((SIGN_Y(i) & SIGN_W(i)) \
              | (~SIGN_Y(i) & SIGN_W(i) & ~MAGN_Y(i)) \
              | (SIGN_Y(i) & ~SIGN_W(i) & MAGN_Y(i)))) \
        | (CLIP_FAR_BIT \
           & ((~SIGN_Z(i) & SIGN_W(i)) \
              | (~SIGN_Z(i) & ~SIGN_W(i) & MAGN_Z(i)) \
              | (SIGN_Z(i) & SIGN_W(i) & ~MAGN_Z(i)))) \
        | (CLIP_NEAR_BIT \
           & ((SIGN_Z(i) & SIGN_W(i)) \
              | (~SIGN_Z(i) & SIGN_W(i) & ~MAGN_Z(i)) \
              | (SIGN_Z(i) & ~SIGN_W(i) & MAGN_Z(i))))

#define CLIP_VALUE8(i) \
        CLIP_VALUE(i + 0), CLIP_VALUE(i + 1), CLIP_VALUE(i + 2), CLIP_VALUE(i + 3), \
        CLIP_VALUE(i + 4), CLIP_VALUE(i + 5), CLIP_VALUE(i + 6), CLIP_VALUE(i + 7)

.rodata

clip_table:
        .byte CLIP_VALUE8(0x00)
        .byte CLIP_VALUE8(0x08)
        .byte CLIP_VALUE8(0x10)
        .byte CLIP_VALUE8(0x18)
        .byte CLIP_VALUE8(0x20)
        .byte CLIP_VALUE8(0x28)
        .byte CLIP_VALUE8(0x30)
        .byte CLIP_VALUE8(0x38)
        .byte CLIP_VALUE8(0x40)
        .byte CLIP_VALUE8(0x48)
        .byte CLIP_VALUE8(0x50)
        .byte CLIP_VALUE8(0x58)
        .byte CLIP_VALUE8(0x60)
        .byte CLIP_VALUE8(0x68)
        .byte CLIP_VALUE8(0x70)
        .byte CLIP_VALUE8(0x78)
#else
.const
ASSUME NOTHING

/* 128 entries, 8 per row; row k covers indices 8*k .. 8*k + 7. */
clip_table:
        .byte HEX(0), HEX(1), HEX(0), HEX(2), HEX(4), HEX(5), HEX(4), HEX(6)
        .byte HEX(0), HEX(1), HEX(0), HEX(2), HEX(8), HEX(9), HEX(8), HEX(a)
        .byte HEX(20), HEX(21), HEX(20), HEX(22), HEX(24), HEX(25), HEX(24), HEX(26)
        .byte HEX(20), HEX(21), HEX(20), HEX(22), HEX(28), HEX(29), HEX(28), HEX(2a)
        .byte HEX(0), HEX(1), HEX(0), HEX(2), HEX(4), HEX(5), HEX(4), HEX(6)
        .byte HEX(0), HEX(1), HEX(0), HEX(2), HEX(8), HEX(9), HEX(8), HEX(a)
        .byte HEX(10), HEX(11), HEX(10), HEX(12), HEX(14), HEX(15), HEX(14), HEX(16)
        .byte HEX(10), HEX(11), HEX(10), HEX(12), HEX(18), HEX(19), HEX(18), HEX(1a)
        .byte HEX(3f), HEX(3d), HEX(3f), HEX(3e), HEX(37), HEX(35), HEX(37), HEX(36)
        .byte HEX(3f), HEX(3d), HEX(3f), HEX(3e), HEX(3b), HEX(39), HEX(3b), HEX(3a)
        .byte HEX(2f), HEX(2d), HEX(2f), HEX(2e), HEX(27), HEX(25), HEX(27), HEX(26)
        .byte HEX(2f), HEX(2d), HEX(2f), HEX(2e), HEX(2b), HEX(29), HEX(2b), HEX(2a)
        .byte HEX(3f), HEX(3d), HEX(3f), HEX(3e), HEX(37), HEX(35), HEX(37), HEX(36)
        .byte HEX(3f), HEX(3d), HEX(3f), HEX(3e), HEX(3b), HEX(39), HEX(3b), HEX(3a)
        .byte HEX(1f), HEX(1d), HEX(1f), HEX(1e), HEX(17), HEX(15), HEX(17), HEX(16)
        .byte HEX(1f), HEX(1d), HEX(1f), HEX(1e), HEX(1b), HEX(19), HEX(1b), HEX(1a)
#endif

.code

/*
 * cliptest - internal helper with a register-based interface.
 *
 * inputs:
 *   ecx = # points
 *   esi = points (4 floats each)
 *   edi = clipmask[] (one byte per point)
 *
 * inputs/outputs:
 *   al = ormask   (table value is OR-ed in for every point)
 *   ah = andmask  (table value is AND-ed in for every point)
 *
 * Each clipmask[i] is OR-ed with the table value for point i.
 *
 * The floats are examined as raw 32-bit integers: "add r, r" shifts the
 * sign bit into the carry flag and leaves the magnitude bits times two,
 * and an unsigned "cmp" of two such values compares the magnitudes.
 * "adc edx, edx" shifts each carry into the table index.
 *
 * Preserves ebx, ebp.  Clobbers ecx, edx, esi, edi and flags.
 */
cliptest:
        test ecx, ecx
        jz cliptest_end

        push ebp
        push ebx

.align 4
cliptest_loop:
        mov ebp, S(3)
        mov ebx, S(2)

        xor edx, edx                    /* edx = table index, built MSB first */
        add ebp, ebp                    /* ebp = abs(S(3))*2 ; carry = sign of S(3) */

        adc edx, edx
        add ebx, ebx                    /* ebx = abs(S(2))*2 ; carry = sign of S(2) */

        adc edx, edx
        cmp ebp, ebx                    /* carry = abs(S(2))*2 > abs(S(3))*2 */

        adc edx, edx
        mov ebx, S(1)

        add ebx, ebx                    /* ebx = abs(S(1))*2 ; carry = sign of S(1) */

        adc edx, edx
        cmp ebp, ebx                    /* carry = abs(S(1))*2 > abs(S(3))*2 */

        adc edx, edx
        mov ebx, S(0)

        add ebx, ebx                    /* ebx = abs(S(0))*2 ; carry = sign of S(0) */

        adc edx, edx
        cmp ebp, ebx                    /* carry = abs(S(0))*2 > abs(S(3))*2 */

        adc edx, edx

        lea esi, S(4)

        mov bl, byte ptr [edi]
        mov dl, byte ptr [clip_table + edx]

        or bl, dl
        or al, dl

        and ah, dl
        mov [edi], bl

        inc edi
        dec ecx

        jnz cliptest_loop

        pop ebx
        pop ebp

cliptest_end:
        ret

/*
 * void asm_project_and_cliptest_general( GLuint n, GLfloat d[][4], GLfloat m[16],
 *                                        GLfloat s[][4], GLubyte clipmask[],
 *                                        GLubyte *ormask, GLubyte *andmask );
 *
 * Calls asm_transform_points4_general( n, d, m, s ), then runs cliptest
 * over d, updating clipmask[], *ormask and *andmask.
 *
 * Stack after the two pushes: n +12, d +16, m +20, s +24, clipmask +28,
 * ormask +32, andmask +36.
 *
 * Preserves esi, edi.  Clobbers eax, ecx, edx and flags.
 */
.align 4
PUBLIC _asm_project_and_cliptest_general
_asm_project_and_cliptest_general:
        push esi
        push edi

        mov ecx, [esp + 12]             /* ecx = n */
        mov edi, [esp + 16]             /* edi = d */
        mov edx, [esp + 20]             /* edx = m */
        mov esi, [esp + 24]             /* esi = s */

        /* Forward (n, d, m, s); arguments are pushed right to left. */
        push esi
        push edx
        push edi
        push ecx
        call _asm_transform_points4_general
        add esp, DEC(16)

        mov edi, [esp + 32]             /* ormask */
        mov esi, [esp + 36]             /* andmask */
        mov al, [edi]
        mov ah, [esi]

        mov ecx, [esp + 12]             /* ecx = n */
        mov edi, [esp + 28]             /* edi = clipmask */
        mov esi, [esp + 16]             /* esi = d */

        call cliptest

        mov edi, [esp + 32]             /* ormask */
        mov esi, [esp + 36]             /* andmask */

        mov [edi], al
        mov [esi], ah

        pop edi
        pop esi
        ret

/*
 * void asm_project_and_cliptest_identity( GLuint n, GLfloat d[][4],
 *                                         GLfloat s[][4], GLubyte clipmask[],
 *                                         GLubyte *ormask, GLubyte *andmask );
 *
 * Calls asm_transform_points4_identity( n, d, s ), then runs cliptest
 * over d, updating clipmask[], *ormask and *andmask.
 *
 * Stack after the two pushes: n +12, d +16, s +20, clipmask +24,
 * ormask +28, andmask +32.
 *
 * Preserves esi, edi.  Clobbers eax, ecx, edx and flags.
 */
.align 4
PUBLIC _asm_project_and_cliptest_identity
_asm_project_and_cliptest_identity:
        push esi
        push edi

        mov ecx, [esp + 12]             /* ecx = n */
        mov edi, [esp + 16]             /* edi = d */
        mov esi, [esp + 20]             /* esi = s */

        /* Forward (n, d, s). */
        push esi
        push edi
        push ecx
        call _asm_transform_points4_identity
        add esp, DEC(12)

        mov edi, [esp + 28]             /* ormask */
        mov esi, [esp + 32]             /* andmask */
        mov al, [edi]
        mov ah, [esi]

        mov ecx, [esp + 12]             /* ecx = n */
        mov edi, [esp + 24]             /* edi = clipmask */
        mov esi, [esp + 16]             /* esi = d */

        call cliptest

        mov edi, [esp + 28]             /* ormask */
        mov esi, [esp + 32]             /* andmask */

        mov [edi], al
        mov [esi], ah

        pop edi
        pop esi
        ret

/*
 * void asm_project_and_cliptest_ortho( GLuint n, GLfloat d[][4], GLfloat m[16],
 *                                      GLfloat s[][4], GLubyte clipmask[],
 *                                      GLubyte *ormask, GLubyte *andmask );
 *
 * Calls asm_transform_points4_ortho( n, d, m, s ), then runs cliptest
 * over d, updating clipmask[], *ormask and *andmask.
 *
 * Preserves esi, edi.  Clobbers eax, ecx, edx and flags.
 */
.align 4
PUBLIC _asm_project_and_cliptest_ortho
_asm_project_and_cliptest_ortho:
        push esi
        push edi

        mov ecx, [esp + 12]             /* ecx = n */
        mov edi, [esp + 16]             /* edi = d */
        mov edx, [esp + 20]             /* edx = m */
        mov esi, [esp + 24]             /* esi = s */

        /* Forward (n, d, m, s). */
        push esi
        push edx
        push edi
        push ecx
        call _asm_transform_points4_ortho
        add esp, DEC(16)

        mov edi, [esp + 32]             /* ormask */
        mov esi, [esp + 36]             /* andmask */
        mov al, [edi]
        mov ah, [esi]

        mov ecx, [esp + 12]             /* ecx = n */
        mov edi, [esp + 28]             /* edi = clipmask */
        mov esi, [esp + 16]             /* esi = d */

        call cliptest

        mov edi, [esp + 32]             /* ormask */
        mov esi, [esp + 36]             /* andmask */

        mov [edi], al
        mov [esi], ah

        pop edi
        pop esi
        ret

/*
 * void asm_project_and_cliptest_perspective( GLuint n, GLfloat d[][4], GLfloat m[16],
 *                                            GLfloat s[][4], GLubyte clipmask[],
 *                                            GLubyte *ormask, GLubyte *andmask );
 *
 * Calls asm_transform_points4_perspective( n, d, m, s ), then runs
 * cliptest over d, updating clipmask[], *ormask and *andmask.
 *
 * Preserves esi, edi.  Clobbers eax, ecx, edx and flags.
 */
.align 4
PUBLIC _asm_project_and_cliptest_perspective
_asm_project_and_cliptest_perspective:
        push esi
        push edi

        mov ecx, [esp + 12]             /* ecx = n */
        mov edi, [esp + 16]             /* edi = d */
        mov edx, [esp + 20]             /* edx = m */
        mov esi, [esp + 24]             /* esi = s */

        /* Forward (n, d, m, s). */
        push esi
        push edx
        push edi
        push ecx
        call _asm_transform_points4_perspective
        add esp, DEC(16)

        mov edi, [esp + 32]             /* ormask */
        mov esi, [esp + 36]             /* andmask */
        mov al, [edi]
        mov ah, [esi]

        mov ecx, [esp + 12]             /* ecx = n */
        mov edi, [esp + 28]             /* edi = clipmask */
        mov esi, [esp + 16]             /* esi = d */

        call cliptest

        mov edi, [esp + 32]             /* ormask */
        mov esi, [esp + 36]             /* andmask */

        mov byte ptr [edi], al
        mov byte ptr [esi], ah

        pop edi
        pop esi
        ret

/*
 * unsigned int inverse_nofp( float f );
 *
 * Calculate the inverse of a float without using the FPU.
 * This function returns a float in eax, so its return
 * type should be 'int' when called from C (and converted
 * to float with pointer/union abuse).
 *
 * The argument is read from [esp + 4] as raw bits.  The divisor always
 * has bit 23 set, so the "div" cannot fault: the divisor is never zero
 * and the quotient of 2^48 by a value >= 2^23 fits in 32 bits.
 *
 * Clobbers eax, ecx, edx and flags; does not touch the x87 stack.
 */
.align 4
inverse_nofp:

        /* get mantissa in ecx */
        mov ecx, [esp + 4]
        and ecx, HEX(7fffff)

        /* set implicit integer */
        or ecx, HEX(800000)

        /* div 0x10000:0x00000000 by mantissa */
        xor eax, eax
        mov edx, HEX(10000)

        div ecx

        /* round result */
        shr eax, DEC(1)
        adc eax, DEC(0)

        /* get exponent in ecx */
        mov ecx, HEX(7f800000)
        mov edx, [esp + 4]
        and ecx, edx

        /* negate exponent and decrement it */
        mov edx, HEX(7E800000)
        sub edx, ecx

        /* if bit 24 is set, shift and adjust exponent */
        test eax, HEX(1000000)
        jz inverse_nofp_combine

        shr eax, HEX(1)
        add edx, HEX(800000)

        /* combine mantissa and exponent, then set sign */
inverse_nofp_combine:
        and eax, HEX(7fffff)
        mov ecx, [esp + 4]
        or eax, edx
        and ecx, HEX(80000000)
        or eax, ecx

        ret

/*
 * void gl_xform_normals_3fv( GLuint n, GLfloat d[][3], GLfloat m[16],
 *                            GLfloat s[][3], GLboolean normalize );
 *
 * Both arrays are stepped by three floats per normal.  For each normal,
 * for i = 0..2:
 *     d[i] = s[0]*M(i,0) + s[1]*M(i,1) + s[2]*M(i,2)
 *
 * If normalize is non-zero a second pass scales every d by 1/length,
 * with the reciprocal computed by inverse_nofp.  The pass is software
 * pipelined: the fsqrt for normal k+1 is issued before the integer-only
 * inverse_nofp call for normal k.
 *
 * Only the low byte of the normalize slot is tested: a GLboolean is one
 * byte wide and the upper bytes of its stack slot are not guaranteed to
 * be zero by every caller.
 *
 * Preserves esi, edi.  Clobbers eax, ecx, edx and flags.
 */
.align 4
PUBLIC _gl_xform_normals_3fv
_gl_xform_normals_3fv:
        push esi
        push edi

        mov ecx, [esp + 12]             /* ecx = n */
        mov edi, [esp + 16]             /* edi = d */
        mov edx, [esp + 20]             /* edx = m */
        mov esi, [esp + 24]             /* esi = s */

        test ecx, ecx
        jz _gl_xform_normals_3fv_end

.align 4
_gl_xform_normals_3fv_loop:
        fld S(0)
        fmul M(0, 0)
        fld S(0)
        fmul M(1, 0)
        fld S(0)
        fmul M(2, 0)

        fld S(1)
        fmul M(0, 1)
        fld S(1)
        fmul M(1, 1)
        fld S(1)
        fmul M(2, 1)

        /*
         * st(5) = S(0) * M(0, 0)
         * st(4) = S(0) * M(1, 0)
         * st(3) = S(0) * M(2, 0)
         * st(2) = S(1) * M(0, 1)
         * st(1) = S(1) * M(1, 1)
         * st(0) = S(1) * M(2, 1)
         */

        fxch st(2)                      /* 2 1 0 3 4 5 */
        faddp st(5), st                 /* 1 0 3 4 5 */
        faddp st(3), st                 /* 0 3 4 5 */
        faddp st(1), st                 /* 3 4 5 */

        /*
         * st(2) = S(0) * M(0, 0) + S(1) * M(0, 1)
         * st(1) = S(0) * M(1, 0) + S(1) * M(1, 1)
         * st(0) = S(0) * M(2, 0) + S(1) * M(2, 1)
         */

        fld S(2)
        fmul M(0, 2)
        fld S(2)
        fmul M(1, 2)
        fld S(2)
        fmul M(2, 2)

        /*
         * st(5) = S(0) * M(0, 0) + S(1) * M(0, 1)
         * st(4) = S(0) * M(1, 0) + S(1) * M(1, 1)
         * st(3) = S(0) * M(2, 0) + S(1) * M(2, 1)
         * st(2) = S(2) * M(0, 2)
         * st(1) = S(2) * M(1, 2)
         * st(0) = S(2) * M(2, 2)
         */

        fxch st(2)                      /* 2 1 0 3 4 5 */
        faddp st(5), st                 /* 1 0 3 4 5 */
        faddp st(3), st                 /* 0 3 4 5 */
        faddp st(1), st                 /* 3 4 5 */

        /*
         * st(2) = S(0) * M(0, 0) + S(1) * M(0, 1) + S(2) * M(0, 2)
         * st(1) = S(0) * M(1, 0) + S(1) * M(1, 1) + S(2) * M(1, 2)
         * st(0) = S(0) * M(2, 0) + S(1) * M(2, 1) + S(2) * M(2, 2)
         */

        fxch st(2)                      /* 2 1 0 */
        fstp D(0)                       /* 1 0 */
        fstp D(1)                       /* 0 */
        fstp D(2)                       /* */

        lea esi, S(3)                   /* normals are 3 floats apart */
        dec ecx
        lea edi, D(3)

        jnz _gl_xform_normals_3fv_loop

        /*
         * Skip normalize if it isn't needed
         */
        cmp byte ptr [esp + 28], DEC(0) /* low byte of the GLboolean slot */
        jz _gl_xform_normals_3fv_end

        /* Normalize required */

        mov esi, [esp + 12]             /* esi = n */
        mov edi, [esp + 16]             /* edi = d */

        sub esp, DEC(4)                 /* temp var for 1.0 / len */

        /*
         * [esp] = length of first normal
         */
        fld D(0)
        fmul D(0)
        fld D(1)
        fmul D(1)
        fld D(2)
        fmul D(2)
        fxch st(2)
        faddp st(1), st
        faddp st(1), st
        fsqrt
        fstp dword ptr [esp]

        jmp _gl_xform_normals_3fv_loop2_end

.align 4
_gl_xform_normals_3fv_loop2:
        /* st(0) = length of next normal */
        fld D(3)
        fmul D(3)
        fld D(4)
        fmul D(4)
        fld D(5)
        fmul D(5)
        fxch st(2)
        faddp st(1), st
        faddp st(1), st
        fsqrt

        /*
         * inverse the length of the current normal, which is
         * already at [esp] (and therefore at [esp + 4] inside the
         * callee, where inverse_nofp expects its argument).  This
         * should overlap the prev fsqrt nicely.
         */
        call inverse_nofp
        mov [esp], eax

        /* multiply normal by 1/len */
        fld D(0)
        fmul dword ptr [esp]
        fld D(1)
        fmul dword ptr [esp]
        fld D(2)
        fmul dword ptr [esp]

        fxch st(3)
        fstp dword ptr [esp]            /* store length of next normal */
        fstp D(1)
        fstp D(0)
        fstp D(2)

        lea edi, D(3)

_gl_xform_normals_3fv_loop2_end:
        dec esi
        jnz _gl_xform_normals_3fv_loop2

        /* finish up the last normal */
        call inverse_nofp
        mov [esp], eax
        fld D(0)
        fmul dword ptr [esp]
        fld D(1)
        fmul dword ptr [esp]
        fld D(2)
        fmul dword ptr [esp]
        fxch st(2)
        fstp D(0)
        fstp D(1)
        fstp D(2)

        add esp, DEC(4)                 /* release the temp var */

_gl_xform_normals_3fv_end:
        pop edi
        pop esi
        ret

END