/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		IP/TCP/UDP checksumming routines
 *
 * Authors:	Jorge Cwik, <jorge@laser.satlink.net>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Tom May, <ftom@netcom.com>
 *              Pentium Pro/II routines:
 *              Alexander Kjeldaas <astor@guardian.no>
 *              Finn Arne Gangstad <finnag@guardian.no>
 *		Lots of code moved from tcp.c and ip.c; see those files
 *		for more names.
 *
 * Changes:     Ingo Molnar, converted csum_partial_copy() to 2.1 exception
 *			     handling.
 *		Andi Kleen,  add zeroing on error
 *                   converted to pure assembler
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

/*
 * computes a partial checksum, e.g. for TCP/UDP fragments
 */

/*
unsigned int csum_partial(const unsigned char * buff, int len, unsigned int sum)
 */

#include <asm.inc>

.code
.align 4
PUBLIC _csum_partial

#ifndef CONFIG_X86_USE_PPRO_CHECKSUM

	  /*
	   * Experiments with Ethernet and SLIP connections show that buff
	   * is aligned on either a 2-byte or 4-byte boundary.  We get at
	   * least a twofold speedup on 486 and Pentium if it is 4-byte aligned.
	   * Fortunately, it is easy to convert 2-byte alignment to 4-byte
	   * alignment for the unrolled loop.
	   */
/*
 * unsigned int csum_partial(const unsigned char *buff, int len, unsigned int sum)
 *
 * Adds the len bytes at buff into the 32-bit running ones-complement sum
 * and returns the updated 32-bit partial sum in eax.
 *
 * The three arguments are read from the stack. esi and ebx are saved and
 * restored here; ecx, edx and the flags are clobbered.
 *
 * Register roles:
 *   eax = running sum
 *   esi = read cursor
 *   ecx = bytes remaining / block counter
 *   edx = saved byte count / dword counter
 *   ebx = scratch for loaded data
 */
_csum_partial:
	push esi
	push ebx
	// Two pushes plus the return address put the first argument at esp + 12.
	mov eax, [esp + 20]	// Function arg: unsigned int sum
	mov ecx, [esp + 16]	// Function arg: int len
	mov esi, [esp + 12]	// Function arg: unsigned char *buff
	test esi, 3	    	// Check alignment.
	jz m2			// Jump if alignment is ok.
	test esi, 1		// Check alignment.
	jz l10			// Jump if alignment is boundary of 2bytes.

	// buf is odd: consume one byte so the cursor becomes even.
	dec ecx
	jl l8			// len <= 0: return sum unchanged.
	movzx ebx, byte ptr [esi]
	// Fold the carry back in before rotating. A plain adc followed by rol
	// would drop the end-around carry when sum + byte overflows 32 bits.
	add eax, ebx
	adc eax, 0
	// Rotate the sum by one byte so the following even-aligned loads add
	// into the matching byte lanes; the rotation is undone at l7.
	rol eax, 8
	inc esi
	test esi, 2
	jz m2			// Cursor is now 4-byte aligned.
l10:
	sub ecx, 2		// Alignment uses up two bytes.
	jae m1			// Jump if we had at least two bytes.
	add ecx, 2		// ecx was < 2.  Deal with it.
	jmp l4
m1:	mov bx, [esi]
	add esi, 2
	add ax, bx
	adc eax, 0		// End-around carry of the 16-bit add.
m2:
	// Cursor is 4-byte aligned here. Sum whole 32-byte blocks first.
	mov edx, ecx		// Keep the byte count for the tail stages.
	shr ecx, 5		// ecx = number of 32-byte blocks.
	jz l2
	test esi, esi		// Clears CF before the adc chain.
	// lea and dec leave CF untouched, so the carry flows from one
	// iteration into the next and is folded in after the loop.
l1:	mov ebx, [esi]
	adc eax, ebx
	mov ebx, [esi + 4]
	adc eax, ebx
	mov ebx, [esi + 8]
	adc eax, ebx
	mov ebx, [esi + 12]
	adc eax, ebx
	mov ebx, [esi + 16]
	adc eax, ebx
	mov ebx, [esi + 20]
	adc eax, ebx
	mov ebx, [esi + 24]
	adc eax, ebx
	mov ebx, [esi + 28]
	adc eax, ebx
	lea esi, [esi + 32]
	dec ecx
	jne l1
	adc eax, 0
l2:	mov ecx, edx
	and edx, HEX(1c)	// Bytes left in whole dwords (0..28).
	je l4
	shr edx, 2		// This clears CF
l3:	adc eax, [esi]
	lea esi, [esi + 4]
	dec edx
	jne l3
	adc eax, 0
l4:	and ecx, 3		// 0..3 trailing bytes.
	jz l7
	cmp ecx, 2
	jb l5			// Exactly one byte left.
	mov cx, [esi]		// mov and lea keep the flags from the cmp.
	lea esi, [esi + 2]
	je l6			// Exactly two bytes left.
	shl ecx, 16		// Three bytes: word goes to the high half,
l5:	mov cl, [esi]		// last byte goes to the low byte.
l6:	add eax, ecx
	adc eax, 0
l7:
	// If buff started on an odd address the sum was rotated by one byte
	// above; rotate again to restore the byte order.
	test dword ptr [esp + 12], 1
	jz l8
	rol eax, 8
l8:
	pop ebx
	pop esi
	ret

#else

/* Version for PentiumII/PPro */

/*
 * unsigned int csum_partial(const unsigned char *buf, int len, unsigned int sum)
 *
 * Pentium Pro/II variant. Adds the len bytes at buf into the 32-bit running
 * ones-complement sum and returns the updated 32-bit partial sum in eax.
 *
 * The entry label carries the leading underscore so that it matches the
 * PUBLIC _csum_partial declaration at the top of the file.
 *
 * The three arguments are read from the stack. esi and ebx are saved and
 * restored here; ecx, edx and the flags are clobbered.
 */
_csum_partial:
	push esi
	push ebx
	// Two pushes plus the return address put the first argument at esp + 12.
	mov eax, [esp + 20]	// Function arg: unsigned int sum
	mov ecx, [esp + 16]	// Function arg: int len
	mov esi, [esp + 12]	// Function arg: const unsigned char *buf

	test esi, 3
	jnz l25			// Not 4-byte aligned: fix up first.
l10:
	// Cursor is 4-byte aligned here.
	mov edx, ecx		// Keep the byte count for the tail at l50.
	mov ebx, ecx
	and ebx, HEX(7c)	// Bytes in the partial (less than 128 byte) pass.
	shr ecx, 7		// Number of full 128-byte passes.
	add esi, ebx		// The chain at l40 addresses backwards from esi.
	shr ebx, 2		// Dwords in the partial pass.
	neg ebx
	// Entry point = l45 - 3 * dwords. This relies on every add/adc in the
	// l40 chain assembling to exactly 3 bytes.
	lea ebx, l45[ebx + ebx * 2]
	test esi, esi		// Clears CF before entering the adc chain.
	jmp ebx			// Jump to the computed address held in ebx.

	// Handle 2-byte-aligned regions
l20: add ax, [esi]
	lea esi, [esi + 2]
	adc eax, 0
	jmp l10
l25:
	test esi, 1
	jz l30
	// buf is odd: consume one byte so the cursor becomes even.
	dec ecx
	jl l90			// len <= 0: return sum unchanged.
	movzx ebx, byte ptr [esi]
	add eax, ebx
	adc eax, 0
	// Rotate the sum by one byte so the following even-aligned loads add
	// into the matching byte lanes; the rotation is undone at l80.
	rol eax, 8
	inc esi
	test esi, 2
	jz l10			// Cursor is now 4-byte aligned.

l30: sub ecx, 2
	ja l20			// More than two bytes left: align and continue.
	je l32			// Exactly two bytes left.
	add ecx, 2		// Fewer than two bytes left.
	jz l80
	movzx ebx, byte ptr [esi]	// csumming 1 byte, 2-aligned
	add eax, ebx
	adc eax, 0
	jmp l80
l32:
	add ax, [esi]	// csumming 2 bytes, 2-aligned
	adc eax, 0
	jmp l80

	// 32 dword adds covering the 128 bytes below esi. The computed jump
	// above enters part way through for the first, partial pass.
l40:
	add eax, [esi -128]
	adc eax, [esi -124]
	adc eax, [esi -120]
	adc eax, [esi -116]
	adc eax, [esi -112]
	adc eax, [esi -108]
	adc eax, [esi -104]
	adc eax, [esi -100]
	adc eax, [esi -96]
	adc eax, [esi -92]
	adc eax, [esi -88]
	adc eax, [esi -84]
	adc eax, [esi -80]
	adc eax, [esi -76]
	adc eax, [esi -72]
	adc eax, [esi -68]
	adc eax, [esi -64]
	adc eax, [esi -60]
	adc eax, [esi -56]
	adc eax, [esi -52]
	adc eax, [esi -48]
	adc eax, [esi -44]
	adc eax, [esi -40]
	adc eax, [esi -36]
	adc eax, [esi -32]
	adc eax, [esi -28]
	adc eax, [esi -24]
	adc eax, [esi -20]
	adc eax, [esi -16]
	adc eax, [esi -12]
	adc eax, [esi -8]
	adc eax, [esi -4]
l45:
	lea esi, [esi + 128]	// lea keeps CF for the adc below.
	adc eax, 0
	dec ecx
	jge l40
	mov ecx, edx
l50:	and ecx, 3
	jz l80

	// Handle the last 1-3 bytes without jumping
	not ecx		// 1->2, 2->1, 3->0, higher bits are masked
	mov ebx, HEX(ffffff)	// by the shl and shr instructions
	shl ecx, 3
	shr ebx, cl		// Mask that keeps only the remaining bytes.
	// esi was advanced by 128 at l45, so esi - 128 is the first unread byte.
	and ebx, [esi -128]	// esi is 4-aligned so should be ok
	add eax, ebx
	adc eax, 0
l80:
	// If buf started on an odd address the sum was rotated by one byte
	// above; rotate again to restore the byte order.
	test dword ptr [esp + 12], 1
	jz l90
	rol eax, 8
l90:
	pop ebx
	pop esi
	ret

#endif

END