diff --git a/reactos/ChangeLog b/reactos/ChangeLog index 38d4cbefaa5..ccf18c00e90 100644 --- a/reactos/ChangeLog +++ b/reactos/ChangeLog @@ -1,3 +1,18 @@ +2003-01-15 Casper S. Hornstrup + + * drivers/net/tcpip/makefile (TCP_OBJECTS): Add transport/tcp/tcpcore.o, + transport/tcp/tcp_input.o, transport/tcp/tcp_ipv4.o, + transport/tcp/tcp_output.o, and transport/tcp/tcp_timer.o. + * drivers/net/tcpip/transport/tcp/tcp.c (TCPStartup): Call tcp_init(). + * drivers/net/tcpip/include/linux.h: New file. + * drivers/net/tcpip/include/tcpcore.h: Ditto. + * drivers/net/tcpip/include/tcpdef.h: Ditto. + * drivers/net/tcpip/transport/tcp/tcp_input.c: Ditto. + * drivers/net/tcpip/transport/tcp/tcp_ipv4.c: Ditto. + * drivers/net/tcpip/transport/tcp/tcp_output.c: Ditto. + * drivers/net/tcpip/transport/tcp/tcp_timer.c: Ditto. + * drivers/net/tcpip/transport/tcp/tcpcore.c: Ditto. + 2003-01-15 Casper S. Hornstrup * lib/kernel32/k32.h: New file. diff --git a/reactos/drivers/net/tcpip/include/linux.h b/reactos/drivers/net/tcpip/include/linux.h new file mode 100755 index 00000000000..86db7ae0180 --- /dev/null +++ b/reactos/drivers/net/tcpip/include/linux.h @@ -0,0 +1,1876 @@ +#ifndef _LINUX_TYPES_H +#define _LINUX_TYPES_H + +#include + +#ifndef NULL +#define NULL (void*)0 +#endif + +typedef struct page { + int x; +} mem_map_t; + + + + + + + + + + + + + +/* i386 */ + +typedef unsigned short umode_t; + +/* + * __xx is ok: it doesn't pollute the POSIX namespace. Use these in the + * header files exported to user space + */ + +typedef __signed__ char __s8; +typedef unsigned char __u8; + +typedef __signed__ short __s16; +typedef unsigned short __u16; + +typedef __signed__ int __s32; +typedef unsigned int __u32; + +#if defined(__GNUC__) && !defined(__STRICT_ANSI__) +typedef __signed__ long long __s64; +typedef unsigned long long __u64; +#endif + +/* + * These aren't exported outside the kernel to avoid name space clashes + */ +typedef signed char s8; +typedef unsigned char u8; + +typedef signed short s16; +typedef unsigned short u16; + +typedef signed int s32; +typedef unsigned int u32; + +typedef signed long long s64; +typedef unsigned long long u64; + +#define BITS_PER_LONG 32 + +/* DMA addresses come in generic and 64-bit flavours. */ + +#ifdef CONFIG_HIGHMEM64G +typedef u64 dma_addr_t; +#else +typedef u32 dma_addr_t; +#endif +typedef u64 dma64_addr_t; + + + +/* + * This allows for 1024 file descriptors: if NR_OPEN is ever grown + * beyond that you'll have to change this too. But 1024 fd's seem to be + * enough even for such "real" unices like OSF/1, so hopefully this is + * one limit that doesn't have to be changed [again]. + * + * Note that POSIX wants the FD_CLEAR(fd,fdsetp) defines to be in + * (and thus ) - but this is a more logical + * place for them. Solved by having dummy defines in . + */ + +/* + * Those macros may have been defined in . But we always + * use the ones here. + */ +#undef __NFDBITS +#define __NFDBITS (8 * sizeof(unsigned long)) + +#undef __FD_SETSIZE +#define __FD_SETSIZE 1024 + +#undef __FDSET_LONGS +#define __FDSET_LONGS (__FD_SETSIZE/__NFDBITS) + +#undef __FDELT +#define __FDELT(d) ((d) / __NFDBITS) + +#undef __FDMASK +#define __FDMASK(d) (1UL << ((d) % __NFDBITS)) + +typedef struct { + unsigned long fds_bits [__FDSET_LONGS]; +} __kernel_fd_set; + +/* Type of a signal handler. */ +typedef void (*__kernel_sighandler_t)(int); + +/* Type of a SYSV IPC key. 
*/ +typedef int __kernel_key_t; + + +/* + * This file is generally used by user-level software, so you need to + * be a little careful about namespace pollution etc. Also, we cannot + * assume GCC is being used. + */ + +typedef unsigned short __kernel_dev_t; +typedef unsigned long __kernel_ino_t; +typedef unsigned short __kernel_mode_t; +typedef unsigned short __kernel_nlink_t; +typedef long __kernel_off_t; +typedef int __kernel_pid_t; +typedef unsigned short __kernel_ipc_pid_t; +typedef unsigned short __kernel_uid_t; +typedef unsigned short __kernel_gid_t; +typedef unsigned int __kernel_size_t; +typedef int __kernel_ssize_t; +typedef int __kernel_ptrdiff_t; +typedef long __kernel_time_t; +typedef long __kernel_suseconds_t; +typedef long __kernel_clock_t; +typedef int __kernel_daddr_t; +typedef char * __kernel_caddr_t; +typedef unsigned short __kernel_uid16_t; +typedef unsigned short __kernel_gid16_t; +typedef unsigned int __kernel_uid32_t; +typedef unsigned int __kernel_gid32_t; + +typedef unsigned short __kernel_old_uid_t; +typedef unsigned short __kernel_old_gid_t; + +#ifdef __GNUC__ +typedef long long __kernel_loff_t; +#endif + +typedef struct { +#if defined(__KERNEL__) || defined(__USE_ALL) + int val[2]; +#else /* !defined(__KERNEL__) && !defined(__USE_ALL) */ + int __val[2]; +#endif /* !defined(__KERNEL__) && !defined(__USE_ALL) */ +} __kernel_fsid_t; + +#if defined(__KERNEL__) || !defined(__GLIBC__) || (__GLIBC__ < 2) + +#undef __FD_SET +#define __FD_SET(fd,fdsetp) \ + __asm__ __volatile__("btsl %1,%0": \ + "=m" (*(__kernel_fd_set *) (fdsetp)):"r" ((int) (fd))) + +#undef __FD_CLR +#define __FD_CLR(fd,fdsetp) \ + __asm__ __volatile__("btrl %1,%0": \ + "=m" (*(__kernel_fd_set *) (fdsetp)):"r" ((int) (fd))) + +#undef __FD_ISSET +#define __FD_ISSET(fd,fdsetp) (__extension__ ({ \ + unsigned char __result; \ + __asm__ __volatile__("btl %1,%2 ; setb %0" \ + :"=q" (__result) :"r" ((int) (fd)), \ + "m" (*(__kernel_fd_set *) (fdsetp))); \ + __result; })) + +#undef __FD_ZERO +#define __FD_ZERO(fdsetp) \ +do { \ + int __d0, __d1; \ + __asm__ __volatile__("cld ; rep ; stosl" \ + :"=m" (*(__kernel_fd_set *) (fdsetp)), \ + "=&c" (__d0), "=&D" (__d1) \ + :"a" (0), "1" (__FDSET_LONGS), \ + "2" ((__kernel_fd_set *) (fdsetp)) : "memory"); \ +} while (0) + +#endif /* defined(__KERNEL__) || !defined(__GLIBC__) || (__GLIBC__ < 2) */ + + +#ifndef __KERNEL_STRICT_NAMES + +typedef __kernel_fd_set fd_set; +typedef __kernel_dev_t dev_t; +typedef __kernel_ino_t ino_t; +typedef __kernel_mode_t mode_t; +typedef __kernel_nlink_t nlink_t; +typedef __kernel_off_t off_t; +typedef __kernel_pid_t pid_t; +typedef __kernel_daddr_t daddr_t; +typedef __kernel_key_t key_t; +typedef __kernel_suseconds_t suseconds_t; + +#ifdef __KERNEL__ +typedef __kernel_uid32_t uid_t; +typedef __kernel_gid32_t gid_t; +typedef __kernel_uid16_t uid16_t; +typedef __kernel_gid16_t gid16_t; + +#ifdef CONFIG_UID16 +/* This is defined by include/asm-{arch}/posix_types.h */ +typedef __kernel_old_uid_t old_uid_t; +typedef __kernel_old_gid_t old_gid_t; +#endif /* CONFIG_UID16 */ + +/* libc5 includes this file to define uid_t, thus uid_t can never change + * when it is included by non-kernel code + */ +#else +typedef __kernel_uid_t uid_t; +typedef __kernel_gid_t gid_t; +#endif /* __KERNEL__ */ + +#if defined(__GNUC__) +typedef __kernel_loff_t loff_t; +#endif + +/* + * The following typedefs are also protected by individual ifdefs for + * historical reasons: + */ +#ifndef _SIZE_T +#define _SIZE_T +typedef __kernel_size_t size_t; +#endif + +#ifndef 
_SSIZE_T +#define _SSIZE_T +typedef __kernel_ssize_t ssize_t; +#endif + +#ifndef _PTRDIFF_T +#define _PTRDIFF_T +typedef __kernel_ptrdiff_t ptrdiff_t; +#endif + +#ifndef _TIME_T +#define _TIME_T +typedef __kernel_time_t time_t; +#endif + +#ifndef _CLOCK_T +#define _CLOCK_T +typedef __kernel_clock_t clock_t; +#endif + +#ifndef _CADDR_T +#define _CADDR_T +typedef __kernel_caddr_t caddr_t; +#endif + +/* bsd */ +typedef unsigned char u_char; +typedef unsigned short u_short; +typedef unsigned int u_int; +typedef unsigned long u_long; + +/* sysv */ +typedef unsigned char unchar; +typedef unsigned short ushort; +typedef unsigned int uint; +typedef unsigned long ulong; + +#ifndef __BIT_TYPES_DEFINED__ +#define __BIT_TYPES_DEFINED__ + +typedef __u8 u_int8_t; +typedef __s8 int8_t; +typedef __u16 u_int16_t; +typedef __s16 int16_t; +typedef __u32 u_int32_t; +typedef __s32 int32_t; + +#endif /* !(__BIT_TYPES_DEFINED__) */ + +typedef __u8 uint8_t; +typedef __u16 uint16_t; +typedef __u32 uint32_t; + +#if defined(__GNUC__) && !defined(__STRICT_ANSI__) +typedef __u64 uint64_t; +typedef __u64 u_int64_t; +typedef __s64 int64_t; +#endif + +#endif /* __KERNEL_STRICT_NAMES */ + +/* + * Below are truly Linux-specific types that should never collide with + * any application/library that wants linux/types.h. + */ + +struct ustat { + __kernel_daddr_t f_tfree; + __kernel_ino_t f_tinode; + char f_fname[6]; + char f_fpack[6]; +}; + + + + + + + + + + + + + + +#ifndef __LITTLE_ENDIAN +#define __LITTLE_ENDIAN 1234 +#endif +#ifndef __LITTLE_ENDIAN_BITFIELD +#define __LITTLE_ENDIAN_BITFIELD +#endif + +#if 1 /* swab */ + +/* + * linux/byteorder/swab.h + * Byte-swapping, independently from CPU endianness + * swabXX[ps]?(foo) + * + * Francois-Rene Rideau 19971205 + * separated swab functions from cpu_to_XX, + * to clean up support for bizarre-endian architectures. + * + * See asm-i386/byteorder.h and suches for examples of how to provide + * architecture-dependent optimized versions + * + */ + +/* casts are necessary for constants, because we never know how for sure + * how U/UL/ULL map to __u16, __u32, __u64. At least not in a portable way. 
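+ *
+ * A couple of concrete values, purely as illustration of the macros
+ * defined just below:
+ *
+ *	___swab16(0x1234)     == 0x3412
+ *	___swab32(0x12345678) == 0x78563412
+ *	___swab24(0x123456)   == 0x563412	(only the outer bytes move)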
+ */ +#define ___swab16(x) \ +({ \ + __u16 __x = (x); \ + ((__u16)( \ + (((__u16)(__x) & (__u16)0x00ffU) << 8) | \ + (((__u16)(__x) & (__u16)0xff00U) >> 8) )); \ +}) + +#define ___swab24(x) \ +({ \ + __u32 __x = (x); \ + ((__u32)( \ + ((__x & (__u32)0x000000ffUL) << 16) | \ + (__x & (__u32)0x0000ff00UL) | \ + ((__x & (__u32)0x00ff0000UL) >> 16) )); \ +}) + +#define ___swab32(x) \ +({ \ + __u32 __x = (x); \ + ((__u32)( \ + (((__u32)(__x) & (__u32)0x000000ffUL) << 24) | \ + (((__u32)(__x) & (__u32)0x0000ff00UL) << 8) | \ + (((__u32)(__x) & (__u32)0x00ff0000UL) >> 8) | \ + (((__u32)(__x) & (__u32)0xff000000UL) >> 24) )); \ +}) + +#define ___swab64(x) \ +({ \ + __u64 __x = (x); \ + ((__u64)( \ + (__u64)(((__u64)(__x) & (__u64)0x00000000000000ffULL) << 56) | \ + (__u64)(((__u64)(__x) & (__u64)0x000000000000ff00ULL) << 40) | \ + (__u64)(((__u64)(__x) & (__u64)0x0000000000ff0000ULL) << 24) | \ + (__u64)(((__u64)(__x) & (__u64)0x00000000ff000000ULL) << 8) | \ + (__u64)(((__u64)(__x) & (__u64)0x000000ff00000000ULL) >> 8) | \ + (__u64)(((__u64)(__x) & (__u64)0x0000ff0000000000ULL) >> 24) | \ + (__u64)(((__u64)(__x) & (__u64)0x00ff000000000000ULL) >> 40) | \ + (__u64)(((__u64)(__x) & (__u64)0xff00000000000000ULL) >> 56) )); \ +}) + +#define ___constant_swab16(x) \ + ((__u16)( \ + (((__u16)(x) & (__u16)0x00ffU) << 8) | \ + (((__u16)(x) & (__u16)0xff00U) >> 8) )) +#define ___constant_swab24(x) \ + ((__u32)( \ + (((__u32)(x) & (__u32)0x000000ffU) << 16) | \ + (((__u32)(x) & (__u32)0x0000ff00U) | \ + (((__u32)(x) & (__u32)0x00ff0000U) >> 16) )) +#define ___constant_swab32(x) \ + ((__u32)( \ + (((__u32)(x) & (__u32)0x000000ffUL) << 24) | \ + (((__u32)(x) & (__u32)0x0000ff00UL) << 8) | \ + (((__u32)(x) & (__u32)0x00ff0000UL) >> 8) | \ + (((__u32)(x) & (__u32)0xff000000UL) >> 24) )) +#define ___constant_swab64(x) \ + ((__u64)( \ + (__u64)(((__u64)(x) & (__u64)0x00000000000000ffULL) << 56) | \ + (__u64)(((__u64)(x) & (__u64)0x000000000000ff00ULL) << 40) | \ + (__u64)(((__u64)(x) & (__u64)0x0000000000ff0000ULL) << 24) | \ + (__u64)(((__u64)(x) & (__u64)0x00000000ff000000ULL) << 8) | \ + (__u64)(((__u64)(x) & (__u64)0x000000ff00000000ULL) >> 8) | \ + (__u64)(((__u64)(x) & (__u64)0x0000ff0000000000ULL) >> 24) | \ + (__u64)(((__u64)(x) & (__u64)0x00ff000000000000ULL) >> 40) | \ + (__u64)(((__u64)(x) & (__u64)0xff00000000000000ULL) >> 56) )) + +/* + * provide defaults when no architecture-specific optimization is detected + */ +#ifndef __arch__swab16 +# define __arch__swab16(x) ({ __u16 __tmp = (x) ; ___swab16(__tmp); }) +#endif +#ifndef __arch__swab24 +# define __arch__swab24(x) ({ __u32 __tmp = (x) ; ___swab24(__tmp); }) +#endif +#ifndef __arch__swab32 +# define __arch__swab32(x) ({ __u32 __tmp = (x) ; ___swab32(__tmp); }) +#endif +#ifndef __arch__swab64 +# define __arch__swab64(x) ({ __u64 __tmp = (x) ; ___swab64(__tmp); }) +#endif + +#ifndef __arch__swab16p +# define __arch__swab16p(x) __arch__swab16(*(x)) +#endif +#ifndef __arch__swab24p +# define __arch__swab24p(x) __arch__swab24(*(x)) +#endif +#ifndef __arch__swab32p +# define __arch__swab32p(x) __arch__swab32(*(x)) +#endif +#ifndef __arch__swab64p +# define __arch__swab64p(x) __arch__swab64(*(x)) +#endif + +#ifndef __arch__swab16s +# define __arch__swab16s(x) do { *(x) = __arch__swab16p((x)); } while (0) +#endif +#ifndef __arch__swab24s +# define __arch__swab24s(x) do { *(x) = __arch__swab24p((x)); } while (0) +#endif +#ifndef __arch__swab32s +# define __arch__swab32s(x) do { *(x) = __arch__swab32p((x)); } while (0) +#endif +#ifndef __arch__swab64s +# 
define __arch__swab64s(x) do { *(x) = __arch__swab64p((x)); } while (0) +#endif + + +/* + * Allow constant folding + */ +#if defined(__GNUC__) && (__GNUC__ >= 2) && defined(__OPTIMIZE__) +# define __swab16(x) \ +(__builtin_constant_p((__u16)(x)) ? \ + ___swab16((x)) : \ + __fswab16((x))) +# define __swab24(x) \ +(__builtin_constant_p((__u32)(x)) ? \ + ___swab24((x)) : \ + __fswab24((x))) +# define __swab32(x) \ +(__builtin_constant_p((__u32)(x)) ? \ + ___swab32((x)) : \ + __fswab32((x))) +# define __swab64(x) \ +(__builtin_constant_p((__u64)(x)) ? \ + ___swab64((x)) : \ + __fswab64((x))) +#else +# define __swab16(x) __fswab16(x) +# define __swab24(x) __fswab24(x) +# define __swab32(x) __fswab32(x) +# define __swab64(x) __fswab64(x) +#endif /* OPTIMIZE */ + + +static __inline__ __const__ __u16 __fswab16(__u16 x) +{ + return __arch__swab16(x); +} +static __inline__ __u16 __swab16p(__u16 *x) +{ + return __arch__swab16p(x); +} +static __inline__ void __swab16s(__u16 *addr) +{ + __arch__swab16s(addr); +} + +static __inline__ __const__ __u32 __fswab24(__u32 x) +{ + return __arch__swab24(x); +} +static __inline__ __u32 __swab24p(__u32 *x) +{ + return __arch__swab24p(x); +} +static __inline__ void __swab24s(__u32 *addr) +{ + __arch__swab24s(addr); +} + +static __inline__ __const__ __u32 __fswab32(__u32 x) +{ + return __arch__swab32(x); +} +static __inline__ __u32 __swab32p(__u32 *x) +{ + return __arch__swab32p(x); +} +static __inline__ void __swab32s(__u32 *addr) +{ + __arch__swab32s(addr); +} + +#ifdef __BYTEORDER_HAS_U64__ +static __inline__ __const__ __u64 __fswab64(__u64 x) +{ +# ifdef __SWAB_64_THRU_32__ + __u32 h = x >> 32; + __u32 l = x & ((1ULL<<32)-1); + return (((__u64)__swab32(l)) << 32) | ((__u64)(__swab32(h))); +# else + return __arch__swab64(x); +# endif +} +static __inline__ __u64 __swab64p(__u64 *x) +{ + return __arch__swab64p(x); +} +static __inline__ void __swab64s(__u64 *addr) +{ + __arch__swab64s(addr); +} +#endif /* __BYTEORDER_HAS_U64__ */ + +#if defined(__KERNEL__) +#define swab16 __swab16 +#define swab24 __swab24 +#define swab32 __swab32 +#define swab64 __swab64 +#define swab16p __swab16p +#define swab24p __swab24p +#define swab32p __swab32p +#define swab64p __swab64p +#define swab16s __swab16s +#define swab24s __swab24s +#define swab32s __swab32s +#define swab64s __swab64s +#endif + +#endif /* swab */ + + + +#if 1 /* generic */ + +/* + * linux/byteorder_generic.h + * Generic Byte-reordering support + * + * Francois-Rene Rideau 19970707 + * gathered all the good ideas from all asm-foo/byteorder.h into one file, + * cleaned them up. + * I hope it is compliant with non-GCC compilers. + * I decided to put __BYTEORDER_HAS_U64__ in byteorder.h, + * because I wasn't sure it would be ok to put it in types.h + * Upgraded it to 2.1.43 + * Francois-Rene Rideau 19971012 + * Upgraded it to 2.1.57 + * to please Linus T., replaced huge #ifdef's between little/big endian + * by nestedly #include'd files. + * Francois-Rene Rideau 19971205 + * Made it to 2.1.71; now a facelift: + * Put files under include/linux/byteorder/ + * Split swab from generic support. + * + * TODO: + * = Regular kernel maintainers could also replace all these manual + * byteswap macros that remain, disseminated among drivers, + * after some grep or the sources... + * = Linus might want to rename all these macros and files to fit his taste, + * to fit his personal naming scheme. + * = it seems that a few drivers would also appreciate + * nybble swapping support... 
+ * = every architecture could add their byteswap macro in asm/byteorder.h + * see how some architectures already do (i386, alpha, ppc, etc) + * = cpu_to_beXX and beXX_to_cpu might some day need to be well + * distinguished throughout the kernel. This is not the case currently, + * since little endian, big endian, and pdp endian machines needn't it. + * But this might be the case for, say, a port of Linux to 20/21 bit + * architectures (and F21 Linux addict around?). + */ + +/* + * The following macros are to be defined by : + * + * Conversion of long and short int between network and host format + * ntohl(__u32 x) + * ntohs(__u16 x) + * htonl(__u32 x) + * htons(__u16 x) + * It seems that some programs (which? where? or perhaps a standard? POSIX?) + * might like the above to be functions, not macros (why?). + * if that's true, then detect them, and take measures. + * Anyway, the measure is: define only ___ntohl as a macro instead, + * and in a separate file, have + * unsigned long inline ntohl(x){return ___ntohl(x);} + * + * The same for constant arguments + * __constant_ntohl(__u32 x) + * __constant_ntohs(__u16 x) + * __constant_htonl(__u32 x) + * __constant_htons(__u16 x) + * + * Conversion of XX-bit integers (16- 32- or 64-) + * between native CPU format and little/big endian format + * 64-bit stuff only defined for proper architectures + * cpu_to_[bl]eXX(__uXX x) + * [bl]eXX_to_cpu(__uXX x) + * + * The same, but takes a pointer to the value to convert + * cpu_to_[bl]eXXp(__uXX x) + * [bl]eXX_to_cpup(__uXX x) + * + * The same, but change in situ + * cpu_to_[bl]eXXs(__uXX x) + * [bl]eXX_to_cpus(__uXX x) + * + * See asm-foo/byteorder.h for examples of how to provide + * architecture-optimized versions + * + */ + + +#if defined(__KERNEL__) +/* + * inside the kernel, we can use nicknames; + * outside of it, we must avoid POSIX namespace pollution... + */ +#define cpu_to_le64 __cpu_to_le64 +#define le64_to_cpu __le64_to_cpu +#define cpu_to_le32 __cpu_to_le32 +#define le32_to_cpu __le32_to_cpu +#define cpu_to_le16 __cpu_to_le16 +#define le16_to_cpu __le16_to_cpu +#define cpu_to_be64 __cpu_to_be64 +#define be64_to_cpu __be64_to_cpu +#define cpu_to_be32 __cpu_to_be32 +#define be32_to_cpu __be32_to_cpu +#define cpu_to_be16 __cpu_to_be16 +#define be16_to_cpu __be16_to_cpu +#define cpu_to_le64p __cpu_to_le64p +#define le64_to_cpup __le64_to_cpup +#define cpu_to_le32p __cpu_to_le32p +#define le32_to_cpup __le32_to_cpup +#define cpu_to_le16p __cpu_to_le16p +#define le16_to_cpup __le16_to_cpup +#define cpu_to_be64p __cpu_to_be64p +#define be64_to_cpup __be64_to_cpup +#define cpu_to_be32p __cpu_to_be32p +#define be32_to_cpup __be32_to_cpup +#define cpu_to_be16p __cpu_to_be16p +#define be16_to_cpup __be16_to_cpup +#define cpu_to_le64s __cpu_to_le64s +#define le64_to_cpus __le64_to_cpus +#define cpu_to_le32s __cpu_to_le32s +#define le32_to_cpus __le32_to_cpus +#define cpu_to_le16s __cpu_to_le16s +#define le16_to_cpus __le16_to_cpus +#define cpu_to_be64s __cpu_to_be64s +#define be64_to_cpus __be64_to_cpus +#define cpu_to_be32s __cpu_to_be32s +#define be32_to_cpus __be32_to_cpus +#define cpu_to_be16s __cpu_to_be16s +#define be16_to_cpus __be16_to_cpus +#endif + + +/* + * Handle ntohl and suches. These have various compatibility + * issues - like we want to give the prototype even though we + * also have a macro for them in case some strange program + * wants to take the address of the thing or something.. 
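+ *
+ * As a purely illustrative sketch, on this little-endian target the
+ * conversions really do swap bytes (on a big-endian build they would
+ * all be identity operations):
+ *
+ *	htons(80)         == 0x5000
+ *	ntohs(0x5000)     == 80
+ *	htonl(0x7f000001) == 0x0100007f	(127.0.0.1 in network byte order)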
+ * + * Note that these used to return a "long" in libc5, even though + * long is often 64-bit these days.. Thus the casts. + * + * They have to be macros in order to do the constant folding + * correctly - if the argument passed into a inline function + * it is no longer constant according to gcc.. + */ + +#undef ntohl +#undef ntohs +#undef htonl +#undef htons + +/* + * Do the prototypes. Somebody might want to take the + * address or some such sick thing.. + */ +#if defined(__KERNEL__) || (defined (__GLIBC__) && __GLIBC__ >= 2) +extern __u32 ntohl(__u32); +extern __u32 htonl(__u32); +#else +extern unsigned long int ntohl(unsigned long int); +extern unsigned long int htonl(unsigned long int); +#endif +extern unsigned short int ntohs(unsigned short int); +extern unsigned short int htons(unsigned short int); + + +#if defined(__GNUC__) && (__GNUC__ >= 2) && defined(__OPTIMIZE__) && !defined(__STRICT_ANSI__) + +#define ___htonl(x) __cpu_to_be32(x) +#define ___htons(x) __cpu_to_be16(x) +#define ___ntohl(x) __be32_to_cpu(x) +#define ___ntohs(x) __be16_to_cpu(x) + +#if defined(__KERNEL__) || (defined (__GLIBC__) && __GLIBC__ >= 2) +#define htonl(x) ___htonl(x) +#define ntohl(x) ___ntohl(x) +#else +#define htonl(x) ((unsigned long)___htonl(x)) +#define ntohl(x) ((unsigned long)___ntohl(x)) +#endif +#define htons(x) ___htons(x) +#define ntohs(x) ___ntohs(x) + +#endif /* OPTIMIZE */ + +#endif /* generic */ + + +#define __constant_htonl(x) ___constant_swab32((x)) +#define __constant_ntohl(x) ___constant_swab32((x)) +#define __constant_htons(x) ___constant_swab16((x)) +#define __constant_ntohs(x) ___constant_swab16((x)) +#define __constant_cpu_to_le64(x) ((__u64)(x)) +#define __constant_le64_to_cpu(x) ((__u64)(x)) +#define __constant_cpu_to_le32(x) ((__u32)(x)) +#define __constant_le32_to_cpu(x) ((__u32)(x)) +#define __constant_cpu_to_le24(x) ((__u32)(x)) +#define __constant_le24_to_cpu(x) ((__u32)(x)) +#define __constant_cpu_to_le16(x) ((__u16)(x)) +#define __constant_le16_to_cpu(x) ((__u16)(x)) +#define __constant_cpu_to_be64(x) ___constant_swab64((x)) +#define __constant_be64_to_cpu(x) ___constant_swab64((x)) +#define __constant_cpu_to_be32(x) ___constant_swab32((x)) +#define __constant_be32_to_cpu(x) ___constant_swab32((x)) +#define __constant_cpu_to_be24(x) ___constant_swab24((x)) +#define __constant_be24_to_cpu(x) ___constant_swab24((x)) +#define __constant_cpu_to_be16(x) ___constant_swab16((x)) +#define __constant_be16_to_cpu(x) ___constant_swab16((x)) +#define __cpu_to_le64(x) ((__u64)(x)) +#define __le64_to_cpu(x) ((__u64)(x)) +#define __cpu_to_le32(x) ((__u32)(x)) +#define __le32_to_cpu(x) ((__u32)(x)) +#define __cpu_to_le24(x) ((__u32)(x)) +#define __le24_to_cpu(x) ((__u32)(x)) +#define __cpu_to_le16(x) ((__u16)(x)) +#define __le16_to_cpu(x) ((__u16)(x)) +#define __cpu_to_be64(x) __swab64((x)) +#define __be64_to_cpu(x) __swab64((x)) +#define __cpu_to_be32(x) __swab32((x)) +#define __be32_to_cpu(x) __swab32((x)) +#define __cpu_to_be24(x) __swab24((x)) +#define __be24_to_cpu(x) __swab24((x)) +#define __cpu_to_be16(x) __swab16((x)) +#define __be16_to_cpu(x) __swab16((x)) +#define __cpu_to_le64p(x) (*(__u64*)(x)) +#define __le64_to_cpup(x) (*(__u64*)(x)) +#define __cpu_to_le32p(x) (*(__u32*)(x)) +#define __le32_to_cpup(x) (*(__u32*)(x)) +#define __cpu_to_le24p(x) (*(__u32*)(x)) +#define __le24_to_cpup(x) (*(__u32*)(x)) +#define __cpu_to_le16p(x) (*(__u16*)(x)) +#define __le16_to_cpup(x) (*(__u16*)(x)) +#define __cpu_to_be64p(x) __swab64p((x)) +#define __be64_to_cpup(x) __swab64p((x)) +#define 
__cpu_to_be32p(x) __swab32p((x)) +#define __be32_to_cpup(x) __swab32p((x)) +#define __cpu_to_be24p(x) __swab24p((x)) +#define __be24_to_cpup(x) __swab24p((x)) +#define __cpu_to_be16p(x) __swab16p((x)) +#define __be16_to_cpup(x) __swab16p((x)) +#define __cpu_to_le64s(x) do {} while (0) +#define __le64_to_cpus(x) do {} while (0) +#define __cpu_to_le32s(x) do {} while (0) +#define __le32_to_cpus(x) do {} while (0) +#define __cpu_to_le24s(x) do {} while (0) +#define __le24_to_cpus(x) do {} while (0) +#define __cpu_to_le16s(x) do {} while (0) +#define __le16_to_cpus(x) do {} while (0) +#define __cpu_to_be64s(x) __swab64s((x)) +#define __be64_to_cpus(x) __swab64s((x)) +#define __cpu_to_be32s(x) __swab32s((x)) +#define __be32_to_cpus(x) __swab32s((x)) +#define __cpu_to_be24s(x) __swab24s((x)) +#define __be24_to_cpus(x) __swab24s((x)) +#define __cpu_to_be16s(x) __swab16s((x)) +#define __be16_to_cpus(x) __swab16s((x)) + + + + + + + + +#if 1 + +/* Dummy types */ + +#define ____cacheline_aligned + +typedef struct +{ + volatile unsigned int lock; +} rwlock_t; + +typedef struct { + volatile unsigned int lock; +} spinlock_t; + +struct task_struct; + + + + + +#if 1 /* atomic */ + +/* + * Atomic operations that C can't guarantee us. Useful for + * resource counting etc.. + */ + +#ifdef CONFIG_SMP +#define LOCK "lock ; " +#else +#define LOCK "" +#endif + +/* + * Make sure gcc doesn't try to be clever and move things around + * on us. We need to use _exactly_ the address the user gave us, + * not some alias that contains the same information. + */ +typedef struct { volatile int counter; } atomic_t; + +#define ATOMIC_INIT(i) { (i) } + +/** + * atomic_read - read atomic variable + * @v: pointer of type atomic_t + * + * Atomically reads the value of @v. Note that the guaranteed + * useful range of an atomic_t is only 24 bits. + */ +#define atomic_read(v) ((v)->counter) + +/** + * atomic_set - set atomic variable + * @v: pointer of type atomic_t + * @i: required value + * + * Atomically sets the value of @v to @i. Note that the guaranteed + * useful range of an atomic_t is only 24 bits. + */ +#define atomic_set(v,i) (((v)->counter) = (i)) + +/** + * atomic_add - add integer to atomic variable + * @i: integer value to add + * @v: pointer of type atomic_t + * + * Atomically adds @i to @v. Note that the guaranteed useful range + * of an atomic_t is only 24 bits. + */ +static __inline__ void atomic_add(int i, atomic_t *v) +{ +#if 0 + __asm__ __volatile__( + LOCK "addl %1,%0" + :"=m" (v->counter) + :"ir" (i), "m" (v->counter)); +#endif +} + +/** + * atomic_sub - subtract the atomic variable + * @i: integer value to subtract + * @v: pointer of type atomic_t + * + * Atomically subtracts @i from @v. Note that the guaranteed + * useful range of an atomic_t is only 24 bits. + */ +static __inline__ void atomic_sub(int i, atomic_t *v) +{ +#if 0 + __asm__ __volatile__( + LOCK "subl %1,%0" + :"=m" (v->counter) + :"ir" (i), "m" (v->counter)); +#endif +} + +/** + * atomic_sub_and_test - subtract value from variable and test result + * @i: integer value to subtract + * @v: pointer of type atomic_t + * + * Atomically subtracts @i from @v and returns + * true if the result is zero, or false for all + * other cases. Note that the guaranteed + * useful range of an atomic_t is only 24 bits. 
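+ *
+ * Sketch of the intended use (note that the i386 asm body below is
+ * currently stubbed out with #if 0, so this port does not yet provide
+ * the real semantics):
+ *
+ *	atomic_t v = ATOMIC_INIT(3);
+ *	atomic_sub_and_test(3, &v)	evaluates true, the count hit zero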
+ */ +static __inline__ int atomic_sub_and_test(int i, atomic_t *v) +{ +#if 0 + unsigned char c; + + __asm__ __volatile__( + LOCK "subl %2,%0; sete %1" + :"=m" (v->counter), "=qm" (c) + :"ir" (i), "m" (v->counter) : "memory"); + return c; +#endif +} + +/** + * atomic_inc - increment atomic variable + * @v: pointer of type atomic_t + * + * Atomically increments @v by 1. Note that the guaranteed + * useful range of an atomic_t is only 24 bits. + */ +static __inline__ void atomic_inc(atomic_t *v) +{ +#if 0 + __asm__ __volatile__( + LOCK "incl %0" + :"=m" (v->counter) + :"m" (v->counter)); +#endif +} + +/** + * atomic_dec - decrement atomic variable + * @v: pointer of type atomic_t + * + * Atomically decrements @v by 1. Note that the guaranteed + * useful range of an atomic_t is only 24 bits. + */ +static __inline__ void atomic_dec(atomic_t *v) +{ +#if 0 + __asm__ __volatile__( + LOCK "decl %0" + :"=m" (v->counter) + :"m" (v->counter)); +#endif +} + +/** + * atomic_dec_and_test - decrement and test + * @v: pointer of type atomic_t + * + * Atomically decrements @v by 1 and + * returns true if the result is 0, or false for all other + * cases. Note that the guaranteed + * useful range of an atomic_t is only 24 bits. + */ +static __inline__ int atomic_dec_and_test(atomic_t *v) +{ +#if 0 + unsigned char c; + + __asm__ __volatile__( + LOCK "decl %0; sete %1" + :"=m" (v->counter), "=qm" (c) + :"m" (v->counter) : "memory"); + return c != 0; +#else + return 1; +#endif +} + +/** + * atomic_inc_and_test - increment and test + * @v: pointer of type atomic_t + * + * Atomically increments @v by 1 + * and returns true if the result is zero, or false for all + * other cases. Note that the guaranteed + * useful range of an atomic_t is only 24 bits. + */ +static __inline__ int atomic_inc_and_test(atomic_t *v) +{ +#if 0 + unsigned char c; + + __asm__ __volatile__( + LOCK "incl %0; sete %1" + :"=m" (v->counter), "=qm" (c) + :"m" (v->counter) : "memory"); + return c != 0; +#else + return 1; +#endif +} + +/** + * atomic_add_negative - add and test if negative + * @v: pointer of type atomic_t + * @i: integer value to add + * + * Atomically adds @i to @v and returns true + * if the result is negative, or false when + * result is greater than or equal to zero. Note that the guaranteed + * useful range of an atomic_t is only 24 bits. 
+ */ +static __inline__ int atomic_add_negative(int i, atomic_t *v) +{ +#if 0 + unsigned char c; + + __asm__ __volatile__( + LOCK "addl %2,%0; sets %1" + :"=m" (v->counter), "=qm" (c) + :"ir" (i), "m" (v->counter) : "memory"); + return c; +#else + return 0; +#endif +} + +/* These are x86-specific, used by some header files */ +#define atomic_clear_mask(mask, addr) +#if 0 +__asm__ __volatile__(LOCK "andl %0,%1" \ +: : "r" (~(mask)),"m" (*addr) : "memory") +#endif + +#define atomic_set_mask(mask, addr) +#if 0 +__asm__ __volatile__(LOCK "orl %0,%1" \ +: : "r" (mask),"m" (*addr) : "memory") +#endif + +/* Atomic operations are already serializing on x86 */ +#define smp_mb__before_atomic_dec() +#define smp_mb__after_atomic_dec() +#define smp_mb__before_atomic_inc() +#define smp_mb__after_atomic_inc() + + + +#endif /* atomic */ + + + + + +#if 1 /* list */ + +struct list_head { + struct list_head *next, *prev; +}; + +#define LIST_HEAD_INIT(name) { &(name), &(name) } + +#define LIST_HEAD(name) \ + struct list_head name = LIST_HEAD_INIT(name) + +#define INIT_LIST_HEAD(ptr) do { \ + (ptr)->next = (ptr); (ptr)->prev = (ptr); \ +} while (0) + +/* + * Insert a new entry between two known consecutive entries. + * + * This is only for internal list manipulation where we know + * the prev/next entries already! + */ +static inline void __list_add(struct list_head *new, + struct list_head *prev, + struct list_head *next) +{ +#if 0 + next->prev = new; + new->next = next; + new->prev = prev; + prev->next = new; +#endif +} + +/** + * list_add - add a new entry + * @new: new entry to be added + * @head: list head to add it after + * + * Insert a new entry after the specified head. + * This is good for implementing stacks. + */ +static inline void list_add(struct list_head *new, struct list_head *head) +{ +#if 0 + __list_add(new, head, head->next); +#endif +} + +/** + * list_add_tail - add a new entry + * @new: new entry to be added + * @head: list head to add it before + * + * Insert a new entry before the specified head. + * This is useful for implementing queues. + */ +static inline void list_add_tail(struct list_head *new, struct list_head *head) +{ +#if 0 + __list_add(new, head->prev, head); +#endif +} + +/* + * Delete a list entry by making the prev/next entries + * point to each other. + * + * This is only for internal list manipulation where we know + * the prev/next entries already! + */ +static inline void __list_del(struct list_head *prev, struct list_head *next) +{ + next->prev = prev; + prev->next = next; +} + +/** + * list_del - deletes entry from list. + * @entry: the element to delete from the list. + * Note: list_empty on entry does not return true after this, the entry is in an undefined state. + */ +static inline void list_del(struct list_head *entry) +{ +#if 0 + __list_del(entry->prev, entry->next); + entry->next = (void *) 0; + entry->prev = (void *) 0; +#endif +} + +/** + * list_del_init - deletes entry from list and reinitialize it. + * @entry: the element to delete from the list. 
+ */ +static inline void list_del_init(struct list_head *entry) +{ +#if 0 + __list_del(entry->prev, entry->next); + INIT_LIST_HEAD(entry); +#endif +} + +/** + * list_move - delete from one list and add as another's head + * @list: the entry to move + * @head: the head that will precede our entry + */ +static inline void list_move(struct list_head *list, struct list_head *head) +{ +#if 0 + __list_del(list->prev, list->next); + list_add(list, head); +#endif +} + +/** + * list_move_tail - delete from one list and add as another's tail + * @list: the entry to move + * @head: the head that will follow our entry + */ +static inline void list_move_tail(struct list_head *list, + struct list_head *head) +{ +#if 0 + __list_del(list->prev, list->next); + list_add_tail(list, head); +#endif +} + +/** + * list_empty - tests whether a list is empty + * @head: the list to test. + */ +static inline int list_empty(struct list_head *head) +{ + return head->next == head; +} + +static inline void __list_splice(struct list_head *list, + struct list_head *head) +{ +#if 0 + struct list_head *first = list->next; + struct list_head *last = list->prev; + struct list_head *at = head->next; + + first->prev = head; + head->next = first; + + last->next = at; + at->prev = last; +#endif +} + +/** + * list_splice - join two lists + * @list: the new list to add. + * @head: the place to add it in the first list. + */ +static inline void list_splice(struct list_head *list, struct list_head *head) +{ +#if 0 + if (!list_empty(list)) + __list_splice(list, head); +#endif +} + +/** + * list_splice_init - join two lists and reinitialise the emptied list. + * @list: the new list to add. + * @head: the place to add it in the first list. + * + * The list at @list is reinitialised + */ +static inline void list_splice_init(struct list_head *list, + struct list_head *head) +{ +#if 0 + if (!list_empty(list)) { + __list_splice(list, head); + INIT_LIST_HEAD(list); + } +#endif +} + +/** + * list_entry - get the struct for this entry + * @ptr: the &struct list_head pointer. + * @type: the type of the struct this is embedded in. + * @member: the name of the list_struct within the struct. + */ +#define list_entry(ptr, type, member) +#if 0 + ((type *)((char *)(ptr)-(unsigned long)(&((type *)0)->member))) +#endif + +/** + * list_for_each - iterate over a list + * @pos: the &struct list_head to use as a loop counter. + * @head: the head for your list. + */ +#define list_for_each(pos, head) +#if 0 + for (pos = (head)->next, prefetch(pos->next); pos != (head); \ + pos = pos->next, prefetch(pos->next)) +#endif + +/** + * list_for_each_prev - iterate over a list backwards + * @pos: the &struct list_head to use as a loop counter. + * @head: the head for your list. + */ +#define list_for_each_prev(pos, head) +#if 0 + for (pos = (head)->prev, prefetch(pos->prev); pos != (head); \ + pos = pos->prev, prefetch(pos->prev)) +#endif + +/** + * list_for_each_safe - iterate over a list safe against removal of list entry + * @pos: the &struct list_head to use as a loop counter. + * @n: another &struct list_head to use as temporary storage + * @head: the head for your list. + */ +#define list_for_each_safe(pos, n, head) +#if 0 + for (pos = (head)->next, n = pos->next; pos != (head); \ + pos = n, n = pos->next) +#endif + +/** + * list_for_each_entry - iterate over list of given type + * @pos: the type * to use as a loop counter. + * @head: the head for your list. + * @member: the name of the list_struct within the struct. 
+ */ +#define list_for_each_entry(pos, head, member) +#if 0 + for (pos = list_entry((head)->next, typeof(*pos), member), \ + prefetch(pos->member.next); \ + &pos->member != (head); \ + pos = list_entry(pos->member.next, typeof(*pos), member), \ + prefetch(pos->member.next)) +#endif + +#endif /* list */ + + + + + +#if 1 /* wait */ + +#define WNOHANG 0x00000001 +#define WUNTRACED 0x00000002 + +#define __WNOTHREAD 0x20000000 /* Don't wait on children of other threads in this group */ +#define __WALL 0x40000000 /* Wait on all children, regardless of type */ +#define __WCLONE 0x80000000 /* Wait only on non-SIGCHLD children */ + +#if 0 +#include +#include +#include +#include +#include + +#include +#include +#endif + +/* + * Debug control. Slow but useful. + */ +#if defined(CONFIG_DEBUG_WAITQ) +#define WAITQUEUE_DEBUG 1 +#else +#define WAITQUEUE_DEBUG 0 +#endif + +struct __wait_queue { + unsigned int flags; +#define WQ_FLAG_EXCLUSIVE 0x01 + struct task_struct * task; + struct list_head task_list; +#if WAITQUEUE_DEBUG + long __magic; + long __waker; +#endif +}; +typedef struct __wait_queue wait_queue_t; + +/* + * 'dual' spinlock architecture. Can be switched between spinlock_t and + * rwlock_t locks via changing this define. Since waitqueues are quite + * decoupled in the new architecture, lightweight 'simple' spinlocks give + * us slightly better latencies and smaller waitqueue structure size. + */ +#define USE_RW_WAIT_QUEUE_SPINLOCK 0 + +#if USE_RW_WAIT_QUEUE_SPINLOCK +# define wq_lock_t rwlock_t +# define WAITQUEUE_RW_LOCK_UNLOCKED RW_LOCK_UNLOCKED + +# define wq_read_lock read_lock +# define wq_read_lock_irqsave read_lock_irqsave +# define wq_read_unlock_irqrestore read_unlock_irqrestore +# define wq_read_unlock read_unlock +# define wq_write_lock_irq write_lock_irq +# define wq_write_lock_irqsave write_lock_irqsave +# define wq_write_unlock_irqrestore write_unlock_irqrestore +# define wq_write_unlock write_unlock +#else +# define wq_lock_t spinlock_t +# define WAITQUEUE_RW_LOCK_UNLOCKED SPIN_LOCK_UNLOCKED + +# define wq_read_lock spin_lock +# define wq_read_lock_irqsave spin_lock_irqsave +# define wq_read_unlock spin_unlock +# define wq_read_unlock_irqrestore spin_unlock_irqrestore +# define wq_write_lock_irq spin_lock_irq +# define wq_write_lock_irqsave spin_lock_irqsave +# define wq_write_unlock_irqrestore spin_unlock_irqrestore +# define wq_write_unlock spin_unlock +#endif + +struct __wait_queue_head { + wq_lock_t lock; + struct list_head task_list; +#if WAITQUEUE_DEBUG + long __magic; + long __creator; +#endif +}; +typedef struct __wait_queue_head wait_queue_head_t; + + +/* + * Debugging macros. We eschew `do { } while (0)' because gcc can generate + * spurious .aligns. 
+ */ +#if WAITQUEUE_DEBUG +#define WQ_BUG() BUG() +#define CHECK_MAGIC(x) +#if 0 + do { \ + if ((x) != (long)&(x)) { \ + printk("bad magic %lx (should be %lx), ", \ + (long)x, (long)&(x)); \ + WQ_BUG(); \ + } \ + } while (0) +#endif + +#define CHECK_MAGIC_WQHEAD(x) +#if 0 + do { \ + if ((x)->__magic != (long)&((x)->__magic)) { \ + printk("bad magic %lx (should be %lx, creator %lx), ", \ + (x)->__magic, (long)&((x)->__magic), (x)->__creator); \ + WQ_BUG(); \ + } \ + } while (0) +#endif + +#define WQ_CHECK_LIST_HEAD(list) +#if 0 + do { \ + if (!(list)->next || !(list)->prev) \ + WQ_BUG(); \ + } while(0) +#endif + +#define WQ_NOTE_WAKER(tsk) +#if 0 + do { \ + (tsk)->__waker = (long)__builtin_return_address(0); \ + } while (0) +#endif +#else +#define WQ_BUG() +#define CHECK_MAGIC(x) +#define CHECK_MAGIC_WQHEAD(x) +#define WQ_CHECK_LIST_HEAD(list) +#define WQ_NOTE_WAKER(tsk) +#endif + +/* + * Macros for declaration and initialisaton of the datatypes + */ + +#if WAITQUEUE_DEBUG +# define __WAITQUEUE_DEBUG_INIT(name) //(long)&(name).__magic, 0 +# define __WAITQUEUE_HEAD_DEBUG_INIT(name) //(long)&(name).__magic, (long)&(name).__magic +#else +# define __WAITQUEUE_DEBUG_INIT(name) +# define __WAITQUEUE_HEAD_DEBUG_INIT(name) +#endif + +#define __WAITQUEUE_INITIALIZER(name, tsk) +#if 0 +{ + task: tsk, \ + task_list: { NULL, NULL }, \ + __WAITQUEUE_DEBUG_INIT(name)} +#endif + +#define DECLARE_WAITQUEUE(name, tsk) +#if 0 + wait_queue_t name = __WAITQUEUE_INITIALIZER(name, tsk) +#endif + +#define __WAIT_QUEUE_HEAD_INITIALIZER(name) +#if 0 +{ + lock: WAITQUEUE_RW_LOCK_UNLOCKED, \ + task_list: { &(name).task_list, &(name).task_list }, \ + __WAITQUEUE_HEAD_DEBUG_INIT(name)} +#endif + +#define DECLARE_WAIT_QUEUE_HEAD(name) +#if 0 + wait_queue_head_t name = __WAIT_QUEUE_HEAD_INITIALIZER(name) +#endif + +static inline void init_waitqueue_head(wait_queue_head_t *q) +{ +#if 0 +#if WAITQUEUE_DEBUG + if (!q) + WQ_BUG(); +#endif + q->lock = WAITQUEUE_RW_LOCK_UNLOCKED; + INIT_LIST_HEAD(&q->task_list); +#if WAITQUEUE_DEBUG + q->__magic = (long)&q->__magic; + q->__creator = (long)current_text_addr(); +#endif +#endif +} + +static inline void init_waitqueue_entry(wait_queue_t *q, struct task_struct *p) +{ +#if 0 +#if WAITQUEUE_DEBUG + if (!q || !p) + WQ_BUG(); +#endif + q->flags = 0; + q->task = p; +#if WAITQUEUE_DEBUG + q->__magic = (long)&q->__magic; +#endif +#endif +} + +static inline int waitqueue_active(wait_queue_head_t *q) +{ +#if 0 +#if WAITQUEUE_DEBUG + if (!q) + WQ_BUG(); + CHECK_MAGIC_WQHEAD(q); +#endif + + return !list_empty(&q->task_list); +#endif +} + +static inline void __add_wait_queue(wait_queue_head_t *head, wait_queue_t *new) +{ +#if 0 +#if WAITQUEUE_DEBUG + if (!head || !new) + WQ_BUG(); + CHECK_MAGIC_WQHEAD(head); + CHECK_MAGIC(new->__magic); + if (!head->task_list.next || !head->task_list.prev) + WQ_BUG(); +#endif + list_add(&new->task_list, &head->task_list); +#endif +} + +/* + * Used for wake-one threads: + */ +static inline void __add_wait_queue_tail(wait_queue_head_t *head, + wait_queue_t *new) +{ +#if 0 +#if WAITQUEUE_DEBUG + if (!head || !new) + WQ_BUG(); + CHECK_MAGIC_WQHEAD(head); + CHECK_MAGIC(new->__magic); + if (!head->task_list.next || !head->task_list.prev) + WQ_BUG(); +#endif + list_add_tail(&new->task_list, &head->task_list); +#endif +} + +static inline void __remove_wait_queue(wait_queue_head_t *head, + wait_queue_t *old) +{ +#if 0 +#if WAITQUEUE_DEBUG + if (!old) + WQ_BUG(); + CHECK_MAGIC(old->__magic); +#endif + list_del(&old->task_list); +#endif +} + + + + +#endif /* wait */ + + 
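+/*
+ * Illustrative sketch only: how the list_head helpers above are normally
+ * used in the code this was ported from.  "rx_entry" and "rx_list" are
+ * made-up names, and several of the helper bodies above are still stubbed
+ * out with #if 0, so this is kept as documentation rather than live code.
+ */
+#if 0
+struct rx_entry {
+	int			length;
+	struct list_head	link;		/* embedded list node */
+};
+
+static LIST_HEAD(rx_list);			/* empty list, head points at itself */
+
+static void rx_example(struct rx_entry *e)
+{
+	struct list_head *pos;
+
+	list_add_tail(&e->link, &rx_list);	/* append at the tail */
+
+	list_for_each(pos, &rx_list) {		/* walk head->next onwards */
+		struct rx_entry *cur =
+			list_entry(pos, struct rx_entry, link);
+		cur->length = 0;
+	}
+
+	list_del(&e->link);			/* unlink again */
+}
+#endif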
+#endif + + + + +#if 1 /* slab */ + +typedef struct +{ + int x; +} kmem_cache_s; + +typedef struct kmem_cache_s kmem_cache_t; + +#if 0 +#include +#include +#endif + +/* flags for kmem_cache_alloc() */ +#define SLAB_NOFS GFP_NOFS +#define SLAB_NOIO GFP_NOIO +#define SLAB_NOHIGHIO GFP_NOHIGHIO +#define SLAB_ATOMIC GFP_ATOMIC +#define SLAB_USER GFP_USER +#define SLAB_KERNEL GFP_KERNEL +#define SLAB_NFS GFP_NFS +#define SLAB_DMA GFP_DMA + +#define SLAB_LEVEL_MASK (__GFP_WAIT|__GFP_HIGH|__GFP_IO|__GFP_HIGHIO|__GFP_FS) +#define SLAB_NO_GROW 0x00001000UL /* don't grow a cache */ + +/* flags to pass to kmem_cache_create(). + * The first 3 are only valid when the allocator as been build + * SLAB_DEBUG_SUPPORT. + */ +#define SLAB_DEBUG_FREE 0x00000100UL /* Peform (expensive) checks on free */ +#define SLAB_DEBUG_INITIAL 0x00000200UL /* Call constructor (as verifier) */ +#define SLAB_RED_ZONE 0x00000400UL /* Red zone objs in a cache */ +#define SLAB_POISON 0x00000800UL /* Poison objects */ +#define SLAB_NO_REAP 0x00001000UL /* never reap from the cache */ +#define SLAB_HWCACHE_ALIGN 0x00002000UL /* align objs on a h/w cache lines */ +#define SLAB_CACHE_DMA 0x00004000UL /* use GFP_DMA memory */ +#define SLAB_MUST_HWCACHE_ALIGN 0x00008000UL /* force alignment */ + +/* flags passed to a constructor func */ +#define SLAB_CTOR_CONSTRUCTOR 0x001UL /* if not set, then deconstructor */ +#define SLAB_CTOR_ATOMIC 0x002UL /* tell constructor it can't sleep */ +#define SLAB_CTOR_VERIFY 0x004UL /* tell constructor it's a verify call */ + +/* prototypes */ +extern void kmem_cache_init(void); +extern void kmem_cache_sizes_init(void); + +extern kmem_cache_t *kmem_find_general_cachep(size_t, int gfpflags); +extern kmem_cache_t *kmem_cache_create(const char *, size_t, size_t, unsigned long, + void (*)(void *, kmem_cache_t *, unsigned long), + void (*)(void *, kmem_cache_t *, unsigned long)); +extern int kmem_cache_destroy(kmem_cache_t *); +extern int kmem_cache_shrink(kmem_cache_t *); +extern void *kmem_cache_alloc(kmem_cache_t *, int); +extern void kmem_cache_free(kmem_cache_t *, void *); +extern unsigned int kmem_cache_size(kmem_cache_t *); + +extern void *kmalloc(size_t, int); +extern void kfree(const void *); + +//extern int FASTCALL(kmem_cache_reap(int)); + +/* System wide caches */ +extern kmem_cache_t *vm_area_cachep; +extern kmem_cache_t *mm_cachep; +extern kmem_cache_t *names_cachep; +extern kmem_cache_t *files_cachep; +extern kmem_cache_t *filp_cachep; +extern kmem_cache_t *dquot_cachep; +extern kmem_cache_t *bh_cachep; +extern kmem_cache_t *fs_cachep; +extern kmem_cache_t *sigact_cachep; + +#endif /* slab */ + + + +/* + * Berkeley style UIO structures - Alan Cox 1994. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + + +/* A word of warning: Our uio structure will clash with the C library one (which is now obsolete). 
Remove the C + library one from sys/uio.h if you have a very old library set */ + +struct iovec +{ + void *iov_base; /* BSD uses caddr_t (1003.1g requires void *) */ + __kernel_size_t iov_len; /* Must be size_t (1003.1g) */ +}; + +/* + * UIO_MAXIOV shall be at least 16 1003.1g (5.4.1.1) + */ + +#define UIO_FASTIOV 8 +#define UIO_MAXIOV 1024 +#if 0 +#define UIO_MAXIOV 16 /* Maximum iovec's in one operation + 16 matches BSD */ + /* Beg pardon: BSD has 1024 --ANK */ +#endif + + + +/* + * In Linux 2.4, static timers have been removed from the kernel. + * Timers may be dynamically created and destroyed, and should be initialized + * by a call to init_timer() upon creation. + * + * The "data" field enables use of a common timeout function for several + * timeouts. You can use this field to distinguish between the different + * invocations. + */ +struct timer_list { + struct list_head list; + unsigned long expires; + unsigned long data; + void (*function)(unsigned long); +}; + + + +struct timeval { + unsigned long tv_sec; + unsigned long tv_usec; +// time_t tv_sec; /* seconds */ +// suseconds_t tv_usec; /* microseconds */ +}; + + + + + + + +#if 1 /* poll */ + +struct file; + +struct poll_table_page; + +typedef struct poll_table_struct { + int error; + struct poll_table_page * table; +} poll_table; + +extern void __pollwait(struct file * filp, wait_queue_head_t * wait_address, poll_table *p); + +static inline void poll_wait(struct file * filp, wait_queue_head_t * wait_address, poll_table *p) +{ + if (p && wait_address) + __pollwait(filp, wait_address, p); +} + +static inline void poll_initwait(poll_table* pt) +{ + pt->error = 0; + pt->table = NULL; +} +extern void poll_freewait(poll_table* pt); + + +/* + * Scaleable version of the fd_set. + */ + +typedef struct { + unsigned long *in, *out, *ex; + unsigned long *res_in, *res_out, *res_ex; +} fd_set_bits; + +/* + * How many longwords for "nr" bits? + */ +#define FDS_BITPERLONG (8*sizeof(long)) +#define FDS_LONGS(nr) (((nr)+FDS_BITPERLONG-1)/FDS_BITPERLONG) +#define FDS_BYTES(nr) (FDS_LONGS(nr)*sizeof(long)) + +/* + * We do a VERIFY_WRITE here even though we are only reading this time: + * we'll write to it eventually.. + * + * Use "unsigned long" accesses to let user-mode fd_set's be long-aligned. + */ +static inline +int get_fd_set(unsigned long nr, void *ufdset, unsigned long *fdset) +{ +#if 0 + nr = FDS_BYTES(nr); + if (ufdset) { + int error; + error = verify_area(VERIFY_WRITE, ufdset, nr); + if (!error && __copy_from_user(fdset, ufdset, nr)) + error = -EFAULT; + return error; + } + memset(fdset, 0, nr); + return 0; +#else + return 0; +#endif +} + +static inline +void set_fd_set(unsigned long nr, void *ufdset, unsigned long *fdset) +{ +#if 0 + if (ufdset) + __copy_to_user(ufdset, fdset, FDS_BYTES(nr)); +#endif +} + +static inline +void zero_fd_set(unsigned long nr, unsigned long *fdset) +{ +#if 0 + memset(fdset, 0, FDS_BYTES(nr)); +#endif +} + +extern int do_select(int n, fd_set_bits *fds, long *timeout); + +#endif /* poll */ + + + +typedef struct +{ + int x; +} read_descriptor_t; + + + + + +#if 1 /* poll */ + +/* These are specified by iBCS2 */ +#define POLLIN 0x0001 +#define POLLPRI 0x0002 +#define POLLOUT 0x0004 +#define POLLERR 0x0008 +#define POLLHUP 0x0010 +#define POLLNVAL 0x0020 + +/* The rest seem to be more-or-less nonstandard. Check them! 
*/ +#define POLLRDNORM 0x0040 +#define POLLRDBAND 0x0080 +#define POLLWRNORM 0x0100 +#define POLLWRBAND 0x0200 +#define POLLMSG 0x0400 + +struct pollfd { + int fd; + short events; + short revents; +}; + +#endif /* poll */ + +#endif /* _LINUX_TYPES_H */ diff --git a/reactos/drivers/net/tcpip/include/tcpcore.h b/reactos/drivers/net/tcpip/include/tcpcore.h new file mode 100755 index 00000000000..cda54ed221e --- /dev/null +++ b/reactos/drivers/net/tcpip/include/tcpcore.h @@ -0,0 +1,3856 @@ +/* + * COPYRIGHT: See COPYING in the top level directory + * PROJECT: ReactOS TCP/IP protocol driver + * FILE: include/tcpcore.h + * PURPOSE: Transmission Control Protocol definitions + * REVISIONS: + * CSH 01/01-2003 Ported from linux kernel 2.4.20 + */ + +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * Definitions for the TCP module. + * + * Version: @(#)tcp.h 1.0.5 05/23/93 + * + * Authors: Ross Biro, + * Fred N. van Kempen, + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#ifndef __TCPCORE_H +#define __TCPCORE_H + +#include "tcpdef.h" + + +struct socket; + + + +#if 1 /* skbuff */ + +#define HAVE_ALLOC_SKB /* For the drivers to know */ +#define HAVE_ALIGNABLE_SKB /* Ditto 8) */ +#define SLAB_SKB /* Slabified skbuffs */ + +#define CHECKSUM_NONE 0 +#define CHECKSUM_HW 1 +#define CHECKSUM_UNNECESSARY 2 + +#define SKB_DATA_ALIGN(X) (((X) + (SMP_CACHE_BYTES-1)) & ~(SMP_CACHE_BYTES-1)) +#define SKB_MAX_ORDER(X,ORDER) (((PAGE_SIZE<<(ORDER)) - (X) - sizeof(struct skb_shared_info))&~(SMP_CACHE_BYTES-1)) +#define SKB_MAX_HEAD(X) (SKB_MAX_ORDER((X),0)) +#define SKB_MAX_ALLOC (SKB_MAX_ORDER(0,2)) + +/* A. Checksumming of received packets by device. + * + * NONE: device failed to checksum this packet. + * skb->csum is undefined. + * + * UNNECESSARY: device parsed packet and wouldbe verified checksum. + * skb->csum is undefined. + * It is bad option, but, unfortunately, many of vendors do this. + * Apparently with secret goal to sell you new device, when you + * will add new protocol to your host. F.e. IPv6. 8) + * + * HW: the most generic way. Device supplied checksum of _all_ + * the packet as seen by netif_rx in skb->csum. + * NOTE: Even if device supports only some protocols, but + * is able to produce some skb->csum, it MUST use HW, + * not UNNECESSARY. + * + * B. Checksumming on output. + * + * NONE: skb is checksummed by protocol or csum is not required. + * + * HW: device is required to csum packet as seen by hard_start_xmit + * from skb->h.raw to the end and to record the checksum + * at skb->h.raw+skb->csum. + * + * Device must show its capabilities in dev->features, set + * at device setup time. + * NETIF_F_HW_CSUM - it is clever device, it is able to checksum + * everything. + * NETIF_F_NO_CSUM - loopback or reliable single hop media. + * NETIF_F_IP_CSUM - device is dumb. It is able to csum only + * TCP/UDP over IPv4. Sigh. Vendors like this + * way by an unknown reason. Though, see comment above + * about CHECKSUM_UNNECESSARY. 8) + * + * Any questions? No questions, good. 
--ANK + */ + +#ifdef __i386__ +#define NET_CALLER(arg) (*(((void**)&arg)-1)) +#else +#define NET_CALLER(arg) __builtin_return_address(0) +#endif + +#ifdef CONFIG_NETFILTER +struct nf_conntrack { + atomic_t use; + void (*destroy)(struct nf_conntrack *); +}; + +struct nf_ct_info { + struct nf_conntrack *master; +}; +#endif + +struct sk_buff_head { + /* These two members must be first. */ + struct sk_buff * next; + struct sk_buff * prev; + + __u32 qlen; + spinlock_t lock; +}; + +struct sk_buff; + +#define MAX_SKB_FRAGS 6 + +typedef struct skb_frag_struct skb_frag_t; + +struct skb_frag_struct +{ + struct page *page; + __u16 page_offset; + __u16 size; +}; + +/* This data is invariant across clones and lives at + * the end of the header data, ie. at skb->end. + */ +struct skb_shared_info { + atomic_t dataref; + unsigned int nr_frags; + struct sk_buff *frag_list; + skb_frag_t frags[MAX_SKB_FRAGS]; +}; + +struct sk_buff { + /* These two members must be first. */ + struct sk_buff * next; /* Next buffer in list */ + struct sk_buff * prev; /* Previous buffer in list */ + + struct sk_buff_head * list; /* List we are on */ + struct sock *sk; /* Socket we are owned by */ + struct timeval stamp; /* Time we arrived */ + struct net_device *dev; /* Device we arrived on/are leaving by */ + + /* Transport layer header */ + union + { + struct tcphdr *th; + struct udphdr *uh; + struct icmphdr *icmph; + struct igmphdr *igmph; + struct iphdr *ipiph; + struct spxhdr *spxh; + unsigned char *raw; + } h; + + /* Network layer header */ + union + { + struct iphdr *iph; + struct ipv6hdr *ipv6h; + struct arphdr *arph; + struct ipxhdr *ipxh; + unsigned char *raw; + } nh; + + /* Link layer header */ + union + { + struct ethhdr *ethernet; + unsigned char *raw; + } mac; + + struct dst_entry *dst; + + /* + * This is the control buffer. It is free to use for every + * layer. Please put your private variables there. If you + * want to keep them across layers you have to do a skb_clone() + * first. This is owned by whoever has the skb queued ATM. + */ + char cb[48]; + + unsigned int len; /* Length of actual data */ + unsigned int data_len; + unsigned int csum; /* Checksum */ + unsigned char __unused, /* Dead field, may be reused */ + cloned, /* head may be cloned (check refcnt to be sure). */ + pkt_type, /* Packet class */ + ip_summed; /* Driver fed us an IP checksum */ + __u32 priority; /* Packet queueing priority */ + atomic_t users; /* User count - see datagram.c,tcp.c */ + unsigned short protocol; /* Packet protocol from driver. */ + unsigned short security; /* Security level of packet */ + unsigned int truesize; /* Buffer size */ + + unsigned char *head; /* Head of buffer */ + unsigned char *data; /* Data head pointer */ + unsigned char *tail; /* Tail pointer */ + unsigned char *end; /* End pointer */ + + void (*destructor)(struct sk_buff *); /* Destruct function */ +#ifdef CONFIG_NETFILTER + /* Can be used for communication between hooks. 
*/ + unsigned long nfmark; + /* Cache info */ + __u32 nfcache; + /* Associated connection, if any */ + struct nf_ct_info *nfct; +#ifdef CONFIG_NETFILTER_DEBUG + unsigned int nf_debug; +#endif +#endif /*CONFIG_NETFILTER*/ + +#if defined(CONFIG_HIPPI) + union{ + __u32 ifield; + } private; +#endif + +#ifdef CONFIG_NET_SCHED + __u32 tc_index; /* traffic control index */ +#endif +}; + +#define SK_WMEM_MAX 65535 +#define SK_RMEM_MAX 65535 + +#if 1 +//#ifdef __KERNEL__ +/* + * Handling routines are only of interest to the kernel + */ + +extern void __kfree_skb(struct sk_buff *skb); +extern struct sk_buff * alloc_skb(unsigned int size, int priority); +extern void kfree_skbmem(struct sk_buff *skb); +extern struct sk_buff * skb_clone(struct sk_buff *skb, int priority); +extern struct sk_buff * skb_copy(const struct sk_buff *skb, int priority); +extern struct sk_buff * pskb_copy(struct sk_buff *skb, int gfp_mask); +extern int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, int gfp_mask); +extern struct sk_buff * skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom); +extern struct sk_buff * skb_copy_expand(const struct sk_buff *skb, + int newheadroom, + int newtailroom, + int priority); +#define dev_kfree_skb(a) kfree_skb(a) +extern void skb_over_panic(struct sk_buff *skb, int len, void *here); +extern void skb_under_panic(struct sk_buff *skb, int len, void *here); + +/* Internal */ +#define skb_shinfo(SKB) ((struct skb_shared_info *)((SKB)->end)) + +/** + * skb_queue_empty - check if a queue is empty + * @list: queue head + * + * Returns true if the queue is empty, false otherwise. + */ + +static inline int skb_queue_empty(struct sk_buff_head *list) +{ + return (list->next == (struct sk_buff *) list); +} + +/** + * skb_get - reference buffer + * @skb: buffer to reference + * + * Makes another reference to a socket buffer and returns a pointer + * to the buffer. + */ + +static inline struct sk_buff *skb_get(struct sk_buff *skb) +{ + atomic_inc(&skb->users); + return skb; +} + +/* + * If users==1, we are the only owner and are can avoid redundant + * atomic change. + */ + +/** + * kfree_skb - free an sk_buff + * @skb: buffer to free + * + * Drop a reference to the buffer and free it if the usage count has + * hit zero. + */ + +static inline void kfree_skb(struct sk_buff *skb) +{ + if (atomic_read(&skb->users) == 1 || atomic_dec_and_test(&skb->users)) + __kfree_skb(skb); +} + +/* Use this if you didn't touch the skb state [for fast switching] */ +static inline void kfree_skb_fast(struct sk_buff *skb) +{ + if (atomic_read(&skb->users) == 1 || atomic_dec_and_test(&skb->users)) + kfree_skbmem(skb); +} + +/** + * skb_cloned - is the buffer a clone + * @skb: buffer to check + * + * Returns true if the buffer was generated with skb_clone() and is + * one of multiple shared copies of the buffer. Cloned buffers are + * shared data so must not be written to under normal circumstances. + */ + +static inline int skb_cloned(struct sk_buff *skb) +{ + return skb->cloned && atomic_read(&skb_shinfo(skb)->dataref) != 1; +} + +/** + * skb_shared - is the buffer shared + * @skb: buffer to check + * + * Returns true if more than one person has a reference to this + * buffer. 
+ */ + +static inline int skb_shared(struct sk_buff *skb) +{ + return (atomic_read(&skb->users) != 1); +} + +/** + * skb_share_check - check if buffer is shared and if so clone it + * @skb: buffer to check + * @pri: priority for memory allocation + * + * If the buffer is shared the buffer is cloned and the old copy + * drops a reference. A new clone with a single reference is returned. + * If the buffer is not shared the original buffer is returned. When + * being called from interrupt status or with spinlocks held pri must + * be GFP_ATOMIC. + * + * NULL is returned on a memory allocation failure. + */ + +static inline struct sk_buff *skb_share_check(struct sk_buff *skb, int pri) +{ + if (skb_shared(skb)) { + struct sk_buff *nskb; + nskb = skb_clone(skb, pri); + kfree_skb(skb); + return nskb; + } + return skb; +} + + +/* + * Copy shared buffers into a new sk_buff. We effectively do COW on + * packets to handle cases where we have a local reader and forward + * and a couple of other messy ones. The normal one is tcpdumping + * a packet thats being forwarded. + */ + +/** + * skb_unshare - make a copy of a shared buffer + * @skb: buffer to check + * @pri: priority for memory allocation + * + * If the socket buffer is a clone then this function creates a new + * copy of the data, drops a reference count on the old copy and returns + * the new copy with the reference count at 1. If the buffer is not a clone + * the original buffer is returned. When called with a spinlock held or + * from interrupt state @pri must be %GFP_ATOMIC + * + * %NULL is returned on a memory allocation failure. + */ + +static inline struct sk_buff *skb_unshare(struct sk_buff *skb, int pri) +{ + struct sk_buff *nskb; + if(!skb_cloned(skb)) + return skb; + nskb=skb_copy(skb, pri); + kfree_skb(skb); /* Free our shared copy */ + return nskb; +} + +/** + * skb_peek + * @list_: list to peek at + * + * Peek an &sk_buff. Unlike most other operations you _MUST_ + * be careful with this one. A peek leaves the buffer on the + * list and someone else may run off with it. You must hold + * the appropriate locks or have a private queue to do this. + * + * Returns %NULL for an empty list or a pointer to the head element. + * The reference count is not incremented and the reference is therefore + * volatile. Use with caution. + */ + +static inline struct sk_buff *skb_peek(struct sk_buff_head *list_) +{ + struct sk_buff *list = ((struct sk_buff *)list_)->next; + if (list == (struct sk_buff *)list_) + list = NULL; + return list; +} + +/** + * skb_peek_tail + * @list_: list to peek at + * + * Peek an &sk_buff. Unlike most other operations you _MUST_ + * be careful with this one. A peek leaves the buffer on the + * list and someone else may run off with it. You must hold + * the appropriate locks or have a private queue to do this. + * + * Returns %NULL for an empty list or a pointer to the tail element. + * The reference count is not incremented and the reference is therefore + * volatile. Use with caution. + */ + +static inline struct sk_buff *skb_peek_tail(struct sk_buff_head *list_) +{ + struct sk_buff *list = ((struct sk_buff *)list_)->prev; + if (list == (struct sk_buff *)list_) + list = NULL; + return list; +} + +/** + * skb_queue_len - get queue length + * @list_: list to measure + * + * Return the length of an &sk_buff queue. 
+ */ + +static inline __u32 skb_queue_len(struct sk_buff_head *list_) +{ + return(list_->qlen); +} + +static inline void skb_queue_head_init(struct sk_buff_head *list) +{ + spin_lock_init(&list->lock); + list->prev = (struct sk_buff *)list; + list->next = (struct sk_buff *)list; + list->qlen = 0; +} + +/* + * Insert an sk_buff at the start of a list. + * + * The "__skb_xxxx()" functions are the non-atomic ones that + * can only be called with interrupts disabled. + */ + +/** + * __skb_queue_head - queue a buffer at the list head + * @list: list to use + * @newsk: buffer to queue + * + * Queue a buffer at the start of a list. This function takes no locks + * and you must therefore hold required locks before calling it. + * + * A buffer cannot be placed on two lists at the same time. + */ + +static inline void __skb_queue_head(struct sk_buff_head *list, struct sk_buff *newsk) +{ + struct sk_buff *prev, *next; + + newsk->list = list; + list->qlen++; + prev = (struct sk_buff *)list; + next = prev->next; + newsk->next = next; + newsk->prev = prev; + next->prev = newsk; + prev->next = newsk; +} + + +/** + * skb_queue_head - queue a buffer at the list head + * @list: list to use + * @newsk: buffer to queue + * + * Queue a buffer at the start of the list. This function takes the + * list lock and can be used safely with other locking &sk_buff functions + * safely. + * + * A buffer cannot be placed on two lists at the same time. + */ + +static inline void skb_queue_head(struct sk_buff_head *list, struct sk_buff *newsk) +{ + unsigned long flags; + + spin_lock_irqsave(&list->lock, flags); + __skb_queue_head(list, newsk); + spin_unlock_irqrestore(&list->lock, flags); +} + +/** + * __skb_queue_tail - queue a buffer at the list tail + * @list: list to use + * @newsk: buffer to queue + * + * Queue a buffer at the end of a list. This function takes no locks + * and you must therefore hold required locks before calling it. + * + * A buffer cannot be placed on two lists at the same time. + */ + + +static inline void __skb_queue_tail(struct sk_buff_head *list, struct sk_buff *newsk) +{ + struct sk_buff *prev, *next; + + newsk->list = list; + list->qlen++; + next = (struct sk_buff *)list; + prev = next->prev; + newsk->next = next; + newsk->prev = prev; + next->prev = newsk; + prev->next = newsk; +} + +/** + * skb_queue_tail - queue a buffer at the list tail + * @list: list to use + * @newsk: buffer to queue + * + * Queue a buffer at the tail of the list. This function takes the + * list lock and can be used safely with other locking &sk_buff functions + * safely. + * + * A buffer cannot be placed on two lists at the same time. + */ + +static inline void skb_queue_tail(struct sk_buff_head *list, struct sk_buff *newsk) +{ + unsigned long flags; + + spin_lock_irqsave(&list->lock, flags); + __skb_queue_tail(list, newsk); + spin_unlock_irqrestore(&list->lock, flags); +} + +/** + * __skb_dequeue - remove from the head of the queue + * @list: list to dequeue from + * + * Remove the head of the list. This function does not take any locks + * so must be used with appropriate locks held only. The head item is + * returned or %NULL if the list is empty. 
+ */ + +static inline struct sk_buff *__skb_dequeue(struct sk_buff_head *list) +{ + struct sk_buff *next, *prev, *result; + + prev = (struct sk_buff *) list; + next = prev->next; + result = NULL; + if (next != prev) { + result = next; + next = next->next; + list->qlen--; + next->prev = prev; + prev->next = next; + result->next = NULL; + result->prev = NULL; + result->list = NULL; + } + return result; +} + +/** + * skb_dequeue - remove from the head of the queue + * @list: list to dequeue from + * + * Remove the head of the list. The list lock is taken so the function + * may be used safely with other locking list functions. The head item is + * returned or %NULL if the list is empty. + */ + +static inline struct sk_buff *skb_dequeue(struct sk_buff_head *list) +{ + unsigned long flags; + struct sk_buff *result; + + spin_lock_irqsave(&list->lock, flags); + result = __skb_dequeue(list); + spin_unlock_irqrestore(&list->lock, flags); + return result; +} + +/* + * Insert a packet on a list. + */ + +static inline void __skb_insert(struct sk_buff *newsk, + struct sk_buff * prev, struct sk_buff *next, + struct sk_buff_head * list) +{ + newsk->next = next; + newsk->prev = prev; + next->prev = newsk; + prev->next = newsk; + newsk->list = list; + list->qlen++; +} + +/** + * skb_insert - insert a buffer + * @old: buffer to insert before + * @newsk: buffer to insert + * + * Place a packet before a given packet in a list. The list locks are taken + * and this function is atomic with respect to other list locked calls + * A buffer cannot be placed on two lists at the same time. + */ + +static inline void skb_insert(struct sk_buff *old, struct sk_buff *newsk) +{ + unsigned long flags; + + spin_lock_irqsave(&old->list->lock, flags); + __skb_insert(newsk, old->prev, old, old->list); + spin_unlock_irqrestore(&old->list->lock, flags); +} + +/* + * Place a packet after a given packet in a list. + */ + +static inline void __skb_append(struct sk_buff *old, struct sk_buff *newsk) +{ + __skb_insert(newsk, old, old->next, old->list); +} + +/** + * skb_append - append a buffer + * @old: buffer to insert after + * @newsk: buffer to insert + * + * Place a packet after a given packet in a list. The list locks are taken + * and this function is atomic with respect to other list locked calls. + * A buffer cannot be placed on two lists at the same time. + */ + + +static inline void skb_append(struct sk_buff *old, struct sk_buff *newsk) +{ + unsigned long flags; + + spin_lock_irqsave(&old->list->lock, flags); + __skb_append(old, newsk); + spin_unlock_irqrestore(&old->list->lock, flags); +} + +/* + * remove sk_buff from list. _Must_ be called atomically, and with + * the list known.. + */ + +static inline void __skb_unlink(struct sk_buff *skb, struct sk_buff_head *list) +{ + struct sk_buff * next, * prev; + + list->qlen--; + next = skb->next; + prev = skb->prev; + skb->next = NULL; + skb->prev = NULL; + skb->list = NULL; + next->prev = prev; + prev->next = next; +} + +/** + * skb_unlink - remove a buffer from a list + * @skb: buffer to remove + * + * Place a packet after a given packet in a list. The list locks are taken + * and this function is atomic with respect to other list locked calls + * + * Works even without knowing the list it is sitting on, which can be + * handy at times. It also means that THE LIST MUST EXIST when you + * unlink. Thus a list must have its contents unlinked before it is + * destroyed. 
+ */ + +static inline void skb_unlink(struct sk_buff *skb) +{ + struct sk_buff_head *list = skb->list; + + if(list) { + unsigned long flags; + + spin_lock_irqsave(&list->lock, flags); + if(skb->list == list) + __skb_unlink(skb, skb->list); + spin_unlock_irqrestore(&list->lock, flags); + } +} + +/* XXX: more streamlined implementation */ + +/** + * __skb_dequeue_tail - remove from the tail of the queue + * @list: list to dequeue from + * + * Remove the tail of the list. This function does not take any locks + * so must be used with appropriate locks held only. The tail item is + * returned or %NULL if the list is empty. + */ + +static inline struct sk_buff *__skb_dequeue_tail(struct sk_buff_head *list) +{ + struct sk_buff *skb = skb_peek_tail(list); + if (skb) + __skb_unlink(skb, list); + return skb; +} + +/** + * skb_dequeue - remove from the head of the queue + * @list: list to dequeue from + * + * Remove the head of the list. The list lock is taken so the function + * may be used safely with other locking list functions. The tail item is + * returned or %NULL if the list is empty. + */ + +static inline struct sk_buff *skb_dequeue_tail(struct sk_buff_head *list) +{ + unsigned long flags; + struct sk_buff *result; + + spin_lock_irqsave(&list->lock, flags); + result = __skb_dequeue_tail(list); + spin_unlock_irqrestore(&list->lock, flags); + return result; +} + +static inline int skb_is_nonlinear(const struct sk_buff *skb) +{ + return skb->data_len; +} + +static inline int skb_headlen(const struct sk_buff *skb) +{ + return skb->len - skb->data_len; +} + +#define SKB_PAGE_ASSERT(skb) do { if (skb_shinfo(skb)->nr_frags) out_of_line_bug(); } while (0) +#define SKB_FRAG_ASSERT(skb) do { if (skb_shinfo(skb)->frag_list) out_of_line_bug(); } while (0) +#define SKB_LINEAR_ASSERT(skb) do { if (skb_is_nonlinear(skb)) out_of_line_bug(); } while (0) + +/* + * Add data to an sk_buff + */ + +static inline unsigned char *__skb_put(struct sk_buff *skb, unsigned int len) +{ + unsigned char *tmp=skb->tail; + SKB_LINEAR_ASSERT(skb); + skb->tail+=len; + skb->len+=len; + return tmp; +} + +/** + * skb_put - add data to a buffer + * @skb: buffer to use + * @len: amount of data to add + * + * This function extends the used data area of the buffer. If this would + * exceed the total buffer size the kernel will panic. A pointer to the + * first byte of the extra data is returned. + */ + +static inline unsigned char *skb_put(struct sk_buff *skb, unsigned int len) +{ +#if 0 + unsigned char *tmp=skb->tail; + SKB_LINEAR_ASSERT(skb); + skb->tail+=len; + skb->len+=len; + if(skb->tail>skb->end) { + skb_over_panic(skb, len, current_text_addr()); + } + return tmp; +#else +return NULL; +#endif +} + +static inline unsigned char *__skb_push(struct sk_buff *skb, unsigned int len) +{ + skb->data-=len; + skb->len+=len; + return skb->data; +} + +/** + * skb_push - add data to the start of a buffer + * @skb: buffer to use + * @len: amount of data to add + * + * This function extends the used data area of the buffer at the buffer + * start. If this would exceed the total buffer headroom the kernel will + * panic. A pointer to the first byte of the extra data is returned. 
+ */ + +static inline unsigned char *skb_push(struct sk_buff *skb, unsigned int len) +{ +#if 0 + skb->data-=len; + skb->len+=len; + if(skb->datahead) { + skb_under_panic(skb, len, current_text_addr()); + } + return skb->data; +#else + return NULL; +#endif +} + +static inline char *__skb_pull(struct sk_buff *skb, unsigned int len) +{ + skb->len-=len; + if (skb->len < skb->data_len) + out_of_line_bug(); + return skb->data+=len; +} + +/** + * skb_pull - remove data from the start of a buffer + * @skb: buffer to use + * @len: amount of data to remove + * + * This function removes data from the start of a buffer, returning + * the memory to the headroom. A pointer to the next data in the buffer + * is returned. Once the data has been pulled future pushes will overwrite + * the old data. + */ + +static inline unsigned char * skb_pull(struct sk_buff *skb, unsigned int len) +{ + if (len > skb->len) + return NULL; + return __skb_pull(skb,len); +} + +extern unsigned char * __pskb_pull_tail(struct sk_buff *skb, int delta); + +static inline char *__pskb_pull(struct sk_buff *skb, unsigned int len) +{ + if (len > skb_headlen(skb) && + __pskb_pull_tail(skb, len-skb_headlen(skb)) == NULL) + return NULL; + skb->len -= len; + return skb->data += len; +} + +static inline unsigned char * pskb_pull(struct sk_buff *skb, unsigned int len) +{ + if (len > skb->len) + return NULL; + return __pskb_pull(skb,len); +} + +static inline int pskb_may_pull(struct sk_buff *skb, unsigned int len) +{ + if (len <= skb_headlen(skb)) + return 1; + if (len > skb->len) + return 0; + return (__pskb_pull_tail(skb, len-skb_headlen(skb)) != NULL); +} + +/** + * skb_headroom - bytes at buffer head + * @skb: buffer to check + * + * Return the number of bytes of free space at the head of an &sk_buff. + */ + +static inline int skb_headroom(const struct sk_buff *skb) +{ + return skb->data-skb->head; +} + +/** + * skb_tailroom - bytes at buffer end + * @skb: buffer to check + * + * Return the number of bytes of free space at the tail of an sk_buff + */ + +static inline int skb_tailroom(const struct sk_buff *skb) +{ + return skb_is_nonlinear(skb) ? 0 : skb->end-skb->tail; +} + +/** + * skb_reserve - adjust headroom + * @skb: buffer to alter + * @len: bytes to move + * + * Increase the headroom of an empty &sk_buff by reducing the tail + * room. This is only allowed for an empty buffer. + */ + +static inline void skb_reserve(struct sk_buff *skb, unsigned int len) +{ + skb->data+=len; + skb->tail+=len; +} + +extern int ___pskb_trim(struct sk_buff *skb, unsigned int len, int realloc); + +static inline void __skb_trim(struct sk_buff *skb, unsigned int len) +{ + if (!skb->data_len) { + skb->len = len; + skb->tail = skb->data+len; + } else { + ___pskb_trim(skb, len, 0); + } +} + +/** + * skb_trim - remove end from a buffer + * @skb: buffer to alter + * @len: new length + * + * Cut the length of a buffer down by removing data from the tail. If + * the buffer is already under the length specified it is not modified. 
+ */ + +static inline void skb_trim(struct sk_buff *skb, unsigned int len) +{ + if (skb->len > len) { + __skb_trim(skb, len); + } +} + + +static inline int __pskb_trim(struct sk_buff *skb, unsigned int len) +{ + if (!skb->data_len) { + skb->len = len; + skb->tail = skb->data+len; + return 0; + } else { + return ___pskb_trim(skb, len, 1); + } +} + +static inline int pskb_trim(struct sk_buff *skb, unsigned int len) +{ + if (len < skb->len) + return __pskb_trim(skb, len); + return 0; +} + +/** + * skb_orphan - orphan a buffer + * @skb: buffer to orphan + * + * If a buffer currently has an owner then we call the owner's + * destructor function and make the @skb unowned. The buffer continues + * to exist but is no longer charged to its former owner. + */ + + +static inline void skb_orphan(struct sk_buff *skb) +{ + if (skb->destructor) + skb->destructor(skb); + skb->destructor = NULL; + skb->sk = NULL; +} + +/** + * skb_purge - empty a list + * @list: list to empty + * + * Delete all buffers on an &sk_buff list. Each buffer is removed from + * the list and one reference dropped. This function takes the list + * lock and is atomic with respect to other list locking functions. + */ + + +static inline void skb_queue_purge(struct sk_buff_head *list) +{ + struct sk_buff *skb; + while ((skb=skb_dequeue(list))!=NULL) + kfree_skb(skb); +} + +/** + * __skb_purge - empty a list + * @list: list to empty + * + * Delete all buffers on an &sk_buff list. Each buffer is removed from + * the list and one reference dropped. This function does not take the + * list lock and the caller must hold the relevant locks to use it. + */ + + +static inline void __skb_queue_purge(struct sk_buff_head *list) +{ + struct sk_buff *skb; + while ((skb=__skb_dequeue(list))!=NULL) + kfree_skb(skb); +} + +/** + * __dev_alloc_skb - allocate an skbuff for sending + * @length: length to allocate + * @gfp_mask: get_free_pages mask, passed to alloc_skb + * + * Allocate a new &sk_buff and assign it a usage count of one. The + * buffer has unspecified headroom built in. Users should allocate + * the headroom they think they need without accounting for the + * built in space. The built in space is used for optimisations. + * + * %NULL is returned in there is no free memory. + */ + +static inline struct sk_buff *__dev_alloc_skb(unsigned int length, + int gfp_mask) +{ + struct sk_buff *skb; + + skb = alloc_skb(length+16, gfp_mask); + if (skb) + skb_reserve(skb,16); + return skb; +} + +/** + * dev_alloc_skb - allocate an skbuff for sending + * @length: length to allocate + * + * Allocate a new &sk_buff and assign it a usage count of one. The + * buffer has unspecified headroom built in. Users should allocate + * the headroom they think they need without accounting for the + * built in space. The built in space is used for optimisations. + * + * %NULL is returned in there is no free memory. Although this function + * allocates memory it can be called from an interrupt. + */ + +static inline struct sk_buff *dev_alloc_skb(unsigned int length) +{ +#if 0 + return __dev_alloc_skb(length, GFP_ATOMIC); +#else + return NULL; +#endif +} + +/** + * skb_cow - copy header of skb when it is required + * @skb: buffer to cow + * @headroom: needed headroom + * + * If the skb passed lacks sufficient headroom or its data part + * is shared, data is reallocated. If reallocation fails, an error + * is returned and original skb is not changed. + * + * The result is skb with writable area skb->head...skb->tail + * and at least @headroom of space at head. 
+ */ + +static inline int +skb_cow(struct sk_buff *skb, unsigned int headroom) +{ +#if 0 + int delta = (headroom > 16 ? headroom : 16) - skb_headroom(skb); + + if (delta < 0) + delta = 0; + + if (delta || skb_cloned(skb)) + return pskb_expand_head(skb, (delta+15)&~15, 0, GFP_ATOMIC); + return 0; +#else + return 0; +#endif +} + +/** + * skb_linearize - convert paged skb to linear one + * @skb: buffer to linarize + * @gfp: allocation mode + * + * If there is no free memory -ENOMEM is returned, otherwise zero + * is returned and the old skb data released. */ +int skb_linearize(struct sk_buff *skb, int gfp); + +static inline void *kmap_skb_frag(const skb_frag_t *frag) +{ +#if 0 +#ifdef CONFIG_HIGHMEM + if (in_irq()) + out_of_line_bug(); + + local_bh_disable(); +#endif + return kmap_atomic(frag->page, KM_SKB_DATA_SOFTIRQ); +#else + return NULL; +#endif +} + +static inline void kunmap_skb_frag(void *vaddr) +{ +#if 0 + kunmap_atomic(vaddr, KM_SKB_DATA_SOFTIRQ); +#ifdef CONFIG_HIGHMEM + local_bh_enable(); +#endif +#endif +} + +#define skb_queue_walk(queue, skb) \ + for (skb = (queue)->next; \ + (skb != (struct sk_buff *)(queue)); \ + skb=skb->next) + + +extern struct sk_buff * skb_recv_datagram(struct sock *sk,unsigned flags,int noblock, int *err); +extern unsigned int datagram_poll(struct file *file, struct socket *sock, struct poll_table_struct *wait); +extern int skb_copy_datagram(const struct sk_buff *from, int offset, char *to,int size); +extern int skb_copy_datagram_iovec(const struct sk_buff *from, int offset, struct iovec *to,int size); +extern int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset, u8 *to, int len, unsigned int *csump); +extern int skb_copy_and_csum_datagram_iovec(const struct sk_buff *skb, int hlen, struct iovec *iov); +extern void skb_free_datagram(struct sock * sk, struct sk_buff *skb); + +extern unsigned int skb_checksum(const struct sk_buff *skb, int offset, int len, unsigned int csum); +extern int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len); +extern unsigned int skb_copy_and_csum_bits(const struct sk_buff *skb, int offset, u8 *to, int len, unsigned int csum); +extern void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to); + +extern void skb_init(void); +extern void skb_add_mtu(int mtu); + +#ifdef CONFIG_NETFILTER +static inline void +nf_conntrack_put(struct nf_ct_info *nfct) +{ + if (nfct && atomic_dec_and_test(&nfct->master->use)) + nfct->master->destroy(nfct->master); +} +static inline void +nf_conntrack_get(struct nf_ct_info *nfct) +{ + if (nfct) + atomic_inc(&nfct->master->use); +} +#endif + + +#endif /* skbuff */ + + + + + +struct sock; + +typedef struct sockaddr +{ + int x; +} _sockaddr; + + +struct msghdr { + void * msg_name; /* Socket name */ + int msg_namelen; /* Length of name */ + struct iovec * msg_iov; /* Data blocks */ + __kernel_size_t msg_iovlen; /* Number of blocks */ + void * msg_control; /* Per protocol magic (eg BSD file descriptor passing) */ + __kernel_size_t msg_controllen; /* Length of cmsg list */ + unsigned msg_flags; +}; + + +/* IP protocol blocks we attach to sockets. 
+ * socket layer -> transport layer interface + * transport -> network interface is defined by struct inet_proto + */ +struct proto { + void (*close)(struct sock *sk, + long timeout); + int (*connect)(struct sock *sk, + struct sockaddr *uaddr, + int addr_len); + int (*disconnect)(struct sock *sk, int flags); + + struct sock * (*accept) (struct sock *sk, int flags, int *err); + + int (*ioctl)(struct sock *sk, int cmd, + unsigned long arg); + int (*init)(struct sock *sk); + int (*destroy)(struct sock *sk); + void (*shutdown)(struct sock *sk, int how); + int (*setsockopt)(struct sock *sk, int level, + int optname, char *optval, int optlen); + int (*getsockopt)(struct sock *sk, int level, + int optname, char *optval, + int *option); + int (*sendmsg)(struct sock *sk, struct msghdr *msg, + int len); + int (*recvmsg)(struct sock *sk, struct msghdr *msg, + int len, int noblock, int flags, + int *addr_len); + int (*bind)(struct sock *sk, + struct sockaddr *uaddr, int addr_len); + + int (*backlog_rcv) (struct sock *sk, + struct sk_buff *skb); + + /* Keeping track of sk's, looking them up, and port selection methods. */ + void (*hash)(struct sock *sk); + void (*unhash)(struct sock *sk); + int (*get_port)(struct sock *sk, unsigned short snum); + + char name[32]; + + struct { + int inuse; + } stats[32]; +// u8 __pad[SMP_CACHE_BYTES - sizeof(int)]; +// } stats[NR_CPUS]; +}; + + + + + + + +/* This defines a selective acknowledgement block. */ +struct tcp_sack_block { + __u32 start_seq; + __u32 end_seq; +}; + + +struct tcp_opt { + int tcp_header_len; /* Bytes of tcp header to send */ + +/* + * Header prediction flags + * 0x5?10 << 16 + snd_wnd in net byte order + */ + __u32 pred_flags; + +/* + * RFC793 variables by their proper names. This means you can + * read the code and the spec side by side (and laugh ...) + * See RFC793 and RFC1122. The RFC writes these in capitals. + */ + __u32 rcv_nxt; /* What we want to receive next */ + __u32 snd_nxt; /* Next sequence we send */ + + __u32 snd_una; /* First byte we want an ack for */ + __u32 snd_sml; /* Last byte of the most recently transmitted small packet */ + __u32 rcv_tstamp; /* timestamp of last received ACK (for keepalives) */ + __u32 lsndtime; /* timestamp of last sent data packet (for restart window) */ + + /* Delayed ACK control data */ + struct { + __u8 pending; /* ACK is pending */ + __u8 quick; /* Scheduled number of quick acks */ + __u8 pingpong; /* The session is interactive */ + __u8 blocked; /* Delayed ACK was blocked by socket lock*/ + __u32 ato; /* Predicted tick of soft clock */ + unsigned long timeout; /* Currently scheduled timeout */ + __u32 lrcvtime; /* timestamp of last received data packet*/ + __u16 last_seg_size; /* Size of last incoming segment */ + __u16 rcv_mss; /* MSS used for delayed ACK decisions */ + } ack; + + /* Data for direct copy to user */ + struct { + //struct sk_buff_head prequeue; + struct task_struct *task; + struct iovec *iov; + int memory; + int len; + } ucopy; + + __u32 snd_wl1; /* Sequence for window update */ + __u32 snd_wnd; /* The window we expect to receive */ + __u32 max_window; /* Maximal window ever seen from peer */ + __u32 pmtu_cookie; /* Last pmtu seen by socket */ + __u16 mss_cache; /* Cached effective mss, not including SACKS */ + __u16 mss_clamp; /* Maximal mss, negotiated at connection setup */ + __u16 ext_header_len; /* Network protocol overhead (IP/IPv6 options) */ + __u8 ca_state; /* State of fast-retransmit machine */ + __u8 retransmits; /* Number of unrecovered RTO timeouts. 
*/ + + __u8 reordering; /* Packet reordering metric. */ + __u8 queue_shrunk; /* Write queue has been shrunk recently.*/ + __u8 defer_accept; /* User waits for some data after accept() */ + +/* RTT measurement */ + __u8 backoff; /* backoff */ + __u32 srtt; /* smothed round trip time << 3 */ + __u32 mdev; /* medium deviation */ + __u32 mdev_max; /* maximal mdev for the last rtt period */ + __u32 rttvar; /* smoothed mdev_max */ + __u32 rtt_seq; /* sequence number to update rttvar */ + __u32 rto; /* retransmit timeout */ + + __u32 packets_out; /* Packets which are "in flight" */ + __u32 left_out; /* Packets which leaved network */ + __u32 retrans_out; /* Retransmitted packets out */ + + +/* + * Slow start and congestion control (see also Nagle, and Karn & Partridge) + */ + __u32 snd_ssthresh; /* Slow start size threshold */ + __u32 snd_cwnd; /* Sending congestion window */ + __u16 snd_cwnd_cnt; /* Linear increase counter */ + __u16 snd_cwnd_clamp; /* Do not allow snd_cwnd to grow above this */ + __u32 snd_cwnd_used; + __u32 snd_cwnd_stamp; + + /* Two commonly used timers in both sender and receiver paths. */ + unsigned long timeout; + struct timer_list retransmit_timer; /* Resend (no ack) */ + struct timer_list delack_timer; /* Ack delay */ + + struct sk_buff_head out_of_order_queue; /* Out of order segments go here */ + + struct tcp_func *af_specific; /* Operations which are AF_INET{4,6} specific */ + struct sk_buff *send_head; /* Front of stuff to transmit */ + struct page *sndmsg_page; /* Cached page for sendmsg */ + u32 sndmsg_off; /* Cached offset for sendmsg */ + + __u32 rcv_wnd; /* Current receiver window */ + __u32 rcv_wup; /* rcv_nxt on last window update sent */ + __u32 write_seq; /* Tail(+1) of data held in tcp send buffer */ + __u32 pushed_seq; /* Last pushed seq, required to talk to windows */ + __u32 copied_seq; /* Head of yet unread data */ +/* + * Options received (usually on last packet, some only on SYN packets). + */ + char tstamp_ok, /* TIMESTAMP seen on SYN packet */ + wscale_ok, /* Wscale seen on SYN packet */ + sack_ok; /* SACK seen on SYN packet */ + char saw_tstamp; /* Saw TIMESTAMP on last packet */ + __u8 snd_wscale; /* Window scaling received from sender */ + __u8 rcv_wscale; /* Window scaling to send to receiver */ + __u8 nonagle; /* Disable Nagle algorithm? */ + __u8 keepalive_probes; /* num of allowed keep alive probes */ + +/* PAWS/RTTM data */ + __u32 rcv_tsval; /* Time stamp value */ + __u32 rcv_tsecr; /* Time stamp echo reply */ + __u32 ts_recent; /* Time stamp to echo next */ + long ts_recent_stamp;/* Time we stored ts_recent (for aging) */ + +/* SACKs data */ + __u16 user_mss; /* mss requested by user in ioctl */ + __u8 dsack; /* D-SACK is scheduled */ + __u8 eff_sacks; /* Size of SACK array to send with next packet */ + struct tcp_sack_block duplicate_sack[1]; /* D-SACK block */ + struct tcp_sack_block selective_acks[4]; /* The SACKS themselves*/ + + __u32 window_clamp; /* Maximal window to advertise */ + __u32 rcv_ssthresh; /* Current window clamp */ + __u8 probes_out; /* unanswered 0 window probes */ + __u8 num_sacks; /* Number of SACK blocks */ + __u16 advmss; /* Advertised MSS */ + + __u8 syn_retries; /* num of allowed syn retries */ + __u8 ecn_flags; /* ECN status bits. 
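+ *
+ * (Aside on the RTT fields above: srtt is kept left-shifted by 3 so
+ * that the classic smoothed estimator reduces to shifts and adds.
+ * Roughly, per new measurement m, the usual scheme is
+ *
+ *	err = m - (tp->srtt >> 3);
+ *	tp->srtt += err;                          srtt <- 7/8*srtt + 1/8*m
+ *	tp->rto  = (tp->srtt >> 3) + tp->rttvar;
+ *
+ * see the estimator in tcp_input.c for the real bookkeeping, including
+ * mdev/mdev_max and the RTO clamping.)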
*/ + __u16 prior_ssthresh; /* ssthresh saved at recovery start */ + __u32 lost_out; /* Lost packets */ + __u32 sacked_out; /* SACK'd packets */ + __u32 fackets_out; /* FACK'd packets */ + __u32 high_seq; /* snd_nxt at onset of congestion */ + + __u32 retrans_stamp; /* Timestamp of the last retransmit, + * also used in SYN-SENT to remember stamp of + * the first SYN. */ + __u32 undo_marker; /* tracking retrans started here. */ + int undo_retrans; /* number of undoable retransmissions. */ + __u32 urg_seq; /* Seq of received urgent pointer */ + __u16 urg_data; /* Saved octet of OOB data and control flags */ + __u8 pending; /* Scheduled timer event */ + __u8 urg_mode; /* In urgent mode */ + __u32 snd_up; /* Urgent pointer */ + + /* The syn_wait_lock is necessary only to avoid tcp_get_info having + * to grab the main lock sock while browsing the listening hash + * (otherwise it's deadlock prone). + * This lock is acquired in read mode only from tcp_get_info() and + * it's acquired in write mode _only_ from code that is actively + * changing the syn_wait_queue. All readers that are holding + * the master sock lock don't need to grab this lock in read mode + * too as the syn_wait_queue writes are always protected from + * the main sock lock. + */ + rwlock_t syn_wait_lock; + struct tcp_listen_opt *listen_opt; + + /* FIFO of established children */ + struct open_request *accept_queue; + struct open_request *accept_queue_tail; + + int write_pending; /* A write to socket waits to start. */ + + unsigned int keepalive_time; /* time before keep alive takes place */ + unsigned int keepalive_intvl; /* time interval between keep alive probes */ + int linger2; + + unsigned long last_synq_overflow; +}; + + + + +/* This is the per-socket lock. The spinlock provides a synchronization + * between user contexts and software interrupt processing, whereas the + * mini-semaphore synchronizes multiple users amongst themselves. + */ +typedef struct { + spinlock_t slock; + unsigned int users; + wait_queue_head_t wq; +} socket_lock_t; + +struct sock { + /* Socket demultiplex comparisons on incoming packets. */ + __u32 daddr; /* Foreign IPv4 addr */ + __u32 rcv_saddr; /* Bound local IPv4 addr */ + __u16 dport; /* Destination port */ + unsigned short num; /* Local port */ + int bound_dev_if; /* Bound device index if != 0 */ + + /* Main hash linkage for various protocol lookup tables. */ + struct sock *next; + struct sock **pprev; + struct sock *bind_next; + struct sock **bind_pprev; + + volatile unsigned char state, /* Connection state */ + zapped; /* In ax25 & ipx means not linked */ + __u16 sport; /* Source port */ + + unsigned short family; /* Address family */ + unsigned char reuse; /* SO_REUSEADDR setting */ + unsigned char shutdown; + atomic_t refcnt; /* Reference count */ + + socket_lock_t lock; /* Synchronizer... */ + int rcvbuf; /* Size of receive buffer in bytes */ + + wait_queue_head_t *sleep; /* Sock wait queue */ + struct dst_entry *dst_cache; /* Destination cache */ + rwlock_t dst_lock; + atomic_t rmem_alloc; /* Receive queue bytes committed */ + struct sk_buff_head receive_queue; /* Incoming packets */ + atomic_t wmem_alloc; /* Transmit queue bytes committed */ + struct sk_buff_head write_queue; /* Packet sending queue */ + atomic_t omem_alloc; /* "o" is "option" or "other" */ + int wmem_queued; /* Persistent queue size */ + int forward_alloc; /* Space allocated forward. 
*/ + __u32 saddr; /* Sending source */ + unsigned int allocation; /* Allocation mode */ + int sndbuf; /* Size of send buffer in bytes */ + struct sock *prev; + + /* Not all are volatile, but some are, so we might as well say they all are. + * XXX Make this a flag word -DaveM + */ + volatile char dead, + done, + urginline, + keepopen, + linger, + destroy, + no_check, + broadcast, + bsdism; + unsigned char debug; + unsigned char rcvtstamp; + unsigned char use_write_queue; + unsigned char userlocks; + /* Hole of 3 bytes. Try to pack. */ + int route_caps; + int proc; + unsigned long lingertime; + + int hashent; + struct sock *pair; + + /* The backlog queue is special, it is always used with + * the per-socket spinlock held and requires low latency + * access. Therefore we special case it's implementation. + */ + struct { + struct sk_buff *head; + struct sk_buff *tail; + } backlog; + + rwlock_t callback_lock; + + /* Error queue, rarely used. */ + struct sk_buff_head error_queue; + + struct proto *prot; + +#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) + union { + struct ipv6_pinfo af_inet6; + } net_pinfo; +#endif + + union { + struct tcp_opt af_tcp; +#if defined(CONFIG_INET) || defined (CONFIG_INET_MODULE) + struct raw_opt tp_raw4; +#endif +#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) + struct raw6_opt tp_raw; +#endif /* CONFIG_IPV6 */ +#if defined(CONFIG_SPX) || defined (CONFIG_SPX_MODULE) + struct spx_opt af_spx; +#endif /* CONFIG_SPX */ + + } tp_pinfo; + + int err, err_soft; /* Soft holds errors that don't + cause failure but are the cause + of a persistent failure not just + 'timed out' */ + unsigned short ack_backlog; + unsigned short max_ack_backlog; + __u32 priority; + unsigned short type; + unsigned char localroute; /* Route locally only */ + unsigned char protocol; +// struct ucred peercred; + int rcvlowat; + long rcvtimeo; + long sndtimeo; + +#ifdef CONFIG_FILTER + /* Socket Filtering Instructions */ + struct sk_filter *filter; +#endif /* CONFIG_FILTER */ + + /* This is where all the private (optional) areas that don't + * overlap will eventually live. 
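+ *
+ * With this layout the per-protocol state lives inside the socket
+ * itself rather than behind a separate allocation; the TCP code, for
+ * instance, reaches its control block simply as (sketch)
+ *
+ *	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
+ *
+ * and which member of the union below is meaningful follows from
+ * sk->family and sk->protocol, fixed when the socket is created.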
+ */ + union { + void *destruct_hook; +// struct unix_opt af_unix; +#if defined(CONFIG_INET) || defined (CONFIG_INET_MODULE) + struct inet_opt af_inet; +#endif +#if defined(CONFIG_ATALK) || defined(CONFIG_ATALK_MODULE) + struct atalk_sock af_at; +#endif +#if defined(CONFIG_IPX) || defined(CONFIG_IPX_MODULE) + struct ipx_opt af_ipx; +#endif +#if defined (CONFIG_DECNET) || defined(CONFIG_DECNET_MODULE) + struct dn_scp dn; +#endif +#if defined (CONFIG_PACKET) || defined(CONFIG_PACKET_MODULE) + struct packet_opt *af_packet; +#endif +#if defined(CONFIG_X25) || defined(CONFIG_X25_MODULE) + x25_cb *x25; +#endif +#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE) + ax25_cb *ax25; +#endif +#if defined(CONFIG_NETROM) || defined(CONFIG_NETROM_MODULE) + nr_cb *nr; +#endif +#if defined(CONFIG_ROSE) || defined(CONFIG_ROSE_MODULE) + rose_cb *rose; +#endif +#if defined(CONFIG_PPPOE) || defined(CONFIG_PPPOE_MODULE) + struct pppox_opt *pppox; +#endif + struct netlink_opt *af_netlink; +#if defined(CONFIG_ECONET) || defined(CONFIG_ECONET_MODULE) + struct econet_opt *af_econet; +#endif +#if defined(CONFIG_ATM) || defined(CONFIG_ATM_MODULE) + struct atm_vcc *af_atm; +#endif +#if defined(CONFIG_IRDA) || defined(CONFIG_IRDA_MODULE) + struct irda_sock *irda; +#endif +#if defined(CONFIG_WAN_ROUTER) || defined(CONFIG_WAN_ROUTER_MODULE) + struct wanpipe_opt *af_wanpipe; +#endif + } protinfo; + + + /* This part is used for the timeout functions. */ + struct timer_list timer; /* This is the sock cleanup timer. */ + struct timeval stamp; + + /* Identd and reporting IO signals */ + struct socket *socket; + + /* RPC layer private data */ + void *user_data; + + /* Callbacks */ + void (*state_change)(struct sock *sk); + void (*data_ready)(struct sock *sk,int bytes); + void (*write_space)(struct sock *sk); + void (*error_report)(struct sock *sk); + + int (*backlog_rcv) (struct sock *sk, + struct sk_buff *skb); + void (*destruct)(struct sock *sk); +}; + + + + +#if 1 /* dst (_NET_DST_H) */ + +#if 0 +#include +#include +#endif + +/* + * 0 - no debugging messages + * 1 - rare events and bugs (default) + * 2 - trace mode. 
+ */ +#define RT_CACHE_DEBUG 0 + +#define DST_GC_MIN (1*HZ) +#define DST_GC_INC (5*HZ) +#define DST_GC_MAX (120*HZ) + +struct sk_buff; + +struct dst_entry +{ + struct dst_entry *next; + atomic_t __refcnt; /* client references */ + int __use; + struct net_device *dev; + int obsolete; + int flags; +#define DST_HOST 1 + unsigned long lastuse; + unsigned long expires; + + unsigned mxlock; + unsigned pmtu; + unsigned window; + unsigned rtt; + unsigned rttvar; + unsigned ssthresh; + unsigned cwnd; + unsigned advmss; + unsigned reordering; + + unsigned long rate_last; /* rate limiting for ICMP */ + unsigned long rate_tokens; + + int error; + + struct neighbour *neighbour; + struct hh_cache *hh; + + int (*input)(struct sk_buff*); + int (*output)(struct sk_buff*); + +#ifdef CONFIG_NET_CLS_ROUTE + __u32 tclassid; +#endif + + struct dst_ops *ops; + + char info[0]; +}; + + +struct dst_ops +{ + unsigned short family; + unsigned short protocol; + unsigned gc_thresh; + + int (*gc)(void); + struct dst_entry * (*check)(struct dst_entry *, __u32 cookie); + struct dst_entry * (*reroute)(struct dst_entry *, + struct sk_buff *); + void (*destroy)(struct dst_entry *); + struct dst_entry * (*negative_advice)(struct dst_entry *); + void (*link_failure)(struct sk_buff *); + int entry_size; + + atomic_t entries; + kmem_cache_t *kmem_cachep; +}; + +#ifdef __KERNEL__ + +static inline void dst_hold(struct dst_entry * dst) +{ + atomic_inc(&dst->__refcnt); +} + +static inline +struct dst_entry * dst_clone(struct dst_entry * dst) +{ + if (dst) + atomic_inc(&dst->__refcnt); + return dst; +} + +static inline +void dst_release(struct dst_entry * dst) +{ + if (dst) + atomic_dec(&dst->__refcnt); +} + +extern void * dst_alloc(struct dst_ops * ops); +extern void __dst_free(struct dst_entry * dst); +extern void dst_destroy(struct dst_entry * dst); + +static inline +void dst_free(struct dst_entry * dst) +{ + if (dst->obsolete > 1) + return; + if (!atomic_read(&dst->__refcnt)) { + dst_destroy(dst); + return; + } + __dst_free(dst); +} + +static inline void dst_confirm(struct dst_entry *dst) +{ + if (dst) + neigh_confirm(dst->neighbour); +} + +static inline void dst_negative_advice(struct dst_entry **dst_p) +{ + struct dst_entry * dst = *dst_p; + if (dst && dst->ops->negative_advice) + *dst_p = dst->ops->negative_advice(dst); +} + +static inline void dst_link_failure(struct sk_buff *skb) +{ + struct dst_entry * dst = skb->dst; + if (dst && dst->ops && dst->ops->link_failure) + dst->ops->link_failure(skb); +} + +static inline void dst_set_expires(struct dst_entry *dst, int timeout) +{ + unsigned long expires = jiffies + timeout; + + if (expires == 0) + expires = 1; + + if (dst->expires == 0 || (long)(dst->expires - expires) > 0) + dst->expires = expires; +} + +extern void dst_init(void); + +#endif /* dst */ + + + +#if 1 +/* dummy types */ + + +#endif + +#define TCP_DEBUG 1 +#define FASTRETRANS_DEBUG 1 + +/* Cancel timers, when they are not required. */ +#undef TCP_CLEAR_TIMERS + +#if 0 +#include +#include +#include +#include +#include +#include +#else +#include "linux.h" +#endif + +/* This is for all connections with a full identity, no wildcards. + * New scheme, half the table is for TIME_WAIT, the other half is + * for the rest. I'll experiment with dynamic table growth later. + */ +struct tcp_ehash_bucket { + rwlock_t lock; + struct sock *chain; +} __attribute__((__aligned__(8))); + +/* This is for listening sockets, thus all sockets which possess wildcards. */ +#define TCP_LHTABLE_SIZE 32 /* Yes, really, this is all you need. 
*/ + +/* There are a few simple rules, which allow for local port reuse by + * an application. In essence: + * + * 1) Sockets bound to different interfaces may share a local port. + * Failing that, goto test 2. + * 2) If all sockets have sk->reuse set, and none of them are in + * TCP_LISTEN state, the port may be shared. + * Failing that, goto test 3. + * 3) If all sockets are bound to a specific sk->rcv_saddr local + * address, and none of them are the same, the port may be + * shared. + * Failing this, the port cannot be shared. + * + * The interesting point, is test #2. This is what an FTP server does + * all day. To optimize this case we use a specific flag bit defined + * below. As we add sockets to a bind bucket list, we perform a + * check of: (newsk->reuse && (newsk->state != TCP_LISTEN)) + * As long as all sockets added to a bind bucket pass this test, + * the flag bit will be set. + * The resulting situation is that tcp_v[46]_verify_bind() can just check + * for this flag bit, if it is set and the socket trying to bind has + * sk->reuse set, we don't even have to walk the owners list at all, + * we return that it is ok to bind this socket to the requested local port. + * + * Sounds like a lot of work, but it is worth it. In a more naive + * implementation (ie. current FreeBSD etc.) the entire list of ports + * must be walked for each data port opened by an ftp server. Needless + * to say, this does not scale at all. With a couple thousand FTP + * users logged onto your box, isn't it nice to know that new data + * ports are created in O(1) time? I thought so. ;-) -DaveM + */ +struct tcp_bind_bucket { + unsigned short port; + signed short fastreuse; + struct tcp_bind_bucket *next; + struct sock *owners; + struct tcp_bind_bucket **pprev; +}; + +struct tcp_bind_hashbucket { + spinlock_t lock; + struct tcp_bind_bucket *chain; +}; + +extern struct tcp_hashinfo { + /* This is for sockets with full identity only. Sockets here will + * always be without wildcards and will have the following invariant: + * + * TCP_ESTABLISHED <= sk->state < TCP_CLOSE + * + * First half of the table is for sockets not in TIME_WAIT, second half + * is for TIME_WAIT sockets only. + */ + struct tcp_ehash_bucket *__tcp_ehash; + + /* Ok, let's try this, I give up, we do need a local binding + * TCP hash as well as the others for fast bind/connect. + */ + struct tcp_bind_hashbucket *__tcp_bhash; + + int __tcp_bhash_size; + int __tcp_ehash_size; + + /* All sockets in TCP_LISTEN state will be in here. This is the only + * table where wildcard'd TCP sockets can exist. Hash function here + * is just local port number. + */ + struct sock *__tcp_listening_hash[TCP_LHTABLE_SIZE]; + + /* All the above members are written once at bootup and + * never written again _or_ are predominantly read-access. + * + * Now align to a new cache line as all the following members + * are often dirty. 
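+ *
+ * (The fastreuse shortcut described at length above amounts to
+ * maintaining, per bind bucket, the running AND of
+ *
+ *	sk->reuse && sk->state != TCP_LISTEN
+ *
+ * over every owner added to tb->owners; as long as tb->fastreuse stays
+ * non-zero, a new SO_REUSEADDR bind to that port is granted without
+ * walking the owner list at all.)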
+ */ + rwlock_t __tcp_lhash_lock ____cacheline_aligned; + atomic_t __tcp_lhash_users; + wait_queue_head_t __tcp_lhash_wait; + spinlock_t __tcp_portalloc_lock; +} tcp_hashinfo; + +#define tcp_ehash (tcp_hashinfo.__tcp_ehash) +#define tcp_bhash (tcp_hashinfo.__tcp_bhash) +#define tcp_ehash_size (tcp_hashinfo.__tcp_ehash_size) +#define tcp_bhash_size (tcp_hashinfo.__tcp_bhash_size) +#define tcp_listening_hash (tcp_hashinfo.__tcp_listening_hash) +#define tcp_lhash_lock (tcp_hashinfo.__tcp_lhash_lock) +#define tcp_lhash_users (tcp_hashinfo.__tcp_lhash_users) +#define tcp_lhash_wait (tcp_hashinfo.__tcp_lhash_wait) +#define tcp_portalloc_lock (tcp_hashinfo.__tcp_portalloc_lock) + +extern kmem_cache_t *tcp_bucket_cachep; +extern struct tcp_bind_bucket *tcp_bucket_create(struct tcp_bind_hashbucket *head, + unsigned short snum); +extern void tcp_bucket_unlock(struct sock *sk); +extern int tcp_port_rover; +extern struct sock *tcp_v4_lookup_listener(u32 addr, unsigned short hnum, int dif); + +/* These are AF independent. */ +static __inline__ int tcp_bhashfn(__u16 lport) +{ + return (lport & (tcp_bhash_size - 1)); +} + +/* This is a TIME_WAIT bucket. It works around the memory consumption + * problems of sockets in such a state on heavily loaded servers, but + * without violating the protocol specification. + */ +struct tcp_tw_bucket { + /* These _must_ match the beginning of struct sock precisely. + * XXX Yes I know this is gross, but I'd have to edit every single + * XXX networking file if I created a "struct sock_header". -DaveM + */ + __u32 daddr; + __u32 rcv_saddr; + __u16 dport; + unsigned short num; + int bound_dev_if; + struct sock *next; + struct sock **pprev; + struct sock *bind_next; + struct sock **bind_pprev; + unsigned char state, + substate; /* "zapped" is replaced with "substate" */ + __u16 sport; + unsigned short family; + unsigned char reuse, + rcv_wscale; /* It is also TW bucket specific */ + atomic_t refcnt; + + /* And these are ours. */ + int hashent; + int timeout; + __u32 rcv_nxt; + __u32 snd_nxt; + __u32 rcv_wnd; + __u32 ts_recent; + long ts_recent_stamp; + unsigned long ttd; + struct tcp_bind_bucket *tb; + struct tcp_tw_bucket *next_death; + struct tcp_tw_bucket **pprev_death; + +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + struct in6_addr v6_daddr; + struct in6_addr v6_rcv_saddr; +#endif +}; + +extern kmem_cache_t *tcp_timewait_cachep; + +static inline void tcp_tw_put(struct tcp_tw_bucket *tw) +{ + if (atomic_dec_and_test(&tw->refcnt)) { +#ifdef INET_REFCNT_DEBUG + printk(KERN_DEBUG "tw_bucket %p released\n", tw); +#endif + kmem_cache_free(tcp_timewait_cachep, tw); + } +} + +extern atomic_t tcp_orphan_count; +extern int tcp_tw_count; +extern void tcp_time_wait(struct sock *sk, int state, int timeo); +extern void tcp_timewait_kill(struct tcp_tw_bucket *tw); +extern void tcp_tw_schedule(struct tcp_tw_bucket *tw, int timeo); +extern void tcp_tw_deschedule(struct tcp_tw_bucket *tw); + + +/* Socket demux engine toys. 
*/ +#ifdef __BIG_ENDIAN +#define TCP_COMBINED_PORTS(__sport, __dport) \ + (((__u32)(__sport)<<16) | (__u32)(__dport)) +#else /* __LITTLE_ENDIAN */ +#define TCP_COMBINED_PORTS(__sport, __dport) \ + (((__u32)(__dport)<<16) | (__u32)(__sport)) +#endif + +#if (BITS_PER_LONG == 64) +#ifdef __BIG_ENDIAN +#define TCP_V4_ADDR_COOKIE(__name, __saddr, __daddr) \ + __u64 __name = (((__u64)(__saddr))<<32)|((__u64)(__daddr)); +#else /* __LITTLE_ENDIAN */ +#define TCP_V4_ADDR_COOKIE(__name, __saddr, __daddr) \ + __u64 __name = (((__u64)(__daddr))<<32)|((__u64)(__saddr)); +#endif /* __BIG_ENDIAN */ +#define TCP_IPV4_MATCH(__sk, __cookie, __saddr, __daddr, __ports, __dif)\ + (((*((__u64 *)&((__sk)->daddr)))== (__cookie)) && \ + ((*((__u32 *)&((__sk)->dport)))== (__ports)) && \ + (!((__sk)->bound_dev_if) || ((__sk)->bound_dev_if == (__dif)))) +#else /* 32-bit arch */ +#define TCP_V4_ADDR_COOKIE(__name, __saddr, __daddr) +#define TCP_IPV4_MATCH(__sk, __cookie, __saddr, __daddr, __ports, __dif)\ + (((__sk)->daddr == (__saddr)) && \ + ((__sk)->rcv_saddr == (__daddr)) && \ + ((*((__u32 *)&((__sk)->dport)))== (__ports)) && \ + (!((__sk)->bound_dev_if) || ((__sk)->bound_dev_if == (__dif)))) +#endif /* 64-bit arch */ + +#define TCP_IPV6_MATCH(__sk, __saddr, __daddr, __ports, __dif) \ + (((*((__u32 *)&((__sk)->dport)))== (__ports)) && \ + ((__sk)->family == AF_INET6) && \ + !ipv6_addr_cmp(&(__sk)->net_pinfo.af_inet6.daddr, (__saddr)) && \ + !ipv6_addr_cmp(&(__sk)->net_pinfo.af_inet6.rcv_saddr, (__daddr)) && \ + (!((__sk)->bound_dev_if) || ((__sk)->bound_dev_if == (__dif)))) + +/* These can have wildcards, don't try too hard. */ +static __inline__ int tcp_lhashfn(unsigned short num) +{ +#if 0 + return num & (TCP_LHTABLE_SIZE - 1); +#else + return 0; +#endif +} + +static __inline__ int tcp_sk_listen_hashfn(struct sock *sk) +{ +#if 0 + return tcp_lhashfn(sk->num); +#else + return 0; +#endif +} + +#define MAX_TCP_HEADER (128 + MAX_HEADER) + +/* + * Never offer a window over 32767 without using window scaling. Some + * poor stacks do signed 16bit maths! + */ +#define MAX_TCP_WINDOW 32767U + +/* Minimal accepted MSS. It is (60+60+8) - (20+20). */ +#define TCP_MIN_MSS 88U + +/* Minimal RCV_MSS. */ +#define TCP_MIN_RCVMSS 536U + +/* After receiving this amount of duplicate ACKs fast retransmit starts. */ +#define TCP_FASTRETRANS_THRESH 3 + +/* Maximal reordering. */ +#define TCP_MAX_REORDERING 127 + +/* Maximal number of ACKs sent quickly to accelerate slow-start. */ +#define TCP_MAX_QUICKACKS 16U + +/* urg_data states */ +#define TCP_URG_VALID 0x0100 +#define TCP_URG_NOTYET 0x0200 +#define TCP_URG_READ 0x0400 + +#define TCP_RETR1 3 /* + * This is how many retries it does before it + * tries to figure out if the gateway is + * down. Minimal RFC value is 3; it corresponds + * to ~3sec-8min depending on RTO. + */ + +#define TCP_RETR2 15 /* + * This should take at least + * 90 minutes to time out. + * RFC1122 says that the limit is 100 sec. + * 15 is ~13-30min depending on RTO. + */ + +#define TCP_SYN_RETRIES 5 /* number of times to retry active opening a + * connection: ~180sec is RFC minumum */ + +#define TCP_SYNACK_RETRIES 5 /* number of times to retry passive opening a + * connection: ~180sec is RFC minumum */ + + +#define TCP_ORPHAN_RETRIES 7 /* number of times to retry on an orphaned + * socket. 7 is ~50sec-16min. 
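+ *
+ * (Aside on the demux macros above, roughly how the established-hash
+ * lookup uses them; saddr/sport come from the incoming segment, hnum
+ * and dif are the local port and bound device index:
+ *
+ *	TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
+ *	__u32 ports = TCP_COMBINED_PORTS(sport, hnum);
+ *
+ *	for (sk = head->chain; sk; sk = sk->next)
+ *		if (TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
+ *			goto hit;
+ *
+ * on 64-bit builds the address pair collapses into the single cookie
+ * compare, on 32-bit it stays two separate compares.)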
+ */ + + +#define TCP_TIMEWAIT_LEN (60*1000) +//#define TCP_TIMEWAIT_LEN (60*HZ) +/* how long to wait to destroy TIME-WAIT + * state, about 60 seconds */ +#define TCP_FIN_TIMEOUT TCP_TIMEWAIT_LEN + /* BSD style FIN_WAIT2 deadlock breaker. + * It used to be 3min, new value is 60sec, + * to combine FIN-WAIT-2 timeout with + * TIME-WAIT timer. + */ + +#define TCP_DELACK_MAX ((unsigned)(HZ/5)) /* maximal time to delay before sending an ACK */ +#if HZ >= 100 +#define TCP_DELACK_MIN ((unsigned)(HZ/25)) /* minimal time to delay before sending an ACK */ +#define TCP_ATO_MIN ((unsigned)(HZ/25)) +#else +#define TCP_DELACK_MIN 4U +#define TCP_ATO_MIN 4U +#endif +#define TCP_RTO_MAX ((unsigned)(120*HZ)) +#define TCP_RTO_MIN ((unsigned)(HZ/5)) +#define TCP_TIMEOUT_INIT ((unsigned)(3*HZ)) /* RFC 1122 initial RTO value */ + +#define TCP_RESOURCE_PROBE_INTERVAL ((unsigned)(HZ/2U)) /* Maximal interval between probes + * for local resources. + */ + +#define TCP_KEEPALIVE_TIME (120*60*HZ) /* two hours */ +#define TCP_KEEPALIVE_PROBES 9 /* Max of 9 keepalive probes */ +#define TCP_KEEPALIVE_INTVL (75*HZ) + +#define MAX_TCP_KEEPIDLE 32767 +#define MAX_TCP_KEEPINTVL 32767 +#define MAX_TCP_KEEPCNT 127 +#define MAX_TCP_SYNCNT 127 + +/* TIME_WAIT reaping mechanism. */ +#define TCP_TWKILL_SLOTS 8 /* Please keep this a power of 2. */ +#define TCP_TWKILL_PERIOD (TCP_TIMEWAIT_LEN/TCP_TWKILL_SLOTS) + +#define TCP_SYNQ_INTERVAL (HZ/5) /* Period of SYNACK timer */ +#define TCP_SYNQ_HSIZE 512 /* Size of SYNACK hash table */ + +#define TCP_PAWS_24DAYS (60 * 60 * 24 * 24) +#define TCP_PAWS_MSL 60 /* Per-host timestamps are invalidated + * after this time. It should be equal + * (or greater than) TCP_TIMEWAIT_LEN + * to provide reliability equal to one + * provided by timewait state. + */ +#define TCP_PAWS_WINDOW 1 /* Replay window for per-host + * timestamps. It must be less than + * minimal timewait lifetime. + */ + +#define TCP_TW_RECYCLE_SLOTS_LOG 5 +#define TCP_TW_RECYCLE_SLOTS (1< 4sec, it is "slow" path, no recycling is required, + so that we select tick to get range about 4 seconds. + */ + +#if 0 +#if HZ <= 16 || HZ > 4096 +# error Unsupported: HZ <= 16 or HZ > 4096 +#elif HZ <= 32 +# define TCP_TW_RECYCLE_TICK (5+2-TCP_TW_RECYCLE_SLOTS_LOG) +#elif HZ <= 64 +# define TCP_TW_RECYCLE_TICK (6+2-TCP_TW_RECYCLE_SLOTS_LOG) +#elif HZ <= 128 +# define TCP_TW_RECYCLE_TICK (7+2-TCP_TW_RECYCLE_SLOTS_LOG) +#elif HZ <= 256 +# define TCP_TW_RECYCLE_TICK (8+2-TCP_TW_RECYCLE_SLOTS_LOG) +#elif HZ <= 512 +# define TCP_TW_RECYCLE_TICK (9+2-TCP_TW_RECYCLE_SLOTS_LOG) +#elif HZ <= 1024 +# define TCP_TW_RECYCLE_TICK (10+2-TCP_TW_RECYCLE_SLOTS_LOG) +#elif HZ <= 2048 +# define TCP_TW_RECYCLE_TICK (11+2-TCP_TW_RECYCLE_SLOTS_LOG) +#else +# define TCP_TW_RECYCLE_TICK (12+2-TCP_TW_RECYCLE_SLOTS_LOG) +#endif +#else +#define TCP_TW_RECYCLE_TICK (0) +#endif + +/* + * TCP option + */ + +#define TCPOPT_NOP 1 /* Padding */ +#define TCPOPT_EOL 0 /* End of options */ +#define TCPOPT_MSS 2 /* Segment size negotiating */ +#define TCPOPT_WINDOW 3 /* Window scaling */ +#define TCPOPT_SACK_PERM 4 /* SACK Permitted */ +#define TCPOPT_SACK 5 /* SACK Block */ +#define TCPOPT_TIMESTAMP 8 /* Better RTT estimations/PAWS */ + +/* + * TCP option lengths + */ + +#define TCPOLEN_MSS 4 +#define TCPOLEN_WINDOW 3 +#define TCPOLEN_SACK_PERM 2 +#define TCPOLEN_TIMESTAMP 10 + +/* But this is what stacks really send out. 
*/ +#define TCPOLEN_TSTAMP_ALIGNED 12 +#define TCPOLEN_WSCALE_ALIGNED 4 +#define TCPOLEN_SACKPERM_ALIGNED 4 +#define TCPOLEN_SACK_BASE 2 +#define TCPOLEN_SACK_BASE_ALIGNED 4 +#define TCPOLEN_SACK_PERBLOCK 8 + +#define TCP_TIME_RETRANS 1 /* Retransmit timer */ +#define TCP_TIME_DACK 2 /* Delayed ack timer */ +#define TCP_TIME_PROBE0 3 /* Zero window probe timer */ +#define TCP_TIME_KEEPOPEN 4 /* Keepalive timer */ + +#if 0 +/* sysctl variables for tcp */ +extern int sysctl_max_syn_backlog; +extern int sysctl_tcp_timestamps; +extern int sysctl_tcp_window_scaling; +extern int sysctl_tcp_sack; +extern int sysctl_tcp_fin_timeout; +extern int sysctl_tcp_tw_recycle; +extern int sysctl_tcp_keepalive_time; +extern int sysctl_tcp_keepalive_probes; +extern int sysctl_tcp_keepalive_intvl; +extern int sysctl_tcp_syn_retries; +extern int sysctl_tcp_synack_retries; +extern int sysctl_tcp_retries1; +extern int sysctl_tcp_retries2; +extern int sysctl_tcp_orphan_retries; +extern int sysctl_tcp_syncookies; +extern int sysctl_tcp_retrans_collapse; +extern int sysctl_tcp_stdurg; +extern int sysctl_tcp_rfc1337; +extern int sysctl_tcp_abort_on_overflow; +extern int sysctl_tcp_max_orphans; +extern int sysctl_tcp_max_tw_buckets; +extern int sysctl_tcp_fack; +extern int sysctl_tcp_reordering; +extern int sysctl_tcp_ecn; +extern int sysctl_tcp_dsack; +extern int sysctl_tcp_mem[3]; +extern int sysctl_tcp_wmem[3]; +extern int sysctl_tcp_rmem[3]; +extern int sysctl_tcp_app_win; +extern int sysctl_tcp_adv_win_scale; +extern int sysctl_tcp_tw_reuse; +#endif + +extern atomic_t tcp_memory_allocated; +extern atomic_t tcp_sockets_allocated; +extern int tcp_memory_pressure; + +struct open_request; + +struct or_calltable { + int family; + int (*rtx_syn_ack) (struct sock *sk, struct open_request *req, struct dst_entry*); + void (*send_ack) (struct sk_buff *skb, struct open_request *req); + void (*destructor) (struct open_request *req); + void (*send_reset) (struct sk_buff *skb); +}; + +struct tcp_v4_open_req { + __u32 loc_addr; + __u32 rmt_addr; + struct ip_options *opt; +}; + +#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) +struct tcp_v6_open_req { + struct in6_addr loc_addr; + struct in6_addr rmt_addr; + struct sk_buff *pktopts; + int iif; +}; +#endif + +/* this structure is too big */ +struct open_request { + struct open_request *dl_next; /* Must be first member! */ + __u32 rcv_isn; + __u32 snt_isn; + __u16 rmt_port; + __u16 mss; + __u8 retrans; + __u8 __pad; + __u16 snd_wscale : 4, + rcv_wscale : 4, + tstamp_ok : 1, + sack_ok : 1, + wscale_ok : 1, + ecn_ok : 1, + acked : 1; + /* The following two fields can be easily recomputed I think -AK */ + __u32 window_clamp; /* window clamp at creation time */ + __u32 rcv_wnd; /* rcv_wnd offered first time */ + __u32 ts_recent; + unsigned long expires; + struct or_calltable *class; + struct sock *sk; + union { + struct tcp_v4_open_req v4_req; +#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) + struct tcp_v6_open_req v6_req; +#endif + } af; +}; + +/* SLAB cache for open requests. 
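+ *
+ * Rough lifecycle (a sketch, not a contract): a listener answers a SYN
+ * by allocating a request with tcp_openreq_alloc() below and parking
+ * it on its SYN queue; retransmitted SYN-ACKs go through
+ * req->class->rtx_syn_ack(); the completing ACK promotes the request
+ * via tcp_create_openreq_child(); drop and timeout paths release it
+ * with
+ *
+ *	tcp_openreq_free(req);          runs class->destructor, then
+ *					returns the request to this cache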
*/ +extern kmem_cache_t *tcp_openreq_cachep; + +#define tcp_openreq_alloc() kmem_cache_alloc(tcp_openreq_cachep, SLAB_ATOMIC) +#define tcp_openreq_fastfree(req) kmem_cache_free(tcp_openreq_cachep, req) + +static inline void tcp_openreq_free(struct open_request *req) +{ + req->class->destructor(req); + tcp_openreq_fastfree(req); +} + +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +#define TCP_INET_FAMILY(fam) ((fam) == AF_INET) +#else +#define TCP_INET_FAMILY(fam) 1 +#endif + +/* + * Pointers to address related TCP functions + * (i.e. things that depend on the address family) + * + * BUGGG_FUTURE: all the idea behind this struct is wrong. + * It mixes socket frontend with transport function. + * With port sharing between IPv6/v4 it gives the only advantage, + * only poor IPv6 needs to permanently recheck, that it + * is still IPv6 8)8) It must be cleaned up as soon as possible. + * --ANK (980802) + */ + +struct tcp_func { + int (*queue_xmit) (struct sk_buff *skb); + + void (*send_check) (struct sock *sk, + struct tcphdr *th, + int len, + struct sk_buff *skb); + + int (*rebuild_header) (struct sock *sk); + + int (*conn_request) (struct sock *sk, + struct sk_buff *skb); + + struct sock * (*syn_recv_sock) (struct sock *sk, + struct sk_buff *skb, + struct open_request *req, + struct dst_entry *dst); + + int (*remember_stamp) (struct sock *sk); + + __u16 net_header_len; + + int (*setsockopt) (struct sock *sk, + int level, + int optname, + char *optval, + int optlen); + + int (*getsockopt) (struct sock *sk, + int level, + int optname, + char *optval, + int *optlen); + + + void (*addr2sockaddr) (struct sock *sk, + struct sockaddr *); + + int sockaddr_len; +}; + +/* + * The next routines deal with comparing 32 bit unsigned ints + * and worry about wraparound (automatic with unsigned arithmetic). + */ + +extern __inline int before(__u32 seq1, __u32 seq2) +{ + return (__s32)(seq1-seq2) < 0; +} + +extern __inline int after(__u32 seq1, __u32 seq2) +{ + return (__s32)(seq2-seq1) < 0; +} + + +/* is s2<=s1<=s3 ? 
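+ *
+ * All three helpers rely on unsigned 32-bit wraparound, so they stay
+ * correct when the sequence space rolls over. For instance, with
+ * seq1 = 0xfffffff0 and seq2 = 0x00000010:
+ *
+ *	before(seq1, seq2)      (__s32)(seq1 - seq2) == -0x20 < 0, so true
+ *	after(seq2, seq1)       true for the same reason
+ *
+ * i.e. a sequence number just below the wrap point still orders
+ * "before" one just above it.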
*/ +extern __inline int between(__u32 seq1, __u32 seq2, __u32 seq3) +{ + return seq3 - seq2 >= seq1 - seq2; +} + + +extern struct proto tcp_prot; + +#ifdef ROS_STATISTICS +extern struct tcp_mib tcp_statistics[NR_CPUS*2]; + +#define TCP_INC_STATS(field) SNMP_INC_STATS(tcp_statistics, field) +#define TCP_INC_STATS_BH(field) SNMP_INC_STATS_BH(tcp_statistics, field) +#define TCP_INC_STATS_USER(field) SNMP_INC_STATS_USER(tcp_statistics, field) +#endif + +extern void tcp_put_port(struct sock *sk); +extern void __tcp_put_port(struct sock *sk); +extern void tcp_inherit_port(struct sock *sk, struct sock *child); + +extern void tcp_v4_err(struct sk_buff *skb, u32); + +extern void tcp_shutdown (struct sock *sk, int how); + +extern int tcp_v4_rcv(struct sk_buff *skb); + +extern int tcp_v4_remember_stamp(struct sock *sk); + +extern int tcp_v4_tw_remember_stamp(struct tcp_tw_bucket *tw); + +extern int tcp_sendmsg(struct sock *sk, struct msghdr *msg, int size); +extern ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags); + +extern int tcp_ioctl(struct sock *sk, + int cmd, + unsigned long arg); + +extern int tcp_rcv_state_process(struct sock *sk, + struct sk_buff *skb, + struct tcphdr *th, + unsigned len); + +extern int tcp_rcv_established(struct sock *sk, + struct sk_buff *skb, + struct tcphdr *th, + unsigned len); + +enum tcp_ack_state_t +{ + TCP_ACK_SCHED = 1, + TCP_ACK_TIMER = 2, + TCP_ACK_PUSHED= 4 +}; + +static inline void tcp_schedule_ack(struct tcp_opt *tp) +{ + tp->ack.pending |= TCP_ACK_SCHED; +} + +static inline int tcp_ack_scheduled(struct tcp_opt *tp) +{ + return tp->ack.pending&TCP_ACK_SCHED; +} + +static __inline__ void tcp_dec_quickack_mode(struct tcp_opt *tp) +{ + if (tp->ack.quick && --tp->ack.quick == 0) { + /* Leaving quickack mode we deflate ATO. 
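The before()/after()/between() helpers above lean on unsigned subtraction followed by a signed cast, which keeps the ordering correct when 32-bit sequence numbers wrap. A minimal standalone sketch of the same trick (the seq_* names and the test values are mine, not part of the driver):

#include <assert.h>
#include <stdint.h>

static int seq_before(uint32_t s1, uint32_t s2)
{
    /* s1 precedes s2 in sequence space, even across a 2^32 wrap. */
    return (int32_t)(s1 - s2) < 0;
}

static int seq_between(uint32_t s1, uint32_t s2, uint32_t s3)
{
    /* s2 <= s1 <= s3 in the same wrap-aware arithmetic. */
    return s3 - s2 >= s1 - s2;
}

int main(void)
{
    assert(seq_before(0xFFFFFFF0u, 0x00000010u));     /* "old" edge of a wrap */
    assert(!seq_before(0x00000010u, 0xFFFFFFF0u));
    assert(seq_between(0x00000005u, 0xFFFFFFF0u, 0x00000010u));
    return 0;
}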
*/ + tp->ack.ato = TCP_ATO_MIN; + } +} + +extern void tcp_enter_quickack_mode(struct tcp_opt *tp); + +static __inline__ void tcp_delack_init(struct tcp_opt *tp) +{ + memset(&tp->ack, 0, sizeof(tp->ack)); +} + +static inline void tcp_clear_options(struct tcp_opt *tp) +{ + tp->tstamp_ok = tp->sack_ok = tp->wscale_ok = tp->snd_wscale = 0; +} + +enum tcp_tw_status +{ + TCP_TW_SUCCESS = 0, + TCP_TW_RST = 1, + TCP_TW_ACK = 2, + TCP_TW_SYN = 3 +}; + + +extern enum tcp_tw_status tcp_timewait_state_process(struct tcp_tw_bucket *tw, + struct sk_buff *skb, + struct tcphdr *th, + unsigned len); + +extern struct sock * tcp_check_req(struct sock *sk,struct sk_buff *skb, + struct open_request *req, + struct open_request **prev); +extern int tcp_child_process(struct sock *parent, + struct sock *child, + struct sk_buff *skb); +extern void tcp_enter_loss(struct sock *sk, int how); +extern void tcp_clear_retrans(struct tcp_opt *tp); +extern void tcp_update_metrics(struct sock *sk); + +extern void tcp_close(struct sock *sk, + long timeout); +extern struct sock * tcp_accept(struct sock *sk, int flags, int *err); +extern unsigned int tcp_poll(struct file * file, struct socket *sock, struct poll_table_struct *wait); +extern void tcp_write_space(struct sock *sk); + +extern int tcp_getsockopt(struct sock *sk, int level, + int optname, char *optval, + int *optlen); +extern int tcp_setsockopt(struct sock *sk, int level, + int optname, char *optval, + int optlen); +extern void tcp_set_keepalive(struct sock *sk, int val); +extern int tcp_recvmsg(struct sock *sk, + struct msghdr *msg, + int len, int nonblock, + int flags, int *addr_len); + +extern int tcp_listen_start(struct sock *sk); + +extern void tcp_parse_options(struct sk_buff *skb, + struct tcp_opt *tp, + int estab); + +/* + * TCP v4 functions exported for the inet6 API + */ + +extern int tcp_v4_rebuild_header(struct sock *sk); + +extern int tcp_v4_build_header(struct sock *sk, + struct sk_buff *skb); + +extern void tcp_v4_send_check(struct sock *sk, + struct tcphdr *th, int len, + struct sk_buff *skb); + +extern int tcp_v4_conn_request(struct sock *sk, + struct sk_buff *skb); + +extern struct sock * tcp_create_openreq_child(struct sock *sk, + struct open_request *req, + struct sk_buff *skb); + +extern struct sock * tcp_v4_syn_recv_sock(struct sock *sk, + struct sk_buff *skb, + struct open_request *req, + struct dst_entry *dst); + +extern int tcp_v4_do_rcv(struct sock *sk, + struct sk_buff *skb); + +extern int tcp_v4_connect(struct sock *sk, + struct sockaddr *uaddr, + int addr_len); + +extern int tcp_connect(struct sock *sk); + +extern struct sk_buff * tcp_make_synack(struct sock *sk, + struct dst_entry *dst, + struct open_request *req); + +extern int tcp_disconnect(struct sock *sk, int flags); + +extern void tcp_unhash(struct sock *sk); + +extern int tcp_v4_hash_connecting(struct sock *sk); + + +/* From syncookies.c */ +extern struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, + struct ip_options *opt); +extern __u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb, + __u16 *mss); + +/* tcp_output.c */ + +extern int tcp_write_xmit(struct sock *, int nonagle); +extern int tcp_retransmit_skb(struct sock *, struct sk_buff *); +extern void tcp_xmit_retransmit_queue(struct sock *); +extern void tcp_simple_retransmit(struct sock *); + +extern void tcp_send_probe0(struct sock *); +extern void tcp_send_partial(struct sock *); +extern int tcp_write_wakeup(struct sock *); +extern void tcp_send_fin(struct sock *sk); +extern void 
tcp_send_active_reset(struct sock *sk, int priority); +extern int tcp_send_synack(struct sock *); +extern int tcp_transmit_skb(struct sock *, struct sk_buff *); +extern void tcp_send_skb(struct sock *, struct sk_buff *, int force_queue, unsigned mss_now); +extern void tcp_push_one(struct sock *, unsigned mss_now); +extern void tcp_send_ack(struct sock *sk); +extern void tcp_send_delayed_ack(struct sock *sk); + +/* tcp_timer.c */ +extern void tcp_init_xmit_timers(struct sock *); +extern void tcp_clear_xmit_timers(struct sock *); + +extern void tcp_delete_keepalive_timer (struct sock *); +extern void tcp_reset_keepalive_timer (struct sock *, unsigned long); +extern int tcp_sync_mss(struct sock *sk, u32 pmtu); + +extern const char timer_bug_msg[]; + +/* Read 'sendfile()'-style from a TCP socket */ +typedef int (*sk_read_actor_t)(read_descriptor_t *, struct sk_buff *, + unsigned int, size_t); +extern int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, + sk_read_actor_t recv_actor); + +static inline void tcp_clear_xmit_timer(struct sock *sk, int what) +{ +#if 0 + struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; + + switch (what) { + case TCP_TIME_RETRANS: + case TCP_TIME_PROBE0: + tp->pending = 0; + +#ifdef TCP_CLEAR_TIMERS + if (timer_pending(&tp->retransmit_timer) && + del_timer(&tp->retransmit_timer)) + __sock_put(sk); +#endif + break; + case TCP_TIME_DACK: + tp->ack.blocked = 0; + tp->ack.pending = 0; + +#ifdef TCP_CLEAR_TIMERS + if (timer_pending(&tp->delack_timer) && + del_timer(&tp->delack_timer)) + __sock_put(sk); +#endif + break; + default: + printk(timer_bug_msg); + return; + }; +#endif +} + +/* + * Reset the retransmission timer + */ +static inline void tcp_reset_xmit_timer(struct sock *sk, int what, unsigned long when) +{ +#if 0 + struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; + + if (when > TCP_RTO_MAX) { +#ifdef TCP_DEBUG + printk(KERN_DEBUG "reset_xmit_timer sk=%p %d when=0x%lx, caller=%p\n", sk, what, when, current_text_addr()); +#endif + when = TCP_RTO_MAX; + } + + switch (what) { + case TCP_TIME_RETRANS: + case TCP_TIME_PROBE0: + tp->pending = what; + tp->timeout = jiffies+when; + if (!mod_timer(&tp->retransmit_timer, tp->timeout)) + sock_hold(sk); + break; + + case TCP_TIME_DACK: + tp->ack.pending |= TCP_ACK_TIMER; + tp->ack.timeout = jiffies+when; + if (!mod_timer(&tp->delack_timer, tp->ack.timeout)) + sock_hold(sk); + break; + + default: + printk(KERN_DEBUG "bug: unknown timer value\n"); + }; +#endif +} + +/* Compute the current effective MSS, taking SACKs and IP options, + * and even PMTU discovery events into account. + */ + +static __inline__ unsigned int tcp_current_mss(struct sock *sk) +{ +#if 0 + struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; + struct dst_entry *dst = __sk_dst_get(sk); + int mss_now = tp->mss_cache; + + if (dst && dst->pmtu != tp->pmtu_cookie) + mss_now = tcp_sync_mss(sk, dst->pmtu); + + if (tp->eff_sacks) + mss_now -= (TCPOLEN_SACK_BASE_ALIGNED + + (tp->eff_sacks * TCPOLEN_SACK_PERBLOCK)); + return mss_now; +#else + return 0; +#endif +} + +/* Initialize RCV_MSS value. + * RCV_MSS is an our guess about MSS used by the peer. + * We haven't any direct information about the MSS. + * It's better to underestimate the RCV_MSS rather than overestimate. + * Overestimations make us ACKing less frequently than needed. + * Underestimations are more easy to detect and fix by tcp_measure_rcv_mss(). 
+ */ + +static inline void tcp_initialize_rcv_mss(struct sock *sk) +{ +#if 0 + struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; + unsigned int hint = min(tp->advmss, tp->mss_cache); + + hint = min(hint, tp->rcv_wnd/2); + hint = min(hint, TCP_MIN_RCVMSS); + hint = max(hint, TCP_MIN_MSS); + + tp->ack.rcv_mss = hint; +#endif +} + +static __inline__ void __tcp_fast_path_on(struct tcp_opt *tp, u32 snd_wnd) +{ +#if 0 + tp->pred_flags = htonl((tp->tcp_header_len << 26) | + ntohl(TCP_FLAG_ACK) | + snd_wnd); +#endif +} + +static __inline__ void tcp_fast_path_on(struct tcp_opt *tp) +{ +#if 0 + __tcp_fast_path_on(tp, tp->snd_wnd>>tp->snd_wscale); +#endif +} + +static inline void tcp_fast_path_check(struct sock *sk, struct tcp_opt *tp) +{ +#if 0 + if (skb_queue_len(&tp->out_of_order_queue) == 0 && + tp->rcv_wnd && + atomic_read(&sk->rmem_alloc) < sk->rcvbuf && + !tp->urg_data) + tcp_fast_path_on(tp); +#endif +} + +/* Compute the actual receive window we are currently advertising. + * Rcv_nxt can be after the window if our peer push more data + * than the offered window. + */ +static __inline__ u32 tcp_receive_window(struct tcp_opt *tp) +{ +#if 0 + s32 win = tp->rcv_wup + tp->rcv_wnd - tp->rcv_nxt; + + if (win < 0) + win = 0; + return (u32) win; +#else + return 0; +#endif +} + +/* Choose a new window, without checks for shrinking, and without + * scaling applied to the result. The caller does these things + * if necessary. This is a "raw" window selection. + */ +extern u32 __tcp_select_window(struct sock *sk); + +/* TCP timestamps are only 32-bits, this causes a slight + * complication on 64-bit systems since we store a snapshot + * of jiffies in the buffer control blocks below. We decidely + * only use of the low 32-bits of jiffies and hide the ugly + * casts with the following macro. + */ +#define tcp_time_stamp ((__u32)(jiffies)) + +/* This is what the send packet queueing engine uses to pass + * TCP per-packet control information to the transmission + * code. We also store the host-order sequence numbers in + * here too. This is 36 bytes on 32-bit architectures, + * 40 bytes on 64-bit machines, if this grows please adjust + * skbuff.h:skbuff->cb[xxx] size appropriately. + */ +struct tcp_skb_cb { + union { +#if 0 + struct inet_skb_parm h4; +#endif +#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) + struct inet6_skb_parm h6; +#endif + } header; /* For incoming frames */ + __u32 seq; /* Starting sequence number */ + __u32 end_seq; /* SEQ + FIN + SYN + datalen */ + __u32 when; /* used to compute rtt's */ + __u8 flags; /* TCP header flags. */ + + /* NOTE: These must match up to the flags byte in a + * real TCP header. + */ +#define TCPCB_FLAG_FIN 0x01 +#define TCPCB_FLAG_SYN 0x02 +#define TCPCB_FLAG_RST 0x04 +#define TCPCB_FLAG_PSH 0x08 +#define TCPCB_FLAG_ACK 0x10 +#define TCPCB_FLAG_URG 0x20 +#define TCPCB_FLAG_ECE 0x40 +#define TCPCB_FLAG_CWR 0x80 + + __u8 sacked; /* State flags for SACK/FACK. */ +#define TCPCB_SACKED_ACKED 0x01 /* SKB ACK'd by a SACK block */ +#define TCPCB_SACKED_RETRANS 0x02 /* SKB retransmitted */ +#define TCPCB_LOST 0x04 /* SKB is lost */ +#define TCPCB_TAGBITS 0x07 /* All tag bits */ + +#define TCPCB_EVER_RETRANS 0x80 /* Ever retransmitted frame */ +#define TCPCB_RETRANS (TCPCB_SACKED_RETRANS|TCPCB_EVER_RETRANS) + +#define TCPCB_URG 0x20 /* Urgent pointer advenced here */ + +#define TCPCB_AT_TAIL (TCPCB_URG) + + __u16 urg_ptr; /* Valid w/URG flags is set. 
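The window arithmetic in tcp_receive_window() above is simply the right edge of the last offer (rcv_wup + rcv_wnd) minus what has already arrived (rcv_nxt), clamped at zero when the peer pushed past the offered window. A standalone restatement with the tcp_opt fields passed as plain parameters (that flattening is my simplification):

#include <stdint.h>

/* Window we are currently advertising; may be 0 if the sender overran it. */
static uint32_t receive_window(uint32_t rcv_wup, uint32_t rcv_wnd, uint32_t rcv_nxt)
{
    int32_t win = (int32_t)(rcv_wup + rcv_wnd - rcv_nxt);

    return win < 0 ? 0 : (uint32_t)win;
}

/* e.g. receive_window(1000, 65535, 21000) == 45535,
 *      receive_window(1000, 65535, 70000) == 0 (peer sent beyond the offer). */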
*/ + __u32 ack_seq; /* Sequence number ACK'd */ +}; + +#define TCP_SKB_CB(__skb) ((struct tcp_skb_cb *)&((__skb)->cb[0])) + +#define for_retrans_queue(skb, sk, tp) \ + for (skb = (sk)->write_queue.next; \ + (skb != (tp)->send_head) && \ + (skb != (struct sk_buff *)&(sk)->write_queue); \ + skb=skb->next) + + +//#include + + +/* + * Compute minimal free write space needed to queue new packets. + */ +static inline int tcp_min_write_space(struct sock *sk) +{ +#if 0 + return sk->wmem_queued/2; +#else +return 0; +#endif +} + +static inline int tcp_wspace(struct sock *sk) +{ +#if 0 + return sk->sndbuf - sk->wmem_queued; +#else +return 0; +#endif +} + + +/* This determines how many packets are "in the network" to the best + * of our knowledge. In many cases it is conservative, but where + * detailed information is available from the receiver (via SACK + * blocks etc.) we can make more aggressive calculations. + * + * Use this for decisions involving congestion control, use just + * tp->packets_out to determine if the send queue is empty or not. + * + * Read this equation as: + * + * "Packets sent once on transmission queue" MINUS + * "Packets left network, but not honestly ACKed yet" PLUS + * "Packets fast retransmitted" + */ +static __inline__ unsigned int tcp_packets_in_flight(struct tcp_opt *tp) +{ +#if 0 + return tp->packets_out - tp->left_out + tp->retrans_out; +#else + return 0; +#endif +} + +/* Recalculate snd_ssthresh, we want to set it to: + * + * one half the current congestion window, but no + * less than two segments + */ +static inline __u32 tcp_recalc_ssthresh(struct tcp_opt *tp) +{ +#if 0 + return max(tp->snd_cwnd >> 1U, 2U); +#else + return 0; +#endif +} + +/* If cwnd > ssthresh, we may raise ssthresh to be half-way to cwnd. + * The exception is rate halving phase, when cwnd is decreasing towards + * ssthresh. + */ +static inline __u32 tcp_current_ssthresh(struct tcp_opt *tp) +{ +#if 0 + if ((1<ca_state)&(TCPF_CA_CWR|TCPF_CA_Recovery)) + return tp->snd_ssthresh; + else + return max(tp->snd_ssthresh, + ((tp->snd_cwnd >> 1) + + (tp->snd_cwnd >> 2))); +#else + return 0; +#endif +} + +static inline void tcp_sync_left_out(struct tcp_opt *tp) +{ +#if 0 + if (tp->sack_ok && tp->sacked_out >= tp->packets_out - tp->lost_out) + tp->sacked_out = tp->packets_out - tp->lost_out; + tp->left_out = tp->sacked_out + tp->lost_out; +#endif +} + +extern void tcp_cwnd_application_limited(struct sock *sk); + +/* Congestion window validation. (RFC2861) */ + +static inline void tcp_cwnd_validate(struct sock *sk, struct tcp_opt *tp) +{ +#if 0 + if (tp->packets_out >= tp->snd_cwnd) { + /* Network is feed fully. */ + tp->snd_cwnd_used = 0; + tp->snd_cwnd_stamp = tcp_time_stamp; + } else { + /* Network starves. 
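The congestion bookkeeping above reduces to a couple of sums: left_out = sacked_out + lost_out, in-flight = packets_out - left_out + retrans_out, and on a congestion event ssthresh becomes half the window but never less than two segments. A small sketch under those definitions (the local struct and function names are mine; the field names follow the driver):

#include <stdint.h>

struct cwnd_state {
    uint32_t packets_out;   /* segments sent at least once        */
    uint32_t sacked_out;    /* segments SACKed by the receiver    */
    uint32_t lost_out;      /* segments presumed lost             */
    uint32_t retrans_out;   /* segments currently retransmitted   */
    uint32_t snd_cwnd;      /* congestion window, in segments     */
};

static uint32_t packets_in_flight(const struct cwnd_state *s)
{
    uint32_t left_out = s->sacked_out + s->lost_out;   /* left the network */

    return s->packets_out - left_out + s->retrans_out;
}

static uint32_t recalc_ssthresh(const struct cwnd_state *s)
{
    uint32_t half = s->snd_cwnd >> 1;

    return half > 2 ? half : 2;          /* never below two segments */
}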
*/ + if (tp->packets_out > tp->snd_cwnd_used) + tp->snd_cwnd_used = tp->packets_out; + + if ((s32)(tcp_time_stamp - tp->snd_cwnd_stamp) >= tp->rto) + tcp_cwnd_application_limited(sk); + } +#endif +} + +/* Set slow start threshould and cwnd not falling to slow start */ +static inline void __tcp_enter_cwr(struct tcp_opt *tp) +{ +#if 0 + tp->undo_marker = 0; + tp->snd_ssthresh = tcp_recalc_ssthresh(tp); + tp->snd_cwnd = min(tp->snd_cwnd, + tcp_packets_in_flight(tp) + 1U); + tp->snd_cwnd_cnt = 0; + tp->high_seq = tp->snd_nxt; + tp->snd_cwnd_stamp = tcp_time_stamp; + TCP_ECN_queue_cwr(tp); +#endif +} + +static inline void tcp_enter_cwr(struct tcp_opt *tp) +{ +#if 0 + tp->prior_ssthresh = 0; + if (tp->ca_state < TCP_CA_CWR) { + __tcp_enter_cwr(tp); + tp->ca_state = TCP_CA_CWR; + } +#endif +} + +extern __u32 tcp_init_cwnd(struct tcp_opt *tp); + +/* Slow start with delack produces 3 packets of burst, so that + * it is safe "de facto". + */ +static __inline__ __u32 tcp_max_burst(struct tcp_opt *tp) +{ + return 3; +} + +static __inline__ int tcp_minshall_check(struct tcp_opt *tp) +{ +#if 0 + return after(tp->snd_sml,tp->snd_una) && + !after(tp->snd_sml, tp->snd_nxt); +#else + return 0; +#endif +} + +static __inline__ void tcp_minshall_update(struct tcp_opt *tp, int mss, struct sk_buff *skb) +{ +#if 0 + if (skb->len < mss) + tp->snd_sml = TCP_SKB_CB(skb)->end_seq; +#endif +} + +/* Return 0, if packet can be sent now without violation Nagle's rules: + 1. It is full sized. + 2. Or it contains FIN. + 3. Or TCP_NODELAY was set. + 4. Or TCP_CORK is not set, and all sent packets are ACKed. + With Minshall's modification: all sent small packets are ACKed. + */ + +static __inline__ int +tcp_nagle_check(struct tcp_opt *tp, struct sk_buff *skb, unsigned mss_now, int nonagle) +{ +#if 0 + return (skb->len < mss_now && + !(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) && + (nonagle == 2 || + (!nonagle && + tp->packets_out && + tcp_minshall_check(tp)))); +#else + return 0; +#endif +} + +/* This checks if the data bearing packet SKB (usually tp->send_head) + * should be put on the wire right now. + */ +static __inline__ int tcp_snd_test(struct tcp_opt *tp, struct sk_buff *skb, + unsigned cur_mss, int nonagle) +{ +#if 0 + /* RFC 1122 - section 4.2.3.4 + * + * We must queue if + * + * a) The right edge of this frame exceeds the window + * b) There are packets in flight and we have a small segment + * [SWS avoidance and Nagle algorithm] + * (part of SWS is done on packetization) + * Minshall version sounds: there are no _small_ + * segments in flight. (tcp_nagle_check) + * c) We have too many packets 'in flight' + * + * Don't use the nagle rule for urgent data (or + * for the final FIN -DaveM). + * + * Also, Nagle rule does not apply to frames, which + * sit in the middle of queue (they have no chances + * to get new data) and if room at tail of skb is + * not enough to save something seriously (<32 for now). + */ + + /* Don't be strict about the congestion window for the + * final FIN frame. 
-DaveM + */ + return ((nonagle==1 || tp->urg_mode + || !tcp_nagle_check(tp, skb, cur_mss, nonagle)) && + ((tcp_packets_in_flight(tp) < tp->snd_cwnd) || + (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)) && + !after(TCP_SKB_CB(skb)->end_seq, tp->snd_una + tp->snd_wnd)); +#else + return 0; +#endif +} + +static __inline__ void tcp_check_probe_timer(struct sock *sk, struct tcp_opt *tp) +{ +#if 0 + if (!tp->packets_out && !tp->pending) + tcp_reset_xmit_timer(sk, TCP_TIME_PROBE0, tp->rto); +#endif +} + +static __inline__ int tcp_skb_is_last(struct sock *sk, struct sk_buff *skb) +{ +#if 0 + return (skb->next == (struct sk_buff*)&sk->write_queue); +#else + return 0; +#endif +} + +/* Push out any pending frames which were held back due to + * TCP_CORK or attempt at coalescing tiny packets. + * The socket must be locked by the caller. + */ +static __inline__ void __tcp_push_pending_frames(struct sock *sk, + struct tcp_opt *tp, + unsigned cur_mss, + int nonagle) +{ +#if 0 + struct sk_buff *skb = tp->send_head; + + if (skb) { + if (!tcp_skb_is_last(sk, skb)) + nonagle = 1; + if (!tcp_snd_test(tp, skb, cur_mss, nonagle) || + tcp_write_xmit(sk, nonagle)) + tcp_check_probe_timer(sk, tp); + } + tcp_cwnd_validate(sk, tp); +#endif +} + +static __inline__ void tcp_push_pending_frames(struct sock *sk, + struct tcp_opt *tp) +{ +#if 0 + __tcp_push_pending_frames(sk, tp, tcp_current_mss(sk), tp->nonagle); +#endif +} + +static __inline__ int tcp_may_send_now(struct sock *sk, struct tcp_opt *tp) +{ +#if 0 + struct sk_buff *skb = tp->send_head; + + return (skb && + tcp_snd_test(tp, skb, tcp_current_mss(sk), + tcp_skb_is_last(sk, skb) ? 1 : tp->nonagle)); +#else + return 0; +#endif +} + +static __inline__ void tcp_init_wl(struct tcp_opt *tp, u32 ack, u32 seq) +{ +#if 0 + tp->snd_wl1 = seq; +#endif +} + +static __inline__ void tcp_update_wl(struct tcp_opt *tp, u32 ack, u32 seq) +{ +#if 0 + tp->snd_wl1 = seq; +#endif +} + +extern void tcp_destroy_sock(struct sock *sk); + + +/* + * Calculate(/check) TCP checksum + */ +static __inline__ u16 tcp_v4_check(struct tcphdr *th, int len, + unsigned long saddr, unsigned long daddr, + unsigned long base) +{ +#if 0 + return csum_tcpudp_magic(saddr,daddr,len,IPPROTO_TCP,base); +#else + return 0; +#endif +} + +static __inline__ int __tcp_checksum_complete(struct sk_buff *skb) +{ +#if 0 + return (unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum)); +#else + return 0; +#endif +} + +static __inline__ int tcp_checksum_complete(struct sk_buff *skb) +{ +#if 0 + return skb->ip_summed != CHECKSUM_UNNECESSARY && + __tcp_checksum_complete(skb); +#else + return 0; +#endif +} + +/* Prequeue for VJ style copy to user, combined with checksumming. */ + +static __inline__ void tcp_prequeue_init(struct tcp_opt *tp) +{ +#if 0 + tp->ucopy.task = NULL; + tp->ucopy.len = 0; + tp->ucopy.memory = 0; + skb_queue_head_init(&tp->ucopy.prequeue); +#endif +} + +/* Packet is added to VJ-style prequeue for processing in process + * context, if a reader task is waiting. Apparently, this exciting + * idea (VJ's mail "Re: query about TCP header on tcp-ip" of 07 Sep 93) + * failed somewhere. Latency? Burstiness? Well, at least now we will + * see, why it failed. 8)8) --ANK + * + * NOTE: is this not too big to inline? 
+ */ +static __inline__ int tcp_prequeue(struct sock *sk, struct sk_buff *skb) +{ +#if 0 + struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; + + if (tp->ucopy.task) { + __skb_queue_tail(&tp->ucopy.prequeue, skb); + tp->ucopy.memory += skb->truesize; + if (tp->ucopy.memory > sk->rcvbuf) { + struct sk_buff *skb1; + + if (sk->lock.users) + out_of_line_bug(); + + while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) { + sk->backlog_rcv(sk, skb1); + NET_INC_STATS_BH(TCPPrequeueDropped); + } + + tp->ucopy.memory = 0; + } else if (skb_queue_len(&tp->ucopy.prequeue) == 1) { + wake_up_interruptible(sk->sleep); + if (!tcp_ack_scheduled(tp)) + tcp_reset_xmit_timer(sk, TCP_TIME_DACK, (3*TCP_RTO_MIN)/4); + } + return 1; + } + return 0; +#else + return 0; +#endif +} + + +#undef STATE_TRACE + +#ifdef STATE_TRACE +static char *statename[]={ + "Unused","Established","Syn Sent","Syn Recv", + "Fin Wait 1","Fin Wait 2","Time Wait", "Close", + "Close Wait","Last ACK","Listen","Closing" +}; +#endif + +static __inline__ void tcp_set_state(struct sock *sk, int state) +{ +#if 0 + int oldstate = sk->state; + + switch (state) { + case TCP_ESTABLISHED: + if (oldstate != TCP_ESTABLISHED) + TCP_INC_STATS(TcpCurrEstab); + break; + + case TCP_CLOSE: + sk->prot->unhash(sk); + if (sk->prev && !(sk->userlocks&SOCK_BINDPORT_LOCK)) + tcp_put_port(sk); + /* fall through */ + default: + if (oldstate==TCP_ESTABLISHED) + tcp_statistics[smp_processor_id()*2+!in_softirq()].TcpCurrEstab--; + } + + /* Change state AFTER socket is unhashed to avoid closed + * socket sitting in hash tables. + */ + sk->state = state; + +#ifdef STATE_TRACE + SOCK_DEBUG(sk, "TCP sk=%p, State %s -> %s\n",sk, statename[oldstate],statename[state]); +#endif +#endif +} + +static __inline__ void tcp_done(struct sock *sk) +{ +#if 0 + tcp_set_state(sk, TCP_CLOSE); + tcp_clear_xmit_timers(sk); + + sk->shutdown = SHUTDOWN_MASK; + + if (!sk->dead) + sk->state_change(sk); + else + tcp_destroy_sock(sk); +#endif +} + +static __inline__ void tcp_sack_reset(struct tcp_opt *tp) +{ +#if 0 + tp->dsack = 0; + tp->eff_sacks = 0; + tp->num_sacks = 0; +#endif +} + +static __inline__ void tcp_build_and_update_options(__u32 *ptr, struct tcp_opt *tp, __u32 tstamp) +{ +#if 0 + if (tp->tstamp_ok) { + *ptr++ = __constant_htonl((TCPOPT_NOP << 24) | + (TCPOPT_NOP << 16) | + (TCPOPT_TIMESTAMP << 8) | + TCPOLEN_TIMESTAMP); + *ptr++ = htonl(tstamp); + *ptr++ = htonl(tp->ts_recent); + } + if (tp->eff_sacks) { + struct tcp_sack_block *sp = tp->dsack ? tp->duplicate_sack : tp->selective_acks; + int this_sack; + + *ptr++ = __constant_htonl((TCPOPT_NOP << 24) | + (TCPOPT_NOP << 16) | + (TCPOPT_SACK << 8) | + (TCPOLEN_SACK_BASE + + (tp->eff_sacks * TCPOLEN_SACK_PERBLOCK))); + for(this_sack = 0; this_sack < tp->eff_sacks; this_sack++) { + *ptr++ = htonl(sp[this_sack].start_seq); + *ptr++ = htonl(sp[this_sack].end_seq); + } + if (tp->dsack) { + tp->dsack = 0; + tp->eff_sacks--; + } + } +#endif +} + +/* Construct a tcp options header for a SYN or SYN_ACK packet. + * If this is every changed make sure to change the definition of + * MAX_SYN_SIZE to match the new maximum number of options that you + * can generate. + */ +static inline void tcp_syn_build_options(__u32 *ptr, int mss, int ts, int sack, + int offer_wscale, int wscale, __u32 tstamp, __u32 ts_recent) +{ +#if 0 + /* We always get an MSS option. + * The option bytes which will be seen in normal data + * packets should timestamps be used, must be in the MSS + * advertised. 
But we subtract them from tp->mss_cache so + * that calculations in tcp_sendmsg are simpler etc. + * So account for this fact here if necessary. If we + * don't do this correctly, as a receiver we won't + * recognize data packets as being full sized when we + * should, and thus we won't abide by the delayed ACK + * rules correctly. + * SACKs don't matter, we never delay an ACK when we + * have any of those going out. + */ + *ptr++ = htonl((TCPOPT_MSS << 24) | (TCPOLEN_MSS << 16) | mss); + if (ts) { + if(sack) + *ptr++ = __constant_htonl((TCPOPT_SACK_PERM << 24) | (TCPOLEN_SACK_PERM << 16) | + (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP); + else + *ptr++ = __constant_htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | + (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP); + *ptr++ = htonl(tstamp); /* TSVAL */ + *ptr++ = htonl(ts_recent); /* TSECR */ + } else if(sack) + *ptr++ = __constant_htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | + (TCPOPT_SACK_PERM << 8) | TCPOLEN_SACK_PERM); + if (offer_wscale) + *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_WINDOW << 16) | (TCPOLEN_WINDOW << 8) | (wscale)); +#endif +} + +/* Determine a window scaling and initial window to offer. + * Based on the assumption that the given amount of space + * will be offered. Store the results in the tp structure. + * NOTE: for smooth operation initial space offering should + * be a multiple of mss if possible. We assume here that mss >= 1. + * This MUST be enforced by all callers. + */ +static inline void tcp_select_initial_window(int __space, __u32 mss, + __u32 *rcv_wnd, + __u32 *window_clamp, + int wscale_ok, + __u8 *rcv_wscale) +{ +#if 0 + unsigned int space = (__space < 0 ? 0 : __space); + + /* If no clamp set the clamp to the max possible scaled window */ + if (*window_clamp == 0) + (*window_clamp) = (65535 << 14); + space = min(*window_clamp, space); + + /* Quantize space offering to a multiple of mss if possible. */ + if (space > mss) + space = (space / mss) * mss; + + /* NOTE: offering an initial window larger than 32767 + * will break some buggy TCP stacks. We try to be nice. + * If we are not window scaling, then this truncates + * our initial window offering to 32k. There should also + * be a sysctl option to stop being nice. + */ + (*rcv_wnd) = min(space, MAX_TCP_WINDOW); + (*rcv_wscale) = 0; + if (wscale_ok) { + /* See RFC1323 for an explanation of the limit to 14 */ + while (space > 65535 && (*rcv_wscale) < 14) { + space >>= 1; + (*rcv_wscale)++; + } + if (*rcv_wscale && sysctl_tcp_app_win && space>=mss && + space - max((space>>sysctl_tcp_app_win), mss>>*rcv_wscale) < 65536/2) + (*rcv_wscale)--; + } + + /* Set initial window to value enough for senders, + * following RFC1414. Senders, not following this RFC, + * will be satisfied with 2. + */ + if (mss > (1<<*rcv_wscale)) { + int init_cwnd = 4; + if (mss > 1460*3) + init_cwnd = 2; + else if (mss > 1460) + init_cwnd = 3; + if (*rcv_wnd > init_cwnd*mss) + *rcv_wnd = init_cwnd*mss; + } + /* Set the clamp no higher than max representable value */ + (*window_clamp) = min(65535U << (*rcv_wscale), *window_clamp); +#endif +} + +static inline int tcp_win_from_space(int space) +{ +#if 0 + return sysctl_tcp_adv_win_scale<=0 ? 
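The window-scale selection inside tcp_select_initial_window() above comes down to one loop: a 16-bit window field can describe at most 65535 bytes, so halve the buffer space and bump the shift until it fits, capping the shift at 14 as RFC 1323 requires. A minimal standalone sketch of that loop (the function name is mine; the extra app_win adjustment in the driver is omitted):

#include <stdint.h>

/* Smallest RFC 1323 window-scale shift that lets `space` bytes be advertised
 * through the 16-bit TCP window field; the shift is capped at 14. */
static uint8_t pick_rcv_wscale(uint32_t space)
{
    uint8_t wscale = 0;

    while (space > 65535 && wscale < 14) {
        space >>= 1;
        wscale++;
    }
    return wscale;
}

/* e.g. pick_rcv_wscale(128 * 1024) == 2: 128 KiB >> 2 == 32 KiB fits in 16 bits. */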
+ (space>>(-sysctl_tcp_adv_win_scale)) : + space - (space>>sysctl_tcp_adv_win_scale); +#else + return 0; +#endif +} + +/* Note: caller must be prepared to deal with negative returns */ +static inline int tcp_space(struct sock *sk) +{ +#if 0 + return tcp_win_from_space(sk->rcvbuf - atomic_read(&sk->rmem_alloc)); +#else + return 0; +#endif +} + +static inline int tcp_full_space( struct sock *sk) +{ +#if 0 + return tcp_win_from_space(sk->rcvbuf); +#else + return 0; +#endif +} + +static inline void tcp_acceptq_removed(struct sock *sk) +{ +#if 0 + sk->ack_backlog--; +#endif +} + +static inline void tcp_acceptq_added(struct sock *sk) +{ +#if 0 + sk->ack_backlog++; +#endif +} + +static inline int tcp_acceptq_is_full(struct sock *sk) +{ +#if 0 + return sk->ack_backlog > sk->max_ack_backlog; +#else + return 0; +#endif +} + +static inline void tcp_acceptq_queue(struct sock *sk, struct open_request *req, + struct sock *child) +{ +#if 0 + struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; + + req->sk = child; + tcp_acceptq_added(sk); + + if (!tp->accept_queue_tail) { + tp->accept_queue = req; + } else { + tp->accept_queue_tail->dl_next = req; + } + tp->accept_queue_tail = req; + req->dl_next = NULL; +#endif +} + +struct tcp_listen_opt +{ + u8 max_qlen_log; /* log_2 of maximal queued SYNs */ + int qlen; + int qlen_young; + int clock_hand; + struct open_request *syn_table[TCP_SYNQ_HSIZE]; +}; + +static inline void +tcp_synq_removed(struct sock *sk, struct open_request *req) +{ +#if 0 + struct tcp_listen_opt *lopt = sk->tp_pinfo.af_tcp.listen_opt; + + if (--lopt->qlen == 0) + tcp_delete_keepalive_timer(sk); + if (req->retrans == 0) + lopt->qlen_young--; +#endif +} + +static inline void tcp_synq_added(struct sock *sk) +{ +#if 0 + struct tcp_listen_opt *lopt = sk->tp_pinfo.af_tcp.listen_opt; + + if (lopt->qlen++ == 0) + tcp_reset_keepalive_timer(sk, TCP_TIMEOUT_INIT); + lopt->qlen_young++; +#endif +} + +static inline int tcp_synq_len(struct sock *sk) +{ +#if 0 + return sk->tp_pinfo.af_tcp.listen_opt->qlen; +#else + return 0; +#endif +} + +static inline int tcp_synq_young(struct sock *sk) +{ +#if 0 + return sk->tp_pinfo.af_tcp.listen_opt->qlen_young; +#else + return 0; +#endif +} + +static inline int tcp_synq_is_full(struct sock *sk) +{ +#if 0 + return tcp_synq_len(sk)>>sk->tp_pinfo.af_tcp.listen_opt->max_qlen_log; +#else + return 0; +#endif +} + +static inline void tcp_synq_unlink(struct tcp_opt *tp, struct open_request *req, + struct open_request **prev) +{ +#if 0 + write_lock(&tp->syn_wait_lock); + *prev = req->dl_next; + write_unlock(&tp->syn_wait_lock); +#endif +} + +static inline void tcp_synq_drop(struct sock *sk, struct open_request *req, + struct open_request **prev) +{ +#if 0 + tcp_synq_unlink(&sk->tp_pinfo.af_tcp, req, prev); + tcp_synq_removed(sk, req); + tcp_openreq_free(req); +#endif +} + +static __inline__ void tcp_openreq_init(struct open_request *req, + struct tcp_opt *tp, + struct sk_buff *skb) +{ +#if 0 + req->rcv_wnd = 0; /* So that tcp_send_synack() knows! */ + req->rcv_isn = TCP_SKB_CB(skb)->seq; + req->mss = tp->mss_clamp; + req->ts_recent = tp->saw_tstamp ? 
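tcp_win_from_space() above splits the receive buffer between advertised window and per-packet overhead via sysctl_tcp_adv_win_scale: a positive scale reserves space >> scale for overhead (the default of 2 advertises three quarters of the buffer), while a non-positive scale advertises only space >> -scale. A standalone restatement with the sysctl passed as a parameter (that is my change; in the driver it is a global):

/* Share of `space` buffer bytes that may be advertised as TCP window;
 * the rest is reserved for skb and bookkeeping overhead. */
static int win_from_space(int space, int adv_win_scale)
{
    return adv_win_scale <= 0 ? space >> -adv_win_scale
                              : space - (space >> adv_win_scale);
}

/* With the default scale of 2: win_from_space(65536, 2) == 49152 (3/4 of it). */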
tp->rcv_tsval : 0; + req->tstamp_ok = tp->tstamp_ok; + req->sack_ok = tp->sack_ok; + req->snd_wscale = tp->snd_wscale; + req->wscale_ok = tp->wscale_ok; + req->acked = 0; + req->ecn_ok = 0; + req->rmt_port = skb->h.th->source; +#endif +} + +#define TCP_MEM_QUANTUM ((int)PAGE_SIZE) + +static inline void tcp_free_skb(struct sock *sk, struct sk_buff *skb) +{ +#if 0 + sk->tp_pinfo.af_tcp.queue_shrunk = 1; + sk->wmem_queued -= skb->truesize; + sk->forward_alloc += skb->truesize; + __kfree_skb(skb); +#endif +} + +static inline void tcp_charge_skb(struct sock *sk, struct sk_buff *skb) +{ +#if 0 + sk->wmem_queued += skb->truesize; + sk->forward_alloc -= skb->truesize; +#endif +} + +extern void __tcp_mem_reclaim(struct sock *sk); +extern int tcp_mem_schedule(struct sock *sk, int size, int kind); + +static inline void tcp_mem_reclaim(struct sock *sk) +{ +#if 0 + if (sk->forward_alloc >= TCP_MEM_QUANTUM) + __tcp_mem_reclaim(sk); +#endif +} + +static inline void tcp_enter_memory_pressure(void) +{ +#if 0 + if (!tcp_memory_pressure) { + NET_INC_STATS(TCPMemoryPressures); + tcp_memory_pressure = 1; + } +#endif +} + +static inline void tcp_moderate_sndbuf(struct sock *sk) +{ +#if 0 + if (!(sk->userlocks&SOCK_SNDBUF_LOCK)) { + sk->sndbuf = min(sk->sndbuf, sk->wmem_queued/2); + sk->sndbuf = max(sk->sndbuf, SOCK_MIN_SNDBUF); + } +#endif +} + +static inline struct sk_buff *tcp_alloc_pskb(struct sock *sk, int size, int mem, int gfp) +{ +#if 0 + struct sk_buff *skb = alloc_skb(size+MAX_TCP_HEADER, gfp); + + if (skb) { + skb->truesize += mem; + if (sk->forward_alloc >= (int)skb->truesize || + tcp_mem_schedule(sk, skb->truesize, 0)) { + skb_reserve(skb, MAX_TCP_HEADER); + return skb; + } + __kfree_skb(skb); + } else { + tcp_enter_memory_pressure(); + tcp_moderate_sndbuf(sk); + } + return NULL; +#else + return NULL; +#endif +} + +static inline struct sk_buff *tcp_alloc_skb(struct sock *sk, int size, int gfp) +{ +#if 0 + return tcp_alloc_pskb(sk, size, 0, gfp); +#else + return NULL; +#endif +} + +static inline struct page * tcp_alloc_page(struct sock *sk) +{ +#if 0 + if (sk->forward_alloc >= (int)PAGE_SIZE || + tcp_mem_schedule(sk, PAGE_SIZE, 0)) { + struct page *page = alloc_pages(sk->allocation, 0); + if (page) + return page; + } + tcp_enter_memory_pressure(); + tcp_moderate_sndbuf(sk); + return NULL; +#else + return NULL; +#endif +} + +static inline void tcp_writequeue_purge(struct sock *sk) +{ +#if 0 + struct sk_buff *skb; + + while ((skb = __skb_dequeue(&sk->write_queue)) != NULL) + tcp_free_skb(sk, skb); + tcp_mem_reclaim(sk); +#endif +} + +extern void tcp_rfree(struct sk_buff *skb); + +static inline void tcp_set_owner_r(struct sk_buff *skb, struct sock *sk) +{ +#if 0 + skb->sk = sk; + skb->destructor = tcp_rfree; + atomic_add(skb->truesize, &sk->rmem_alloc); + sk->forward_alloc -= skb->truesize; +#endif +} + +extern void tcp_listen_wlock(void); + +/* - We may sleep inside this lock. + * - If sleeping is not required (or called from BH), + * use plain read_(un)lock(&tcp_lhash_lock). + */ + +static inline void tcp_listen_lock(void) +{ +#if 0 + /* read_lock synchronizes to candidates to writers */ + read_lock(&tcp_lhash_lock); + atomic_inc(&tcp_lhash_users); + read_unlock(&tcp_lhash_lock); +#endif +} + +static inline void tcp_listen_unlock(void) +{ +#if 0 + if (atomic_dec_and_test(&tcp_lhash_users)) + wake_up(&tcp_lhash_wait); +#endif +} + +static inline int keepalive_intvl_when(struct tcp_opt *tp) +{ +#if 0 + return tp->keepalive_intvl ? 
: sysctl_tcp_keepalive_intvl; +#else + return 0; +#endif +} + +static inline int keepalive_time_when(struct tcp_opt *tp) +{ +#if 0 + return tp->keepalive_time ? : sysctl_tcp_keepalive_time; +#else + return 0; +#endif +} + +static inline int tcp_fin_time(struct tcp_opt *tp) +{ +#if 0 + int fin_timeout = tp->linger2 ? : sysctl_tcp_fin_timeout; + + if (fin_timeout < (tp->rto<<2) - (tp->rto>>1)) + fin_timeout = (tp->rto<<2) - (tp->rto>>1); + + return fin_timeout; +#else + return 0; +#endif +} + +static inline int tcp_paws_check(struct tcp_opt *tp, int rst) +{ +#if 0 + if ((s32)(tp->rcv_tsval - tp->ts_recent) >= 0) + return 0; + if (xtime.tv_sec >= tp->ts_recent_stamp + TCP_PAWS_24DAYS) + return 0; + + /* RST segments are not recommended to carry timestamp, + and, if they do, it is recommended to ignore PAWS because + "their cleanup function should take precedence over timestamps." + Certainly, it is mistake. It is necessary to understand the reasons + of this constraint to relax it: if peer reboots, clock may go + out-of-sync and half-open connections will not be reset. + Actually, the problem would be not existing if all + the implementations followed draft about maintaining clock + via reboots. Linux-2.2 DOES NOT! + + However, we can relax time bounds for RST segments to MSL. + */ + if (rst && xtime.tv_sec >= tp->ts_recent_stamp + TCP_PAWS_MSL) + return 0; + return 1; +#else + return 0; +#endif +} + +#define TCP_CHECK_TIMER(sk) do { } while (0) + +#endif /* __TCPCORE_H */ + + +// +#endif +#endif diff --git a/reactos/drivers/net/tcpip/include/tcpdef.h b/reactos/drivers/net/tcpip/include/tcpdef.h new file mode 100755 index 00000000000..8005de2be6d --- /dev/null +++ b/reactos/drivers/net/tcpip/include/tcpdef.h @@ -0,0 +1,187 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * Definitions for the TCP protocol. + * + * Version: @(#)tcp.h 1.0.2 04/28/93 + * + * Author: Fred N. van Kempen, + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#ifndef _LINUX_TCP_H +#define _LINUX_TCP_H + +#include "linux.h" + +struct tcphdr { + __u16 source; + __u16 dest; + __u32 seq; + __u32 ack_seq; +//#if defined(__LITTLE_ENDIAN_BITFIELD) + __u16 res1:4, + doff:4, + fin:1, + syn:1, + rst:1, + psh:1, + ack:1, + urg:1, + ece:1, + cwr:1; +/*#elif defined(__BIG_ENDIAN_BITFIELD) + __u16 doff:4, + res1:4, + cwr:1, + ece:1, + urg:1, + ack:1, + psh:1, + rst:1, + syn:1, + fin:1; +#else +#error "Adjust your defines" +#endif */ + __u16 window; + __u16 check; + __u16 urg_ptr; +}; + + +enum { + TCP_ESTABLISHED = 1, + TCP_SYN_SENT, + TCP_SYN_RECV, + TCP_FIN_WAIT1, + TCP_FIN_WAIT2, + TCP_TIME_WAIT, + TCP_CLOSE, + TCP_CLOSE_WAIT, + TCP_LAST_ACK, + TCP_LISTEN, + TCP_CLOSING, /* now a valid state */ + + TCP_MAX_STATES /* Leave at the end! 
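tcp_paws_check() above is the RFC 1323 PAWS test: drop a segment whose timestamp went backwards relative to ts_recent (a signed 32-bit comparison, so timestamp wraparound is handled), unless the stored timestamp is itself too old to trust, with a shorter staleness bound allowed for RST segments. A standalone sketch of the same decision; the clock is passed in as seconds, and the two staleness constants are illustrative stand-ins for TCP_PAWS_24DAYS and TCP_PAWS_MSL:

#include <stdbool.h>
#include <stdint.h>

#define PAWS_24DAYS (60 * 60 * 24 * 24)   /* ts_recent older than this is stale   */
#define PAWS_MSL    60                    /* relaxed staleness bound used for RST */

/* True when PAWS says the segment must be dropped. */
static bool paws_reject(uint32_t rcv_tsval, uint32_t ts_recent,
                        long ts_recent_stamp, long now_sec, bool rst)
{
    if ((int32_t)(rcv_tsval - ts_recent) >= 0)
        return false;                     /* timestamp did not go backwards       */
    if (now_sec >= ts_recent_stamp + PAWS_24DAYS)
        return false;                     /* stored timestamp too old to trust    */
    if (rst && now_sec >= ts_recent_stamp + PAWS_MSL)
        return false;                     /* be lenient with RSTs after MSL       */
    return true;
}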
*/ +}; + +#define TCP_STATE_MASK 0xF +#define TCP_ACTION_FIN (1 << 7) + +enum { + TCPF_ESTABLISHED = (1 << 1), + TCPF_SYN_SENT = (1 << 2), + TCPF_SYN_RECV = (1 << 3), + TCPF_FIN_WAIT1 = (1 << 4), + TCPF_FIN_WAIT2 = (1 << 5), + TCPF_TIME_WAIT = (1 << 6), + TCPF_CLOSE = (1 << 7), + TCPF_CLOSE_WAIT = (1 << 8), + TCPF_LAST_ACK = (1 << 9), + TCPF_LISTEN = (1 << 10), + TCPF_CLOSING = (1 << 11) +}; + +/* + * The union cast uses a gcc extension to avoid aliasing problems + * (union is compatible to any of its members) + * This means this part of the code is -fstrict-aliasing safe now. + */ +union tcp_word_hdr { + struct tcphdr hdr; + __u32 words[5]; +}; + +#define tcp_flag_word(tp) ( ((union tcp_word_hdr *)(tp))->words [3]) + +enum { + TCP_FLAG_CWR = 0x00800000, // __constant_htonl(0x00800000), + TCP_FLAG_ECE = 0x00400000, //__constant_htonl(0x00400000), + TCP_FLAG_URG = 0x00200000, //__constant_htonl(0x00200000), + TCP_FLAG_ACK = 0x00100000, //__constant_htonl(0x00100000), + TCP_FLAG_PSH = 0x00080000, //__constant_htonl(0x00080000), + TCP_FLAG_RST = 0x00040000, //__constant_htonl(0x00040000), + TCP_FLAG_SYN = 0x00020000, //__constant_htonl(0x00020000), + TCP_FLAG_FIN = 0x00010000, //__constant_htonl(0x00010000), + TCP_RESERVED_BITS = 0x0F000000, //__constant_htonl(0x0F000000), + TCP_DATA_OFFSET = 0xF0000000, //__constant_htonl(0xF0000000) +}; + +/* TCP socket options */ +#define TCP_NODELAY 1 /* Turn off Nagle's algorithm. */ +#define TCP_MAXSEG 2 /* Limit MSS */ +#define TCP_CORK 3 /* Never send partially complete segments */ +#define TCP_KEEPIDLE 4 /* Start keeplives after this period */ +#define TCP_KEEPINTVL 5 /* Interval between keepalives */ +#define TCP_KEEPCNT 6 /* Number of keepalives before death */ +#define TCP_SYNCNT 7 /* Number of SYN retransmits */ +#define TCP_LINGER2 8 /* Life time of orphaned FIN-WAIT-2 state */ +#define TCP_DEFER_ACCEPT 9 /* Wake up listener only when data arrive */ +#define TCP_WINDOW_CLAMP 10 /* Bound advertised window */ +#define TCP_INFO 11 /* Information about this connection. */ +#define TCP_QUICKACK 12 /* Block/reenable quick acks */ + +#define TCPI_OPT_TIMESTAMPS 1 +#define TCPI_OPT_SACK 2 +#define TCPI_OPT_WSCALE 4 +#define TCPI_OPT_ECN 8 + +enum tcp_ca_state +{ + TCP_CA_Open = 0, +#define TCPF_CA_Open (1< + * Fred N. van Kempen, + * Mark Evans, + * Corey Minyard + * Florian La Roche, + * Charles Hedrick, + * Linus Torvalds, + * Alan Cox, + * Matthew Dillon, + * Arnt Gulbrandsen, + * Jorge Cwik, + */ + +/* + * Changes: + * Pedro Roque : Fast Retransmit/Recovery. + * Two receive queues. + * Retransmit queue handled by TCP. + * Better retransmit timer handling. + * New congestion avoidance. + * Header prediction. + * Variable renaming. + * + * Eric : Fast Retransmit. + * Randy Scott : MSS option defines. + * Eric Schenk : Fixes to slow start algorithm. + * Eric Schenk : Yet another double ACK bug. + * Eric Schenk : Delayed ACK bug fixes. + * Eric Schenk : Floyd style fast retrans war avoidance. + * David S. Miller : Don't allow zero congestion window. + * Eric Schenk : Fix retransmitter so that it sends + * next packet on ack of previous packet. + * Andi Kleen : Moved open_request checking here + * and process RSTs for open_requests. + * Andi Kleen : Better prune_queue, and other fixes. + * Andrey Savochkin: Fix RTT measurements in the presnce of + * timestamps. + * Andrey Savochkin: Check sequence numbers correctly when + * removing SACKs due to in sequence incoming + * data segments. 
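tcp_flag_word() above views the TCP header as five 32-bit words and grabs word 3, the one holding the data offset, reserved bits and the eight flag bits, so whole groups of flags can be tested with a single mask (TCP_REMNANT is built exactly that way). A small sketch of such a mask test; the mask values mirror the host-order constants in this port, byte-order handling is deliberately left out, and is_bare_ack is my own helper:

#include <stdint.h>

/* Flag bits as they sit in the 32-bit header word that also carries the
 * data offset and reserved bits (word 3 of the TCP header). */
#define FLAG_FIN 0x00010000u
#define FLAG_SYN 0x00020000u
#define FLAG_RST 0x00040000u
#define FLAG_PSH 0x00080000u
#define FLAG_ACK 0x00100000u
#define FLAG_URG 0x00200000u

/* A "bare" ACK: ACK set and none of the other interesting flags. */
static int is_bare_ack(uint32_t flag_word)
{
    return (flag_word & FLAG_ACK) != 0 &&
           (flag_word & (FLAG_FIN | FLAG_SYN | FLAG_RST | FLAG_PSH | FLAG_URG)) == 0;
}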
+ * Andi Kleen: Make sure we never ack data there is not + * enough room for. Also make this condition + * a fatal error if it might still happen. + * Andi Kleen: Add tcp_measure_rcv_mss to make + * connections with MSS +#include +#include +#include +#include +#include +#else +#include "linux.h" +#include "tcpcore.h" +#endif + +int sysctl_tcp_timestamps = 1; +int sysctl_tcp_window_scaling = 1; +int sysctl_tcp_sack = 1; +int sysctl_tcp_fack = 1; +int sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH; +#ifdef CONFIG_INET_ECN +int sysctl_tcp_ecn = 1; +#else +int sysctl_tcp_ecn = 0; +#endif +int sysctl_tcp_dsack = 1; +int sysctl_tcp_app_win = 31; +int sysctl_tcp_adv_win_scale = 2; + +int sysctl_tcp_stdurg = 0; +int sysctl_tcp_rfc1337 = 0; +//int sysctl_tcp_max_orphans = NR_FILE; + +#define FLAG_DATA 0x01 /* Incoming frame contained data. */ +#define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */ +#define FLAG_DATA_ACKED 0x04 /* This ACK acknowledged new data. */ +#define FLAG_RETRANS_DATA_ACKED 0x08 /* "" "" some of which was retransmitted. */ +#define FLAG_SYN_ACKED 0x10 /* This ACK acknowledged SYN. */ +#define FLAG_DATA_SACKED 0x20 /* New SACK. */ +#define FLAG_ECE 0x40 /* ECE in this ACK */ +#define FLAG_DATA_LOST 0x80 /* SACK detected data lossage. */ +#define FLAG_SLOWPATH 0x100 /* Do not skip RFC checks for window update.*/ + +#define FLAG_ACKED (FLAG_DATA_ACKED|FLAG_SYN_ACKED) +#define FLAG_NOT_DUP (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED) +#define FLAG_CA_ALERT (FLAG_DATA_SACKED|FLAG_ECE) +#define FLAG_FORWARD_PROGRESS (FLAG_ACKED|FLAG_DATA_SACKED) + +#define IsReno(tp) ((tp)->sack_ok == 0) +#define IsFack(tp) ((tp)->sack_ok & 2) +#define IsDSack(tp) ((tp)->sack_ok & 4) + +#define TCP_REMNANT (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN|TCP_FLAG_PSH) + +/* Adapt the MSS value used to make delayed ack decision to the + * real world. + */ +static __inline__ void tcp_measure_rcv_mss(struct tcp_opt *tp, struct sk_buff *skb) +{ +#if 0 + unsigned int len, lss; + + lss = tp->ack.last_seg_size; + tp->ack.last_seg_size = 0; + + /* skb->len may jitter because of SACKs, even if peer + * sends good full-sized frames. + */ + len = skb->len; + if (len >= tp->ack.rcv_mss) { + tp->ack.rcv_mss = len; + } else { + /* Otherwise, we make more careful check taking into account, + * that SACKs block is variable. + * + * "len" is invariant segment length, including TCP header. + */ + len += skb->data - skb->h.raw; + if (len >= TCP_MIN_RCVMSS + sizeof(struct tcphdr) || + /* If PSH is not set, packet should be + * full sized, provided peer TCP is not badly broken. + * This observation (if it is correct 8)) allows + * to handle super-low mtu links fairly. + */ + (len >= TCP_MIN_MSS + sizeof(struct tcphdr) && + !(tcp_flag_word(skb->h.th)&TCP_REMNANT))) { + /* Subtract also invariant (if peer is RFC compliant), + * tcp header plus fixed timestamp option length. + * Resulting "len" is MSS free of SACK jitter. 
+ */ + len -= tp->tcp_header_len; + tp->ack.last_seg_size = len; + if (len == lss) { + tp->ack.rcv_mss = len; + return; + } + } + tp->ack.pending |= TCP_ACK_PUSHED; + } +#endif +} + +static void tcp_incr_quickack(struct tcp_opt *tp) +{ +#if 0 + unsigned quickacks = tp->rcv_wnd/(2*tp->ack.rcv_mss); + + if (quickacks==0) + quickacks=2; + if (quickacks > tp->ack.quick) + tp->ack.quick = min(quickacks, TCP_MAX_QUICKACKS); +#endif +} + +void tcp_enter_quickack_mode(struct tcp_opt *tp) +{ +#if 0 + tcp_incr_quickack(tp); + tp->ack.pingpong = 0; + tp->ack.ato = TCP_ATO_MIN; +#endif +} + +/* Send ACKs quickly, if "quick" count is not exhausted + * and the session is not interactive. + */ + +static __inline__ int tcp_in_quickack_mode(struct tcp_opt *tp) +{ +#if 0 + return (tp->ack.quick && !tp->ack.pingpong); +#else + return 0; +#endif +} + +/* Buffer size and advertised window tuning. + * + * 1. Tuning sk->sndbuf, when connection enters established state. + */ + +static void tcp_fixup_sndbuf(struct sock *sk) +{ +#if 0 + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + int sndmem = tp->mss_clamp+MAX_TCP_HEADER+16+sizeof(struct sk_buff); + + if (sk->sndbuf < 3*sndmem) + sk->sndbuf = min(3*sndmem, sysctl_tcp_wmem[2]); +#endif +} + +/* 2. Tuning advertised window (window_clamp, rcv_ssthresh) + * + * All tcp_full_space() is split to two parts: "network" buffer, allocated + * forward and advertised in receiver window (tp->rcv_wnd) and + * "application buffer", required to isolate scheduling/application + * latencies from network. + * window_clamp is maximal advertised window. It can be less than + * tcp_full_space(), in this case tcp_full_space() - window_clamp + * is reserved for "application" buffer. The less window_clamp is + * the smoother our behaviour from viewpoint of network, but the lower + * throughput and the higher sensitivity of the connection to losses. 8) + * + * rcv_ssthresh is more strict window_clamp used at "slow start" + * phase to predict further behaviour of this connection. + * It is used for two goals: + * - to enforce header prediction at sender, even when application + * requires some significant "application buffer". It is check #1. + * - to prevent pruning of receive queue because of misprediction + * of receiver window. Check #2. + * + * The scheme does not work when sender sends good segments opening + * window and then starts to feed us spagetti. But it should work + * in common situations. Otherwise, we have to rely on queue collapsing. + */ + +/* Slow part of check#2. */ +static int +__tcp_grow_window(struct sock *sk, struct tcp_opt *tp, struct sk_buff *skb) +{ +#if 0 + /* Optimize this! */ + int truesize = tcp_win_from_space(skb->truesize)/2; + int window = tcp_full_space(sk)/2; + + while (tp->rcv_ssthresh <= window) { + if (truesize <= skb->len) + return 2*tp->ack.rcv_mss; + + truesize >>= 1; + window >>= 1; + } + return 0; +#else + return 0; +#endif +} + +static __inline__ void +tcp_grow_window(struct sock *sk, struct tcp_opt *tp, struct sk_buff *skb) +{ +#if 0 + /* Check #1 */ + if (tp->rcv_ssthresh < tp->window_clamp && + (int)tp->rcv_ssthresh < tcp_space(sk) && + !tcp_memory_pressure) { + int incr; + + /* Check #2. Increase window, if skb with such overhead + * will fit to rcvbuf in future. + */ + if (tcp_win_from_space(skb->truesize) <= skb->len) + incr = 2*tp->advmss; + else + incr = __tcp_grow_window(sk, tp, skb); + + if (incr) { + tp->rcv_ssthresh = min(tp->rcv_ssthresh + incr, tp->window_clamp); + tp->ack.quick |= 1; + } + } +#endif +} + +/* 3. 
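tcp_incr_quickack() above sizes the burst of undelayed ACKs from the window: roughly one quick ACK per two receiver-MSS of window, at least two, and capped at a maximum. A standalone restatement (the cap is passed as a parameter here because TCP_MAX_QUICKACKS is defined elsewhere in these headers):

#include <stdint.h>

/* Number of ACKs to send without delay after (re)entering quickack mode. */
static unsigned int quickack_count(uint32_t rcv_wnd, unsigned int rcv_mss,
                                   unsigned int max_quickacks)
{
    unsigned int quickacks = rcv_wnd / (2 * rcv_mss);

    if (quickacks == 0)
        quickacks = 2;
    return quickacks < max_quickacks ? quickacks : max_quickacks;
}

/* e.g. quickack_count(65535, 1460, 16) == 16 (22 before the cap). */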
Tuning rcvbuf, when connection enters established state. */ + +static void tcp_fixup_rcvbuf(struct sock *sk) +{ +#if 0 + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + int rcvmem = tp->advmss+MAX_TCP_HEADER+16+sizeof(struct sk_buff); + + /* Try to select rcvbuf so that 4 mss-sized segments + * will fit to window and correspoding skbs will fit to our rcvbuf. + * (was 3; 4 is minimum to allow fast retransmit to work.) + */ + while (tcp_win_from_space(rcvmem) < tp->advmss) + rcvmem += 128; + if (sk->rcvbuf < 4*rcvmem) + sk->rcvbuf = min(4*rcvmem, sysctl_tcp_rmem[2]); +#endif +} + +/* 4. Try to fixup all. It is made iimediately after connection enters + * established state. + */ +static void tcp_init_buffer_space(struct sock *sk) +{ +#if 0 + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + int maxwin; + + if (!(sk->userlocks&SOCK_RCVBUF_LOCK)) + tcp_fixup_rcvbuf(sk); + if (!(sk->userlocks&SOCK_SNDBUF_LOCK)) + tcp_fixup_sndbuf(sk); + + maxwin = tcp_full_space(sk); + + if (tp->window_clamp >= maxwin) { + tp->window_clamp = maxwin; + + if (sysctl_tcp_app_win && maxwin>4*tp->advmss) + tp->window_clamp = max(maxwin-(maxwin>>sysctl_tcp_app_win), 4*tp->advmss); + } + + /* Force reservation of one segment. */ + if (sysctl_tcp_app_win && + tp->window_clamp > 2*tp->advmss && + tp->window_clamp + tp->advmss > maxwin) + tp->window_clamp = max(2*tp->advmss, maxwin-tp->advmss); + + tp->rcv_ssthresh = min(tp->rcv_ssthresh, tp->window_clamp); + tp->snd_cwnd_stamp = tcp_time_stamp; +#endif +} + +/* 5. Recalculate window clamp after socket hit its memory bounds. */ +static void tcp_clamp_window(struct sock *sk, struct tcp_opt *tp) +{ +#if 0 + struct sk_buff *skb; + unsigned int app_win = tp->rcv_nxt - tp->copied_seq; + int ofo_win = 0; + + tp->ack.quick = 0; + + skb_queue_walk(&tp->out_of_order_queue, skb) { + ofo_win += skb->len; + } + + /* If overcommit is due to out of order segments, + * do not clamp window. Try to expand rcvbuf instead. + */ + if (ofo_win) { + if (sk->rcvbuf < sysctl_tcp_rmem[2] && + !(sk->userlocks&SOCK_RCVBUF_LOCK) && + !tcp_memory_pressure && + atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) + sk->rcvbuf = min(atomic_read(&sk->rmem_alloc), sysctl_tcp_rmem[2]); + } + if (atomic_read(&sk->rmem_alloc) > sk->rcvbuf) { + app_win += ofo_win; + if (atomic_read(&sk->rmem_alloc) >= 2*sk->rcvbuf) + app_win >>= 1; + if (app_win > tp->ack.rcv_mss) + app_win -= tp->ack.rcv_mss; + app_win = max(app_win, 2U*tp->advmss); + + if (!ofo_win) + tp->window_clamp = min(tp->window_clamp, app_win); + tp->rcv_ssthresh = min(tp->window_clamp, 2U*tp->advmss); + } +#endif +} + +/* There is something which you must keep in mind when you analyze the + * behavior of the tp->ato delayed ack timeout interval. When a + * connection starts up, we want to ack as quickly as possible. The + * problem is that "good" TCP's do slow start at the beginning of data + * transmission. The means that until we send the first few ACK's the + * sender will sit on his end and only queue most of his data, because + * he can only send snd_cwnd unacked packets at any given time. For + * each ACK we send, he increments snd_cwnd and transmits more of his + * queue. -DaveM + */ +static void tcp_event_data_recv(struct sock *sk, struct tcp_opt *tp, struct sk_buff *skb) +{ +#if 0 + u32 now; + + tcp_schedule_ack(tp); + + tcp_measure_rcv_mss(tp, skb); + + now = tcp_time_stamp; + + if (!tp->ack.ato) { + /* The _first_ data packet received, initialize + * delayed ACK engine. 
+ */ + tcp_incr_quickack(tp); + tp->ack.ato = TCP_ATO_MIN; + } else { + int m = now - tp->ack.lrcvtime; + + if (m <= TCP_ATO_MIN/2) { + /* The fastest case is the first. */ + tp->ack.ato = (tp->ack.ato>>1) + TCP_ATO_MIN/2; + } else if (m < tp->ack.ato) { + tp->ack.ato = (tp->ack.ato>>1) + m; + if (tp->ack.ato > tp->rto) + tp->ack.ato = tp->rto; + } else if (m > tp->rto) { + /* Too long gap. Apparently sender falled to + * restart window, so that we send ACKs quickly. + */ + tcp_incr_quickack(tp); + tcp_mem_reclaim(sk); + } + } + tp->ack.lrcvtime = now; + + TCP_ECN_check_ce(tp, skb); + + if (skb->len >= 128) + tcp_grow_window(sk, tp, skb); +#endif +} + +/* Called to compute a smoothed rtt estimate. The data fed to this + * routine either comes from timestamps, or from segments that were + * known _not_ to have been retransmitted [see Karn/Partridge + * Proceedings SIGCOMM 87]. The algorithm is from the SIGCOMM 88 + * piece by Van Jacobson. + * NOTE: the next three routines used to be one big routine. + * To save cycles in the RFC 1323 implementation it was better to break + * it up into three procedures. -- erics + */ +static __inline__ void tcp_rtt_estimator(struct tcp_opt *tp, __u32 mrtt) +{ +#if 0 + long m = mrtt; /* RTT */ + + /* The following amusing code comes from Jacobson's + * article in SIGCOMM '88. Note that rtt and mdev + * are scaled versions of rtt and mean deviation. + * This is designed to be as fast as possible + * m stands for "measurement". + * + * On a 1990 paper the rto value is changed to: + * RTO = rtt + 4 * mdev + * + * Funny. This algorithm seems to be very broken. + * These formulae increase RTO, when it should be decreased, increase + * too slowly, when it should be incresed fastly, decrease too fastly + * etc. I guess in BSD RTO takes ONE value, so that it is absolutely + * does not matter how to _calculate_ it. Seems, it was trap + * that VJ failed to avoid. 8) + */ + if(m == 0) + m = 1; + if (tp->srtt != 0) { + m -= (tp->srtt >> 3); /* m is now error in rtt est */ + tp->srtt += m; /* rtt = 7/8 rtt + 1/8 new */ + if (m < 0) { + m = -m; /* m is now abs(error) */ + m -= (tp->mdev >> 2); /* similar update on mdev */ + /* This is similar to one of Eifel findings. + * Eifel blocks mdev updates when rtt decreases. + * This solution is a bit different: we use finer gain + * for mdev in this case (alpha*beta). + * Like Eifel it also prevents growth of rto, + * but also it limits too fast rto decreases, + * happening in pure Eifel. + */ + if (m > 0) + m >>= 3; + } else { + m -= (tp->mdev >> 2); /* similar update on mdev */ + } + tp->mdev += m; /* mdev = 3/4 mdev + 1/4 new */ + if (tp->mdev > tp->mdev_max) { + tp->mdev_max = tp->mdev; + if (tp->mdev_max > tp->rttvar) + tp->rttvar = tp->mdev_max; + } + if (after(tp->snd_una, tp->rtt_seq)) { + if (tp->mdev_max < tp->rttvar) + tp->rttvar -= (tp->rttvar-tp->mdev_max)>>2; + tp->rtt_seq = tp->snd_nxt; + tp->mdev_max = TCP_RTO_MIN; + } + } else { + /* no previous measure. */ + tp->srtt = m<<3; /* take the measured time to be rtt */ + tp->mdev = m<<1; /* make sure rto = 3*rtt */ + tp->mdev_max = tp->rttvar = max(tp->mdev, TCP_RTO_MIN); + tp->rtt_seq = tp->snd_nxt; + } +#endif +} + +/* Calculate rto without backoff. This is the second half of Van Jacobson's + * routine referred to above. + */ +static __inline__ void tcp_set_rto(struct tcp_opt *tp) +{ +#if 0 + /* Old crap is replaced with new one. 8) + * + * More seriously: + * 1. If rtt variance happened to be less 50msec, it is hallucination. 
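tcp_rtt_estimator() above is Jacobson's SIGCOMM '88 filter in fixed point: srtt is stored scaled by 8 and mdev scaled by 4, so the gains of 1/8 and 1/4 turn into shifts. Stripped of the Eifel-style tweak on decreasing RTTs, the core update looks like the sketch below (the struct and function are mine; the arithmetic follows the driver):

#include <stdint.h>

struct rtt_est {
    uint32_t srtt;   /* smoothed RTT, scaled by 8   */
    uint32_t mdev;   /* mean deviation, scaled by 4 */
};

/* Feed one RTT measurement m (in jiffies) into the estimator. */
static void rtt_sample(struct rtt_est *e, long m)
{
    if (m == 0)
        m = 1;
    if (e->srtt != 0) {
        long err = m - (long)(e->srtt >> 3);   /* error vs. current estimate   */

        e->srtt += err;                        /* srtt = 7/8 srtt + 1/8 m      */
        if (err < 0)
            err = -err;
        e->mdev += err - (long)(e->mdev >> 2); /* mdev = 3/4 mdev + 1/4 |err|  */
    } else {
        e->srtt = (uint32_t)m << 3;            /* first sample seeds srtt       */
        e->mdev = (uint32_t)m << 1;            /* so that rto starts near 3*rtt */
    }
}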
+ * It cannot be less due to utterly erratic ACK generation made + * at least by solaris and freebsd. "Erratic ACKs" has _nothing_ + * to do with delayed acks, because at cwnd>2 true delack timeout + * is invisible. Actually, Linux-2.4 also generates erratic + * ACKs in some curcumstances. + */ + tp->rto = (tp->srtt >> 3) + tp->rttvar; + + /* 2. Fixups made earlier cannot be right. + * If we do not estimate RTO correctly without them, + * all the algo is pure shit and should be replaced + * with correct one. It is exaclty, which we pretend to do. + */ +#endif +} + +/* NOTE: clamping at TCP_RTO_MIN is not required, current algo + * guarantees that rto is higher. + */ +static __inline__ void tcp_bound_rto(struct tcp_opt *tp) +{ +#if 0 + if (tp->rto > TCP_RTO_MAX) + tp->rto = TCP_RTO_MAX; +#endif +} + +/* Save metrics learned by this TCP session. + This function is called only, when TCP finishes successfully + i.e. when it enters TIME-WAIT or goes from LAST-ACK to CLOSE. + */ +void tcp_update_metrics(struct sock *sk) +{ +#if 0 + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + struct dst_entry *dst = __sk_dst_get(sk); + + dst_confirm(dst); + + if (dst && (dst->flags&DST_HOST)) { + int m; + + if (tp->backoff || !tp->srtt) { + /* This session failed to estimate rtt. Why? + * Probably, no packets returned in time. + * Reset our results. + */ + if (!(dst->mxlock&(1<rtt = 0; + return; + } + + m = dst->rtt - tp->srtt; + + /* If newly calculated rtt larger than stored one, + * store new one. Otherwise, use EWMA. Remember, + * rtt overestimation is always better than underestimation. + */ + if (!(dst->mxlock&(1<rtt = tp->srtt; + else + dst->rtt -= (m>>3); + } + + if (!(dst->mxlock&(1<>= 1; + if (m < tp->mdev) + m = tp->mdev; + + if (m >= dst->rttvar) + dst->rttvar = m; + else + dst->rttvar -= (dst->rttvar - m)>>2; + } + + if (tp->snd_ssthresh >= 0xFFFF) { + /* Slow start still did not finish. */ + if (dst->ssthresh && + !(dst->mxlock&(1<snd_cwnd>>1) > dst->ssthresh) + dst->ssthresh = (tp->snd_cwnd>>1); + if (!(dst->mxlock&(1<snd_cwnd > dst->cwnd) + dst->cwnd = tp->snd_cwnd; + } else if (tp->snd_cwnd > tp->snd_ssthresh && + tp->ca_state == TCP_CA_Open) { + /* Cong. avoidance phase, cwnd is reliable. */ + if (!(dst->mxlock&(1<ssthresh = max(tp->snd_cwnd>>1, tp->snd_ssthresh); + if (!(dst->mxlock&(1<cwnd = (dst->cwnd + tp->snd_cwnd)>>1; + } else { + /* Else slow start did not finish, cwnd is non-sense, + ssthresh may be also invalid. + */ + if (!(dst->mxlock&(1<cwnd = (dst->cwnd + tp->snd_ssthresh)>>1; + if (dst->ssthresh && + !(dst->mxlock&(1<snd_ssthresh > dst->ssthresh) + dst->ssthresh = tp->snd_ssthresh; + } + + if (!(dst->mxlock&(1<reordering < tp->reordering && + tp->reordering != sysctl_tcp_reordering) + dst->reordering = tp->reordering; + } + } +#endif +} + +/* Increase initial CWND conservatively: if estimated + * RTT is low enough (<20msec) or if we have some preset ssthresh. + * + * Numbers are taken from RFC2414. + */ +__u32 tcp_init_cwnd(struct tcp_opt *tp) +{ +#if 0 + __u32 cwnd; + + if (tp->mss_cache > 1460) + return 2; + + cwnd = (tp->mss_cache > 1095) ? 3 : 4; + + if (!tp->srtt || (tp->snd_ssthresh >= 0xFFFF && tp->srtt > ((HZ/50)<<3))) + cwnd = 2; + else if (cwnd > tp->snd_ssthresh) + cwnd = tp->snd_ssthresh; + + return min_t(__u32, cwnd, tp->snd_cwnd_clamp); +#else + return 0; +#endif +} + +/* Initialize metrics on socket. 
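tcp_init_cwnd() above applies an RFC 2414-style byte-bounded initial window: bigger segments get fewer of them, so the initial burst stays roughly constant in bytes, which works out to 4 segments up to 1095 bytes of MSS, 3 up to 1460, and 2 beyond that. A standalone sketch of just that table (the additional RTT and ssthresh clamping in the driver is omitted):

#include <stdint.h>

/* RFC 2414-style initial congestion window, in segments, from the MSS. */
static uint32_t init_cwnd_segments(uint32_t mss)
{
    if (mss > 1460)
        return 2;
    return mss > 1095 ? 3 : 4;
}

/* e.g. init_cwnd_segments(536) == 4, init_cwnd_segments(1460) == 3,
 *      init_cwnd_segments(8960) == 2. */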
+/* Initialize metrics on socket. */
+
+static void tcp_init_metrics(struct sock *sk)
+{
+#if 0
+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+ struct dst_entry *dst = __sk_dst_get(sk);
+
+ if (dst == NULL)
+ goto reset;
+
+ dst_confirm(dst);
+
+ if (dst->mxlock&(1<<RTAX_CWND))
+ tp->snd_cwnd_clamp = dst->cwnd;
+ if (dst->ssthresh) {
+ tp->snd_ssthresh = dst->ssthresh;
+ if (tp->snd_ssthresh > tp->snd_cwnd_clamp)
+ tp->snd_ssthresh = tp->snd_cwnd_clamp;
+ }
+ if (dst->reordering && tp->reordering != dst->reordering) {
+ tp->sack_ok &= ~2;
+ tp->reordering = dst->reordering;
+ }
+
+ if (dst->rtt == 0)
+ goto reset;
+
+ if (!tp->srtt && dst->rtt < (TCP_TIMEOUT_INIT<<3))
+ goto reset;
+
+ /* Initial rtt is determined from SYN,SYN-ACK.
+ * The segment is small and rtt may appear much
+ * less than the real one. Use per-dst memory
+ * to make it more realistic.
+ *
+ * A bit of theory. RTT is the time passed after a "normal" sized packet
+ * is sent until it is ACKed. In normal circumstances sending small
+ * packets forces the peer to delay ACKs and the calculation is still
+ * correct. The algorithm is adaptive and, provided we follow specs, it
+ * NEVER underestimates RTT. BUT! If the peer plays clever tricks like
+ * "quick acks" for long enough to drive RTT down to a low value, and
+ * then abruptly stops doing it and starts delaying ACKs, expect trouble.
+ */
+ if (dst->rtt > tp->srtt)
+ tp->srtt = dst->rtt;
+ if (dst->rttvar > tp->mdev) {
+ tp->mdev = dst->rttvar;
+ tp->mdev_max = tp->rttvar = max(tp->mdev, TCP_RTO_MIN);
+ }
+ tcp_set_rto(tp);
+ tcp_bound_rto(tp);
+ if (tp->rto < TCP_TIMEOUT_INIT && !tp->saw_tstamp)
+ goto reset;
+ tp->snd_cwnd = tcp_init_cwnd(tp);
+ tp->snd_cwnd_stamp = tcp_time_stamp;
+ return;
+
+reset:
+ /* Play conservative. If timestamps are not
+ * supported, TCP will fail to recalculate the correct
+ * rtt, if the initial rto is too small. FORGET ALL AND RESET!
+ */
+ if (!tp->saw_tstamp && tp->srtt) {
+ tp->srtt = 0;
+ tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_INIT;
+ tp->rto = TCP_TIMEOUT_INIT;
+ }
+#endif
+}
+
+static void tcp_update_reordering(struct tcp_opt *tp, int metric, int ts)
+{
+#if 0
+ if (metric > tp->reordering) {
+ tp->reordering = min(TCP_MAX_REORDERING, metric);
+
+ /* This exciting event is worth remembering. 8) */
+ if (ts)
+ NET_INC_STATS_BH(TCPTSReorder);
+ else if (IsReno(tp))
+ NET_INC_STATS_BH(TCPRenoReorder);
+ else if (IsFack(tp))
+ NET_INC_STATS_BH(TCPFACKReorder);
+ else
+ NET_INC_STATS_BH(TCPSACKReorder);
+#if FASTRETRANS_DEBUG > 1
+ printk(KERN_DEBUG "Disorder%d %d %u f%u s%u rr%d\n",
+ tp->sack_ok, tp->ca_state,
+ tp->reordering, tp->fackets_out, tp->sacked_out,
+ tp->undo_marker ? tp->undo_retrans : 0);
+#endif
+ /* Disable FACK for now. */
+ tp->sack_ok &= ~2;
+ }
+#endif
+}
+
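+/* Editor's note: the S/R/L tag-bit bookkeeping described in the comment
+ * that follows can be summed up per segment as a tiny helper. This is an
+ * illustrative sketch only; the bit values and names below are hypothetical,
+ * while the driver's real flags are TCPCB_SACKED_ACKED, TCPCB_SACKED_RETRANS
+ * and TCPCB_LOST. The return values match the "Tag / InFlight" table in the
+ * comment below.
+ */
+#if 0
+#define SK_S 0x1 /* SACKed: the original reached the receiver */
+#define SK_R 0x2 /* a retransmitted copy is (still) in flight */
+#define SK_L 0x4 /* the original is considered lost by the net */
+
+static int sketch_in_flight(int tag)
+{
+ int n = 1; /* the original transmission */
+ if (tag & (SK_S | SK_L))
+ n--; /* original arrived or is gone */
+ if (tag & SK_R)
+ n++; /* plus the retransmitted copy */
+ return n; /* 0: S, L   1: none, L|R, S|R   2: R */
+}
+#endif
+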
+/* This procedure tags the retransmission queue when SACKs arrive.
+ *
+ * We have three tag bits: SACKED(S), RETRANS(R) and LOST(L).
+ * Packets in queue with these bits set are counted in variables
+ * sacked_out, retrans_out and lost_out, correspondingly.
+ *
+ * Valid combinations are:
+ * Tag InFlight Description
+ * 0 1 - orig segment is in flight.
+ * S 0 - nothing flies, orig reached receiver.
+ * L 0 - nothing flies, orig lost by net.
+ * R 2 - both orig and retransmit are in flight.
+ * L|R 1 - orig is lost, retransmit is in flight.
+ * S|R 1 - orig reached receiver, retrans is still in flight.
+ * (L|S|R is logically valid, it could occur when L|R is sacked,
+ * but it is equivalent to plain S and the code short-circuits it to S.
+ * L|S is logically invalid, it would mean -1 packet in flight 8))
+ *
+ * These 6 states form a finite state machine, controlled by the following events:
+ * 1. New ACK (+SACK) arrives. (tcp_sacktag_write_queue())
+ * 2. Retransmission. (tcp_retransmit_skb(), tcp_xmit_retransmit_queue())
+ * 3. Loss detection event of one of three flavors:
+ * A. Scoreboard estimator decided the packet is lost.
+ * A'. Reno "three dupacks" marks head of queue lost.
+ * A''. Its FACK modification, head until snd.fack is lost.
+ * B. SACK arrives sacking data transmitted after a never retransmitted
+ * hole was sent out.
+ * C. SACK arrives sacking SND.NXT at the moment when the
+ * segment was retransmitted.
+ * 4. D-SACK added a new rule: D-SACK changes any tag to S.
+ *
+ * It is pleasant to note that the state diagram turns out to be commutative,
+ * so we do not need to worry about the order of our actions
+ * when multiple events arrive simultaneously (see the function below).
+ *
+ * Reordering detection.
+ * --------------------
+ * The reordering metric is the maximal distance by which a packet can be
+ * displaced in the packet stream. With SACKs we can estimate it:
+ *
+ * 1. SACK fills an old hole and the corresponding segment was not
+ * ever retransmitted -> reordering. Alas, we cannot use it
+ * when the segment was retransmitted.
+ * 2. The last flaw is solved with D-SACK. A D-SACK arrives
+ * for a retransmitted and already SACKed segment -> reordering.
+ * Neither heuristic is used in the Loss state, where we cannot
+ * account for retransmits accurately.
+ */
+static int
+tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_una)
+{
+#if 0
+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+ unsigned char *ptr = ack_skb->h.raw + TCP_SKB_CB(ack_skb)->sacked;
+ struct tcp_sack_block *sp = (struct tcp_sack_block *)(ptr+2);
+ int num_sacks = (ptr[1] - TCPOLEN_SACK_BASE)>>3;
+ int reord = tp->packets_out;
+ int prior_fackets;
+ u32 lost_retrans = 0;
+ int flag = 0;
+ int i;
+
+ if (!tp->sacked_out)
+ tp->fackets_out = 0;
+ prior_fackets = tp->fackets_out;
+
+ for (i=0; i<num_sacks; i++, sp++) {
+ struct sk_buff *skb;
+ __u32 start_seq = ntohl(sp->start_seq);
+ __u32 end_seq = ntohl(sp->end_seq);
+ int fack_count = 0;
+ int dup_sack = 0;
+
+ /* Check for D-SACK. */
+ if (i == 0) {
+ u32 ack = TCP_SKB_CB(ack_skb)->ack_seq;
+
+ if (before(start_seq, ack)) {
+ dup_sack = 1;
+ tp->sack_ok |= 4;
+ NET_INC_STATS_BH(TCPDSACKRecv);
+ } else if (num_sacks > 1 &&
+ !after(end_seq, ntohl(sp[1].end_seq)) &&
+ !before(start_seq, ntohl(sp[1].start_seq))) {
+ dup_sack = 1;
+ tp->sack_ok |= 4;
+ NET_INC_STATS_BH(TCPDSACKOfoRecv);
+ }
+
+ /* D-SACK for already forgotten data...
+ * Do dumb counting. */
+ if (dup_sack &&
+ !after(end_seq, prior_snd_una) &&
+ after(end_seq, tp->undo_marker))
+ tp->undo_retrans--;
+
+ /* Eliminate too old ACKs, but take into
+ * account more or less fresh ones, they can
+ * contain valid SACK info.
+ */
+ if (before(ack, prior_snd_una-tp->max_window))
+ return 0;
+ }
+
+ /* Event "B" in the comment above. */
+ if (after(end_seq, tp->high_seq))
+ flag |= FLAG_DATA_LOST;
+
+ for_retrans_queue(skb, sk, tp) {
+ u8 sacked = TCP_SKB_CB(skb)->sacked;
+ int in_sack;
+
+ /* The retransmission queue is always in order, so
+ * we can short-circuit the walk early.
+ */
+ if(!before(TCP_SKB_CB(skb)->seq, end_seq))
+ break;
+
+ fack_count++;
+
+ in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) &&
+ !before(end_seq, TCP_SKB_CB(skb)->end_seq);
+
+ /* Account D-SACK for retransmitted packet.
*/ + if ((dup_sack && in_sack) && + (sacked & TCPCB_RETRANS) && + after(TCP_SKB_CB(skb)->end_seq, tp->undo_marker)) + tp->undo_retrans--; + + /* The frame is ACKed. */ + if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una)) { + if (sacked&TCPCB_RETRANS) { + if ((dup_sack && in_sack) && + (sacked&TCPCB_SACKED_ACKED)) + reord = min(fack_count, reord); + } else { + /* If it was in a hole, we detected reordering. */ + if (fack_count < prior_fackets && + !(sacked&TCPCB_SACKED_ACKED)) + reord = min(fack_count, reord); + } + + /* Nothing to do; acked frame is about to be dropped. */ + continue; + } + + if ((sacked&TCPCB_SACKED_RETRANS) && + after(end_seq, TCP_SKB_CB(skb)->ack_seq) && + (!lost_retrans || after(end_seq, lost_retrans))) + lost_retrans = end_seq; + + if (!in_sack) + continue; + + if (!(sacked&TCPCB_SACKED_ACKED)) { + if (sacked & TCPCB_SACKED_RETRANS) { + /* If the segment is not tagged as lost, + * we do not clear RETRANS, believing + * that retransmission is still in flight. + */ + if (sacked & TCPCB_LOST) { + TCP_SKB_CB(skb)->sacked &= ~(TCPCB_LOST|TCPCB_SACKED_RETRANS); + tp->lost_out--; + tp->retrans_out--; + } + } else { + /* New sack for not retransmitted frame, + * which was in hole. It is reordering. + */ + if (!(sacked & TCPCB_RETRANS) && + fack_count < prior_fackets) + reord = min(fack_count, reord); + + if (sacked & TCPCB_LOST) { + TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST; + tp->lost_out--; + } + } + + TCP_SKB_CB(skb)->sacked |= TCPCB_SACKED_ACKED; + flag |= FLAG_DATA_SACKED; + tp->sacked_out++; + + if (fack_count > tp->fackets_out) + tp->fackets_out = fack_count; + } else { + if (dup_sack && (sacked&TCPCB_RETRANS)) + reord = min(fack_count, reord); + } + + /* D-SACK. We can detect redundant retransmission + * in S|R and plain R frames and clear it. + * undo_retrans is decreased above, L|R frames + * are accounted above as well. + */ + if (dup_sack && + (TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS)) { + TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; + tp->retrans_out--; + } + } + } + + /* Check for lost retransmit. This superb idea is + * borrowed from "ratehalving". Event "C". + * Later note: FACK people cheated me again 8), + * we have to account for reordering! Ugly, + * but should help. 
+ */ + if (lost_retrans && tp->ca_state == TCP_CA_Recovery) { + struct sk_buff *skb; + + for_retrans_queue(skb, sk, tp) { + if (after(TCP_SKB_CB(skb)->seq, lost_retrans)) + break; + if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una)) + continue; + if ((TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS) && + after(lost_retrans, TCP_SKB_CB(skb)->ack_seq) && + (IsFack(tp) || + !before(lost_retrans, TCP_SKB_CB(skb)->ack_seq+tp->reordering*tp->mss_cache))) { + TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; + tp->retrans_out--; + + if (!(TCP_SKB_CB(skb)->sacked&(TCPCB_LOST|TCPCB_SACKED_ACKED))) { + tp->lost_out++; + TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; + flag |= FLAG_DATA_SACKED; + NET_INC_STATS_BH(TCPLostRetransmit); + } + } + } + } + + tp->left_out = tp->sacked_out + tp->lost_out; + + if (reord < tp->fackets_out && tp->ca_state != TCP_CA_Loss) + tcp_update_reordering(tp, (tp->fackets_out+1)-reord, 0); + +#if FASTRETRANS_DEBUG > 0 + BUG_TRAP((int)tp->sacked_out >= 0); + BUG_TRAP((int)tp->lost_out >= 0); + BUG_TRAP((int)tp->retrans_out >= 0); + BUG_TRAP((int)tcp_packets_in_flight(tp) >= 0); +#endif + return flag; +#else + return 0; +#endif +} + +void tcp_clear_retrans(struct tcp_opt *tp) +{ +#if 0 + tp->left_out = 0; + tp->retrans_out = 0; + + tp->fackets_out = 0; + tp->sacked_out = 0; + tp->lost_out = 0; + + tp->undo_marker = 0; + tp->undo_retrans = 0; +#endif +} + +/* Enter Loss state. If "how" is not zero, forget all SACK information + * and reset tags completely, otherwise preserve SACKs. If receiver + * dropped its ofo queue, we will know this due to reneging detection. + */ +void tcp_enter_loss(struct sock *sk, int how) +{ +#if 0 + struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; + struct sk_buff *skb; + int cnt = 0; + + /* Reduce ssthresh if it has not yet been made inside this window. */ + if (tp->ca_state <= TCP_CA_Disorder || + tp->snd_una == tp->high_seq || + (tp->ca_state == TCP_CA_Loss && !tp->retransmits)) { + tp->prior_ssthresh = tcp_current_ssthresh(tp); + tp->snd_ssthresh = tcp_recalc_ssthresh(tp); + } + tp->snd_cwnd = 1; + tp->snd_cwnd_cnt = 0; + tp->snd_cwnd_stamp = tcp_time_stamp; + + tcp_clear_retrans(tp); + + /* Push undo marker, if it was plain RTO and nothing + * was retransmitted. */ + if (!how) + tp->undo_marker = tp->snd_una; + + for_retrans_queue(skb, sk, tp) { + cnt++; + if (TCP_SKB_CB(skb)->sacked&TCPCB_RETRANS) + tp->undo_marker = 0; + TCP_SKB_CB(skb)->sacked &= (~TCPCB_TAGBITS)|TCPCB_SACKED_ACKED; + if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED) || how) { + TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED; + TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; + tp->lost_out++; + } else { + tp->sacked_out++; + tp->fackets_out = cnt; + } + } + tcp_sync_left_out(tp); + + tp->reordering = min_t(unsigned int, tp->reordering, sysctl_tcp_reordering); + tp->ca_state = TCP_CA_Loss; + tp->high_seq = tp->snd_nxt; + TCP_ECN_queue_cwr(tp); +#endif +} + +static int tcp_check_sack_reneging(struct sock *sk, struct tcp_opt *tp) +{ +#if 0 + struct sk_buff *skb; + + /* If ACK arrived pointing to a remembered SACK, + * it means that our remembered SACKs do not reflect + * real state of receiver i.e. + * receiver _host_ is heavily congested (or buggy). + * Do processing similar to RTO timeout. 
+ */ + if ((skb = skb_peek(&sk->write_queue)) != NULL && + (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) { + NET_INC_STATS_BH(TCPSACKReneging); + + tcp_enter_loss(sk, 1); + tp->retransmits++; + tcp_retransmit_skb(sk, skb_peek(&sk->write_queue)); + tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); + return 1; + } + return 0; +#else + return 0; +#endif +} + +static inline int tcp_fackets_out(struct tcp_opt *tp) +{ +#if 0 + return IsReno(tp) ? tp->sacked_out+1 : tp->fackets_out; +#else + return 0; +#endif +} + +static inline int tcp_skb_timedout(struct tcp_opt *tp, struct sk_buff *skb) +{ +#if 0 + return (tcp_time_stamp - TCP_SKB_CB(skb)->when > tp->rto); +#else + return 0; +#endif +} + +static inline int tcp_head_timedout(struct sock *sk, struct tcp_opt *tp) +{ +#if 0 + return tp->packets_out && tcp_skb_timedout(tp, skb_peek(&sk->write_queue)); +#else + return 0; +#endif +} + +/* Linux NewReno/SACK/FACK/ECN state machine. + * -------------------------------------- + * + * "Open" Normal state, no dubious events, fast path. + * "Disorder" In all the respects it is "Open", + * but requires a bit more attention. It is entered when + * we see some SACKs or dupacks. It is split of "Open" + * mainly to move some processing from fast path to slow one. + * "CWR" CWND was reduced due to some Congestion Notification event. + * It can be ECN, ICMP source quench, local device congestion. + * "Recovery" CWND was reduced, we are fast-retransmitting. + * "Loss" CWND was reduced due to RTO timeout or SACK reneging. + * + * tcp_fastretrans_alert() is entered: + * - each incoming ACK, if state is not "Open" + * - when arrived ACK is unusual, namely: + * * SACK + * * Duplicate ACK. + * * ECN ECE. + * + * Counting packets in flight is pretty simple. + * + * in_flight = packets_out - left_out + retrans_out + * + * packets_out is SND.NXT-SND.UNA counted in packets. + * + * retrans_out is number of retransmitted segments. + * + * left_out is number of segments left network, but not ACKed yet. + * + * left_out = sacked_out + lost_out + * + * sacked_out: Packets, which arrived to receiver out of order + * and hence not ACKed. With SACKs this number is simply + * amount of SACKed data. Even without SACKs + * it is easy to give pretty reliable estimate of this number, + * counting duplicate ACKs. + * + * lost_out: Packets lost by network. TCP has no explicit + * "loss notification" feedback from network (for now). + * It means that this number can be only _guessed_. + * Actually, it is the heuristics to predict lossage that + * distinguishes different algorithms. + * + * F.e. after RTO, when all the queue is considered as lost, + * lost_out = packets_out and in_flight = retrans_out. + * + * Essentially, we have now two algorithms counting + * lost packets. + * + * FACK: It is the simplest heuristics. As soon as we decided + * that something is lost, we decide that _all_ not SACKed + * packets until the most forward SACK are lost. I.e. + * lost_out = fackets_out - sacked_out and left_out = fackets_out. + * It is absolutely correct estimate, if network does not reorder + * packets. And it loses any connection to reality when reordering + * takes place. We use FACK by default until reordering + * is suspected on the path to this destination. + * + * NewReno: when Recovery is entered, we assume that one segment + * is lost (classic Reno). While we are in Recovery and + * a partial ACK arrives, we assume that one more packet + * is lost (NewReno). This heuristics are the same in NewReno + * and SACK. 
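+ *
+ * Editor's note: in code form, the accounting described above reduces to
+ * the expression below. This is a minimal sketch; the field names mirror
+ * struct tcp_opt as used throughout this file, the helper name is
+ * hypothetical, and the driver's real helper is tcp_packets_in_flight().
+ */
+#if 0
+static __inline__ unsigned int sketch_packets_in_flight(const struct tcp_opt *tp)
+{
+ /* left_out = sacked_out + lost_out; see tcp_sync_left_out(). */
+ return tp->packets_out - (tp->sacked_out + tp->lost_out) + tp->retrans_out;
+}
+#endif
+
+/* (State-machine description, continued.)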
+ * + * Imagine, that's all! Forget about all this shamanism about CWND inflation + * deflation etc. CWND is real congestion window, never inflated, changes + * only according to classic VJ rules. + * + * Really tricky (and requiring careful tuning) part of algorithm + * is hidden in functions tcp_time_to_recover() and tcp_xmit_retransmit_queue(). + * The first determines the moment _when_ we should reduce CWND and, + * hence, slow down forward transmission. In fact, it determines the moment + * when we decide that hole is caused by loss, rather than by a reorder. + * + * tcp_xmit_retransmit_queue() decides, _what_ we should retransmit to fill + * holes, caused by lost packets. + * + * And the most logically complicated part of algorithm is undo + * heuristics. We detect false retransmits due to both too early + * fast retransmit (reordering) and underestimated RTO, analyzing + * timestamps and D-SACKs. When we detect that some segments were + * retransmitted by mistake and CWND reduction was wrong, we undo + * window reduction and abort recovery phase. This logic is hidden + * inside several functions named tcp_try_undo_. + */ + +/* This function decides, when we should leave Disordered state + * and enter Recovery phase, reducing congestion window. + * + * Main question: may we further continue forward transmission + * with the same cwnd? + */ +static int +tcp_time_to_recover(struct sock *sk, struct tcp_opt *tp) +{ +#if 0 + /* Trick#1: The loss is proven. */ + if (tp->lost_out) + return 1; + + /* Not-A-Trick#2 : Classic rule... */ + if (tcp_fackets_out(tp) > tp->reordering) + return 1; + + /* Trick#3 : when we use RFC2988 timer restart, fast + * retransmit can be triggered by timeout of queue head. + */ + if (tcp_head_timedout(sk, tp)) + return 1; + + /* Trick#4: It is still not OK... But will it be useful to delay + * recovery more? + */ + if (tp->packets_out <= tp->reordering && + tp->sacked_out >= max_t(__u32, tp->packets_out/2, sysctl_tcp_reordering) && + !tcp_may_send_now(sk, tp)) { + /* We have nothing to send. This connection is limited + * either by receiver window or by application. + */ + return 1; + } + + return 0; +#else + return 0; +#endif +} + +/* If we receive more dupacks than we expected counting segments + * in assumption of absent reordering, interpret this as reordering. + * The only another reason could be bug in receiver TCP. + */ +static void tcp_check_reno_reordering(struct tcp_opt *tp, int addend) +{ +#if 0 + u32 holes; + + holes = max(tp->lost_out, 1U); + holes = min(holes, tp->packets_out); + + if (tp->sacked_out + holes > tp->packets_out) { + tp->sacked_out = tp->packets_out - holes; + tcp_update_reordering(tp, tp->packets_out+addend, 0); + } +#endif +} + +/* Emulate SACKs for SACKless connection: account for a new dupack. */ + +static void tcp_add_reno_sack(struct tcp_opt *tp) +{ +#if 0 + ++tp->sacked_out; + tcp_check_reno_reordering(tp, 0); + tcp_sync_left_out(tp); +#endif +} + +/* Account for ACK, ACKing some data in Reno Recovery phase. */ + +static void tcp_remove_reno_sacks(struct sock *sk, struct tcp_opt *tp, int acked) +{ +#if 0 + if (acked > 0) { + /* One ACK acked hole. The rest eat duplicate ACKs. */ + if (acked-1 >= tp->sacked_out) + tp->sacked_out = 0; + else + tp->sacked_out -= acked-1; + } + tcp_check_reno_reordering(tp, acked); + tcp_sync_left_out(tp); +#endif +} + +static inline void tcp_reset_reno_sack(struct tcp_opt *tp) +{ +#if 0 + tp->sacked_out = 0; + tp->left_out = tp->lost_out; +#endif +} + +/* Mark head of queue up as lost. 
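+ *
+ * (Editor's note: "packets" is the number of segments to tag, counted from
+ * the head of the retransmission queue; the walk below stops early once a
+ * segment ending beyond "high_seq" is reached, and segments that already
+ * carry a tag bit are skipped so lost_out is not inflated.)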
*/ +static void +tcp_mark_head_lost(struct sock *sk, struct tcp_opt *tp, int packets, u32 high_seq) +{ +#if 0 + struct sk_buff *skb; + int cnt = packets; + + BUG_TRAP(cnt <= tp->packets_out); + + for_retrans_queue(skb, sk, tp) { + if (--cnt < 0 || after(TCP_SKB_CB(skb)->end_seq, high_seq)) + break; + if (!(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) { + TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; + tp->lost_out++; + } + } + tcp_sync_left_out(tp); +#endif +} + +/* Account newly detected lost packet(s) */ + +static void tcp_update_scoreboard(struct sock *sk, struct tcp_opt *tp) +{ +#if 0 + if (IsFack(tp)) { + int lost = tp->fackets_out - tp->reordering; + if (lost <= 0) + lost = 1; + tcp_mark_head_lost(sk, tp, lost, tp->high_seq); + } else { + tcp_mark_head_lost(sk, tp, 1, tp->high_seq); + } + + /* New heuristics: it is possible only after we switched + * to restart timer each time when something is ACKed. + * Hence, we can detect timed out packets during fast + * retransmit without falling to slow start. + */ + if (tcp_head_timedout(sk, tp)) { + struct sk_buff *skb; + + for_retrans_queue(skb, sk, tp) { + if (tcp_skb_timedout(tp, skb) && + !(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) { + TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; + tp->lost_out++; + } + } + tcp_sync_left_out(tp); + } +#endif +} + +/* CWND moderation, preventing bursts due to too big ACKs + * in dubious situations. + */ +static __inline__ void tcp_moderate_cwnd(struct tcp_opt *tp) +{ +#if 0 + tp->snd_cwnd = min(tp->snd_cwnd, + tcp_packets_in_flight(tp)+tcp_max_burst(tp)); + tp->snd_cwnd_stamp = tcp_time_stamp; +#endif +} + +/* Decrease cwnd each second ack. */ + +static void tcp_cwnd_down(struct tcp_opt *tp) +{ +#if 0 + int decr = tp->snd_cwnd_cnt + 1; + + tp->snd_cwnd_cnt = decr&1; + decr >>= 1; + + if (decr && tp->snd_cwnd > tp->snd_ssthresh/2) + tp->snd_cwnd -= decr; + + tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp)+1); + tp->snd_cwnd_stamp = tcp_time_stamp; +#endif +} + +/* Nothing was retransmitted or returned timestamp is less + * than timestamp of the first retransmission. + */ +static __inline__ int tcp_packet_delayed(struct tcp_opt *tp) +{ +#if 0 + return !tp->retrans_stamp || + (tp->saw_tstamp && tp->rcv_tsecr && + (__s32)(tp->rcv_tsecr - tp->retrans_stamp) < 0); +#else + return 0; +#endif +} + +/* Undo procedures. */ + +#if FASTRETRANS_DEBUG > 1 +static void DBGUNDO(struct sock *sk, struct tcp_opt *tp, const char *msg) +{ +#if 0 + printk(KERN_DEBUG "Undo %s %u.%u.%u.%u/%u c%u l%u ss%u/%u p%u\n", + msg, + NIPQUAD(sk->daddr), ntohs(sk->dport), + tp->snd_cwnd, tp->left_out, + tp->snd_ssthresh, tp->prior_ssthresh, tp->packets_out); +#endif +} +#else +#define DBGUNDO(x...) do { } while (0) +#endif + +static void tcp_undo_cwr(struct tcp_opt *tp, int undo) +{ +#if 0 + if (tp->prior_ssthresh) { + tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh<<1); + + if (undo && tp->prior_ssthresh > tp->snd_ssthresh) { + tp->snd_ssthresh = tp->prior_ssthresh; + TCP_ECN_withdraw_cwr(tp); + } + } else { + tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh); + } + tcp_moderate_cwnd(tp); + tp->snd_cwnd_stamp = tcp_time_stamp; +#endif +} + +static inline int tcp_may_undo(struct tcp_opt *tp) +{ +#if 0 + return tp->undo_marker && + (!tp->undo_retrans || tcp_packet_delayed(tp)); +#else + return 0; +#endif +} + +/* People celebrate: "We love our President!" */ +static int tcp_try_undo_recovery(struct sock *sk, struct tcp_opt *tp) +{ +#if 0 + if (tcp_may_undo(tp)) { + /* Happy end! 
We did not retransmit anything + * or our original transmission succeeded. + */ + DBGUNDO(sk, tp, tp->ca_state == TCP_CA_Loss ? "loss" : "retrans"); + tcp_undo_cwr(tp, 1); + if (tp->ca_state == TCP_CA_Loss) + NET_INC_STATS_BH(TCPLossUndo); + else + NET_INC_STATS_BH(TCPFullUndo); + tp->undo_marker = 0; + } + if (tp->snd_una == tp->high_seq && IsReno(tp)) { + /* Hold old state until something *above* high_seq + * is ACKed. For Reno it is MUST to prevent false + * fast retransmits (RFC2582). SACK TCP is safe. */ + tcp_moderate_cwnd(tp); + return 1; + } + tp->ca_state = TCP_CA_Open; + return 0; +#else + return 0; +#endif +} + +/* Try to undo cwnd reduction, because D-SACKs acked all retransmitted data */ +static void tcp_try_undo_dsack(struct sock *sk, struct tcp_opt *tp) +{ +#if 0 + if (tp->undo_marker && !tp->undo_retrans) { + DBGUNDO(sk, tp, "D-SACK"); + tcp_undo_cwr(tp, 1); + tp->undo_marker = 0; + NET_INC_STATS_BH(TCPDSACKUndo); + } +#endif +} + +/* Undo during fast recovery after partial ACK. */ + +static int tcp_try_undo_partial(struct sock *sk, struct tcp_opt *tp, int acked) +{ +#if 0 + /* Partial ACK arrived. Force Hoe's retransmit. */ + int failed = IsReno(tp) || tp->fackets_out>tp->reordering; + + if (tcp_may_undo(tp)) { + /* Plain luck! Hole if filled with delayed + * packet, rather than with a retransmit. + */ + if (tp->retrans_out == 0) + tp->retrans_stamp = 0; + + tcp_update_reordering(tp, tcp_fackets_out(tp)+acked, 1); + + DBGUNDO(sk, tp, "Hoe"); + tcp_undo_cwr(tp, 0); + NET_INC_STATS_BH(TCPPartialUndo); + + /* So... Do not make Hoe's retransmit yet. + * If the first packet was delayed, the rest + * ones are most probably delayed as well. + */ + failed = 0; + } + return failed; +#else + return 0; +#endif +} + +/* Undo during loss recovery after partial ACK. */ +static int tcp_try_undo_loss(struct sock *sk, struct tcp_opt *tp) +{ +#if 0 + if (tcp_may_undo(tp)) { + struct sk_buff *skb; + for_retrans_queue(skb, sk, tp) { + TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST; + } + DBGUNDO(sk, tp, "partial loss"); + tp->lost_out = 0; + tp->left_out = tp->sacked_out; + tcp_undo_cwr(tp, 1); + NET_INC_STATS_BH(TCPLossUndo); + tp->retransmits = 0; + tp->undo_marker = 0; + if (!IsReno(tp)) + tp->ca_state = TCP_CA_Open; + return 1; + } + return 0; +#else + return 0; +#endif +} + +static __inline__ void tcp_complete_cwr(struct tcp_opt *tp) +{ +#if 0 + tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh); + tp->snd_cwnd_stamp = tcp_time_stamp; +#endif +} + +static void tcp_try_to_open(struct sock *sk, struct tcp_opt *tp, int flag) +{ +#if 0 + tp->left_out = tp->sacked_out; + + if (tp->retrans_out == 0) + tp->retrans_stamp = 0; + + if (flag&FLAG_ECE) + tcp_enter_cwr(tp); + + if (tp->ca_state != TCP_CA_CWR) { + int state = TCP_CA_Open; + + if (tp->left_out || + tp->retrans_out || + tp->undo_marker) + state = TCP_CA_Disorder; + + if (tp->ca_state != state) { + tp->ca_state = state; + tp->high_seq = tp->snd_nxt; + } + tcp_moderate_cwnd(tp); + } else { + tcp_cwnd_down(tp); + } +#endif +} + +/* Process an event, which can update packets-in-flight not trivially. + * Main goal of this function is to calculate new estimate for left_out, + * taking into account both packets sitting in receiver's buffer and + * packets lost by network. + * + * Besides that it does CWND reduction, when packet loss is detected + * and changes state of machine. + * + * It does _not_ decide what to send, it is made in function + * tcp_xmit_retransmit_queue(). 
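+ *
+ * (Editor's note: the body below follows the lettered steps in this comment:
+ * A. handle ECE, B. check for SACK reneging, C. apply the FLAG_DATA_LOST
+ * hint, D. resynchronize left_out, E. leave the current state once high_seq
+ * is ACKed, F. per-state processing, and finally the common tail that marks
+ * newly lost segments, lowers cwnd and retransmits.)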
+ */ +static void +tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, + int prior_packets, int flag) +{ +#if 0 + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + int is_dupack = (tp->snd_una == prior_snd_una && !(flag&FLAG_NOT_DUP)); + + /* Some technical things: + * 1. Reno does not count dupacks (sacked_out) automatically. */ + if (!tp->packets_out) + tp->sacked_out = 0; + /* 2. SACK counts snd_fack in packets inaccurately. */ + if (tp->sacked_out == 0) + tp->fackets_out = 0; + + /* Now state machine starts. + * A. ECE, hence prohibit cwnd undoing, the reduction is required. */ + if (flag&FLAG_ECE) + tp->prior_ssthresh = 0; + + /* B. In all the states check for reneging SACKs. */ + if (tp->sacked_out && tcp_check_sack_reneging(sk, tp)) + return; + + /* C. Process data loss notification, provided it is valid. */ + if ((flag&FLAG_DATA_LOST) && + before(tp->snd_una, tp->high_seq) && + tp->ca_state != TCP_CA_Open && + tp->fackets_out > tp->reordering) { + tcp_mark_head_lost(sk, tp, tp->fackets_out-tp->reordering, tp->high_seq); + NET_INC_STATS_BH(TCPLoss); + } + + /* D. Synchronize left_out to current state. */ + tcp_sync_left_out(tp); + + /* E. Check state exit conditions. State can be terminated + * when high_seq is ACKed. */ + if (tp->ca_state == TCP_CA_Open) { + BUG_TRAP(tp->retrans_out == 0); + tp->retrans_stamp = 0; + } else if (!before(tp->snd_una, tp->high_seq)) { + switch (tp->ca_state) { + case TCP_CA_Loss: + tp->retransmits = 0; + if (tcp_try_undo_recovery(sk, tp)) + return; + break; + + case TCP_CA_CWR: + /* CWR is to be held something *above* high_seq + * is ACKed for CWR bit to reach receiver. */ + if (tp->snd_una != tp->high_seq) { + tcp_complete_cwr(tp); + tp->ca_state = TCP_CA_Open; + } + break; + + case TCP_CA_Disorder: + tcp_try_undo_dsack(sk, tp); + if (!tp->undo_marker || + /* For SACK case do not Open to allow to undo + * catching for all duplicate ACKs. */ + IsReno(tp) || tp->snd_una != tp->high_seq) { + tp->undo_marker = 0; + tp->ca_state = TCP_CA_Open; + } + break; + + case TCP_CA_Recovery: + if (IsReno(tp)) + tcp_reset_reno_sack(tp); + if (tcp_try_undo_recovery(sk, tp)) + return; + tcp_complete_cwr(tp); + break; + } + } + + /* F. Process state. */ + switch (tp->ca_state) { + case TCP_CA_Recovery: + if (prior_snd_una == tp->snd_una) { + if (IsReno(tp) && is_dupack) + tcp_add_reno_sack(tp); + } else { + int acked = prior_packets - tp->packets_out; + if (IsReno(tp)) + tcp_remove_reno_sacks(sk, tp, acked); + is_dupack = tcp_try_undo_partial(sk, tp, acked); + } + break; + case TCP_CA_Loss: + if (flag&FLAG_DATA_ACKED) + tp->retransmits = 0; + if (!tcp_try_undo_loss(sk, tp)) { + tcp_moderate_cwnd(tp); + tcp_xmit_retransmit_queue(sk); + return; + } + if (tp->ca_state != TCP_CA_Open) + return; + /* Loss is undone; fall through to processing in Open state. 
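+ * (i.e. execution continues into the "default:" arm below, which does the
+ * Open/Disorder dupack accounting and decides whether to enter Recovery.)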
*/ + default: + if (IsReno(tp)) { + if (tp->snd_una != prior_snd_una) + tcp_reset_reno_sack(tp); + if (is_dupack) + tcp_add_reno_sack(tp); + } + + if (tp->ca_state == TCP_CA_Disorder) + tcp_try_undo_dsack(sk, tp); + + if (!tcp_time_to_recover(sk, tp)) { + tcp_try_to_open(sk, tp, flag); + return; + } + + /* Otherwise enter Recovery state */ + + if (IsReno(tp)) + NET_INC_STATS_BH(TCPRenoRecovery); + else + NET_INC_STATS_BH(TCPSackRecovery); + + tp->high_seq = tp->snd_nxt; + tp->prior_ssthresh = 0; + tp->undo_marker = tp->snd_una; + tp->undo_retrans = tp->retrans_out; + + if (tp->ca_state < TCP_CA_CWR) { + if (!(flag&FLAG_ECE)) + tp->prior_ssthresh = tcp_current_ssthresh(tp); + tp->snd_ssthresh = tcp_recalc_ssthresh(tp); + TCP_ECN_queue_cwr(tp); + } + + tp->snd_cwnd_cnt = 0; + tp->ca_state = TCP_CA_Recovery; + } + + if (is_dupack || tcp_head_timedout(sk, tp)) + tcp_update_scoreboard(sk, tp); + tcp_cwnd_down(tp); + tcp_xmit_retransmit_queue(sk); +#endif +} + +/* Read draft-ietf-tcplw-high-performance before mucking + * with this code. (Superceeds RFC1323) + */ +static void tcp_ack_saw_tstamp(struct tcp_opt *tp, int flag) +{ +#if 0 + __u32 seq_rtt; + + /* RTTM Rule: A TSecr value received in a segment is used to + * update the averaged RTT measurement only if the segment + * acknowledges some new data, i.e., only if it advances the + * left edge of the send window. + * + * See draft-ietf-tcplw-high-performance-00, section 3.3. + * 1998/04/10 Andrey V. Savochkin + * + * Changed: reset backoff as soon as we see the first valid sample. + * If we do not, we get strongly overstimated rto. With timestamps + * samples are accepted even from very old segments: f.e., when rtt=1 + * increases to 8, we retransmit 5 times and after 8 seconds delayed + * answer arrives rto becomes 120 seconds! If at least one of segments + * in window is lost... Voila. --ANK (010210) + */ + seq_rtt = tcp_time_stamp - tp->rcv_tsecr; + tcp_rtt_estimator(tp, seq_rtt); + tcp_set_rto(tp); + tp->backoff = 0; + tcp_bound_rto(tp); +#endif +} + +static void tcp_ack_no_tstamp(struct tcp_opt *tp, u32 seq_rtt, int flag) +{ +#if 0 + /* We don't have a timestamp. Can only use + * packets that are not retransmitted to determine + * rtt estimates. Also, we must not reset the + * backoff for rto until we get a non-retransmitted + * packet. This allows us to deal with a situation + * where the network delay has increased suddenly. + * I.e. Karn's algorithm. (SIGCOMM '87, p5.) + */ + + if (flag & FLAG_RETRANS_DATA_ACKED) + return; + + tcp_rtt_estimator(tp, seq_rtt); + tcp_set_rto(tp); + tp->backoff = 0; + tcp_bound_rto(tp); +#endif +} + +static __inline__ void +tcp_ack_update_rtt(struct tcp_opt *tp, int flag, s32 seq_rtt) +{ +#if 0 + /* Note that peer MAY send zero echo. In this case it is ignored. (rfc1323) */ + if (tp->saw_tstamp && tp->rcv_tsecr) + tcp_ack_saw_tstamp(tp, flag); + else if (seq_rtt >= 0) + tcp_ack_no_tstamp(tp, seq_rtt, flag); +#endif +} + +/* This is Jacobson's slow start and congestion avoidance. + * SIGCOMM '88, p. 328. + */ +static __inline__ void tcp_cong_avoid(struct tcp_opt *tp) +{ +#if 0 + if (tp->snd_cwnd <= tp->snd_ssthresh) { + /* In "safe" area, increase. */ + if (tp->snd_cwnd < tp->snd_cwnd_clamp) + tp->snd_cwnd++; + } else { + /* In dangerous area, increase slowly. 
+ * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd + */ + if (tp->snd_cwnd_cnt >= tp->snd_cwnd) { + if (tp->snd_cwnd < tp->snd_cwnd_clamp) + tp->snd_cwnd++; + tp->snd_cwnd_cnt=0; + } else + tp->snd_cwnd_cnt++; + } + tp->snd_cwnd_stamp = tcp_time_stamp; +#endif +} + +/* Restart timer after forward progress on connection. + * RFC2988 recommends to restart timer to now+rto. + */ + +static __inline__ void tcp_ack_packets_out(struct sock *sk, struct tcp_opt *tp) +{ +#if 0 + if (tp->packets_out==0) { + tcp_clear_xmit_timer(sk, TCP_TIME_RETRANS); + } else { + tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); + } +#endif +} + +/* Remove acknowledged frames from the retransmission queue. */ +static int tcp_clean_rtx_queue(struct sock *sk) +{ +#if 0 + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + struct sk_buff *skb; + __u32 now = tcp_time_stamp; + int acked = 0; + __s32 seq_rtt = -1; + + while((skb=skb_peek(&sk->write_queue)) && (skb != tp->send_head)) { + struct tcp_skb_cb *scb = TCP_SKB_CB(skb); + __u8 sacked = scb->sacked; + + /* If our packet is before the ack sequence we can + * discard it as it's confirmed to have arrived at + * the other end. + */ + if (after(scb->end_seq, tp->snd_una)) + break; + + /* Initial outgoing SYN's get put onto the write_queue + * just like anything else we transmit. It is not + * true data, and if we misinform our callers that + * this ACK acks real data, we will erroneously exit + * connection startup slow start one packet too + * quickly. This is severely frowned upon behavior. + */ + if(!(scb->flags & TCPCB_FLAG_SYN)) { + acked |= FLAG_DATA_ACKED; + } else { + acked |= FLAG_SYN_ACKED; + tp->retrans_stamp = 0; + } + + if (sacked) { + if(sacked & TCPCB_RETRANS) { + if(sacked & TCPCB_SACKED_RETRANS) + tp->retrans_out--; + acked |= FLAG_RETRANS_DATA_ACKED; + seq_rtt = -1; + } else if (seq_rtt < 0) + seq_rtt = now - scb->when; + if(sacked & TCPCB_SACKED_ACKED) + tp->sacked_out--; + if(sacked & TCPCB_LOST) + tp->lost_out--; + if(sacked & TCPCB_URG) { + if (tp->urg_mode && + !before(scb->end_seq, tp->snd_up)) + tp->urg_mode = 0; + } + } else if (seq_rtt < 0) + seq_rtt = now - scb->when; + if(tp->fackets_out) + tp->fackets_out--; + tp->packets_out--; + __skb_unlink(skb, skb->list); + tcp_free_skb(sk, skb); + } + + if (acked&FLAG_ACKED) { + tcp_ack_update_rtt(tp, acked, seq_rtt); + tcp_ack_packets_out(sk, tp); + } + +#if FASTRETRANS_DEBUG > 0 + BUG_TRAP((int)tp->sacked_out >= 0); + BUG_TRAP((int)tp->lost_out >= 0); + BUG_TRAP((int)tp->retrans_out >= 0); + if (tp->packets_out==0 && tp->sack_ok) { + if (tp->lost_out) { + printk(KERN_DEBUG "Leak l=%u %d\n", tp->lost_out, tp->ca_state); + tp->lost_out = 0; + } + if (tp->sacked_out) { + printk(KERN_DEBUG "Leak s=%u %d\n", tp->sacked_out, tp->ca_state); + tp->sacked_out = 0; + } + if (tp->retrans_out) { + printk(KERN_DEBUG "Leak r=%u %d\n", tp->retrans_out, tp->ca_state); + tp->retrans_out = 0; + } + } +#endif + return acked; +#else + return 0; +#endif +} + +static void tcp_ack_probe(struct sock *sk) +{ +#if 0 + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + + /* Was it a usable window open? */ + + if (!after(TCP_SKB_CB(tp->send_head)->end_seq, tp->snd_una + tp->snd_wnd)) { + tp->backoff = 0; + tcp_clear_xmit_timer(sk, TCP_TIME_PROBE0); + /* Socket must be waked up by subsequent tcp_data_snd_check(). + * This function is not for random using! 
+ */ + } else { + tcp_reset_xmit_timer(sk, TCP_TIME_PROBE0, + min(tp->rto << tp->backoff, TCP_RTO_MAX)); + } +#endif +} + +static __inline__ int tcp_ack_is_dubious(struct tcp_opt *tp, int flag) +{ +#if 0 + return (!(flag & FLAG_NOT_DUP) || (flag & FLAG_CA_ALERT) || + tp->ca_state != TCP_CA_Open); +#else + return 0; +#endif +} + +static __inline__ int tcp_may_raise_cwnd(struct tcp_opt *tp, int flag) +{ +#if 0 + return (!(flag & FLAG_ECE) || tp->snd_cwnd < tp->snd_ssthresh) && + !((1<ca_state)&(TCPF_CA_Recovery|TCPF_CA_CWR)); +#else + return 0; +#endif +} + +/* Check that window update is acceptable. + * The function assumes that snd_una<=ack<=snd_next. + */ +static __inline__ int +tcp_may_update_window(struct tcp_opt *tp, u32 ack, u32 ack_seq, u32 nwin) +{ +#if 0 + return (after(ack, tp->snd_una) || + after(ack_seq, tp->snd_wl1) || + (ack_seq == tp->snd_wl1 && nwin > tp->snd_wnd)); +#else + return 0; +#endif +} + +/* Update our send window. + * + * Window update algorithm, described in RFC793/RFC1122 (used in linux-2.2 + * and in FreeBSD. NetBSD's one is even worse.) is wrong. + */ +static int tcp_ack_update_window(struct sock *sk, struct tcp_opt *tp, + struct sk_buff *skb, u32 ack, u32 ack_seq) +{ +#if 0 + int flag = 0; + u32 nwin = ntohs(skb->h.th->window) << tp->snd_wscale; + + if (tcp_may_update_window(tp, ack, ack_seq, nwin)) { + flag |= FLAG_WIN_UPDATE; + tcp_update_wl(tp, ack, ack_seq); + + if (tp->snd_wnd != nwin) { + tp->snd_wnd = nwin; + + /* Note, it is the only place, where + * fast path is recovered for sending TCP. + */ + tcp_fast_path_check(sk, tp); + + if (nwin > tp->max_window) { + tp->max_window = nwin; + tcp_sync_mss(sk, tp->pmtu_cookie); + } + } + } + + tp->snd_una = ack; + + return flag; +#else + return 0; +#endif +} + +/* This routine deals with incoming acks, but not outgoing ones. */ +static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) +{ +#if 0 + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + u32 prior_snd_una = tp->snd_una; + u32 ack_seq = TCP_SKB_CB(skb)->seq; + u32 ack = TCP_SKB_CB(skb)->ack_seq; + u32 prior_in_flight; + int prior_packets; + + /* If the ack is newer than sent or older than previous acks + * then we can probably ignore it. + */ + if (after(ack, tp->snd_nxt)) + goto uninteresting_ack; + + if (before(ack, prior_snd_una)) + goto old_ack; + + if (!(flag&FLAG_SLOWPATH) && after(ack, prior_snd_una)) { + /* Window is constant, pure forward advance. + * No more checks are required. + * Note, we use the fact that SND.UNA>=SND.WL2. + */ + tcp_update_wl(tp, ack, ack_seq); + tp->snd_una = ack; + flag |= FLAG_WIN_UPDATE; + + NET_INC_STATS_BH(TCPHPAcks); + } else { + if (ack_seq != TCP_SKB_CB(skb)->end_seq) + flag |= FLAG_DATA; + else + NET_INC_STATS_BH(TCPPureAcks); + + flag |= tcp_ack_update_window(sk, tp, skb, ack, ack_seq); + + if (TCP_SKB_CB(skb)->sacked) + flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una); + + if (TCP_ECN_rcv_ecn_echo(tp, skb->h.th)) + flag |= FLAG_ECE; + } + + /* We passed data and got it acked, remove any soft error + * log. Something worked... + */ + sk->err_soft = 0; + tp->rcv_tstamp = tcp_time_stamp; + if ((prior_packets = tp->packets_out) == 0) + goto no_queue; + + prior_in_flight = tcp_packets_in_flight(tp); + + /* See if we can take anything off of the retransmit queue. */ + flag |= tcp_clean_rtx_queue(sk); + + if (tcp_ack_is_dubious(tp, flag)) { + /* Advanve CWND, if state allows this. 
*/ + if ((flag&FLAG_DATA_ACKED) && prior_in_flight >= tp->snd_cwnd && + tcp_may_raise_cwnd(tp, flag)) + tcp_cong_avoid(tp); + tcp_fastretrans_alert(sk, prior_snd_una, prior_packets, flag); + } else { + if ((flag&FLAG_DATA_ACKED) && prior_in_flight >= tp->snd_cwnd) + tcp_cong_avoid(tp); + } + + if ((flag & FLAG_FORWARD_PROGRESS) || !(flag&FLAG_NOT_DUP)) + dst_confirm(sk->dst_cache); + + return 1; + +no_queue: + tp->probes_out = 0; + + /* If this ack opens up a zero window, clear backoff. It was + * being used to time the probes, and is probably far higher than + * it needs to be for normal retransmission. + */ + if (tp->send_head) + tcp_ack_probe(sk); + return 1; + +old_ack: + if (TCP_SKB_CB(skb)->sacked) + tcp_sacktag_write_queue(sk, skb, prior_snd_una); + +uninteresting_ack: + SOCK_DEBUG(sk, "Ack %u out of %u:%u\n", ack, tp->snd_una, tp->snd_nxt); + return 0; +#else + return 0; +#endif +} + + +/* Look for tcp options. Normally only called on SYN and SYNACK packets. + * But, this can also be called on packets in the established flow when + * the fast version below fails. + */ +void tcp_parse_options(struct sk_buff *skb, struct tcp_opt *tp, int estab) +{ +#if 0 + unsigned char *ptr; + struct tcphdr *th = skb->h.th; + int length=(th->doff*4)-sizeof(struct tcphdr); + + ptr = (unsigned char *)(th + 1); + tp->saw_tstamp = 0; + + while(length>0) { + int opcode=*ptr++; + int opsize; + + switch (opcode) { + case TCPOPT_EOL: + return; + case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */ + length--; + continue; + default: + opsize=*ptr++; + if (opsize < 2) /* "silly options" */ + return; + if (opsize > length) + return; /* don't parse partial options */ + switch(opcode) { + case TCPOPT_MSS: + if(opsize==TCPOLEN_MSS && th->syn && !estab) { + u16 in_mss = ntohs(*(__u16 *)ptr); + if (in_mss) { + if (tp->user_mss && tp->user_mss < in_mss) + in_mss = tp->user_mss; + tp->mss_clamp = in_mss; + } + } + break; + case TCPOPT_WINDOW: + if(opsize==TCPOLEN_WINDOW && th->syn && !estab) + if (sysctl_tcp_window_scaling) { + tp->wscale_ok = 1; + tp->snd_wscale = *(__u8 *)ptr; + if(tp->snd_wscale > 14) { + if(net_ratelimit()) + printk("tcp_parse_options: Illegal window " + "scaling value %d >14 received.", + tp->snd_wscale); + tp->snd_wscale = 14; + } + } + break; + case TCPOPT_TIMESTAMP: + if(opsize==TCPOLEN_TIMESTAMP) { + if ((estab && tp->tstamp_ok) || + (!estab && sysctl_tcp_timestamps)) { + tp->saw_tstamp = 1; + tp->rcv_tsval = ntohl(*(__u32 *)ptr); + tp->rcv_tsecr = ntohl(*(__u32 *)(ptr+4)); + } + } + break; + case TCPOPT_SACK_PERM: + if(opsize==TCPOLEN_SACK_PERM && th->syn && !estab) { + if (sysctl_tcp_sack) { + tp->sack_ok = 1; + tcp_sack_reset(tp); + } + } + break; + + case TCPOPT_SACK: + if((opsize >= (TCPOLEN_SACK_BASE + TCPOLEN_SACK_PERBLOCK)) && + !((opsize - TCPOLEN_SACK_BASE) % TCPOLEN_SACK_PERBLOCK) && + tp->sack_ok) { + TCP_SKB_CB(skb)->sacked = (ptr - 2) - (unsigned char *)th; + } + }; + ptr+=opsize-2; + length-=opsize; + }; + } +#endif +} + +/* Fast parse options. This hopes to only see timestamps. + * If it is wrong it falls back on tcp_parse_options(). 
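+ *
+ * Editor's note: the "hoped for" layout is the RFC 1323 aligned form:
+ * NOP, NOP, TIMESTAMP, length 10, then the two 32-bit stamps. The sketch
+ * below only shows how the signature word the fast path compares against
+ * is built; the names are illustrative, and the real code applies ntohl()
+ * to handle byte order.
+ */
+#if 0
+#define SKETCH_TCPOPT_NOP 1
+#define SKETCH_TCPOPT_TIMESTAMP 8
+#define SKETCH_TCPOLEN_TIMESTAMP 10
+
+static __u32 sketch_predicted_ts_word(void)
+{
+ /* (1<<24)|(1<<16)|(8<<8)|10 == 0x0101080A */
+ return (SKETCH_TCPOPT_NOP << 24) | (SKETCH_TCPOPT_NOP << 16)
+ | (SKETCH_TCPOPT_TIMESTAMP << 8) | SKETCH_TCPOLEN_TIMESTAMP;
+}
+#endif
+
+/* Fast parse options (see the comment above).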
+ */ +static __inline__ int tcp_fast_parse_options(struct sk_buff *skb, struct tcphdr *th, struct tcp_opt *tp) +{ +#if 0 + if (th->doff == sizeof(struct tcphdr)>>2) { + tp->saw_tstamp = 0; + return 0; + } else if (tp->tstamp_ok && + th->doff == (sizeof(struct tcphdr)>>2)+(TCPOLEN_TSTAMP_ALIGNED>>2)) { + __u32 *ptr = (__u32 *)(th + 1); + if (*ptr == ntohl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) + | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)) { + tp->saw_tstamp = 1; + ++ptr; + tp->rcv_tsval = ntohl(*ptr); + ++ptr; + tp->rcv_tsecr = ntohl(*ptr); + return 1; + } + } + tcp_parse_options(skb, tp, 1); + return 1; +#else + return 0; +#endif +} + +extern __inline__ void +tcp_store_ts_recent(struct tcp_opt *tp) +{ +#if 0 + tp->ts_recent = tp->rcv_tsval; + tp->ts_recent_stamp = xtime.tv_sec; +#endif +} + +extern __inline__ void +tcp_replace_ts_recent(struct tcp_opt *tp, u32 seq) +{ +#if 0 + if (tp->saw_tstamp && !after(seq, tp->rcv_wup)) { + /* PAWS bug workaround wrt. ACK frames, the PAWS discard + * extra check below makes sure this can only happen + * for pure ACK frames. -DaveM + * + * Not only, also it occurs for expired timestamps. + */ + + if((s32)(tp->rcv_tsval - tp->ts_recent) >= 0 || + xtime.tv_sec >= tp->ts_recent_stamp + TCP_PAWS_24DAYS) + tcp_store_ts_recent(tp); + } +#endif +} + +/* Sorry, PAWS as specified is broken wrt. pure-ACKs -DaveM + * + * It is not fatal. If this ACK does _not_ change critical state (seqs, window) + * it can pass through stack. So, the following predicate verifies that + * this segment is not used for anything but congestion avoidance or + * fast retransmit. Moreover, we even are able to eliminate most of such + * second order effects, if we apply some small "replay" window (~RTO) + * to timestamp space. + * + * All these measures still do not guarantee that we reject wrapped ACKs + * on networks with high bandwidth, when sequence space is recycled fastly, + * but it guarantees that such events will be very rare and do not affect + * connection seriously. This doesn't look nice, but alas, PAWS is really + * buggy extension. + * + * [ Later note. Even worse! It is buggy for segments _with_ data. RFC + * states that events when retransmit arrives after original data are rare. + * It is a blatant lie. VJ forgot about fast retransmit! 8)8) It is + * the biggest problem on large power networks even with minor reordering. + * OK, let's give it small replay window. If peer clock is even 1hz, it is safe + * up to bandwidth of 18Gigabit/sec. 8) ] + */ + +static int tcp_disordered_ack(struct tcp_opt *tp, struct sk_buff *skb) +{ +#if 0 + struct tcphdr *th = skb->h.th; + u32 seq = TCP_SKB_CB(skb)->seq; + u32 ack = TCP_SKB_CB(skb)->ack_seq; + + return (/* 1. Pure ACK with correct sequence number. */ + (th->ack && seq == TCP_SKB_CB(skb)->end_seq && seq == tp->rcv_nxt) && + + /* 2. ... and duplicate ACK. */ + ack == tp->snd_una && + + /* 3. ... and does not update window. */ + !tcp_may_update_window(tp, ack, seq, ntohs(th->window)<snd_wscale) && + + /* 4. ... and sits in replay window. */ + (s32)(tp->ts_recent - tp->rcv_tsval) <= (tp->rto*1024)/HZ); +#endif +} + +extern __inline__ int tcp_paws_discard(struct tcp_opt *tp, struct sk_buff *skb) +{ +#if 0 + return ((s32)(tp->ts_recent - tp->rcv_tsval) > TCP_PAWS_WINDOW && + xtime.tv_sec < tp->ts_recent_stamp + TCP_PAWS_24DAYS && + !tcp_disordered_ack(tp, skb)); +#else + return 0; +#endif +} + +/* Check segment sequence number for validity. 
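+ *
+ * Editor's note: the PAWS and window checks around here all rely on the
+ * wrap-around-safe before()/after() comparisons. A minimal sketch of how
+ * such comparisons are conventionally written is given below; the helper
+ * names are hypothetical, and the driver takes its real before()/after()
+ * from the imported tcpcore headers.
+ */
+#if 0
+static __inline__ int sketch_seq_before(__u32 seq1, __u32 seq2)
+{
+ /* true even across wrap: before(0xFFFFFFF0, 0x00000010) holds. */
+ return (__s32)(seq1 - seq2) < 0;
+}
+
+static __inline__ int sketch_seq_between(__u32 seq, __u32 low, __u32 high)
+{
+ /* low <= seq <= high, modulo 2^32 */
+ return !sketch_seq_before(seq, low) && !sketch_seq_before(high, seq);
+}
+#endif
+
+/* Check segment sequence number for validity (continued).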
+ * + * Segment controls are considered valid, if the segment + * fits to the window after truncation to the window. Acceptability + * of data (and SYN, FIN, of course) is checked separately. + * See tcp_data_queue(), for example. + * + * Also, controls (RST is main one) are accepted using RCV.WUP instead + * of RCV.NXT. Peer still did not advance his SND.UNA when we + * delayed ACK, so that hisSND.UNA<=ourRCV.WUP. + * (borrowed from freebsd) + */ + +static inline int tcp_sequence(struct tcp_opt *tp, u32 seq, u32 end_seq) +{ +#if 0 + return !before(end_seq, tp->rcv_wup) && + !after(seq, tp->rcv_nxt + tcp_receive_window(tp)); +#else + return 0; +#endif +} + +/* When we get a reset we do this. */ +static void tcp_reset(struct sock *sk) +{ +#if 0 + /* We want the right error as BSD sees it (and indeed as we do). */ + switch (sk->state) { + case TCP_SYN_SENT: + sk->err = ECONNREFUSED; + break; + case TCP_CLOSE_WAIT: + sk->err = EPIPE; + break; + case TCP_CLOSE: + return; + default: + sk->err = ECONNRESET; + } + + if (!sk->dead) + sk->error_report(sk); + + tcp_done(sk); +#endif +} + +/* + * Process the FIN bit. This now behaves as it is supposed to work + * and the FIN takes effect when it is validly part of sequence + * space. Not before when we get holes. + * + * If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT + * (and thence onto LAST-ACK and finally, CLOSE, we never enter + * TIME-WAIT) + * + * If we are in FINWAIT-1, a received FIN indicates simultaneous + * close and we go into CLOSING (and later onto TIME-WAIT) + * + * If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT. + */ +static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th) +{ +#if 0 + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + + tcp_schedule_ack(tp); + + sk->shutdown |= RCV_SHUTDOWN; + sk->done = 1; + + switch(sk->state) { + case TCP_SYN_RECV: + case TCP_ESTABLISHED: + /* Move to CLOSE_WAIT */ + tcp_set_state(sk, TCP_CLOSE_WAIT); + tp->ack.pingpong = 1; + break; + + case TCP_CLOSE_WAIT: + case TCP_CLOSING: + /* Received a retransmission of the FIN, do + * nothing. + */ + break; + case TCP_LAST_ACK: + /* RFC793: Remain in the LAST-ACK state. */ + break; + + case TCP_FIN_WAIT1: + /* This case occurs when a simultaneous close + * happens, we must ack the received FIN and + * enter the CLOSING state. + */ + tcp_send_ack(sk); + tcp_set_state(sk, TCP_CLOSING); + break; + case TCP_FIN_WAIT2: + /* Received a FIN -- send ACK and enter TIME_WAIT. */ + tcp_send_ack(sk); + tcp_time_wait(sk, TCP_TIME_WAIT, 0); + break; + default: + /* Only TCP_LISTEN and TCP_CLOSE are left, in these + * cases we should never reach this piece of code. + */ + printk("tcp_fin: Impossible, sk->state=%d\n", sk->state); + break; + }; + + /* It _is_ possible, that we have something out-of-order _after_ FIN. + * Probably, we should reset in this case. For now drop them. + */ + __skb_queue_purge(&tp->out_of_order_queue); + if (tp->sack_ok) + tcp_sack_reset(tp); + tcp_mem_reclaim(sk); + + if (!sk->dead) { + sk->state_change(sk); + + /* Do not send POLL_HUP for half duplex close. 
 */
+ if (sk->shutdown == SHUTDOWN_MASK || sk->state == TCP_CLOSE)
+ sk_wake_async(sk, 1, POLL_HUP);
+ else
+ sk_wake_async(sk, 1, POLL_IN);
+ }
+#endif
+}
+
+static __inline__ int
+tcp_sack_extend(struct tcp_sack_block *sp, u32 seq, u32 end_seq)
+{
+#if 0
+ if (!after(seq, sp->end_seq) && !after(sp->start_seq, end_seq)) {
+ if (before(seq, sp->start_seq))
+ sp->start_seq = seq;
+ if (after(end_seq, sp->end_seq))
+ sp->end_seq = end_seq;
+ return 1;
+ }
+ return 0;
+#else
+ return 0;
+#endif
+}
+
+static __inline__ void tcp_dsack_set(struct tcp_opt *tp, u32 seq, u32 end_seq)
+{
+#if 0
+ if (tp->sack_ok && sysctl_tcp_dsack) {
+ if (before(seq, tp->rcv_nxt))
+ NET_INC_STATS_BH(TCPDSACKOldSent);
+ else
+ NET_INC_STATS_BH(TCPDSACKOfoSent);
+
+ tp->dsack = 1;
+ tp->duplicate_sack[0].start_seq = seq;
+ tp->duplicate_sack[0].end_seq = end_seq;
+ tp->eff_sacks = min(tp->num_sacks+1, 4-tp->tstamp_ok);
+ }
+#endif
+}
+
+static __inline__ void tcp_dsack_extend(struct tcp_opt *tp, u32 seq, u32 end_seq)
+{
+#if 0
+ if (!tp->dsack)
+ tcp_dsack_set(tp, seq, end_seq);
+ else
+ tcp_sack_extend(tp->duplicate_sack, seq, end_seq);
+#endif
+}
+
+static void tcp_send_dupack(struct sock *sk, struct sk_buff *skb)
+{
+#if 0
+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+
+ if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
+ before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
+ NET_INC_STATS_BH(DelayedACKLost);
+ tcp_enter_quickack_mode(tp);
+
+ if (tp->sack_ok && sysctl_tcp_dsack) {
+ u32 end_seq = TCP_SKB_CB(skb)->end_seq;
+
+ if (after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))
+ end_seq = tp->rcv_nxt;
+ tcp_dsack_set(tp, TCP_SKB_CB(skb)->seq, end_seq);
+ }
+ }
+
+ tcp_send_ack(sk);
+#endif
+}
+
+/* These routines update the SACK block as out-of-order packets arrive or
+ * in-order packets close up the sequence space.
+ */
+static void tcp_sack_maybe_coalesce(struct tcp_opt *tp)
+{
+#if 0
+ int this_sack;
+ struct tcp_sack_block *sp = &tp->selective_acks[0];
+ struct tcp_sack_block *swalk = sp+1;
+
+ /* See if the recent change to the first SACK eats into
+ * or hits the sequence space of other SACK blocks, if so coalesce.
+ */
+ for (this_sack = 1; this_sack < tp->num_sacks; ) {
+ if (tcp_sack_extend(sp, swalk->start_seq, swalk->end_seq)) {
+ int i;
+
+ /* Zap SWALK, by moving every further SACK up by one slot.
+ * Decrease num_sacks.
+ */
+ tp->num_sacks--;
+ tp->eff_sacks = min(tp->num_sacks+tp->dsack, 4-tp->tstamp_ok);
+ for(i=this_sack; i < tp->num_sacks; i++)
+ sp[i] = sp[i+1];
+ continue;
+ }
+ this_sack++, swalk++;
+ }
+#endif
+}
+
+static __inline__ void tcp_sack_swap(struct tcp_sack_block *sack1, struct tcp_sack_block *sack2)
+{
+#if 0
+ __u32 tmp;
+
+ tmp = sack1->start_seq;
+ sack1->start_seq = sack2->start_seq;
+ sack2->start_seq = tmp;
+
+ tmp = sack1->end_seq;
+ sack1->end_seq = sack2->end_seq;
+ sack2->end_seq = tmp;
+#endif
+}
+
+static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq)
+{
+#if 0
+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+ struct tcp_sack_block *sp = &tp->selective_acks[0];
+ int cur_sacks = tp->num_sacks;
+ int this_sack;
+
+ if (!cur_sacks)
+ goto new_sack;
+
+ for (this_sack=0; this_sack<cur_sacks; this_sack++, sp++) {
+ if (tcp_sack_extend(sp, seq, end_seq)) {
+ /* Rotate this_sack to the first one. */
+ for (; this_sack>0; this_sack--, sp--)
+ tcp_sack_swap(sp, sp-1);
+ if (cur_sacks > 1)
+ tcp_sack_maybe_coalesce(tp);
+ return;
+ }
+ }
+
+ /* Could not find an adjacent existing SACK, build a new one,
+ * put it at the front, and shift everyone else down. We
+ * always know there is at least one SACK present already here.
+ *
+ * If the sack array is full, forget about the last one.
+ */ + if (this_sack >= 4) { + this_sack--; + tp->num_sacks--; + sp--; + } + for(; this_sack > 0; this_sack--, sp--) + *sp = *(sp-1); + +new_sack: + /* Build the new head SACK, and we're done. */ + sp->start_seq = seq; + sp->end_seq = end_seq; + tp->num_sacks++; + tp->eff_sacks = min(tp->num_sacks+tp->dsack, 4-tp->tstamp_ok); +#endif +} + +/* RCV.NXT advances, some SACKs should be eaten. */ + +static void tcp_sack_remove(struct tcp_opt *tp) +{ +#if 0 + struct tcp_sack_block *sp = &tp->selective_acks[0]; + int num_sacks = tp->num_sacks; + int this_sack; + + /* Empty ofo queue, hence, all the SACKs are eaten. Clear. */ + if (skb_queue_len(&tp->out_of_order_queue) == 0) { + tp->num_sacks = 0; + tp->eff_sacks = tp->dsack; + return; + } + + for(this_sack = 0; this_sack < num_sacks; ) { + /* Check if the start of the sack is covered by RCV.NXT. */ + if (!before(tp->rcv_nxt, sp->start_seq)) { + int i; + + /* RCV.NXT must cover all the block! */ + BUG_TRAP(!before(tp->rcv_nxt, sp->end_seq)); + + /* Zap this SACK, by moving forward any other SACKS. */ + for (i=this_sack+1; i < num_sacks; i++) + tp->selective_acks[i-1] = tp->selective_acks[i]; + num_sacks--; + continue; + } + this_sack++; + sp++; + } + if (num_sacks != tp->num_sacks) { + tp->num_sacks = num_sacks; + tp->eff_sacks = min(tp->num_sacks+tp->dsack, 4-tp->tstamp_ok); + } +#endif +} + +/* This one checks to see if we can put data from the + * out_of_order queue into the receive_queue. + */ +static void tcp_ofo_queue(struct sock *sk) +{ +#if 0 + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + __u32 dsack_high = tp->rcv_nxt; + struct sk_buff *skb; + + while ((skb = skb_peek(&tp->out_of_order_queue)) != NULL) { + if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) + break; + + if (before(TCP_SKB_CB(skb)->seq, dsack_high)) { + __u32 dsack = dsack_high; + if (before(TCP_SKB_CB(skb)->end_seq, dsack_high)) + dsack_high = TCP_SKB_CB(skb)->end_seq; + tcp_dsack_extend(tp, TCP_SKB_CB(skb)->seq, dsack); + } + + if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) { + SOCK_DEBUG(sk, "ofo packet was already received \n"); + __skb_unlink(skb, skb->list); + __kfree_skb(skb); + continue; + } + SOCK_DEBUG(sk, "ofo requeuing : rcv_next %X seq %X - %X\n", + tp->rcv_nxt, TCP_SKB_CB(skb)->seq, + TCP_SKB_CB(skb)->end_seq); + + __skb_unlink(skb, skb->list); + __skb_queue_tail(&sk->receive_queue, skb); + tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; + if(skb->h.th->fin) + tcp_fin(skb, sk, skb->h.th); + } +#endif +} + +static inline int tcp_rmem_schedule(struct sock *sk, struct sk_buff *skb) +{ +#if 0 + return (int)skb->truesize <= sk->forward_alloc || + tcp_mem_schedule(sk, skb->truesize, 1); +#else + return 0; +#endif +} + +static int tcp_prune_queue(struct sock *sk); + +static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) +{ +#if 0 + struct tcphdr *th = skb->h.th; + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + int eaten = -1; + + if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) + goto drop; + + th = skb->h.th; + __skb_pull(skb, th->doff*4); + + TCP_ECN_accept_cwr(tp, skb); + + if (tp->dsack) { + tp->dsack = 0; + tp->eff_sacks = min_t(unsigned int, tp->num_sacks, 4-tp->tstamp_ok); + } + + /* Queue data for delivery to the user. + * Packets in sequence go to the receive queue. + * Out of sequence packets to the out_of_order_queue. + */ + if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) { + if (tcp_receive_window(tp) == 0) + goto out_of_window; + + /* Ok. In sequence. In window. 
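+ *
+ * (Editor's note: the branch below first tries to copy the payload straight
+ * into the iovec of the task that is currently inside recvmsg() on this
+ * socket (tp->ucopy); only if that is not possible is the skb charged to
+ * the socket and appended to receive_queue.)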
*/ + if (tp->ucopy.task == current && + tp->copied_seq == tp->rcv_nxt && + tp->ucopy.len && + sk->lock.users && + !tp->urg_data) { + int chunk = min_t(unsigned int, skb->len, tp->ucopy.len); + + __set_current_state(TASK_RUNNING); + + local_bh_enable(); + if (!skb_copy_datagram_iovec(skb, 0, tp->ucopy.iov, chunk)) { + tp->ucopy.len -= chunk; + tp->copied_seq += chunk; + eaten = (chunk == skb->len && !th->fin); + } + local_bh_disable(); + } + + if (eaten <= 0) { +queue_and_out: + if (eaten < 0 && + (atomic_read(&sk->rmem_alloc) > sk->rcvbuf || + !tcp_rmem_schedule(sk, skb))) { + if (tcp_prune_queue(sk) < 0 || !tcp_rmem_schedule(sk, skb)) + goto drop; + } + tcp_set_owner_r(skb, sk); + __skb_queue_tail(&sk->receive_queue, skb); + } + tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; + if(skb->len) + tcp_event_data_recv(sk, tp, skb); + if(th->fin) + tcp_fin(skb, sk, th); + + if (skb_queue_len(&tp->out_of_order_queue)) { + tcp_ofo_queue(sk); + + /* RFC2581. 4.2. SHOULD send immediate ACK, when + * gap in queue is filled. + */ + if (skb_queue_len(&tp->out_of_order_queue) == 0) + tp->ack.pingpong = 0; + } + + if(tp->num_sacks) + tcp_sack_remove(tp); + + tcp_fast_path_check(sk, tp); + + if (eaten > 0) { + __kfree_skb(skb); + } else if (!sk->dead) + sk->data_ready(sk, 0); + return; + } + + if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) { + /* A retransmit, 2nd most common case. Force an immediate ack. */ + NET_INC_STATS_BH(DelayedACKLost); + tcp_dsack_set(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); + +out_of_window: + tcp_enter_quickack_mode(tp); + tcp_schedule_ack(tp); +drop: + __kfree_skb(skb); + return; + } + + /* Out of window. F.e. zero window probe. */ + if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt+tcp_receive_window(tp))) + goto out_of_window; + + tcp_enter_quickack_mode(tp); + + if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { + /* Partial packet, seq < rcv_next < end_seq */ + SOCK_DEBUG(sk, "partial packet: rcv_next %X seq %X - %X\n", + tp->rcv_nxt, TCP_SKB_CB(skb)->seq, + TCP_SKB_CB(skb)->end_seq); + + tcp_dsack_set(tp, TCP_SKB_CB(skb)->seq, tp->rcv_nxt); + + /* If window is closed, drop tail of packet. But after + * remembering D-SACK for its head made in previous line. + */ + if (!tcp_receive_window(tp)) + goto out_of_window; + goto queue_and_out; + } + + TCP_ECN_check_ce(tp, skb); + + if (atomic_read(&sk->rmem_alloc) > sk->rcvbuf || + !tcp_rmem_schedule(sk, skb)) { + if (tcp_prune_queue(sk) < 0 || !tcp_rmem_schedule(sk, skb)) + goto drop; + } + + /* Disable header prediction. */ + tp->pred_flags = 0; + tcp_schedule_ack(tp); + + SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n", + tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); + + tcp_set_owner_r(skb, sk); + + if (skb_peek(&tp->out_of_order_queue) == NULL) { + /* Initial out of order segment, build 1 SACK. */ + if(tp->sack_ok) { + tp->num_sacks = 1; + tp->dsack = 0; + tp->eff_sacks = 1; + tp->selective_acks[0].start_seq = TCP_SKB_CB(skb)->seq; + tp->selective_acks[0].end_seq = TCP_SKB_CB(skb)->end_seq; + } + __skb_queue_head(&tp->out_of_order_queue,skb); + } else { + struct sk_buff *skb1=tp->out_of_order_queue.prev; + u32 seq = TCP_SKB_CB(skb)->seq; + u32 end_seq = TCP_SKB_CB(skb)->end_seq; + + if (seq == TCP_SKB_CB(skb1)->end_seq) { + __skb_append(skb1, skb); + + if (tp->num_sacks == 0 || + tp->selective_acks[0].end_seq != seq) + goto add_sack; + + /* Common case: data arrive in order after hole. */ + tp->selective_acks[0].end_seq = end_seq; + return; + } + + /* Find place to insert this segment. 
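+ * (Editor's note: the queue is kept ordered by start sequence, so the loop
+ * below walks out_of_order_queue backwards from the tail until it finds a
+ * segment that does not start after "seq"; overlaps with that neighbour and
+ * with the following segments are then trimmed or merged and reported via
+ * D-SACK.)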
*/ + do { + if (!after(TCP_SKB_CB(skb1)->seq, seq)) + break; + } while ((skb1=skb1->prev) != (struct sk_buff*)&tp->out_of_order_queue); + + /* Do skb overlap to previous one? */ + if (skb1 != (struct sk_buff*)&tp->out_of_order_queue && + before(seq, TCP_SKB_CB(skb1)->end_seq)) { + if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) { + /* All the bits are present. Drop. */ + __kfree_skb(skb); + tcp_dsack_set(tp, seq, end_seq); + goto add_sack; + } + if (after(seq, TCP_SKB_CB(skb1)->seq)) { + /* Partial overlap. */ + tcp_dsack_set(tp, seq, TCP_SKB_CB(skb1)->end_seq); + } else { + skb1 = skb1->prev; + } + } + __skb_insert(skb, skb1, skb1->next, &tp->out_of_order_queue); + + /* And clean segments covered by new one as whole. */ + while ((skb1 = skb->next) != (struct sk_buff*)&tp->out_of_order_queue && + after(end_seq, TCP_SKB_CB(skb1)->seq)) { + if (before(end_seq, TCP_SKB_CB(skb1)->end_seq)) { + tcp_dsack_extend(tp, TCP_SKB_CB(skb1)->seq, end_seq); + break; + } + __skb_unlink(skb1, skb1->list); + tcp_dsack_extend(tp, TCP_SKB_CB(skb1)->seq, TCP_SKB_CB(skb1)->end_seq); + __kfree_skb(skb1); + } + +add_sack: + if (tp->sack_ok) + tcp_sack_new_ofo_skb(sk, seq, end_seq); + } +#endif +} + +/* Collapse contiguous sequence of skbs head..tail with + * sequence numbers start..end. + * Segments with FIN/SYN are not collapsed (only because this + * simplifies code) + */ +static void +tcp_collapse(struct sock *sk, struct sk_buff *head, + struct sk_buff *tail, u32 start, u32 end) +{ +#if 0 + struct sk_buff *skb; + + /* First, check that queue is collapsable and find + * the point where collapsing can be useful. */ + for (skb = head; skb != tail; ) { + /* No new bits? It is possible on ofo queue. */ + if (!before(start, TCP_SKB_CB(skb)->end_seq)) { + struct sk_buff *next = skb->next; + __skb_unlink(skb, skb->list); + __kfree_skb(skb); + NET_INC_STATS_BH(TCPRcvCollapsed); + skb = next; + continue; + } + + /* The first skb to collapse is: + * - not SYN/FIN and + * - bloated or contains data before "start" or + * overlaps to the next one. + */ + if (!skb->h.th->syn && !skb->h.th->fin && + (tcp_win_from_space(skb->truesize) > skb->len || + before(TCP_SKB_CB(skb)->seq, start) || + (skb->next != tail && + TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb->next)->seq))) + break; + + /* Decided to skip this, advance start seq. */ + start = TCP_SKB_CB(skb)->end_seq; + skb = skb->next; + } + if (skb == tail || skb->h.th->syn || skb->h.th->fin) + return; + + while (before(start, end)) { + struct sk_buff *nskb; + int header = skb_headroom(skb); + int copy = (PAGE_SIZE - sizeof(struct sk_buff) - + sizeof(struct skb_shared_info) - header - 31)&~15; + + /* Too big header? This can happen with IPv6. */ + if (copy < 0) + return; + if (end-start < copy) + copy = end-start; + nskb = alloc_skb(copy+header, GFP_ATOMIC); + if (!nskb) + return; + skb_reserve(nskb, header); + memcpy(nskb->head, skb->head, header); + nskb->nh.raw = nskb->head + (skb->nh.raw-skb->head); + nskb->h.raw = nskb->head + (skb->h.raw-skb->head); + nskb->mac.raw = nskb->head + (skb->mac.raw-skb->head); + memcpy(nskb->cb, skb->cb, sizeof(skb->cb)); + TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start; + __skb_insert(nskb, skb->prev, skb, skb->list); + tcp_set_owner_r(nskb, sk); + + /* Copy data, releasing collapsed skbs. 
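+ *
+ * [Illustrative aside, not part of the imported Linux code: the "copy"
+ *  computed a few lines above is "how much payload fits in one page next
+ *  to the skb bookkeeping", minus a little slack, rounded down to a
+ *  16-byte multiple.  With purely hypothetical sizes (PAGE_SIZE 4096,
+ *  sizeof(struct sk_buff) 160, sizeof(struct skb_shared_info) 12 and a
+ *  64-byte header) that is (4096 - 160 - 12 - 64 - 31) & ~15
+ *  = 3829 & ~15 = 3824 bytes, which is indeed a multiple of 16.]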
*/ + while (copy > 0) { + int offset = start - TCP_SKB_CB(skb)->seq; + int size = TCP_SKB_CB(skb)->end_seq - start; + + if (offset < 0) BUG(); + if (size > 0) { + size = min(copy, size); + if (skb_copy_bits(skb, offset, skb_put(nskb, size), size)) + BUG(); + TCP_SKB_CB(nskb)->end_seq += size; + copy -= size; + start += size; + } + if (!before(start, TCP_SKB_CB(skb)->end_seq)) { + struct sk_buff *next = skb->next; + __skb_unlink(skb, skb->list); + __kfree_skb(skb); + NET_INC_STATS_BH(TCPRcvCollapsed); + skb = next; + if (skb == tail || skb->h.th->syn || skb->h.th->fin) + return; + } + } + } +#endif +} + +/* Collapse ofo queue. Algorithm: select contiguous sequence of skbs + * and tcp_collapse() them until all the queue is collapsed. + */ +static void tcp_collapse_ofo_queue(struct sock *sk) +{ +#if 0 + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + struct sk_buff *skb = skb_peek(&tp->out_of_order_queue); + struct sk_buff *head; + u32 start, end; + + if (skb == NULL) + return; + + start = TCP_SKB_CB(skb)->seq; + end = TCP_SKB_CB(skb)->end_seq; + head = skb; + + for (;;) { + skb = skb->next; + + /* Segment is terminated when we see gap or when + * we are at the end of all the queue. */ + if (skb == (struct sk_buff *)&tp->out_of_order_queue || + after(TCP_SKB_CB(skb)->seq, end) || + before(TCP_SKB_CB(skb)->end_seq, start)) { + tcp_collapse(sk, head, skb, start, end); + head = skb; + if (skb == (struct sk_buff *)&tp->out_of_order_queue) + break; + /* Start new segment */ + start = TCP_SKB_CB(skb)->seq; + end = TCP_SKB_CB(skb)->end_seq; + } else { + if (before(TCP_SKB_CB(skb)->seq, start)) + start = TCP_SKB_CB(skb)->seq; + if (after(TCP_SKB_CB(skb)->end_seq, end)) + end = TCP_SKB_CB(skb)->end_seq; + } + } +#endif +} + +/* Reduce allocated memory if we can, trying to get + * the socket within its memory limits again. + * + * Return less than zero if we should start dropping frames + * until the socket owning process reads some of the data + * to stabilize the situation. + */ +static int tcp_prune_queue(struct sock *sk) +{ +#if 0 + struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; + + SOCK_DEBUG(sk, "prune_queue: c=%x\n", tp->copied_seq); + + NET_INC_STATS_BH(PruneCalled); + + if (atomic_read(&sk->rmem_alloc) >= sk->rcvbuf) + tcp_clamp_window(sk, tp); + else if (tcp_memory_pressure) + tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U*tp->advmss); + + tcp_collapse_ofo_queue(sk); + tcp_collapse(sk, sk->receive_queue.next, + (struct sk_buff*)&sk->receive_queue, + tp->copied_seq, tp->rcv_nxt); + tcp_mem_reclaim(sk); + + if (atomic_read(&sk->rmem_alloc) <= sk->rcvbuf) + return 0; + + /* Collapsing did not help, destructive actions follow. + * This must not ever occur. */ + + /* First, purge the out_of_order queue. */ + if (skb_queue_len(&tp->out_of_order_queue)) { + net_statistics[smp_processor_id()*2].OfoPruned += skb_queue_len(&tp->out_of_order_queue); + __skb_queue_purge(&tp->out_of_order_queue); + + /* Reset SACK state. A conforming SACK implementation will + * do the same at a timeout based retransmit. When a connection + * is in a sad state like this, we care only about integrity + * of the connection not performance. + */ + if(tp->sack_ok) + tcp_sack_reset(tp); + tcp_mem_reclaim(sk); + } + + if(atomic_read(&sk->rmem_alloc) <= sk->rcvbuf) + return 0; + + /* If we are really being abused, tell the caller to silently + * drop receive data on the floor. It will get retransmitted + * and hopefully then we'll have sufficient space. + */ + NET_INC_STATS_BH(RcvPruned); + + /* Massive buffer overcommit. 
*/ + tp->pred_flags = 0; + return -1; +#else + return 0; +#endif +} + + +/* RFC2861, slow part. Adjust cwnd, after it was not full during one rto. + * As additional protections, we do not touch cwnd in retransmission phases, + * and if application hit its sndbuf limit recently. + */ +void tcp_cwnd_application_limited(struct sock *sk) +{ +#if 0 + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + + if (tp->ca_state == TCP_CA_Open && + sk->socket && !test_bit(SOCK_NOSPACE, &sk->socket->flags)) { + /* Limited by application or receiver window. */ + u32 win_used = max(tp->snd_cwnd_used, 2U); + if (win_used < tp->snd_cwnd) { + tp->snd_ssthresh = tcp_current_ssthresh(tp); + tp->snd_cwnd = (tp->snd_cwnd+win_used)>>1; + } + tp->snd_cwnd_used = 0; + } + tp->snd_cwnd_stamp = tcp_time_stamp; +#endif +} + + +/* When incoming ACK allowed to free some skb from write_queue, + * we remember this event in flag tp->queue_shrunk and wake up socket + * on the exit from tcp input handler. + */ +static void tcp_new_space(struct sock *sk) +{ +#if 0 + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + + if (tp->packets_out < tp->snd_cwnd && + !(sk->userlocks&SOCK_SNDBUF_LOCK) && + !tcp_memory_pressure && + atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) { + int sndmem, demanded; + + sndmem = tp->mss_clamp+MAX_TCP_HEADER+16+sizeof(struct sk_buff); + demanded = max_t(unsigned int, tp->snd_cwnd, tp->reordering+1); + sndmem *= 2*demanded; + if (sndmem > sk->sndbuf) + sk->sndbuf = min(sndmem, sysctl_tcp_wmem[2]); + tp->snd_cwnd_stamp = tcp_time_stamp; + } + + sk->write_space(sk); +#endif +} + +static inline void tcp_check_space(struct sock *sk) +{ +#if 0 + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + + if (tp->queue_shrunk) { + tp->queue_shrunk = 0; + if (sk->socket && test_bit(SOCK_NOSPACE, &sk->socket->flags)) + tcp_new_space(sk); + } +#endif +} + +static void __tcp_data_snd_check(struct sock *sk, struct sk_buff *skb) +{ +#if 0 + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + + if (after(TCP_SKB_CB(skb)->end_seq, tp->snd_una + tp->snd_wnd) || + tcp_packets_in_flight(tp) >= tp->snd_cwnd || + tcp_write_xmit(sk, tp->nonagle)) + tcp_check_probe_timer(sk, tp); +#endif +} + +static __inline__ void tcp_data_snd_check(struct sock *sk) +{ +#if 0 + struct sk_buff *skb = sk->tp_pinfo.af_tcp.send_head; + + if (skb != NULL) + __tcp_data_snd_check(sk, skb); + tcp_check_space(sk); +#endif +} + +/* + * Check if sending an ack is needed. + */ +static __inline__ void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) +{ +#if 0 + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + + /* More than one full frame received... */ + if (((tp->rcv_nxt - tp->rcv_wup) > tp->ack.rcv_mss + /* ... and right edge of window advances far enough. + * (tcp_recvmsg() will send ACK otherwise). Or... + */ + && __tcp_select_window(sk) >= tp->rcv_wnd) || + /* We ACK each frame or... */ + tcp_in_quickack_mode(tp) || + /* We have out of order data. */ + (ofo_possible && + skb_peek(&tp->out_of_order_queue) != NULL)) { + /* Then ack it now */ + tcp_send_ack(sk); + } else { + /* Else, send delayed ack. */ + tcp_send_delayed_ack(sk); + } +#endif +} + +static __inline__ void tcp_ack_snd_check(struct sock *sk) +{ +#if 0 + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + if (!tcp_ack_scheduled(tp)) { + /* We sent a data segment already. */ + return; + } + __tcp_ack_snd_check(sk, 1); +#endif +} + +/* + * This routine is only called when we have urgent data + * signalled. Its the 'slow' part of tcp_urg. 
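+ *
+ * [Illustrative aside, not part of the imported Linux code: the sketch
+ *  shows the arithmetic the routine below uses to turn the urgent pointer
+ *  into an absolute sequence number under the two interpretations selected
+ *  by sysctl_tcp_stdurg (0: BSD-compatible, the pointer is one past the
+ *  urgent byte; non-zero: the pointer is the urgent byte itself).  The
+ *  demo_* name is hypothetical.]
+ */
+#if 0 /* illustrative sketch, never compiled */
+static u32 demo_urg_seq(u32 seg_seq /* ntohl(th->seq) */,
+                        u32 urg_ptr /* ntohs(th->urg_ptr) */,
+                        int tcp_stdurg)
+{
+ if (urg_ptr && !tcp_stdurg)
+  urg_ptr--;           /* BSD style: step back onto the urgent byte */
+ return seg_seq + urg_ptr;
+}
+/* Example: seg_seq = 1000, urg_ptr = 5, tcp_stdurg = 0 -> urgent byte 1004. */
+#endif /* illustrative sketch */
+/* (end of illustrative aside) The imported comment continues: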
It could be + * moved inline now as tcp_urg is only called from one + * place. We handle URGent data wrong. We have to - as + * BSD still doesn't use the correction from RFC961. + * For 1003.1g we should support a new option TCP_STDURG to permit + * either form (or just set the sysctl tcp_stdurg). + */ + +static void tcp_check_urg(struct sock * sk, struct tcphdr * th) +{ +#if 0 + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + u32 ptr = ntohs(th->urg_ptr); + + if (ptr && !sysctl_tcp_stdurg) + ptr--; + ptr += ntohl(th->seq); + + /* Ignore urgent data that we've already seen and read. */ + if (after(tp->copied_seq, ptr)) + return; + + /* Do not replay urg ptr. + * + * NOTE: interesting situation not covered by specs. + * Misbehaving sender may send urg ptr, pointing to segment, + * which we already have in ofo queue. We are not able to fetch + * such data and will stay in TCP_URG_NOTYET until will be eaten + * by recvmsg(). Seems, we are not obliged to handle such wicked + * situations. But it is worth to think about possibility of some + * DoSes using some hypothetical application level deadlock. + */ + if (before(ptr, tp->rcv_nxt)) + return; + + /* Do we already have a newer (or duplicate) urgent pointer? */ + if (tp->urg_data && !after(ptr, tp->urg_seq)) + return; + + /* Tell the world about our new urgent pointer. */ + if (sk->proc != 0) { + if (sk->proc > 0) + kill_proc(sk->proc, SIGURG, 1); + else + kill_pg(-sk->proc, SIGURG, 1); + sk_wake_async(sk, 3, POLL_PRI); + } + + /* We may be adding urgent data when the last byte read was + * urgent. To do this requires some care. We cannot just ignore + * tp->copied_seq since we would read the last urgent byte again + * as data, nor can we alter copied_seq until this data arrives + * or we break the sematics of SIOCATMARK (and thus sockatmark()) + * + * NOTE. Double Dutch. Rendering to plain English: author of comment + * above did something sort of send("A", MSG_OOB); send("B", MSG_OOB); + * and expect that both A and B disappear from stream. This is _wrong_. + * Though this happens in BSD with high probability, this is occasional. + * Any application relying on this is buggy. Note also, that fix "works" + * only in this artificial test. Insert some normal data between A and B and we will + * decline of BSD again. Verdict: it is better to remove to trap + * buggy users. + */ + if (tp->urg_seq == tp->copied_seq && tp->urg_data && + !sk->urginline && + tp->copied_seq != tp->rcv_nxt) { + struct sk_buff *skb = skb_peek(&sk->receive_queue); + tp->copied_seq++; + if (skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq)) { + __skb_unlink(skb, skb->list); + __kfree_skb(skb); + } + } + + tp->urg_data = TCP_URG_NOTYET; + tp->urg_seq = ptr; + + /* Disable header prediction. */ + tp->pred_flags = 0; +#endif +} + +/* This is the 'fast' part of urgent handling. */ +static inline void tcp_urg(struct sock *sk, struct sk_buff *skb, struct tcphdr *th) +{ +#if 0 + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + + /* Check if we get a new urgent pointer - normally not. */ + if (th->urg) + tcp_check_urg(sk,th); + + /* Do we wait for any urgent data? - normally not... */ + if (tp->urg_data == TCP_URG_NOTYET) { + u32 ptr = tp->urg_seq - ntohl(th->seq) + (th->doff*4) - th->syn; + + /* Is the urgent pointer pointing into this packet? 
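+ *
+ * [Illustrative aside, not part of the imported Linux code: at this point
+ *  the skb still includes the TCP header, so "ptr" computed above is an
+ *  offset from the start of that header.  A purely hypothetical worked
+ *  example: urg_seq = 1005, ntohl(th->seq) = 1000, th->doff = 5 (20-byte
+ *  header), th->syn = 0 gives ptr = 1005 - 1000 + 20 - 0 = 25; with 100
+ *  bytes of payload skb->len is 120, so ptr < skb->len and the urgent
+ *  byte is fetched from offset 25 of this very segment.]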
*/ + if (ptr < skb->len) { + u8 tmp; + if (skb_copy_bits(skb, ptr, &tmp, 1)) + BUG(); + tp->urg_data = TCP_URG_VALID | tmp; + if (!sk->dead) + sk->data_ready(sk,0); + } + } +#endif +} + +static int tcp_copy_to_iovec(struct sock *sk, struct sk_buff *skb, int hlen) +{ +#if 0 + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + int chunk = skb->len - hlen; + int err; + + local_bh_enable(); + if (skb->ip_summed==CHECKSUM_UNNECESSARY) + err = skb_copy_datagram_iovec(skb, hlen, tp->ucopy.iov, chunk); + else + err = skb_copy_and_csum_datagram_iovec(skb, hlen, tp->ucopy.iov); + + if (!err) { + tp->ucopy.len -= chunk; + tp->copied_seq += chunk; + } + + local_bh_disable(); + return err; +#else + return 0; +#endif +} + +static int __tcp_checksum_complete_user(struct sock *sk, struct sk_buff *skb) +{ +#if 0 + int result; + + if (sk->lock.users) { + local_bh_enable(); + result = __tcp_checksum_complete(skb); + local_bh_disable(); + } else { + result = __tcp_checksum_complete(skb); + } + return result; +#else + return 0; +#endif +} + +static __inline__ int +tcp_checksum_complete_user(struct sock *sk, struct sk_buff *skb) +{ +#if 0 + return skb->ip_summed != CHECKSUM_UNNECESSARY && + __tcp_checksum_complete_user(sk, skb); +#else + return 0; +#endif +} + +/* + * TCP receive function for the ESTABLISHED state. + * + * It is split into a fast path and a slow path. The fast path is + * disabled when: + * - A zero window was announced from us - zero window probing + * is only handled properly in the slow path. + * - Out of order segments arrived. + * - Urgent data is expected. + * - There is no buffer space left + * - Unexpected TCP flags/window values/header lengths are received + * (detected by checking the TCP header against pred_flags) + * - Data is sent in both directions. Fast path only supports pure senders + * or pure receivers (this means either the sequence number or the ack + * value must stay constant) + * - Unexpected TCP option. + * + * When these conditions are not satisfied it drops into a standard + * receive procedure patterned after RFC793 to handle all cases. + * The first three cases are guaranteed by proper pred_flags setting, + * the rest is checked inline. Fast processing is turned on in + * tcp_data_queue when everything is OK. + */ +int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, + struct tcphdr *th, unsigned len) +{ +#if 0 + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + + /* + * Header prediction. + * The code losely follows the one in the famous + * "30 instruction TCP receive" Van Jacobson mail. + * + * Van's trick is to deposit buffers into socket queue + * on a device interrupt, to call tcp_recv function + * on the receive process context and checksum and copy + * the buffer to user space. smart... + * + * Our current scheme is not silly either but we take the + * extra cost of the net_bh soft interrupt processing... + * We do checksum and copy also but from device to kernel. + */ + + tp->saw_tstamp = 0; + + /* pred_flags is 0xS?10 << 16 + snd_wnd + * if header_predition is to be made + * 'S' will always be tp->tcp_header_len >> 2 + * '?' will be 0 for the fast path, otherwise pred_flags is 0 to + * turn it off (when there are holes in the receive + * space for instance) + * PSH flag is ignored. + */ + + if ((tcp_flag_word(th) & TCP_HP_BITS) == tp->pred_flags && + TCP_SKB_CB(skb)->seq == tp->rcv_nxt) { + int tcp_header_len = tp->tcp_header_len; + + /* Timestamp header prediction: tcp_header_len + * is automatically equal to th->doff*4 due to pred_flags + * match. 
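+ *
+ * [Illustrative aside, not part of the imported Linux code: the single
+ *  32-bit compare performed just below works because a timestamp-only
+ *  option block is always aligned as NOP, NOP, kind 8, length 10
+ *  (RFC 1323), i.e. the bytes 01 01 08 0a, followed by TSval and TSecr.
+ *  With timestamps on, th->doff is 8, which is also the "S" nibble in the
+ *  pred_flags layout described above.  The DEMO_* names are hypothetical,
+ *  not kernel symbols.]
+ */
+#if 0 /* illustrative sketch, never compiled */
+enum {
+ DEMO_TCPOPT_NOP        = 1,  /* padding byte                        */
+ DEMO_TCPOPT_TIMESTAMP  = 8,  /* option kind                         */
+ DEMO_TCPOLEN_TIMESTAMP = 10  /* kind + len + two 32-bit timestamps  */
+};
+enum {
+ /* Host-order value of the expected first option word: 0x0101080a. */
+ DEMO_TSTAMP_WORD = (DEMO_TCPOPT_NOP << 24) | (DEMO_TCPOPT_NOP << 16) |
+                    (DEMO_TCPOPT_TIMESTAMP << 8) | DEMO_TCPOLEN_TIMESTAMP
+};
+#endif /* illustrative sketch */
+/* (end of illustrative aside)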
+ */ + + /* Check timestamp */ + if (tcp_header_len == sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) { + __u32 *ptr = (__u32 *)(th + 1); + + /* No? Slow path! */ + if (*ptr != ntohl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) + | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)) + goto slow_path; + + tp->saw_tstamp = 1; + ++ptr; + tp->rcv_tsval = ntohl(*ptr); + ++ptr; + tp->rcv_tsecr = ntohl(*ptr); + + /* If PAWS failed, check it more carefully in slow path */ + if ((s32)(tp->rcv_tsval - tp->ts_recent) < 0) + goto slow_path; + + /* DO NOT update ts_recent here, if checksum fails + * and timestamp was corrupted part, it will result + * in a hung connection since we will drop all + * future packets due to the PAWS test. + */ + } + + if (len <= tcp_header_len) { + /* Bulk data transfer: sender */ + if (len == tcp_header_len) { + /* Predicted packet is in window by definition. + * seq == rcv_nxt and rcv_wup <= rcv_nxt. + * Hence, check seq<=rcv_wup reduces to: + */ + if (tcp_header_len == + (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) && + tp->rcv_nxt == tp->rcv_wup) + tcp_store_ts_recent(tp); + /* We know that such packets are checksummed + * on entry. + */ + tcp_ack(sk, skb, 0); + __kfree_skb(skb); + tcp_data_snd_check(sk); + return 0; + } else { /* Header too small */ + TCP_INC_STATS_BH(TcpInErrs); + goto discard; + } + } else { + int eaten = 0; + + if (tp->ucopy.task == current && + tp->copied_seq == tp->rcv_nxt && + len - tcp_header_len <= tp->ucopy.len && + sk->lock.users) { + __set_current_state(TASK_RUNNING); + + if (!tcp_copy_to_iovec(sk, skb, tcp_header_len)) { + /* Predicted packet is in window by definition. + * seq == rcv_nxt and rcv_wup <= rcv_nxt. + * Hence, check seq<=rcv_wup reduces to: + */ + if (tcp_header_len == + (sizeof(struct tcphdr) + + TCPOLEN_TSTAMP_ALIGNED) && + tp->rcv_nxt == tp->rcv_wup) + tcp_store_ts_recent(tp); + + __skb_pull(skb, tcp_header_len); + tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; + NET_INC_STATS_BH(TCPHPHitsToUser); + eaten = 1; + } + } + if (!eaten) { + if (tcp_checksum_complete_user(sk, skb)) + goto csum_error; + + /* Predicted packet is in window by definition. + * seq == rcv_nxt and rcv_wup <= rcv_nxt. + * Hence, check seq<=rcv_wup reduces to: + */ + if (tcp_header_len == + (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) && + tp->rcv_nxt == tp->rcv_wup) + tcp_store_ts_recent(tp); + + if ((int)skb->truesize > sk->forward_alloc) + goto step5; + + NET_INC_STATS_BH(TCPHPHits); + + /* Bulk data transfer: receiver */ + __skb_pull(skb,tcp_header_len); + __skb_queue_tail(&sk->receive_queue, skb); + tcp_set_owner_r(skb, sk); + tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; + } + + tcp_event_data_recv(sk, tp, skb); + + if (TCP_SKB_CB(skb)->ack_seq != tp->snd_una) { + /* Well, only one small jumplet in fast path... */ + tcp_ack(sk, skb, FLAG_DATA); + tcp_data_snd_check(sk); + if (!tcp_ack_scheduled(tp)) + goto no_ack; + } + + if (eaten) { + if (tcp_in_quickack_mode(tp)) { + tcp_send_ack(sk); + } else { + tcp_send_delayed_ack(sk); + } + } else { + __tcp_ack_snd_check(sk, 0); + } + +no_ack: + if (eaten) + __kfree_skb(skb); + else + sk->data_ready(sk, 0); + return 0; + } + } + +slow_path: + if (len < (th->doff<<2) || tcp_checksum_complete_user(sk, skb)) + goto csum_error; + + /* + * RFC1323: H1. Apply PAWS check first. + */ + if (tcp_fast_parse_options(skb, th, tp) && tp->saw_tstamp && + tcp_paws_discard(tp, skb)) { + if (!th->rst) { + NET_INC_STATS_BH(PAWSEstabRejected); + tcp_send_dupack(sk, skb); + goto discard; + } + /* Resets are accepted even if PAWS failed. 
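+ *
+ * [Illustrative aside, not part of the imported Linux code: the core of
+ *  the PAWS test applied above is the same wrap-safe 32-bit comparison
+ *  used for sequence numbers, applied to timestamps; a segment whose
+ *  TSval is "before" the last validated ts_recent is suspect.  The full
+ *  check in the sources also tolerates stale ts_recent values, and the
+ *  caller exempts RSTs, as visible right here.  demo_* is hypothetical.]
+ */
+#if 0 /* illustrative sketch, never compiled */
+static int demo_paws_suspect(u32 rcv_tsval, u32 ts_recent)
+{
+ return (s32)(rcv_tsval - ts_recent) < 0; /* echoes the fast-path check */
+}
+#endif /* illustrative sketch */
+/* (end of illustrative aside) The imported comment continues: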
+ + ts_recent update must be made after we are sure + that the packet is in window. + */ + } + + /* + * Standard slow path. + */ + + if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)) { + /* RFC793, page 37: "In all states except SYN-SENT, all reset + * (RST) segments are validated by checking their SEQ-fields." + * And page 69: "If an incoming segment is not acceptable, + * an acknowledgment should be sent in reply (unless the RST bit + * is set, if so drop the segment and return)". + */ + if (!th->rst) + tcp_send_dupack(sk, skb); + goto discard; + } + + if(th->rst) { + tcp_reset(sk); + goto discard; + } + + tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq); + + if (th->syn && !before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { + TCP_INC_STATS_BH(TcpInErrs); + NET_INC_STATS_BH(TCPAbortOnSyn); + tcp_reset(sk); + return 1; + } + +step5: + if(th->ack) + tcp_ack(sk, skb, FLAG_SLOWPATH); + + /* Process urgent data. */ + tcp_urg(sk, skb, th); + + /* step 7: process the segment text */ + tcp_data_queue(sk, skb); + + tcp_data_snd_check(sk); + tcp_ack_snd_check(sk); + return 0; + +csum_error: + TCP_INC_STATS_BH(TcpInErrs); + +discard: + __kfree_skb(skb); + return 0; +#else + return 0; +#endif +} + +static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, + struct tcphdr *th, unsigned len) +{ +#if 0 + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + int saved_clamp = tp->mss_clamp; + + tcp_parse_options(skb, tp, 0); + + if (th->ack) { + /* rfc793: + * "If the state is SYN-SENT then + * first check the ACK bit + * If the ACK bit is set + * If SEG.ACK =< ISS, or SEG.ACK > SND.NXT, send + * a reset (unless the RST bit is set, if so drop + * the segment and return)" + * + * We do not send data with SYN, so that RFC-correct + * test reduces to: + */ + if (TCP_SKB_CB(skb)->ack_seq != tp->snd_nxt) + goto reset_and_undo; + + if (tp->saw_tstamp && tp->rcv_tsecr && + !between(tp->rcv_tsecr, tp->retrans_stamp, tcp_time_stamp)) { + NET_INC_STATS_BH(PAWSActiveRejected); + goto reset_and_undo; + } + + /* Now ACK is acceptable. + * + * "If the RST bit is set + * If the ACK was acceptable then signal the user "error: + * connection reset", drop the segment, enter CLOSED state, + * delete TCB, and return." + */ + + if (th->rst) { + tcp_reset(sk); + goto discard; + } + + /* rfc793: + * "fifth, if neither of the SYN or RST bits is set then + * drop the segment and return." + * + * See note below! + * --ANK(990513) + */ + if (!th->syn) + goto discard_and_undo; + + /* rfc793: + * "If the SYN bit is on ... + * are acceptable then ... + * (our SYN has been ACKed), change the connection + * state to ESTABLISHED..." + */ + + TCP_ECN_rcv_synack(tp, th); + + tp->snd_wl1 = TCP_SKB_CB(skb)->seq; + tcp_ack(sk, skb, FLAG_SLOWPATH); + + /* Ok.. it's good. Set up sequence numbers and + * move to established. + */ + tp->rcv_nxt = TCP_SKB_CB(skb)->seq+1; + tp->rcv_wup = TCP_SKB_CB(skb)->seq+1; + + /* RFC1323: The window in SYN & SYN/ACK segments is + * never scaled. 
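+ *
+ * [Illustrative aside, not part of the imported Linux code: it restates
+ *  the rule the next few lines implement.  The window carried by a SYN or
+ *  SYN-ACK is used verbatim, and the negotiated shift count is applied
+ *  only to segments exchanged after the handshake (compare the
+ *  "ntohs(th->window) << tp->snd_wscale" in the established path further
+ *  down this file).  The demo_* name is hypothetical.]
+ */
+#if 0 /* illustrative sketch, never compiled */
+static u32 demo_effective_window(u16 raw_window, int handshake_complete,
+                                 int snd_wscale)
+{
+ if (!handshake_complete)
+  return raw_window;                    /* SYN/SYN-ACK: never scaled */
+ return (u32)raw_window << snd_wscale;  /* later segments: scaled    */
+}
+#endif /* illustrative sketch */
+/* (end of illustrative aside)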
+ */ + tp->snd_wnd = ntohs(th->window); + tcp_init_wl(tp, TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(skb)->seq); + + if (tp->wscale_ok == 0) { + tp->snd_wscale = tp->rcv_wscale = 0; + tp->window_clamp = min(tp->window_clamp, 65535U); + } + + if (tp->saw_tstamp) { + tp->tstamp_ok = 1; + tp->tcp_header_len = + sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED; + tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; + tcp_store_ts_recent(tp); + } else { + tp->tcp_header_len = sizeof(struct tcphdr); + } + + if (tp->sack_ok && sysctl_tcp_fack) + tp->sack_ok |= 2; + + tcp_sync_mss(sk, tp->pmtu_cookie); + tcp_initialize_rcv_mss(sk); + tcp_init_metrics(sk); + tcp_init_buffer_space(sk); + + if (sk->keepopen) + tcp_reset_keepalive_timer(sk, keepalive_time_when(tp)); + + if (tp->snd_wscale == 0) + __tcp_fast_path_on(tp, tp->snd_wnd); + else + tp->pred_flags = 0; + + /* Remember, tcp_poll() does not lock socket! + * Change state from SYN-SENT only after copied_seq + * is initialized. */ + tp->copied_seq = tp->rcv_nxt; + mb(); + tcp_set_state(sk, TCP_ESTABLISHED); + + if(!sk->dead) { + sk->state_change(sk); + sk_wake_async(sk, 0, POLL_OUT); + } + + if (tp->write_pending || tp->defer_accept || tp->ack.pingpong) { + /* Save one ACK. Data will be ready after + * several ticks, if write_pending is set. + * + * It may be deleted, but with this feature tcpdumps + * look so _wonderfully_ clever, that I was not able + * to stand against the temptation 8) --ANK + */ + tcp_schedule_ack(tp); + tp->ack.lrcvtime = tcp_time_stamp; + tp->ack.ato = TCP_ATO_MIN; + tcp_incr_quickack(tp); + tcp_enter_quickack_mode(tp); + tcp_reset_xmit_timer(sk, TCP_TIME_DACK, TCP_DELACK_MAX); + +discard: + __kfree_skb(skb); + return 0; + } else { + tcp_send_ack(sk); + } + return -1; + } + + /* No ACK in the segment */ + + if (th->rst) { + /* rfc793: + * "If the RST bit is set + * + * Otherwise (no ACK) drop the segment and return." + */ + + goto discard_and_undo; + } + + /* PAWS check. */ + if (tp->ts_recent_stamp && tp->saw_tstamp && tcp_paws_check(tp, 0)) + goto discard_and_undo; + + if (th->syn) { + /* We see SYN without ACK. It is attempt of + * simultaneous connect with crossed SYNs. + * Particularly, it can be connect to self. + */ + tcp_set_state(sk, TCP_SYN_RECV); + + if (tp->saw_tstamp) { + tp->tstamp_ok = 1; + tcp_store_ts_recent(tp); + tp->tcp_header_len = + sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED; + } else { + tp->tcp_header_len = sizeof(struct tcphdr); + } + + tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1; + tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1; + + /* RFC1323: The window in SYN & SYN/ACK segments is + * never scaled. + */ + tp->snd_wnd = ntohs(th->window); + tp->snd_wl1 = TCP_SKB_CB(skb)->seq; + tp->max_window = tp->snd_wnd; + + tcp_sync_mss(sk, tp->pmtu_cookie); + tcp_initialize_rcv_mss(sk); + + TCP_ECN_rcv_syn(tp, th); + + tcp_send_synack(sk); +#if 0 + /* Note, we could accept data and URG from this segment. + * There are no obstacles to make this. + * + * However, if we ignore data in ACKless segments sometimes, + * we have no reasons to accept it sometimes. + * Also, seems the code doing it in step6 of tcp_rcv_state_process + * is not flawless. So, discard packet for sanity. + * Uncomment this return to process the data. + */ + return -1; +#else + goto discard; +#endif + } + /* "fifth, if neither of the SYN or RST bits is set then + * drop the segment and return." 
+ */ + +discard_and_undo: + tcp_clear_options(tp); + tp->mss_clamp = saved_clamp; + goto discard; + +reset_and_undo: + tcp_clear_options(tp); + tp->mss_clamp = saved_clamp; + return 1; +#else + return 0; +#endif +} + + +/* + * This function implements the receiving procedure of RFC 793 for + * all states except ESTABLISHED and TIME_WAIT. + * It's called from both tcp_v4_rcv and tcp_v6_rcv and should be + * address independent. + */ + +int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, + struct tcphdr *th, unsigned len) +{ +#if 0 + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + int queued = 0; + + tp->saw_tstamp = 0; + + switch (sk->state) { + case TCP_CLOSE: + goto discard; + + case TCP_LISTEN: + if(th->ack) + return 1; + + if(th->rst) + goto discard; + + if(th->syn) { + if(tp->af_specific->conn_request(sk, skb) < 0) + return 1; + + /* Now we have several options: In theory there is + * nothing else in the frame. KA9Q has an option to + * send data with the syn, BSD accepts data with the + * syn up to the [to be] advertised window and + * Solaris 2.1 gives you a protocol error. For now + * we just ignore it, that fits the spec precisely + * and avoids incompatibilities. It would be nice in + * future to drop through and process the data. + * + * Now that TTCP is starting to be used we ought to + * queue this data. + * But, this leaves one open to an easy denial of + * service attack, and SYN cookies can't defend + * against this problem. So, we drop the data + * in the interest of security over speed. + */ + goto discard; + } + goto discard; + + case TCP_SYN_SENT: + queued = tcp_rcv_synsent_state_process(sk, skb, th, len); + if (queued >= 0) + return queued; + + /* Do step6 onward by hand. */ + tcp_urg(sk, skb, th); + __kfree_skb(skb); + tcp_data_snd_check(sk); + return 0; + } + + if (tcp_fast_parse_options(skb, th, tp) && tp->saw_tstamp && + tcp_paws_discard(tp, skb)) { + if (!th->rst) { + NET_INC_STATS_BH(PAWSEstabRejected); + tcp_send_dupack(sk, skb); + goto discard; + } + /* Reset is accepted even if it did not pass PAWS. */ + } + + /* step 1: check sequence number */ + if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)) { + if (!th->rst) + tcp_send_dupack(sk, skb); + goto discard; + } + + /* step 2: check RST bit */ + if(th->rst) { + tcp_reset(sk); + goto discard; + } + + tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq); + + /* step 3: check security and precedence [ignored] */ + + /* step 4: + * + * Check for a SYN in window. + */ + if (th->syn && !before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { + NET_INC_STATS_BH(TCPAbortOnSyn); + tcp_reset(sk); + return 1; + } + + /* step 5: check the ACK field */ + if (th->ack) { + int acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH); + + switch(sk->state) { + case TCP_SYN_RECV: + if (acceptable) { + tp->copied_seq = tp->rcv_nxt; + mb(); + tcp_set_state(sk, TCP_ESTABLISHED); + sk->state_change(sk); + + /* Note, that this wakeup is only for marginal + * crossed SYN case. Passively open sockets + * are not waked up, because sk->sleep == NULL + * and sk->socket == NULL. + */ + if (sk->socket) { + sk_wake_async(sk,0,POLL_OUT); + } + + tp->snd_una = TCP_SKB_CB(skb)->ack_seq; + tp->snd_wnd = ntohs(th->window) << tp->snd_wscale; + tcp_init_wl(tp, TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(skb)->seq); + + /* tcp_ack considers this ACK as duplicate + * and does not calculate rtt. + * Fix it at least with timestamps. 
+ */ + if (tp->saw_tstamp && tp->rcv_tsecr && !tp->srtt) + tcp_ack_saw_tstamp(tp, 0); + + if (tp->tstamp_ok) + tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; + + tcp_init_metrics(sk); + tcp_initialize_rcv_mss(sk); + tcp_init_buffer_space(sk); + tcp_fast_path_on(tp); + } else { + return 1; + } + break; + + case TCP_FIN_WAIT1: + if (tp->snd_una == tp->write_seq) { + tcp_set_state(sk, TCP_FIN_WAIT2); + sk->shutdown |= SEND_SHUTDOWN; + dst_confirm(sk->dst_cache); + + if (!sk->dead) { + /* Wake up lingering close() */ + sk->state_change(sk); + } else { + int tmo; + + if (tp->linger2 < 0 || + (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq && + after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt))) { + tcp_done(sk); + NET_INC_STATS_BH(TCPAbortOnData); + return 1; + } + + tmo = tcp_fin_time(tp); + if (tmo > TCP_TIMEWAIT_LEN) { + tcp_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN); + } else if (th->fin || sk->lock.users) { + /* Bad case. We could lose such FIN otherwise. + * It is not a big problem, but it looks confusing + * and not so rare event. We still can lose it now, + * if it spins in bh_lock_sock(), but it is really + * marginal case. + */ + tcp_reset_keepalive_timer(sk, tmo); + } else { + tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); + goto discard; + } + } + } + break; + + case TCP_CLOSING: + if (tp->snd_una == tp->write_seq) { + tcp_time_wait(sk, TCP_TIME_WAIT, 0); + goto discard; + } + break; + + case TCP_LAST_ACK: + if (tp->snd_una == tp->write_seq) { + tcp_update_metrics(sk); + tcp_done(sk); + goto discard; + } + break; + } + } else + goto discard; + + /* step 6: check the URG bit */ + tcp_urg(sk, skb, th); + + /* step 7: process the segment text */ + switch (sk->state) { + case TCP_CLOSE_WAIT: + case TCP_CLOSING: + case TCP_LAST_ACK: + if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) + break; + case TCP_FIN_WAIT1: + case TCP_FIN_WAIT2: + /* RFC 793 says to queue data in these states, + * RFC 1122 says we MUST send a reset. + * BSD 4.4 also does reset. + */ + if (sk->shutdown & RCV_SHUTDOWN) { + if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq && + after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) { + NET_INC_STATS_BH(TCPAbortOnData); + tcp_reset(sk); + return 1; + } + } + /* Fall through */ + case TCP_ESTABLISHED: + tcp_data_queue(sk, skb); + queued = 1; + break; + } + + /* tcp_data could move socket to TIME-WAIT */ + if (sk->state != TCP_CLOSE) { + tcp_data_snd_check(sk); + tcp_ack_snd_check(sk); + } + + if (!queued) { +discard: + __kfree_skb(skb); + } + return 0; +#else + return 0; +#endif +} diff --git a/reactos/drivers/net/tcpip/transport/tcp/tcp_ipv4.c b/reactos/drivers/net/tcpip/transport/tcp/tcp_ipv4.c new file mode 100755 index 00000000000..cf45c02046b --- /dev/null +++ b/reactos/drivers/net/tcpip/transport/tcp/tcp_ipv4.c @@ -0,0 +1,2523 @@ +/* + * COPYRIGHT: See COPYING in the top level directory + * PROJECT: ReactOS TCP/IP protocol driver + * FILE: transport/tcp/tcp_ipv4.c + * PURPOSE: Transmission Control Protocol + * PROGRAMMERS: Casper S. Hornstrup (chorns@users.sourceforge.net) + * REVISIONS: + * CSH 15-01-2003 Imported from linux kernel 2.4.20 + */ + +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * Implementation of the Transmission Control Protocol(TCP). 
+ * + * Version: $Id: tcp_ipv4.c,v 1.1 2003/01/15 21:57:31 chorns Exp $ + * + * IPv4 specific functions + * + * + * code split from: + * linux/ipv4/tcp.c + * linux/ipv4/tcp_input.c + * linux/ipv4/tcp_output.c + * + * See tcp.c for author information + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +/* + * Changes: + * David S. Miller : New socket lookup architecture. + * This code is dedicated to John Dyson. + * David S. Miller : Change semantics of established hash, + * half is devoted to TIME_WAIT sockets + * and the rest go in the other half. + * Andi Kleen : Add support for syncookies and fixed + * some bugs: ip options weren't passed to + * the TCP layer, missed a check for an ACK bit. + * Andi Kleen : Implemented fast path mtu discovery. + * Fixed many serious bugs in the + * open_request handling and moved + * most of it into the af independent code. + * Added tail drop and some other bugfixes. + * Added new listen sematics. + * Mike McLagan : Routing by source + * Juan Jose Ciarlante: ip_dynaddr bits + * Andi Kleen: various fixes. + * Vitaly E. Lavrov : Transparent proxy revived after year coma. + * Andi Kleen : Fix new listen. + * Andi Kleen : Fix accept error reporting. + */ + +#if 0 +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include +#else +#include "linux.h" +#include "tcpcore.h" +#endif + +extern int sysctl_ip_dynaddr; +extern int sysctl_ip_default_ttl; +int sysctl_tcp_tw_reuse = 0; + +/* Check TCP sequence numbers in ICMP packets. */ +#define ICMP_MIN_LENGTH 8 + +/* Socket used for sending RSTs */ +#if 0 +static struct inode tcp_inode; +static struct socket *tcp_socket=&tcp_inode.u.socket_i; +#endif + +void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len, + struct sk_buff *skb); + +/* + * ALL members must be initialised to prevent gcc-2.7.2.3 miscompilation + */ +#if 0 +struct tcp_hashinfo __cacheline_aligned tcp_hashinfo = { + __tcp_ehash: NULL, + __tcp_bhash: NULL, + __tcp_bhash_size: 0, + __tcp_ehash_size: 0, + __tcp_listening_hash: { NULL, }, + __tcp_lhash_lock: RW_LOCK_UNLOCKED, + __tcp_lhash_users: ATOMIC_INIT(0), + __tcp_lhash_wait: + __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.__tcp_lhash_wait), + __tcp_portalloc_lock: SPIN_LOCK_UNLOCKED +}; +#endif + +/* + * This array holds the first and last local port number. + * For high-usage systems, use sysctl to change this to + * 32768-61000 + */ +int sysctl_local_port_range[2] = { 1024, 4999 }; +int tcp_port_rover = (1024 - 1); + +static __inline__ int tcp_hashfn(__u32 laddr, __u16 lport, + __u32 faddr, __u16 fport) +{ + int h = ((laddr ^ lport) ^ (faddr ^ fport)); + h ^= h>>16; + h ^= h>>8; + return h & (tcp_ehash_size - 1); +} + +static __inline__ int tcp_sk_hashfn(struct sock *sk) +{ + __u32 laddr = sk->rcv_saddr; + __u16 lport = sk->num; + __u32 faddr = sk->daddr; + __u16 fport = sk->dport; + + return tcp_hashfn(laddr, lport, faddr, fport); +} + +/* Allocate and initialize a new TCP local port bind bucket. + * The bindhash mutex for snum's hash chain must be held here. 
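+ *
+ * [Illustrative aside, not part of the imported Linux code:
+ *  tcp_bucket_create() below, like the other hash-chain code in this
+ *  file, uses the "pprev" idiom: each node stores the address of the
+ *  pointer that points at it, so unlinking works the same way whether or
+ *  not the node is first in its chain.  demo_* names are hypothetical.]
+ */
+#if 0 /* illustrative sketch, never compiled */
+struct demo_node { struct demo_node *next, **pprev; };
+
+static void demo_link(struct demo_node **chain, struct demo_node *n)
+{
+ if ((n->next = *chain) != NULL)
+  n->next->pprev = &n->next;
+ *chain = n;
+ n->pprev = chain;
+}
+
+static void demo_unlink(struct demo_node *n)
+{
+ if (n->next)
+  n->next->pprev = n->pprev;
+ *n->pprev = n->next;
+ n->pprev = NULL;
+}
+#endif /* illustrative sketch */
+/* (end of illustrative aside)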
+ */ +struct tcp_bind_bucket *tcp_bucket_create(struct tcp_bind_hashbucket *head, + unsigned short snum) +{ +#if 0 + struct tcp_bind_bucket *tb; + + tb = kmem_cache_alloc(tcp_bucket_cachep, SLAB_ATOMIC); + if(tb != NULL) { + tb->port = snum; + tb->fastreuse = 0; + tb->owners = NULL; + if((tb->next = head->chain) != NULL) + tb->next->pprev = &tb->next; + head->chain = tb; + tb->pprev = &head->chain; + } + return tb; +#else + return NULL; +#endif +} + +/* Caller must disable local BH processing. */ +static __inline__ void __tcp_inherit_port(struct sock *sk, struct sock *child) +{ +#if 0 + struct tcp_bind_hashbucket *head = &tcp_bhash[tcp_bhashfn(child->num)]; + struct tcp_bind_bucket *tb; + + spin_lock(&head->lock); + tb = (struct tcp_bind_bucket *)sk->prev; + if ((child->bind_next = tb->owners) != NULL) + tb->owners->bind_pprev = &child->bind_next; + tb->owners = child; + child->bind_pprev = &tb->owners; + child->prev = (struct sock *) tb; + spin_unlock(&head->lock); +#endif +} + +__inline__ void tcp_inherit_port(struct sock *sk, struct sock *child) +{ +#if 0 + local_bh_disable(); + __tcp_inherit_port(sk, child); + local_bh_enable(); +#endif +} + +static inline void tcp_bind_hash(struct sock *sk, struct tcp_bind_bucket *tb, unsigned short snum) +{ +#if 0 + sk->num = snum; + if ((sk->bind_next = tb->owners) != NULL) + tb->owners->bind_pprev = &sk->bind_next; + tb->owners = sk; + sk->bind_pprev = &tb->owners; + sk->prev = (struct sock *) tb; +#endif +} + +static inline int tcp_bind_conflict(struct sock *sk, struct tcp_bind_bucket *tb) +{ +#if 0 + struct sock *sk2 = tb->owners; + int sk_reuse = sk->reuse; + + for( ; sk2 != NULL; sk2 = sk2->bind_next) { + if (sk != sk2 && + sk2->reuse <= 1 && + sk->bound_dev_if == sk2->bound_dev_if) { + if (!sk_reuse || + !sk2->reuse || + sk2->state == TCP_LISTEN) { + if (!sk2->rcv_saddr || + !sk->rcv_saddr || + (sk2->rcv_saddr == sk->rcv_saddr)) + break; + } + } + } + return sk2 != NULL; +#else + return 0; +#endif +} + +/* Obtain a reference to a local port for the given sock, + * if snum is zero it means select any available local port. + */ +static int tcp_v4_get_port(struct sock *sk, unsigned short snum) +{ +#if 0 + struct tcp_bind_hashbucket *head; + struct tcp_bind_bucket *tb; + int ret; + + local_bh_disable(); + if (snum == 0) { + int low = sysctl_local_port_range[0]; + int high = sysctl_local_port_range[1]; + int remaining = (high - low) + 1; + int rover; + + spin_lock(&tcp_portalloc_lock); + rover = tcp_port_rover; + do { rover++; + if ((rover < low) || (rover > high)) + rover = low; + head = &tcp_bhash[tcp_bhashfn(rover)]; + spin_lock(&head->lock); + for (tb = head->chain; tb; tb = tb->next) + if (tb->port == rover) + goto next; + break; + next: + spin_unlock(&head->lock); + } while (--remaining > 0); + tcp_port_rover = rover; + spin_unlock(&tcp_portalloc_lock); + + /* Exhausted local port range during search? */ + ret = 1; + if (remaining <= 0) + goto fail; + + /* OK, here is the one we will use. HEAD is + * non-NULL and we hold it's mutex. 
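+ *
+ * [Illustrative aside, not part of the imported Linux code: the search
+ *  that just finished walks a "rover" cyclically through the
+ *  sysctl_local_port_range and stops at the first port that has no bind
+ *  bucket at all; ports that already have a bucket are checked for
+ *  conflicts instead.  The condensed, hypothetical demo_* version:]
+ */
+#if 0 /* illustrative sketch, never compiled */
+/* @port_free reports whether a port has no bind bucket at all. */
+static int demo_pick_local_port(int low, int high, int *rover,
+                                int (*port_free)(int port))
+{
+ int remaining = (high - low) + 1;
+ do {
+  (*rover)++;
+  if (*rover < low || *rover > high)
+   *rover = low;
+  if (port_free(*rover))
+   return *rover;
+ } while (--remaining > 0);
+ return -1;   /* ephemeral port range exhausted */
+}
+#endif /* illustrative sketch */
+/* (end of illustrative aside)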
+ */ + snum = rover; + tb = NULL; + } else { + head = &tcp_bhash[tcp_bhashfn(snum)]; + spin_lock(&head->lock); + for (tb = head->chain; tb != NULL; tb = tb->next) + if (tb->port == snum) + break; + } + if (tb != NULL && tb->owners != NULL) { + if (sk->reuse > 1) + goto success; + if (tb->fastreuse > 0 && sk->reuse != 0 && sk->state != TCP_LISTEN) { + goto success; + } else { + ret = 1; + if (tcp_bind_conflict(sk, tb)) + goto fail_unlock; + } + } + ret = 1; + if (tb == NULL && + (tb = tcp_bucket_create(head, snum)) == NULL) + goto fail_unlock; + if (tb->owners == NULL) { + if (sk->reuse && sk->state != TCP_LISTEN) + tb->fastreuse = 1; + else + tb->fastreuse = 0; + } else if (tb->fastreuse && + ((sk->reuse == 0) || (sk->state == TCP_LISTEN))) + tb->fastreuse = 0; +success: + if (sk->prev == NULL) + tcp_bind_hash(sk, tb, snum); + BUG_TRAP(sk->prev == (struct sock *) tb); + ret = 0; + +fail_unlock: + spin_unlock(&head->lock); +fail: + local_bh_enable(); + return ret; +#else + return 0; +#endif +} + +/* Get rid of any references to a local port held by the + * given sock. + */ +__inline__ void __tcp_put_port(struct sock *sk) +{ +#if 0 + struct tcp_bind_hashbucket *head = &tcp_bhash[tcp_bhashfn(sk->num)]; + struct tcp_bind_bucket *tb; + + spin_lock(&head->lock); + tb = (struct tcp_bind_bucket *) sk->prev; + if (sk->bind_next) + sk->bind_next->bind_pprev = sk->bind_pprev; + *(sk->bind_pprev) = sk->bind_next; + sk->prev = NULL; + sk->num = 0; + if (tb->owners == NULL) { + if (tb->next) + tb->next->pprev = tb->pprev; + *(tb->pprev) = tb->next; + kmem_cache_free(tcp_bucket_cachep, tb); + } + spin_unlock(&head->lock); +#endif +} + +void tcp_put_port(struct sock *sk) +{ +#if 0 + local_bh_disable(); + __tcp_put_port(sk); + local_bh_enable(); +#endif +} + +/* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it can be very bad on SMP. + * Look, when several writers sleep and reader wakes them up, all but one + * immediately hit write lock and grab all the cpus. Exclusive sleep solves + * this, _but_ remember, it adds useless work on UP machines (wake up each + * exclusive lock release). It should be ifdefed really. 
+ */ + +void tcp_listen_wlock(void) +{ +#if 0 + write_lock(&tcp_lhash_lock); + + if (atomic_read(&tcp_lhash_users)) { + DECLARE_WAITQUEUE(wait, current); + + add_wait_queue_exclusive(&tcp_lhash_wait, &wait); + for (;;) { + set_current_state(TASK_UNINTERRUPTIBLE); + if (atomic_read(&tcp_lhash_users) == 0) + break; + write_unlock_bh(&tcp_lhash_lock); + schedule(); + write_lock_bh(&tcp_lhash_lock); + } + + __set_current_state(TASK_RUNNING); + remove_wait_queue(&tcp_lhash_wait, &wait); + } +#endif +} + +static __inline__ void __tcp_v4_hash(struct sock *sk, const int listen_possible) +{ +#if 0 + struct sock **skp; + rwlock_t *lock; + + BUG_TRAP(sk->pprev==NULL); + if(listen_possible && sk->state == TCP_LISTEN) { + skp = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)]; + lock = &tcp_lhash_lock; + tcp_listen_wlock(); + } else { + skp = &tcp_ehash[(sk->hashent = tcp_sk_hashfn(sk))].chain; + lock = &tcp_ehash[sk->hashent].lock; + write_lock(lock); + } + if((sk->next = *skp) != NULL) + (*skp)->pprev = &sk->next; + *skp = sk; + sk->pprev = skp; + sock_prot_inc_use(sk->prot); + write_unlock(lock); + if (listen_possible && sk->state == TCP_LISTEN) + wake_up(&tcp_lhash_wait); +#endif +} + +static void tcp_v4_hash(struct sock *sk) +{ +#if 0 + if (sk->state != TCP_CLOSE) { + local_bh_disable(); + __tcp_v4_hash(sk, 1); + local_bh_enable(); + } +#endif +} + +void tcp_unhash(struct sock *sk) +{ +#if 0 + rwlock_t *lock; + + if (!sk->pprev) + goto ende; + + if (sk->state == TCP_LISTEN) { + local_bh_disable(); + tcp_listen_wlock(); + lock = &tcp_lhash_lock; + } else { + struct tcp_ehash_bucket *head = &tcp_ehash[sk->hashent]; + lock = &head->lock; + write_lock_bh(&head->lock); + } + + if(sk->pprev) { + if(sk->next) + sk->next->pprev = sk->pprev; + *sk->pprev = sk->next; + sk->pprev = NULL; + sock_prot_dec_use(sk->prot); + } + write_unlock_bh(lock); + + ende: + if (sk->state == TCP_LISTEN) + wake_up(&tcp_lhash_wait); +#endif +} + +/* Don't inline this cruft. Here are some nice properties to + * exploit here. The BSD API does not allow a listening TCP + * to specify the remote port nor the remote address for the + * connection. So always assume those are both wildcarded + * during the search since they can never be otherwise. + */ +static struct sock *__tcp_v4_lookup_listener(struct sock *sk, u32 daddr, unsigned short hnum, int dif) +{ +#if 0 + struct sock *result = NULL; + int score, hiscore; + + hiscore=0; + for(; sk; sk = sk->next) { + if(sk->num == hnum) { + __u32 rcv_saddr = sk->rcv_saddr; + + score = 1; + if(rcv_saddr) { + if (rcv_saddr != daddr) + continue; + score++; + } + if (sk->bound_dev_if) { + if (sk->bound_dev_if != dif) + continue; + score++; + } + if (score == 3) + return sk; + if (score > hiscore) { + hiscore = score; + result = sk; + } + } + } + return result; +#else + return NULL; +#endif +} + +/* Optimize the common listener case. */ +__inline__ struct sock *tcp_v4_lookup_listener(u32 daddr, unsigned short hnum, int dif) +{ +#if 0 + struct sock *sk; + + read_lock(&tcp_lhash_lock); + sk = tcp_listening_hash[tcp_lhashfn(hnum)]; + if (sk) { + if (sk->num == hnum && + sk->next == NULL && + (!sk->rcv_saddr || sk->rcv_saddr == daddr) && + !sk->bound_dev_if) + goto sherry_cache; + sk = __tcp_v4_lookup_listener(sk, daddr, hnum, dif); + } + if (sk) { +sherry_cache: + sock_hold(sk); + } + read_unlock(&tcp_lhash_lock); + return sk; +#else + return NULL; +#endif +} + +/* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so + * we need not check it for TCP lookups anymore, thanks Alexey. 
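+ *
+ * [Illustrative aside, not part of the imported Linux code: it condenses
+ *  the scoring used by __tcp_v4_lookup_listener() above.  A candidate
+ *  gets one point for the matching port, one more for an exact local
+ *  address, one more for an exact bound device; a mismatch on a bound
+ *  address or device disqualifies it, and a score of 3 ends the search
+ *  early.  The demo_* name is hypothetical.]
+ */
+#if 0 /* illustrative sketch, never compiled */
+static int demo_listener_score(u32 bound_addr, u32 dst_addr,
+                               int bound_dev, int in_dev)
+{
+ int score = 1;                 /* local port already matched       */
+ if (bound_addr) {
+  if (bound_addr != dst_addr)
+   return -1;                   /* bound to a different address     */
+  score++;
+ }
+ if (bound_dev) {
+  if (bound_dev != in_dev)
+   return -1;                   /* bound to a different interface   */
+  score++;
+ }
+ return score;                  /* 3 == exact match                 */
+}
+#endif /* illustrative sketch */
+/* (end of illustrative aside) The imported comment continues: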
-DaveM + * + * Local BH must be disabled here. + */ + +static inline struct sock *__tcp_v4_lookup_established(u32 saddr, u16 sport, + u32 daddr, u16 hnum, int dif) +{ +#if 0 + struct tcp_ehash_bucket *head; + TCP_V4_ADDR_COOKIE(acookie, saddr, daddr) + __u32 ports = TCP_COMBINED_PORTS(sport, hnum); + struct sock *sk; + int hash; + + /* Optimize here for direct hit, only listening connections can + * have wildcards anyways. + */ + hash = tcp_hashfn(daddr, hnum, saddr, sport); + head = &tcp_ehash[hash]; + read_lock(&head->lock); + for(sk = head->chain; sk; sk = sk->next) { + if(TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif)) + goto hit; /* You sunk my battleship! */ + } + + /* Must check for a TIME_WAIT'er before going to listener hash. */ + for(sk = (head + tcp_ehash_size)->chain; sk; sk = sk->next) + if(TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif)) + goto hit; + read_unlock(&head->lock); + + return NULL; + +hit: + sock_hold(sk); + read_unlock(&head->lock); + return sk; +#else + return NULL; +#endif +} + +static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport, + u32 daddr, u16 hnum, int dif) +{ +#if 0 + struct sock *sk; + + sk = __tcp_v4_lookup_established(saddr, sport, daddr, hnum, dif); + + if (sk) + return sk; + + return tcp_v4_lookup_listener(daddr, hnum, dif); +#else + return NULL; +#endif +} + +__inline__ struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport, int dif) +{ +#if 0 + struct sock *sk; + + local_bh_disable(); + sk = __tcp_v4_lookup(saddr, sport, daddr, ntohs(dport), dif); + local_bh_enable(); + + return sk; +#else + return NULL; +#endif +} + +static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb) +{ +#if 0 + return secure_tcp_sequence_number(skb->nh.iph->daddr, + skb->nh.iph->saddr, + skb->h.th->dest, + skb->h.th->source); +#else + return 0; +#endif +} + +/* called with local bh disabled */ +static int __tcp_v4_check_established(struct sock *sk, __u16 lport, + struct tcp_tw_bucket **twp) +{ +#if 0 + u32 daddr = sk->rcv_saddr; + u32 saddr = sk->daddr; + int dif = sk->bound_dev_if; + TCP_V4_ADDR_COOKIE(acookie, saddr, daddr) + __u32 ports = TCP_COMBINED_PORTS(sk->dport, lport); + int hash = tcp_hashfn(daddr, lport, saddr, sk->dport); + struct tcp_ehash_bucket *head = &tcp_ehash[hash]; + struct sock *sk2, **skp; + struct tcp_tw_bucket *tw; + + write_lock(&head->lock); + + /* Check TIME-WAIT sockets first. */ + for(skp = &(head + tcp_ehash_size)->chain; (sk2=*skp) != NULL; + skp = &sk2->next) { + tw = (struct tcp_tw_bucket*)sk2; + + if(TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif)) { + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + + /* With PAWS, it is safe from the viewpoint + of data integrity. Even without PAWS it + is safe provided sequence spaces do not + overlap i.e. at data rates <= 80Mbit/sec. + + Actually, the idea is close to VJ's one, + only timestamp cache is held not per host, + but per port pair and TW bucket is used + as state holder. + + If TW bucket has been already destroyed we + fall back to VJ's scheme and use initial + timestamp retrieved from peer table. + */ + if (tw->ts_recent_stamp && + (!twp || (sysctl_tcp_tw_reuse && + xtime.tv_sec - tw->ts_recent_stamp > 1))) { + if ((tp->write_seq = tw->snd_nxt+65535+2) == 0) + tp->write_seq = 1; + tp->ts_recent = tw->ts_recent; + tp->ts_recent_stamp = tw->ts_recent_stamp; + sock_hold(sk2); + skp = &head->chain; + goto unique; + } else + goto not_unique; + } + } + tw = NULL; + + /* And established part... 
*/ + for(skp = &head->chain; (sk2=*skp)!=NULL; skp = &sk2->next) { + if(TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif)) + goto not_unique; + } + +unique: + /* Must record num and sport now. Otherwise we will see + * in hash table socket with a funny identity. */ + sk->num = lport; + sk->sport = htons(lport); + BUG_TRAP(sk->pprev==NULL); + if ((sk->next = *skp) != NULL) + (*skp)->pprev = &sk->next; + + *skp = sk; + sk->pprev = skp; + sk->hashent = hash; + sock_prot_inc_use(sk->prot); + write_unlock(&head->lock); + + if (twp) { + *twp = tw; + NET_INC_STATS_BH(TimeWaitRecycled); + } else if (tw) { + /* Silly. Should hash-dance instead... */ + tcp_tw_deschedule(tw); + tcp_timewait_kill(tw); + NET_INC_STATS_BH(TimeWaitRecycled); + + tcp_tw_put(tw); + } + + return 0; + +not_unique: + write_unlock(&head->lock); + return -EADDRNOTAVAIL; +#else + return 0; +#endif +} + +/* + * Bind a port for a connect operation and hash it. + */ +static int tcp_v4_hash_connect(struct sock *sk) +{ +#if 0 + unsigned short snum = sk->num; + struct tcp_bind_hashbucket *head; + struct tcp_bind_bucket *tb; + + if (snum == 0) { + int rover; + int low = sysctl_local_port_range[0]; + int high = sysctl_local_port_range[1]; + int remaining = (high - low) + 1; + struct tcp_tw_bucket *tw = NULL; + + local_bh_disable(); + + /* TODO. Actually it is not so bad idea to remove + * tcp_portalloc_lock before next submission to Linus. + * As soon as we touch this place at all it is time to think. + * + * Now it protects single _advisory_ variable tcp_port_rover, + * hence it is mostly useless. + * Code will work nicely if we just delete it, but + * I am afraid in contented case it will work not better or + * even worse: another cpu just will hit the same bucket + * and spin there. + * So some cpu salt could remove both contention and + * memory pingpong. Any ideas how to do this in a nice way? + */ + spin_lock(&tcp_portalloc_lock); + rover = tcp_port_rover; + + do { + rover++; + if ((rover < low) || (rover > high)) + rover = low; + head = &tcp_bhash[tcp_bhashfn(rover)]; + spin_lock(&head->lock); + + /* Does not bother with rcv_saddr checks, + * because the established check is already + * unique enough. + */ + for (tb = head->chain; tb; tb = tb->next) { + if (tb->port == rover) { + BUG_TRAP(tb->owners != NULL); + if (tb->fastreuse >= 0) + goto next_port; + if (!__tcp_v4_check_established(sk, rover, &tw)) + goto ok; + goto next_port; + } + } + + tb = tcp_bucket_create(head, rover); + if (!tb) { + spin_unlock(&head->lock); + break; + } + tb->fastreuse = -1; + goto ok; + + next_port: + spin_unlock(&head->lock); + } while (--remaining > 0); + tcp_port_rover = rover; + spin_unlock(&tcp_portalloc_lock); + + local_bh_enable(); + + return -EADDRNOTAVAIL; + + ok: + /* All locks still held and bhs disabled */ + tcp_port_rover = rover; + spin_unlock(&tcp_portalloc_lock); + + tcp_bind_hash(sk, tb, rover); + if (!sk->pprev) { + sk->sport = htons(rover); + __tcp_v4_hash(sk, 0); + } + spin_unlock(&head->lock); + + if (tw) { + tcp_tw_deschedule(tw); + tcp_timewait_kill(tw); + tcp_tw_put(tw); + } + + local_bh_enable(); + return 0; + } + + head = &tcp_bhash[tcp_bhashfn(snum)]; + tb = (struct tcp_bind_bucket *)sk->prev; + spin_lock_bh(&head->lock); + if (tb->owners == sk && sk->bind_next == NULL) { + __tcp_v4_hash(sk, 0); + spin_unlock_bh(&head->lock); + return 0; + } else { + int ret; + spin_unlock(&head->lock); + /* No definite answer... 
Walk to established hash table */ + ret = __tcp_v4_check_established(sk, snum, NULL); + local_bh_enable(); + return ret; + } +#else + return 0; +#endif +} + +/* This will initiate an outgoing connection. */ +int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) +{ +#if 0 + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + struct sockaddr_in *usin = (struct sockaddr_in *) uaddr; + struct rtable *rt; + u32 daddr, nexthop; + int tmp; + int err; + + if (addr_len < sizeof(struct sockaddr_in)) + return(-EINVAL); + + if (usin->sin_family != AF_INET) + return(-EAFNOSUPPORT); + + nexthop = daddr = usin->sin_addr.s_addr; + if (sk->protinfo.af_inet.opt && sk->protinfo.af_inet.opt->srr) { + if (daddr == 0) + return -EINVAL; + nexthop = sk->protinfo.af_inet.opt->faddr; + } + + tmp = ip_route_connect(&rt, nexthop, sk->saddr, + RT_CONN_FLAGS(sk), sk->bound_dev_if); + if (tmp < 0) + return tmp; + + if (rt->rt_flags&(RTCF_MULTICAST|RTCF_BROADCAST)) { + ip_rt_put(rt); + return -ENETUNREACH; + } + + __sk_dst_set(sk, &rt->u.dst); + sk->route_caps = rt->u.dst.dev->features; + + if (!sk->protinfo.af_inet.opt || !sk->protinfo.af_inet.opt->srr) + daddr = rt->rt_dst; + + if (!sk->saddr) + sk->saddr = rt->rt_src; + sk->rcv_saddr = sk->saddr; + + if (tp->ts_recent_stamp && sk->daddr != daddr) { + /* Reset inherited state */ + tp->ts_recent = 0; + tp->ts_recent_stamp = 0; + tp->write_seq = 0; + } + + if (sysctl_tcp_tw_recycle && + !tp->ts_recent_stamp && + rt->rt_dst == daddr) { + struct inet_peer *peer = rt_get_peer(rt); + + /* VJ's idea. We save last timestamp seen from + * the destination in peer table, when entering state TIME-WAIT + * and initialize ts_recent from it, when trying new connection. + */ + + if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) { + tp->ts_recent_stamp = peer->tcp_ts_stamp; + tp->ts_recent = peer->tcp_ts; + } + } + + sk->dport = usin->sin_port; + sk->daddr = daddr; + + tp->ext_header_len = 0; + if (sk->protinfo.af_inet.opt) + tp->ext_header_len = sk->protinfo.af_inet.opt->optlen; + + tp->mss_clamp = 536; + + /* Socket identity is still unknown (sport may be zero). + * However we set state to SYN-SENT and not releasing socket + * lock select source port, enter ourselves into the hash tables and + * complete initalization after this. 
+ */ + tcp_set_state(sk, TCP_SYN_SENT); + err = tcp_v4_hash_connect(sk); + if (err) + goto failure; + + if (!tp->write_seq) + tp->write_seq = secure_tcp_sequence_number(sk->saddr, sk->daddr, + sk->sport, usin->sin_port); + + sk->protinfo.af_inet.id = tp->write_seq^jiffies; + + err = tcp_connect(sk); + if (err) + goto failure; + + return 0; + +failure: + tcp_set_state(sk, TCP_CLOSE); + __sk_dst_reset(sk); + sk->route_caps = 0; + sk->dport = 0; + return err; +#else + return 0; +#endif +} + +static __inline__ int tcp_v4_iif(struct sk_buff *skb) +{ +#if 0 + return ((struct rtable*)skb->dst)->rt_iif; +#else + return 0; +#endif +} + +static __inline__ unsigned tcp_v4_synq_hash(u32 raddr, u16 rport) +{ +#if 0 + unsigned h = raddr ^ rport; + h ^= h>>16; + h ^= h>>8; + return h&(TCP_SYNQ_HSIZE-1); +#else + return 0; +#endif +} + +static struct open_request *tcp_v4_search_req(struct tcp_opt *tp, + struct open_request ***prevp, + __u16 rport, + __u32 raddr, __u32 laddr) +{ +#if 0 + struct tcp_listen_opt *lopt = tp->listen_opt; + struct open_request *req, **prev; + + for (prev = &lopt->syn_table[tcp_v4_synq_hash(raddr, rport)]; + (req = *prev) != NULL; + prev = &req->dl_next) { + if (req->rmt_port == rport && + req->af.v4_req.rmt_addr == raddr && + req->af.v4_req.loc_addr == laddr && + TCP_INET_FAMILY(req->class->family)) { + BUG_TRAP(req->sk == NULL); + *prevp = prev; + return req; + } + } + + return NULL; +#else + return NULL; +#endif +} + +static void tcp_v4_synq_add(struct sock *sk, struct open_request *req) +{ +#if 0 + struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; + struct tcp_listen_opt *lopt = tp->listen_opt; + unsigned h = tcp_v4_synq_hash(req->af.v4_req.rmt_addr, req->rmt_port); + + req->expires = jiffies + TCP_TIMEOUT_INIT; + req->retrans = 0; + req->sk = NULL; + req->dl_next = lopt->syn_table[h]; + + write_lock(&tp->syn_wait_lock); + lopt->syn_table[h] = req; + write_unlock(&tp->syn_wait_lock); + + tcp_synq_added(sk); +#endif +} + + +/* + * This routine does path mtu discovery as defined in RFC1191. + */ +static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *ip, unsigned mtu) +{ +#if 0 + struct dst_entry *dst; + struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; + + /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs + * send out by Linux are always <576bytes so they should go through + * unfragmented). + */ + if (sk->state == TCP_LISTEN) + return; + + /* We don't check in the destentry if pmtu discovery is forbidden + * on this route. We just assume that no packet_to_big packets + * are send back when pmtu discovery is not active. + * There is a small race when the user changes this flag in the + * route, but I think that's acceptable. + */ + if ((dst = __sk_dst_check(sk, 0)) == NULL) + return; + + ip_rt_update_pmtu(dst, mtu); + + /* Something is about to be wrong... Remember soft error + * for the case, if this connection will not able to recover. + */ + if (mtu < dst->pmtu && ip_dont_fragment(sk, dst)) + sk->err_soft = EMSGSIZE; + + if (sk->protinfo.af_inet.pmtudisc != IP_PMTUDISC_DONT && + tp->pmtu_cookie > dst->pmtu) { + tcp_sync_mss(sk, dst->pmtu); + + /* Resend the TCP packet because it's + * clear that the old packet has been + * dropped. This is the new "fast" path mtu + * discovery. + */ + tcp_simple_retransmit(sk); + } /* else let the usual retransmit timer handle it */ +#endif +} + +/* + * This routine is called by the ICMP module when it gets some + * sort of error condition. If err < 0 then the socket should + * be closed and the error returned to the user. 
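+ *
+ * [Illustrative aside, not part of the imported Linux code: before the
+ *  ICMP handler below, do_pmtu_discovery() above reacts to an ICMP
+ *  "fragmentation needed" roughly as sketched here: if the reported MTU
+ *  undercuts the cached path MTU, the MSS is re-synced and the dropped
+ *  packet is retransmitted at once instead of waiting for the
+ *  retransmission timer.  demo_* names and the callbacks are hypothetical.]
+ */
+#if 0 /* illustrative sketch, never compiled */
+static void demo_pmtu_event(unsigned int reported_mtu,
+                            unsigned int *cached_pmtu,
+                            void (*sync_mss)(unsigned int mtu),
+                            void (*retransmit_now)(void))
+{
+ if (reported_mtu >= *cached_pmtu)
+  return;                   /* nothing shrank: let the timers handle it */
+ *cached_pmtu = reported_mtu;
+ sync_mss(reported_mtu);    /* corresponds to tcp_sync_mss() above      */
+ retransmit_now();          /* "fast" PMTU discovery: resend right away */
+}
+#endif /* illustrative sketch */
+/* (end of illustrative aside) The imported comment continues: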
If err > 0 + * it's just the icmp type << 8 | icmp code. After adjustment + * header points to the first 8 bytes of the tcp header. We need + * to find the appropriate port. + * + * The locking strategy used here is very "optimistic". When + * someone else accesses the socket the ICMP is just dropped + * and for some paths there is no check at all. + * A more general error queue to queue errors for later handling + * is probably better. + * + */ + +void tcp_v4_err(struct sk_buff *skb, u32 info) +{ +#if 0 + struct iphdr *iph = (struct iphdr*)skb->data; + struct tcphdr *th = (struct tcphdr*)(skb->data+(iph->ihl<<2)); + struct tcp_opt *tp; + int type = skb->h.icmph->type; + int code = skb->h.icmph->code; + struct sock *sk; + __u32 seq; + int err; + + if (skb->len < (iph->ihl << 2) + 8) { + ICMP_INC_STATS_BH(IcmpInErrors); + return; + } + + sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr, th->source, tcp_v4_iif(skb)); + if (sk == NULL) { + ICMP_INC_STATS_BH(IcmpInErrors); + return; + } + if (sk->state == TCP_TIME_WAIT) { + tcp_tw_put((struct tcp_tw_bucket*)sk); + return; + } + + bh_lock_sock(sk); + /* If too many ICMPs get dropped on busy + * servers this needs to be solved differently. + */ + if (sk->lock.users != 0) + NET_INC_STATS_BH(LockDroppedIcmps); + + if (sk->state == TCP_CLOSE) + goto out; + + tp = &sk->tp_pinfo.af_tcp; + seq = ntohl(th->seq); + if (sk->state != TCP_LISTEN && !between(seq, tp->snd_una, tp->snd_nxt)) { + NET_INC_STATS(OutOfWindowIcmps); + goto out; + } + + switch (type) { + case ICMP_SOURCE_QUENCH: + /* This is deprecated, but if someone generated it, + * we have no reasons to ignore it. + */ + if (sk->lock.users == 0) + tcp_enter_cwr(tp); + goto out; + case ICMP_PARAMETERPROB: + err = EPROTO; + break; + case ICMP_DEST_UNREACH: + if (code > NR_ICMP_UNREACH) + goto out; + + if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */ + if (sk->lock.users == 0) + do_pmtu_discovery(sk, iph, info); + goto out; + } + + err = icmp_err_convert[code].errno; + break; + case ICMP_TIME_EXCEEDED: + err = EHOSTUNREACH; + break; + default: + goto out; + } + + switch (sk->state) { + struct open_request *req, **prev; + case TCP_LISTEN: + if (sk->lock.users != 0) + goto out; + + req = tcp_v4_search_req(tp, &prev, + th->dest, + iph->daddr, iph->saddr); + if (!req) + goto out; + + /* ICMPs are not backlogged, hence we cannot get + an established socket here. + */ + BUG_TRAP(req->sk == NULL); + + if (seq != req->snt_isn) { + NET_INC_STATS_BH(OutOfWindowIcmps); + goto out; + } + + /* + * Still in SYN_RECV, just remove it silently. + * There is no good way to pass the error to the newly + * created socket, and POSIX does not want network + * errors returned from accept(). + */ + tcp_synq_drop(sk, req, prev); + goto out; + + case TCP_SYN_SENT: + case TCP_SYN_RECV: /* Cannot happen. + It can f.e. if SYNs crossed. + */ + if (sk->lock.users == 0) { + TCP_INC_STATS_BH(TcpAttemptFails); + sk->err = err; + + sk->error_report(sk); + + tcp_done(sk); + } else { + sk->err_soft = err; + } + goto out; + } + + /* If we've already connected we will keep trying + * until we time out, or the user gives up. + * + * rfc1122 4.2.3.9 allows to consider as hard errors + * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too, + * but it is obsoleted by pmtu discovery). 
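+ * (In ICMP terms those are type 3, codes 2 and 3; code 4, fragmentation
+ * needed, is handled separately by the PMTU discovery path above.)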
+ * + * Note, that in modern internet, where routing is unreliable + * and in each dark corner broken firewalls sit, sending random + * errors ordered by their masters even this two messages finally lose + * their original sense (even Linux sends invalid PORT_UNREACHs) + * + * Now we are in compliance with RFCs. + * --ANK (980905) + */ + + if (sk->lock.users == 0 && sk->protinfo.af_inet.recverr) { + sk->err = err; + sk->error_report(sk); + } else { /* Only an error on timeout */ + sk->err_soft = err; + } + +out: + bh_unlock_sock(sk); + sock_put(sk); +#endif +} + +/* This routine computes an IPv4 TCP checksum. */ +void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len, + struct sk_buff *skb) +{ +#if 0 + if (skb->ip_summed == CHECKSUM_HW) { + th->check = ~tcp_v4_check(th, len, sk->saddr, sk->daddr, 0); + skb->csum = offsetof(struct tcphdr, check); + } else { + th->check = tcp_v4_check(th, len, sk->saddr, sk->daddr, + csum_partial((char *)th, th->doff<<2, skb->csum)); + } +#endif +} + +/* + * This routine will send an RST to the other tcp. + * + * Someone asks: why I NEVER use socket parameters (TOS, TTL etc.) + * for reset. + * Answer: if a packet caused RST, it is not for a socket + * existing in our system, if it is matched to a socket, + * it is just duplicate segment or bug in other side's TCP. + * So that we build reply only basing on parameters + * arrived with segment. + * Exception: precedence violation. We do not implement it in any case. + */ + +static void tcp_v4_send_reset(struct sk_buff *skb) +{ +#if 0 + struct tcphdr *th = skb->h.th; + struct tcphdr rth; + struct ip_reply_arg arg; + + /* Never send a reset in response to a reset. */ + if (th->rst) + return; + + if (((struct rtable*)skb->dst)->rt_type != RTN_LOCAL) + return; + + /* Swap the send and the receive. */ + memset(&rth, 0, sizeof(struct tcphdr)); + rth.dest = th->source; + rth.source = th->dest; + rth.doff = sizeof(struct tcphdr)/4; + rth.rst = 1; + + if (th->ack) { + rth.seq = th->ack_seq; + } else { + rth.ack = 1; + rth.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin + + skb->len - (th->doff<<2)); + } + + memset(&arg, 0, sizeof arg); + arg.iov[0].iov_base = (unsigned char *)&rth; + arg.iov[0].iov_len = sizeof rth; + arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr, + skb->nh.iph->saddr, /*XXX*/ + sizeof(struct tcphdr), + IPPROTO_TCP, + 0); + arg.n_iov = 1; + arg.csumoffset = offsetof(struct tcphdr, check) / 2; + + tcp_socket->sk->protinfo.af_inet.ttl = sysctl_ip_default_ttl; + ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth); + + TCP_INC_STATS_BH(TcpOutSegs); + TCP_INC_STATS_BH(TcpOutRsts); +#endif +} + +/* The code following below sending ACKs in SYN-RECV and TIME-WAIT states + outside socket context is ugly, certainly. What can I do? + */ + +static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, u32 win, u32 ts) +{ +#if 0 + struct tcphdr *th = skb->h.th; + struct { + struct tcphdr th; + u32 tsopt[3]; + } rep; + struct ip_reply_arg arg; + + memset(&rep.th, 0, sizeof(struct tcphdr)); + memset(&arg, 0, sizeof arg); + + arg.iov[0].iov_base = (unsigned char *)&rep; + arg.iov[0].iov_len = sizeof(rep.th); + arg.n_iov = 1; + if (ts) { + rep.tsopt[0] = htonl((TCPOPT_NOP << 24) | + (TCPOPT_NOP << 16) | + (TCPOPT_TIMESTAMP << 8) | + TCPOLEN_TIMESTAMP); + rep.tsopt[1] = htonl(tcp_time_stamp); + rep.tsopt[2] = htonl(ts); + arg.iov[0].iov_len = sizeof(rep); + } + + /* Swap the send and the receive. 
*/ + rep.th.dest = th->source; + rep.th.source = th->dest; + rep.th.doff = arg.iov[0].iov_len/4; + rep.th.seq = htonl(seq); + rep.th.ack_seq = htonl(ack); + rep.th.ack = 1; + rep.th.window = htons(win); + + arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr, + skb->nh.iph->saddr, /*XXX*/ + arg.iov[0].iov_len, + IPPROTO_TCP, + 0); + arg.csumoffset = offsetof(struct tcphdr, check) / 2; + + ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len); + + TCP_INC_STATS_BH(TcpOutSegs); +#endif +} + +static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb) +{ +#if 0 + struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk; + + tcp_v4_send_ack(skb, tw->snd_nxt, tw->rcv_nxt, + tw->rcv_wnd>>tw->rcv_wscale, tw->ts_recent); + + tcp_tw_put(tw); +#endif +} + +static void tcp_v4_or_send_ack(struct sk_buff *skb, struct open_request *req) +{ +#if 0 + tcp_v4_send_ack(skb, req->snt_isn+1, req->rcv_isn+1, req->rcv_wnd, + req->ts_recent); +#endif +} + +static struct dst_entry* tcp_v4_route_req(struct sock *sk, struct open_request *req) +{ +#if 0 + struct rtable *rt; + struct ip_options *opt; + + opt = req->af.v4_req.opt; + if(ip_route_output(&rt, ((opt && opt->srr) ? + opt->faddr : + req->af.v4_req.rmt_addr), + req->af.v4_req.loc_addr, + RT_CONN_FLAGS(sk), sk->bound_dev_if)) { + IP_INC_STATS_BH(IpOutNoRoutes); + return NULL; + } + if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) { + ip_rt_put(rt); + IP_INC_STATS_BH(IpOutNoRoutes); + return NULL; + } + return &rt->u.dst; +#else + return NULL; +#endif +} + +/* + * Send a SYN-ACK after having received an ACK. + * This still operates on a open_request only, not on a big + * socket. + */ +static int tcp_v4_send_synack(struct sock *sk, struct open_request *req, + struct dst_entry *dst) +{ +#if 0 + int err = -1; + struct sk_buff * skb; + + /* First, grab a route. */ + if (dst == NULL && + (dst = tcp_v4_route_req(sk, req)) == NULL) + goto out; + + skb = tcp_make_synack(sk, dst, req); + + if (skb) { + struct tcphdr *th = skb->h.th; + + th->check = tcp_v4_check(th, skb->len, + req->af.v4_req.loc_addr, req->af.v4_req.rmt_addr, + csum_partial((char *)th, skb->len, skb->csum)); + + err = ip_build_and_send_pkt(skb, sk, req->af.v4_req.loc_addr, + req->af.v4_req.rmt_addr, req->af.v4_req.opt); + if (err == NET_XMIT_CN) + err = 0; + } + +out: + dst_release(dst); + return err; +#else + return 0; +#endif +} + +/* + * IPv4 open_request destructor. + */ +static void tcp_v4_or_free(struct open_request *req) +{ +#if 0 + if (req->af.v4_req.opt) + kfree(req->af.v4_req.opt); +#endif +} + +static inline void syn_flood_warning(struct sk_buff *skb) +{ +#if 0 + static unsigned long warntime; + + if (jiffies - warntime > HZ*60) { + warntime = jiffies; + printk(KERN_INFO + "possible SYN flooding on port %d. Sending cookies.\n", + ntohs(skb->h.th->dest)); + } +#endif +} + +/* + * Save and compile IPv4 options into the open_request if needed. + */ +static inline struct ip_options * +tcp_v4_save_options(struct sock *sk, struct sk_buff *skb) +{ +#if 0 + struct ip_options *opt = &(IPCB(skb)->opt); + struct ip_options *dopt = NULL; + + if (opt && opt->optlen) { + int opt_size = optlength(opt); + dopt = kmalloc(opt_size, GFP_ATOMIC); + if (dopt) { + if (ip_options_echo(dopt, skb)) { + kfree(dopt); + dopt = NULL; + } + } + } + return dopt; +#else + return NULL; +#endif +} + +/* + * Maximum number of SYN_RECV sockets in queue per LISTEN socket. + * One SYN_RECV socket costs about 80bytes on a 32bit machine. 
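+ * (For rough scale: at the default backlog of 256 set below, that is about
+ * 256 * 80 = 20KB of pending open requests per listening socket.)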
+ * It would be better to replace it with a global counter for all sockets + * but then some measure against one socket starving all other sockets + * would be needed. + * + * It was 128 by default. Experiments with real servers show, that + * it is absolutely not enough even at 100conn/sec. 256 cures most + * of problems. This value is adjusted to 128 for very small machines + * (<=32Mb of memory) and to 1024 on normal or better ones (>=256Mb). + * Further increasing requires to change hash table size. + */ +int sysctl_max_syn_backlog = 256; + +#if 0 +struct or_calltable or_ipv4 = { + PF_INET, + tcp_v4_send_synack, + tcp_v4_or_send_ack, + tcp_v4_or_free, + tcp_v4_send_reset +}; +#endif + +int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) +{ +#if 0 + struct tcp_opt tp; + struct open_request *req; + __u32 saddr = skb->nh.iph->saddr; + __u32 daddr = skb->nh.iph->daddr; + __u32 isn = TCP_SKB_CB(skb)->when; + struct dst_entry *dst = NULL; +#ifdef CONFIG_SYN_COOKIES + int want_cookie = 0; +#else +#define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */ +#endif + + /* Never answer to SYNs send to broadcast or multicast */ + if (((struct rtable *)skb->dst)->rt_flags & + (RTCF_BROADCAST|RTCF_MULTICAST)) + goto drop; + + /* TW buckets are converted to open requests without + * limitations, they conserve resources and peer is + * evidently real one. + */ + if (tcp_synq_is_full(sk) && !isn) { +#ifdef CONFIG_SYN_COOKIES + if (sysctl_tcp_syncookies) { + want_cookie = 1; + } else +#endif + goto drop; + } + + /* Accept backlog is full. If we have already queued enough + * of warm entries in syn queue, drop request. It is better than + * clogging syn queue with openreqs with exponentially increasing + * timeout. + */ + if (tcp_acceptq_is_full(sk) && tcp_synq_young(sk) > 1) + goto drop; + + req = tcp_openreq_alloc(); + if (req == NULL) + goto drop; + + tcp_clear_options(&tp); + tp.mss_clamp = 536; + tp.user_mss = sk->tp_pinfo.af_tcp.user_mss; + + tcp_parse_options(skb, &tp, 0); + + if (want_cookie) { + tcp_clear_options(&tp); + tp.saw_tstamp = 0; + } + + if (tp.saw_tstamp && tp.rcv_tsval == 0) { + /* Some OSes (unknown ones, but I see them on web server, which + * contains information interesting only for windows' + * users) do not send their stamp in SYN. It is easy case. + * We simply do not advertise TS support. + */ + tp.saw_tstamp = 0; + tp.tstamp_ok = 0; + } + tp.tstamp_ok = tp.saw_tstamp; + + tcp_openreq_init(req, &tp, skb); + + req->af.v4_req.loc_addr = daddr; + req->af.v4_req.rmt_addr = saddr; + req->af.v4_req.opt = tcp_v4_save_options(sk, skb); + req->class = &or_ipv4; + if (!want_cookie) + TCP_ECN_create_request(req, skb->h.th); + + if (want_cookie) { +#ifdef CONFIG_SYN_COOKIES + syn_flood_warning(skb); +#endif + isn = cookie_v4_init_sequence(sk, skb, &req->mss); + } else if (isn == 0) { + struct inet_peer *peer = NULL; + + /* VJ's idea. We save last timestamp seen + * from the destination in peer table, when entering + * state TIME-WAIT, and check against it before + * accepting new connection request. + * + * If "isn" is not zero, this request hit alive + * timewait bucket, so that all the necessary checks + * are made in the function processing timewait state. 
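+ * (That function is tcp_timewait_state_process(); the checks below only
+ * need to cover SYNs that did not pass through a TIME-WAIT bucket.)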
+ */ + if (tp.saw_tstamp && + sysctl_tcp_tw_recycle && + (dst = tcp_v4_route_req(sk, req)) != NULL && + (peer = rt_get_peer((struct rtable*)dst)) != NULL && + peer->v4daddr == saddr) { + if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL && + (s32)(peer->tcp_ts - req->ts_recent) > TCP_PAWS_WINDOW) { + NET_INC_STATS_BH(PAWSPassiveRejected); + dst_release(dst); + goto drop_and_free; + } + } + /* Kill the following clause, if you dislike this way. */ + else if (!sysctl_tcp_syncookies && + (sysctl_max_syn_backlog - tcp_synq_len(sk) + < (sysctl_max_syn_backlog>>2)) && + (!peer || !peer->tcp_ts_stamp) && + (!dst || !dst->rtt)) { + /* Without syncookies last quarter of + * backlog is filled with destinations, proven to be alive. + * It means that we continue to communicate + * to destinations, already remembered + * to the moment of synflood. + */ + NETDEBUG(if (net_ratelimit()) \ + printk(KERN_DEBUG "TCP: drop open request from %u.%u.%u.%u/%u\n", \ + NIPQUAD(saddr), ntohs(skb->h.th->source))); + dst_release(dst); + goto drop_and_free; + } + + isn = tcp_v4_init_sequence(sk, skb); + } + req->snt_isn = isn; + + if (tcp_v4_send_synack(sk, req, dst)) + goto drop_and_free; + + if (want_cookie) { + tcp_openreq_free(req); + } else { + tcp_v4_synq_add(sk, req); + } + return 0; + +drop_and_free: + tcp_openreq_free(req); +drop: + TCP_INC_STATS_BH(TcpAttemptFails); + return 0; +#else + return 0; +#endif +} + + +/* + * The three way handshake has completed - we got a valid synack - + * now create the new socket. + */ +struct sock * tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, + struct open_request *req, + struct dst_entry *dst) +{ +#if 0 + struct tcp_opt *newtp; + struct sock *newsk; + + if (tcp_acceptq_is_full(sk)) + goto exit_overflow; + + if (dst == NULL && + (dst = tcp_v4_route_req(sk, req)) == NULL) + goto exit; + + newsk = tcp_create_openreq_child(sk, req, skb); + if (!newsk) + goto exit; + + newsk->dst_cache = dst; + newsk->route_caps = dst->dev->features; + + newtp = &(newsk->tp_pinfo.af_tcp); + newsk->daddr = req->af.v4_req.rmt_addr; + newsk->saddr = req->af.v4_req.loc_addr; + newsk->rcv_saddr = req->af.v4_req.loc_addr; + newsk->protinfo.af_inet.opt = req->af.v4_req.opt; + req->af.v4_req.opt = NULL; + newsk->protinfo.af_inet.mc_index = tcp_v4_iif(skb); + newsk->protinfo.af_inet.mc_ttl = skb->nh.iph->ttl; + newtp->ext_header_len = 0; + if (newsk->protinfo.af_inet.opt) + newtp->ext_header_len = newsk->protinfo.af_inet.opt->optlen; + newsk->protinfo.af_inet.id = newtp->write_seq^jiffies; + + tcp_sync_mss(newsk, dst->pmtu); + newtp->advmss = dst->advmss; + tcp_initialize_rcv_mss(newsk); + + __tcp_v4_hash(newsk, 0); + __tcp_inherit_port(sk, newsk); + + return newsk; + +exit_overflow: + NET_INC_STATS_BH(ListenOverflows); +exit: + NET_INC_STATS_BH(ListenDrops); + dst_release(dst); + return NULL; +#else + return NULL; +#endif +} + +static struct sock *tcp_v4_hnd_req(struct sock *sk,struct sk_buff *skb) +{ +#if 0 + struct open_request *req, **prev; + struct tcphdr *th = skb->h.th; + struct iphdr *iph = skb->nh.iph; + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + struct sock *nsk; + + /* Find possible connection requests. 
*/ + req = tcp_v4_search_req(tp, &prev, + th->source, + iph->saddr, iph->daddr); + if (req) + return tcp_check_req(sk, skb, req, prev); + + nsk = __tcp_v4_lookup_established(skb->nh.iph->saddr, + th->source, + skb->nh.iph->daddr, + ntohs(th->dest), + tcp_v4_iif(skb)); + + if (nsk) { + if (nsk->state != TCP_TIME_WAIT) { + bh_lock_sock(nsk); + return nsk; + } + tcp_tw_put((struct tcp_tw_bucket*)nsk); + return NULL; + } + +#ifdef CONFIG_SYN_COOKIES + if (!th->rst && !th->syn && th->ack) + sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt)); +#endif + return sk; +#else + return NULL; +#endif +} + +static int tcp_v4_checksum_init(struct sk_buff *skb) +{ +#if 0 + if (skb->ip_summed == CHECKSUM_HW) { + skb->ip_summed = CHECKSUM_UNNECESSARY; + if (!tcp_v4_check(skb->h.th,skb->len,skb->nh.iph->saddr, + skb->nh.iph->daddr,skb->csum)) + return 0; + + NETDEBUG(if (net_ratelimit()) printk(KERN_DEBUG "hw tcp v4 csum failed\n")); + skb->ip_summed = CHECKSUM_NONE; + } + if (skb->len <= 76) { + if (tcp_v4_check(skb->h.th,skb->len,skb->nh.iph->saddr, + skb->nh.iph->daddr, + skb_checksum(skb, 0, skb->len, 0))) + return -1; + skb->ip_summed = CHECKSUM_UNNECESSARY; + } else { + skb->csum = ~tcp_v4_check(skb->h.th,skb->len,skb->nh.iph->saddr, + skb->nh.iph->daddr,0); + } + return 0; +#else + return 0; +#endif +} + + +/* The socket must have it's spinlock held when we get + * here. + * + * We have a potential double-lock case here, so even when + * doing backlog processing we use the BH locking scheme. + * This is because we cannot sleep with the original spinlock + * held. + */ +int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) +{ +#if 0 +#ifdef CONFIG_FILTER + struct sk_filter *filter = sk->filter; + if (filter && sk_filter(skb, filter)) + goto discard; +#endif /* CONFIG_FILTER */ + + IP_INC_STATS_BH(IpInDelivers); + + if (sk->state == TCP_ESTABLISHED) { /* Fast path */ + TCP_CHECK_TIMER(sk); + if (tcp_rcv_established(sk, skb, skb->h.th, skb->len)) + goto reset; + TCP_CHECK_TIMER(sk); + return 0; + } + + if (skb->len < (skb->h.th->doff<<2) || tcp_checksum_complete(skb)) + goto csum_err; + + if (sk->state == TCP_LISTEN) { + struct sock *nsk = tcp_v4_hnd_req(sk, skb); + if (!nsk) + goto discard; + + if (nsk != sk) { + if (tcp_child_process(sk, nsk, skb)) + goto reset; + return 0; + } + } + + TCP_CHECK_TIMER(sk); + if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len)) + goto reset; + TCP_CHECK_TIMER(sk); + return 0; + +reset: + tcp_v4_send_reset(skb); +discard: + kfree_skb(skb); + /* Be careful here. If this function gets more complicated and + * gcc suffers from register pressure on the x86, sk (in %ebx) + * might be destroyed here. This current version compiles correctly, + * but you have been warned. + */ + return 0; + +csum_err: + TCP_INC_STATS_BH(TcpInErrs); + goto discard; +#else + return 0; +#endif +} + +/* + * From tcp_input.c + */ + +int tcp_v4_rcv(struct sk_buff *skb) +{ +#if 0 + struct tcphdr *th; + struct sock *sk; + int ret; + + if (skb->pkt_type!=PACKET_HOST) + goto discard_it; + + /* Count it even if it's bad */ + TCP_INC_STATS_BH(TcpInSegs); + + if (!pskb_may_pull(skb, sizeof(struct tcphdr))) + goto discard_it; + + th = skb->h.th; + + if (th->doff < sizeof(struct tcphdr)/4) + goto bad_packet; + if (!pskb_may_pull(skb, th->doff*4)) + goto discard_it; + + /* An explanation is required here, I think. + * Packet length and doff are validated by header prediction, + * provided case of th->doff==0 is elimineted. + * So, we defer the checks. 
*/ + if ((skb->ip_summed != CHECKSUM_UNNECESSARY && + tcp_v4_checksum_init(skb) < 0)) + goto bad_packet; + + th = skb->h.th; + TCP_SKB_CB(skb)->seq = ntohl(th->seq); + TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + + skb->len - th->doff*4); + TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); + TCP_SKB_CB(skb)->when = 0; + TCP_SKB_CB(skb)->flags = skb->nh.iph->tos; + TCP_SKB_CB(skb)->sacked = 0; + + sk = __tcp_v4_lookup(skb->nh.iph->saddr, th->source, + skb->nh.iph->daddr, ntohs(th->dest), tcp_v4_iif(skb)); + + if (!sk) + goto no_tcp_socket; + +process: + if(!ipsec_sk_policy(sk,skb)) + goto discard_and_relse; + + if (sk->state == TCP_TIME_WAIT) + goto do_time_wait; + + skb->dev = NULL; + + bh_lock_sock(sk); + ret = 0; + if (!sk->lock.users) { + if (!tcp_prequeue(sk, skb)) + ret = tcp_v4_do_rcv(sk, skb); + } else + sk_add_backlog(sk, skb); + bh_unlock_sock(sk); + + sock_put(sk); + + return ret; + +no_tcp_socket: + if (skb->len < (th->doff<<2) || tcp_checksum_complete(skb)) { +bad_packet: + TCP_INC_STATS_BH(TcpInErrs); + } else { + tcp_v4_send_reset(skb); + } + +discard_it: + /* Discard frame. */ + kfree_skb(skb); + return 0; + +discard_and_relse: + sock_put(sk); + goto discard_it; + +do_time_wait: + if (skb->len < (th->doff<<2) || tcp_checksum_complete(skb)) { + TCP_INC_STATS_BH(TcpInErrs); + goto discard_and_relse; + } + switch(tcp_timewait_state_process((struct tcp_tw_bucket *)sk, + skb, th, skb->len)) { + case TCP_TW_SYN: + { + struct sock *sk2; + + sk2 = tcp_v4_lookup_listener(skb->nh.iph->daddr, ntohs(th->dest), tcp_v4_iif(skb)); + if (sk2 != NULL) { + tcp_tw_deschedule((struct tcp_tw_bucket *)sk); + tcp_timewait_kill((struct tcp_tw_bucket *)sk); + tcp_tw_put((struct tcp_tw_bucket *)sk); + sk = sk2; + goto process; + } + /* Fall through to ACK */ + } + case TCP_TW_ACK: + tcp_v4_timewait_ack(sk, skb); + break; + case TCP_TW_RST: + goto no_tcp_socket; + case TCP_TW_SUCCESS:; + } + goto discard_it; +#endif +} + +/* With per-bucket locks this operation is not-atomic, so that + * this version is not worse. + */ +static void __tcp_v4_rehash(struct sock *sk) +{ +#if 0 + sk->prot->unhash(sk); + sk->prot->hash(sk); +#endif +} + +static int tcp_v4_reselect_saddr(struct sock *sk) +{ +#if 0 + int err; + struct rtable *rt; + __u32 old_saddr = sk->saddr; + __u32 new_saddr; + __u32 daddr = sk->daddr; + + if(sk->protinfo.af_inet.opt && sk->protinfo.af_inet.opt->srr) + daddr = sk->protinfo.af_inet.opt->faddr; + + /* Query new route. */ + err = ip_route_connect(&rt, daddr, 0, + RT_TOS(sk->protinfo.af_inet.tos)|sk->localroute, + sk->bound_dev_if); + if (err) + return err; + + __sk_dst_set(sk, &rt->u.dst); + sk->route_caps = rt->u.dst.dev->features; + + new_saddr = rt->rt_src; + + if (new_saddr == old_saddr) + return 0; + + if (sysctl_ip_dynaddr > 1) { + printk(KERN_INFO "tcp_v4_rebuild_header(): shifting sk->saddr " + "from %d.%d.%d.%d to %d.%d.%d.%d\n", + NIPQUAD(old_saddr), + NIPQUAD(new_saddr)); + } + + sk->saddr = new_saddr; + sk->rcv_saddr = new_saddr; + + /* XXX The only one ugly spot where we need to + * XXX really change the sockets identity after + * XXX it has entered the hashes. -DaveM + * + * Besides that, it does not check for connection + * uniqueness. Wait for troubles. + */ + __tcp_v4_rehash(sk); + return 0; +#else + return 0; +#endif +} + +int tcp_v4_rebuild_header(struct sock *sk) +{ +#if 0 + struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0); + u32 daddr; + int err; + + /* Route is OK, nothing to do. */ + if (rt != NULL) + return 0; + + /* Reroute. 
*/ + daddr = sk->daddr; + if(sk->protinfo.af_inet.opt && sk->protinfo.af_inet.opt->srr) + daddr = sk->protinfo.af_inet.opt->faddr; + + err = ip_route_output(&rt, daddr, sk->saddr, + RT_CONN_FLAGS(sk), sk->bound_dev_if); + if (!err) { + __sk_dst_set(sk, &rt->u.dst); + sk->route_caps = rt->u.dst.dev->features; + return 0; + } + + /* Routing failed... */ + sk->route_caps = 0; + + if (!sysctl_ip_dynaddr || + sk->state != TCP_SYN_SENT || + (sk->userlocks & SOCK_BINDADDR_LOCK) || + (err = tcp_v4_reselect_saddr(sk)) != 0) + sk->err_soft=-err; + + return err; +#else + return 0; +#endif +} + +static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr) +{ +#if 0 + struct sockaddr_in *sin = (struct sockaddr_in *) uaddr; + + sin->sin_family = AF_INET; + sin->sin_addr.s_addr = sk->daddr; + sin->sin_port = sk->dport; +#endif +} + +/* VJ's idea. Save last timestamp seen from this destination + * and hold it at least for normal timewait interval to use for duplicate + * segment detection in subsequent connections, before they enter synchronized + * state. + */ + +int tcp_v4_remember_stamp(struct sock *sk) +{ +#if 0 + struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; + struct rtable *rt = (struct rtable*)__sk_dst_get(sk); + struct inet_peer *peer = NULL; + int release_it = 0; + + if (rt == NULL || rt->rt_dst != sk->daddr) { + peer = inet_getpeer(sk->daddr, 1); + release_it = 1; + } else { + if (rt->peer == NULL) + rt_bind_peer(rt, 1); + peer = rt->peer; + } + + if (peer) { + if ((s32)(peer->tcp_ts - tp->ts_recent) <= 0 || + (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec && + peer->tcp_ts_stamp <= tp->ts_recent_stamp)) { + peer->tcp_ts_stamp = tp->ts_recent_stamp; + peer->tcp_ts = tp->ts_recent; + } + if (release_it) + inet_putpeer(peer); + return 1; + } + + return 0; +#else + return 0; +#endif +} + +int tcp_v4_tw_remember_stamp(struct tcp_tw_bucket *tw) +{ +#if 0 + struct inet_peer *peer = NULL; + + peer = inet_getpeer(tw->daddr, 1); + + if (peer) { + if ((s32)(peer->tcp_ts - tw->ts_recent) <= 0 || + (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec && + peer->tcp_ts_stamp <= tw->ts_recent_stamp)) { + peer->tcp_ts_stamp = tw->ts_recent_stamp; + peer->tcp_ts = tw->ts_recent; + } + inet_putpeer(peer); + return 1; + } + + return 0; +#else + return 0; +#endif +} + +#if 0 +struct tcp_func ipv4_specific = { + ip_queue_xmit, + tcp_v4_send_check, + tcp_v4_rebuild_header, + tcp_v4_conn_request, + tcp_v4_syn_recv_sock, + tcp_v4_remember_stamp, + sizeof(struct iphdr), + + ip_setsockopt, + ip_getsockopt, + v4_addr2sockaddr, + sizeof(struct sockaddr_in) +}; +#endif + +/* NOTE: A lot of things set to zero explicitly by call to + * sk_alloc() so need not be done here. + */ +static int tcp_v4_init_sock(struct sock *sk) +{ +#if 0 + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + + skb_queue_head_init(&tp->out_of_order_queue); + tcp_init_xmit_timers(sk); + tcp_prequeue_init(tp); + + tp->rto = TCP_TIMEOUT_INIT; + tp->mdev = TCP_TIMEOUT_INIT; + + /* So many TCP implementations out there (incorrectly) count the + * initial SYN frame in their delayed-ACK and congestion control + * algorithms that we must have the following bandaid to talk + * efficiently to them. -DaveM + */ + tp->snd_cwnd = 2; + + /* See draft-stevens-tcpca-spec-01 for discussion of the + * initialization of these values. 
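+ * In short, ssthresh starts out effectively infinite so slow start governs
+ * until the first loss, and the ~0 cwnd clamp below leaves the congestion
+ * window unclamped.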
+ */ + tp->snd_ssthresh = 0x7fffffff; /* Infinity */ + tp->snd_cwnd_clamp = ~0; + tp->mss_cache = 536; + + tp->reordering = sysctl_tcp_reordering; + + sk->state = TCP_CLOSE; + + sk->write_space = tcp_write_space; + sk->use_write_queue = 1; + + sk->tp_pinfo.af_tcp.af_specific = &ipv4_specific; + + sk->sndbuf = sysctl_tcp_wmem[1]; + sk->rcvbuf = sysctl_tcp_rmem[1]; + + atomic_inc(&tcp_sockets_allocated); + + return 0; +#else + return 0; +#endif +} + +static int tcp_v4_destroy_sock(struct sock *sk) +{ +#if 0 + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + + tcp_clear_xmit_timers(sk); + + /* Cleanup up the write buffer. */ + tcp_writequeue_purge(sk); + + /* Cleans up our, hopefully empty, out_of_order_queue. */ + __skb_queue_purge(&tp->out_of_order_queue); + + /* Clean prequeue, it must be empty really */ + __skb_queue_purge(&tp->ucopy.prequeue); + + /* Clean up a referenced TCP bind bucket. */ + if(sk->prev != NULL) + tcp_put_port(sk); + + /* If sendmsg cached page exists, toss it. */ + if (tp->sndmsg_page != NULL) + __free_page(tp->sndmsg_page); + + atomic_dec(&tcp_sockets_allocated); + + return 0; +#else + return 0; +#endif +} + +/* Proc filesystem TCP sock list dumping. */ +static void get_openreq(struct sock *sk, struct open_request *req, char *tmpbuf, int i, int uid) +{ +#if 0 + int ttd = req->expires - jiffies; + + sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X" + " %02X %08X:%08X %02X:%08X %08X %5d %8d %u %d %p", + i, + req->af.v4_req.loc_addr, + ntohs(sk->sport), + req->af.v4_req.rmt_addr, + ntohs(req->rmt_port), + TCP_SYN_RECV, + 0,0, /* could print option size, but that is af dependent. */ + 1, /* timers active (only the expire timer) */ + ttd, + req->retrans, + uid, + 0, /* non standard timer */ + 0, /* open_requests have no inode */ + atomic_read(&sk->refcnt), + req + ); +#endif +} + +static void get_tcp_sock(struct sock *sp, char *tmpbuf, int i) +{ +#if 0 + unsigned int dest, src; + __u16 destp, srcp; + int timer_active; + unsigned long timer_expires; + struct tcp_opt *tp = &sp->tp_pinfo.af_tcp; + + dest = sp->daddr; + src = sp->rcv_saddr; + destp = ntohs(sp->dport); + srcp = ntohs(sp->sport); + if (tp->pending == TCP_TIME_RETRANS) { + timer_active = 1; + timer_expires = tp->timeout; + } else if (tp->pending == TCP_TIME_PROBE0) { + timer_active = 4; + timer_expires = tp->timeout; + } else if (timer_pending(&sp->timer)) { + timer_active = 2; + timer_expires = sp->timer.expires; + } else { + timer_active = 0; + timer_expires = jiffies; + } + + sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X" + " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p %u %u %u %u %d", + i, src, srcp, dest, destp, sp->state, + tp->write_seq-tp->snd_una, tp->rcv_nxt-tp->copied_seq, + timer_active, timer_expires-jiffies, + tp->retransmits, + sock_i_uid(sp), + tp->probes_out, + sock_i_ino(sp), + atomic_read(&sp->refcnt), sp, + tp->rto, tp->ack.ato, (tp->ack.quick<<1)|tp->ack.pingpong, + tp->snd_cwnd, tp->snd_ssthresh>=0xFFFF?-1:tp->snd_ssthresh + ); +#endif +} + +static void get_timewait_sock(struct tcp_tw_bucket *tw, char *tmpbuf, int i) +{ +#if 0 + unsigned int dest, src; + __u16 destp, srcp; + int ttd = tw->ttd - jiffies; + + if (ttd < 0) + ttd = 0; + + dest = tw->daddr; + src = tw->rcv_saddr; + destp = ntohs(tw->dport); + srcp = ntohs(tw->sport); + + sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X" + " %02X %08X:%08X %02X:%08X %08X %5d %8d %d %d %p", + i, src, srcp, dest, destp, tw->substate, 0, 0, + 3, ttd, 0, 0, 0, 0, + atomic_read(&tw->refcnt), tw); +#endif +} + +#define TMPSZ 150 + +int tcp_get_info(char *buffer, char 
**start, off_t offset, int length) +{ +#if 0 + int len = 0, num = 0, i; + off_t begin, pos = 0; + char tmpbuf[TMPSZ+1]; + + if (offset < TMPSZ) + len += sprintf(buffer, "%-*s\n", TMPSZ-1, + " sl local_address rem_address st tx_queue " + "rx_queue tr tm->when retrnsmt uid timeout inode"); + + pos = TMPSZ; + + /* First, walk listening socket table. */ + tcp_listen_lock(); + for(i = 0; i < TCP_LHTABLE_SIZE; i++) { + struct sock *sk; + struct tcp_listen_opt *lopt; + int k; + + for (sk = tcp_listening_hash[i]; sk; sk = sk->next, num++) { + struct open_request *req; + int uid; + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + + if (!TCP_INET_FAMILY(sk->family)) + goto skip_listen; + + pos += TMPSZ; + if (pos >= offset) { + get_tcp_sock(sk, tmpbuf, num); + len += sprintf(buffer+len, "%-*s\n", TMPSZ-1, tmpbuf); + if (pos >= offset + length) { + tcp_listen_unlock(); + goto out_no_bh; + } + } + +skip_listen: + uid = sock_i_uid(sk); + read_lock_bh(&tp->syn_wait_lock); + lopt = tp->listen_opt; + if (lopt && lopt->qlen != 0) { + for (k=0; ksyn_table[k]; req; req = req->dl_next, num++) { + if (!TCP_INET_FAMILY(req->class->family)) + continue; + + pos += TMPSZ; + if (pos <= offset) + continue; + get_openreq(sk, req, tmpbuf, num, uid); + len += sprintf(buffer+len, "%-*s\n", TMPSZ-1, tmpbuf); + if (pos >= offset + length) { + read_unlock_bh(&tp->syn_wait_lock); + tcp_listen_unlock(); + goto out_no_bh; + } + } + } + } + read_unlock_bh(&tp->syn_wait_lock); + + /* Completed requests are in normal socket hash table */ + } + } + tcp_listen_unlock(); + + local_bh_disable(); + + /* Next, walk established hash chain. */ + for (i = 0; i < tcp_ehash_size; i++) { + struct tcp_ehash_bucket *head = &tcp_ehash[i]; + struct sock *sk; + struct tcp_tw_bucket *tw; + + read_lock(&head->lock); + for(sk = head->chain; sk; sk = sk->next, num++) { + if (!TCP_INET_FAMILY(sk->family)) + continue; + pos += TMPSZ; + if (pos <= offset) + continue; + get_tcp_sock(sk, tmpbuf, num); + len += sprintf(buffer+len, "%-*s\n", TMPSZ-1, tmpbuf); + if (pos >= offset + length) { + read_unlock(&head->lock); + goto out; + } + } + for (tw = (struct tcp_tw_bucket *)tcp_ehash[i+tcp_ehash_size].chain; + tw != NULL; + tw = (struct tcp_tw_bucket *)tw->next, num++) { + if (!TCP_INET_FAMILY(tw->family)) + continue; + pos += TMPSZ; + if (pos <= offset) + continue; + get_timewait_sock(tw, tmpbuf, num); + len += sprintf(buffer+len, "%-*s\n", TMPSZ-1, tmpbuf); + if (pos >= offset + length) { + read_unlock(&head->lock); + goto out; + } + } + read_unlock(&head->lock); + } + +out: + local_bh_enable(); +out_no_bh: + + begin = len - (pos - offset); + *start = buffer + begin; + len -= begin; + if (len > length) + len = length; + if (len < 0) + len = 0; + return len; +#endif +} + +struct proto tcp_prot = { + name: "TCP", + close: tcp_close, + connect: tcp_v4_connect, + disconnect: tcp_disconnect, + accept: tcp_accept, + ioctl: tcp_ioctl, + init: tcp_v4_init_sock, + destroy: tcp_v4_destroy_sock, + shutdown: tcp_shutdown, + setsockopt: tcp_setsockopt, + getsockopt: tcp_getsockopt, + sendmsg: tcp_sendmsg, + recvmsg: tcp_recvmsg, + backlog_rcv: tcp_v4_do_rcv, + hash: tcp_v4_hash, + unhash: tcp_unhash, + get_port: tcp_v4_get_port, +}; + + + +void tcp_v4_init(struct net_proto_family *ops) +{ +#if 0 + int err; + + tcp_inode.i_mode = S_IFSOCK; + tcp_inode.i_sock = 1; + tcp_inode.i_uid = 0; + tcp_inode.i_gid = 0; + init_waitqueue_head(&tcp_inode.i_wait); + init_waitqueue_head(&tcp_inode.u.socket_i.wait); + + tcp_socket->inode = &tcp_inode; + tcp_socket->state = 
SS_UNCONNECTED; + tcp_socket->type=SOCK_RAW; + + if ((err=ops->create(tcp_socket, IPPROTO_TCP))<0) + panic("Failed to create the TCP control socket.\n"); + tcp_socket->sk->allocation=GFP_ATOMIC; + tcp_socket->sk->protinfo.af_inet.ttl = MAXTTL; + + /* Unhash it so that IP input processing does not even + * see it, we do not wish this socket to see incoming + * packets. + */ + tcp_socket->sk->prot->unhash(tcp_socket->sk); +#endif +} diff --git a/reactos/drivers/net/tcpip/transport/tcp/tcp_output.c b/reactos/drivers/net/tcpip/transport/tcp/tcp_output.c new file mode 100755 index 00000000000..c20b019e1cd --- /dev/null +++ b/reactos/drivers/net/tcpip/transport/tcp/tcp_output.c @@ -0,0 +1,1549 @@ +/* + * COPYRIGHT: See COPYING in the top level directory + * PROJECT: ReactOS TCP/IP protocol driver + * FILE: transport/tcp/tcp_output.c + * PURPOSE: Transmission Control Protocol + * PROGRAMMERS: Casper S. Hornstrup (chorns@users.sourceforge.net) + * REVISIONS: + * CSH 15-01-2003 Imported from linux kernel 2.4.20 + */ + +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * Implementation of the Transmission Control Protocol(TCP). + * + * Version: $Id: tcp_output.c,v 1.1 2003/01/15 21:57:31 chorns Exp $ + * + * Authors: Ross Biro, + * Fred N. van Kempen, + * Mark Evans, + * Corey Minyard + * Florian La Roche, + * Charles Hedrick, + * Linus Torvalds, + * Alan Cox, + * Matthew Dillon, + * Arnt Gulbrandsen, + * Jorge Cwik, + */ + +/* + * Changes: Pedro Roque : Retransmit queue handled by TCP. + * : Fragmentation on mtu decrease + * : Segment collapse on retransmit + * : AF independence + * + * Linus Torvalds : send_delayed_ack + * David S. Miller : Charge memory using the right skb + * during syn/ack processing. + * David S. Miller : Output engine completely rewritten. + * Andrea Arcangeli: SYNACK carry ts_recent in tsecr. + * Cacophonix Gaul : draft-minshall-nagle-01 + * J Hadi Salim : ECN support + * + */ + +#if 0 +#include + +#include +#include +#else +#include "linux.h" +#include "tcpcore.h" +#endif + +/* People can turn this off for buggy TCP's found in printers etc. */ +int sysctl_tcp_retrans_collapse = 1; + +static __inline__ +void update_send_head(struct sock *sk, struct tcp_opt *tp, struct sk_buff *skb) +{ + tp->send_head = skb->next; + if (tp->send_head == (struct sk_buff *) &sk->write_queue) + tp->send_head = NULL; + tp->snd_nxt = TCP_SKB_CB(skb)->end_seq; + if (tp->packets_out++ == 0) + tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); +} + +/* SND.NXT, if window was not shrunk. + * If window has been shrunk, what should we make? It is not clear at all. + * Using SND.UNA we will fail to open window, SND.NXT is out of window. :-( + * Anything in between SND.UNA...SND.UNA+SND.WND also can be already + * invalid. OK, let's make this for now: + */ +static __inline__ __u32 tcp_acceptable_seq(struct sock *sk, struct tcp_opt *tp) +{ + if (!before(tp->snd_una+tp->snd_wnd, tp->snd_nxt)) + return tp->snd_nxt; + else + return tp->snd_una+tp->snd_wnd; +} + +/* Calculate mss to advertise in SYN segment. + * RFC1122, RFC1063, draft-ietf-tcpimpl-pmtud-01 state that: + * + * 1. It is independent of path mtu. + * 2. Ideally, it is maximal possible segment size i.e. 65535-40. + * 3. For IPv4 it is reasonable to calculate it from maximal MTU of + * attached devices, because some buggy hosts are confused by + * large MSS. + * 4. 
We do not make 3, we advertise MSS, calculated from first + * hop device mtu, but allow to raise it to ip_rt_min_advmss. + * This may be overriden via information stored in routing table. + * 5. Value 65535 for MSS is valid in IPv6 and means "as large as possible, + * probably even Jumbo". + */ +static __u16 tcp_advertise_mss(struct sock *sk) +{ +#if 0 + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + struct dst_entry *dst = __sk_dst_get(sk); + int mss = tp->advmss; + + if (dst && dst->advmss < mss) { + mss = dst->advmss; + tp->advmss = mss; + } + + return (__u16)mss; +#else + return 0; +#endif +} + +/* RFC2861. Reset CWND after idle period longer RTO to "restart window". + * This is the first part of cwnd validation mechanism. */ +static void tcp_cwnd_restart(struct tcp_opt *tp) +{ +#if 0 + s32 delta = tcp_time_stamp - tp->lsndtime; + u32 restart_cwnd = tcp_init_cwnd(tp); + u32 cwnd = tp->snd_cwnd; + + tp->snd_ssthresh = tcp_current_ssthresh(tp); + restart_cwnd = min(restart_cwnd, cwnd); + + while ((delta -= tp->rto) > 0 && cwnd > restart_cwnd) + cwnd >>= 1; + tp->snd_cwnd = max(cwnd, restart_cwnd); + tp->snd_cwnd_stamp = tcp_time_stamp; + tp->snd_cwnd_used = 0; +#endif +} + +static __inline__ void tcp_event_data_sent(struct tcp_opt *tp, struct sk_buff *skb) +{ +#if 0 + u32 now = tcp_time_stamp; + + if (!tp->packets_out && (s32)(now - tp->lsndtime) > tp->rto) + tcp_cwnd_restart(tp); + + tp->lsndtime = now; + + /* If it is a reply for ato after last received + * packet, enter pingpong mode. + */ + if ((u32)(now - tp->ack.lrcvtime) < tp->ack.ato) + tp->ack.pingpong = 1; +#endif +} + +static __inline__ void tcp_event_ack_sent(struct sock *sk) +{ +#if 0 + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + + tcp_dec_quickack_mode(tp); + tcp_clear_xmit_timer(sk, TCP_TIME_DACK); +#endif +} + +/* Chose a new window to advertise, update state in tcp_opt for the + * socket, and return result with RFC1323 scaling applied. The return + * value can be stuffed directly into th->window for an outgoing + * frame. + */ +static __inline__ u16 tcp_select_window(struct sock *sk) +{ +#if 0 + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + u32 cur_win = tcp_receive_window(tp); + u32 new_win = __tcp_select_window(sk); + + /* Never shrink the offered window */ + if(new_win < cur_win) { + /* Danger Will Robinson! + * Don't update rcv_wup/rcv_wnd here or else + * we will not be able to advertise a zero + * window in time. --DaveM + * + * Relax Will Robinson. + */ + new_win = cur_win; + } + tp->rcv_wnd = new_win; + tp->rcv_wup = tp->rcv_nxt; + + /* RFC1323 scaling applied */ + new_win >>= tp->rcv_wscale; + + /* If we advertise zero window, disable fast path. */ + if (new_win == 0) + tp->pred_flags = 0; + + return new_win; +#else + return 0; +#endif +} + + +/* This routine actually transmits TCP packets queued in by + * tcp_do_sendmsg(). This is used by both the initial + * transmission and possible later retransmissions. + * All SKB's seen here are completely headerless. It is our + * job to build the TCP header, and pass the packet down to + * IP so it can do the same plus pass the packet off to the + * device. + * + * We are working here with either a clone of the original + * SKB, or a fresh unique copy made by the retransmit engine. 
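+ * (For rough header sizing per the option logic below: a SYN advertising
+ * MSS, timestamps and window scaling comes to 20 + 4 + 12 + 4 = 40 bytes of
+ * TCP header, while an established-state segment carrying n SACK blocks
+ * adds 4 + 8*n option bytes.)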
+ */ +int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb) +{ +#if 0 + if(skb != NULL) { + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); + int tcp_header_size = tp->tcp_header_len; + struct tcphdr *th; + int sysctl_flags; + int err; + +#define SYSCTL_FLAG_TSTAMPS 0x1 +#define SYSCTL_FLAG_WSCALE 0x2 +#define SYSCTL_FLAG_SACK 0x4 + + sysctl_flags = 0; + if (tcb->flags & TCPCB_FLAG_SYN) { + tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS; + if(sysctl_tcp_timestamps) { + tcp_header_size += TCPOLEN_TSTAMP_ALIGNED; + sysctl_flags |= SYSCTL_FLAG_TSTAMPS; + } + if(sysctl_tcp_window_scaling) { + tcp_header_size += TCPOLEN_WSCALE_ALIGNED; + sysctl_flags |= SYSCTL_FLAG_WSCALE; + } + if(sysctl_tcp_sack) { + sysctl_flags |= SYSCTL_FLAG_SACK; + if(!(sysctl_flags & SYSCTL_FLAG_TSTAMPS)) + tcp_header_size += TCPOLEN_SACKPERM_ALIGNED; + } + } else if (tp->eff_sacks) { + /* A SACK is 2 pad bytes, a 2 byte header, plus + * 2 32-bit sequence numbers for each SACK block. + */ + tcp_header_size += (TCPOLEN_SACK_BASE_ALIGNED + + (tp->eff_sacks * TCPOLEN_SACK_PERBLOCK)); + } + th = (struct tcphdr *) skb_push(skb, tcp_header_size); + skb->h.th = th; + skb_set_owner_w(skb, sk); + + /* Build TCP header and checksum it. */ + th->source = sk->sport; + th->dest = sk->dport; + th->seq = htonl(tcb->seq); + th->ack_seq = htonl(tp->rcv_nxt); + *(((__u16 *)th) + 6) = htons(((tcp_header_size >> 2) << 12) | tcb->flags); + if (tcb->flags & TCPCB_FLAG_SYN) { + /* RFC1323: The window in SYN & SYN/ACK segments + * is never scaled. + */ + th->window = htons(tp->rcv_wnd); + } else { + th->window = htons(tcp_select_window(sk)); + } + th->check = 0; + th->urg_ptr = 0; + + if (tp->urg_mode && + between(tp->snd_up, tcb->seq+1, tcb->seq+0xFFFF)) { + th->urg_ptr = htons(tp->snd_up-tcb->seq); + th->urg = 1; + } + + if (tcb->flags & TCPCB_FLAG_SYN) { + tcp_syn_build_options((__u32 *)(th + 1), + tcp_advertise_mss(sk), + (sysctl_flags & SYSCTL_FLAG_TSTAMPS), + (sysctl_flags & SYSCTL_FLAG_SACK), + (sysctl_flags & SYSCTL_FLAG_WSCALE), + tp->rcv_wscale, + tcb->when, + tp->ts_recent); + } else { + tcp_build_and_update_options((__u32 *)(th + 1), + tp, tcb->when); + + TCP_ECN_send(sk, tp, skb, tcp_header_size); + } + tp->af_specific->send_check(sk, th, skb->len, skb); + + if (tcb->flags & TCPCB_FLAG_ACK) + tcp_event_ack_sent(sk); + + if (skb->len != tcp_header_size) + tcp_event_data_sent(tp, skb); + + TCP_INC_STATS(TcpOutSegs); + + err = tp->af_specific->queue_xmit(skb); + if (err <= 0) + return err; + + tcp_enter_cwr(tp); + + /* NET_XMIT_CN is special. It does not guarantee, + * that this packet is lost. It tells that device + * is about to start to drop packets or already + * drops some packets of the same priority and + * invokes us to send less aggressively. + */ + return err == NET_XMIT_CN ? 0 : err; + } + return -ENOBUFS; +#undef SYSCTL_FLAG_TSTAMPS +#undef SYSCTL_FLAG_WSCALE +#undef SYSCTL_FLAG_SACK +#else + return 0; +#endif +} + + +/* This is the main buffer sending routine. We queue the buffer + * and decide whether to queue or transmit now. + * + * NOTE: probe0 timer is not checked, do not forget tcp_push_pending_frames, + * otherwise socket can stall. + */ +void tcp_send_skb(struct sock *sk, struct sk_buff *skb, int force_queue, unsigned cur_mss) +{ +#if 0 + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + + /* Advance write_seq and place onto the write_queue. 
*/ + tp->write_seq = TCP_SKB_CB(skb)->end_seq; + __skb_queue_tail(&sk->write_queue, skb); + tcp_charge_skb(sk, skb); + + if (!force_queue && tp->send_head == NULL && tcp_snd_test(tp, skb, cur_mss, tp->nonagle)) { + /* Send it out now. */ + TCP_SKB_CB(skb)->when = tcp_time_stamp; + if (tcp_transmit_skb(sk, skb_clone(skb, sk->allocation)) == 0) { + tp->snd_nxt = TCP_SKB_CB(skb)->end_seq; + tcp_minshall_update(tp, cur_mss, skb); + if (tp->packets_out++ == 0) + tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); + return; + } + } + /* Queue it, remembering where we must start sending. */ + if (tp->send_head == NULL) + tp->send_head = skb; +#endif +} + +/* Send _single_ skb sitting at the send head. This function requires + * true push pending frames to setup probe timer etc. + */ +void tcp_push_one(struct sock *sk, unsigned cur_mss) +{ +#if 0 + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + struct sk_buff *skb = tp->send_head; + + if (tcp_snd_test(tp, skb, cur_mss, 1)) { + /* Send it out now. */ + TCP_SKB_CB(skb)->when = tcp_time_stamp; + if (tcp_transmit_skb(sk, skb_clone(skb, sk->allocation)) == 0) { + tp->send_head = NULL; + tp->snd_nxt = TCP_SKB_CB(skb)->end_seq; + if (tp->packets_out++ == 0) + tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); + return; + } + } +#endif +} + +/* Split fragmented skb to two parts at length len. */ + +static void skb_split(struct sk_buff *skb, struct sk_buff *skb1, u32 len) +{ +#if 0 + int i; + int pos = skb->len - skb->data_len; + + if (len < pos) { + /* Split line is inside header. */ + memcpy(skb_put(skb1, pos-len), skb->data + len, pos-len); + + /* And move data appendix as is. */ + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) + skb_shinfo(skb1)->frags[i] = skb_shinfo(skb)->frags[i]; + + skb_shinfo(skb1)->nr_frags = skb_shinfo(skb)->nr_frags; + skb_shinfo(skb)->nr_frags = 0; + + skb1->data_len = skb->data_len; + skb1->len += skb1->data_len; + skb->data_len = 0; + skb->len = len; + skb->tail = skb->data+len; + } else { + int k = 0; + int nfrags = skb_shinfo(skb)->nr_frags; + + /* Second chunk has no header, nothing to copy. */ + + skb_shinfo(skb)->nr_frags = 0; + skb1->len = skb1->data_len = skb->len - len; + skb->len = len; + skb->data_len = len - pos; + + for (i=0; ifrags[i].size; + if (pos + size > len) { + skb_shinfo(skb1)->frags[k] = skb_shinfo(skb)->frags[i]; + + if (pos < len) { + /* Split frag. + * We have to variants in this case: + * 1. Move all the frag to the second + * part, if it is possible. F.e. + * this approach is mandatory for TUX, + * where splitting is expensive. + * 2. Split is accurately. We make this. + */ + get_page(skb_shinfo(skb)->frags[i].page); + skb_shinfo(skb1)->frags[0].page_offset += (len-pos); + skb_shinfo(skb1)->frags[0].size -= (len-pos); + skb_shinfo(skb)->frags[i].size = len-pos; + skb_shinfo(skb)->nr_frags++; + } + k++; + } else { + skb_shinfo(skb)->nr_frags++; + } + pos += size; + } + skb_shinfo(skb1)->nr_frags = k; + } +#endif +} + +/* Function to create two new TCP segments. Shrinks the given segment + * to the specified size and appends a new segment with the rest of the + * packet to the list. This won't be called frequently, I hope. + * Remember, these are still headerless SKBs at this point. 
+ */ +static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len) +{ +#if 0 + struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; + struct sk_buff *buff; + int nsize = skb->len - len; + u16 flags; + + if (skb_cloned(skb) && + skb_is_nonlinear(skb) && + pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) + return -ENOMEM; + + /* Get a new skb... force flag on. */ + buff = tcp_alloc_skb(sk, nsize, GFP_ATOMIC); + if (buff == NULL) + return -ENOMEM; /* We'll just try again later. */ + tcp_charge_skb(sk, buff); + + /* Correct the sequence numbers. */ + TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len; + TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq; + TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq; + + /* PSH and FIN should only be set in the second packet. */ + flags = TCP_SKB_CB(skb)->flags; + TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN|TCPCB_FLAG_PSH); + TCP_SKB_CB(buff)->flags = flags; + TCP_SKB_CB(buff)->sacked = TCP_SKB_CB(skb)->sacked&(TCPCB_LOST|TCPCB_EVER_RETRANS|TCPCB_AT_TAIL); + if (TCP_SKB_CB(buff)->sacked&TCPCB_LOST) { + tp->lost_out++; + tp->left_out++; + } + TCP_SKB_CB(skb)->sacked &= ~TCPCB_AT_TAIL; + + if (!skb_shinfo(skb)->nr_frags && skb->ip_summed != CHECKSUM_HW) { + /* Copy and checksum data tail into the new buffer. */ + buff->csum = csum_partial_copy_nocheck(skb->data + len, skb_put(buff, nsize), + nsize, 0); + + skb_trim(skb, len); + + skb->csum = csum_block_sub(skb->csum, buff->csum, len); + } else { + skb->ip_summed = CHECKSUM_HW; + skb_split(skb, buff, len); + } + + buff->ip_summed = skb->ip_summed; + + /* Looks stupid, but our code really uses when of + * skbs, which it never sent before. --ANK + */ + TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when; + + /* Link BUFF into the send queue. */ + __skb_append(skb, buff); + + return 0; +#else + return 0; +#endif +} + +/* This function synchronize snd mss to current pmtu/exthdr set. + + tp->user_mss is mss set by user by TCP_MAXSEG. It does NOT counts + for TCP options, but includes only bare TCP header. + + tp->mss_clamp is mss negotiated at connection setup. + It is minumum of user_mss and mss received with SYN. + It also does not include TCP options. + + tp->pmtu_cookie is last pmtu, seen by this function. + + tp->mss_cache is current effective sending mss, including + all tcp options except for SACKs. It is evaluated, + taking into account current pmtu, but never exceeds + tp->mss_clamp. + + NOTE1. rfc1122 clearly states that advertised MSS + DOES NOT include either tcp or ip options. + + NOTE2. tp->pmtu_cookie and tp->mss_cache are READ ONLY outside + this function. 
--ANK (980731) + */ + +int tcp_sync_mss(struct sock *sk, u32 pmtu) +{ +#if 0 + struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; + int mss_now; + + /* Calculate base mss without TCP options: + It is MMS_S - sizeof(tcphdr) of rfc1122 + */ + + mss_now = pmtu - tp->af_specific->net_header_len - sizeof(struct tcphdr); + + /* Clamp it (mss_clamp does not include tcp options) */ + if (mss_now > tp->mss_clamp) + mss_now = tp->mss_clamp; + + /* Now subtract optional transport overhead */ + mss_now -= tp->ext_header_len; + + /* Then reserve room for full set of TCP options and 8 bytes of data */ + if (mss_now < 48) + mss_now = 48; + + /* Now subtract TCP options size, not including SACKs */ + mss_now -= tp->tcp_header_len - sizeof(struct tcphdr); + + /* Bound mss with half of window */ + if (tp->max_window && mss_now > (tp->max_window>>1)) + mss_now = max((tp->max_window>>1), 68U - tp->tcp_header_len); + + /* And store cached results */ + tp->pmtu_cookie = pmtu; + tp->mss_cache = mss_now; + return mss_now; +#else + return 0; +#endif +} + + +/* This routine writes packets to the network. It advances the + * send_head. This happens as incoming acks open up the remote + * window for us. + * + * Returns 1, if no segments are in flight and we have queued segments, but + * cannot send anything now because of SWS or another problem. + */ +int tcp_write_xmit(struct sock *sk, int nonagle) +{ +#if 0 + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + unsigned int mss_now; + + /* If we are closed, the bytes will have to remain here. + * In time closedown will finish, we empty the write queue and all + * will be happy. + */ + if(sk->state != TCP_CLOSE) { + struct sk_buff *skb; + int sent_pkts = 0; + + /* Account for SACKS, we may need to fragment due to this. + * It is just like the real MSS changing on us midstream. + * We also handle things correctly when the user adds some + * IP options mid-stream. Silly to do, but cover it. + */ + mss_now = tcp_current_mss(sk); + + while((skb = tp->send_head) && + tcp_snd_test(tp, skb, mss_now, tcp_skb_is_last(sk, skb) ? nonagle : 1)) { + if (skb->len > mss_now) { + if (tcp_fragment(sk, skb, mss_now)) + break; + } + + TCP_SKB_CB(skb)->when = tcp_time_stamp; + if (tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC))) + break; + /* Advance the send_head. This one is sent out. */ + update_send_head(sk, tp, skb); + tcp_minshall_update(tp, mss_now, skb); + sent_pkts = 1; + } + + if (sent_pkts) { + tcp_cwnd_validate(sk, tp); + return 0; + } + + return !tp->packets_out && tp->send_head; + } + return 0; +#else + return 0; +#endif +} + +/* This function returns the amount that we can raise the + * usable window based on the following constraints + * + * 1. The window can never be shrunk once it is offered (RFC 793) + * 2. We limit memory per socket + * + * RFC 1122: + * "the suggested [SWS] avoidance algorithm for the receiver is to keep + * RECV.NEXT + RCV.WIN fixed until: + * RCV.BUFF - RCV.USER - RCV.WINDOW >= min(1/2 RCV.BUFF, MSS)" + * + * i.e. don't raise the right edge of the window until you can raise + * it at least MSS bytes. + * + * Unfortunately, the recommended algorithm breaks header prediction, + * since header prediction assumes th->window stays fixed. + * + * Strictly speaking, keeping th->window fixed violates the receiver + * side SWS prevention criteria. The problem is that under this rule + * a stream of single byte packets will cause the right side of the + * window to always advance by a single byte. 
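+ * (For concreteness, the RFC 1122 rule quoted above, taking RCV.BUFF = 8760
+ * and MSS = 1460 as an example, would instead hold the right edge until
+ * RCV.BUFF - RCV.USER - RCV.WINDOW >= min(4380, 1460) = 1460, i.e. move it
+ * a full MSS at a time.)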
+ * + * Of course, if the sender implements sender side SWS prevention + * then this will not be a problem. + * + * BSD seems to make the following compromise: + * + * If the free space is less than the 1/4 of the maximum + * space available and the free space is less than 1/2 mss, + * then set the window to 0. + * [ Actually, bsd uses MSS and 1/4 of maximal _window_ ] + * Otherwise, just prevent the window from shrinking + * and from being larger than the largest representable value. + * + * This prevents incremental opening of the window in the regime + * where TCP is limited by the speed of the reader side taking + * data out of the TCP receive queue. It does nothing about + * those cases where the window is constrained on the sender side + * because the pipeline is full. + * + * BSD also seems to "accidentally" limit itself to windows that are a + * multiple of MSS, at least until the free space gets quite small. + * This would appear to be a side effect of the mbuf implementation. + * Combining these two algorithms results in the observed behavior + * of having a fixed window size at almost all times. + * + * Below we obtain similar behavior by forcing the offered window to + * a multiple of the mss when it is feasible to do so. + * + * Note, we don't "adjust" for TIMESTAMP or SACK option bytes. + * Regular options like TIMESTAMP are taken into account. + */ +u32 __tcp_select_window(struct sock *sk) +{ +#if 0 + struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; + /* MSS for the peer's data. Previous verions used mss_clamp + * here. I don't know if the value based on our guesses + * of peer's MSS is better for the performance. It's more correct + * but may be worse for the performance because of rcv_mss + * fluctuations. --SAW 1998/11/1 + */ + int mss = tp->ack.rcv_mss; + int free_space = tcp_space(sk); + int full_space = min_t(int, tp->window_clamp, tcp_full_space(sk)); + int window; + + if (mss > full_space) + mss = full_space; + + if (free_space < full_space/2) { + tp->ack.quick = 0; + + if (tcp_memory_pressure) + tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U*tp->advmss); + + if (free_space < mss) + return 0; + } + + if (free_space > tp->rcv_ssthresh) + free_space = tp->rcv_ssthresh; + + /* Get the largest window that is a nice multiple of mss. + * Window clamp already applied above. + * If our current window offering is within 1 mss of the + * free space we just keep it. This prevents the divide + * and multiply from happening most of the time. + * We also don't do any window rounding when the free space + * is too small. + */ + window = tp->rcv_wnd; + if (window <= free_space - mss || window > free_space) + window = (free_space/mss)*mss; + + return window; +#else + return 0; +#endif +} + +/* Attempt to collapse two adjacent SKB's during retransmission. */ +static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int mss_now) +{ +#if 0 + struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; + struct sk_buff *next_skb = skb->next; + + /* The first test we must make is that neither of these two + * SKB's are still referenced by someone else. + */ + if(!skb_cloned(skb) && !skb_cloned(next_skb)) { + int skb_size = skb->len, next_skb_size = next_skb->len; + u16 flags = TCP_SKB_CB(skb)->flags; + + /* Also punt if next skb has been SACK'd. */ + if(TCP_SKB_CB(next_skb)->sacked & TCPCB_SACKED_ACKED) + return; + + /* Next skb is out of window. 
*/ + if (after(TCP_SKB_CB(next_skb)->end_seq, tp->snd_una+tp->snd_wnd)) + return; + + /* Punt if not enough space exists in the first SKB for + * the data in the second, or the total combined payload + * would exceed the MSS. + */ + if ((next_skb_size > skb_tailroom(skb)) || + ((skb_size + next_skb_size) > mss_now)) + return; + + /* Ok. We will be able to collapse the packet. */ + __skb_unlink(next_skb, next_skb->list); + + if (next_skb->ip_summed == CHECKSUM_HW) + skb->ip_summed = CHECKSUM_HW; + + if (skb->ip_summed != CHECKSUM_HW) { + memcpy(skb_put(skb, next_skb_size), next_skb->data, next_skb_size); + skb->csum = csum_block_add(skb->csum, next_skb->csum, skb_size); + } + + /* Update sequence range on original skb. */ + TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq; + + /* Merge over control information. */ + flags |= TCP_SKB_CB(next_skb)->flags; /* This moves PSH/FIN etc. over */ + TCP_SKB_CB(skb)->flags = flags; + + /* All done, get rid of second SKB and account for it so + * packet counting does not break. + */ + TCP_SKB_CB(skb)->sacked |= TCP_SKB_CB(next_skb)->sacked&(TCPCB_EVER_RETRANS|TCPCB_AT_TAIL); + if (TCP_SKB_CB(next_skb)->sacked&TCPCB_SACKED_RETRANS) + tp->retrans_out--; + if (TCP_SKB_CB(next_skb)->sacked&TCPCB_LOST) { + tp->lost_out--; + tp->left_out--; + } + /* Reno case is special. Sigh... */ + if (!tp->sack_ok && tp->sacked_out) { + tp->sacked_out--; + tp->left_out--; + } + + /* Not quite right: it can be > snd.fack, but + * it is better to underestimate fackets. + */ + if (tp->fackets_out) + tp->fackets_out--; + tcp_free_skb(sk, next_skb); + tp->packets_out--; + } +#endif +} + +/* Do a simple retransmit without using the backoff mechanisms in + * tcp_timer. This is used for path mtu discovery. + * The socket is already locked here. + */ +void tcp_simple_retransmit(struct sock *sk) +{ +#if 0 + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + struct sk_buff *skb; + unsigned int mss = tcp_current_mss(sk); + int lost = 0; + + for_retrans_queue(skb, sk, tp) { + if (skb->len > mss && + !(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED)) { + if (TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS) { + TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; + tp->retrans_out--; + } + if (!(TCP_SKB_CB(skb)->sacked&TCPCB_LOST)) { + TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; + tp->lost_out++; + lost = 1; + } + } + } + + if (!lost) + return; + + tcp_sync_left_out(tp); + + /* Don't muck with the congestion window here. + * Reason is that we do not increase amount of _data_ + * in network, but units changed and effective + * cwnd/ssthresh really reduced now. + */ + if (tp->ca_state != TCP_CA_Loss) { + tp->high_seq = tp->snd_nxt; + tp->snd_ssthresh = tcp_current_ssthresh(tp); + tp->prior_ssthresh = 0; + tp->undo_marker = 0; + tp->ca_state = TCP_CA_Loss; + } + tcp_xmit_retransmit_queue(sk); +#endif +} + +/* This retransmits one SKB. Policy decisions and retransmit queue + * state updates are done by the caller. Returns non-zero if an + * error occurred which prevented the send. + */ +int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) +{ +#if 0 + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + unsigned int cur_mss = tcp_current_mss(sk); + int err; + + /* Do not sent more than we queued. 1/4 is reserved for possible + * copying overhead: frgagmentation, tunneling, mangling etc. + */ + if (atomic_read(&sk->wmem_alloc) > min(sk->wmem_queued+(sk->wmem_queued>>2),sk->sndbuf)) + return -EAGAIN; + + /* If receiver has shrunk his window, and skb is out of + * new window, do not retransmit it. 
The exception is the + * case, when window is shrunk to zero. In this case + * our retransmit serves as a zero window probe. + */ + if (!before(TCP_SKB_CB(skb)->seq, tp->snd_una+tp->snd_wnd) + && TCP_SKB_CB(skb)->seq != tp->snd_una) + return -EAGAIN; + + if(skb->len > cur_mss) { + if(tcp_fragment(sk, skb, cur_mss)) + return -ENOMEM; /* We'll try again later. */ + + /* New SKB created, account for it. */ + tp->packets_out++; + } + + /* Collapse two adjacent packets if worthwhile and we can. */ + if(!(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_SYN) && + (skb->len < (cur_mss >> 1)) && + (skb->next != tp->send_head) && + (skb->next != (struct sk_buff *)&sk->write_queue) && + (skb_shinfo(skb)->nr_frags == 0 && skb_shinfo(skb->next)->nr_frags == 0) && + (sysctl_tcp_retrans_collapse != 0)) + tcp_retrans_try_collapse(sk, skb, cur_mss); + + if(tp->af_specific->rebuild_header(sk)) + return -EHOSTUNREACH; /* Routing failure or similar. */ + + /* Some Solaris stacks overoptimize and ignore the FIN on a + * retransmit when old data is attached. So strip it off + * since it is cheap to do so and saves bytes on the network. + */ + if(skb->len > 0 && + (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) && + tp->snd_una == (TCP_SKB_CB(skb)->end_seq - 1)) { + if (!pskb_trim(skb, 0)) { + TCP_SKB_CB(skb)->seq = TCP_SKB_CB(skb)->end_seq - 1; + skb->ip_summed = CHECKSUM_NONE; + skb->csum = 0; + } + } + + /* Make a copy, if the first transmission SKB clone we made + * is still in somebody's hands, else make a clone. + */ + TCP_SKB_CB(skb)->when = tcp_time_stamp; + + err = tcp_transmit_skb(sk, (skb_cloned(skb) ? + pskb_copy(skb, GFP_ATOMIC): + skb_clone(skb, GFP_ATOMIC))); + + if (err == 0) { + /* Update global TCP statistics. */ + TCP_INC_STATS(TcpRetransSegs); + +#if FASTRETRANS_DEBUG > 0 + if (TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS) { + if (net_ratelimit()) + printk(KERN_DEBUG "retrans_out leaked.\n"); + } +#endif + TCP_SKB_CB(skb)->sacked |= TCPCB_RETRANS; + tp->retrans_out++; + + /* Save stamp of the first retransmit. */ + if (!tp->retrans_stamp) + tp->retrans_stamp = TCP_SKB_CB(skb)->when; + + tp->undo_retrans++; + + /* snd_nxt is stored to detect loss of retransmitted segment, + * see tcp_input.c tcp_sacktag_write_queue(). + */ + TCP_SKB_CB(skb)->ack_seq = tp->snd_nxt; + } + return err; +#else + return 0; +#endif +} + +/* This gets called after a retransmit timeout, and the initially + * retransmitted data is acknowledged. It tries to continue + * resending the rest of the retransmit queue, until either + * we've sent it all or the congestion window limit is reached. + * If doing SACK, the first ACK which comes back for a timeout + * based retransmit packet might feed us FACK information again. + * If so, we use it to avoid unnecessarily retransmissions. + */ +void tcp_xmit_retransmit_queue(struct sock *sk) +{ +#if 0 + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + struct sk_buff *skb; + int packet_cnt = tp->lost_out; + + /* First pass: retransmit lost packets. 
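/*
 * Illustrative sketch, not part of the imported code: the window and
 * retransmission tests above (before(), after()) compare 32-bit sequence
 * numbers in a way that survives wrap-around, which is commonly done with a
 * signed difference.  seq_before()/seq_after() below are simplified
 * stand-ins for those macros.
 */
#include <stdint.h>
#include <stdio.h>

static int seq_before(uint32_t a, uint32_t b)
{
    /* True when a precedes b in sequence space, even across wrap-around. */
    return (int32_t)(a - b) < 0;
}

static int seq_after(uint32_t a, uint32_t b)
{
    return seq_before(b, a);
}

int main(void)
{
    /* 0x00000010 logically follows 0xfffffff0 despite the numeric wrap. */
    printf("%d %d\n", seq_after(0x00000010u, 0xfffffff0u),
                      seq_before(0xfffffff0u, 0x00000010u));   /* 1 1 */
    return 0;
}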
*/ + if (packet_cnt) { + for_retrans_queue(skb, sk, tp) { + __u8 sacked = TCP_SKB_CB(skb)->sacked; + + if (tcp_packets_in_flight(tp) >= tp->snd_cwnd) + return; + + if (sacked&TCPCB_LOST) { + if (!(sacked&(TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))) { + if (tcp_retransmit_skb(sk, skb)) + return; + if (tp->ca_state != TCP_CA_Loss) + NET_INC_STATS_BH(TCPFastRetrans); + else + NET_INC_STATS_BH(TCPSlowStartRetrans); + + if (skb == skb_peek(&sk->write_queue)) + tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); + } + + if (--packet_cnt <= 0) + break; + } + } + } + + /* OK, demanded retransmission is finished. */ + + /* Forward retransmissions are possible only during Recovery. */ + if (tp->ca_state != TCP_CA_Recovery) + return; + + /* No forward retransmissions in Reno are possible. */ + if (!tp->sack_ok) + return; + + /* Yeah, we have to make difficult choice between forward transmission + * and retransmission... Both ways have their merits... + * + * For now we do not retrnamsit anything, while we have some new + * segments to send. + */ + + if (tcp_may_send_now(sk, tp)) + return; + + packet_cnt = 0; + + for_retrans_queue(skb, sk, tp) { + if(++packet_cnt > tp->fackets_out) + break; + + if (tcp_packets_in_flight(tp) >= tp->snd_cwnd) + break; + + if(TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS) + continue; + + /* Ok, retransmit it. */ + if(tcp_retransmit_skb(sk, skb)) + break; + + if (skb == skb_peek(&sk->write_queue)) + tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); + + NET_INC_STATS_BH(TCPForwardRetrans); + } +#endif +} + + +/* Send a fin. The caller locks the socket for us. This cannot be + * allowed to fail queueing a FIN frame under any circumstances. + */ +void tcp_send_fin(struct sock *sk) +{ +#if 0 + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + struct sk_buff *skb = skb_peek_tail(&sk->write_queue); + unsigned int mss_now; + + /* Optimization, tack on the FIN if we have a queue of + * unsent frames. But be careful about outgoing SACKS + * and IP options. + */ + mss_now = tcp_current_mss(sk); + + if(tp->send_head != NULL) { + TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_FIN; + TCP_SKB_CB(skb)->end_seq++; + tp->write_seq++; + } else { + /* Socket is locked, keep trying until memory is available. */ + for (;;) { + skb = alloc_skb(MAX_TCP_HEADER, GFP_KERNEL); + if (skb) + break; + yield(); + } + + /* Reserve space for headers and prepare control bits. */ + skb_reserve(skb, MAX_TCP_HEADER); + skb->csum = 0; + TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_FIN); + TCP_SKB_CB(skb)->sacked = 0; + + /* FIN eats a sequence byte, write_seq advanced by tcp_send_skb(). */ + TCP_SKB_CB(skb)->seq = tp->write_seq; + TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1; + tcp_send_skb(sk, skb, 1, mss_now); + } + __tcp_push_pending_frames(sk, tp, mss_now, 1); +#endif +} + +/* We get here when a process closes a file descriptor (either due to + * an explicit close() or as a byproduct of exit()'ing) and there + * was unread data in the receive queue. This behavior is recommended + * by draft-ietf-tcpimpl-prob-03.txt section 3.10. -DaveM + */ +void tcp_send_active_reset(struct sock *sk, int priority) +{ +#if 0 + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + struct sk_buff *skb; + + /* NOTE: No TCP options attached and we never retransmit this. */ + skb = alloc_skb(MAX_TCP_HEADER, priority); + if (!skb) { + NET_INC_STATS(TCPAbortFailed); + return; + } + + /* Reserve space for headers and prepare control bits. 
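/*
 * Illustrative sketch, not part of the imported code: tcp_send_fin() above
 * either piggybacks the FIN on the last queued-but-unsent segment (the FIN
 * consumes one sequence number) or, when the queue is empty, allocates a
 * zero-payload segment for it.  The toy structure and queue_fin_sketch()
 * below are invented for illustration.
 */
#include <stdio.h>

struct seg_sketch {
    unsigned int seq, end_seq;   /* sequence range covered by the segment */
    int fin;                     /* FIN flag */
};

/* Returns 1 if the FIN was piggybacked, 0 if a fresh segment is needed. */
static int queue_fin_sketch(struct seg_sketch *tail_unsent, unsigned int *write_seq)
{
    if (tail_unsent) {
        tail_unsent->fin = 1;
        tail_unsent->end_seq++;   /* FIN takes one sequence number */
        (*write_seq)++;
        return 1;
    }
    return 0;
}

int main(void)
{
    struct seg_sketch tail = { 1000, 1460, 0 };
    unsigned int write_seq = 1460;
    printf("piggybacked=%d end_seq=%u write_seq=%u\n",
           queue_fin_sketch(&tail, &write_seq), tail.end_seq, write_seq);
    return 0;
}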
*/ + skb_reserve(skb, MAX_TCP_HEADER); + skb->csum = 0; + TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_RST); + TCP_SKB_CB(skb)->sacked = 0; + + /* Send it off. */ + TCP_SKB_CB(skb)->seq = tcp_acceptable_seq(sk, tp); + TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq; + TCP_SKB_CB(skb)->when = tcp_time_stamp; + if (tcp_transmit_skb(sk, skb)) + NET_INC_STATS(TCPAbortFailed); +#endif +} + +/* WARNING: This routine must only be called when we have already sent + * a SYN packet that crossed the incoming SYN that caused this routine + * to get called. If this assumption fails then the initial rcv_wnd + * and rcv_wscale values will not be correct. + */ +int tcp_send_synack(struct sock *sk) +{ +#if 0 + struct sk_buff* skb; + + skb = skb_peek(&sk->write_queue); + if (skb == NULL || !(TCP_SKB_CB(skb)->flags&TCPCB_FLAG_SYN)) { + printk(KERN_DEBUG "tcp_send_synack: wrong queue state\n"); + return -EFAULT; + } + if (!(TCP_SKB_CB(skb)->flags&TCPCB_FLAG_ACK)) { + if (skb_cloned(skb)) { + struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC); + if (nskb == NULL) + return -ENOMEM; + __skb_unlink(skb, &sk->write_queue); + __skb_queue_head(&sk->write_queue, nskb); + tcp_free_skb(sk, skb); + tcp_charge_skb(sk, nskb); + skb = nskb; + } + + TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_ACK; + TCP_ECN_send_synack(&sk->tp_pinfo.af_tcp, skb); + } + TCP_SKB_CB(skb)->when = tcp_time_stamp; + return tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)); +#else + return 0; +#endif +} + +/* + * Prepare a SYN-ACK. + */ +struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst, + struct open_request *req) +{ +#if 0 + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + struct tcphdr *th; + int tcp_header_size; + struct sk_buff *skb; + + skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15, 1, GFP_ATOMIC); + if (skb == NULL) + return NULL; + + /* Reserve space for headers. */ + skb_reserve(skb, MAX_TCP_HEADER); + + skb->dst = dst_clone(dst); + + tcp_header_size = (sizeof(struct tcphdr) + TCPOLEN_MSS + + (req->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0) + + (req->wscale_ok ? TCPOLEN_WSCALE_ALIGNED : 0) + + /* SACK_PERM is in the place of NOP NOP of TS */ + ((req->sack_ok && !req->tstamp_ok) ? TCPOLEN_SACKPERM_ALIGNED : 0)); + skb->h.th = th = (struct tcphdr *) skb_push(skb, tcp_header_size); + + memset(th, 0, sizeof(struct tcphdr)); + th->syn = 1; + th->ack = 1; + TCP_ECN_make_synack(req, th); + th->source = sk->sport; + th->dest = req->rmt_port; + TCP_SKB_CB(skb)->seq = req->snt_isn; + TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1; + th->seq = htonl(TCP_SKB_CB(skb)->seq); + th->ack_seq = htonl(req->rcv_isn + 1); + if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */ + __u8 rcv_wscale; + /* Set this up on the first call only */ + req->window_clamp = tp->window_clamp ? : dst->window; + /* tcp_full_space because it is guaranteed to be the first packet */ + tcp_select_initial_window(tcp_full_space(sk), + dst->advmss - (req->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0), + &req->rcv_wnd, + &req->window_clamp, + req->wscale_ok, + &rcv_wscale); + req->rcv_wscale = rcv_wscale; + } + + /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. 
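/*
 * Illustrative sketch, not part of the imported code: tcp_make_synack() above
 * sizes the TCP header from the negotiated options (MSS always; timestamps
 * and window scaling when agreed; SACK-permitted taking its own four bytes
 * only when timestamps are absent, as the in-line comment notes).  The
 * aligned option lengths below are assumed example values;
 * synack_header_len() is a made-up helper.
 */
#include <stdio.h>

#define SK_TCPHDR_LEN            20
#define SK_OLEN_MSS               4
#define SK_OLEN_TSTAMP_ALIGNED   12
#define SK_OLEN_WSCALE_ALIGNED    4
#define SK_OLEN_SACKPERM_ALIGNED  4

static int synack_header_len(int tstamp_ok, int wscale_ok, int sack_ok)
{
    return SK_TCPHDR_LEN + SK_OLEN_MSS
         + (tstamp_ok ? SK_OLEN_TSTAMP_ALIGNED : 0)
         + (wscale_ok ? SK_OLEN_WSCALE_ALIGNED : 0)
         + ((sack_ok && !tstamp_ok) ? SK_OLEN_SACKPERM_ALIGNED : 0);
}

int main(void)
{
    /* All options negotiated: 20 + 4 + 12 + 4 = 40 bytes, i.e. doff = 10. */
    int len = synack_header_len(1, 1, 1);
    printf("header=%d doff=%d\n", len, len >> 2);
    return 0;
}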
*/ + th->window = htons(req->rcv_wnd); + + TCP_SKB_CB(skb)->when = tcp_time_stamp; + tcp_syn_build_options((__u32 *)(th + 1), dst->advmss, req->tstamp_ok, + req->sack_ok, req->wscale_ok, req->rcv_wscale, + TCP_SKB_CB(skb)->when, + req->ts_recent); + + skb->csum = 0; + th->doff = (tcp_header_size >> 2); + TCP_INC_STATS(TcpOutSegs); + return skb; +#else + return 0; +#endif +} + +/* + * Do all connect socket setups that can be done AF independent. + */ +static inline void tcp_connect_init(struct sock *sk) +{ +#if 0 + struct dst_entry *dst = __sk_dst_get(sk); + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + + /* We'll fix this up when we get a response from the other end. + * See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT. + */ + tp->tcp_header_len = sizeof(struct tcphdr) + + (sysctl_tcp_timestamps ? TCPOLEN_TSTAMP_ALIGNED : 0); + + /* If user gave his TCP_MAXSEG, record it to clamp */ + if (tp->user_mss) + tp->mss_clamp = tp->user_mss; + tp->max_window = 0; + tcp_sync_mss(sk, dst->pmtu); + + if (!tp->window_clamp) + tp->window_clamp = dst->window; + tp->advmss = dst->advmss; + tcp_initialize_rcv_mss(sk); + + tcp_select_initial_window(tcp_full_space(sk), + tp->advmss - (tp->ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0), + &tp->rcv_wnd, + &tp->window_clamp, + sysctl_tcp_window_scaling, + &tp->rcv_wscale); + + tp->rcv_ssthresh = tp->rcv_wnd; + + sk->err = 0; + sk->done = 0; + tp->snd_wnd = 0; + tcp_init_wl(tp, tp->write_seq, 0); + tp->snd_una = tp->write_seq; + tp->snd_sml = tp->write_seq; + tp->rcv_nxt = 0; + tp->rcv_wup = 0; + tp->copied_seq = 0; + + tp->rto = TCP_TIMEOUT_INIT; + tp->retransmits = 0; + tcp_clear_retrans(tp); +#endif +} + +/* + * Build a SYN and send it off. + */ +int tcp_connect(struct sock *sk) +{ +#if 0 + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + struct sk_buff *buff; + + tcp_connect_init(sk); + + buff = alloc_skb(MAX_TCP_HEADER + 15, sk->allocation); + if (unlikely(buff == NULL)) + return -ENOBUFS; + + /* Reserve space for headers. */ + skb_reserve(buff, MAX_TCP_HEADER); + + TCP_SKB_CB(buff)->flags = TCPCB_FLAG_SYN; + TCP_ECN_send_syn(tp, buff); + TCP_SKB_CB(buff)->sacked = 0; + buff->csum = 0; + TCP_SKB_CB(buff)->seq = tp->write_seq++; + TCP_SKB_CB(buff)->end_seq = tp->write_seq; + tp->snd_nxt = tp->write_seq; + tp->pushed_seq = tp->write_seq; + + /* Send it off. */ + TCP_SKB_CB(buff)->when = tcp_time_stamp; + tp->retrans_stamp = TCP_SKB_CB(buff)->when; + __skb_queue_tail(&sk->write_queue, buff); + tcp_charge_skb(sk, buff); + tp->packets_out++; + tcp_transmit_skb(sk, skb_clone(buff, GFP_KERNEL)); + TCP_INC_STATS(TcpActiveOpens); + + /* Timer for repeating the SYN until an answer. */ + tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); + return 0; +#else + return 0; +#endif +} + +/* Send out a delayed ack, the caller does the policy checking + * to see if we should even be here. See tcp_input.c:tcp_ack_snd_check() + * for details. + */ +void tcp_send_delayed_ack(struct sock *sk) +{ +#if 0 + struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; + int ato = tp->ack.ato; + unsigned long timeout; + + if (ato > TCP_DELACK_MIN) { + int max_ato = HZ/2; + + if (tp->ack.pingpong || (tp->ack.pending&TCP_ACK_PUSHED)) + max_ato = TCP_DELACK_MAX; + + /* Slow path, intersegment interval is "high". */ + + /* If some rtt estimate is known, use it to bound delayed ack. + * Do not use tp->rto here, use results of rtt measurements + * directly. 
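/*
 * Illustrative sketch, not part of the imported code: the comment above notes
 * that the delayed-ACK interval is bounded by a smoothed-RTT measurement
 * rather than the RTO.  bound_ato_sketch() restates that clamping standalone;
 * HZ and the delayed-ACK limits below are assumed example values.
 */
#include <stdio.h>

#define SK_HZ          100                  /* assumed jiffies per second */
#define SK_DELACK_MIN  (SK_HZ / 25)
#define SK_DELACK_MAX  (SK_HZ / 5)

static unsigned long bound_ato_sketch(unsigned long ato, unsigned long srtt_x8,
                                      int pingpong)
{
    if (ato > SK_DELACK_MIN) {
        unsigned long max_ato = pingpong ? SK_DELACK_MAX : SK_HZ / 2;

        if (srtt_x8) {                      /* srtt is stored left-shifted by 3 */
            unsigned long rtt = srtt_x8 >> 3;
            if (rtt < SK_DELACK_MIN)
                rtt = SK_DELACK_MIN;
            if (rtt < max_ato)
                max_ato = rtt;
        }
        if (ato > max_ato)
            ato = max_ato;
    }
    return ato;
}

int main(void)
{
    /* A 6-jiffy smoothed RTT (stored as 48) caps a 50-jiffy ATO at 6. */
    printf("%lu\n", bound_ato_sketch(50, 48, 0));
    return 0;
}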
+ */ + if (tp->srtt) { + int rtt = max(tp->srtt>>3, TCP_DELACK_MIN); + + if (rtt < max_ato) + max_ato = rtt; + } + + ato = min(ato, max_ato); + } + + /* Stay within the limit we were given */ + timeout = jiffies + ato; + + /* Use new timeout only if there wasn't a older one earlier. */ + if (tp->ack.pending&TCP_ACK_TIMER) { + /* If delack timer was blocked or is about to expire, + * send ACK now. + */ + if (tp->ack.blocked || time_before_eq(tp->ack.timeout, jiffies+(ato>>2))) { + tcp_send_ack(sk); + return; + } + + if (!time_before(timeout, tp->ack.timeout)) + timeout = tp->ack.timeout; + } + tp->ack.pending |= TCP_ACK_SCHED|TCP_ACK_TIMER; + tp->ack.timeout = timeout; + if (!mod_timer(&tp->delack_timer, timeout)) + sock_hold(sk); +#endif +} + +/* This routine sends an ack and also updates the window. */ +void tcp_send_ack(struct sock *sk) +{ +#if 0 + /* If we have been reset, we may not send again. */ + if(sk->state != TCP_CLOSE) { + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + struct sk_buff *buff; + + /* We are not putting this on the write queue, so + * tcp_transmit_skb() will set the ownership to this + * sock. + */ + buff = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC); + if (buff == NULL) { + tcp_schedule_ack(tp); + tp->ack.ato = TCP_ATO_MIN; + tcp_reset_xmit_timer(sk, TCP_TIME_DACK, TCP_DELACK_MAX); + return; + } + + /* Reserve space for headers and prepare control bits. */ + skb_reserve(buff, MAX_TCP_HEADER); + buff->csum = 0; + TCP_SKB_CB(buff)->flags = TCPCB_FLAG_ACK; + TCP_SKB_CB(buff)->sacked = 0; + + /* Send it off, this clears delayed acks for us. */ + TCP_SKB_CB(buff)->seq = TCP_SKB_CB(buff)->end_seq = tcp_acceptable_seq(sk, tp); + TCP_SKB_CB(buff)->when = tcp_time_stamp; + tcp_transmit_skb(sk, buff); + } +#else + return 0; +#endif +} + +/* This routine sends a packet with an out of date sequence + * number. It assumes the other end will try to ack it. + * + * Question: what should we make while urgent mode? + * 4.4BSD forces sending single byte of data. We cannot send + * out of window data, because we have SND.NXT==SND.MAX... + * + * Current solution: to send TWO zero-length segments in urgent mode: + * one is with SEG.SEQ=SND.UNA to deliver urgent pointer, another is + * out-of-date with SND.UNA-1 to probe window. + */ +static int tcp_xmit_probe_skb(struct sock *sk, int urgent) +{ +#if 0 + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + struct sk_buff *skb; + + /* We don't queue it, tcp_transmit_skb() sets ownership. */ + skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC); + if (skb == NULL) + return -1; + + /* Reserve space for headers and set control bits. */ + skb_reserve(skb, MAX_TCP_HEADER); + skb->csum = 0; + TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK; + TCP_SKB_CB(skb)->sacked = urgent; + + /* Use a previous sequence. This should cause the other + * end to send an ack. Don't queue or clone SKB, just + * send it. + */ + TCP_SKB_CB(skb)->seq = urgent ? 
tp->snd_una : tp->snd_una - 1; + TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq; + TCP_SKB_CB(skb)->when = tcp_time_stamp; + return tcp_transmit_skb(sk, skb); +#else + return 0; +#endif +} + +int tcp_write_wakeup(struct sock *sk) +{ +#if 0 + if (sk->state != TCP_CLOSE) { + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + struct sk_buff *skb; + + if ((skb = tp->send_head) != NULL && + before(TCP_SKB_CB(skb)->seq, tp->snd_una+tp->snd_wnd)) { + int err; + int mss = tcp_current_mss(sk); + int seg_size = tp->snd_una+tp->snd_wnd-TCP_SKB_CB(skb)->seq; + + if (before(tp->pushed_seq, TCP_SKB_CB(skb)->end_seq)) + tp->pushed_seq = TCP_SKB_CB(skb)->end_seq; + + /* We are probing the opening of a window + * but the window size is != 0 + * must have been a result SWS avoidance ( sender ) + */ + if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq || + skb->len > mss) { + seg_size = min(seg_size, mss); + TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH; + if (tcp_fragment(sk, skb, seg_size)) + return -1; + } + TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH; + TCP_SKB_CB(skb)->when = tcp_time_stamp; + err = tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)); + if (!err) { + update_send_head(sk, tp, skb); + } + return err; + } else { + if (tp->urg_mode && + between(tp->snd_up, tp->snd_una+1, tp->snd_una+0xFFFF)) + tcp_xmit_probe_skb(sk, TCPCB_URG); + return tcp_xmit_probe_skb(sk, 0); + } + } + return -1; +#else + return 0; +#endif +} + +/* A window probe timeout has occurred. If window is not closed send + * a partial packet else a zero probe. + */ +void tcp_send_probe0(struct sock *sk) +{ +#if 0 + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + int err; + + err = tcp_write_wakeup(sk); + + if (tp->packets_out || !tp->send_head) { + /* Cancel probe timer, if it is not required. */ + tp->probes_out = 0; + tp->backoff = 0; + return; + } + + if (err <= 0) { + tp->backoff++; + tp->probes_out++; + tcp_reset_xmit_timer (sk, TCP_TIME_PROBE0, + min(tp->rto << tp->backoff, TCP_RTO_MAX)); + } else { + /* If packet was not sent due to local congestion, + * do not backoff and do not remember probes_out. + * Let local senders to fight for local resources. + * + * Use accumulated backoff yet. + */ + if (!tp->probes_out) + tp->probes_out=1; + tcp_reset_xmit_timer (sk, TCP_TIME_PROBE0, + min(tp->rto << tp->backoff, TCP_RESOURCE_PROBE_INTERVAL)); + } +#endif +} diff --git a/reactos/drivers/net/tcpip/transport/tcp/tcp_timer.c b/reactos/drivers/net/tcpip/transport/tcp/tcp_timer.c new file mode 100755 index 00000000000..5f9affc5c4d --- /dev/null +++ b/reactos/drivers/net/tcpip/transport/tcp/tcp_timer.c @@ -0,0 +1,702 @@ +/* + * COPYRIGHT: See COPYING in the top level directory + * PROJECT: ReactOS TCP/IP protocol driver + * FILE: transport/tcp/tcp_input.c + * PURPOSE: Transmission Control Protocol + * PROGRAMMERS: Casper S. Hornstrup (chorns@users.sourceforge.net) + * REVISIONS: + * CSH 15-01-2003 Imported from linux kernel 2.4.20 + */ + +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * Implementation of the Transmission Control Protocol(TCP). + * + * Version: $Id: tcp_timer.c,v 1.1 2003/01/15 21:57:31 chorns Exp $ + * + * Authors: Ross Biro, + * Fred N. 
van Kempen, + * Mark Evans, + * Corey Minyard + * Florian La Roche, + * Charles Hedrick, + * Linus Torvalds, + * Alan Cox, + * Matthew Dillon, + * Arnt Gulbrandsen, + * Jorge Cwik, + */ + +#if 0 +#include +#else +#include "linux.h" +#include "tcpcore.h" +#endif + +int sysctl_tcp_syn_retries = TCP_SYN_RETRIES; +int sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES; +//int sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME; +int sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES; +//int sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL; +int sysctl_tcp_retries1 = TCP_RETR1; +int sysctl_tcp_retries2 = TCP_RETR2; +int sysctl_tcp_orphan_retries; + +static void tcp_write_timer(unsigned long); +static void tcp_delack_timer(unsigned long); +static void tcp_keepalive_timer (unsigned long data); + +//const char timer_bug_msg[] = KERN_DEBUG "tcpbug: unknown timer value\n"; + +/* + * Using different timers for retransmit, delayed acks and probes + * We may wish use just one timer maintaining a list of expire jiffies + * to optimize. + */ + +void tcp_init_xmit_timers(struct sock *sk) +{ +#if 0 + struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; + + init_timer(&tp->retransmit_timer); + tp->retransmit_timer.function=&tcp_write_timer; + tp->retransmit_timer.data = (unsigned long) sk; + tp->pending = 0; + + init_timer(&tp->delack_timer); + tp->delack_timer.function=&tcp_delack_timer; + tp->delack_timer.data = (unsigned long) sk; + tp->ack.pending = 0; + + init_timer(&sk->timer); + sk->timer.function=&tcp_keepalive_timer; + sk->timer.data = (unsigned long) sk; +#endif +} + +void tcp_clear_xmit_timers(struct sock *sk) +{ +#if 0 + struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; + + tp->pending = 0; + if (timer_pending(&tp->retransmit_timer) && + del_timer(&tp->retransmit_timer)) + __sock_put(sk); + + tp->ack.pending = 0; + tp->ack.blocked = 0; + if (timer_pending(&tp->delack_timer) && + del_timer(&tp->delack_timer)) + __sock_put(sk); + + if(timer_pending(&sk->timer) && del_timer(&sk->timer)) + __sock_put(sk); +#endif +} + +static void tcp_write_err(struct sock *sk) +{ +#if 0 + sk->err = sk->err_soft ? : ETIMEDOUT; + sk->error_report(sk); + + tcp_done(sk); + NET_INC_STATS_BH(TCPAbortOnTimeout); +#endif +} + +/* Do not allow orphaned sockets to eat all our resources. + * This is direct violation of TCP specs, but it is required + * to prevent DoS attacks. It is called when a retransmission timeout + * or zero probe timeout occurs on orphaned socket. + * + * Criterium is still not confirmed experimentally and may change. + * We kill the socket, if: + * 1. If number of orphaned sockets exceeds an administratively configured + * limit. + * 2. If we have strong memory pressure. + */ +static int tcp_out_of_resources(struct sock *sk, int do_reset) +{ +#if 0 + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + int orphans = atomic_read(&tcp_orphan_count); + + /* If peer does not open window for long time, or did not transmit + * anything for long time, penalize it. */ + if ((s32)(tcp_time_stamp - tp->lsndtime) > 2*TCP_RTO_MAX || !do_reset) + orphans <<= 1; + + /* If some dubious ICMP arrived, penalize even more. */ + if (sk->err_soft) + orphans <<= 1; + + if (orphans >= sysctl_tcp_max_orphans || + (sk->wmem_queued > SOCK_MIN_SNDBUF && + atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) { + if (net_ratelimit()) + printk(KERN_INFO "Out of socket memory\n"); + + /* Catch exceptional cases, when connection requires reset. + * 1. Last segment was sent recently. */ + if ((s32)(tcp_time_stamp - tp->lsndtime) <= TCP_TIMEWAIT_LEN || + /* 2. 
Window is closed. */ + (!tp->snd_wnd && !tp->packets_out)) + do_reset = 1; + if (do_reset) + tcp_send_active_reset(sk, GFP_ATOMIC); + tcp_done(sk); + NET_INC_STATS_BH(TCPAbortOnMemory); + return 1; + } + return 0; +#else + return 0; +#endif +} + +/* Calculate maximal number or retries on an orphaned socket. */ +static int tcp_orphan_retries(struct sock *sk, int alive) +{ +#if 0 + int retries = sysctl_tcp_orphan_retries; /* May be zero. */ + + /* We know from an ICMP that something is wrong. */ + if (sk->err_soft && !alive) + retries = 0; + + /* However, if socket sent something recently, select some safe + * number of retries. 8 corresponds to >100 seconds with minimal + * RTO of 200msec. */ + if (retries == 0 && alive) + retries = 8; + return retries; +#else + return 0; +#endif +} + +/* A write timeout has occurred. Process the after effects. */ +static int tcp_write_timeout(struct sock *sk) +{ +#if 0 + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + int retry_until; + + if ((1<state)&(TCPF_SYN_SENT|TCPF_SYN_RECV)) { + if (tp->retransmits) + dst_negative_advice(&sk->dst_cache); + retry_until = tp->syn_retries ? : sysctl_tcp_syn_retries; + } else { + if (tp->retransmits >= sysctl_tcp_retries1) { + /* NOTE. draft-ietf-tcpimpl-pmtud-01.txt requires pmtu black + hole detection. :-( + + It is place to make it. It is not made. I do not want + to make it. It is disguisting. It does not work in any + case. Let me to cite the same draft, which requires for + us to implement this: + + "The one security concern raised by this memo is that ICMP black holes + are often caused by over-zealous security administrators who block + all ICMP messages. It is vitally important that those who design and + deploy security systems understand the impact of strict filtering on + upper-layer protocols. The safest web site in the world is worthless + if most TCP implementations cannot transfer data from it. It would + be far nicer to have all of the black holes fixed rather than fixing + all of the TCP implementations." + + Golden words :-). + */ + + dst_negative_advice(&sk->dst_cache); + } + + retry_until = sysctl_tcp_retries2; + if (sk->dead) { + int alive = (tp->rto < TCP_RTO_MAX); + + retry_until = tcp_orphan_retries(sk, alive); + + if (tcp_out_of_resources(sk, alive || tp->retransmits < retry_until)) + return 1; + } + } + + if (tp->retransmits >= retry_until) { + /* Has it gone just too far? */ + tcp_write_err(sk); + return 1; + } + return 0; +#else + return 0; +#endif +} + +static void tcp_delack_timer(unsigned long data) +{ +#if 0 + struct sock *sk = (struct sock*)data; + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + + bh_lock_sock(sk); + if (sk->lock.users) { + /* Try again later. */ + tp->ack.blocked = 1; + NET_INC_STATS_BH(DelayedACKLocked); + if (!mod_timer(&tp->delack_timer, jiffies + TCP_DELACK_MIN)) + sock_hold(sk); + goto out_unlock; + } + + tcp_mem_reclaim(sk); + + if (sk->state == TCP_CLOSE || !(tp->ack.pending&TCP_ACK_TIMER)) + goto out; + + if ((long)(tp->ack.timeout - jiffies) > 0) { + if (!mod_timer(&tp->delack_timer, tp->ack.timeout)) + sock_hold(sk); + goto out; + } + tp->ack.pending &= ~TCP_ACK_TIMER; + + if (skb_queue_len(&tp->ucopy.prequeue)) { + struct sk_buff *skb; + + net_statistics[smp_processor_id()*2].TCPSchedulerFailed += skb_queue_len(&tp->ucopy.prequeue); + + while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) + sk->backlog_rcv(sk, skb); + + tp->ucopy.memory = 0; + } + + if (tcp_ack_scheduled(tp)) { + if (!tp->ack.pingpong) { + /* Delayed ACK missed: inflate ATO. 
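/*
 * Illustrative sketch, not part of the imported code: when the delayed-ACK
 * timer fires with an ACK still pending, the code above either inflates the
 * ACK timeout (capped by the RTO) or leaves pingpong mode and resets it to
 * the minimum.  missed_delack_sketch() is an invented restatement.
 */
#include <stdio.h>

#define SK_ATO_MIN 4   /* assumed minimum ACK timeout, in jiffies */

static unsigned long missed_delack_sketch(unsigned long ato, unsigned long rto,
                                          int *pingpong)
{
    if (!*pingpong) {
        ato <<= 1;            /* inflate ... */
        if (ato > rto)
            ato = rto;        /* ... but never beyond the retransmission timeout */
    } else {
        *pingpong = 0;        /* interactive session went quiet: quick ACKs again */
        ato = SK_ATO_MIN;
    }
    return ato;
}

int main(void)
{
    int pingpong = 0;
    printf("%lu\n", missed_delack_sketch(40, 60, &pingpong));   /* 80 capped to 60 */
    return 0;
}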
*/ + tp->ack.ato = min(tp->ack.ato << 1, tp->rto); + } else { + /* Delayed ACK missed: leave pingpong mode and + * deflate ATO. + */ + tp->ack.pingpong = 0; + tp->ack.ato = TCP_ATO_MIN; + } + tcp_send_ack(sk); + NET_INC_STATS_BH(DelayedACKs); + } + TCP_CHECK_TIMER(sk); + +out: + if (tcp_memory_pressure) + tcp_mem_reclaim(sk); +out_unlock: + bh_unlock_sock(sk); + sock_put(sk); +#endif +} + +static void tcp_probe_timer(struct sock *sk) +{ +#if 0 + struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; + int max_probes; + + if (tp->packets_out || !tp->send_head) { + tp->probes_out = 0; + return; + } + + /* *WARNING* RFC 1122 forbids this + * + * It doesn't AFAIK, because we kill the retransmit timer -AK + * + * FIXME: We ought not to do it, Solaris 2.5 actually has fixing + * this behaviour in Solaris down as a bug fix. [AC] + * + * Let me to explain. probes_out is zeroed by incoming ACKs + * even if they advertise zero window. Hence, connection is killed only + * if we received no ACKs for normal connection timeout. It is not killed + * only because window stays zero for some time, window may be zero + * until armageddon and even later. We are in full accordance + * with RFCs, only probe timer combines both retransmission timeout + * and probe timeout in one bottle. --ANK + */ + max_probes = sysctl_tcp_retries2; + + if (sk->dead) { + int alive = ((tp->rto<backoff) < TCP_RTO_MAX); + + max_probes = tcp_orphan_retries(sk, alive); + + if (tcp_out_of_resources(sk, alive || tp->probes_out <= max_probes)) + return; + } + + if (tp->probes_out > max_probes) { + tcp_write_err(sk); + } else { + /* Only send another probe if we didn't close things up. */ + tcp_send_probe0(sk); + } +#endif +} + +/* + * The TCP retransmit timer. + */ + +static void tcp_retransmit_timer(struct sock *sk) +{ +#if 0 + struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; + + if (tp->packets_out == 0) + goto out; + + BUG_TRAP(!skb_queue_empty(&sk->write_queue)); + + if (tp->snd_wnd == 0 && !sk->dead && + !((1<state)&(TCPF_SYN_SENT|TCPF_SYN_RECV))) { + /* Receiver dastardly shrinks window. Our retransmits + * become zero probes, but we should not timeout this + * connection. If the socket is an orphan, time it out, + * we cannot allow such beasts to hang infinitely. + */ +#ifdef TCP_DEBUG + if (net_ratelimit()) + printk(KERN_DEBUG "TCP: Treason uncloaked! Peer %u.%u.%u.%u:%u/%u shrinks window %u:%u. Repaired.\n", + NIPQUAD(sk->daddr), htons(sk->dport), sk->num, + tp->snd_una, tp->snd_nxt); +#endif + if (tcp_time_stamp - tp->rcv_tstamp > TCP_RTO_MAX) { + tcp_write_err(sk); + goto out; + } + tcp_enter_loss(sk, 0); + tcp_retransmit_skb(sk, skb_peek(&sk->write_queue)); + __sk_dst_reset(sk); + goto out_reset_timer; + } + + if (tcp_write_timeout(sk)) + goto out; + + if (tp->retransmits == 0) { + if (tp->ca_state == TCP_CA_Disorder || tp->ca_state == TCP_CA_Recovery) { + if (tp->sack_ok) { + if (tp->ca_state == TCP_CA_Recovery) + NET_INC_STATS_BH(TCPSackRecoveryFail); + else + NET_INC_STATS_BH(TCPSackFailures); + } else { + if (tp->ca_state == TCP_CA_Recovery) + NET_INC_STATS_BH(TCPRenoRecoveryFail); + else + NET_INC_STATS_BH(TCPRenoFailures); + } + } else if (tp->ca_state == TCP_CA_Loss) { + NET_INC_STATS_BH(TCPLossFailures); + } else { + NET_INC_STATS_BH(TCPTimeouts); + } + } + + tcp_enter_loss(sk, 0); + + if (tcp_retransmit_skb(sk, skb_peek(&sk->write_queue)) > 0) { + /* Retransmission failed because of local congestion, + * do not backoff. 
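/*
 * Illustrative sketch, not part of the imported code: the comment above
 * separates a retransmit that failed only because of local resource pressure
 * (no backoff, short re-probe) from a real timeout (exponential backoff of
 * the RTO, clamped at a maximum).  rto_after_timeout_sketch() and its
 * constants are invented placeholders summarising that policy.
 */
#include <stdio.h>

#define SK_RTO_MAX         12000   /* placeholder clamp, in jiffies */
#define SK_PROBE_INTERVAL     50   /* placeholder local-congestion retry */

static unsigned long rto_after_timeout_sketch(unsigned long rto, int local_failure,
                                              unsigned int *backoff)
{
    if (local_failure)                        /* do not back off; rearm soon */
        return rto < SK_PROBE_INTERVAL ? rto : SK_PROBE_INTERVAL;

    (*backoff)++;
    rto <<= 1;                                /* classic doubling ... */
    if (rto > SK_RTO_MAX)
        rto = SK_RTO_MAX;                     /* ... bounded above */
    return rto;
}

int main(void)
{
    unsigned int backoff = 0;
    unsigned long rto = 3000;
    rto = rto_after_timeout_sketch(rto, 0, &backoff);   /* 6000 */
    rto = rto_after_timeout_sketch(rto, 0, &backoff);   /* 12000 */
    printf("rto=%lu backoff=%u\n", rto, backoff);
    return 0;
}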
+ */ + if (!tp->retransmits) + tp->retransmits=1; + tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, + min(tp->rto, TCP_RESOURCE_PROBE_INTERVAL)); + goto out; + } + + /* Increase the timeout each time we retransmit. Note that + * we do not increase the rtt estimate. rto is initialized + * from rtt, but increases here. Jacobson (SIGCOMM 88) suggests + * that doubling rto each time is the least we can get away with. + * In KA9Q, Karn uses this for the first few times, and then + * goes to quadratic. netBSD doubles, but only goes up to *64, + * and clamps at 1 to 64 sec afterwards. Note that 120 sec is + * defined in the protocol as the maximum possible RTT. I guess + * we'll have to use something other than TCP to talk to the + * University of Mars. + * + * PAWS allows us longer timeouts and large windows, so once + * implemented ftp to mars will work nicely. We will have to fix + * the 120 second clamps though! + */ + tp->backoff++; + tp->retransmits++; + +out_reset_timer: + tp->rto = min(tp->rto << 1, TCP_RTO_MAX); + tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); + if (tp->retransmits > sysctl_tcp_retries1) + __sk_dst_reset(sk); + +out:; +#endif +} + +static void tcp_write_timer(unsigned long data) +{ +#if 0 + struct sock *sk = (struct sock*)data; + struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; + int event; + + bh_lock_sock(sk); + if (sk->lock.users) { + /* Try again later */ + if (!mod_timer(&tp->retransmit_timer, jiffies + (HZ/20))) + sock_hold(sk); + goto out_unlock; + } + + if (sk->state == TCP_CLOSE || !tp->pending) + goto out; + + if ((long)(tp->timeout - jiffies) > 0) { + if (!mod_timer(&tp->retransmit_timer, tp->timeout)) + sock_hold(sk); + goto out; + } + + event = tp->pending; + tp->pending = 0; + + switch (event) { + case TCP_TIME_RETRANS: + tcp_retransmit_timer(sk); + break; + case TCP_TIME_PROBE0: + tcp_probe_timer(sk); + break; + } + TCP_CHECK_TIMER(sk); + +out: + tcp_mem_reclaim(sk); +out_unlock: + bh_unlock_sock(sk); + sock_put(sk); +#endif +} + +/* + * Timer for listening sockets + */ + +static void tcp_synack_timer(struct sock *sk) +{ +#if 0 + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + struct tcp_listen_opt *lopt = tp->listen_opt; + int max_retries = tp->syn_retries ? : sysctl_tcp_synack_retries; + int thresh = max_retries; + unsigned long now = jiffies; + struct open_request **reqp, *req; + int i, budget; + + if (lopt == NULL || lopt->qlen == 0) + return; + + /* Normally all the openreqs are young and become mature + * (i.e. converted to established socket) for first timeout. + * If synack was not acknowledged for 3 seconds, it means + * one of the following things: synack was lost, ack was lost, + * rtt is high or nobody planned to ack (i.e. synflood). + * When server is a bit loaded, queue is populated with old + * open requests, reducing effective size of queue. + * When server is well loaded, queue size reduces to zero + * after several minutes of work. It is not synflood, + * it is normal operation. The solution is pruning + * too old entries overriding normal timeout, when + * situation becomes dangerous. + * + * Essentially, we reserve half of room for young + * embrions; and abort old ones without pity, if old + * ones are about to clog our table. 
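/*
 * Illustrative sketch, not part of the imported code: the comment above
 * describes lowering the SYN-ACK retry threshold when the listen queue fills
 * with old entries, so roughly half of the table stays reserved for young
 * requests.  prune_thresh_sketch() is a simplified standalone restatement of
 * that heuristic; all names and numbers are invented.
 */
#include <stdio.h>

static int prune_thresh_sketch(int max_retries, unsigned int qlen,
                               unsigned int qlen_young, unsigned int max_qlen_log)
{
    int thresh = max_retries;

    /* Only tighten once the queue is more than half full. */
    if (qlen >> (max_qlen_log - 1)) {
        unsigned int young = qlen_young << 1;

        while (thresh > 2) {
            if (qlen < young)
                break;
            thresh--;
            young <<= 1;
        }
    }
    return thresh;
}

int main(void)
{
    /* 900 queued, only 100 young, table of 2^10 slots: retries drop from 5 to 2. */
    printf("%d\n", prune_thresh_sketch(5, 900, 100, 10));
    return 0;
}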
+ */ + if (lopt->qlen>>(lopt->max_qlen_log-1)) { + int young = (lopt->qlen_young<<1); + + while (thresh > 2) { + if (lopt->qlen < young) + break; + thresh--; + young <<= 1; + } + } + + if (tp->defer_accept) + max_retries = tp->defer_accept; + + budget = 2*(TCP_SYNQ_HSIZE/(TCP_TIMEOUT_INIT/TCP_SYNQ_INTERVAL)); + i = lopt->clock_hand; + + do { + reqp=&lopt->syn_table[i]; + while ((req = *reqp) != NULL) { + if ((long)(now - req->expires) >= 0) { + if ((req->retrans < thresh || + (req->acked && req->retrans < max_retries)) + && !req->class->rtx_syn_ack(sk, req, NULL)) { + unsigned long timeo; + + if (req->retrans++ == 0) + lopt->qlen_young--; + timeo = min((TCP_TIMEOUT_INIT << req->retrans), + TCP_RTO_MAX); + req->expires = now + timeo; + reqp = &req->dl_next; + continue; + } + + /* Drop this request */ + write_lock(&tp->syn_wait_lock); + *reqp = req->dl_next; + write_unlock(&tp->syn_wait_lock); + lopt->qlen--; + if (req->retrans == 0) + lopt->qlen_young--; + tcp_openreq_free(req); + continue; + } + reqp = &req->dl_next; + } + + i = (i+1)&(TCP_SYNQ_HSIZE-1); + + } while (--budget > 0); + + lopt->clock_hand = i; + + if (lopt->qlen) + tcp_reset_keepalive_timer(sk, TCP_SYNQ_INTERVAL); +#endif +} + +void tcp_delete_keepalive_timer (struct sock *sk) +{ +#if 0 + if (timer_pending(&sk->timer) && del_timer (&sk->timer)) + __sock_put(sk); +#endif +} + +void tcp_reset_keepalive_timer (struct sock *sk, unsigned long len) +{ +#if 0 + if (!mod_timer(&sk->timer, jiffies+len)) + sock_hold(sk); +#endif +} + +void tcp_set_keepalive(struct sock *sk, int val) +{ +#if 0 + if ((1<state)&(TCPF_CLOSE|TCPF_LISTEN)) + return; + + if (val && !sk->keepopen) + tcp_reset_keepalive_timer(sk, keepalive_time_when(&sk->tp_pinfo.af_tcp)); + else if (!val) + tcp_delete_keepalive_timer(sk); +#endif +} + + +static void tcp_keepalive_timer (unsigned long data) +{ +#if 0 + struct sock *sk = (struct sock *) data; + struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; + __u32 elapsed; + + /* Only process if socket is not in use. */ + bh_lock_sock(sk); + if (sk->lock.users) { + /* Try again later. */ + tcp_reset_keepalive_timer (sk, HZ/20); + goto out; + } + + if (sk->state == TCP_LISTEN) { + tcp_synack_timer(sk); + goto out; + } + + if (sk->state == TCP_FIN_WAIT2 && sk->dead) { + if (tp->linger2 >= 0) { + int tmo = tcp_fin_time(tp) - TCP_TIMEWAIT_LEN; + + if (tmo > 0) { + tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); + goto out; + } + } + tcp_send_active_reset(sk, GFP_ATOMIC); + goto death; + } + + if (!sk->keepopen || sk->state == TCP_CLOSE) + goto out; + + elapsed = keepalive_time_when(tp); + + /* It is alive without keepalive 8) */ + if (tp->packets_out || tp->send_head) + goto resched; + + elapsed = tcp_time_stamp - tp->rcv_tstamp; + + if (elapsed >= keepalive_time_when(tp)) { + if ((!tp->keepalive_probes && tp->probes_out >= sysctl_tcp_keepalive_probes) || + (tp->keepalive_probes && tp->probes_out >= tp->keepalive_probes)) { + tcp_send_active_reset(sk, GFP_ATOMIC); + tcp_write_err(sk); + goto out; + } + if (tcp_write_wakeup(sk) <= 0) { + tp->probes_out++; + elapsed = keepalive_intvl_when(tp); + } else { + /* If keepalive was lost due to local congestion, + * try harder. 
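/*
 * Illustrative sketch, not part of the imported code: the keepalive timer
 * above measures how long the connection has been idle and either probes now
 * or re-arms for the remainder of the keepalive period.
 * next_keepalive_sketch() is an invented helper; times are in jiffies.
 */
#include <stdio.h>

static unsigned long next_keepalive_sketch(unsigned long now, unsigned long last_rcv,
                                           unsigned long keepalive_time,
                                           unsigned long keepalive_intvl,
                                           int *probe_now)
{
    unsigned long elapsed = now - last_rcv;

    if (elapsed >= keepalive_time) {
        *probe_now = 1;                 /* idle long enough: send a probe */
        return keepalive_intvl;         /* then re-check after the probe interval */
    }
    *probe_now = 0;
    return keepalive_time - elapsed;    /* sleep out the rest of the idle period */
}

int main(void)
{
    int probe;
    /* Idle for 3000 of a 7200-jiffy period: re-arm for the remaining 4200. */
    printf("%lu probe=%d\n",
           next_keepalive_sketch(10000, 7000, 7200, 75, &probe), probe);
    return 0;
}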
+ */ + elapsed = TCP_RESOURCE_PROBE_INTERVAL; + } + } else { + /* It is tp->rcv_tstamp + keepalive_time_when(tp) */ + elapsed = keepalive_time_when(tp) - elapsed; + } + + TCP_CHECK_TIMER(sk); + tcp_mem_reclaim(sk); + +resched: + tcp_reset_keepalive_timer (sk, elapsed); + goto out; + +death: + tcp_done(sk); + +out: + bh_unlock_sock(sk); + sock_put(sk); +#endif +} diff --git a/reactos/drivers/net/tcpip/transport/tcp/tcpcore.c b/reactos/drivers/net/tcpip/transport/tcp/tcpcore.c new file mode 100755 index 00000000000..8e1b3f93736 --- /dev/null +++ b/reactos/drivers/net/tcpip/transport/tcp/tcpcore.c @@ -0,0 +1,2783 @@ +/* + * COPYRIGHT: See COPYING in the top level directory + * PROJECT: ReactOS TCP/IP protocol driver + * FILE: transport/tcp/tcpcore.c + * PURPOSE: Transmission Control Protocol + * PROGRAMMERS: Casper S. Hornstrup (chorns@users.sourceforge.net) + * REVISIONS: + * CSH 15-01-2003 Imported from linux kernel 2.4.20 + */ + +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * Implementation of the Transmission Control Protocol(TCP). + * + * Version: $Id: tcpcore.c,v 1.1 2003/01/15 21:57:31 chorns Exp $ + * + * Authors: Ross Biro, + * Fred N. van Kempen, + * Mark Evans, + * Corey Minyard + * Florian La Roche, + * Charles Hedrick, + * Linus Torvalds, + * Alan Cox, + * Matthew Dillon, + * Arnt Gulbrandsen, + * Jorge Cwik, + * + * Fixes: + * Alan Cox : Numerous verify_area() calls + * Alan Cox : Set the ACK bit on a reset + * Alan Cox : Stopped it crashing if it closed while + * sk->inuse=1 and was trying to connect + * (tcp_err()). + * Alan Cox : All icmp error handling was broken + * pointers passed where wrong and the + * socket was looked up backwards. Nobody + * tested any icmp error code obviously. + * Alan Cox : tcp_err() now handled properly. It + * wakes people on errors. poll + * behaves and the icmp error race + * has gone by moving it into sock.c + * Alan Cox : tcp_send_reset() fixed to work for + * everything not just packets for + * unknown sockets. + * Alan Cox : tcp option processing. + * Alan Cox : Reset tweaked (still not 100%) [Had + * syn rule wrong] + * Herp Rosmanith : More reset fixes + * Alan Cox : No longer acks invalid rst frames. + * Acking any kind of RST is right out. + * Alan Cox : Sets an ignore me flag on an rst + * receive otherwise odd bits of prattle + * escape still + * Alan Cox : Fixed another acking RST frame bug. + * Should stop LAN workplace lockups. + * Alan Cox : Some tidyups using the new skb list + * facilities + * Alan Cox : sk->keepopen now seems to work + * Alan Cox : Pulls options out correctly on accepts + * Alan Cox : Fixed assorted sk->rqueue->next errors + * Alan Cox : PSH doesn't end a TCP read. Switched a + * bit to skb ops. + * Alan Cox : Tidied tcp_data to avoid a potential + * nasty. + * Alan Cox : Added some better commenting, as the + * tcp is hard to follow + * Alan Cox : Removed incorrect check for 20 * psh + * Michael O'Reilly : ack < copied bug fix. + * Johannes Stille : Misc tcp fixes (not all in yet). + * Alan Cox : FIN with no memory -> CRASH + * Alan Cox : Added socket option proto entries. + * Also added awareness of them to accept. + * Alan Cox : Added TCP options (SOL_TCP) + * Alan Cox : Switched wakeup calls to callbacks, + * so the kernel can layer network + * sockets. + * Alan Cox : Use ip_tos/ip_ttl settings. + * Alan Cox : Handle FIN (more) properly (we hope). 
+ * Alan Cox : RST frames sent on unsynchronised + * state ack error. + * Alan Cox : Put in missing check for SYN bit. + * Alan Cox : Added tcp_select_window() aka NET2E + * window non shrink trick. + * Alan Cox : Added a couple of small NET2E timer + * fixes + * Charles Hedrick : TCP fixes + * Toomas Tamm : TCP window fixes + * Alan Cox : Small URG fix to rlogin ^C ack fight + * Charles Hedrick : Rewrote most of it to actually work + * Linus : Rewrote tcp_read() and URG handling + * completely + * Gerhard Koerting: Fixed some missing timer handling + * Matthew Dillon : Reworked TCP machine states as per RFC + * Gerhard Koerting: PC/TCP workarounds + * Adam Caldwell : Assorted timer/timing errors + * Matthew Dillon : Fixed another RST bug + * Alan Cox : Move to kernel side addressing changes. + * Alan Cox : Beginning work on TCP fastpathing + * (not yet usable) + * Arnt Gulbrandsen: Turbocharged tcp_check() routine. + * Alan Cox : TCP fast path debugging + * Alan Cox : Window clamping + * Michael Riepe : Bug in tcp_check() + * Matt Dillon : More TCP improvements and RST bug fixes + * Matt Dillon : Yet more small nasties remove from the + * TCP code (Be very nice to this man if + * tcp finally works 100%) 8) + * Alan Cox : BSD accept semantics. + * Alan Cox : Reset on closedown bug. + * Peter De Schrijver : ENOTCONN check missing in tcp_sendto(). + * Michael Pall : Handle poll() after URG properly in + * all cases. + * Michael Pall : Undo the last fix in tcp_read_urg() + * (multi URG PUSH broke rlogin). + * Michael Pall : Fix the multi URG PUSH problem in + * tcp_readable(), poll() after URG + * works now. + * Michael Pall : recv(...,MSG_OOB) never blocks in the + * BSD api. + * Alan Cox : Changed the semantics of sk->socket to + * fix a race and a signal problem with + * accept() and async I/O. + * Alan Cox : Relaxed the rules on tcp_sendto(). + * Yury Shevchuk : Really fixed accept() blocking problem. + * Craig I. Hagan : Allow for BSD compatible TIME_WAIT for + * clients/servers which listen in on + * fixed ports. + * Alan Cox : Cleaned the above up and shrank it to + * a sensible code size. + * Alan Cox : Self connect lockup fix. + * Alan Cox : No connect to multicast. + * Ross Biro : Close unaccepted children on master + * socket close. + * Alan Cox : Reset tracing code. + * Alan Cox : Spurious resets on shutdown. + * Alan Cox : Giant 15 minute/60 second timer error + * Alan Cox : Small whoops in polling before an + * accept. + * Alan Cox : Kept the state trace facility since + * it's handy for debugging. + * Alan Cox : More reset handler fixes. + * Alan Cox : Started rewriting the code based on + * the RFC's for other useful protocol + * references see: Comer, KA9Q NOS, and + * for a reference on the difference + * between specifications and how BSD + * works see the 4.4lite source. + * A.N.Kuznetsov : Don't time wait on completion of tidy + * close. + * Linus Torvalds : Fin/Shutdown & copied_seq changes. + * Linus Torvalds : Fixed BSD port reuse to work first syn + * Alan Cox : Reimplemented timers as per the RFC + * and using multiple timers for sanity. + * Alan Cox : Small bug fixes, and a lot of new + * comments. + * Alan Cox : Fixed dual reader crash by locking + * the buffers (much like datagram.c) + * Alan Cox : Fixed stuck sockets in probe. A probe + * now gets fed up of retrying without + * (even a no space) answer. + * Alan Cox : Extracted closing code better + * Alan Cox : Fixed the closing state machine to + * resemble the RFC. + * Alan Cox : More 'per spec' fixes. 
+ * Jorge Cwik : Even faster checksumming. + * Alan Cox : tcp_data() doesn't ack illegal PSH + * only frames. At least one pc tcp stack + * generates them. + * Alan Cox : Cache last socket. + * Alan Cox : Per route irtt. + * Matt Day : poll()->select() match BSD precisely on error + * Alan Cox : New buffers + * Marc Tamsky : Various sk->prot->retransmits and + * sk->retransmits misupdating fixed. + * Fixed tcp_write_timeout: stuck close, + * and TCP syn retries gets used now. + * Mark Yarvis : In tcp_read_wakeup(), don't send an + * ack if state is TCP_CLOSED. + * Alan Cox : Look up device on a retransmit - routes may + * change. Doesn't yet cope with MSS shrink right + * but its a start! + * Marc Tamsky : Closing in closing fixes. + * Mike Shaver : RFC1122 verifications. + * Alan Cox : rcv_saddr errors. + * Alan Cox : Block double connect(). + * Alan Cox : Small hooks for enSKIP. + * Alexey Kuznetsov: Path MTU discovery. + * Alan Cox : Support soft errors. + * Alan Cox : Fix MTU discovery pathological case + * when the remote claims no mtu! + * Marc Tamsky : TCP_CLOSE fix. + * Colin (G3TNE) : Send a reset on syn ack replies in + * window but wrong (fixes NT lpd problems) + * Pedro Roque : Better TCP window handling, delayed ack. + * Joerg Reuter : No modification of locked buffers in + * tcp_do_retransmit() + * Eric Schenk : Changed receiver side silly window + * avoidance algorithm to BSD style + * algorithm. This doubles throughput + * against machines running Solaris, + * and seems to result in general + * improvement. + * Stefan Magdalinski : adjusted tcp_readable() to fix FIONREAD + * Willy Konynenberg : Transparent proxying support. + * Mike McLagan : Routing by source + * Keith Owens : Do proper merging with partial SKB's in + * tcp_do_sendmsg to avoid burstiness. + * Eric Schenk : Fix fast close down bug with + * shutdown() followed by close(). + * Andi Kleen : Make poll agree with SIGIO + * Salvatore Sanfilippo : Support SO_LINGER with linger == 1 and + * lingertime == 0 (RFC 793 ABORT Call) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or(at your option) any later version. + * + * Description of States: + * + * TCP_SYN_SENT sent a connection request, waiting for ack + * + * TCP_SYN_RECV received a connection request, sent ack, + * waiting for final ack in three-way handshake. + * + * TCP_ESTABLISHED connection established + * + * TCP_FIN_WAIT1 our side has shutdown, waiting to complete + * transmission of remaining buffered data + * + * TCP_FIN_WAIT2 all buffered data sent, waiting for remote + * to shutdown + * + * TCP_CLOSING both sides have shutdown but we still have + * data we have to finish sending + * + * TCP_TIME_WAIT timeout to catch resent junk before entering + * closed, can only be entered from FIN_WAIT2 + * or CLOSING. Required because the other end + * may not have gotten our last ACK causing it + * to retransmit the data packet (which we ignore) + * + * TCP_CLOSE_WAIT remote side has shutdown and is waiting for + * us to finish writing our data and to shutdown + * (we have to close() to move on to LAST_ACK) + * + * TCP_LAST_ACK out side has shutdown after remote has + * shutdown. 
There may still be data in our + * buffer that we have to finish sending + * + * TCP_CLOSE socket is finished + */ + +#if 0 +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#else +#include "linux.h" +#include "tcpcore.h" +#endif + +int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT; + +#ifdef ROS_STATISTICS +struct tcp_mib tcp_statistics[NR_CPUS*2]; +#endif + +kmem_cache_t *tcp_openreq_cachep; +kmem_cache_t *tcp_bucket_cachep; +kmem_cache_t *tcp_timewait_cachep; + +#if 0 +atomic_t tcp_orphan_count = ATOMIC_INIT(0); +#endif + +int sysctl_tcp_mem[3]; +int sysctl_tcp_wmem[3] = { 4*1024, 16*1024, 128*1024 }; +int sysctl_tcp_rmem[3] = { 4*1024, 87380, 87380*2 }; + +atomic_t tcp_memory_allocated; /* Current allocated memory. */ +atomic_t tcp_sockets_allocated; /* Current number of TCP sockets. */ + +/* Pressure flag: try to collapse. + * Technical note: it is used by multiple contexts non atomically. + * All the tcp_mem_schedule() is of this nature: accounting + * is strict, actions are advisory and have some latency. */ +int tcp_memory_pressure; + +#define TCP_PAGES(amt) (((amt)+TCP_MEM_QUANTUM-1)/TCP_MEM_QUANTUM) + +int tcp_mem_schedule(struct sock *sk, int size, int kind) +{ + int amt = TCP_PAGES(size); + + sk->forward_alloc += amt*TCP_MEM_QUANTUM; + atomic_add(amt, &tcp_memory_allocated); + + /* Under limit. */ + if (atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) { + if (tcp_memory_pressure) + tcp_memory_pressure = 0; + return 1; + } + + /* Over hard limit. */ + if (atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2]) { + tcp_enter_memory_pressure(); + goto suppress_allocation; + } + + /* Under pressure. */ + if (atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[1]) + tcp_enter_memory_pressure(); + + if (kind) { + if (atomic_read(&sk->rmem_alloc) < sysctl_tcp_rmem[0]) + return 1; + } else { + if (sk->wmem_queued < sysctl_tcp_wmem[0]) + return 1; + } + + if (!tcp_memory_pressure || + sysctl_tcp_mem[2] > atomic_read(&tcp_sockets_allocated) + * TCP_PAGES(sk->wmem_queued+atomic_read(&sk->rmem_alloc)+ + sk->forward_alloc)) + return 1; + +suppress_allocation: + + if (kind == 0) { + tcp_moderate_sndbuf(sk); + + /* Fail only if socket is _under_ its sndbuf. + * In this case we cannot block, so that we have to fail. + */ + if (sk->wmem_queued+size >= sk->sndbuf) + return 1; + } + + /* Alas. Undo changes. */ + sk->forward_alloc -= amt*TCP_MEM_QUANTUM; + atomic_sub(amt, &tcp_memory_allocated); + return 0; +} + +void __tcp_mem_reclaim(struct sock *sk) +{ + if (sk->forward_alloc >= TCP_MEM_QUANTUM) { + atomic_sub(sk->forward_alloc/TCP_MEM_QUANTUM, &tcp_memory_allocated); + sk->forward_alloc &= (TCP_MEM_QUANTUM-1); + if (tcp_memory_pressure && + atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) + tcp_memory_pressure = 0; + } +} + +void tcp_rfree(struct sk_buff *skb) +{ + struct sock *sk = skb->sk; + + atomic_sub(skb->truesize, &sk->rmem_alloc); + sk->forward_alloc += skb->truesize; +} + +/* + * LISTEN is a special case for poll.. + */ +static __inline__ unsigned int tcp_listen_poll(struct sock *sk, poll_table *wait) +{ + return sk->tp_pinfo.af_tcp.accept_queue ? (POLLIN | POLLRDNORM) : 0; +} + +/* + * Wait for a TCP event. + * + * Note that we don't need to lock the socket, as the upper poll layers + * take care of normal races (between the test and the event) and we don't + * go look at any of the socket buffers directly. 
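/*
 * Illustrative sketch, not part of the imported code: tcp_mem_schedule()
 * above accounts socket memory in fixed-size quanta, rounding a request up
 * to whole quanta before charging the global counter.  SK_QUANTUM and
 * charge_quanta_sketch() are illustrative names and values.
 */
#include <stdio.h>

#define SK_QUANTUM 4096   /* assumed accounting quantum, in bytes */

static int charge_quanta_sketch(int size)
{
    /* Round up: even a 1-byte request costs one full quantum. */
    return (size + SK_QUANTUM - 1) / SK_QUANTUM;
}

int main(void)
{
    printf("%d %d %d\n",
           charge_quanta_sketch(1),       /* 1 */
           charge_quanta_sketch(4096),    /* 1 */
           charge_quanta_sketch(6000));   /* 2 */
    return 0;
}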
+ */ +unsigned int tcp_poll(struct file * file, struct socket *sock, poll_table *wait) +{ +#if 0 + unsigned int mask; + struct sock *sk = sock->sk; + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + + poll_wait(file, sk->sleep, wait); + if (sk->state == TCP_LISTEN) + return tcp_listen_poll(sk, wait); + + /* Socket is not locked. We are protected from async events + by poll logic and correct handling of state changes + made by another threads is impossible in any case. + */ + + mask = 0; + if (sk->err) + mask = POLLERR; + + /* + * POLLHUP is certainly not done right. But poll() doesn't + * have a notion of HUP in just one direction, and for a + * socket the read side is more interesting. + * + * Some poll() documentation says that POLLHUP is incompatible + * with the POLLOUT/POLLWR flags, so somebody should check this + * all. But careful, it tends to be safer to return too many + * bits than too few, and you can easily break real applications + * if you don't tell them that something has hung up! + * + * Check-me. + * + * Check number 1. POLLHUP is _UNMASKABLE_ event (see UNIX98 and + * our fs/select.c). It means that after we received EOF, + * poll always returns immediately, making impossible poll() on write() + * in state CLOSE_WAIT. One solution is evident --- to set POLLHUP + * if and only if shutdown has been made in both directions. + * Actually, it is interesting to look how Solaris and DUX + * solve this dilemma. I would prefer, if PULLHUP were maskable, + * then we could set it on SND_SHUTDOWN. BTW examples given + * in Stevens' books assume exactly this behaviour, it explains + * why PULLHUP is incompatible with POLLOUT. --ANK + * + * NOTE. Check for TCP_CLOSE is added. The goal is to prevent + * blocking on fresh not-connected or disconnected socket. --ANK + */ + if (sk->shutdown == SHUTDOWN_MASK || sk->state == TCP_CLOSE) + mask |= POLLHUP; + if (sk->shutdown & RCV_SHUTDOWN) + mask |= POLLIN | POLLRDNORM; + + /* Connected? */ + if ((1 << sk->state) & ~(TCPF_SYN_SENT|TCPF_SYN_RECV)) { + /* Potential race condition. If read of tp below will + * escape above sk->state, we can be illegally awaken + * in SYN_* states. */ + if ((tp->rcv_nxt != tp->copied_seq) && + (tp->urg_seq != tp->copied_seq || + tp->rcv_nxt != tp->copied_seq+1 || + sk->urginline || !tp->urg_data)) + mask |= POLLIN | POLLRDNORM; + + if (!(sk->shutdown & SEND_SHUTDOWN)) { + if (tcp_wspace(sk) >= tcp_min_write_space(sk)) { + mask |= POLLOUT | POLLWRNORM; + } else { /* send SIGIO later */ + set_bit(SOCK_ASYNC_NOSPACE, &sk->socket->flags); + set_bit(SOCK_NOSPACE, &sk->socket->flags); + + /* Race breaker. If space is freed after + * wspace test but before the flags are set, + * IO signal will be lost. + */ + if (tcp_wspace(sk) >= tcp_min_write_space(sk)) + mask |= POLLOUT | POLLWRNORM; + } + } + + if (tp->urg_data & TCP_URG_VALID) + mask |= POLLPRI; + } + return mask; +#else + return 0; +#endif +} + +/* + * TCP socket write_space callback. 
+ */ +void tcp_write_space(struct sock *sk) +{ +#if 0 + struct socket *sock = sk->socket; + + if (tcp_wspace(sk) >= tcp_min_write_space(sk) && sock) { + clear_bit(SOCK_NOSPACE, &sock->flags); + + if (sk->sleep && waitqueue_active(sk->sleep)) + wake_up_interruptible(sk->sleep); + + if (sock->fasync_list && !(sk->shutdown&SEND_SHUTDOWN)) + sock_wake_async(sock, 2, POLL_OUT); + } +#endif +} + +int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg) +{ +#if 0 + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + int answ; + + switch(cmd) { + case SIOCINQ: + if (sk->state == TCP_LISTEN) + return(-EINVAL); + + lock_sock(sk); + if ((1<state) & (TCPF_SYN_SENT|TCPF_SYN_RECV)) + answ = 0; + else if (sk->urginline || !tp->urg_data || + before(tp->urg_seq,tp->copied_seq) || + !before(tp->urg_seq,tp->rcv_nxt)) { + answ = tp->rcv_nxt - tp->copied_seq; + + /* Subtract 1, if FIN is in queue. */ + if (answ && !skb_queue_empty(&sk->receive_queue)) + answ -= ((struct sk_buff*)sk->receive_queue.prev)->h.th->fin; + } else + answ = tp->urg_seq - tp->copied_seq; + release_sock(sk); + break; + case SIOCATMARK: + { + answ = tp->urg_data && tp->urg_seq == tp->copied_seq; + break; + } + case SIOCOUTQ: + if (sk->state == TCP_LISTEN) + return(-EINVAL); + + if ((1<state) & (TCPF_SYN_SENT|TCPF_SYN_RECV)) + answ = 0; + else + answ = tp->write_seq - tp->snd_una; + break; + default: + return(-ENOIOCTLCMD); + }; + + return put_user(answ, (int *)arg); +#else +return 0; +#endif +} + + +int tcp_listen_start(struct sock *sk) +{ +#if 0 + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + struct tcp_listen_opt *lopt; + + sk->max_ack_backlog = 0; + sk->ack_backlog = 0; + tp->accept_queue = tp->accept_queue_tail = NULL; + tp->syn_wait_lock = RW_LOCK_UNLOCKED; + tcp_delack_init(tp); + + lopt = kmalloc(sizeof(struct tcp_listen_opt), GFP_KERNEL); + if (!lopt) + return -ENOMEM; + + memset(lopt, 0, sizeof(struct tcp_listen_opt)); + for (lopt->max_qlen_log = 6; ; lopt->max_qlen_log++) + if ((1<max_qlen_log) >= sysctl_max_syn_backlog) + break; + + write_lock_bh(&tp->syn_wait_lock); + tp->listen_opt = lopt; + write_unlock_bh(&tp->syn_wait_lock); + + /* There is race window here: we announce ourselves listening, + * but this transition is still not validated by get_port(). + * It is OK, because this socket enters to hash table only + * after validation is complete. + */ + sk->state = TCP_LISTEN; + if (sk->prot->get_port(sk, sk->num) == 0) { + sk->sport = htons(sk->num); + + sk_dst_reset(sk); + sk->prot->hash(sk); + + return 0; + } + + sk->state = TCP_CLOSE; + write_lock_bh(&tp->syn_wait_lock); + tp->listen_opt = NULL; + write_unlock_bh(&tp->syn_wait_lock); + kfree(lopt); + return -EADDRINUSE; +#endif +} + +/* + * This routine closes sockets which have been at least partially + * opened, but not yet accepted. 
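/*
 * Illustrative sketch, not part of the imported code: tcp_listen_start()
 * above sizes the SYN hash table as the smallest power of two that covers
 * the configured backlog, never smaller than 2^6.  syn_table_log_sketch()
 * restates that loop standalone; the backlog values are examples.
 */
#include <stdio.h>

static int syn_table_log_sketch(int max_syn_backlog)
{
    int log;

    for (log = 6; ; log++)
        if ((1 << log) >= max_syn_backlog)
            break;
    return log;
}

int main(void)
{
    printf("%d %d\n",
           syn_table_log_sketch(1024),   /* 10: 2^10 = 1024 slots */
           syn_table_log_sketch(50));    /* 6: never below 2^6    */
    return 0;
}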
+ */ + +static void tcp_listen_stop (struct sock *sk) +{ +#if 0 + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + struct tcp_listen_opt *lopt = tp->listen_opt; + struct open_request *acc_req = tp->accept_queue; + struct open_request *req; + int i; + + tcp_delete_keepalive_timer(sk); + + /* make all the listen_opt local to us */ + write_lock_bh(&tp->syn_wait_lock); + tp->listen_opt =NULL; + write_unlock_bh(&tp->syn_wait_lock); + tp->accept_queue = tp->accept_queue_tail = NULL; + + if (lopt->qlen) { + for (i=0; isyn_table[i]) != NULL) { + lopt->syn_table[i] = req->dl_next; + lopt->qlen--; + tcp_openreq_free(req); + + /* Following specs, it would be better either to send FIN + * (and enter FIN-WAIT-1, it is normal close) + * or to send active reset (abort). + * Certainly, it is pretty dangerous while synflood, but it is + * bad justification for our negligence 8) + * To be honest, we are not able to make either + * of the variants now. --ANK + */ + } + } + } + BUG_TRAP(lopt->qlen == 0); + + kfree(lopt); + + while ((req=acc_req) != NULL) { + struct sock *child = req->sk; + + acc_req = req->dl_next; + + local_bh_disable(); + bh_lock_sock(child); + BUG_TRAP(child->lock.users==0); + sock_hold(child); + + tcp_disconnect(child, O_NONBLOCK); + + sock_orphan(child); + + atomic_inc(&tcp_orphan_count); + + tcp_destroy_sock(child); + + bh_unlock_sock(child); + local_bh_enable(); + sock_put(child); + + tcp_acceptq_removed(sk); + tcp_openreq_fastfree(req); + } + BUG_TRAP(sk->ack_backlog == 0); +#endif +} + +/* + * Wait for a socket to get into the connected state + * + * Note: Must be called with the socket locked. + */ +static int wait_for_tcp_connect(struct sock * sk, int flags, long *timeo_p) +{ +#if 0 + struct task_struct *tsk = current; + DECLARE_WAITQUEUE(wait, tsk); + + while((1 << sk->state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) { + if(sk->err) + return sock_error(sk); + if((1 << sk->state) & + ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) + return -EPIPE; + if(!*timeo_p) + return -EAGAIN; + if(signal_pending(tsk)) + return sock_intr_errno(*timeo_p); + + __set_task_state(tsk, TASK_INTERRUPTIBLE); + add_wait_queue(sk->sleep, &wait); + sk->tp_pinfo.af_tcp.write_pending++; + + release_sock(sk); + *timeo_p = schedule_timeout(*timeo_p); + lock_sock(sk); + + __set_task_state(tsk, TASK_RUNNING); + remove_wait_queue(sk->sleep, &wait); + sk->tp_pinfo.af_tcp.write_pending--; + } + return 0; +#else + return 0; +#endif +} + +static inline int tcp_memory_free(struct sock *sk) +{ + return sk->wmem_queued < sk->sndbuf; +} + +/* + * Wait for more memory for a socket + */ +static int wait_for_tcp_memory(struct sock * sk, long *timeo) +{ +#if 0 + int err = 0; + long vm_wait = 0; + long current_timeo = *timeo; + DECLARE_WAITQUEUE(wait, current); + + if (tcp_memory_free(sk)) + current_timeo = vm_wait = (net_random()%(HZ/5))+2; + + add_wait_queue(sk->sleep, &wait); + for (;;) { + set_bit(SOCK_ASYNC_NOSPACE, &sk->socket->flags); + + set_current_state(TASK_INTERRUPTIBLE); + + if (sk->err || (sk->shutdown & SEND_SHUTDOWN)) + goto do_error; + if (!*timeo) + goto do_nonblock; + if (signal_pending(current)) + goto do_interrupted; + clear_bit(SOCK_ASYNC_NOSPACE, &sk->socket->flags); + if (tcp_memory_free(sk) && !vm_wait) + break; + + set_bit(SOCK_NOSPACE, &sk->socket->flags); + sk->tp_pinfo.af_tcp.write_pending++; + release_sock(sk); + if (!tcp_memory_free(sk) || vm_wait) + current_timeo = schedule_timeout(current_timeo); + lock_sock(sk); + sk->tp_pinfo.af_tcp.write_pending--; + + if (vm_wait) { + vm_wait -= current_timeo; + 
current_timeo = *timeo; + if (current_timeo != MAX_SCHEDULE_TIMEOUT && + (current_timeo -= vm_wait) < 0) + current_timeo = 0; + vm_wait = 0; + } + *timeo = current_timeo; + } +out: + current->state = TASK_RUNNING; + remove_wait_queue(sk->sleep, &wait); + return err; + +do_error: + err = -EPIPE; + goto out; +do_nonblock: + err = -EAGAIN; + goto out; +do_interrupted: + err = sock_intr_errno(*timeo); + goto out; +#endif +} + +ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, size_t psize, int flags); + +static inline int +can_coalesce(struct sk_buff *skb, int i, struct page *page, int off) +{ +#if 0 + if (i) { + skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1]; + return page == frag->page && + off == frag->page_offset+frag->size; + } + return 0; +#else +return 0; +#endif +} + +static inline void +fill_page_desc(struct sk_buff *skb, int i, struct page *page, int off, int size) +{ + skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + frag->page = page; + frag->page_offset = off; + frag->size = size; + skb_shinfo(skb)->nr_frags = i+1; +} + +static inline void tcp_mark_push(struct tcp_opt *tp, struct sk_buff *skb) +{ + TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH; + tp->pushed_seq = tp->write_seq; +} + +static inline int forced_push(struct tcp_opt *tp) +{ + return after(tp->write_seq, tp->pushed_seq + (tp->max_window>>1)); +} + +static inline void +skb_entail(struct sock *sk, struct tcp_opt *tp, struct sk_buff *skb) +{ + skb->csum = 0; + TCP_SKB_CB(skb)->seq = tp->write_seq; + TCP_SKB_CB(skb)->end_seq = tp->write_seq; + TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK; + TCP_SKB_CB(skb)->sacked = 0; + __skb_queue_tail(&sk->write_queue, skb); + tcp_charge_skb(sk, skb); + if (tp->send_head == NULL) + tp->send_head = skb; +} + +static inline void +tcp_mark_urg(struct tcp_opt *tp, int flags, struct sk_buff *skb) +{ +#if 0 + if (flags & MSG_OOB) { + tp->urg_mode = 1; + tp->snd_up = tp->write_seq; + TCP_SKB_CB(skb)->sacked |= TCPCB_URG; + } +#endif +} + +static inline void +tcp_push(struct sock *sk, struct tcp_opt *tp, int flags, int mss_now, int nonagle) +{ +#if 0 + if (tp->send_head) { + struct sk_buff *skb = sk->write_queue.prev; + if (!(flags&MSG_MORE) || forced_push(tp)) + tcp_mark_push(tp, skb); + tcp_mark_urg(tp, flags, skb); + __tcp_push_pending_frames(sk, tp, mss_now, (flags&MSG_MORE) ? 2 : nonagle); + } +#endif +} + +static int tcp_error(struct sock *sk, int flags, int err) +{ +#if 0 + if (err == -EPIPE) + err = sock_error(sk) ? : -EPIPE; + if (err == -EPIPE && !(flags&MSG_NOSIGNAL)) + send_sig(SIGPIPE, current, 0); + return err; +#else + return 0; +#endif +} + +ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, size_t psize, int flags) +{ +#if 0 + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + int mss_now; + int err; + ssize_t copied; + long timeo = sock_sndtimeo(sk, flags&MSG_DONTWAIT); + + /* Wait for a connection to finish. 
*/ + if ((1 << sk->state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) + if((err = wait_for_tcp_connect(sk, 0, &timeo)) != 0) + goto out_err; + + clear_bit(SOCK_ASYNC_NOSPACE, &sk->socket->flags); + + mss_now = tcp_current_mss(sk); + copied = 0; + + err = -EPIPE; + if (sk->err || (sk->shutdown & SEND_SHUTDOWN)) + goto do_error; + + while (psize > 0) { + struct sk_buff *skb = sk->write_queue.prev; + int offset, size, copy, i; + struct page *page; + + page = pages[poffset/PAGE_SIZE]; + offset = poffset % PAGE_SIZE; + size = min_t(size_t, psize, PAGE_SIZE-offset); + + if (tp->send_head==NULL || (copy = mss_now - skb->len) <= 0) { +new_segment: + if (!tcp_memory_free(sk)) + goto wait_for_sndbuf; + + skb = tcp_alloc_pskb(sk, 0, tp->mss_cache, sk->allocation); + if (skb == NULL) + goto wait_for_memory; + + skb_entail(sk, tp, skb); + copy = mss_now; + } + + if (copy > size) + copy = size; + + i = skb_shinfo(skb)->nr_frags; + if (can_coalesce(skb, i, page, offset)) { + skb_shinfo(skb)->frags[i-1].size += copy; + } else if (i < MAX_SKB_FRAGS) { + get_page(page); + fill_page_desc(skb, i, page, offset, copy); + } else { + tcp_mark_push(tp, skb); + goto new_segment; + } + + skb->len += copy; + skb->data_len += copy; + skb->ip_summed = CHECKSUM_HW; + tp->write_seq += copy; + TCP_SKB_CB(skb)->end_seq += copy; + + if (!copied) + TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH; + + copied += copy; + poffset += copy; + if (!(psize -= copy)) + goto out; + + if (skb->len != mss_now || (flags&MSG_OOB)) + continue; + + if (forced_push(tp)) { + tcp_mark_push(tp, skb); + __tcp_push_pending_frames(sk, tp, mss_now, 1); + } else if (skb == tp->send_head) + tcp_push_one(sk, mss_now); + continue; + +wait_for_sndbuf: + set_bit(SOCK_NOSPACE, &sk->socket->flags); +wait_for_memory: + if (copied) + tcp_push(sk, tp, flags&~MSG_MORE, mss_now, 1); + + if ((err = wait_for_tcp_memory(sk, &timeo)) != 0) + goto do_error; + + mss_now = tcp_current_mss(sk); + } + +out: + if (copied) + tcp_push(sk, tp, flags, mss_now, tp->nonagle); + return copied; + +do_error: + if (copied) + goto out; +out_err: + return tcp_error(sk, flags, err); +#else +return 0; +#endif +} + +ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags) +{ +#if 0 + ssize_t res; + struct sock *sk = sock->sk; + +#define TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM) + + if (!(sk->route_caps & NETIF_F_SG) || + !(sk->route_caps & TCP_ZC_CSUM_FLAGS)) + return sock_no_sendpage(sock, page, offset, size, flags); + +#undef TCP_ZC_CSUM_FLAGS + + lock_sock(sk); + TCP_CHECK_TIMER(sk); + res = do_tcp_sendpages(sk, &page, offset, size, flags); + TCP_CHECK_TIMER(sk); + release_sock(sk); + return res; +#else + return 0; +#endif +} + +#define TCP_PAGE(sk) (sk->tp_pinfo.af_tcp.sndmsg_page) +#define TCP_OFF(sk) (sk->tp_pinfo.af_tcp.sndmsg_off) + +static inline int +tcp_copy_to_page(struct sock *sk, char *from, struct sk_buff *skb, + struct page *page, int off, int copy) +{ + int err = 0; + unsigned int csum; + + csum = csum_and_copy_from_user(from, page_address(page)+off, + copy, 0, &err); + if (!err) { + if (skb->ip_summed == CHECKSUM_NONE) + skb->csum = csum_block_add(skb->csum, csum, skb->len); + skb->len += copy; + skb->data_len += copy; + skb->truesize += copy; + sk->wmem_queued += copy; + sk->forward_alloc -= copy; + } + return err; +} + +static inline int +skb_add_data(struct sk_buff *skb, char *from, int copy) +{ +#if 0 + int err = 0; + unsigned int csum; + int off = skb->len; + + csum = csum_and_copy_from_user(from, 
skb_put(skb, copy), + copy, 0, &err); + if (!err) { + skb->csum = csum_block_add(skb->csum, csum, off); + return 0; + } + + __skb_trim(skb, off); + return -EFAULT; +#else +return 0; +#endif +} + +static inline int select_size(struct sock *sk, struct tcp_opt *tp) +{ +#if 0 + int tmp = tp->mss_cache; + + if (sk->route_caps&NETIF_F_SG) { + int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER); + + if (tmp >= pgbreak && tmp <= pgbreak + (MAX_SKB_FRAGS-1)*PAGE_SIZE) + tmp = pgbreak; + } + return tmp; +#else + return 0; +#endif +} + +int tcp_sendmsg(struct sock *sk, struct msghdr *msg, int size) +{ +#if 0 + struct iovec *iov; + struct tcp_opt *tp; + struct sk_buff *skb; + int iovlen, flags; + int mss_now; + int err, copied; + long timeo; + + tp = &(sk->tp_pinfo.af_tcp); + + lock_sock(sk); + TCP_CHECK_TIMER(sk); + + flags = msg->msg_flags; + timeo = sock_sndtimeo(sk, flags&MSG_DONTWAIT); + + /* Wait for a connection to finish. */ + if ((1 << sk->state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) + if((err = wait_for_tcp_connect(sk, flags, &timeo)) != 0) + goto out_err; + + /* This should be in poll */ + clear_bit(SOCK_ASYNC_NOSPACE, &sk->socket->flags); + + mss_now = tcp_current_mss(sk); + + /* Ok commence sending. */ + iovlen = msg->msg_iovlen; + iov = msg->msg_iov; + copied = 0; + + err = -EPIPE; + if (sk->err || (sk->shutdown&SEND_SHUTDOWN)) + goto do_error; + + while (--iovlen >= 0) { + int seglen=iov->iov_len; + unsigned char * from=iov->iov_base; + + iov++; + + while (seglen > 0) { + int copy; + + skb = sk->write_queue.prev; + + if (tp->send_head == NULL || + (copy = mss_now - skb->len) <= 0) { + +new_segment: + /* Allocate new segment. If the interface is SG, + * allocate skb fitting to single page. + */ + if (!tcp_memory_free(sk)) + goto wait_for_sndbuf; + + skb = tcp_alloc_pskb(sk, select_size(sk, tp), 0, sk->allocation); + if (skb == NULL) + goto wait_for_memory; + + skb_entail(sk, tp, skb); + copy = mss_now; + } + + /* Try to append data to the end of skb. */ + if (copy > seglen) + copy = seglen; + + /* Where to copy to? */ + if (skb_tailroom(skb) > 0) { + /* We have some space in skb head. Superb! */ + if (copy > skb_tailroom(skb)) + copy = skb_tailroom(skb); + if ((err = skb_add_data(skb, from, copy)) != 0) + goto do_fault; + } else { + int merge = 0; + int i = skb_shinfo(skb)->nr_frags; + struct page *page = TCP_PAGE(sk); + int off = TCP_OFF(sk); + + if (can_coalesce(skb, i, page, off) && off != PAGE_SIZE) { + /* We can extend the last page fragment. */ + merge = 1; + } else if (i == MAX_SKB_FRAGS || + (i == 0 && !(sk->route_caps&NETIF_F_SG))) { + /* Need to add new fragment and cannot + * do this because interface is non-SG, + * or because all the page slots are busy. + */ + tcp_mark_push(tp, skb); + goto new_segment; + } else if (page) { + /* If page is cached, align + * offset to L1 cache boundary + */ + off = (off+L1_CACHE_BYTES-1)&~(L1_CACHE_BYTES-1); + if (off == PAGE_SIZE) { + put_page(page); + TCP_PAGE(sk) = page = NULL; + } + } + + if (!page) { + /* Allocate new cache page. */ + if (!(page=tcp_alloc_page(sk))) + goto wait_for_memory; + off = 0; + } + + if (copy > PAGE_SIZE-off) + copy = PAGE_SIZE-off; + + /* Time to copy data. We are close to the end! */ + err = tcp_copy_to_page(sk, from, skb, page, off, copy); + if (err) { + /* If this page was new, give it to the + * socket so it does not get leaked. + */ + if (TCP_PAGE(sk) == NULL) { + TCP_PAGE(sk) = page; + TCP_OFF(sk) = 0; + } + goto do_error; + } + + /* Update the skb. 
*/ + if (merge) { + skb_shinfo(skb)->frags[i-1].size += copy; + } else { + fill_page_desc(skb, i, page, off, copy); + if (TCP_PAGE(sk)) { + get_page(page); + } else if (off + copy < PAGE_SIZE) { + get_page(page); + TCP_PAGE(sk) = page; + } + } + + TCP_OFF(sk) = off+copy; + } + + if (!copied) + TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH; + + tp->write_seq += copy; + TCP_SKB_CB(skb)->end_seq += copy; + + from += copy; + copied += copy; + if ((seglen -= copy) == 0 && iovlen == 0) + goto out; + + if (skb->len != mss_now || (flags&MSG_OOB)) + continue; + + if (forced_push(tp)) { + tcp_mark_push(tp, skb); + __tcp_push_pending_frames(sk, tp, mss_now, 1); + } else if (skb == tp->send_head) + tcp_push_one(sk, mss_now); + continue; + +wait_for_sndbuf: + set_bit(SOCK_NOSPACE, &sk->socket->flags); +wait_for_memory: + if (copied) + tcp_push(sk, tp, flags&~MSG_MORE, mss_now, 1); + + if ((err = wait_for_tcp_memory(sk, &timeo)) != 0) + goto do_error; + + mss_now = tcp_current_mss(sk); + } + } + +out: + if (copied) + tcp_push(sk, tp, flags, mss_now, tp->nonagle); + TCP_CHECK_TIMER(sk); + release_sock(sk); + return copied; + +do_fault: + if (skb->len == 0) { + if (tp->send_head == skb) + tp->send_head = NULL; + __skb_unlink(skb, skb->list); + tcp_free_skb(sk, skb); + } + +do_error: + if (copied) + goto out; +out_err: + err = tcp_error(sk, flags, err); + TCP_CHECK_TIMER(sk); + release_sock(sk); + return err; +#else + return 0; +#endif +} + +/* + * Handle reading urgent data. BSD has very simple semantics for + * this, no blocking and very strange errors 8) + */ + +static int tcp_recv_urg(struct sock * sk, long timeo, + struct msghdr *msg, int len, int flags, + int *addr_len) +{ +#if 0 + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + + /* No URG data to read. */ + if (sk->urginline || !tp->urg_data || tp->urg_data == TCP_URG_READ) + return -EINVAL; /* Yes this is right ! */ + + if (sk->state==TCP_CLOSE && !sk->done) + return -ENOTCONN; + + if (tp->urg_data & TCP_URG_VALID) { + int err = 0; + char c = tp->urg_data; + + if (!(flags & MSG_PEEK)) + tp->urg_data = TCP_URG_READ; + + /* Read urgent data. */ + msg->msg_flags|=MSG_OOB; + + if(len>0) { + if (!(flags & MSG_TRUNC)) + err = memcpy_toiovec(msg->msg_iov, &c, 1); + len = 1; + } else + msg->msg_flags|=MSG_TRUNC; + + return err ? -EFAULT : len; + } + + if (sk->state == TCP_CLOSE || (sk->shutdown & RCV_SHUTDOWN)) + return 0; + + /* Fixed the recv(..., MSG_OOB) behaviour. BSD docs and + * the available implementations agree in this case: + * this call should never block, independent of the + * blocking state of the socket. + * Mike + */ + return -EAGAIN; +#else +return 0; +#endif +} + +/* + * Release a skb if it is no longer needed. This routine + * must be called with interrupts disabled or with the + * socket locked so that the sk_buff queue operation is ok. + */ + +static inline void tcp_eat_skb(struct sock *sk, struct sk_buff * skb) +{ +#if 0 + __skb_unlink(skb, &sk->receive_queue); + __kfree_skb(skb); +#endif +} + +/* Clean up the receive buffer for full frames taken by the user, + * then send an ACK if necessary. COPIED is the number of bytes + * tcp_recvmsg has given to the user so far, it speeds up the + * calculation of whether or not we must ACK for the sake of + * a window update. 
+ */ +static void cleanup_rbuf(struct sock *sk, int copied) +{ +#if 0 + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + int time_to_ack = 0; + +#if TCP_DEBUG + struct sk_buff *skb = skb_peek(&sk->receive_queue); + + BUG_TRAP(skb==NULL || before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq)); +#endif + + if (tcp_ack_scheduled(tp)) { + /* Delayed ACKs frequently hit locked sockets during bulk receive. */ + if (tp->ack.blocked + /* Once-per-two-segments ACK was not sent by tcp_input.c */ + || tp->rcv_nxt - tp->rcv_wup > tp->ack.rcv_mss + /* + * If this read emptied read buffer, we send ACK, if + * connection is not bidirectional, user drained + * receive buffer and there was a small segment + * in queue. + */ + || (copied > 0 && + (tp->ack.pending&TCP_ACK_PUSHED) && + !tp->ack.pingpong && + atomic_read(&sk->rmem_alloc) == 0)) { + time_to_ack = 1; + } + } + + /* We send an ACK if we can now advertise a non-zero window + * which has been raised "significantly". + * + * Even if window raised up to infinity, do not send window open ACK + * in states, where we will not receive more. It is useless. + */ + if(copied > 0 && !time_to_ack && !(sk->shutdown&RCV_SHUTDOWN)) { + __u32 rcv_window_now = tcp_receive_window(tp); + + /* Optimize, __tcp_select_window() is not cheap. */ + if (2*rcv_window_now <= tp->window_clamp) { + __u32 new_window = __tcp_select_window(sk); + + /* Send ACK now, if this read freed lots of space + * in our buffer. Certainly, new_window is new window. + * We can advertise it now, if it is not less than current one. + * "Lots" means "at least twice" here. + */ + if(new_window && new_window >= 2*rcv_window_now) + time_to_ack = 1; + } + } + if (time_to_ack) + tcp_send_ack(sk); +#endif +} + +/* Now socket state including sk->err is changed only under lock, + * hence we may omit checks after joining wait queue. + * We check receive queue before schedule() only as optimization; + * it is very likely that release_sock() added new data. + */ + +static long tcp_data_wait(struct sock *sk, long timeo) +{ +#if 0 + DECLARE_WAITQUEUE(wait, current); + + add_wait_queue(sk->sleep, &wait); + + __set_current_state(TASK_INTERRUPTIBLE); + + set_bit(SOCK_ASYNC_WAITDATA, &sk->socket->flags); + release_sock(sk); + + if (skb_queue_empty(&sk->receive_queue)) + timeo = schedule_timeout(timeo); + + lock_sock(sk); + clear_bit(SOCK_ASYNC_WAITDATA, &sk->socket->flags); + + remove_wait_queue(sk->sleep, &wait); + __set_current_state(TASK_RUNNING); + return timeo; +#else + return 0; +#endif +} + +static void tcp_prequeue_process(struct sock *sk) +{ +#if 0 + struct sk_buff *skb; + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + + net_statistics[smp_processor_id()*2+1].TCPPrequeued += skb_queue_len(&tp->ucopy.prequeue); + + /* RX process wants to run with disabled BHs, though it is not necessary */ + local_bh_disable(); + while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) + sk->backlog_rcv(sk, skb); + local_bh_enable(); + + /* Clear memory counter. 
*/ + tp->ucopy.memory = 0; +#endif +} + +static inline +struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off) +{ +#if 0 + struct sk_buff *skb; + u32 offset; + + skb_queue_walk(&sk->receive_queue, skb) { + offset = seq - TCP_SKB_CB(skb)->seq; + if (skb->h.th->syn) + offset--; + if (offset < skb->len || skb->h.th->fin) { + *off = offset; + return skb; + } + } + return NULL; +#else + return NULL; +#endif +} + +/* + * This routine provides an alternative to tcp_recvmsg() for routines + * that would like to handle copying from skbuffs directly in 'sendfile' + * fashion. + * Note: + * - It is assumed that the socket was locked by the caller. + * - The routine does not block. + * - At present, there is no support for reading OOB data + * or for 'peeking' the socket using this routine + * (although both would be easy to implement). + */ +int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, + sk_read_actor_t recv_actor) +{ +#if 0 + struct sk_buff *skb; + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + u32 seq = tp->copied_seq; + u32 offset; + int copied = 0; + + if (sk->state == TCP_LISTEN) + return -ENOTCONN; + while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) { + if (offset < skb->len) { + size_t used, len; + + len = skb->len - offset; + /* Stop reading if we hit a patch of urgent data */ + if (tp->urg_data) { + u32 urg_offset = tp->urg_seq - seq; + if (urg_offset < len) + len = urg_offset; + if (!len) + break; + } + used = recv_actor(desc, skb, offset, len); + if (used <= len) { + seq += used; + copied += used; + offset += used; + } + if (offset != skb->len) + break; + } + if (skb->h.th->fin) { + tcp_eat_skb(sk, skb); + ++seq; + break; + } + tcp_eat_skb(sk, skb); + if (!desc->count) + break; + } + tp->copied_seq = seq; + /* Clean up data we have read: This will do ACK frames. */ + if (copied) + cleanup_rbuf(sk, copied); + return copied; +#else +#endif +} + +/* + * This routine copies from a sock struct into the user buffer. + * + * Technical note: in 2.3 we work on _locked_ socket, so that + * tricks with *seq access order and skb->users are not required. + * Probably, code can be easily improved even more. + */ + +int tcp_recvmsg(struct sock *sk, struct msghdr *msg, + int len, int nonblock, int flags, int *addr_len) +{ +#if 0 + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + int copied = 0; + u32 peek_seq; + u32 *seq; + unsigned long used; + int err; + int target; /* Read at least this many bytes */ + long timeo; + struct task_struct *user_recv = NULL; + + lock_sock(sk); + + TCP_CHECK_TIMER(sk); + + err = -ENOTCONN; + if (sk->state == TCP_LISTEN) + goto out; + + timeo = sock_rcvtimeo(sk, nonblock); + + /* Urgent data needs to be handled specially. */ + if (flags & MSG_OOB) + goto recv_urg; + + seq = &tp->copied_seq; + if (flags & MSG_PEEK) { + peek_seq = tp->copied_seq; + seq = &peek_seq; + } + + target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); + + do { + struct sk_buff * skb; + u32 offset; + + /* Are we at urgent data? Stop if we have read anything. */ + if (copied && tp->urg_data && tp->urg_seq == *seq) + break; + + /* We need to check signals first, to get correct SIGURG + * handling. FIXME: Need to check this doesn't impact 1003.1g + * and move it down to the bottom of the loop + */ + if (signal_pending(current)) { + if (copied) + break; + copied = timeo ? sock_intr_errno(timeo) : -EAGAIN; + break; + } + + /* Next get a buffer. 
*/ + + skb = skb_peek(&sk->receive_queue); + do { + if (!skb) + break; + + /* Now that we have two receive queues this + * shouldn't happen. + */ + if (before(*seq, TCP_SKB_CB(skb)->seq)) { + printk(KERN_INFO "recvmsg bug: copied %X seq %X\n", + *seq, TCP_SKB_CB(skb)->seq); + break; + } + offset = *seq - TCP_SKB_CB(skb)->seq; + if (skb->h.th->syn) + offset--; + if (offset < skb->len) + goto found_ok_skb; + if (skb->h.th->fin) + goto found_fin_ok; + BUG_TRAP(flags&MSG_PEEK); + skb = skb->next; + } while (skb != (struct sk_buff *)&sk->receive_queue); + + /* Well, if we have backlog, try to process it now yet. */ + + if (copied >= target && sk->backlog.tail == NULL) + break; + + if (copied) { + if (sk->err || + sk->state == TCP_CLOSE || + (sk->shutdown & RCV_SHUTDOWN) || + !timeo || + (flags & MSG_PEEK)) + break; + } else { + if (sk->done) + break; + + if (sk->err) { + copied = sock_error(sk); + break; + } + + if (sk->shutdown & RCV_SHUTDOWN) + break; + + if (sk->state == TCP_CLOSE) { + if (!sk->done) { + /* This occurs when user tries to read + * from never connected socket. + */ + copied = -ENOTCONN; + break; + } + break; + } + + if (!timeo) { + copied = -EAGAIN; + break; + } + } + + cleanup_rbuf(sk, copied); + + if (tp->ucopy.task == user_recv) { + /* Install new reader */ + if (user_recv == NULL && !(flags&(MSG_TRUNC|MSG_PEEK))) { + user_recv = current; + tp->ucopy.task = user_recv; + tp->ucopy.iov = msg->msg_iov; + } + + tp->ucopy.len = len; + + BUG_TRAP(tp->copied_seq == tp->rcv_nxt || (flags&(MSG_PEEK|MSG_TRUNC))); + + /* Ugly... If prequeue is not empty, we have to + * process it before releasing socket, otherwise + * order will be broken at second iteration. + * More elegant solution is required!!! + * + * Look: we have the following (pseudo)queues: + * + * 1. packets in flight + * 2. backlog + * 3. prequeue + * 4. receive_queue + * + * Each queue can be processed only if the next ones + * are empty. At this point we have empty receive_queue. + * But prequeue _can_ be not empty after second iteration, + * when we jumped to start of loop because backlog + * processing added something to receive_queue. + * We cannot release_sock(), because backlog contains + * packets arrived _after_ prequeued ones. + * + * Shortly, algorithm is clear --- to process all + * the queues in order. We could make it more directly, + * requeueing packets from backlog to prequeue, if + * is not empty. It is more elegant, but eats cycles, + * unfortunately. + */ + if (skb_queue_len(&tp->ucopy.prequeue)) + goto do_prequeue; + + /* __ Set realtime policy in scheduler __ */ + } + + if (copied >= target) { + /* Do not sleep, just process backlog. 
*/ + release_sock(sk); + lock_sock(sk); + } else { + timeo = tcp_data_wait(sk, timeo); + } + + if (user_recv) { + int chunk; + + /* __ Restore normal policy in scheduler __ */ + + if ((chunk = len - tp->ucopy.len) != 0) { + net_statistics[smp_processor_id()*2+1].TCPDirectCopyFromBacklog += chunk; + len -= chunk; + copied += chunk; + } + + if (tp->rcv_nxt == tp->copied_seq && + skb_queue_len(&tp->ucopy.prequeue)) { +do_prequeue: + tcp_prequeue_process(sk); + + if ((chunk = len - tp->ucopy.len) != 0) { + net_statistics[smp_processor_id()*2+1].TCPDirectCopyFromPrequeue += chunk; + len -= chunk; + copied += chunk; + } + } + } + if ((flags & MSG_PEEK) && peek_seq != tp->copied_seq) { + if (net_ratelimit()) + printk(KERN_DEBUG "TCP(%s:%d): Application bug, race in MSG_PEEK.\n", + current->comm, current->pid); + peek_seq = tp->copied_seq; + } + continue; + + found_ok_skb: + /* Ok so how much can we use? */ + used = skb->len - offset; + if (len < used) + used = len; + + /* Do we have urgent data here? */ + if (tp->urg_data) { + u32 urg_offset = tp->urg_seq - *seq; + if (urg_offset < used) { + if (!urg_offset) { + if (!sk->urginline) { + ++*seq; + offset++; + used--; + if (!used) + goto skip_copy; + } + } else + used = urg_offset; + } + } + + if (!(flags&MSG_TRUNC)) { + err = skb_copy_datagram_iovec(skb, offset, msg->msg_iov, used); + if (err) { + /* Exception. Bailout! */ + if (!copied) + copied = -EFAULT; + break; + } + } + + *seq += used; + copied += used; + len -= used; + +skip_copy: + if (tp->urg_data && after(tp->copied_seq,tp->urg_seq)) { + tp->urg_data = 0; + tcp_fast_path_check(sk, tp); + } + if (used + offset < skb->len) + continue; + + if (skb->h.th->fin) + goto found_fin_ok; + if (!(flags & MSG_PEEK)) + tcp_eat_skb(sk, skb); + continue; + + found_fin_ok: + /* Process the FIN. */ + ++*seq; + if (!(flags & MSG_PEEK)) + tcp_eat_skb(sk, skb); + break; + } while (len > 0); + + if (user_recv) { + if (skb_queue_len(&tp->ucopy.prequeue)) { + int chunk; + + tp->ucopy.len = copied > 0 ? len : 0; + + tcp_prequeue_process(sk); + + if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) { + net_statistics[smp_processor_id()*2+1].TCPDirectCopyFromPrequeue += chunk; + len -= chunk; + copied += chunk; + } + } + + tp->ucopy.task = NULL; + tp->ucopy.len = 0; + } + + /* According to UNIX98, msg_name/msg_namelen are ignored + * on connected socket. I was just happy when found this 8) --ANK + */ + + /* Clean up data we have read: This will do ACK frames. */ + cleanup_rbuf(sk, copied); + + TCP_CHECK_TIMER(sk); + release_sock(sk); + return copied; + +out: + TCP_CHECK_TIMER(sk); + release_sock(sk); + return err; + +recv_urg: + err = tcp_recv_urg(sk, timeo, msg, len, flags, addr_len); + goto out; +#else + return 0; +#endif +} + +/* + * State processing on a close. This implements the state shift for + * sending our FIN frame. Note that we only send a FIN for some + * states. A shutdown() may have already sent the FIN, or we may be + * closed. 
+ */ + +static unsigned char new_state[16] = { + /* current state: new state: action: */ + /* (Invalid) */ TCP_CLOSE, + /* TCP_ESTABLISHED */ TCP_FIN_WAIT1 | TCP_ACTION_FIN, + /* TCP_SYN_SENT */ TCP_CLOSE, + /* TCP_SYN_RECV */ TCP_FIN_WAIT1 | TCP_ACTION_FIN, + /* TCP_FIN_WAIT1 */ TCP_FIN_WAIT1, + /* TCP_FIN_WAIT2 */ TCP_FIN_WAIT2, + /* TCP_TIME_WAIT */ TCP_CLOSE, + /* TCP_CLOSE */ TCP_CLOSE, + /* TCP_CLOSE_WAIT */ TCP_LAST_ACK | TCP_ACTION_FIN, + /* TCP_LAST_ACK */ TCP_LAST_ACK, + /* TCP_LISTEN */ TCP_CLOSE, + /* TCP_CLOSING */ TCP_CLOSING, +}; + +static int tcp_close_state(struct sock *sk) +{ +#if 0 + int next = (int) new_state[sk->state]; + int ns = (next & TCP_STATE_MASK); + + tcp_set_state(sk, ns); + + return (next & TCP_ACTION_FIN); +#else + return 0; +#endif +} + +/* + * Shutdown the sending side of a connection. Much like close except + * that we don't receive shut down or set sk->dead. + */ + +void tcp_shutdown(struct sock *sk, int how) +{ +#if 0 + /* We need to grab some memory, and put together a FIN, + * and then put it into the queue to be sent. + * Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92. + */ + if (!(how & SEND_SHUTDOWN)) + return; + + /* If we've already sent a FIN, or it's a closed state, skip this. */ + if ((1 << sk->state) & + (TCPF_ESTABLISHED|TCPF_SYN_SENT|TCPF_SYN_RECV|TCPF_CLOSE_WAIT)) { + /* Clear out any half completed packets. FIN if needed. */ + if (tcp_close_state(sk)) + tcp_send_fin(sk); + } +#endif +} + + +/* + * Return 1 if we still have things to send in our buffers. + */ + +static inline int closing(struct sock * sk) +{ +#if 0 + return ((1 << sk->state) & (TCPF_FIN_WAIT1|TCPF_CLOSING|TCPF_LAST_ACK)); +#else + return 0; +#endif +} + +static __inline__ void tcp_kill_sk_queues(struct sock *sk) +{ +#if 0 + /* First the read buffer. */ + __skb_queue_purge(&sk->receive_queue); + + /* Next, the error queue. */ + __skb_queue_purge(&sk->error_queue); + + /* Next, the write queue. */ + BUG_TRAP(skb_queue_empty(&sk->write_queue)); + + /* Account for returned memory. */ + tcp_mem_reclaim(sk); + + BUG_TRAP(sk->wmem_queued == 0); + BUG_TRAP(sk->forward_alloc == 0); + + /* It is _impossible_ for the backlog to contain anything + * when we get here. All user references to this socket + * have gone away, only the net layer knows can touch it. + */ +#endif +} + +/* + * At this point, there should be no process reference to this + * socket, and thus no user references at all. Therefore we + * can assume the socket waitqueue is inactive and nobody will + * try to jump onto it. + */ +void tcp_destroy_sock(struct sock *sk) +{ +#if 0 + BUG_TRAP(sk->state==TCP_CLOSE); + BUG_TRAP(sk->dead); + + /* It cannot be in hash table! */ + BUG_TRAP(sk->pprev==NULL); + + /* If it has not 0 sk->num, it must be bound */ + BUG_TRAP(!sk->num || sk->prev!=NULL); + +#ifdef TCP_DEBUG + if (sk->zapped) { + printk(KERN_DEBUG "TCP: double destroy sk=%p\n", sk); + sock_hold(sk); + } + sk->zapped = 1; +#endif + + sk->prot->destroy(sk); + + tcp_kill_sk_queues(sk); + +#ifdef INET_REFCNT_DEBUG + if (atomic_read(&sk->refcnt) != 1) { + printk(KERN_DEBUG "Destruction TCP %p delayed, c=%d\n", sk, atomic_read(&sk->refcnt)); + } +#endif + + atomic_dec(&tcp_orphan_count); + sock_put(sk); +#endif +} + +void tcp_close(struct sock *sk, long timeout) +{ +#if 0 + struct sk_buff *skb; + int data_was_unread = 0; + + lock_sock(sk); + sk->shutdown = SHUTDOWN_MASK; + + if(sk->state == TCP_LISTEN) { + tcp_set_state(sk, TCP_CLOSE); + + /* Special case. 
*/ + tcp_listen_stop(sk); + + goto adjudge_to_death; + } + + /* We need to flush the recv. buffs. We do this only on the + * descriptor close, not protocol-sourced closes, because the + * reader process may not have drained the data yet! + */ + while((skb=__skb_dequeue(&sk->receive_queue))!=NULL) { + u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq - skb->h.th->fin; + data_was_unread += len; + __kfree_skb(skb); + } + + tcp_mem_reclaim(sk); + + /* As outlined in draft-ietf-tcpimpl-prob-03.txt, section + * 3.10, we send a RST here because data was lost. To + * witness the awful effects of the old behavior of always + * doing a FIN, run an older 2.1.x kernel or 2.0.x, start + * a bulk GET in an FTP client, suspend the process, wait + * for the client to advertise a zero window, then kill -9 + * the FTP client, wheee... Note: timeout is always zero + * in such a case. + */ + if(data_was_unread != 0) { + /* Unread data was tossed, zap the connection. */ + NET_INC_STATS_USER(TCPAbortOnClose); + tcp_set_state(sk, TCP_CLOSE); + tcp_send_active_reset(sk, GFP_KERNEL); + } else if (sk->linger && sk->lingertime==0) { + /* Check zero linger _after_ checking for unread data. */ + sk->prot->disconnect(sk, 0); + NET_INC_STATS_USER(TCPAbortOnData); + } else if (tcp_close_state(sk)) { + /* We FIN if the application ate all the data before + * zapping the connection. + */ + + /* RED-PEN. Formally speaking, we have broken TCP state + * machine. State transitions: + * + * TCP_ESTABLISHED -> TCP_FIN_WAIT1 + * TCP_SYN_RECV -> TCP_FIN_WAIT1 (forget it, it's impossible) + * TCP_CLOSE_WAIT -> TCP_LAST_ACK + * + * are legal only when FIN has been sent (i.e. in window), + * rather than queued out of window. Purists blame. + * + * F.e. "RFC state" is ESTABLISHED, + * if Linux state is FIN-WAIT-1, but FIN is still not sent. + * + * The visible declinations are that sometimes + * we enter time-wait state, when it is not required really + * (harmless), do not send active resets, when they are + * required by specs (TCP_ESTABLISHED, TCP_CLOSE_WAIT, when + * they look as CLOSING or LAST_ACK for Linux) + * Probably, I missed some more holelets. + * --ANK + */ + tcp_send_fin(sk); + } + + if (timeout) { + struct task_struct *tsk = current; + DECLARE_WAITQUEUE(wait, current); + + add_wait_queue(sk->sleep, &wait); + + do { + set_current_state(TASK_INTERRUPTIBLE); + if (!closing(sk)) + break; + release_sock(sk); + timeout = schedule_timeout(timeout); + lock_sock(sk); + } while (!signal_pending(tsk) && timeout); + + tsk->state = TASK_RUNNING; + remove_wait_queue(sk->sleep, &wait); + } + +adjudge_to_death: + /* It is the last release_sock in its life. It will remove backlog. */ + release_sock(sk); + + + /* Now socket is owned by kernel and we acquire BH lock + to finish close. No need to check for user refs. + */ + local_bh_disable(); + bh_lock_sock(sk); + BUG_TRAP(sk->lock.users==0); + + sock_hold(sk); + sock_orphan(sk); + + /* This is a (useful) BSD violating of the RFC. There is a + * problem with TCP as specified in that the other end could + * keep a socket open forever with no application left this end. + * We use a 3 minute timeout (about the same as BSD) then kill + * our end. If they send after that then tough - BUT: long enough + * that we won't make the old 4*rto = almost no time - whoops + * reset mistake. + * + * Nope, it was not mistake. It is really desired behaviour + * f.e. on http servers, when such sockets are useless, but + * consume significant resources. Let's do it with special + * linger2 option. 
--ANK + */ + + if (sk->state == TCP_FIN_WAIT2) { + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + if (tp->linger2 < 0) { + tcp_set_state(sk, TCP_CLOSE); + tcp_send_active_reset(sk, GFP_ATOMIC); + NET_INC_STATS_BH(TCPAbortOnLinger); + } else { + int tmo = tcp_fin_time(tp); + + if (tmo > TCP_TIMEWAIT_LEN) { + tcp_reset_keepalive_timer(sk, tcp_fin_time(tp)); + } else { + atomic_inc(&tcp_orphan_count); + tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); + goto out; + } + } + } + if (sk->state != TCP_CLOSE) { + tcp_mem_reclaim(sk); + if (atomic_read(&tcp_orphan_count) > sysctl_tcp_max_orphans || + (sk->wmem_queued > SOCK_MIN_SNDBUF && + atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) { + if (net_ratelimit()) + printk(KERN_INFO "TCP: too many of orphaned sockets\n"); + tcp_set_state(sk, TCP_CLOSE); + tcp_send_active_reset(sk, GFP_ATOMIC); + NET_INC_STATS_BH(TCPAbortOnMemory); + } + } + atomic_inc(&tcp_orphan_count); + + if (sk->state == TCP_CLOSE) + tcp_destroy_sock(sk); + /* Otherwise, socket is reprieved until protocol close. */ + +out: + bh_unlock_sock(sk); + local_bh_enable(); + sock_put(sk); +#endif +} + +/* These states need RST on ABORT according to RFC793 */ + +extern __inline__ int tcp_need_reset(int state) +{ +#if 0 + return ((1 << state) & + (TCPF_ESTABLISHED|TCPF_CLOSE_WAIT|TCPF_FIN_WAIT1| + TCPF_FIN_WAIT2|TCPF_SYN_RECV)); +#else + return 0; +#endif +} + +int tcp_disconnect(struct sock *sk, int flags) +{ +#if 0 + struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; + int old_state; + int err = 0; + + old_state = sk->state; + if (old_state != TCP_CLOSE) + tcp_set_state(sk, TCP_CLOSE); + + /* ABORT function of RFC793 */ + if (old_state == TCP_LISTEN) { + tcp_listen_stop(sk); + } else if (tcp_need_reset(old_state) || + (tp->snd_nxt != tp->write_seq && + (1<err = ECONNRESET; + } else if (old_state == TCP_SYN_SENT) + sk->err = ECONNRESET; + + tcp_clear_xmit_timers(sk); + __skb_queue_purge(&sk->receive_queue); + tcp_writequeue_purge(sk); + __skb_queue_purge(&tp->out_of_order_queue); + + sk->dport = 0; + + if (!(sk->userlocks&SOCK_BINDADDR_LOCK)) { + sk->rcv_saddr = 0; + sk->saddr = 0; +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + memset(&sk->net_pinfo.af_inet6.saddr, 0, 16); + memset(&sk->net_pinfo.af_inet6.rcv_saddr, 0, 16); +#endif + } + + sk->shutdown = 0; + sk->done = 0; + tp->srtt = 0; + if ((tp->write_seq += tp->max_window+2) == 0) + tp->write_seq = 1; + tp->backoff = 0; + tp->snd_cwnd = 2; + tp->probes_out = 0; + tp->packets_out = 0; + tp->snd_ssthresh = 0x7fffffff; + tp->snd_cwnd_cnt = 0; + tp->ca_state = TCP_CA_Open; + tcp_clear_retrans(tp); + tcp_delack_init(tp); + tp->send_head = NULL; + tp->saw_tstamp = 0; + tcp_sack_reset(tp); + __sk_dst_reset(sk); + + BUG_TRAP(!sk->num || sk->prev); + + sk->error_report(sk); + return err; +#else + return 0; +#endif +} + +/* + * Wait for an incoming connection, avoid race + * conditions. This must be called with the socket locked. + */ +static int wait_for_connect(struct sock * sk, long timeo) +{ +#if 0 + DECLARE_WAITQUEUE(wait, current); + int err; + + /* + * True wake-one mechanism for incoming connections: only + * one process gets woken up, not the 'whole herd'. + * Since we do not 'race & poll' for established sockets + * anymore, the common case will execute the loop only once. 
+ * + * Subtle issue: "add_wait_queue_exclusive()" will be added + * after any current non-exclusive waiters, and we know that + * it will always _stay_ after any new non-exclusive waiters + * because all non-exclusive waiters are added at the + * beginning of the wait-queue. As such, it's ok to "drop" + * our exclusiveness temporarily when we get woken up without + * having to remove and re-insert us on the wait queue. + */ + add_wait_queue_exclusive(sk->sleep, &wait); + for (;;) { + current->state = TASK_INTERRUPTIBLE; + release_sock(sk); + if (sk->tp_pinfo.af_tcp.accept_queue == NULL) + timeo = schedule_timeout(timeo); + lock_sock(sk); + err = 0; + if (sk->tp_pinfo.af_tcp.accept_queue) + break; + err = -EINVAL; + if (sk->state != TCP_LISTEN) + break; + err = sock_intr_errno(timeo); + if (signal_pending(current)) + break; + err = -EAGAIN; + if (!timeo) + break; + } + current->state = TASK_RUNNING; + remove_wait_queue(sk->sleep, &wait); + return err; +#else + return 0; +#endif +} + +/* + * This will accept the next outstanding connection. + */ + +struct sock *tcp_accept(struct sock *sk, int flags, int *err) +{ +#if 0 + struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; + struct open_request *req; + struct sock *newsk; + int error; + + lock_sock(sk); + + /* We need to make sure that this socket is listening, + * and that it has something pending. + */ + error = -EINVAL; + if (sk->state != TCP_LISTEN) + goto out; + + /* Find already established connection */ + if (!tp->accept_queue) { + long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK); + + /* If this is a non blocking socket don't sleep */ + error = -EAGAIN; + if (!timeo) + goto out; + + error = wait_for_connect(sk, timeo); + if (error) + goto out; + } + + req = tp->accept_queue; + if ((tp->accept_queue = req->dl_next) == NULL) + tp->accept_queue_tail = NULL; + + newsk = req->sk; + tcp_acceptq_removed(sk); + tcp_openreq_fastfree(req); + BUG_TRAP(newsk->state != TCP_SYN_RECV); + release_sock(sk); + return newsk; + +out: + release_sock(sk); + *err = error; + return NULL; +#else + return NULL; +#endif +} + +/* + * Socket option code for TCP. + */ + +int tcp_setsockopt(struct sock *sk, int level, int optname, char *optval, + int optlen) +{ +#if 0 + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + int val; + int err = 0; + + if (level != SOL_TCP) + return tp->af_specific->setsockopt(sk, level, optname, + optval, optlen); + + if(optlen MAX_TCP_WINDOW) { + err = -EINVAL; + break; + } + tp->user_mss = val; + break; + + case TCP_NODELAY: + /* You cannot try to use this and TCP_CORK in + * tandem, so let the user know. + */ + if (tp->nonagle == 2) { + err = -EINVAL; + break; + } + tp->nonagle = (val == 0) ? 0 : 1; + if (val) + tcp_push_pending_frames(sk, tp); + break; + + case TCP_CORK: + /* When set indicates to always queue non-full frames. + * Later the user clears this option and we transmit + * any pending partial frames in the queue. This is + * meant to be used alongside sendfile() to get properly + * filled frames when the user (for example) must write + * out headers with a write() call first and then use + * sendfile to send out the data parts. + * + * You cannot try to use TCP_NODELAY and this mechanism + * at the same time, so let the user know. 
+ */
+		if (tp->nonagle == 1) {
+			err = -EINVAL;
+			break;
+		}
+		if (val != 0) {
+			tp->nonagle = 2;
+		} else {
+			tp->nonagle = 0;
+
+			tcp_push_pending_frames(sk, tp);
+		}
+		break;
+
+	case TCP_KEEPIDLE:
+		if (val < 1 || val > MAX_TCP_KEEPIDLE)
+			err = -EINVAL;
+		else {
+			tp->keepalive_time = val * HZ;
+			if (sk->keepopen && !((1<<sk->state)&(TCPF_CLOSE|TCPF_LISTEN))) {
+				__u32 elapsed = tcp_time_stamp - tp->rcv_tstamp;
+				if (tp->keepalive_time > elapsed)
+					elapsed = tp->keepalive_time - elapsed;
+				else
+					elapsed = 0;
+				tcp_reset_keepalive_timer(sk, elapsed);
+			}
+		}
+		break;
+	case TCP_KEEPINTVL:
+		if (val < 1 || val > MAX_TCP_KEEPINTVL)
+			err = -EINVAL;
+		else
+			tp->keepalive_intvl = val * HZ;
+		break;
+	case TCP_KEEPCNT:
+		if (val < 1 || val > MAX_TCP_KEEPCNT)
+			err = -EINVAL;
+		else
+			tp->keepalive_probes = val;
+		break;
+	case TCP_SYNCNT:
+		if (val < 1 || val > MAX_TCP_SYNCNT)
+			err = -EINVAL;
+		else
+			tp->syn_retries = val;
+		break;
+
+	case TCP_LINGER2:
+		if (val < 0)
+			tp->linger2 = -1;
+		else if (val > sysctl_tcp_fin_timeout/HZ)
+			tp->linger2 = 0;
+		else
+			tp->linger2 = val*HZ;
+		break;
+
+	case TCP_DEFER_ACCEPT:
+		tp->defer_accept = 0;
+		if (val > 0) {
+			/* Translate value in seconds to number of retransmits */
+			while (tp->defer_accept < 32 && val > ((TCP_TIMEOUT_INIT/HZ)<<tp->defer_accept))
+				tp->defer_accept++;
+			tp->defer_accept++;
+		}
+		break;
+
+	case TCP_WINDOW_CLAMP:
+		if (val==0) {
+			if (sk->state != TCP_CLOSE) {
+				err = -EINVAL;
+				break;
+			}
+			tp->window_clamp = 0;
+		} else {
+			tp->window_clamp = val < SOCK_MIN_RCVBUF/2 ?
+				SOCK_MIN_RCVBUF/2 : val;
+		}
+		break;
+
+	case TCP_QUICKACK:
+		if (!val) {
+			tp->ack.pingpong = 1;
+		} else {
+			tp->ack.pingpong = 0;
+			if ((1<<sk->state)&(TCPF_ESTABLISHED|TCPF_CLOSE_WAIT) &&
+			    tcp_ack_scheduled(tp)) {
+				tp->ack.pending |= TCP_ACK_PUSHED;
+				cleanup_rbuf(sk, 1);
+				if (!(val & 1))
+					tp->ack.pingpong = 1;
+			}
+		}
+		break;
+
+	default:
+		err = -ENOPROTOOPT;
+		break;
+	};
+	release_sock(sk);
+	return err;
+#else
+	return 0;
+#endif
+}
+
+int tcp_getsockopt(struct sock *sk, int level, int optname, char *optval,
+		   int *optlen)
+{
+#if 0
+	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+	int val, len;
+
+	if(level != SOL_TCP)
+		return tp->af_specific->getsockopt(sk, level, optname,
+						   optval, optlen);
+
+	if(get_user(len,optlen))
+		return -EFAULT;
+
+	len = min_t(unsigned int, len, sizeof(int));
+
+	if(len < 0)
+		return -EINVAL;
+
+	switch(optname) {
+	case TCP_MAXSEG:
+		val = tp->mss_cache;
+		if (val == 0 && ((1<<sk->state)&(TCPF_CLOSE|TCPF_LISTEN)))
+			val = tp->user_mss;
+		break;
+	case TCP_NODELAY:
+		val = (tp->nonagle == 1);
+		break;
+	case TCP_CORK:
+		val = (tp->nonagle == 2);
+		break;
+	case TCP_KEEPIDLE:
+		val = (tp->keepalive_time ? : sysctl_tcp_keepalive_time)/HZ;
+		break;
+	case TCP_KEEPINTVL:
+		val = (tp->keepalive_intvl ? : sysctl_tcp_keepalive_intvl)/HZ;
+		break;
+	case TCP_KEEPCNT:
+		val = tp->keepalive_probes ? : sysctl_tcp_keepalive_probes;
+		break;
+	case TCP_SYNCNT:
+		val = tp->syn_retries ? : sysctl_tcp_syn_retries;
+		break;
+	case TCP_LINGER2:
+		val = tp->linger2;
+		if (val >= 0)
+			val = (val ? : sysctl_tcp_fin_timeout)/HZ;
+		break;
+	case TCP_DEFER_ACCEPT:
+		val = tp->defer_accept == 0 ?
0 : ((TCP_TIMEOUT_INIT/HZ)<<(tp->defer_accept-1)); + break; + case TCP_WINDOW_CLAMP: + val = tp->window_clamp; + break; + case TCP_INFO: + { + struct tcp_info info; + u32 now = tcp_time_stamp; + + if(get_user(len,optlen)) + return -EFAULT; + info.tcpi_state = sk->state; + info.tcpi_ca_state = tp->ca_state; + info.tcpi_retransmits = tp->retransmits; + info.tcpi_probes = tp->probes_out; + info.tcpi_backoff = tp->backoff; + info.tcpi_options = 0; + if (tp->tstamp_ok) + info.tcpi_options |= TCPI_OPT_TIMESTAMPS; + if (tp->sack_ok) + info.tcpi_options |= TCPI_OPT_SACK; + if (tp->wscale_ok) { + info.tcpi_options |= TCPI_OPT_WSCALE; + info.tcpi_snd_wscale = tp->snd_wscale; + info.tcpi_rcv_wscale = tp->rcv_wscale; + } else { + info.tcpi_snd_wscale = 0; + info.tcpi_rcv_wscale = 0; + } + if (tp->ecn_flags&TCP_ECN_OK) + info.tcpi_options |= TCPI_OPT_ECN; + + info.tcpi_rto = (1000000*tp->rto)/HZ; + info.tcpi_ato = (1000000*tp->ack.ato)/HZ; + info.tcpi_snd_mss = tp->mss_cache; + info.tcpi_rcv_mss = tp->ack.rcv_mss; + + info.tcpi_unacked = tp->packets_out; + info.tcpi_sacked = tp->sacked_out; + info.tcpi_lost = tp->lost_out; + info.tcpi_retrans = tp->retrans_out; + info.tcpi_fackets = tp->fackets_out; + + info.tcpi_last_data_sent = ((now - tp->lsndtime)*1000)/HZ; + info.tcpi_last_ack_sent = 0; + info.tcpi_last_data_recv = ((now - tp->ack.lrcvtime)*1000)/HZ; + info.tcpi_last_ack_recv = ((now - tp->rcv_tstamp)*1000)/HZ; + + info.tcpi_pmtu = tp->pmtu_cookie; + info.tcpi_rcv_ssthresh = tp->rcv_ssthresh; + info.tcpi_rtt = ((1000000*tp->srtt)/HZ)>>3; + info.tcpi_rttvar = ((1000000*tp->mdev)/HZ)>>2; + info.tcpi_snd_ssthresh = tp->snd_ssthresh; + info.tcpi_snd_cwnd = tp->snd_cwnd; + info.tcpi_advmss = tp->advmss; + info.tcpi_reordering = tp->reordering; + + len = min_t(unsigned int, len, sizeof(info)); + if(put_user(len, optlen)) + return -EFAULT; + if(copy_to_user(optval, &info,len)) + return -EFAULT; + return 0; + } + case TCP_QUICKACK: + val = !tp->ack.pingpong; + break; + default: + return -ENOPROTOOPT; + }; + + if(put_user(len, optlen)) + return -EFAULT; + if(copy_to_user(optval, &val,len)) + return -EFAULT; + return 0; +#else + return 0; +#endif +} + + +//extern void __skb_cb_too_small_for_tcp(int, int); +//extern void tcpdiag_init(void); + +void /* __init */ tcp_init(void) +{ +#if 0 + struct sk_buff *skb = NULL; + unsigned long goal; + int order, i; + + if(sizeof(struct tcp_skb_cb) > sizeof(skb->cb)) + __skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb), + sizeof(skb->cb)); + + tcp_openreq_cachep = kmem_cache_create("tcp_open_request", + sizeof(struct open_request), + 0, SLAB_HWCACHE_ALIGN, + NULL, NULL); + if(!tcp_openreq_cachep) + panic("tcp_init: Cannot alloc open_request cache."); + + tcp_bucket_cachep = kmem_cache_create("tcp_bind_bucket", + sizeof(struct tcp_bind_bucket), + 0, SLAB_HWCACHE_ALIGN, + NULL, NULL); + if(!tcp_bucket_cachep) + panic("tcp_init: Cannot alloc tcp_bind_bucket cache."); + + tcp_timewait_cachep = kmem_cache_create("tcp_tw_bucket", + sizeof(struct tcp_tw_bucket), + 0, SLAB_HWCACHE_ALIGN, + NULL, NULL); + if(!tcp_timewait_cachep) + panic("tcp_init: Cannot alloc tcp_tw_bucket cache."); + + /* Size and allocate the main established and bind bucket + * hash tables. + * + * The methodology is similar to that of the buffer cache. 
+ */
+	if (num_physpages >= (128 * 1024))
+		goal = num_physpages >> (21 - PAGE_SHIFT);
+	else
+		goal = num_physpages >> (23 - PAGE_SHIFT);
+
+	for(order = 0; (1UL << order) < goal; order++)
+		;
+	do {
+		tcp_ehash_size = (1UL << order) * PAGE_SIZE /
+			sizeof(struct tcp_ehash_bucket);
+		tcp_ehash_size >>= 1;
+		while (tcp_ehash_size & (tcp_ehash_size-1))
+			tcp_ehash_size--;
+		tcp_ehash = (struct tcp_ehash_bucket *)
+			__get_free_pages(GFP_ATOMIC, order);
+	} while (tcp_ehash == NULL && --order > 0);
+
+	if (!tcp_ehash)
+		panic("Failed to allocate TCP established hash table\n");
+	for (i = 0; i < (tcp_ehash_size<<1); i++) {
+		tcp_ehash[i].lock = RW_LOCK_UNLOCKED;
+		tcp_ehash[i].chain = NULL;
+	}
+
+	do {
+		tcp_bhash_size = (1UL << order) * PAGE_SIZE /
+			sizeof(struct tcp_bind_hashbucket);
+		if ((tcp_bhash_size > (64 * 1024)) && order > 0)
+			continue;
+		tcp_bhash = (struct tcp_bind_hashbucket *)
+			__get_free_pages(GFP_ATOMIC, order);
+	} while (tcp_bhash == NULL && --order >= 0);
+
+	if (!tcp_bhash)
+		panic("Failed to allocate TCP bind hash table\n");
+	for (i = 0; i < tcp_bhash_size; i++) {
+		tcp_bhash[i].lock = SPIN_LOCK_UNLOCKED;
+		tcp_bhash[i].chain = NULL;
+	}
+
+	/* Try to be a bit smarter and adjust defaults depending
+	 * on available memory.
+	 */
+	if (order > 4) {
+		sysctl_local_port_range[0] = 32768;
+		sysctl_local_port_range[1] = 61000;
+		sysctl_tcp_max_tw_buckets = 180000;
+		sysctl_tcp_max_orphans = 4096<<(order-4);
+		sysctl_max_syn_backlog = 1024;
+	} else if (order < 3) {
+		sysctl_local_port_range[0] = 1024*(3-order);
+		sysctl_tcp_max_tw_buckets >>= (3-order);
+		sysctl_tcp_max_orphans >>= (3-order);
+		sysctl_max_syn_backlog = 128;
+	}
+	tcp_port_rover = sysctl_local_port_range[0] - 1;
+
+	sysctl_tcp_mem[0] = 768 << order;
+	sysctl_tcp_mem[1] = 1024 << order;
+	sysctl_tcp_mem[2] = 1536 << order;
+	if (sysctl_tcp_mem[2] - sysctl_tcp_mem[1] > 512)
+		sysctl_tcp_mem[1] = sysctl_tcp_mem[2] - 512;
+	if (sysctl_tcp_mem[1] - sysctl_tcp_mem[0] > 512)
+		sysctl_tcp_mem[0] = sysctl_tcp_mem[1] - 512;
+
+	if (order < 3) {
+		sysctl_tcp_wmem[2] = 64*1024;
+		sysctl_tcp_rmem[0] = PAGE_SIZE;
+		sysctl_tcp_rmem[1] = 43689;
+		sysctl_tcp_rmem[2] = 2*43689;
+	}
+
+	printk(KERN_INFO "TCP: Hash tables configured (established %d bind %d)\n",
+	       tcp_ehash_size<<1, tcp_bhash_size);
+
+	tcpdiag_init();
+#endif
+}
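
tcp_poll() above reports POLLHUP only once both directions have been shut down; a FIN from the peer alone surfaces as POLLIN with a zero-byte read, and pending urgent data as POLLPRI. A minimal user-space sketch of how those bits are consumed, assuming fd is an already connected TCP socket (report_poll_state is just an illustrative name):

#include <poll.h>
#include <stdio.h>
#include <sys/socket.h>
#include <sys/types.h>

/* Poll a connected TCP socket once and describe what the mask means. */
static void report_poll_state(int fd)
{
    struct pollfd pfd = { .fd = fd, .events = POLLIN | POLLOUT | POLLPRI };

    if (poll(&pfd, 1, 1000) < 0) {              /* wait at most one second */
        perror("poll");
        return;
    }
    if (pfd.revents & POLLERR)
        printf("pending socket error\n");
    if (pfd.revents & POLLHUP)
        printf("both directions shut down\n");
    if (pfd.revents & POLLPRI)
        printf("urgent (out-of-band) data pending\n");
    if (pfd.revents & POLLIN) {
        char c;
        ssize_t n = recv(fd, &c, 1, MSG_PEEK);  /* peek, do not consume */
        if (n == 0)
            printf("peer sent FIN (EOF)\n");
        else if (n > 0)
            printf("data available\n");
    }
    if (pfd.revents & POLLOUT)
        printf("send buffer has room\n");
}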
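
tcp_ioctl() above is what backs the Linux SIOCINQ query (unread bytes in the receive queue, minus a queued FIN) and SIOCOUTQ (bytes written but not yet acknowledged by the peer). A small Linux-specific sketch, again assuming a connected TCP socket fd:

#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/sockios.h>      /* SIOCINQ, SIOCOUTQ (Linux-specific) */

static void print_queue_depths(int fd)
{
    int inq = 0, outq = 0;

    if (ioctl(fd, SIOCINQ, &inq) == 0)      /* bytes waiting to be read */
        printf("receive queue: %d bytes\n", inq);
    if (ioctl(fd, SIOCOUTQ, &outq) == 0)    /* bytes sent but not yet ACKed */
        printf("send queue:    %d bytes\n", outq);
}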
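
The max_qlen_log loop in tcp_listen_start() picks the smallest power of two, never below 64, that covers sysctl_max_syn_backlog, so the SYN-queue length checks elsewhere reduce to a shift and a compare. The same computation in isolation (the function name is ours):

/* Smallest exponent e >= 6 such that (1 << e) covers the requested backlog;
 * mirrors the max_qlen_log loop in tcp_listen_start(). */
static unsigned int syn_queue_log2(unsigned int max_syn_backlog)
{
    unsigned int log = 6;               /* never fewer than 64 entries */

    while ((1U << log) < max_syn_backlog)
        log++;
    return log;
}

Storing the exponent rather than the size is what lets the listen code test "queue full" with a single shift.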
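
tcp_sendpage() and do_tcp_sendpages() above form the zero-copy transmit path: whole pages are attached to skb fragments instead of being copied, and the path is only taken when the route supports scatter-gather and hardware checksumming (otherwise sock_no_sendpage() falls back to ordinary copying). From user space this is the machinery behind Linux sendfile(2) on a TCP socket; a sketch assuming sock_fd is connected and file_fd is a regular file:

#include <sys/sendfile.h>
#include <sys/stat.h>
#include <sys/types.h>

/* Push an entire file down a connected TCP socket without staging it
 * through a user-space buffer. */
static int send_whole_file(int sock_fd, int file_fd)
{
    struct stat st;
    off_t off = 0;

    if (fstat(file_fd, &st) < 0)
        return -1;
    while (off < st.st_size) {
        ssize_t n = sendfile(sock_fd, file_fd, &off, st.st_size - off);
        if (n <= 0)
            return -1;          /* error or unexpected end of file */
    }
    return 0;
}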
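
tcp_recv_urg() and the urgent-data checks in tcp_recvmsg() above implement the classic BSD semantics: a single out-of-band byte, readable with MSG_OOB unless SO_OOBINLINE is set, and SIOCATMARK to tell the reader when normal data has been drained up to the urgent mark. A user-space sketch, assuming a connected socket fd on which the peer may have sent urgent data:

#include <stdio.h>
#include <sys/ioctl.h>
#include <sys/socket.h>

/* Consume normal data up to the urgent mark, then fetch the OOB byte. */
static void read_up_to_urgent(int fd)
{
    char buf[512];
    char oob;
    int at_mark = 0;

    for (;;) {
        if (ioctl(fd, SIOCATMARK, &at_mark) < 0)
            return;
        if (at_mark)
            break;                      /* next byte is the urgent byte */
        if (recv(fd, buf, sizeof buf, 0) <= 0)
            return;                     /* EOF or error before the mark */
    }
    if (recv(fd, &oob, 1, MSG_OOB) == 1)    /* fails if no urgent data is pending */
        printf("urgent byte: 0x%02x\n", (unsigned char)oob);
}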
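
The new_state[] table above packs two things per current state: the state close() should move the socket to (low bits) and a flag saying whether a FIN has to be queued (a high bit); tcp_close_state() masks them apart. A standalone illustration of that encoding; the state numbers, mask, and flag values below are illustrative stand-ins, not necessarily the ones tcpcore.h defines:

#include <stdio.h>

enum { ST_ESTABLISHED = 1, ST_FIN_WAIT1 = 4, ST_CLOSE = 7,
       ST_CLOSE_WAIT = 8, ST_LAST_ACK = 9 };

#define STATE_MASK  0x0f        /* low bits: state to enter on close()  */
#define ACTION_FIN  0x80        /* flag bit: a FIN segment must be sent */

static const unsigned char close_transition[] = {
    [ST_ESTABLISHED] = ST_FIN_WAIT1 | ACTION_FIN,
    [ST_CLOSE_WAIT]  = ST_LAST_ACK  | ACTION_FIN,
    [ST_CLOSE]       = ST_CLOSE,
};

int main(void)
{
    unsigned char next = close_transition[ST_ESTABLISHED];

    printf("next state %d, send FIN: %s\n",
           next & STATE_MASK, (next & ACTION_FIN) ? "yes" : "no");
    return 0;
}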
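
tcp_close() above resets the connection instead of performing the FIN handshake in two situations: unread data was still queued (the draft-ietf-tcpimpl-prob-03 behaviour), or the application asked for zero linger. The second case can be requested from user space; a minimal sketch for a connected socket fd:

#include <sys/socket.h>
#include <unistd.h>

/* Abortive close: the kernel drops any queued data and emits RST rather
 * than FIN (the zero-lingertime branch of tcp_close()). */
static int close_with_reset(int fd)
{
    struct linger lg = { .l_onoff = 1, .l_linger = 0 };

    if (setsockopt(fd, SOL_SOCKET, SO_LINGER, &lg, sizeof lg) < 0)
        return -1;
    return close(fd);
}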
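
The TCP_CORK comment in tcp_setsockopt() above describes the intended usage: cork the socket, write the headers, send the body (typically via sendfile), then uncork so the last partial frame is flushed; the code also refuses to combine it with TCP_NODELAY (nonagle 1 versus 2). A user-space sketch of that pattern, where send_headers and send_body stand for the application's own routines:

#include <netinet/in.h>
#include <netinet/tcp.h>        /* TCP_CORK is Linux-specific */
#include <sys/socket.h>

static int send_corked_response(int fd,
                                int (*send_headers)(int),
                                int (*send_body)(int))
{
    int on = 1, off = 0, rc;

    setsockopt(fd, IPPROTO_TCP, TCP_CORK, &on, sizeof on);   /* hold partial frames */
    rc = send_headers(fd);
    if (rc == 0)
        rc = send_body(fd);
    setsockopt(fd, IPPROTO_TCP, TCP_CORK, &off, sizeof off); /* flush what is queued */
    return rc;
}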
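
The TCP_DEFER_ACCEPT case in tcp_setsockopt() converts a timeout given in seconds into a count of SYN-ACK retransmissions, the unit the listen-socket timers actually work in, by doubling the initial retransmission interval until it covers the request. The same conversion in isolation, with TIMEOUT_INIT_SECS standing in for TCP_TIMEOUT_INIT/HZ (typically 3 seconds):

/* Translate a defer-accept timeout in seconds into the number of SYN-ACK
 * retransmissions after which the connection is finally handed to accept(). */
static int defer_accept_retrans(int seconds)
{
    const int TIMEOUT_INIT_SECS = 3;    /* illustrative TCP_TIMEOUT_INIT/HZ */
    int retrans = 0;

    if (seconds <= 0)
        return 0;
    while (retrans < 32 && seconds > (TIMEOUT_INIT_SECS << retrans))
        retrans++;
    return retrans + 1;
}

The user-space side is simply setsockopt(fd, IPPROTO_TCP, TCP_DEFER_ACCEPT, &seconds, sizeof seconds) on the listening socket.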
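
tcp_init() above sizes the established-connection hash from the amount of physical memory: it chooses a page-allocation order near the goal, halves the bucket count (the upper half of the table appears to be reserved for TIME_WAIT sockets, hence the tcp_ehash_size<<1 loops), and rounds it down to a power of two so that index masking stays cheap. The rounding step in isolation, with illustrative parameters:

#include <stddef.h>

/* Number of hash buckets that fit in (2^order) pages, halved and rounded
 * down to a power of two; mirrors the ehash sizing loop in tcp_init(). */
static unsigned long ehash_buckets(unsigned int order,
                                   unsigned long page_size,
                                   size_t bucket_size)
{
    unsigned long n = ((1UL << order) * page_size / bucket_size) >> 1;

    while (n & (n - 1))     /* step down until exactly one bit is set */
        n--;
    return n;
}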