<!-- received="Fri Sep 10 07:41:10 1999 EET DST" -->
<!-- sent="Fri, 10 Sep 1999 05:41:03 +0200" -->
<!-- name="Artur Skawina" -->
<!-- email="skawina@geocities.com" -->
<!-- subject="[PATCH] ip checksum fixes and optimizations" -->
<!-- id="" -->
<!-- inreplyto="" -->
<title>Linux-kernel mailing list archive 1999-36,: [PATCH] ip checksum fixes and optimizations</title>
<body bgcolor="#FFFFFF"><font face="Arial,Helvetica">
<h1>[PATCH] ip checksum fixes and optimizations</h1>
<b>Artur Skawina</b> (<a href="mailto:skawina@geocities.com"><i>skawina@geocities.com</i></a>)<br>
<i>Fri, 10 Sep 1999 05:41:03 +0200</i>
<p>
<ul>
<li> <b>Messages sorted by:</b> <a href="date.html#797">[ date ]</a><a href="index.html#797">[ thread ]</a><a href="subject.html#797">[ subject ]</a><a href="author.html#797">[ author ]</a>
<!-- next="start" -->
<li> <b>Next message:</b> <a href="0798.html">fito: "smbfs unresolved symbol get_cached_page"</a>
<li> <b>Previous message:</b> <a href="0796.html">Albert D. Cahalan: "Re: 'Current correctly-operating ps?'"</a>
<!-- nextthread="start" -->
<!-- reply="end" -->
</ul>
<hr>
<!-- body="start" -->
This is a multi-part message in MIME format.<br>
<p>
--------------3219EB5E33F6067B37ED4058<br>
Content-Type: text/plain; charset=us-ascii<br>
Content-Transfer-Encoding: 7bit<br>
<p>
Some time ago i looked at the ip checksum routines and found a lot of<br>
places that could be improved. This patch contains the nonintrusive,<br>
self-contained parts. Among other things, the checksum routines were sometimes called<br>
with ridiculously small args, already known information was discarded,<br>
the asm code was not optimized fully for modern processors etc.<br>
I just ported this to 2.3.17, and as i may not have the time to work<br>
on this in the next few days I'm posting it as is, only to get some<br>
feedback. Thoughts?<br>
<p>
Parts of this are ~100% correct, but there are also things i noticed<br>
while working on something else, and have still to recheck, some<br>
strange compiler issues not fully investigated etc. The checksum.S<br>
parts are ok, but they (apparently;) need a lot more comments. The<br>
[345]86 routines are unchanged.<br>
<p>
<p>
Highlights:<br>
o csum_partial_copy() backward compatibility stub removed<br>
o csum_partial() inlined for very small constant sizes (like struct udphdr)<br>
o ip_fast_csum() - various optimizations<br>
o csum_tcpudp_nofold() optimized for sum==0 case<br>
o 686 csum_partial() optimized (up to 13%..1% faster)<br>
  (ie 13% for 32 bytes, but only 1% for 1480)<br>
o 686 csum_partial_copy_generic() optimized (14%..20% faster)<br>
o incremental checksum calculation fix<br>
<p>
--------------3219EB5E33F6067B37ED4058<br>
Content-Type: text/plain; charset=us-ascii; name="patch"<br>
Content-Transfer-Encoding: 7bit<br>
Content-Disposition: inline; filename="patch"<br>
<p>
diff -urNp /img/linux-2.3.17/arch/i386/kernel/i386_ksyms.c linux-2.3.17as/arch/i386/kernel/i386_ksyms.c<br>
--- /img/linux-2.3.17/arch/i386/kernel/i386_ksyms.c	Wed Sep  1 20:33:56 1999<br>
+++ linux-2.3.17as/arch/i386/kernel/i386_ksyms.c	Wed Sep  8 22:48:18 1999<br>
@@ -47,7 +47,6 @@ EXPORT_SYMBOL_NOVERS(__down_failed_inter<br>
 EXPORT_SYMBOL_NOVERS(__down_failed_trylock);<br>
 EXPORT_SYMBOL_NOVERS(__up_wakeup);<br>
 /* Networking helper routines. */<br>
-EXPORT_SYMBOL(csum_partial_copy);<br>
 EXPORT_SYMBOL(csum_partial_copy_generic);<br>
 /* Delay loops */<br>
 EXPORT_SYMBOL(__udelay);<br>
diff -urNp /img/linux-2.3.17/arch/i386/lib/Makefile linux-2.3.17as/arch/i386/lib/Makefile<br>
--- /img/linux-2.3.17/arch/i386/lib/Makefile	Fri Aug 27 13:13:11 1999<br>
+++ linux-2.3.17as/arch/i386/lib/Makefile	Wed Sep  8 17:45:06 1999<br>
@@ -6,7 +6,7 @@<br>
 	$(CC) -D__ASSEMBLY__ $(AFLAGS) -traditional -c $&lt; -o $*.o<br>
 <br>
 L_TARGET = lib.a<br>
-L_OBJS  = checksum.o old-checksum.o delay.o \<br>
+L_OBJS  = checksum.o delay.o \<br>
 	usercopy.o getuser.o putuser.o<br>
 <br>
 include $(TOPDIR)/Rules.make<br>
diff -urNp /img/linux-2.3.17/arch/i386/lib/checksum.S linux-2.3.17as/arch/i386/lib/checksum.S<br>
--- /img/linux-2.3.17/arch/i386/lib/checksum.S	Thu May 20 01:59:04 1999<br>
+++ linux-2.3.17as/arch/i386/lib/checksum.S	Fri Sep 10 02:50:45 1999<br>
@@ -36,8 +36,8 @@ unsigned int csum_partial(const unsigned<br>
  */<br>
 		<br>
 .text<br>
-.align 4<br>
-.globl csum_partial								<br>
+.align 32<br>
+.globl _csum_partial								<br>
 		<br>
 #if CPU!=686<br>
 <br>
@@ -48,7 +48,7 @@ unsigned int csum_partial(const unsigned<br>
 	   * Fortunately, it is easy to convert 2-byte alignment to 4-byte<br>
 	   * alignment for the unrolled loop.<br>
 	   */		<br>
-csum_partial:	<br>
+_csum_partial:	<br>
 	pushl %esi<br>
 	pushl %ebx<br>
 	movl 20(%esp),%eax	# Function arg: unsigned int sum<br>
@@ -116,47 +116,29 @@ csum_partial:	<br>
 <br>
 #else /* CPU==686 */<br>
 <br>
-csum_partial:<br>
+_csum_partial:<br>
+	movl 16-4*2(%esp),%edx	# Function arg2: int len<br>
 	pushl %esi<br>
+	movl 12-4*1(%esp),%esi	# Function arg1: const unsigned char *buf<br>
 	pushl %ebx<br>
-	movl 20(%esp),%eax	# Function arg: unsigned int sum<br>
-	movl 16(%esp),%ecx	# Function arg: int len<br>
-	movl 12(%esp),%esi	# Function arg:	const unsigned char *buf<br>
-<br>
-	testl $2, %esi         <br>
+	movl 20-4*0(%esp),%eax	# Function arg3: unsigned int sum<br>
+	testl $2, %esi<br>
 	jnz 30f                 <br>
 10:<br>
-	movl %ecx, %edx<br>
-	movl %ecx, %ebx<br>
+	movl %edx, %ebx<br>
+	movl %edx, %ecx<br>
 	andl $0x7c, %ebx<br>
-	shrl $7, %ecx<br>
+	shrl $7, %edx<br>
 	addl %ebx,%esi<br>
 	shrl $2, %ebx  <br>
 	negl %ebx<br>
+        testl  %edx,%edx<br>
 	lea 45f(%ebx,%ebx,2), %ebx<br>
-	testl %esi, %esi<br>
 	jmp *%ebx<br>
 <br>
-	# Handle 2-byte-aligned regions<br>
-20:	addw (%esi), %ax<br>
-	lea 2(%esi), %esi<br>
-	adcl $0, %eax<br>
-	jmp 10b<br>
-<br>
-30:	subl $2, %ecx          <br>
-	ja 20b                 <br>
-	je 32f<br>
-	movzbl (%esi),%ebx	# csumming 1 byte, 2-aligned<br>
-	addl %ebx, %eax<br>
-	adcl $0, %eax<br>
-	jmp 80f<br>
-32:<br>
-	addw (%esi), %ax	# csumming 2 bytes, 2-aligned<br>
-	adcl $0, %eax<br>
-	jmp 80f<br>
-<br>
 40: <br>
-	addl -128(%esi), %eax<br>
+	leal 128(%esi), %esi			# %esi += 128<br>
+	addl -128(%esi), %eax	# CF==0 and a plain 'add' is faster<br>
 	adcl -124(%esi), %eax<br>
 	adcl -120(%esi), %eax<br>
 	adcl -116(%esi), %eax   <br>
@@ -189,27 +171,44 @@ csum_partial:<br>
 	adcl -8(%esi), %eax<br>
 	adcl -4(%esi), %eax<br>
 45:<br>
-	lea 128(%esi), %esi<br>
-	adcl $0, %eax<br>
-	dec %ecx<br>
+	adcl $0, %eax			# %eax += CF<br>
+	dec %edx<br>
 	jge 40b<br>
-	movl %edx, %ecx<br>
-50:	andl $3, %ecx<br>
-	jz 80f<br>
 <br>
-	# Handle the last 1-3 bytes without jumping<br>
-	notl %ecx		# 1-&gt;2, 2-&gt;1, 3-&gt;0, higher bits are masked<br>
-	movl $0xffffff,%ebx	# by the shll and shrl instructions<br>
-	shll $3,%ecx<br>
-	shrl %cl,%ebx<br>
-	andl -128(%esi),%ebx	# esi is 4-aligned so should be ok<br>
-	addl %ebx,%eax<br>
-	adcl $0,%eax<br>
+	andl $3, %ecx<br>
+	jz 80f<br>
+					# Handle the last 1-3 bytes without jumping<br>
+	notb %cl			# 1-&gt;2, 2-&gt;1, 3-&gt;0, higher bits are masked<br>
+	movl $0xffffff,%edx		# by the shll and shrl instructions<br>
+	shlb $3,%cl<br>
+	shrl %cl,%edx			# note: arg1 must be CL<br>
+	andl (%esi),%edx		# esi is 4-aligned so should be ok<br>
+	addl %edx,%eax<br>
+75:<br>
+	adcl $0,%eax			# %eax += CF<br>
 80: <br>
-	popl %ebx<br>
-	popl %esi<br>
+        popl %ebx<br>
+        popl %esi<br>
 	ret<br>
-				<br>
+<br>
+	# Handle 2-byte-aligned regions<br>
+20:<br>
+	addw (%esi), %ax<br>
+	lea 2(%esi), %esi<br>
+	adcl $0, %eax<br>
+	jmp 10b<br>
+<br>
+30:<br>
+	subl   $2, %edx  <br>
+        ja     20b<br>
+	js     35f<br>
+	addw   (%esi), %ax	# csumming 2 bytes, 2-aligned<br>
+        jmp 75b<br>
+<br>
+35:<br>
+	movzbl (%esi),%ecx	# csumming 1 byte, 2-aligned<br>
+	addl   %ecx, %eax<br>
+        jmp 75b<br>
 #endif /* CPU==686 */ <br>
 <br>
 /*<br>
@@ -240,13 +239,13 @@ unsigned int csum_partial_copy_generic (<br>
 	.long 9999b, 6002f	;	\<br>
 	.previous<br>
 <br>
-.align 4<br>
+.align 32<br>
 .globl csum_partial_copy_generic<br>
 				<br>
 #if CPU!=686<br>
 <br>
 #define ARGBASE 16		<br>
-#define FP		12<br>
+#define FP	12<br>
 		<br>
 csum_partial_copy_generic:<br>
 	subl  $4,%esp	<br>
@@ -371,81 +370,119 @@ DST(	movb %cl, (%edi)	)<br>
 <br>
 /* Version for PentiumII/PPro */<br>
 <br>
+/*<br>
+   This is <br>
+    o  70% slower when the source is not 32 bit aligned [ie (long)src&amp;3]<br>
+    o 190% slower when the destination is not 32 bit aligned<br>
+    o 260% slower when both source and destination are not 32 bit aligned<br>
+    o 175% slower when destination is not 64 bit aligned and source _is_ [ie (long)dst&amp;4]<br>
+    o whether source is 64 bit aligned or not does not seem to make much difference<br>
+ */<br>
+<br>
 #define ROUND1(x) \<br>
-	SRC(movl x(%esi), %ebx	)	;	\<br>
-	addl %ebx, %eax			;	\<br>
-	DST(movl %ebx, x(%edi)	)	; <br>
+	SRC(movl x(%esi), %edx	)	;\<br>
+	addl %edx, %eax			;\<br>
+	SRC(movl x+4(%esi), %ebx )	;\<br>
+	DST(movl %edx, x(%edi)	)	;\<br>
+	adcl %ebx, %eax			;\<br>
+	DST(movl %ebx, x+4(%edi) )	;\<br>
 <br>
 #define ROUND(x) \<br>
-	SRC(movl x(%esi), %ebx	)	;	\<br>
-	adcl %ebx, %eax			;	\<br>
-	DST(movl %ebx, x(%edi)	)	;<br>
+	SRC(movl x(%esi), %edx	)	;\<br>
+	adcl %edx, %eax			;\<br>
+	SRC(movl x+4(%esi), %ebx )	;\<br>
+	DST(movl %edx, x(%edi)	)	;\<br>
+	adcl %ebx, %eax			;\<br>
+	DST(movl %ebx, x+4(%edi) )	;\<br>
+        <br>
+#define ROUNDL(x) \<br>
+	SRC(movl x(%esi), %edx	)	;\<br>
+	adcl %edx, %eax			;\<br>
+	SRC(movl x+4(%esi), %ebx )	;\<br>
+	adcl %ebx, %eax			;\<br>
+	DST(movl %edx, x(%edi)	)	;\<br>
+	DST(movl %ebx, x+4(%edi) )	;\<br>
 <br>
 #define ARGBASE 12<br>
 		<br>
 csum_partial_copy_generic:<br>
 	pushl %ebx<br>
-	pushl %edi<br>
+	movl ARGBASE+12-4*2(%esp),%ebx	#len<br>
 	pushl %esi<br>
-	movl ARGBASE+4(%esp),%esi	#src<br>
-	movl ARGBASE+8(%esp),%edi	#dst	<br>
-	movl ARGBASE+12(%esp),%ecx	#len<br>
-	movl ARGBASE+16(%esp),%eax	#sum<br>
-	movl %ecx, %edx  <br>
-	movl %ecx, %ebx  <br>
-	shrl $6, %ecx     <br>
-	andl $0x3c, %ebx  <br>
+	movl ARGBASE+4-4*1(%esp),%esi	#src<br>
+	movl %ebx, %ecx  <br>
+	pushl %edi<br>
+	movl ARGBASE+8-4*0(%esp),%edi	#dst	<br>
+	andl $0x38, %ebx  <br>
+	addl %ebx, %esi  <br>
+	shrl $6, %ecx     		# len /= 64 (amount of longwords per iteration)<br>
+	addl %ebx, %edi  <br>
 	negl %ebx<br>
-	subl %ebx, %esi  <br>
-	subl %ebx, %edi  <br>
+	movl ARGBASE+16-4*0(%esp),%eax	#sum<br>
 	lea 3f(%ebx,%ebx), %ebx<br>
-	testl %esi, %esi <br>
+        testl  %eax,%eax		# CF=0<br>
 	jmp *%ebx         <br>
-1:	addl $64,%esi<br>
+1:<br>
+	addl $64,%esi<br>
 	addl $64,%edi <br>
-	ROUND1(-64) ROUND(-60) ROUND(-56) ROUND(-52)	<br>
-	ROUND (-48) ROUND(-44) ROUND(-40) ROUND(-36)	<br>
-	ROUND (-32) ROUND(-28) ROUND(-24) ROUND(-20)	<br>
-	ROUND (-16) ROUND(-12) ROUND(-8)  ROUND(-4)	<br>
-3:	adcl $0,%eax<br>
+	ROUND1(-64) ROUND (-56)   <br>
+	ROUND (-48) ROUND (-40)   <br>
+	ROUND (-32) ROUND (-24)   <br>
+	ROUND (-16) ROUNDL(-8) <br>
+3:<br>
+	adcl $0,%eax<br>
 	dec %ecx<br>
 	jge 1b<br>
-4:      andl $3, %edx<br>
+        <br>
+	movl ARGBASE+12(%esp),%edx	#len<br>
+        <br>
+        testl $4,%edx<br>
+        jz  4f<br>
+	SRC(movl (%esi), %ebx	)<br>
+	addl %ebx, %eax<br>
+	DST(movl %ebx, (%edi)	)<br>
+	leal 4(%esi), %esi<br>
+	leal 4(%edi), %edi<br>
+	adcl $0, %eax<br>
+4:<br>
+        andl $3, %edx<br>
 	jz 7f<br>
 	cmpl $2, %edx<br>
 	jb 5f<br>
 SRC(	movw (%esi), %dx         )<br>
-	leal 2(%esi), %esi<br>
 DST(	movw %dx, (%edi)         )<br>
-	leal 2(%edi), %edi<br>
 	je 6f<br>
+	leal 2(%esi), %esi<br>
 	shll $16,%edx<br>
+	leal 2(%edi), %edi<br>
 5:<br>
 SRC(	movb (%esi), %dl         )<br>
 DST(	movb %dl, (%edi)         )<br>
-6:	addl %edx, %eax<br>
+6:<br>
+	addl %edx, %eax<br>
 	adcl $0, %eax<br>
 7:<br>
 .section .fixup, "ax"<br>
-6001:	movl	ARGBASE+20(%esp), %ebx	# src_err_ptr	<br>
-	movl $-EFAULT, (%ebx)<br>
+6001:<br>
 	# zero the complete destination (computing the rest is too much work)<br>
 	movl ARGBASE+8(%esp),%edi	# dst<br>
 	movl ARGBASE+12(%esp),%ecx	# len<br>
+	movl ARGBASE+20(%esp), %ebx	# src_err_ptr	<br>
 	xorl %eax,%eax<br>
+	movl $-EFAULT, (%ebx)<br>
 	rep; stosb<br>
-	jmp 7b<br>
+	jmp  7b<br>
 6002:	movl ARGBASE+24(%esp), %ebx	# dst_err_ptr<br>
 	movl $-EFAULT, (%ebx)<br>
 	jmp  7b			<br>
 .previous				<br>
 <br>
-	popl %esi<br>
 	popl %edi<br>
+	popl %esi<br>
 	popl %ebx<br>
 	ret<br>
 				<br>
 #undef ROUND<br>
 #undef ROUND1		<br>
-		<br>
+<br>
 #endif	/* CPU==i686 */ <br>
diff -urNp /img/linux-2.3.17/arch/i386/lib/old-checksum.c linux-2.3.17as/arch/i386/lib/old-checksum.c<br>
--- /img/linux-2.3.17/arch/i386/lib/old-checksum.c	Sun Dec 27 18:32:09 1998<br>
+++ linux-2.3.17as/arch/i386/lib/old-checksum.c	Thu Jan  1 00:00:00 1970<br>
@@ -1,19 +0,0 @@<br>
-/*<br>
- * FIXME: old compatibility stuff, will be removed soon.<br>
- */<br>
-<br>
-#include &lt;net/checksum.h&gt;<br>
-<br>
-unsigned int csum_partial_copy( const char *src, char *dst, int len, int sum)<br>
-{<br>
-	int src_err=0, dst_err=0;<br>
-<br>
-	sum = csum_partial_copy_generic ( src, dst, len, sum, &amp;src_err, &amp;dst_err);<br>
-<br>
-	if (src_err || dst_err)<br>
-		printk("old csum_partial_copy_fromuser(), tell mingo to convert me.\n");<br>
-<br>
-	return sum;<br>
-}<br>
-<br>
-<br>
diff -urNp /img/linux-2.3.17/drivers/net/ibmtr.c linux-2.3.17as/drivers/net/ibmtr.c<br>
--- /img/linux-2.3.17/drivers/net/ibmtr.c	Fri Aug 27 13:13:00 1999<br>
+++ linux-2.3.17as/drivers/net/ibmtr.c	Wed Sep  8 17:34:37 1999<br>
@@ -1598,7 +1598,7 @@ static void tr_rx(struct net_device *dev<br>
 	/* Copy the payload... */<br>
 	for (;;) {<br>
 		if (IPv4_p)<br>
-			chksum = csum_partial_copy(bus_to_virt(rbufdata), data,<br>
+			chksum = csum_partial_copy_nocheck(bus_to_virt(rbufdata), data,<br>
 						   length &lt; rbuffer_len ? length : rbuffer_len,<br>
 						   chksum);<br>
 		else<br>
diff -urNp /img/linux-2.3.17/include/asm-i386/checksum.h linux-2.3.17as/include/asm-i386/checksum.h<br>
--- /img/linux-2.3.17/include/asm-i386/checksum.h	Sun Dec 27 18:39:50 1998<br>
+++ linux-2.3.17as/include/asm-i386/checksum.h	Wed Sep  8 17:34:37 1999<br>
@@ -14,7 +14,40 @@<br>
  *<br>
  * it's best to have buff aligned on a 32-bit boundary<br>
  */<br>
-asmlinkage unsigned int csum_partial(const unsigned char * buff, int len, unsigned int sum);<br>
+<br>
+#define csum_partial(buff,len,sum)              \<br>
+(                                               \<br>
+   __builtin_constant_p(len) ?                  \<br>
+      _csum_partial_clen((buff),(len),(sum)) :  \<br>
+      _csum_partial((buff),(len),(sum))         \<br>
+)<br>
+asmlinkage unsigned int _csum_partial(const unsigned char * buff, int len, unsigned int sum);<br>
+<br>
+extern inline unsigned int _csum_partial_clen(const unsigned char *buff, int len, unsigned int sum)<br>
+{<br>
+   switch ( len )<br>
+   {<br>
+   case 4:<br>
+      asm (<br>
+         "addl  (%1),  %0\n\t"<br>
+         "adcl  $0,    %0"       /* += CF */<br>
+         : "=&amp;r" (sum)<br>
+         : "r" (buff), "0" (sum)<br>
+            );<br>
+      return sum;<br>
+   case 8:                       /* eg sizeof(struct udphdr) */<br>
+      asm (<br>
+         "addl  (%1),  %0\n\t"<br>
+         "adcl  4(%1), %0\n\t"<br>
+         "adcl  $0,    %0"       /* += CF */<br>
+         : "=&amp;r" (sum)<br>
+         : "r" (buff), "0" (sum)<br>
+            );<br>
+      return sum;<br>
+   default:<br>
+      return _csum_partial(buff,len,sum);<br>
+   }<br>
+}<br>
 <br>
 /*<br>
  * the same as csum_partial, but copies from src while it<br>
@@ -23,10 +56,12 @@ asmlinkage unsigned int csum_partial(con<br>
  * here even more important to align src and dst on a 32-bit (or even<br>
  * better 64-bit) boundary<br>
  */<br>
-<br>
+/* automatically inlining this routine isn't as obviously a win, as with checksum<br>
+   alone (exceptions must be handled, and currently (2.3.6). only icmp code would<br>
+   benefit from this anyway)<br>
+ */<br>
 asmlinkage unsigned int csum_partial_copy_generic( const char *src, char *dst, int len, int sum,<br>
 						   int *src_err_ptr, int *dst_err_ptr);<br>
-<br>
 /*<br>
  *	Note: when you get a NULL pointer exception here this means someone<br>
  *	passed in an incorrect kernel address to one of these functions. <br>
@@ -34,22 +69,13 @@ asmlinkage unsigned int csum_partial_cop<br>
  *	If you use these functions directly please don't forget the <br>
  *	verify_area().<br>
  */<br>
-extern __inline__<br>
-unsigned int csum_partial_copy_nocheck ( const char *src, char *dst,<br>
-					int len, int sum)<br>
-{<br>
-	return csum_partial_copy_generic ( src, dst, len, sum, NULL, NULL);<br>
-}<br>
-<br>
-extern __inline__<br>
-unsigned int csum_partial_copy_from_user ( const char *src, char *dst,<br>
-						int len, int sum, int *err_ptr)<br>
-{<br>
-	return csum_partial_copy_generic ( src, dst, len, sum, err_ptr, NULL);<br>
-}<br>
+#define csum_partial_copy_nocheck(src,dst,len,sum) \<br>
+   csum_partial_copy_generic((src), (dst), (len), (sum), NULL, NULL)<br>
+   <br>
+#define csum_partial_copy_from_user(src,dst,len,sum,err_ptr) \<br>
+   csum_partial_copy_generic((src), (dst), (len), (sum), (err_ptr), NULL)<br>
 <br>
 #if 0<br>
-<br>
 /* Not used at the moment. It is difficult to imagine for what purpose<br>
    it can be used :-) Please, do not forget to verify_area before it --ANK<br>
  */<br>
@@ -57,32 +83,18 @@ unsigned int csum_partial_copy_from_user<br>
 /*<br>
  * This combination is currently not used, but possible:<br>
  */<br>
-<br>
-extern __inline__<br>
-unsigned int csum_partial_copy_to_user ( const char *src, char *dst,<br>
-					int len, int sum, int *err_ptr)<br>
-{<br>
-	return csum_partial_copy_generic ( src, dst, len, sum, NULL, err_ptr);<br>
-}<br>
+#define csum_partial_copy_to_user(src,dst,len,sum,err_ptr) \<br>
+   csum_partial_copy_generic((src), (dst), (len), (sum), NULL, (err_ptr))<br>
 #endif<br>
 <br>
 /*<br>
- * These are the old (and unsafe) way of doing checksums, a warning message will be<br>
- * printed if they are used and an exeption occurs.<br>
- *<br>
- * these functions should go away after some time.<br>
- */<br>
-<br>
-#define csum_partial_copy_fromuser csum_partial_copy<br>
-unsigned int csum_partial_copy( const char *src, char *dst, int len, int sum);<br>
-<br>
-/*<br>
  *	This is a version of ip_compute_csum() optimized for IP headers,<br>
  *	which always checksum on 4 octet boundaries.<br>
  *<br>
  *	By Jorge Cwik &lt;<a href="mailto:jorge@laser.satlink.net">jorge@laser.satlink.net</a>&gt;, adapted for linux by<br>
  *	Arnt Gulbrandsen.<br>
  */<br>
+#if 0<br>
 static inline unsigned short ip_fast_csum(unsigned char * iph,<br>
 					  unsigned int ihl) {<br>
 	unsigned int sum;<br>
@@ -109,16 +121,74 @@ static inline unsigned short ip_fast_csu<br>
 	/* Since the input registers which are loaded with iph and ipl<br>
 	   are modified, we must also specify them as outputs, or gcc<br>
 	   will assume they contain their original values. */<br>
-	: "=r" (sum), "=r" (iph), "=r" (ihl)<br>
+	: "=&amp;r" (sum), "=r" (iph), "=r" (ihl)<br>
 	: "1" (iph), "2" (ihl));<br>
 	return(sum);<br>
 }<br>
+#else<br>
+ /* ihl is the number of 32-bit words and is always &gt;= 5. */<br>
+#define ip_fast_csum(iph,ihl)    csum_fold(ip_fast_csum_nofold((iph),(ihl)))<br>
+<br>
+#define ip_fast_csum_nofold(iph,ihl)            \<br>
+(                                               \<br>
+   __builtin_constant_p(ihl) ?                  \<br>
+      _ip_fast_csum_nofold_cihl((iph),(ihl)) :  \<br>
+      _ip_fast_csum_nofold((iph),(ihl))         \<br>
+)<br>
+extern inline unsigned int _ip_fast_csum_nofold(unsigned char *iph, unsigned int ihl)<br>
+{<br>
+	__u32  sum;<br>
+<br>
+	__asm__ __volatile__(<br>
+	    "movl (%1), %0\n\t"<br>
+	    "addl 4(%1), %0\n\t"<br>
+	    "adcl 8(%1), %0\n\t"<br>
+	    "adcl 12(%1), %0\n"<br>
+	"1:\n\t"<br>
+	    "adcl 16(%1), %0\n\t"<br>
+	    "decl %2\n\t"<br>
+	    "lea 4(%1), %1\n\t"<br>
+	    "jne	1b\n\t"<br>
+	    "adcl $0, %0"           /* += CF */<br>
+            <br>
+	/* Since the input registers which are loaded with iph and ipl<br>
+	   are modified, we must also specify them as outputs, or gcc<br>
+	   will assume they contain their original values. */<br>
+	: "=&amp;r" (sum), "=r" (iph), "=r" (ihl)<br>
+	: "1" (iph), "2" (ihl-4));<br>
+        <br>
+	return sum;<br>
+}<br>
+extern inline unsigned int _ip_fast_csum_nofold_cihl(unsigned char *iph, unsigned int ihl)<br>
+{<br>
+   __u32    sum;<br>
+<br>
+   switch ( ihl )       /* "ihl" is a constant - pick the best asm sequence */<br>
+   {<br>
+   case 5:<br>
+	__asm__ __volatile__ (<br>
+	    "movl (%1),   %0\n\t"<br>
+	    "addl 4(%1),  %0\n\t"<br>
+	    "adcl 8(%1),  %0\n\t"<br>
+	    "adcl 12(%1), %0\n\t"<br>
+	    "adcl 16(%1), %0\n\t"<br>
+	    "adcl $0, %0"           /* += CF */<br>
+            <br>
+	: "=&amp;r" (sum)<br>
+	: "r" (iph) );<br>
+        <br>
+	return sum;<br>
+   default:<br>
+        return _ip_fast_csum_nofold(iph,ihl);   /* never happens */<br>
+   }<br>
+}<br>
+#endif<br>
 <br>
 /*<br>
  *	Fold a partial checksum<br>
  */<br>
 <br>
-static inline unsigned int csum_fold(unsigned int sum)<br>
+static inline __u16 csum_fold(unsigned int sum)<br>
 {<br>
 	__asm__("<br>
 		addl %1, %0<br>
@@ -130,7 +200,14 @@ static inline unsigned int csum_fold(uns<br>
 	return (~sum) &gt;&gt; 16;<br>
 }<br>
  <br>
-static inline unsigned long csum_tcpudp_nofold(unsigned long saddr,<br>
+ /* This is often called with a const "sum"==0 */<br>
+#define csum_tcpudp_nofold(saddr,daddr,len,proto,sum)                 \<br>
+(                                                                     \<br>
+   __builtin_constant_p(sum) ?                                        \<br>
+      _csum_tcpudp_nofold_csum((saddr),(daddr),(len),(proto),(sum)) : \<br>
+      _csum_tcpudp_nofold((saddr),(daddr),(len),(proto),(sum))        \<br>
+)<br>
+static inline unsigned long _csum_tcpudp_nofold(unsigned long saddr,<br>
 						   unsigned long daddr,<br>
 						   unsigned short len,<br>
 						   unsigned short proto,<br>
@@ -142,32 +219,47 @@ static inline unsigned long csum_tcpudp_<br>
 	adcl %3, %0<br>
 	adcl $0, %0<br>
 	"<br>
-	: "=r" (sum)<br>
-	: "g" (daddr), "g"(saddr), "g"((ntohs(len)&lt;&lt;16)+proto*256), "0"(sum));<br>
+	: "=&amp;r" (sum)<br>
+	: "g" (daddr), "g"(saddr), "g"((htons(len)&lt;&lt;16)+proto*256), "0"(sum));<br>
     return sum;<br>
 }<br>
+static inline unsigned long _csum_tcpudp_nofold_csum( unsigned long  saddr,<br>
+						      unsigned long  daddr,<br>
+						      unsigned short len,<br>
+						      unsigned short proto,<br>
+						      unsigned int   sum ) <br>
+{<br>
+   switch ( sum )       /* "sum" is a constant - pick the best asm sequence */<br>
+   {<br>
+   case 0:     /* sum==0 is special:<br>
+                *    - no need to use "sum" at all<br>
+                *    - no need to clear the register - just start with another arg<br>
+                */<br>
+      __asm__(<br>
+	"addl %1, %0\n\t"<br>
+	"adcl %2, %0\n\t"<br>
+	"adcl $0, %0"<br>
+	: "=&amp;r" (sum)<br>
+	: "g" (daddr), "g"(saddr), "0"((htons(len)&lt;&lt;16)+proto*256) );<br>
+      return sum;<br>
+      <br>
+   default:    /* not optimized - never happens */<br>
+      return _csum_tcpudp_nofold(saddr,daddr,len,proto,sum);<br>
+   }<br>
+}<br>
 <br>
 /*<br>
  * computes the checksum of the TCP/UDP pseudo-header<br>
  * returns a 16-bit checksum, already complemented<br>
  */<br>
-static inline unsigned short int csum_tcpudp_magic(unsigned long saddr,<br>
-						   unsigned long daddr,<br>
-						   unsigned short len,<br>
-						   unsigned short proto,<br>
-						   unsigned int sum) <br>
-{<br>
-	return csum_fold(csum_tcpudp_nofold(saddr,daddr,len,proto,sum));<br>
-}<br>
-<br>
+#define csum_tcpudp_magic(saddr,daddr,len,proto,sum) \<br>
+   csum_fold(csum_tcpudp_nofold((saddr),(daddr),(len),(proto),(sum)))<br>
+   <br>
 /*<br>
  * this routine is used for miscellaneous IP-like checksums, mainly<br>
  * in icmp.c<br>
  */<br>
-<br>
-static inline unsigned short ip_compute_csum(unsigned char * buff, int len) {<br>
-    return csum_fold (csum_partial(buff, len, 0));<br>
-}<br>
+#define ip_compute_csum(buff,len)   csum_fold(csum_partial((buff), (len), 0))<br>
 <br>
 #define _HAVE_ARCH_IPV6_CSUM<br>
 static __inline__ unsigned short int csum_ipv6_magic(struct in6_addr *saddr,<br>
@@ -191,13 +283,14 @@ static __inline__ unsigned short int csu<br>
 		"<br>
 		: "=&amp;r" (sum)<br>
 		: "r" (saddr), "r" (daddr), <br>
-		  "r"(htonl((__u32) (len))), "r"(htonl(proto)), "0"(sum));<br>
+		  "g"(htonl((__u32) (len))), "g"(htonl(proto)), "0"(sum));<br>
 <br>
 	return csum_fold(sum);<br>
 }<br>
 <br>
 /* <br>
  *	Copy and checksum to user<br>
+ *      (this is currently never called with a constant len - AS)<br>
  */<br>
 #define HAVE_CSUM_COPY_USER<br>
 static __inline__ unsigned int csum_and_copy_to_user (const char *src, char *dst,<br>
diff -urNp /img/linux-2.3.17/include/net/ip.h linux-2.3.17as/include/net/ip.h<br>
--- /img/linux-2.3.17/include/net/ip.h	Wed Sep  1 20:44:32 1999<br>
+++ linux-2.3.17as/include/net/ip.h	Fri Sep 10 02:55:57 1999<br>
@@ -13,6 +13,8 @@<br>
  *<br>
  * Changes:<br>
  *		Mike McLagan    :       Routing by source<br>
+ *     19990714 Artur Skawina &lt;<a href="mailto:skawina@geocities.com">skawina@geocities.com</a>&gt; :<br>
+ *			Fixed incremental IP checksum calculation (ip_decrease_ttl)<br>
  *<br>
  *		This program is free software; you can redistribute it and/or<br>
  *		modify it under the terms of the GNU General Public License<br>
@@ -148,11 +150,18 @@ extern __inline__ int ip_send(struct sk_<br>
 extern __inline__<br>
 int ip_decrease_ttl(struct iphdr *iph)<br>
 {<br>
-	u16 check = iph-&gt;check;<br>
-	check = ntohs(check) + 0x0100;<br>
-	if ((check &amp; 0xFF00) == 0)<br>
-		check++;		/* carry overflow */<br>
-	iph-&gt;check = htons(check);<br>
+/* iph-&gt;check contains the complement of the sum, in one's complement<br>
+   arithmetic. We decrease one of the values (ttl), so we also have to<br>
+   decrement the sum (not the complement).<br>
+   Note that there are two different representations of zero, of which only<br>
+   one ["-0"] is valid here. This means that eg. 0xfeff [~(0x0100)] must<br>
+   become 0x0000 [~(-0)], not 0xffff [~(+0)]. /AS<br>
+ */<br>
+	u32 check = (u32)ntohs(iph-&gt;check) + 0x0100;<br>
+	if (check &gt;= 0xffff)           /* check for overflow (incl. +0) */<br>
+		check++;<br>
+	iph-&gt;check = htons((u16)check);<br>
+<br>
 	return --iph-&gt;ttl;<br>
 }<br>
 <br>
diff -urNp /img/linux-2.3.17/net/ethernet/eth.c linux-2.3.17as/net/ethernet/eth.c<br>
--- /img/linux-2.3.17/net/ethernet/eth.c	Fri Aug 27 13:13:03 1999<br>
+++ linux-2.3.17as/net/ethernet/eth.c	Wed Sep  8 17:34:37 1999<br>
@@ -298,7 +298,7 @@ void eth_copy_and_sum(struct sk_buff *de<br>
 	if ((ip_length &lt;= length) &amp;&amp; (ip_length &gt; 7))<br>
 		length=ip_length;<br>
 <br>
-	dest-&gt;csum=csum_partial_copy(src+sizeof(struct iphdr)+ETH_HLEN,dest-&gt;data+sizeof(struct iphdr)+ETH_HLEN,length,base);<br>
+	dest-&gt;csum=csum_partial_copy_nocheck(src+sizeof(struct iphdr)+ETH_HLEN,dest-&gt;data+sizeof(struct iphdr)+ETH_HLEN,length,base);<br>
 	dest-&gt;ip_summed=1;<br>
 }<br>
 <br>
diff -urNp /img/linux-2.3.17/net/ipv4/icmp.c linux-2.3.17as/net/ipv4/icmp.c<br>
--- /img/linux-2.3.17/net/ipv4/icmp.c	Wed Sep  1 20:34:30 1999<br>
+++ linux-2.3.17as/net/ipv4/icmp.c	Wed Sep  8 17:34:37 1999<br>
@@ -483,7 +483,7 @@ static int icmp_glue_bits(const void *p,<br>
 	unsigned long csum;<br>
 <br>
 	if (offset) {<br>
-		icmp_param-&gt;csum=csum_partial_copy(icmp_param-&gt;data_ptr+offset-sizeof(struct icmphdr), <br>
+		icmp_param-&gt;csum=csum_partial_copy_nocheck(icmp_param-&gt;data_ptr+offset-sizeof(struct icmphdr), <br>
 				to, fraglen,icmp_param-&gt;csum);<br>
 		return 0;<br>
 	}<br>
@@ -493,10 +493,10 @@ static int icmp_glue_bits(const void *p,<br>
 	 *	the other fragments first, so that we get the checksum<br>
 	 *	for the whole packet here.<br>
 	 */<br>
-	csum = csum_partial_copy((void *)&amp;icmp_param-&gt;icmph,<br>
+	csum = csum_partial_copy_nocheck((void *)&amp;icmp_param-&gt;icmph,<br>
 		to, sizeof(struct icmphdr), <br>
 		icmp_param-&gt;csum);<br>
-	csum = csum_partial_copy(icmp_param-&gt;data_ptr,<br>
+	csum = csum_partial_copy_nocheck(icmp_param-&gt;data_ptr,<br>
 		to+sizeof(struct icmphdr),<br>
 		fraglen-sizeof(struct icmphdr), csum);<br>
 	icmph=(struct icmphdr *)to;<br>
diff -urNp /img/linux-2.3.17/net/ipv4/ip_output.c linux-2.3.17as/net/ipv4/ip_output.c<br>
--- /img/linux-2.3.17/net/ipv4/ip_output.c	Wed Sep  1 20:34:30 1999<br>
+++ linux-2.3.17as/net/ipv4/ip_output.c	Wed Sep  8 17:34:37 1999<br>
@@ -733,8 +733,15 @@ int ip_build_xmit(struct sock *sk, <br>
 		iph-&gt;saddr=rt-&gt;rt_src;<br>
 		iph-&gt;daddr=rt-&gt;rt_dst;<br>
 		iph-&gt;check=0;<br>
+#if 1<br>
+                  /* ugh - egcs 1.0 and 1.1 miscompiles this.<br>
+                     egcs 2.95 19990615 prerelease seems to do ok */<br>
+		iph-&gt;check = ip_fast_csum((unsigned char *)iph, /*iph-&gt;ihl*/5);<br>
+		err = getfrag(frag, ((char *)iph)+/*iph-&gt;ihl*/5*4,0, length-/*iph-&gt;ihl*/5*4);<br>
+#else<br>
 		iph-&gt;check = ip_fast_csum((unsigned char *)iph, iph-&gt;ihl);<br>
 		err = getfrag(frag, ((char *)iph)+iph-&gt;ihl*4,0, length-iph-&gt;ihl*4);<br>
+#endif<br>
 	}<br>
 	else<br>
 		err = getfrag(frag, (void *)iph, 0, length);<br>
diff -urNp /img/linux-2.3.17/net/ipv4/ipconfig.c linux-2.3.17as/net/ipv4/ipconfig.c<br>
--- /img/linux-2.3.17/net/ipv4/ipconfig.c	Fri Aug 27 13:13:15 1999<br>
+++ linux-2.3.17as/net/ipv4/ipconfig.c	Wed Sep  8 17:34:37 1999<br>
@@ -524,7 +524,7 @@ static void __init ic_bootp_send_if(stru<br>
 	h-&gt;ttl = 64;<br>
 	h-&gt;protocol = IPPROTO_UDP;<br>
 	h-&gt;daddr = INADDR_BROADCAST;<br>
-	h-&gt;check = ip_fast_csum((unsigned char *) h, h-&gt;ihl);<br>
+	h-&gt;check = ip_fast_csum((unsigned char *) h, /*h-&gt;ihl*/5);<br>
 <br>
 	/* Construct UDP header */<br>
 	b-&gt;udph.source = htons(68);<br>
@@ -635,7 +635,7 @@ static int __init ic_bootp_recv(struct s<br>
 	    skb-&gt;len &lt; sizeof(struct udphdr) + sizeof(struct iphdr) ||<br>
 	    h-&gt;ihl != 5 ||<br>
 	    h-&gt;version != 4 ||<br>
-	    ip_fast_csum((char *) h, h-&gt;ihl) != 0 ||<br>
+	    ip_fast_csum((char *) h, /*h-&gt;ihl*/5) != 0 ||<br>
 	    skb-&gt;len &lt; ntohs(h-&gt;tot_len) ||<br>
 	    h-&gt;protocol != IPPROTO_UDP ||<br>
 	    b-&gt;udph.source != htons(67) ||<br>
diff -urNp /img/linux-2.3.17/net/ipv4/tcp_output.c linux-2.3.17as/net/ipv4/tcp_output.c<br>
--- /img/linux-2.3.17/net/ipv4/tcp_output.c	Fri Aug 27 13:13:15 1999<br>
+++ linux-2.3.17as/net/ipv4/tcp_output.c	Wed Sep  8 17:34:37 1999<br>
@@ -276,7 +276,7 @@ static int tcp_fragment(struct sock *sk,<br>
 	TCP_SKB_CB(buff)-&gt;sacked = 0;<br>
 <br>
 	/* Copy and checksum data tail into the new buffer. */<br>
-	buff-&gt;csum = csum_partial_copy(skb-&gt;data + len, skb_put(buff, nsize),<br>
+	buff-&gt;csum = csum_partial_copy_nocheck(skb-&gt;data + len, skb_put(buff, nsize),<br>
 				       nsize, 0);<br>
 <br>
 	/* This takes care of the FIN sequence number too. */<br>
@@ -542,7 +542,7 @@ static void tcp_retrans_try_collapse(str<br>
 			/* Optimize, actually we could also combine next_skb-&gt;csum<br>
 			 * to skb-&gt;csum using a single add w/carry operation too.<br>
 			 */<br>
-			skb-&gt;csum = csum_partial_copy(next_skb-&gt;data,<br>
+			skb-&gt;csum = csum_partial_copy_nocheck(next_skb-&gt;data,<br>
 						      skb_put(skb, next_skb_size),<br>
 						      next_skb_size, skb-&gt;csum);<br>
 		}<br>
diff -urNp /img/linux-2.3.17/net/ipv6/icmp.c linux-2.3.17as/net/ipv6/icmp.c<br>
--- /img/linux-2.3.17/net/ipv6/icmp.c	Fri Aug 27 13:13:15 1999<br>
+++ linux-2.3.17as/net/ipv6/icmp.c	Wed Sep  8 17:34:37 1999<br>
@@ -146,17 +146,17 @@ static int icmpv6_getfrag(const void *da<br>
 	 */<br>
 <br>
 	if (offset) {<br>
-		csum = csum_partial_copy((void *) msg-&gt;data +<br>
+		csum = csum_partial_copy_nocheck((void *) msg-&gt;data +<br>
 					 offset - sizeof(struct icmp6hdr), <br>
 					 buff, len, msg-&gt;csum);<br>
 		msg-&gt;csum = csum;<br>
 		return 0;<br>
 	}<br>
 <br>
-	csum = csum_partial_copy((void *) &amp;msg-&gt;icmph, buff,<br>
+	csum = csum_partial_copy_nocheck((void *) &amp;msg-&gt;icmph, buff,<br>
 				 sizeof(struct icmp6hdr), msg-&gt;csum);<br>
 <br>
-	csum = csum_partial_copy((void *) msg-&gt;data, <br>
+	csum = csum_partial_copy_nocheck((void *) msg-&gt;data, <br>
 				 buff + sizeof(struct icmp6hdr),<br>
 				 len - sizeof(struct icmp6hdr), csum);<br>
 <br>
diff -urNp /img/linux-2.3.17/net/netsyms.c linux-2.3.17as/net/netsyms.c<br>
--- /img/linux-2.3.17/net/netsyms.c	Wed Sep  1 20:34:33 1999<br>
+++ linux-2.3.17as/net/netsyms.c	Wed Sep  8 17:34:37 1999<br>
@@ -200,7 +200,11 @@ EXPORT_SYMBOL(scm_fp_dup);<br>
 EXPORT_SYMBOL(max_files);<br>
 EXPORT_SYMBOL(do_mknod);<br>
 EXPORT_SYMBOL(memcpy_toiovec);<br>
+#ifndef csum_partial<br>
 EXPORT_SYMBOL(csum_partial);<br>
+#else<br>
+EXPORT_SYMBOL(_csum_partial);<br>
+#endif<br>
 <br>
 #ifdef CONFIG_IPX_MODULE<br>
 EXPORT_SYMBOL(make_8023_client);<br>
<p>
--------------3219EB5E33F6067B37ED4058--<br>
<p>
<p>
<p>
-<br>
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in<br>
the body of a message to majordomo@vger.rutgers.edu<br>
Please read the FAQ at <a href="http://www.tux.org/lkml/">http://www.tux.org/lkml/</a><br>
<!-- body="end" -->
<hr>
<p>
<ul>
<!-- next="start" -->
<li> <b>Next message:</b> <a href="0798.html">fito: "smbfs unresolved symbol get_cached_page"</a>
<li> <b>Previous message:</b> <a href="0796.html">Albert D. Cahalan: "Re: 'Current correctly-operating ps?'"</a>
<!-- nextthread="start" -->
<!-- reply="end" -->
</ul>
</font></body>
