[PATCH] TCP Zero Copy for mmapped files

Thomas Ogrisegg (tom@rhadamanthys.org)
Mon, 30 Dec 2002 02:09:53 +0100


--6c2NcOVqGQ03X4Wi
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline

The following patch (for 2.4.20 -- should work with all kernels
above 2.4.17) implements TCP Zero Copy for normal (writing)
socket operations on memory mapped files.

This is a major speedup for the TCP/IP stack (depending on the size
of the file more than 100% more throughput) and makes sendfile(2)
nearly useless.

BTW: When I did a (loopback) benchmark against my very own HTTP-
Server it outperformed TUX by roughly 6%. With logging disabled
by roughly 20%.

Please CC any replies to me, as I'm not subscribed to this list.

--6c2NcOVqGQ03X4Wi
Content-Type: text/plain; charset=us-ascii
Content-Disposition: attachment; filename="tcp.diff"

--- linux.old/net/ipv4/tcp.c Fri Nov 29 00:53:15 2002
+++ linux-2.4.20/net/ipv4/tcp.c Sun Dec 29 20:30:10 2002
@@ -204,6 +204,7 @@
* Andi Kleen : Make poll agree with SIGIO
* Salvatore Sanfilippo : Support SO_LINGER with linger == 1 and
* lingertime == 0 (RFC 793 ABORT Call)
+ * Thomas Ogrisegg : Added TCP Zero Copy for mmapped files
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
@@ -1006,6 +1007,41 @@
return tmp;
}

+static ssize_t file_send_actor (read_descriptor_t *desc, struct page *page,
+ unsigned long offset, unsigned long size)
+{
+ ssize_t res;
+ unsigned long count = desc->count;
+ struct sock *sk = (struct sock *) desc->buf;
+ int flags;
+
+ if (size > count)
+ size = count;
+
+ flags = (sk->socket->file->f_flags & O_NONBLOCK) ? MSG_DONTWAIT : 0;
+ if (size < count) flags |= MSG_MORE;
+
+#define TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM)
+
+ if (!(sk->route_caps & NETIF_F_SG) ||
+ !(sk->route_caps & TCP_ZC_CSUM_FLAGS))
+ return sock_no_sendpage(sk->socket, page, offset, size, flags);
+
+#undef TCP_ZC_CSUM_FLAGS
+
+ TCP_CHECK_TIMER(sk);
+ res = do_tcp_sendpages(sk, &page, offset, size, flags);
+ TCP_CHECK_TIMER(sk);
+
+ if (res < 0) desc->error = res;
+ else {
+ desc->count -= res;
+ desc->written += res;
+ }
+
+ return res;
+}
+
int tcp_sendmsg(struct sock *sk, struct msghdr *msg, int size)
{
struct iovec *iov;
@@ -1015,6 +1051,7 @@
int mss_now;
int err, copied;
long timeo;
+ int has_sendpage = sk->socket->file->f_op->sendpage != NULL;

tp = &(sk->tp_pinfo.af_tcp);

@@ -1049,6 +1086,44 @@

iov++;

+ if (seglen >= PAGE_SIZE && has_sendpage) {
+ struct vm_area_struct *vma =
+ find_vma (current->mm, (long) from);
+ struct file *filp;
+
+ if (vma && (filp = vma->vm_file)) {
+ read_descriptor_t desc;
+ struct inode *in, *out;
+ loff_t pos = (long) from - vma->vm_start;
+
+ in = filp->f_dentry->d_inode;
+ out = sk->socket->file->f_dentry->d_inode;
+
+ if (locks_verify_area (FLOCK_VERIFY_READ, in,
+ filp, filp->f_pos, seglen))
+ goto out_no_zero_copy;
+
+ if (locks_verify_area (FLOCK_VERIFY_WRITE, out,
+ sk->socket->file, 0, seglen))
+ goto out_no_zero_copy;
+
+ desc.written = 0;
+ desc.count = seglen;
+ desc.buf = (char *) sk;
+ desc.error = 0;
+
+ do_generic_file_read (filp, &pos, &desc,
+ file_send_actor);
+
+ if (!desc.written) {
+ err = desc.error;
+ goto do_error;
+ }
+ copied += desc.written;
+ continue;
+ }
+ }
+out_no_zero_copy:
while (seglen > 0) {
int copy;

--6c2NcOVqGQ03X4Wi--
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/