--- /dev/null
+From 8b9d3728977760f6bd1317c4420890f73695354e Mon Sep 17 00:00:00 2001
+From: Jarek Poplawski <jarkao2@gmail.com>
+Date: Mon, 19 Jan 2009 17:03:56 -0800
+Subject: net: Fix data corruption when splicing from sockets.
+
+From: Jarek Poplawski <jarkao2@gmail.com>
+
+[ Upstream commit 8b9d3728977760f6bd1317c4420890f73695354e ]
+
+The trick in socket splicing where we try to convert the skb->data
+into a page based reference using virt_to_page() does not work so
+well.
+
+The idea is to pass the virt_to_page() reference via the pipe
+buffer, and refcount the buffer using a SKB reference.
+
+But if we are splicing from a socket to a socket (via sendpage)
+this doesn't work.
+
+The from side processing will grab the page (and SKB) references.
+The sendpage() calls will grab page references only, return, and
+then the from side processing completes and drops the SKB ref.
+
+The page based reference to skb->data is not enough to keep the
+kmalloc() buffer backing it from being reused. Yet, that is
+all that the socket send side has at this point.
+
+This leads to data corruption if the skb->data buffer is reused
+by SLAB before the send side socket actually gets the TX packet
+out to the device.
+
+The fix employed here is to simply allocate a page and copy the
+skb->data bytes into that page.
+
+This will hurt performance, but there is no clear way to fix this
+properly without a copy at the present time, and it is important
+to get rid of the data corruption.
+
+With fixes from Herbert Xu.
+
+Tested-by: Willy Tarreau <w@1wt.eu>
+Foreseen-by: Changli Gao <xiaosuo@gmail.com>
+Diagnosed-by: Willy Tarreau <w@1wt.eu>
+Reported-by: Willy Tarreau <w@1wt.eu>
+Fixed-by: Jens Axboe <jens.axboe@oracle.com>
+Signed-off-by: Jarek Poplawski <jarkao2@gmail.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ net/core/skbuff.c | 61 +++++++++++++++++++++++++-----------------------------
+ 1 file changed, 29 insertions(+), 32 deletions(-)
+
+--- a/net/core/skbuff.c
++++ b/net/core/skbuff.c
+@@ -73,17 +73,13 @@ static struct kmem_cache *skbuff_fclone_
+ static void sock_pipe_buf_release(struct pipe_inode_info *pipe,
+ struct pipe_buffer *buf)
+ {
+- struct sk_buff *skb = (struct sk_buff *) buf->private;
+-
+- kfree_skb(skb);
++ put_page(buf->page);
+ }
+
+ static void sock_pipe_buf_get(struct pipe_inode_info *pipe,
+ struct pipe_buffer *buf)
+ {
+- struct sk_buff *skb = (struct sk_buff *) buf->private;
+-
+- skb_get(skb);
++ get_page(buf->page);
+ }
+
+ static int sock_pipe_buf_steal(struct pipe_inode_info *pipe,
+@@ -1262,9 +1258,19 @@ fault:
+ */
+ static void sock_spd_release(struct splice_pipe_desc *spd, unsigned int i)
+ {
+- struct sk_buff *skb = (struct sk_buff *) spd->partial[i].private;
++ put_page(spd->pages[i]);
++}
+
+- kfree_skb(skb);
++static inline struct page *linear_to_page(struct page *page, unsigned int len,
++ unsigned int offset)
++{
++ struct page *p = alloc_pages(GFP_KERNEL, 0);
++
++ if (!p)
++ return NULL;
++ memcpy(page_address(p) + offset, page_address(page) + offset, len);
++
++ return p;
+ }
+
+ /*
+@@ -1272,16 +1278,23 @@ static void sock_spd_release(struct spli
+ */
+ static inline int spd_fill_page(struct splice_pipe_desc *spd, struct page *page,
+ unsigned int len, unsigned int offset,
+- struct sk_buff *skb)
++ struct sk_buff *skb, int linear)
+ {
+ if (unlikely(spd->nr_pages == PIPE_BUFFERS))
+ return 1;
+
++ if (linear) {
++ page = linear_to_page(page, len, offset);
++ if (!page)
++ return 1;
++ } else
++ get_page(page);
++
+ spd->pages[spd->nr_pages] = page;
+ spd->partial[spd->nr_pages].len = len;
+ spd->partial[spd->nr_pages].offset = offset;
+- spd->partial[spd->nr_pages].private = (unsigned long) skb_get(skb);
+ spd->nr_pages++;
++
+ return 0;
+ }
+
+@@ -1297,7 +1310,7 @@ static inline void __segment_seek(struct
+ static inline int __splice_segment(struct page *page, unsigned int poff,
+ unsigned int plen, unsigned int *off,
+ unsigned int *len, struct sk_buff *skb,
+- struct splice_pipe_desc *spd)
++ struct splice_pipe_desc *spd, int linear)
+ {
+ if (!*len)
+ return 1;
+@@ -1320,7 +1333,7 @@ static inline int __splice_segment(struc
+ /* the linear region may spread across several pages */
+ flen = min_t(unsigned int, flen, PAGE_SIZE - poff);
+
+- if (spd_fill_page(spd, page, flen, poff, skb))
++ if (spd_fill_page(spd, page, flen, poff, skb, linear))
+ return 1;
+
+ __segment_seek(&page, &poff, &plen, flen);
+@@ -1347,7 +1360,7 @@ static int __skb_splice_bits(struct sk_b
+ if (__splice_segment(virt_to_page(skb->data),
+ (unsigned long) skb->data & (PAGE_SIZE - 1),
+ skb_headlen(skb),
+- offset, len, skb, spd))
++ offset, len, skb, spd, 1))
+ return 1;
+
+ /*
+@@ -1357,7 +1370,7 @@ static int __skb_splice_bits(struct sk_b
+ const skb_frag_t *f = &skb_shinfo(skb)->frags[seg];
+
+ if (__splice_segment(f->page, f->page_offset, f->size,
+- offset, len, skb, spd))
++ offset, len, skb, spd, 0))
+ return 1;
+ }
+
+@@ -1370,7 +1383,7 @@ static int __skb_splice_bits(struct sk_b
+ * the frag list, if such a thing exists. We'd probably need to recurse to
+ * handle that cleanly.
+ */
+-int skb_splice_bits(struct sk_buff *__skb, unsigned int offset,
++int skb_splice_bits(struct sk_buff *skb, unsigned int offset,
+ struct pipe_inode_info *pipe, unsigned int tlen,
+ unsigned int flags)
+ {
+@@ -1383,16 +1396,6 @@ int skb_splice_bits(struct sk_buff *__sk
+ .ops = &sock_pipe_buf_ops,
+ .spd_release = sock_spd_release,
+ };
+- struct sk_buff *skb;
+-
+- /*
+- * I'd love to avoid the clone here, but tcp_read_sock()
+- * ignores reference counts and unconditonally kills the sk_buff
+- * on return from the actor.
+- */
+- skb = skb_clone(__skb, GFP_KERNEL);
+- if (unlikely(!skb))
+- return -ENOMEM;
+
+ /*
+ * __skb_splice_bits() only fails if the output has no room left,
+@@ -1416,15 +1419,9 @@ int skb_splice_bits(struct sk_buff *__sk
+ }
+
+ done:
+- /*
+- * drop our reference to the clone, the pipe consumption will
+- * drop the rest.
+- */
+- kfree_skb(skb);
+-
+ if (spd.nr_pages) {
++ struct sock *sk = skb->sk;
+ int ret;
+- struct sock *sk = __skb->sk;
+
+ /*
+ * Drop the socket lock, otherwise we have reverse