From 8b9d3728977760f6bd1317c4420890f73695354e Mon Sep 17 00:00:00 2001
From: Jarek Poplawski <jarkao2@gmail.com>
Date: Mon, 19 Jan 2009 17:03:56 -0800
Subject: net: Fix data corruption when splicing from sockets.

From: Jarek Poplawski <jarkao2@gmail.com>

[ Upstream commit 8b9d3728977760f6bd1317c4420890f73695354e ]

The trick in socket splicing where we try to convert the skb->data
into a page based reference using virt_to_page() does not work so
well.

The idea is to pass the virt_to_page() reference via the pipe
buffer, and refcount the buffer using a SKB reference.
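
Concretely, the pre-fix pipe buffer hooks (removed in the diff below)
pinned the whole skb through buf->private; annotated, the old scheme
was:

static void sock_pipe_buf_release(struct pipe_inode_info *pipe,
				  struct pipe_buffer *buf)
{
	struct sk_buff *skb = (struct sk_buff *) buf->private;

	kfree_skb(skb);		/* drop the skb ref taken at fill time */
}

static void sock_pipe_buf_get(struct pipe_inode_info *pipe,
			      struct pipe_buffer *buf)
{
	struct sk_buff *skb = (struct sk_buff *) buf->private;

	skb_get(skb);		/* each new pipe-buffer user pins the skb */
}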

But if we are splicing from a socket to a socket (via sendpage)
this doesn't work.
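
The affected syscall sequence, as a minimal userspace sketch (sock_in
and sock_out are assumed to be already-connected sockets; error
handling trimmed):

#define _GNU_SOURCE
#include <fcntl.h>
#include <unistd.h>

static int relay(int sock_in, int sock_out, size_t chunk)
{
	int pfd[2];
	ssize_t n;

	if (pipe(pfd) < 0)
		return -1;

	/* First splice: the pipe buffers end up referencing skb->data. */
	while ((n = splice(sock_in, NULL, pfd[1], NULL, chunk,
			   SPLICE_F_MOVE | SPLICE_F_MORE)) > 0) {
		/* Second splice: hits the socket's sendpage() path,
		 * which takes page references only.
		 */
		if (splice(pfd[0], NULL, sock_out, NULL, n,
			   SPLICE_F_MOVE | SPLICE_F_MORE) < 0)
			break;
	}

	close(pfd[0]);
	close(pfd[1]);
	return n < 0 ? -1 : 0;
}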

The from side processing will grab the page (and SKB) references.
The sendpage() calls will grab page references only, return, and
then the from side processing completes and drops the SKB ref.

The page based reference to skb->data is not enough to keep the
kmalloc() buffer backing it from being reused. Yet, that is
all that the socket send side has at this point.
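
Illustrative fragment (not from the patch) of what the send side is
left holding:

struct page *p = virt_to_page(skb->data);

get_page(p);	/* pins the physical page backing skb->data ... */
kfree_skb(skb);	/* ... but the last skb ref gone, the head goes back
		 * to SLAB anyway, and the next kmalloc() may recycle
		 * those very bytes while the page ref is still held. */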

This leads to data corruption if the skb->data buffer is reused
by SLAB before the send side socket actually gets the TX packet
out to the device.

The fix employed here is to simply allocate a page and copy the
skb->data bytes into that page.
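
That is what the new linear_to_page() helper in the diff below does;
annotated:

static inline struct page *linear_to_page(struct page *page, unsigned int len,
					  unsigned int offset)
{
	struct page *p = alloc_pages(GFP_KERNEL, 0);	/* one fresh page */

	if (!p)
		return NULL;

	/* Safe: the caller clamps len to PAGE_SIZE - offset, so the
	 * copy stays within one page on both sides.
	 */
	memcpy(page_address(p) + offset, page_address(page) + offset, len);

	return p;
}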

This will hurt performance, but there is no clear way to fix this
properly without a copy at the present time, and it is important
to get rid of the data corruption.

With fixes from Herbert Xu.

Tested-by: Willy Tarreau <w@1wt.eu>
Foreseen-by: Changli Gao <xiaosuo@gmail.com>
Diagnosed-by: Willy Tarreau <w@1wt.eu>
Reported-by: Willy Tarreau <w@1wt.eu>
Fixed-by: Jens Axboe <jens.axboe@oracle.com>
Signed-off-by: Jarek Poplawski <jarkao2@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>

---
 net/core/skbuff.c |   61 +++++++++++++++++++++++++-----------------------------
 1 file changed, 29 insertions(+), 32 deletions(-)

--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -73,17 +73,13 @@ static struct kmem_cache *skbuff_fclone_
 static void sock_pipe_buf_release(struct pipe_inode_info *pipe,
 				  struct pipe_buffer *buf)
 {
-	struct sk_buff *skb = (struct sk_buff *) buf->private;
-
-	kfree_skb(skb);
+	put_page(buf->page);
 }
 
 static void sock_pipe_buf_get(struct pipe_inode_info *pipe,
 			      struct pipe_buffer *buf)
 {
-	struct sk_buff *skb = (struct sk_buff *) buf->private;
-
-	skb_get(skb);
+	get_page(buf->page);
 }
 
 static int sock_pipe_buf_steal(struct pipe_inode_info *pipe,
@@ -1333,9 +1329,19 @@ fault:
  */
 static void sock_spd_release(struct splice_pipe_desc *spd, unsigned int i)
 {
-	struct sk_buff *skb = (struct sk_buff *) spd->partial[i].private;
+	put_page(spd->pages[i]);
+}
 
-	kfree_skb(skb);
+static inline struct page *linear_to_page(struct page *page, unsigned int len,
+					  unsigned int offset)
+{
+	struct page *p = alloc_pages(GFP_KERNEL, 0);
+
+	if (!p)
+		return NULL;
+	memcpy(page_address(p) + offset, page_address(page) + offset, len);
+
+	return p;
 }
 
 /*
@@ -1343,16 +1349,23 @@ static void sock_spd_release(struct spli
  */
 static inline int spd_fill_page(struct splice_pipe_desc *spd, struct page *page,
 				unsigned int len, unsigned int offset,
-				struct sk_buff *skb)
+				struct sk_buff *skb, int linear)
 {
 	if (unlikely(spd->nr_pages == PIPE_BUFFERS))
 		return 1;
 
+	if (linear) {
+		page = linear_to_page(page, len, offset);
+		if (!page)
+			return 1;
+	} else
+		get_page(page);
+
 	spd->pages[spd->nr_pages] = page;
 	spd->partial[spd->nr_pages].len = len;
 	spd->partial[spd->nr_pages].offset = offset;
-	spd->partial[spd->nr_pages].private = (unsigned long) skb_get(skb);
 	spd->nr_pages++;
+
 	return 0;
 }
 
@@ -1368,7 +1381,7 @@ static inline void __segment_seek(struct
 static inline int __splice_segment(struct page *page, unsigned int poff,
 				   unsigned int plen, unsigned int *off,
 				   unsigned int *len, struct sk_buff *skb,
-				   struct splice_pipe_desc *spd)
+				   struct splice_pipe_desc *spd, int linear)
 {
 	if (!*len)
 		return 1;
@@ -1391,7 +1404,7 @@ static inline int __splice_segment(struc
 		/* the linear region may spread across several pages */
 		flen = min_t(unsigned int, flen, PAGE_SIZE - poff);
 
-		if (spd_fill_page(spd, page, flen, poff, skb))
+		if (spd_fill_page(spd, page, flen, poff, skb, linear))
 			return 1;
 
 		__segment_seek(&page, &poff, &plen, flen);
@@ -1418,7 +1431,7 @@ static int __skb_splice_bits(struct sk_b
 	if (__splice_segment(virt_to_page(skb->data),
 			     (unsigned long) skb->data & (PAGE_SIZE - 1),
 			     skb_headlen(skb),
-			     offset, len, skb, spd))
+			     offset, len, skb, spd, 1))
 		return 1;
 
 	/*
@@ -1428,7 +1441,7 @@ static int __skb_splice_bits(struct sk_b
 		const skb_frag_t *f = &skb_shinfo(skb)->frags[seg];
 
 		if (__splice_segment(f->page, f->page_offset, f->size,
-				     offset, len, skb, spd))
+				     offset, len, skb, spd, 0))
 			return 1;
 	}
 
@@ -1441,7 +1454,7 @@ static int __skb_splice_bits(struct sk_b
  * the frag list, if such a thing exists. We'd probably need to recurse to
  * handle that cleanly.
  */
-int skb_splice_bits(struct sk_buff *__skb, unsigned int offset,
+int skb_splice_bits(struct sk_buff *skb, unsigned int offset,
 		    struct pipe_inode_info *pipe, unsigned int tlen,
 		    unsigned int flags)
 {
@@ -1454,16 +1467,6 @@ int skb_splice_bits(struct sk_buff *__sk
 		.ops = &sock_pipe_buf_ops,
 		.spd_release = sock_spd_release,
 	};
-	struct sk_buff *skb;
-
-	/*
-	 * I'd love to avoid the clone here, but tcp_read_sock()
-	 * ignores reference counts and unconditonally kills the sk_buff
-	 * on return from the actor.
-	 */
-	skb = skb_clone(__skb, GFP_KERNEL);
-	if (unlikely(!skb))
-		return -ENOMEM;
 
 	/*
 	 * __skb_splice_bits() only fails if the output has no room left,
@@ -1487,15 +1490,9 @@ int skb_splice_bits(struct sk_buff *__sk
 	}
 
 done:
-	/*
-	 * drop our reference to the clone, the pipe consumption will
-	 * drop the rest.
-	 */
-	kfree_skb(skb);
-
 	if (spd.nr_pages) {
+		struct sock *sk = skb->sk;
 		int ret;
-		struct sock *sk = __skb->sk;
 
 		/*
 		 * Drop the socket lock, otherwise we have reverse