]>
Commit | Line | Data |
---|---|---|
fc44ef5a | 1 | // SPDX-License-Identifier: GPL-2.0-only |
192dc405 ED |
2 | /* |
3 | * Copyright 2018 Google Inc. | |
4 | * Author: Eric Dumazet (edumazet@google.com) | |
5 | * | |
6 | * Reference program demonstrating tcp mmap() usage, | |
7 | * and SO_RCVLOWAT hints for receiver. | |
8 | * | |
9 | * Note : NIC with header split is needed to use mmap() on TCP : | |
10 | * Each incoming frame must be a multiple of PAGE_SIZE bytes of TCP payload. | |
11 | * | |
12 | * How to use on loopback interface : | |
13 | * | |
14 | * ifconfig lo mtu 61512 # 15*4096 + 40 (ipv6 header) + 32 (TCP with TS option header) | |
15 | * tcp_mmap -s -z & | |
16 | * tcp_mmap -H ::1 -z | |
17 | * | |
18 | * Or leave default lo mtu, but use -M option to set TCP_MAXSEG option to (4096 + 12) | |
19 | * (4096 : page size on x86, 12: TCP TS option length) | |
20 | * tcp_mmap -s -z -M $((4096+12)) & | |
21 | * tcp_mmap -H ::1 -z -M $((4096+12)) | |
22 | * | |
23 | * Note: -z option on sender uses MSG_ZEROCOPY, which forces a copy when packets go through loopback interface. | |
24 | * We might use sendfile() instead, but really this test program is about mmap(), for receivers ;) | |
25 | * | |
26 | * $ ./tcp_mmap -s & # Without mmap() | |
27 | * $ for i in {1..4}; do ./tcp_mmap -H ::1 -z ; done | |
28 | * received 32768 MB (0 % mmap'ed) in 14.1157 s, 19.4732 Gbit | |
29 | * cpu usage user:0.057 sys:7.815, 240.234 usec per MB, 65531 c-switches | |
30 | * received 32768 MB (0 % mmap'ed) in 14.6833 s, 18.7204 Gbit | |
31 | * cpu usage user:0.043 sys:8.103, 248.596 usec per MB, 65524 c-switches | |
32 | * received 32768 MB (0 % mmap'ed) in 11.143 s, 24.6682 Gbit | |
33 | * cpu usage user:0.044 sys:6.576, 202.026 usec per MB, 65519 c-switches | |
34 | * received 32768 MB (0 % mmap'ed) in 14.9056 s, 18.4413 Gbit | |
35 | * cpu usage user:0.036 sys:8.193, 251.129 usec per MB, 65530 c-switches | |
36 | * $ kill %1 # kill tcp_mmap server | |
37 | * | |
38 | * $ ./tcp_mmap -s -z & # With mmap() | |
39 | * $ for i in {1..4}; do ./tcp_mmap -H ::1 -z ; done | |
40 | * received 32768 MB (99.9939 % mmap'ed) in 6.73792 s, 40.7956 Gbit | |
41 | * cpu usage user:0.045 sys:2.827, 87.6465 usec per MB, 65532 c-switches | |
42 | * received 32768 MB (99.9939 % mmap'ed) in 7.26732 s, 37.8238 Gbit | |
43 | * cpu usage user:0.037 sys:3.087, 95.3369 usec per MB, 65532 c-switches | |
44 | * received 32768 MB (99.9939 % mmap'ed) in 7.61661 s, 36.0893 Gbit | |
45 | * cpu usage user:0.046 sys:3.559, 110.016 usec per MB, 65529 c-switches | |
46 | * received 32768 MB (99.9939 % mmap'ed) in 7.43764 s, 36.9577 Gbit | |
47 | * cpu usage user:0.035 sys:3.467, 106.873 usec per MB, 65530 c-switches | |
192dc405 ED |
48 | */ |
49 | #define _GNU_SOURCE | |
50 | #include <pthread.h> | |
51 | #include <sys/types.h> | |
52 | #include <fcntl.h> | |
53 | #include <error.h> | |
54 | #include <sys/socket.h> | |
55 | #include <sys/mman.h> | |
56 | #include <sys/resource.h> | |
57 | #include <unistd.h> | |
58 | #include <string.h> | |
59 | #include <stdlib.h> | |
60 | #include <stdio.h> | |
61 | #include <errno.h> | |
62 | #include <time.h> | |
63 | #include <sys/time.h> | |
64 | #include <netinet/in.h> | |
192dc405 ED |
65 | #include <arpa/inet.h> |
66 | #include <poll.h> | |
aacb0c2e ED |
67 | #include <linux/tcp.h> |
68 | #include <assert.h> | |
192dc405 ED |
69 | |
70 | #ifndef MSG_ZEROCOPY | |
71 | #define MSG_ZEROCOPY 0x4000000 | |
72 | #endif | |
73 | ||
e698a237 | 74 | #define FILE_SZ (1ULL << 35) /* bytes the client sends per run (32 GiB) */ |
192dc405 ED |
75 | static int cfg_family = AF_INET6; /* socket family; selected with -4 / -6 */ |
76 | static socklen_t cfg_alen = sizeof(struct sockaddr_in6); /* sockaddr length matching cfg_family, passed to bind()/connect() */ | |
77 | static int cfg_port = 8787; /* TCP port; can be set with -p <integer> option */ | |
78 | ||
79 | static int rcvbuf; /* Default: autotuning. Can be set with -r <integer> option */ | |
80 | static int sndbuf; /* Default: autotuning. Can be set with -w <integer> option */ | |
81 | static int zflg; /* -z option: zero copy (MSG_ZEROCOPY for sender, mmap() for receiver) */ | |
82 | static int xflg; /* -x option: hash received data (simple xor, accumulated in htotal) */ | |
83 | static int keepflag; /* -k option: receiver shall keep all received file in memory (no munmap() calls). NOTE(review): only assigned in the visible code, never read - confirm */ | |
84 | ||
597b01ed ED |
85 | static size_t chunk_size = 512*1024; /* bytes per read()/send() and per zero-copy request; -C <integer> option */ |
86 | ||
87 | static size_t map_align; /* alignment of the receive mapping; -a <integer> option, 0 means pick a default in main() */ | |
192dc405 ED |
88 | |
/* Running XOR checksum of everything passed to hash_zone(). */
unsigned long htotal;

/*
 * Ask the CPU to start loading the cache line that holds @x.
 * This is only a hint: the builtin never faults on a bad address and
 * compiles to nothing on targets without a prefetch instruction.
 */
static inline void prefetch(const void *x)
{
	__builtin_prefetch(x);
}

/*
 * Fold @length bytes starting at @zone into the global htotal checksum.
 *
 * The bulk of the data is consumed eight longs at a time (XORed word-wise),
 * the remaining tail is XORed byte-wise into the low bits.  @zone may have
 * any alignment: words are loaded with memcpy() rather than by casting the
 * pointer, so partially filled receive buffers (buffer + offset) are safe.
 */
void hash_zone(void *zone, unsigned int length)
{
	const unsigned char *p = zone;
	unsigned long temp = htotal;

	while (length >= 8 * sizeof(long)) {
		unsigned long w[8];

		/* Warm the cache a few lines ahead of the current position. */
		prefetch(p + 384);
		memcpy(w, p, sizeof(w));
		temp ^= w[0] ^ w[1] ^ w[2] ^ w[3] ^ w[4] ^ w[5] ^ w[6] ^ w[7];
		p += sizeof(w);
		length -= sizeof(w);
	}
	while (length >= 1) {
		temp ^= *p++;
		length--;
	}
	htotal = temp;
}
122 | ||
597b01ed ED |
/*
 * Round @x up to the next multiple of @align_to.
 * @align_to must be a non-zero power of two; both arguments are evaluated
 * more than once, so they must be free of side effects.  The mask is built
 * in the type of @x so that a narrower unsigned @align_to cannot clear the
 * upper bits of a wider @x.
 */
#define ALIGN_UP(x, align_to)	\
	(((x) + ((align_to)-1)) & ~((__typeof__(x))((align_to)-1)))
/*
 * Pointer variant of ALIGN_UP(): returns @p advanced to the next
 * @ptr_align_to boundary, keeping the pointer type of @p.
 */
#define ALIGN_PTR_UP(p, ptr_align_to)	\
	((__typeof__(p))ALIGN_UP((unsigned long)(p), (unsigned long)(ptr_align_to)))
125 | ||
192dc405 ED |
126 | void *child_thread(void *arg) |
127 | { | |
128 | unsigned long total_mmap = 0, total = 0; | |
aacb0c2e | 129 | struct tcp_zerocopy_receive zc; |
192dc405 ED |
130 | unsigned long delta_usec; |
131 | int flags = MAP_SHARED; | |
132 | struct timeval t0, t1; | |
133 | char *buffer = NULL; | |
597b01ed | 134 | void *raddr = NULL; |
aacb0c2e | 135 | void *addr = NULL; |
192dc405 ED |
136 | double throughput; |
137 | struct rusage ru; | |
138 | int lu, fd; | |
139 | ||
140 | fd = (int)(unsigned long)arg; | |
141 | ||
142 | gettimeofday(&t0, NULL); | |
143 | ||
144 | fcntl(fd, F_SETFL, O_NDELAY); | |
145 | buffer = malloc(chunk_size); | |
146 | if (!buffer) { | |
147 | perror("malloc"); | |
148 | goto error; | |
149 | } | |
aacb0c2e | 150 | if (zflg) { |
597b01ed ED |
151 | raddr = mmap(NULL, chunk_size + map_align, PROT_READ, flags, fd, 0); |
152 | if (raddr == (void *)-1) { | |
153 | perror("mmap"); | |
aacb0c2e | 154 | zflg = 0; |
597b01ed ED |
155 | } else { |
156 | addr = ALIGN_PTR_UP(raddr, map_align); | |
157 | } | |
aacb0c2e | 158 | } |
192dc405 ED |
159 | while (1) { |
160 | struct pollfd pfd = { .fd = fd, .events = POLLIN, }; | |
161 | int sub; | |
162 | ||
163 | poll(&pfd, 1, 10000); | |
164 | if (zflg) { | |
aacb0c2e ED |
165 | socklen_t zc_len = sizeof(zc); |
166 | int res; | |
167 | ||
bf5525f3 | 168 | memset(&zc, 0, sizeof(zc)); |
e698a237 | 169 | zc.address = (__u64)((unsigned long)addr); |
aacb0c2e | 170 | zc.length = chunk_size; |
bf5525f3 | 171 | |
aacb0c2e ED |
172 | res = getsockopt(fd, IPPROTO_TCP, TCP_ZEROCOPY_RECEIVE, |
173 | &zc, &zc_len); | |
174 | if (res == -1) | |
192dc405 | 175 | break; |
aacb0c2e ED |
176 | |
177 | if (zc.length) { | |
178 | assert(zc.length <= chunk_size); | |
179 | total_mmap += zc.length; | |
180 | if (xflg) | |
181 | hash_zone(addr, zc.length); | |
182 | total += zc.length; | |
192dc405 | 183 | } |
aacb0c2e ED |
184 | if (zc.recv_skip_hint) { |
185 | assert(zc.recv_skip_hint <= chunk_size); | |
186 | lu = read(fd, buffer, zc.recv_skip_hint); | |
187 | if (lu > 0) { | |
188 | if (xflg) | |
189 | hash_zone(buffer, lu); | |
190 | total += lu; | |
191 | } | |
192dc405 ED |
192 | } |
193 | continue; | |
194 | } | |
192dc405 ED |
195 | sub = 0; |
196 | while (sub < chunk_size) { | |
197 | lu = read(fd, buffer + sub, chunk_size - sub); | |
198 | if (lu == 0) | |
199 | goto end; | |
200 | if (lu < 0) | |
201 | break; | |
202 | if (xflg) | |
203 | hash_zone(buffer + sub, lu); | |
204 | total += lu; | |
205 | sub += lu; | |
206 | } | |
207 | } | |
208 | end: | |
209 | gettimeofday(&t1, NULL); | |
210 | delta_usec = (t1.tv_sec - t0.tv_sec) * 1000000 + t1.tv_usec - t0.tv_usec; | |
211 | ||
212 | throughput = 0; | |
213 | if (delta_usec) | |
214 | throughput = total * 8.0 / (double)delta_usec / 1000.0; | |
215 | getrusage(RUSAGE_THREAD, &ru); | |
216 | if (total > 1024*1024) { | |
217 | unsigned long total_usec; | |
218 | unsigned long mb = total >> 20; | |
219 | total_usec = 1000000*ru.ru_utime.tv_sec + ru.ru_utime.tv_usec + | |
220 | 1000000*ru.ru_stime.tv_sec + ru.ru_stime.tv_usec; | |
221 | printf("received %lg MB (%lg %% mmap'ed) in %lg s, %lg Gbit\n" | |
222 | " cpu usage user:%lg sys:%lg, %lg usec per MB, %lu c-switches\n", | |
223 | total / (1024.0 * 1024.0), | |
224 | 100.0*total_mmap/total, | |
225 | (double)delta_usec / 1000000.0, | |
226 | throughput, | |
227 | (double)ru.ru_utime.tv_sec + (double)ru.ru_utime.tv_usec / 1000000.0, | |
228 | (double)ru.ru_stime.tv_sec + (double)ru.ru_stime.tv_usec / 1000000.0, | |
229 | (double)total_usec/mb, | |
230 | ru.ru_nvcsw); | |
231 | } | |
232 | error: | |
233 | free(buffer); | |
234 | close(fd); | |
aacb0c2e | 235 | if (zflg) |
597b01ed | 236 | munmap(raddr, chunk_size + map_align); |
192dc405 ED |
237 | pthread_exit(0); |
238 | } | |
239 | ||
240 | static void apply_rcvsnd_buf(int fd) | |
241 | { | |
242 | if (rcvbuf && setsockopt(fd, SOL_SOCKET, | |
243 | SO_RCVBUF, &rcvbuf, sizeof(rcvbuf)) == -1) { | |
244 | perror("setsockopt SO_RCVBUF"); | |
245 | } | |
246 | ||
247 | if (sndbuf && setsockopt(fd, SOL_SOCKET, | |
248 | SO_SNDBUF, &sndbuf, sizeof(sndbuf)) == -1) { | |
249 | perror("setsockopt SO_SNDBUF"); | |
250 | } | |
251 | } | |
252 | ||
253 | ||
254 | static void setup_sockaddr(int domain, const char *str_addr, | |
255 | struct sockaddr_storage *sockaddr) | |
256 | { | |
257 | struct sockaddr_in6 *addr6 = (void *) sockaddr; | |
258 | struct sockaddr_in *addr4 = (void *) sockaddr; | |
259 | ||
260 | switch (domain) { | |
261 | case PF_INET: | |
262 | memset(addr4, 0, sizeof(*addr4)); | |
263 | addr4->sin_family = AF_INET; | |
264 | addr4->sin_port = htons(cfg_port); | |
265 | if (str_addr && | |
266 | inet_pton(AF_INET, str_addr, &(addr4->sin_addr)) != 1) | |
267 | error(1, 0, "ipv4 parse error: %s", str_addr); | |
268 | break; | |
269 | case PF_INET6: | |
270 | memset(addr6, 0, sizeof(*addr6)); | |
271 | addr6->sin6_family = AF_INET6; | |
272 | addr6->sin6_port = htons(cfg_port); | |
273 | if (str_addr && | |
274 | inet_pton(AF_INET6, str_addr, &(addr6->sin6_addr)) != 1) | |
275 | error(1, 0, "ipv6 parse error: %s", str_addr); | |
276 | break; | |
277 | default: | |
278 | error(1, 0, "illegal domain"); | |
279 | } | |
280 | } | |
281 | ||
282 | static void do_accept(int fdlisten) | |
283 | { | |
20021578 | 284 | pthread_attr_t attr; |
a8472417 | 285 | int rcvlowat; |
20021578 ED |
286 | |
287 | pthread_attr_init(&attr); | |
288 | pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED); | |
289 | ||
a8472417 | 290 | rcvlowat = chunk_size; |
192dc405 | 291 | if (setsockopt(fdlisten, SOL_SOCKET, SO_RCVLOWAT, |
a8472417 | 292 | &rcvlowat, sizeof(rcvlowat)) == -1) { |
192dc405 ED |
293 | perror("setsockopt SO_RCVLOWAT"); |
294 | } | |
295 | ||
296 | apply_rcvsnd_buf(fdlisten); | |
297 | ||
298 | while (1) { | |
299 | struct sockaddr_in addr; | |
300 | socklen_t addrlen = sizeof(addr); | |
301 | pthread_t th; | |
302 | int fd, res; | |
303 | ||
304 | fd = accept(fdlisten, (struct sockaddr *)&addr, &addrlen); | |
305 | if (fd == -1) { | |
306 | perror("accept"); | |
307 | continue; | |
308 | } | |
20021578 | 309 | res = pthread_create(&th, &attr, child_thread, |
192dc405 ED |
310 | (void *)(unsigned long)fd); |
311 | if (res) { | |
312 | errno = res; | |
313 | perror("pthread_create"); | |
314 | close(fd); | |
315 | } | |
316 | } | |
317 | } | |
318 | ||
597b01ed ED |
/* Each receiver thread should reserve a vma big enough to avoid
 * spinlock collisions in ptl locks.
 * That size is 2MB on x86_64, and is exported in /proc/meminfo.
 *
 * Returns the "Hugepagesize:" value from /proc/meminfo converted to bytes,
 * or 0 if the file cannot be opened or holds no such line.
 */
static unsigned long default_huge_page_size(void)
{
	unsigned long hps = 0;
	char *text = NULL;
	size_t cap = 0;
	FILE *meminfo;

	meminfo = fopen("/proc/meminfo", "r");
	if (meminfo == NULL)
		return 0;

	for (;;) {
		if (getline(&text, &cap, meminfo) <= 0)
			break;
		if (sscanf(text, "Hugepagesize: %lu kB", &hps) != 1)
			continue;
		hps <<= 10;	/* kB -> bytes */
		break;
	}

	fclose(meminfo);
	free(text);	/* getline() allocated (and possibly grew) the buffer */
	return hps;
}
342 | ||
192dc405 ED |
343 | int main(int argc, char *argv[]) |
344 | { | |
345 | struct sockaddr_storage listenaddr, addr; | |
346 | unsigned int max_pacing_rate = 0; | |
e698a237 | 347 | size_t total = 0; |
192dc405 ED |
348 | char *host = NULL; |
349 | int fd, c, on = 1; | |
350 | char *buffer; | |
351 | int sflg = 0; | |
352 | int mss = 0; | |
353 | ||
597b01ed | 354 | while ((c = getopt(argc, argv, "46p:svr:w:H:zxkP:M:C:a:")) != -1) { |
192dc405 ED |
355 | switch (c) { |
356 | case '4': | |
357 | cfg_family = PF_INET; | |
358 | cfg_alen = sizeof(struct sockaddr_in); | |
359 | break; | |
360 | case '6': | |
361 | cfg_family = PF_INET6; | |
362 | cfg_alen = sizeof(struct sockaddr_in6); | |
363 | break; | |
364 | case 'p': | |
365 | cfg_port = atoi(optarg); | |
366 | break; | |
367 | case 'H': | |
368 | host = optarg; | |
369 | break; | |
370 | case 's': /* server : listen for incoming connections */ | |
371 | sflg++; | |
372 | break; | |
373 | case 'r': | |
374 | rcvbuf = atoi(optarg); | |
375 | break; | |
376 | case 'w': | |
377 | sndbuf = atoi(optarg); | |
378 | break; | |
379 | case 'z': | |
380 | zflg = 1; | |
381 | break; | |
382 | case 'M': | |
383 | mss = atoi(optarg); | |
384 | break; | |
385 | case 'x': | |
386 | xflg = 1; | |
387 | break; | |
388 | case 'k': | |
389 | keepflag = 1; | |
390 | break; | |
391 | case 'P': | |
392 | max_pacing_rate = atoi(optarg) ; | |
393 | break; | |
597b01ed ED |
394 | case 'C': |
395 | chunk_size = atol(optarg); | |
396 | break; | |
397 | case 'a': | |
398 | map_align = atol(optarg); | |
399 | break; | |
192dc405 ED |
400 | default: |
401 | exit(1); | |
402 | } | |
403 | } | |
597b01ed ED |
404 | if (!map_align) { |
405 | map_align = default_huge_page_size(); | |
406 | /* if really /proc/meminfo is not helping, | |
407 | * we use the default x86_64 hugepagesize. | |
408 | */ | |
409 | if (!map_align) | |
410 | map_align = 2*1024*1024; | |
411 | } | |
192dc405 ED |
412 | if (sflg) { |
413 | int fdlisten = socket(cfg_family, SOCK_STREAM, 0); | |
414 | ||
415 | if (fdlisten == -1) { | |
416 | perror("socket"); | |
417 | exit(1); | |
418 | } | |
419 | apply_rcvsnd_buf(fdlisten); | |
420 | setsockopt(fdlisten, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)); | |
421 | ||
422 | setup_sockaddr(cfg_family, host, &listenaddr); | |
423 | ||
424 | if (mss && | |
aacb0c2e ED |
425 | setsockopt(fdlisten, IPPROTO_TCP, TCP_MAXSEG, |
426 | &mss, sizeof(mss)) == -1) { | |
192dc405 ED |
427 | perror("setsockopt TCP_MAXSEG"); |
428 | exit(1); | |
429 | } | |
430 | if (bind(fdlisten, (const struct sockaddr *)&listenaddr, cfg_alen) == -1) { | |
431 | perror("bind"); | |
432 | exit(1); | |
433 | } | |
434 | if (listen(fdlisten, 128) == -1) { | |
435 | perror("listen"); | |
436 | exit(1); | |
437 | } | |
438 | do_accept(fdlisten); | |
439 | } | |
440 | buffer = mmap(NULL, chunk_size, PROT_READ | PROT_WRITE, | |
441 | MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); | |
442 | if (buffer == (char *)-1) { | |
443 | perror("mmap"); | |
444 | exit(1); | |
445 | } | |
446 | ||
258fe208 | 447 | fd = socket(cfg_family, SOCK_STREAM, 0); |
192dc405 ED |
448 | if (fd == -1) { |
449 | perror("socket"); | |
450 | exit(1); | |
451 | } | |
452 | apply_rcvsnd_buf(fd); | |
453 | ||
454 | setup_sockaddr(cfg_family, host, &addr); | |
455 | ||
456 | if (mss && | |
aacb0c2e | 457 | setsockopt(fd, IPPROTO_TCP, TCP_MAXSEG, &mss, sizeof(mss)) == -1) { |
192dc405 ED |
458 | perror("setsockopt TCP_MAXSEG"); |
459 | exit(1); | |
460 | } | |
461 | if (connect(fd, (const struct sockaddr *)&addr, cfg_alen) == -1) { | |
462 | perror("connect"); | |
463 | exit(1); | |
464 | } | |
465 | if (max_pacing_rate && | |
466 | setsockopt(fd, SOL_SOCKET, SO_MAX_PACING_RATE, | |
467 | &max_pacing_rate, sizeof(max_pacing_rate)) == -1) | |
468 | perror("setsockopt SO_MAX_PACING_RATE"); | |
469 | ||
470 | if (zflg && setsockopt(fd, SOL_SOCKET, SO_ZEROCOPY, | |
471 | &on, sizeof(on)) == -1) { | |
472 | perror("setsockopt SO_ZEROCOPY, (-z option disabled)"); | |
473 | zflg = 0; | |
474 | } | |
475 | while (total < FILE_SZ) { | |
e698a237 | 476 | ssize_t wr = FILE_SZ - total; |
192dc405 ED |
477 | |
478 | if (wr > chunk_size) | |
479 | wr = chunk_size; | |
480 | /* Note : we just want to fill the pipe with 0 bytes */ | |
481 | wr = send(fd, buffer, wr, zflg ? MSG_ZEROCOPY : 0); | |
482 | if (wr <= 0) | |
483 | break; | |
484 | total += wr; | |
485 | } | |
486 | close(fd); | |
487 | munmap(buffer, chunk_size); | |
488 | return 0; | |
489 | } |