src/journal/mmap-cache.c
/***
  This file is part of systemd.

  Copyright 2012 Lennart Poettering

  systemd is free software; you can redistribute it and/or modify it
  under the terms of the GNU Lesser General Public License as published by
  the Free Software Foundation; either version 2.1 of the License, or
  (at your option) any later version.

  systemd is distributed in the hope that it will be useful, but
  WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  Lesser General Public License for more details.

  You should have received a copy of the GNU Lesser General Public License
  along with systemd; If not, see <http://www.gnu.org/licenses/>.
***/

#include <errno.h>
#include <stdlib.h>
#include <sys/mman.h>

#include "alloc-util.h"
#include "fd-util.h"
#include "hashmap.h"
#include "list.h"
#include "log.h"
#include "macro.h"
#include "mmap-cache.h"
#include "sigbus.h"
#include "util.h"

typedef struct Window Window;
typedef struct Context Context;
typedef struct FileDescriptor FileDescriptor;

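/* The cache is organized around three objects: a Window is a single mmap'd
 * region of a file; a Context is one of up to MMAP_CACHE_MAX_CONTEXTS cursor
 * slots that remembers the window it last used; a FileDescriptor collects all
 * windows of one file and records whether that file triggered a SIGBUS.
 * Windows that no context references and that are not marked keep_always sit
 * on an unused list, from which they can be recycled or evicted. */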
struct Window {
        MMapCache *cache;

        bool invalidated:1;
        bool keep_always:1;
        bool in_unused:1;

        int prot;
        void *ptr;
        uint64_t offset;
        size_t size;

        FileDescriptor *fd;

        LIST_FIELDS(Window, by_fd);
        LIST_FIELDS(Window, unused);

        LIST_HEAD(Context, contexts);
};

struct Context {
        MMapCache *cache;
        unsigned id;
        Window *window;

        LIST_FIELDS(Context, by_window);
};

struct FileDescriptor {
        MMapCache *cache;
        int fd;
        bool sigbus;
        LIST_HEAD(Window, windows);
};

struct MMapCache {
        int n_ref;
        unsigned n_windows;

        unsigned n_hit, n_missed;

        Hashmap *fds;
        Context *contexts[MMAP_CACHE_MAX_CONTEXTS];

        LIST_HEAD(Window, unused);
        Window *last_unused;
};

#define WINDOWS_MIN 64

#ifdef ENABLE_DEBUG_MMAP_CACHE
/* Tiny windows increase mmap activity and the chance of exposing unsafe use. */
# define WINDOW_SIZE (page_size())
#else
# define WINDOW_SIZE (8ULL*1024ULL*1024ULL)
#endif

MMapCache* mmap_cache_new(void) {
        MMapCache *m;

        m = new0(MMapCache, 1);
        if (!m)
                return NULL;

        m->n_ref = 1;
        return m;
}

MMapCache* mmap_cache_ref(MMapCache *m) {
        assert(m);
        assert(m->n_ref > 0);

        m->n_ref++;
        return m;
}

static void window_unlink(Window *w) {
        Context *c;

        assert(w);

        if (w->ptr)
                munmap(w->ptr, w->size);

        if (w->fd)
                LIST_REMOVE(by_fd, w->fd->windows, w);

        if (w->in_unused) {
                if (w->cache->last_unused == w)
                        w->cache->last_unused = w->unused_prev;

                LIST_REMOVE(unused, w->cache->unused, w);
        }

        LIST_FOREACH(by_window, c, w->contexts) {
                assert(c->window == w);
                c->window = NULL;
        }
}

static void window_invalidate(Window *w) {
        assert(w);

        if (w->invalidated)
                return;

        /* Replace the window with anonymous pages. This is useful
         * when we hit a SIGBUS and want to make sure the file cannot
         * trigger any further SIGBUS, possibly overrunning the sigbus
         * queue. */

        assert_se(mmap(w->ptr, w->size, w->prot, MAP_PRIVATE|MAP_ANONYMOUS|MAP_FIXED, -1, 0) == w->ptr);
        w->invalidated = true;
}

static void window_free(Window *w) {
        assert(w);

        window_unlink(w);
        w->cache->n_windows--;
        free(w);
}

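/* A window "matches" a request if it maps the same file descriptor with the
 * same protection and fully contains the requested [offset, offset + size)
 * range. */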
_pure_ static bool window_matches(Window *w, int fd, int prot, uint64_t offset, size_t size) {
        assert(w);
        assert(fd >= 0);
        assert(size > 0);

        return
                w->fd &&
                fd == w->fd->fd &&
                prot == w->prot &&
                offset >= w->offset &&
                offset + size <= w->offset + w->size;
}

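/* Pick a Window structure for a new mapping: allocate a fresh one while we
 * are still below WINDOWS_MIN windows (or have nothing to recycle), otherwise
 * recycle the least recently used entry from the unused list, unmapping its
 * old mapping first. */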
static Window *window_add(MMapCache *m, FileDescriptor *fd, int prot, bool keep_always, uint64_t offset, size_t size, void *ptr) {
        Window *w;

        assert(m);
        assert(fd);

        if (!m->last_unused || m->n_windows <= WINDOWS_MIN) {

                /* Allocate a new window */
                w = new0(Window, 1);
                if (!w)
                        return NULL;
                m->n_windows++;
        } else {

                /* Reuse an existing one */
                w = m->last_unused;
                window_unlink(w);
                zero(*w);
        }

        w->cache = m;
        w->fd = fd;
        w->prot = prot;
        w->keep_always = keep_always;
        w->offset = offset;
        w->size = size;
        w->ptr = ptr;

        LIST_PREPEND(by_fd, fd->windows, w);

        return w;
}

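/* Drop the context's reference to its current window. If no other context
 * references the window and it is not marked keep_always, it goes to the
 * head of the unused list for later reuse (or is freed right away in debug
 * builds, so that stale pointers fault immediately). */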
static void context_detach_window(Context *c) {
        Window *w;

        assert(c);

        if (!c->window)
                return;

        w = c->window;
        c->window = NULL;
        LIST_REMOVE(by_window, w->contexts, c);

        if (!w->contexts && !w->keep_always) {
                /* Not used anymore? */
#ifdef ENABLE_DEBUG_MMAP_CACHE
                /* Unmap unused windows immediately to expose use-after-unmap
                 * by SIGSEGV. */
                window_free(w);
#else
                LIST_PREPEND(unused, c->cache->unused, w);
                if (!c->cache->last_unused)
                        c->cache->last_unused = w;

                w->in_unused = true;
#endif
        }
}

static void context_attach_window(Context *c, Window *w) {
        assert(c);
        assert(w);

        if (c->window == w)
                return;

        context_detach_window(c);

        if (w->in_unused) {
                /* Used again? */
                LIST_REMOVE(unused, c->cache->unused, w);
                if (c->cache->last_unused == w)
                        c->cache->last_unused = w->unused_prev;

                w->in_unused = false;
        }

        c->window = w;
        LIST_PREPEND(by_window, w->contexts, c);
}

static Context *context_add(MMapCache *m, unsigned id) {
        Context *c;

        assert(m);

        c = m->contexts[id];
        if (c)
                return c;

        c = new0(Context, 1);
        if (!c)
                return NULL;

        c->cache = m;
        c->id = id;

        assert(!m->contexts[id]);
        m->contexts[id] = c;

        return c;
}

static void context_free(Context *c) {
        assert(c);

        context_detach_window(c);

        if (c->cache) {
                assert(c->cache->contexts[c->id] == c);
                c->cache->contexts[c->id] = NULL;
        }

        free(c);
}

static void fd_free(FileDescriptor *f) {
        assert(f);

        while (f->windows)
                window_free(f->windows);

        if (f->cache)
                assert_se(hashmap_remove(f->cache->fds, FD_TO_PTR(f->fd)));

        free(f);
}

static FileDescriptor* fd_add(MMapCache *m, int fd) {
        FileDescriptor *f;
        int r;

        assert(m);
        assert(fd >= 0);

        f = hashmap_get(m->fds, FD_TO_PTR(fd));
        if (f)
                return f;

        r = hashmap_ensure_allocated(&m->fds, NULL);
        if (r < 0)
                return NULL;

        f = new0(FileDescriptor, 1);
        if (!f)
                return NULL;

        f->cache = m;
        f->fd = fd;

        r = hashmap_put(m->fds, FD_TO_PTR(fd), f);
        if (r < 0) {
                free(f);
                return NULL;
        }

        return f;
}

static void mmap_cache_free(MMapCache *m) {
        FileDescriptor *f;
        int i;

        assert(m);

        for (i = 0; i < MMAP_CACHE_MAX_CONTEXTS; i++)
                if (m->contexts[i])
                        context_free(m->contexts[i]);

        while ((f = hashmap_first(m->fds)))
                fd_free(f);

        hashmap_free(m->fds);

        while (m->unused)
                window_free(m->unused);

        free(m);
}

MMapCache* mmap_cache_unref(MMapCache *m) {

        if (!m)
                return NULL;

        assert(m->n_ref > 0);

        m->n_ref--;
        if (m->n_ref == 0)
                mmap_cache_free(m);

        return NULL;
}

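/* Evict the least recently used unused window to release address space.
 * Returns 1 if a window was freed, 0 if there was nothing to evict. */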
static int make_room(MMapCache *m) {
        assert(m);

        if (!m->last_unused)
                return 0;

        window_free(m->last_unused);
        return 1;
}

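/* Fast path: check whether the window currently attached to this context
 * already covers the request. Returns 1 on a hit, 0 if the caller should
 * keep looking, and -EIO if the file already triggered a SIGBUS. */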
static int try_context(
                MMapCache *m,
                int fd,
                int prot,
                unsigned context,
                bool keep_always,
                uint64_t offset,
                size_t size,
                void **ret) {

        Context *c;

        assert(m);
        assert(m->n_ref > 0);
        assert(fd >= 0);
        assert(size > 0);
        assert(ret);

        c = m->contexts[context];
        if (!c)
                return 0;

        assert(c->id == context);

        if (!c->window)
                return 0;

        if (!window_matches(c->window, fd, prot, offset, size)) {

                /* Drop the reference to the window, since it's unnecessary now */
                context_detach_window(c);
                return 0;
        }

        if (c->window->fd->sigbus)
                return -EIO;

        c->window->keep_always = c->window->keep_always || keep_always;

        *ret = (uint8_t*) c->window->ptr + (offset - c->window->offset);
        return 1;
}

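/* Slower path: scan all existing windows of this file descriptor for one
 * that covers the request and attach it to the context. Returns 1 on a hit,
 * 0 if nothing matches, -EIO after a SIGBUS, -ENOMEM on allocation failure. */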
static int find_mmap(
                MMapCache *m,
                int fd,
                int prot,
                unsigned context,
                bool keep_always,
                uint64_t offset,
                size_t size,
                void **ret) {

        FileDescriptor *f;
        Window *w;
        Context *c;

        assert(m);
        assert(m->n_ref > 0);
        assert(fd >= 0);
        assert(size > 0);

        f = hashmap_get(m->fds, FD_TO_PTR(fd));
        if (!f)
                return 0;

        assert(f->fd == fd);

        if (f->sigbus)
                return -EIO;

        LIST_FOREACH(by_fd, w, f->windows)
                if (window_matches(w, fd, prot, offset, size))
                        break;

        if (!w)
                return 0;

        c = context_add(m, context);
        if (!c)
                return -ENOMEM;

        context_attach_window(c, w);
        w->keep_always = w->keep_always || keep_always;

        *ret = (uint8_t*) w->ptr + (offset - w->offset);
        return 1;
}

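/* mmap() with retry: on ENOMEM, evict one unused window and try again, until
 * the mapping succeeds or there is nothing left to evict. */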
static int mmap_try_harder(MMapCache *m, void *addr, int fd, int prot, int flags, uint64_t offset, size_t size, void **res) {
        void *ptr;

        assert(m);
        assert(fd >= 0);
        assert(res);

        for (;;) {
                int r;

                ptr = mmap(addr, size, prot, flags, fd, offset);
                if (ptr != MAP_FAILED)
                        break;
                if (errno != ENOMEM)
                        return negative_errno();

                r = make_room(m);
                if (r < 0)
                        return r;
                if (r == 0)
                        return -ENOMEM;
        }

        *res = ptr;
        return 0;
}

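/* No existing window covers the request: create a new one. The mapped range
 * is page-aligned, grown to at least WINDOW_SIZE (roughly centered on the
 * requested range), and clamped to the file size when stat data is given. */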
static int add_mmap(
                MMapCache *m,
                int fd,
                int prot,
                unsigned context,
                bool keep_always,
                uint64_t offset,
                size_t size,
                struct stat *st,
                void **ret) {

        uint64_t woffset, wsize;
        Context *c;
        FileDescriptor *f;
        Window *w;
        void *d;
        int r;

        assert(m);
        assert(m->n_ref > 0);
        assert(fd >= 0);
        assert(size > 0);
        assert(ret);

        woffset = offset & ~((uint64_t) page_size() - 1ULL);
        wsize = size + (offset - woffset);
        wsize = PAGE_ALIGN(wsize);

        if (wsize < WINDOW_SIZE) {
                uint64_t delta;

                delta = PAGE_ALIGN((WINDOW_SIZE - wsize) / 2);

                if (delta > offset)
                        woffset = 0;
                else
                        woffset -= delta;

                wsize = WINDOW_SIZE;
        }

        if (st) {
                /* Memory maps that are larger than the files
                   underneath have undefined behavior. Hence, clamp
                   things to the file size if we know it */

                if (woffset >= (uint64_t) st->st_size)
                        return -EADDRNOTAVAIL;

                if (woffset + wsize > (uint64_t) st->st_size)
                        wsize = PAGE_ALIGN(st->st_size - woffset);
        }

        r = mmap_try_harder(m, NULL, fd, prot, MAP_SHARED, woffset, wsize, &d);
        if (r < 0)
                return r;

        c = context_add(m, context);
        if (!c)
                goto outofmem;

        f = fd_add(m, fd);
        if (!f)
                goto outofmem;

        w = window_add(m, f, prot, keep_always, woffset, wsize, d);
        if (!w)
                goto outofmem;

        context_detach_window(c);
        c->window = w;
        LIST_PREPEND(by_window, w->contexts, c);

        *ret = (uint8_t*) w->ptr + (offset - w->offset);
        return 1;

outofmem:
        (void) munmap(d, wsize);
        return -ENOMEM;
}

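/* The public lookup: first try the window currently attached to the context,
 * then any existing window for this fd, and only then create a new mapping. */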
int mmap_cache_get(
                MMapCache *m,
                int fd,
                int prot,
                unsigned context,
                bool keep_always,
                uint64_t offset,
                size_t size,
                struct stat *st,
                void **ret) {

        int r;

        assert(m);
        assert(m->n_ref > 0);
        assert(fd >= 0);
        assert(size > 0);
        assert(ret);
        assert(context < MMAP_CACHE_MAX_CONTEXTS);

        /* Check whether the current context is the right one already */
        r = try_context(m, fd, prot, context, keep_always, offset, size, ret);
        if (r != 0) {
                m->n_hit++;
                return r;
        }

        /* Search for a matching mmap */
        r = find_mmap(m, fd, prot, context, keep_always, offset, size, ret);
        if (r != 0) {
                m->n_hit++;
                return r;
        }

        m->n_missed++;

        /* Create a new mmap */
        return add_mmap(m, fd, prot, context, keep_always, offset, size, st, ret);
}

unsigned mmap_cache_get_hit(MMapCache *m) {
        assert(m);

        return m->n_hit;
}

unsigned mmap_cache_get_missed(MMapCache *m) {
        assert(m);

        return m->n_missed;
}

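/* Drain the queue of fault addresses collected by the SIGBUS handler, mark
 * the owning file descriptors, and remap all their windows to anonymous
 * memory so that no page of those files can fault again. Aborts if a fault
 * address cannot be matched to any window we manage. */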
static void mmap_cache_process_sigbus(MMapCache *m) {
        bool found = false;
        FileDescriptor *f;
        Iterator i;
        int r;

        assert(m);

        /* Iterate through all triggered pages and mark their files as
         * invalidated */
        for (;;) {
                bool ours;
                void *addr;

                r = sigbus_pop(&addr);
                if (_likely_(r == 0))
                        break;
                if (r < 0) {
                        log_error_errno(r, "SIGBUS handling failed: %m");
                        abort();
                }

                ours = false;
                HASHMAP_FOREACH(f, m->fds, i) {
                        Window *w;

                        LIST_FOREACH(by_fd, w, f->windows) {
                                if ((uint8_t*) addr >= (uint8_t*) w->ptr &&
                                    (uint8_t*) addr < (uint8_t*) w->ptr + w->size) {
                                        found = ours = f->sigbus = true;
                                        break;
                                }
                        }

                        if (ours)
                                break;
                }

                /* Didn't find a matching window, give up */
                if (!ours) {
                        log_error("Unknown SIGBUS page, aborting.");
                        abort();
                }
        }

        /* The list of triggered pages is now empty. Now, let's remap
         * all windows of the triggered file to anonymous maps, so
         * that no page of the file in question is triggered again, so
         * that we can be sure not to hit the queue size limit. */
        if (_likely_(!found))
                return;

        HASHMAP_FOREACH(f, m->fds, i) {
                Window *w;

                if (!f->sigbus)
                        continue;

                LIST_FOREACH(by_fd, w, f->windows)
                        window_invalidate(w);
        }
}

bool mmap_cache_got_sigbus(MMapCache *m, int fd) {
        FileDescriptor *f;

        assert(m);
        assert(fd >= 0);

        mmap_cache_process_sigbus(m);

        f = hashmap_get(m->fds, FD_TO_PTR(fd));
        if (!f)
                return false;

        return f->sigbus;
}

void mmap_cache_close_fd(MMapCache *m, int fd) {
        FileDescriptor *f;

        assert(m);
        assert(fd >= 0);

        /* Make sure that any queued SIGBUS are first dispatched, so
         * that we don't end up with a SIGBUS entry we cannot relate
         * to any existing memory map */

        mmap_cache_process_sigbus(m);

        f = hashmap_get(m->fds, FD_TO_PTR(fd));
        if (!f)
                return;

        fd_free(f);
}