]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/journal/mmap-cache.c
Merge pull request #2138 from stefwalter/journal-combine
[thirdparty/systemd.git] / src / journal / mmap-cache.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4 This file is part of systemd.
5
6 Copyright 2012 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <errno.h>
23 #include <stdlib.h>
24 #include <sys/mman.h>
25
26 #include "alloc-util.h"
27 #include "fd-util.h"
28 #include "hashmap.h"
29 #include "list.h"
30 #include "log.h"
31 #include "macro.h"
32 #include "mmap-cache.h"
33 #include "sigbus.h"
34 #include "util.h"
35
/* Forward declarations of the bookkeeping types used only inside this file. */
typedef struct FileDescriptor FileDescriptor;
typedef struct Window Window;
typedef struct Context Context;
39
40 struct Window {
41 MMapCache *cache;
42
43 bool invalidated:1;
44 bool keep_always:1;
45 bool in_unused:1;
46
47 int prot;
48 void *ptr;
49 uint64_t offset;
50 size_t size;
51
52 FileDescriptor *fd;
53
54 LIST_FIELDS(Window, by_fd);
55 LIST_FIELDS(Window, unused);
56
57 LIST_HEAD(Context, contexts);
58 };
59
60 struct Context {
61 MMapCache *cache;
62 unsigned id;
63 Window *window;
64
65 LIST_FIELDS(Context, by_window);
66 };
67
68 struct FileDescriptor {
69 MMapCache *cache;
70 int fd;
71 bool sigbus;
72 LIST_HEAD(Window, windows);
73 };
74
75 struct MMapCache {
76 int n_ref;
77 unsigned n_windows;
78
79 unsigned n_hit, n_missed;
80
81 Hashmap *fds;
82 Context *contexts[MMAP_CACHE_MAX_CONTEXTS];
83
84 LIST_HEAD(Window, unused);
85 Window *last_unused;
86 };
87
/* While the cache holds at most this many windows, window_add() allocates a
 * fresh window instead of recycling an unused one. */
#define WINDOWS_MIN 64

/* Minimum size of a newly created window. */
#if defined(ENABLE_DEBUG_MMAP_CACHE)
/* Tiny windows increase mmap activity and the chance of exposing unsafe use. */
#  define WINDOW_SIZE (page_size())
#else
#  define WINDOW_SIZE (8ULL*1024ULL*1024ULL)
#endif
96
97 MMapCache* mmap_cache_new(void) {
98 MMapCache *m;
99
100 m = new0(MMapCache, 1);
101 if (!m)
102 return NULL;
103
104 m->n_ref = 1;
105 return m;
106 }
107
108 MMapCache* mmap_cache_ref(MMapCache *m) {
109 assert(m);
110 assert(m->n_ref > 0);
111
112 m->n_ref ++;
113 return m;
114 }
115
116 static void window_unlink(Window *w) {
117 Context *c;
118
119 assert(w);
120
121 if (w->ptr)
122 munmap(w->ptr, w->size);
123
124 if (w->fd)
125 LIST_REMOVE(by_fd, w->fd->windows, w);
126
127 if (w->in_unused) {
128 if (w->cache->last_unused == w)
129 w->cache->last_unused = w->unused_prev;
130
131 LIST_REMOVE(unused, w->cache->unused, w);
132 }
133
134 LIST_FOREACH(by_window, c, w->contexts) {
135 assert(c->window == w);
136 c->window = NULL;
137 }
138 }
139
140 static void window_invalidate(Window *w) {
141 assert(w);
142
143 if (w->invalidated)
144 return;
145
146 /* Replace the window with anonymous pages. This is useful
147 * when we hit a SIGBUS and want to make sure the file cannot
148 * trigger any further SIGBUS, possibly overrunning the sigbus
149 * queue. */
150
151 assert_se(mmap(w->ptr, w->size, w->prot, MAP_PRIVATE|MAP_ANONYMOUS|MAP_FIXED, -1, 0) == w->ptr);
152 w->invalidated = true;
153 }
154
155 static void window_free(Window *w) {
156 assert(w);
157
158 window_unlink(w);
159 w->cache->n_windows--;
160 free(w);
161 }
162
163 _pure_ static bool window_matches(Window *w, int fd, int prot, uint64_t offset, size_t size) {
164 assert(w);
165 assert(fd >= 0);
166 assert(size > 0);
167
168 return
169 w->fd &&
170 fd == w->fd->fd &&
171 prot == w->prot &&
172 offset >= w->offset &&
173 offset + size <= w->offset + w->size;
174 }
175
176 static Window *window_add(MMapCache *m) {
177 Window *w;
178
179 assert(m);
180
181 if (!m->last_unused || m->n_windows <= WINDOWS_MIN) {
182
183 /* Allocate a new window */
184 w = new0(Window, 1);
185 if (!w)
186 return NULL;
187 m->n_windows++;
188 } else {
189
190 /* Reuse an existing one */
191 w = m->last_unused;
192 window_unlink(w);
193 zero(*w);
194 }
195
196 w->cache = m;
197 return w;
198 }
199
200 static void context_detach_window(Context *c) {
201 Window *w;
202
203 assert(c);
204
205 if (!c->window)
206 return;
207
208 w = c->window;
209 c->window = NULL;
210 LIST_REMOVE(by_window, w->contexts, c);
211
212 if (!w->contexts && !w->keep_always) {
213 /* Not used anymore? */
214 #ifdef ENABLE_DEBUG_MMAP_CACHE
215 /* Unmap unused windows immediately to expose use-after-unmap
216 * by SIGSEGV. */
217 window_free(w);
218 #else
219 LIST_PREPEND(unused, c->cache->unused, w);
220 if (!c->cache->last_unused)
221 c->cache->last_unused = w;
222
223 w->in_unused = true;
224 #endif
225 }
226 }
227
228 static void context_attach_window(Context *c, Window *w) {
229 assert(c);
230 assert(w);
231
232 if (c->window == w)
233 return;
234
235 context_detach_window(c);
236
237 if (w->in_unused) {
238 /* Used again? */
239 LIST_REMOVE(unused, c->cache->unused, w);
240 if (c->cache->last_unused == w)
241 c->cache->last_unused = w->unused_prev;
242
243 w->in_unused = false;
244 }
245
246 c->window = w;
247 LIST_PREPEND(by_window, w->contexts, c);
248 }
249
250 static Context *context_add(MMapCache *m, unsigned id) {
251 Context *c;
252
253 assert(m);
254
255 c = m->contexts[id];
256 if (c)
257 return c;
258
259 c = new0(Context, 1);
260 if (!c)
261 return NULL;
262
263 c->cache = m;
264 c->id = id;
265
266 assert(!m->contexts[id]);
267 m->contexts[id] = c;
268
269 return c;
270 }
271
272 static void context_free(Context *c) {
273 assert(c);
274
275 context_detach_window(c);
276
277 if (c->cache) {
278 assert(c->cache->contexts[c->id] == c);
279 c->cache->contexts[c->id] = NULL;
280 }
281
282 free(c);
283 }
284
285 static void fd_free(FileDescriptor *f) {
286 assert(f);
287
288 while (f->windows)
289 window_free(f->windows);
290
291 if (f->cache)
292 assert_se(hashmap_remove(f->cache->fds, FD_TO_PTR(f->fd)));
293
294 free(f);
295 }
296
297 static FileDescriptor* fd_add(MMapCache *m, int fd) {
298 FileDescriptor *f;
299 int r;
300
301 assert(m);
302 assert(fd >= 0);
303
304 f = hashmap_get(m->fds, FD_TO_PTR(fd));
305 if (f)
306 return f;
307
308 r = hashmap_ensure_allocated(&m->fds, NULL);
309 if (r < 0)
310 return NULL;
311
312 f = new0(FileDescriptor, 1);
313 if (!f)
314 return NULL;
315
316 f->cache = m;
317 f->fd = fd;
318
319 r = hashmap_put(m->fds, FD_TO_PTR(fd), f);
320 if (r < 0) {
321 free(f);
322 return NULL;
323 }
324
325 return f;
326 }
327
328 static void mmap_cache_free(MMapCache *m) {
329 FileDescriptor *f;
330 int i;
331
332 assert(m);
333
334 for (i = 0; i < MMAP_CACHE_MAX_CONTEXTS; i++)
335 if (m->contexts[i])
336 context_free(m->contexts[i]);
337
338 while ((f = hashmap_first(m->fds)))
339 fd_free(f);
340
341 hashmap_free(m->fds);
342
343 while (m->unused)
344 window_free(m->unused);
345
346 free(m);
347 }
348
349 MMapCache* mmap_cache_unref(MMapCache *m) {
350
351 if (!m)
352 return NULL;
353
354 assert(m->n_ref > 0);
355
356 m->n_ref --;
357 if (m->n_ref == 0)
358 mmap_cache_free(m);
359
360 return NULL;
361 }
362
363 static int make_room(MMapCache *m) {
364 assert(m);
365
366 if (!m->last_unused)
367 return 0;
368
369 window_free(m->last_unused);
370 return 1;
371 }
372
373 static int try_context(
374 MMapCache *m,
375 int fd,
376 int prot,
377 unsigned context,
378 bool keep_always,
379 uint64_t offset,
380 size_t size,
381 void **ret) {
382
383 Context *c;
384
385 assert(m);
386 assert(m->n_ref > 0);
387 assert(fd >= 0);
388 assert(size > 0);
389 assert(ret);
390
391 c = m->contexts[context];
392 if (!c)
393 return 0;
394
395 assert(c->id == context);
396
397 if (!c->window)
398 return 0;
399
400 if (!window_matches(c->window, fd, prot, offset, size)) {
401
402 /* Drop the reference to the window, since it's unnecessary now */
403 context_detach_window(c);
404 return 0;
405 }
406
407 if (c->window->fd->sigbus)
408 return -EIO;
409
410 c->window->keep_always = c->window->keep_always || keep_always;
411
412 *ret = (uint8_t*) c->window->ptr + (offset - c->window->offset);
413 return 1;
414 }
415
416 static int find_mmap(
417 MMapCache *m,
418 int fd,
419 int prot,
420 unsigned context,
421 bool keep_always,
422 uint64_t offset,
423 size_t size,
424 void **ret) {
425
426 FileDescriptor *f;
427 Window *w;
428 Context *c;
429
430 assert(m);
431 assert(m->n_ref > 0);
432 assert(fd >= 0);
433 assert(size > 0);
434
435 f = hashmap_get(m->fds, FD_TO_PTR(fd));
436 if (!f)
437 return 0;
438
439 assert(f->fd == fd);
440
441 if (f->sigbus)
442 return -EIO;
443
444 LIST_FOREACH(by_fd, w, f->windows)
445 if (window_matches(w, fd, prot, offset, size))
446 break;
447
448 if (!w)
449 return 0;
450
451 c = context_add(m, context);
452 if (!c)
453 return -ENOMEM;
454
455 context_attach_window(c, w);
456 w->keep_always = w->keep_always || keep_always;
457
458 *ret = (uint8_t*) w->ptr + (offset - w->offset);
459 return 1;
460 }
461
462 static int add_mmap(
463 MMapCache *m,
464 int fd,
465 int prot,
466 unsigned context,
467 bool keep_always,
468 uint64_t offset,
469 size_t size,
470 struct stat *st,
471 void **ret) {
472
473 uint64_t woffset, wsize;
474 Context *c;
475 FileDescriptor *f;
476 Window *w;
477 void *d;
478 int r;
479
480 assert(m);
481 assert(m->n_ref > 0);
482 assert(fd >= 0);
483 assert(size > 0);
484 assert(ret);
485
486 woffset = offset & ~((uint64_t) page_size() - 1ULL);
487 wsize = size + (offset - woffset);
488 wsize = PAGE_ALIGN(wsize);
489
490 if (wsize < WINDOW_SIZE) {
491 uint64_t delta;
492
493 delta = PAGE_ALIGN((WINDOW_SIZE - wsize) / 2);
494
495 if (delta > offset)
496 woffset = 0;
497 else
498 woffset -= delta;
499
500 wsize = WINDOW_SIZE;
501 }
502
503 if (st) {
504 /* Memory maps that are larger then the files
505 underneath have undefined behavior. Hence, clamp
506 things to the file size if we know it */
507
508 if (woffset >= (uint64_t) st->st_size)
509 return -EADDRNOTAVAIL;
510
511 if (woffset + wsize > (uint64_t) st->st_size)
512 wsize = PAGE_ALIGN(st->st_size - woffset);
513 }
514
515 for (;;) {
516 d = mmap(NULL, wsize, prot, MAP_SHARED, fd, woffset);
517 if (d != MAP_FAILED)
518 break;
519 if (errno != ENOMEM)
520 return -errno;
521
522 r = make_room(m);
523 if (r < 0)
524 return r;
525 if (r == 0)
526 return -ENOMEM;
527 }
528
529 c = context_add(m, context);
530 if (!c)
531 goto outofmem;
532
533 f = fd_add(m, fd);
534 if (!f)
535 goto outofmem;
536
537 w = window_add(m);
538 if (!w)
539 goto outofmem;
540
541 w->keep_always = keep_always;
542 w->ptr = d;
543 w->offset = woffset;
544 w->prot = prot;
545 w->size = wsize;
546 w->fd = f;
547
548 LIST_PREPEND(by_fd, f->windows, w);
549
550 context_detach_window(c);
551 c->window = w;
552 LIST_PREPEND(by_window, w->contexts, c);
553
554 *ret = (uint8_t*) w->ptr + (offset - w->offset);
555 return 1;
556
557 outofmem:
558 munmap(d, wsize);
559 return -ENOMEM;
560 }
561
562 int mmap_cache_get(
563 MMapCache *m,
564 int fd,
565 int prot,
566 unsigned context,
567 bool keep_always,
568 uint64_t offset,
569 size_t size,
570 struct stat *st,
571 void **ret) {
572
573 int r;
574
575 assert(m);
576 assert(m->n_ref > 0);
577 assert(fd >= 0);
578 assert(size > 0);
579 assert(ret);
580 assert(context < MMAP_CACHE_MAX_CONTEXTS);
581
582 /* Check whether the current context is the right one already */
583 r = try_context(m, fd, prot, context, keep_always, offset, size, ret);
584 if (r != 0) {
585 m->n_hit ++;
586 return r;
587 }
588
589 /* Search for a matching mmap */
590 r = find_mmap(m, fd, prot, context, keep_always, offset, size, ret);
591 if (r != 0) {
592 m->n_hit ++;
593 return r;
594 }
595
596 m->n_missed++;
597
598 /* Create a new mmap */
599 return add_mmap(m, fd, prot, context, keep_always, offset, size, st, ret);
600 }
601
602 unsigned mmap_cache_get_hit(MMapCache *m) {
603 assert(m);
604
605 return m->n_hit;
606 }
607
608 unsigned mmap_cache_get_missed(MMapCache *m) {
609 assert(m);
610
611 return m->n_missed;
612 }
613
614 static void mmap_cache_process_sigbus(MMapCache *m) {
615 bool found = false;
616 FileDescriptor *f;
617 Iterator i;
618 int r;
619
620 assert(m);
621
622 /* Iterate through all triggered pages and mark their files as
623 * invalidated */
624 for (;;) {
625 bool ours;
626 void *addr;
627
628 r = sigbus_pop(&addr);
629 if (_likely_(r == 0))
630 break;
631 if (r < 0) {
632 log_error_errno(r, "SIGBUS handling failed: %m");
633 abort();
634 }
635
636 ours = false;
637 HASHMAP_FOREACH(f, m->fds, i) {
638 Window *w;
639
640 LIST_FOREACH(by_fd, w, f->windows) {
641 if ((uint8_t*) addr >= (uint8_t*) w->ptr &&
642 (uint8_t*) addr < (uint8_t*) w->ptr + w->size) {
643 found = ours = f->sigbus = true;
644 break;
645 }
646 }
647
648 if (ours)
649 break;
650 }
651
652 /* Didn't find a matching window, give up */
653 if (!ours) {
654 log_error("Unknown SIGBUS page, aborting.");
655 abort();
656 }
657 }
658
659 /* The list of triggered pages is now empty. Now, let's remap
660 * all windows of the triggered file to anonymous maps, so
661 * that no page of the file in question is triggered again, so
662 * that we can be sure not to hit the queue size limit. */
663 if (_likely_(!found))
664 return;
665
666 HASHMAP_FOREACH(f, m->fds, i) {
667 Window *w;
668
669 if (!f->sigbus)
670 continue;
671
672 LIST_FOREACH(by_fd, w, f->windows)
673 window_invalidate(w);
674 }
675 }
676
677 bool mmap_cache_got_sigbus(MMapCache *m, int fd) {
678 FileDescriptor *f;
679
680 assert(m);
681 assert(fd >= 0);
682
683 mmap_cache_process_sigbus(m);
684
685 f = hashmap_get(m->fds, FD_TO_PTR(fd));
686 if (!f)
687 return false;
688
689 return f->sigbus;
690 }
691
692 void mmap_cache_close_fd(MMapCache *m, int fd) {
693 FileDescriptor *f;
694
695 assert(m);
696 assert(fd >= 0);
697
698 /* Make sure that any queued SIGBUS are first dispatched, so
699 * that we don't end up with a SIGBUS entry we cannot relate
700 * to any existing memory map */
701
702 mmap_cache_process_sigbus(m);
703
704 f = hashmap_get(m->fds, FD_TO_PTR(fd));
705 if (!f)
706 return;
707
708 fd_free(f);
709 }