]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/journal/mmap-cache.c
util-lib: split out allocation calls into alloc-util.[ch]
[thirdparty/systemd.git] / src / journal / mmap-cache.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4 This file is part of systemd.
5
6 Copyright 2012 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <errno.h>
23 #include <stdlib.h>
24 #include <sys/mman.h>
25
26 #include "alloc-util.h"
27 #include "hashmap.h"
28 #include "list.h"
29 #include "log.h"
30 #include "util.h"
31 #include "macro.h"
32 #include "sigbus.h"
33 #include "mmap-cache.h"
34
/* Forward declarations of the file-private bookkeeping types; the
 * structures reference each other, hence they are named up front. */
typedef struct Window Window;
typedef struct Context Context;
typedef struct FileDescriptor FileDescriptor;
38
/* A single memory-mapped region of a file, together with the links
 * that tie it to its file, to the contexts currently using it and to
 * the cache's list of unused windows. */
struct Window {
        MMapCache *cache;               /* cache this window belongs to */

        bool invalidated;               /* set by window_invalidate() once the mapping was replaced by anonymous pages */
        bool keep_always;               /* if set, the window is not parked on the unused list when its last context detaches */
        bool in_unused;                 /* true while the window is linked into cache->unused */

        int prot;                       /* protection flags the region was mapped with */
        void *ptr;                      /* start address of the mapping */
        uint64_t offset;                /* file offset the mapping starts at */
        size_t size;                    /* length of the mapping in bytes */

        FileDescriptor *fd;             /* file this window maps */

        LIST_FIELDS(Window, by_fd);     /* membership in fd->windows */
        LIST_FIELDS(Window, unused);    /* membership in cache->unused */

        LIST_HEAD(Context, contexts);   /* contexts currently attached to this window */
};
58
/* A caller-chosen slot that remembers the window it used last, so that
 * repeated accesses through the same slot can be answered without
 * searching (see try_context()). */
struct Context {
        MMapCache *cache;               /* cache this context is registered in */
        unsigned id;                    /* index of this context in cache->contexts[] */
        Window *window;                 /* window currently attached, or NULL */

        LIST_FIELDS(Context, by_window); /* membership in window->contexts */
};
66
/* Per-file state: all windows mapped from one file descriptor. */
struct FileDescriptor {
        MMapCache *cache;               /* cache whose "fds" hashmap holds this entry */
        int fd;                         /* the file descriptor itself */
        bool sigbus;                    /* set by mmap_cache_process_sigbus() when a SIGBUS address fell into one of our windows */
        LIST_HEAD(Window, windows);     /* all windows mapping this file */
};
73
/* The cache object: reference counted, owns all FileDescriptor,
 * Context and Window objects created through it. */
struct MMapCache {
        int n_ref;                      /* reference counter, see mmap_cache_ref()/mmap_cache_unref() */
        unsigned n_windows;             /* number of Window objects currently allocated */

        unsigned n_hit, n_missed;       /* statistics maintained by mmap_cache_get() */


        Hashmap *fds;                   /* maps (fd + 1) to FileDescriptor objects; allocated lazily in fd_add() */
        Context *contexts[MMAP_CACHE_MAX_CONTEXTS]; /* indexed by context id; entries are created on demand */

        LIST_HEAD(Window, unused);      /* windows no context refers to anymore; new entries are prepended */
        Window *last_unused;            /* the unused window that is recycled or dropped first */
};
87
/* As long as no more than this many windows exist, window_add()
 * allocates new windows rather than recycling unused ones. */
#define WINDOWS_MIN 64

/* Minimum size of a freshly created window, see add_mmap(). */
#ifndef ENABLE_DEBUG_MMAP_CACHE
#  define WINDOW_SIZE (8ULL*1024ULL*1024ULL)
#else
/* Tiny windows increase mmap activity and the chance of exposing unsafe use. */
#  define WINDOW_SIZE (page_size())
#endif
96
97 MMapCache* mmap_cache_new(void) {
98 MMapCache *m;
99
100 m = new0(MMapCache, 1);
101 if (!m)
102 return NULL;
103
104 m->n_ref = 1;
105 return m;
106 }
107
108 MMapCache* mmap_cache_ref(MMapCache *m) {
109 assert(m);
110 assert(m->n_ref > 0);
111
112 m->n_ref ++;
113 return m;
114 }
115
116 static void window_unlink(Window *w) {
117 Context *c;
118
119 assert(w);
120
121 if (w->ptr)
122 munmap(w->ptr, w->size);
123
124 if (w->fd)
125 LIST_REMOVE(by_fd, w->fd->windows, w);
126
127 if (w->in_unused) {
128 if (w->cache->last_unused == w)
129 w->cache->last_unused = w->unused_prev;
130
131 LIST_REMOVE(unused, w->cache->unused, w);
132 }
133
134 LIST_FOREACH(by_window, c, w->contexts) {
135 assert(c->window == w);
136 c->window = NULL;
137 }
138 }
139
140 static void window_invalidate(Window *w) {
141 assert(w);
142
143 if (w->invalidated)
144 return;
145
146 /* Replace the window with anonymous pages. This is useful
147 * when we hit a SIGBUS and want to make sure the file cannot
148 * trigger any further SIGBUS, possibly overrunning the sigbus
149 * queue. */
150
151 assert_se(mmap(w->ptr, w->size, w->prot, MAP_PRIVATE|MAP_ANONYMOUS|MAP_FIXED, -1, 0) == w->ptr);
152 w->invalidated = true;
153 }
154
155 static void window_free(Window *w) {
156 assert(w);
157
158 window_unlink(w);
159 w->cache->n_windows--;
160 free(w);
161 }
162
163 _pure_ static bool window_matches(Window *w, int fd, int prot, uint64_t offset, size_t size) {
164 assert(w);
165 assert(fd >= 0);
166 assert(size > 0);
167
168 return
169 w->fd &&
170 fd == w->fd->fd &&
171 prot == w->prot &&
172 offset >= w->offset &&
173 offset + size <= w->offset + w->size;
174 }
175
176 static Window *window_add(MMapCache *m) {
177 Window *w;
178
179 assert(m);
180
181 if (!m->last_unused || m->n_windows <= WINDOWS_MIN) {
182
183 /* Allocate a new window */
184 w = new0(Window, 1);
185 if (!w)
186 return NULL;
187 m->n_windows++;
188 } else {
189
190 /* Reuse an existing one */
191 w = m->last_unused;
192 window_unlink(w);
193 zero(*w);
194 }
195
196 w->cache = m;
197 return w;
198 }
199
200 static void context_detach_window(Context *c) {
201 Window *w;
202
203 assert(c);
204
205 if (!c->window)
206 return;
207
208 w = c->window;
209 c->window = NULL;
210 LIST_REMOVE(by_window, w->contexts, c);
211
212 if (!w->contexts && !w->keep_always) {
213 /* Not used anymore? */
214 #ifdef ENABLE_DEBUG_MMAP_CACHE
215 /* Unmap unused windows immediately to expose use-after-unmap
216 * by SIGSEGV. */
217 window_free(w);
218 #else
219 LIST_PREPEND(unused, c->cache->unused, w);
220 if (!c->cache->last_unused)
221 c->cache->last_unused = w;
222
223 w->in_unused = true;
224 #endif
225 }
226 }
227
228 static void context_attach_window(Context *c, Window *w) {
229 assert(c);
230 assert(w);
231
232 if (c->window == w)
233 return;
234
235 context_detach_window(c);
236
237 if (w->in_unused) {
238 /* Used again? */
239 LIST_REMOVE(unused, c->cache->unused, w);
240 if (c->cache->last_unused == w)
241 c->cache->last_unused = w->unused_prev;
242
243 w->in_unused = false;
244 }
245
246 c->window = w;
247 LIST_PREPEND(by_window, w->contexts, c);
248 }
249
250 static Context *context_add(MMapCache *m, unsigned id) {
251 Context *c;
252
253 assert(m);
254
255 c = m->contexts[id];
256 if (c)
257 return c;
258
259 c = new0(Context, 1);
260 if (!c)
261 return NULL;
262
263 c->cache = m;
264 c->id = id;
265
266 assert(!m->contexts[id]);
267 m->contexts[id] = c;
268
269 return c;
270 }
271
272 static void context_free(Context *c) {
273 assert(c);
274
275 context_detach_window(c);
276
277 if (c->cache) {
278 assert(c->cache->contexts[c->id] == c);
279 c->cache->contexts[c->id] = NULL;
280 }
281
282 free(c);
283 }
284
285 static void fd_free(FileDescriptor *f) {
286 assert(f);
287
288 while (f->windows)
289 window_free(f->windows);
290
291 if (f->cache)
292 assert_se(hashmap_remove(f->cache->fds, INT_TO_PTR(f->fd + 1)));
293
294 free(f);
295 }
296
297 static FileDescriptor* fd_add(MMapCache *m, int fd) {
298 FileDescriptor *f;
299 int r;
300
301 assert(m);
302 assert(fd >= 0);
303
304 f = hashmap_get(m->fds, INT_TO_PTR(fd + 1));
305 if (f)
306 return f;
307
308 r = hashmap_ensure_allocated(&m->fds, NULL);
309 if (r < 0)
310 return NULL;
311
312 f = new0(FileDescriptor, 1);
313 if (!f)
314 return NULL;
315
316 f->cache = m;
317 f->fd = fd;
318
319 r = hashmap_put(m->fds, UINT_TO_PTR(fd + 1), f);
320 if (r < 0) {
321 free(f);
322 return NULL;
323 }
324
325 return f;
326 }
327
328 static void mmap_cache_free(MMapCache *m) {
329 FileDescriptor *f;
330 int i;
331
332 assert(m);
333
334 for (i = 0; i < MMAP_CACHE_MAX_CONTEXTS; i++)
335 if (m->contexts[i])
336 context_free(m->contexts[i]);
337
338 while ((f = hashmap_first(m->fds)))
339 fd_free(f);
340
341 hashmap_free(m->fds);
342
343 while (m->unused)
344 window_free(m->unused);
345
346 free(m);
347 }
348
349 MMapCache* mmap_cache_unref(MMapCache *m) {
350 assert(m);
351 assert(m->n_ref > 0);
352
353 m->n_ref --;
354 if (m->n_ref == 0)
355 mmap_cache_free(m);
356
357 return NULL;
358 }
359
360 static int make_room(MMapCache *m) {
361 assert(m);
362
363 if (!m->last_unused)
364 return 0;
365
366 window_free(m->last_unused);
367 return 1;
368 }
369
370 static int try_context(
371 MMapCache *m,
372 int fd,
373 int prot,
374 unsigned context,
375 bool keep_always,
376 uint64_t offset,
377 size_t size,
378 void **ret) {
379
380 Context *c;
381
382 assert(m);
383 assert(m->n_ref > 0);
384 assert(fd >= 0);
385 assert(size > 0);
386 assert(ret);
387
388 c = m->contexts[context];
389 if (!c)
390 return 0;
391
392 assert(c->id == context);
393
394 if (!c->window)
395 return 0;
396
397 if (!window_matches(c->window, fd, prot, offset, size)) {
398
399 /* Drop the reference to the window, since it's unnecessary now */
400 context_detach_window(c);
401 return 0;
402 }
403
404 if (c->window->fd->sigbus)
405 return -EIO;
406
407 c->window->keep_always |= keep_always;
408
409 *ret = (uint8_t*) c->window->ptr + (offset - c->window->offset);
410 return 1;
411 }
412
413 static int find_mmap(
414 MMapCache *m,
415 int fd,
416 int prot,
417 unsigned context,
418 bool keep_always,
419 uint64_t offset,
420 size_t size,
421 void **ret) {
422
423 FileDescriptor *f;
424 Window *w;
425 Context *c;
426
427 assert(m);
428 assert(m->n_ref > 0);
429 assert(fd >= 0);
430 assert(size > 0);
431
432 f = hashmap_get(m->fds, INT_TO_PTR(fd + 1));
433 if (!f)
434 return 0;
435
436 assert(f->fd == fd);
437
438 if (f->sigbus)
439 return -EIO;
440
441 LIST_FOREACH(by_fd, w, f->windows)
442 if (window_matches(w, fd, prot, offset, size))
443 break;
444
445 if (!w)
446 return 0;
447
448 c = context_add(m, context);
449 if (!c)
450 return -ENOMEM;
451
452 context_attach_window(c, w);
453 w->keep_always += keep_always;
454
455 *ret = (uint8_t*) w->ptr + (offset - w->offset);
456 return 1;
457 }
458
459 static int add_mmap(
460 MMapCache *m,
461 int fd,
462 int prot,
463 unsigned context,
464 bool keep_always,
465 uint64_t offset,
466 size_t size,
467 struct stat *st,
468 void **ret) {
469
470 uint64_t woffset, wsize;
471 Context *c;
472 FileDescriptor *f;
473 Window *w;
474 void *d;
475 int r;
476
477 assert(m);
478 assert(m->n_ref > 0);
479 assert(fd >= 0);
480 assert(size > 0);
481 assert(ret);
482
483 woffset = offset & ~((uint64_t) page_size() - 1ULL);
484 wsize = size + (offset - woffset);
485 wsize = PAGE_ALIGN(wsize);
486
487 if (wsize < WINDOW_SIZE) {
488 uint64_t delta;
489
490 delta = PAGE_ALIGN((WINDOW_SIZE - wsize) / 2);
491
492 if (delta > offset)
493 woffset = 0;
494 else
495 woffset -= delta;
496
497 wsize = WINDOW_SIZE;
498 }
499
500 if (st) {
501 /* Memory maps that are larger then the files
502 underneath have undefined behavior. Hence, clamp
503 things to the file size if we know it */
504
505 if (woffset >= (uint64_t) st->st_size)
506 return -EADDRNOTAVAIL;
507
508 if (woffset + wsize > (uint64_t) st->st_size)
509 wsize = PAGE_ALIGN(st->st_size - woffset);
510 }
511
512 for (;;) {
513 d = mmap(NULL, wsize, prot, MAP_SHARED, fd, woffset);
514 if (d != MAP_FAILED)
515 break;
516 if (errno != ENOMEM)
517 return -errno;
518
519 r = make_room(m);
520 if (r < 0)
521 return r;
522 if (r == 0)
523 return -ENOMEM;
524 }
525
526 c = context_add(m, context);
527 if (!c)
528 goto outofmem;
529
530 f = fd_add(m, fd);
531 if (!f)
532 goto outofmem;
533
534 w = window_add(m);
535 if (!w)
536 goto outofmem;
537
538 w->keep_always = keep_always;
539 w->ptr = d;
540 w->offset = woffset;
541 w->prot = prot;
542 w->size = wsize;
543 w->fd = f;
544
545 LIST_PREPEND(by_fd, f->windows, w);
546
547 context_detach_window(c);
548 c->window = w;
549 LIST_PREPEND(by_window, w->contexts, c);
550
551 *ret = (uint8_t*) w->ptr + (offset - w->offset);
552 return 1;
553
554 outofmem:
555 munmap(d, wsize);
556 return -ENOMEM;
557 }
558
559 int mmap_cache_get(
560 MMapCache *m,
561 int fd,
562 int prot,
563 unsigned context,
564 bool keep_always,
565 uint64_t offset,
566 size_t size,
567 struct stat *st,
568 void **ret) {
569
570 int r;
571
572 assert(m);
573 assert(m->n_ref > 0);
574 assert(fd >= 0);
575 assert(size > 0);
576 assert(ret);
577 assert(context < MMAP_CACHE_MAX_CONTEXTS);
578
579 /* Check whether the current context is the right one already */
580 r = try_context(m, fd, prot, context, keep_always, offset, size, ret);
581 if (r != 0) {
582 m->n_hit ++;
583 return r;
584 }
585
586 /* Search for a matching mmap */
587 r = find_mmap(m, fd, prot, context, keep_always, offset, size, ret);
588 if (r != 0) {
589 m->n_hit ++;
590 return r;
591 }
592
593 m->n_missed++;
594
595 /* Create a new mmap */
596 return add_mmap(m, fd, prot, context, keep_always, offset, size, st, ret);
597 }
598
599 unsigned mmap_cache_get_hit(MMapCache *m) {
600 assert(m);
601
602 return m->n_hit;
603 }
604
605 unsigned mmap_cache_get_missed(MMapCache *m) {
606 assert(m);
607
608 return m->n_missed;
609 }
610
611 static void mmap_cache_process_sigbus(MMapCache *m) {
612 bool found = false;
613 FileDescriptor *f;
614 Iterator i;
615 int r;
616
617 assert(m);
618
619 /* Iterate through all triggered pages and mark their files as
620 * invalidated */
621 for (;;) {
622 bool ours;
623 void *addr;
624
625 r = sigbus_pop(&addr);
626 if (_likely_(r == 0))
627 break;
628 if (r < 0) {
629 log_error_errno(r, "SIGBUS handling failed: %m");
630 abort();
631 }
632
633 ours = false;
634 HASHMAP_FOREACH(f, m->fds, i) {
635 Window *w;
636
637 LIST_FOREACH(by_fd, w, f->windows) {
638 if ((uint8_t*) addr >= (uint8_t*) w->ptr &&
639 (uint8_t*) addr < (uint8_t*) w->ptr + w->size) {
640 found = ours = f->sigbus = true;
641 break;
642 }
643 }
644
645 if (ours)
646 break;
647 }
648
649 /* Didn't find a matching window, give up */
650 if (!ours) {
651 log_error("Unknown SIGBUS page, aborting.");
652 abort();
653 }
654 }
655
656 /* The list of triggered pages is now empty. Now, let's remap
657 * all windows of the triggered file to anonymous maps, so
658 * that no page of the file in question is triggered again, so
659 * that we can be sure not to hit the queue size limit. */
660 if (_likely_(!found))
661 return;
662
663 HASHMAP_FOREACH(f, m->fds, i) {
664 Window *w;
665
666 if (!f->sigbus)
667 continue;
668
669 LIST_FOREACH(by_fd, w, f->windows)
670 window_invalidate(w);
671 }
672 }
673
674 bool mmap_cache_got_sigbus(MMapCache *m, int fd) {
675 FileDescriptor *f;
676
677 assert(m);
678 assert(fd >= 0);
679
680 mmap_cache_process_sigbus(m);
681
682 f = hashmap_get(m->fds, INT_TO_PTR(fd + 1));
683 if (!f)
684 return false;
685
686 return f->sigbus;
687 }
688
689 void mmap_cache_close_fd(MMapCache *m, int fd) {
690 FileDescriptor *f;
691
692 assert(m);
693 assert(fd >= 0);
694
695 /* Make sure that any queued SIGBUS are first dispatched, so
696 * that we don't end up with a SIGBUS entry we cannot relate
697 * to any existing memory map */
698
699 mmap_cache_process_sigbus(m);
700
701 f = hashmap_get(m->fds, INT_TO_PTR(fd + 1));
702 if (!f)
703 return;
704
705 fd_free(f);
706 }