/* src/journal/mmap-cache.c */
/* SPDX-License-Identifier: LGPL-2.1+ */

#include <errno.h>
#include <stdlib.h>
#include <sys/mman.h>

#include "alloc-util.h"
#include "errno-util.h"
#include "fd-util.h"
#include "hashmap.h"
#include "list.h"
#include "log.h"
#include "macro.h"
#include "memory-util.h"
#include "mmap-cache.h"
#include "sigbus.h"

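/* The mmap cache: journal files are accessed through fixed-size windows that are
 * mapped on demand. Each Window covers a page-aligned range of one file
 * (MMapFileDescriptor) with a given protection; each Context remembers the window
 * it used last, so repeated accesses from the same context hit without a lookup.
 * Windows that no longer back any context are kept on an "unused" list and are
 * recycled or unmapped when room is needed. */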
typedef struct Window Window;
typedef struct Context Context;

struct Window {
        MMapCache *cache;

        bool invalidated:1;
        bool keep_always:1;
        bool in_unused:1;

        int prot;
        void *ptr;
        uint64_t offset;
        size_t size;

        MMapFileDescriptor *fd;

        LIST_FIELDS(Window, by_fd);
        LIST_FIELDS(Window, unused);

        LIST_HEAD(Context, contexts);
};

struct Context {
        MMapCache *cache;
        unsigned id;
        Window *window;

        LIST_FIELDS(Context, by_window);
};

struct MMapFileDescriptor {
        MMapCache *cache;
        int fd;
        bool sigbus;
        LIST_HEAD(Window, windows);
};

struct MMapCache {
        unsigned n_ref;
        unsigned n_windows;

        unsigned n_hit, n_missed;

        Hashmap *fds;
        Context *contexts[MMAP_CACHE_MAX_CONTEXTS];

        LIST_HEAD(Window, unused);
        Window *last_unused;
};

#define WINDOWS_MIN 64

#if ENABLE_DEBUG_MMAP_CACHE
/* Tiny windows increase mmap activity and the chance of exposing unsafe use. */
#  define WINDOW_SIZE (page_size())
#else
#  define WINDOW_SIZE (8ULL*1024ULL*1024ULL)
#endif

MMapCache* mmap_cache_new(void) {
        MMapCache *m;

        m = new0(MMapCache, 1);
        if (!m)
                return NULL;

        m->n_ref = 1;
        return m;
}

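/* Detach a window from everything that references it: unmap it, drop it from its
 * file's window list and from the unused list, and clear it from any contexts
 * that still point at it. The Window structure itself stays allocated. */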
static void window_unlink(Window *w) {
        Context *c;

        assert(w);

        if (w->ptr)
                munmap(w->ptr, w->size);

        if (w->fd)
                LIST_REMOVE(by_fd, w->fd->windows, w);

        if (w->in_unused) {
                if (w->cache->last_unused == w)
                        w->cache->last_unused = w->unused_prev;

                LIST_REMOVE(unused, w->cache->unused, w);
        }

        LIST_FOREACH(by_window, c, w->contexts) {
                assert(c->window == w);
                c->window = NULL;
        }
}

static void window_invalidate(Window *w) {
        assert(w);

        if (w->invalidated)
                return;

        /* Replace the window with anonymous pages. This is useful
         * when we hit a SIGBUS and want to make sure the file cannot
         * trigger any further SIGBUS, possibly overrunning the sigbus
         * queue. */

        assert_se(mmap(w->ptr, w->size, w->prot, MAP_PRIVATE|MAP_ANONYMOUS|MAP_FIXED, -1, 0) == w->ptr);
        w->invalidated = true;
}

static void window_free(Window *w) {
        assert(w);

        window_unlink(w);
        w->cache->n_windows--;
        free(w);
}

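/* A window satisfies a request if the protection matches and the requested
 * [offset, offset + size) range lies entirely within the mapped range.
 * window_matches_fd() additionally checks that the window belongs to the
 * right file descriptor. */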
_pure_ static bool window_matches(Window *w, int prot, uint64_t offset, size_t size) {
        assert(w);
        assert(size > 0);

        return
                prot == w->prot &&
                offset >= w->offset &&
                offset + size <= w->offset + w->size;
}

_pure_ static bool window_matches_fd(Window *w, MMapFileDescriptor *f, int prot, uint64_t offset, size_t size) {
        assert(w);
        assert(f);

        return
                w->fd &&
                f->fd == w->fd->fd &&
                window_matches(w, prot, offset, size);
}

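/* Register a freshly created mapping as a Window. As long as fewer than
 * WINDOWS_MIN windows exist (or the unused list is empty) a new Window is
 * allocated; otherwise the least recently used unused window is unlinked and
 * its structure recycled. */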
static Window *window_add(MMapCache *m, MMapFileDescriptor *f, int prot, bool keep_always, uint64_t offset, size_t size, void *ptr) {
        Window *w;

        assert(m);
        assert(f);

        if (!m->last_unused || m->n_windows <= WINDOWS_MIN) {

                /* Allocate a new window */
                w = new(Window, 1);
                if (!w)
                        return NULL;
                m->n_windows++;
        } else {

                /* Reuse an existing one */
                w = m->last_unused;
                window_unlink(w);
        }

        *w = (Window) {
                .cache = m,
                .fd = f,
                .prot = prot,
                .keep_always = keep_always,
                .offset = offset,
                .size = size,
                .ptr = ptr,
        };

        LIST_PREPEND(by_fd, f->windows, w);

        return w;
}

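/* Drop the context's reference to its current window. If no other context uses
 * the window and it is not pinned via keep_always, it is moved to the head of
 * the unused list (or, in debug builds, unmapped right away so stale pointers
 * fault immediately). */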
static void context_detach_window(Context *c) {
        Window *w;

        assert(c);

        if (!c->window)
                return;

        w = TAKE_PTR(c->window);
        LIST_REMOVE(by_window, w->contexts, c);

        if (!w->contexts && !w->keep_always) {
                /* Not used anymore? */
#if ENABLE_DEBUG_MMAP_CACHE
                /* Unmap unused windows immediately to expose use-after-unmap
                 * by SIGSEGV. */
                window_free(w);
#else
                LIST_PREPEND(unused, c->cache->unused, w);
                if (!c->cache->last_unused)
                        c->cache->last_unused = w;

                w->in_unused = true;
#endif
        }
}

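/* Make the given window the context's current one, detaching whatever window it
 * used before and pulling the new one off the unused list if it was parked there. */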
static void context_attach_window(Context *c, Window *w) {
        assert(c);
        assert(w);

        if (c->window == w)
                return;

        context_detach_window(c);

        if (w->in_unused) {
                /* Used again? */
                LIST_REMOVE(unused, c->cache->unused, w);
                if (c->cache->last_unused == w)
                        c->cache->last_unused = w->unused_prev;

                w->in_unused = false;
        }

        c->window = w;
        LIST_PREPEND(by_window, w->contexts, c);
}

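/* Return the Context object for the given id, allocating it lazily on first use.
 * Context ids are small integers below MMAP_CACHE_MAX_CONTEXTS. */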
static Context *context_add(MMapCache *m, unsigned id) {
        Context *c;

        assert(m);

        c = m->contexts[id];
        if (c)
                return c;

        c = new0(Context, 1);
        if (!c)
                return NULL;

        c->cache = m;
        c->id = id;

        assert(!m->contexts[id]);
        m->contexts[id] = c;

        return c;
}

static void context_free(Context *c) {
        assert(c);

        context_detach_window(c);

        if (c->cache) {
                assert(c->cache->contexts[c->id] == c);
                c->cache->contexts[c->id] = NULL;
        }

        free(c);
}

static MMapCache *mmap_cache_free(MMapCache *m) {
        int i;

        assert(m);

        for (i = 0; i < MMAP_CACHE_MAX_CONTEXTS; i++)
                if (m->contexts[i])
                        context_free(m->contexts[i]);

        hashmap_free(m->fds);

        while (m->unused)
                window_free(m->unused);

        return mfree(m);
}

DEFINE_TRIVIAL_REF_UNREF_FUNC(MMapCache, mmap_cache, mmap_cache_free);

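/* Evict the least recently used unused window to release memory and address
 * space. Returns 1 if a window was freed, 0 if there was nothing to evict. */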
static int make_room(MMapCache *m) {
        assert(m);

        if (!m->last_unused)
                return 0;

        window_free(m->last_unused);
        return 1;
}

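/* Fast path of the lookup: check whether the window this context used last
 * already covers the requested range. Returns 1 and fills in *ret on a hit,
 * 0 if the caller needs to keep looking, -EIO if the backing file has seen a
 * SIGBUS. */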
static int try_context(
                MMapCache *m,
                MMapFileDescriptor *f,
                int prot,
                unsigned context,
                bool keep_always,
                uint64_t offset,
                size_t size,
                void **ret,
                size_t *ret_size) {

        Context *c;

        assert(m);
        assert(m->n_ref > 0);
        assert(f);
        assert(size > 0);
        assert(ret);

        c = m->contexts[context];
        if (!c)
                return 0;

        assert(c->id == context);

        if (!c->window)
                return 0;

        if (!window_matches_fd(c->window, f, prot, offset, size)) {

                /* Drop the reference to the window, since it's unnecessary now */
                context_detach_window(c);
                return 0;
        }

        if (c->window->fd->sigbus)
                return -EIO;

        c->window->keep_always = c->window->keep_always || keep_always;

        *ret = (uint8_t*) c->window->ptr + (offset - c->window->offset);
        if (ret_size)
                *ret_size = c->window->size - (offset - c->window->offset);

        return 1;
}

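/* Slower path: walk all windows of the file descriptor and attach the first one
 * that covers the requested range to the context. Returns 1 on a hit, 0 if a new
 * mapping is needed, and a negative errno on failure. */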
static int find_mmap(
                MMapCache *m,
                MMapFileDescriptor *f,
                int prot,
                unsigned context,
                bool keep_always,
                uint64_t offset,
                size_t size,
                void **ret,
                size_t *ret_size) {

        Window *w;
        Context *c;

        assert(m);
        assert(m->n_ref > 0);
        assert(f);
        assert(size > 0);

        if (f->sigbus)
                return -EIO;

        LIST_FOREACH(by_fd, w, f->windows)
                if (window_matches(w, prot, offset, size))
                        break;

        if (!w)
                return 0;

        c = context_add(m, context);
        if (!c)
                return -ENOMEM;

        context_attach_window(c, w);
        w->keep_always = w->keep_always || keep_always;

        *ret = (uint8_t*) w->ptr + (offset - w->offset);
        if (ret_size)
                *ret_size = w->size - (offset - w->offset);

        return 1;
}

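/* mmap() the requested range, evicting unused windows and retrying for as long as
 * the kernel reports ENOMEM. Any other error is returned as a negative errno. */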
static int mmap_try_harder(MMapCache *m, void *addr, MMapFileDescriptor *f, int prot, int flags, uint64_t offset, size_t size, void **res) {
        void *ptr;

        assert(m);
        assert(f);
        assert(res);

        for (;;) {
                int r;

                ptr = mmap(addr, size, prot, flags, f->fd, offset);
                if (ptr != MAP_FAILED)
                        break;
                if (errno != ENOMEM)
                        return negative_errno();

                r = make_room(m);
                if (r < 0)
                        return r;
                if (r == 0)
                        return -ENOMEM;
        }

        *res = ptr;
        return 0;
}

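/* Create a new window for the requested range: align the start down to a page
 * boundary, widen the mapping to WINDOW_SIZE (centering the requested range where
 * possible), clamp it to the file size if known, map it, and attach the resulting
 * window to the context. */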
static int add_mmap(
                MMapCache *m,
                MMapFileDescriptor *f,
                int prot,
                unsigned context,
                bool keep_always,
                uint64_t offset,
                size_t size,
                struct stat *st,
                void **ret,
                size_t *ret_size) {

        uint64_t woffset, wsize;
        Context *c;
        Window *w;
        void *d;
        int r;

        assert(m);
        assert(m->n_ref > 0);
        assert(f);
        assert(size > 0);
        assert(ret);

        woffset = offset & ~((uint64_t) page_size() - 1ULL);
        wsize = size + (offset - woffset);
        wsize = PAGE_ALIGN(wsize);

        if (wsize < WINDOW_SIZE) {
                uint64_t delta;

                delta = PAGE_ALIGN((WINDOW_SIZE - wsize) / 2);

                if (delta > offset)
                        woffset = 0;
                else
                        woffset -= delta;

                wsize = WINDOW_SIZE;
        }

        if (st) {
                /* Memory maps that are larger than the files
                 * underneath have undefined behavior. Hence, clamp
                 * things to the file size if we know it */

                if (woffset >= (uint64_t) st->st_size)
                        return -EADDRNOTAVAIL;

                if (woffset + wsize > (uint64_t) st->st_size)
                        wsize = PAGE_ALIGN(st->st_size - woffset);
        }

        r = mmap_try_harder(m, NULL, f, prot, MAP_SHARED, woffset, wsize, &d);
        if (r < 0)
                return r;

        c = context_add(m, context);
        if (!c)
                goto outofmem;

        w = window_add(m, f, prot, keep_always, woffset, wsize, d);
        if (!w)
                goto outofmem;

        context_attach_window(c, w);

        *ret = (uint8_t*) w->ptr + (offset - w->offset);
        if (ret_size)
                *ret_size = w->size - (offset - w->offset);

        return 1;

outofmem:
        (void) munmap(d, wsize);
        return -ENOMEM;
}

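/* mmap_cache_get() is the public lookup: it first consults the context's last
 * window (try_context), then all windows of the file (find_mmap), and only then
 * maps a new window (add_mmap), counting hits and misses along the way.
 *
 * A minimal usage sketch (hypothetical caller, not part of this file; the
 * variable names and the CONTEXT constant are illustrative only):
 *
 *     MMapFileDescriptor *f = mmap_cache_add_fd(m, fd);
 *     void *p;
 *     size_t n;
 *     r = mmap_cache_get(m, f, PROT_READ, CONTEXT, false, offset, size, &st, &p, &n);
 *     if (r < 0)
 *             return r;   // e.g. -EIO after a SIGBUS, -ENOMEM, ...
 *     // p now points at 'size' readable bytes of the file starting at 'offset'
 */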
int mmap_cache_get(
                MMapCache *m,
                MMapFileDescriptor *f,
                int prot,
                unsigned context,
                bool keep_always,
                uint64_t offset,
                size_t size,
                struct stat *st,
                void **ret,
                size_t *ret_size) {

        int r;

        assert(m);
        assert(m->n_ref > 0);
        assert(f);
        assert(size > 0);
        assert(ret);
        assert(context < MMAP_CACHE_MAX_CONTEXTS);

        /* Check whether the current context is the right one already */
        r = try_context(m, f, prot, context, keep_always, offset, size, ret, ret_size);
        if (r != 0) {
                m->n_hit++;
                return r;
        }

        /* Search for a matching mmap */
        r = find_mmap(m, f, prot, context, keep_always, offset, size, ret, ret_size);
        if (r != 0) {
                m->n_hit++;
                return r;
        }

        m->n_missed++;

        /* Create a new mmap */
        return add_mmap(m, f, prot, context, keep_always, offset, size, st, ret, ret_size);
}

unsigned mmap_cache_get_hit(MMapCache *m) {
        assert(m);

        return m->n_hit;
}

unsigned mmap_cache_get_missed(MMapCache *m) {
        assert(m);

        return m->n_missed;
}

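/* Drain the queue of SIGBUS addresses collected by the sigbus handler. Each
 * address is matched against the known windows; the owning file descriptor is
 * flagged and, once the queue is empty, all windows of flagged files are replaced
 * with anonymous memory so that no page of those files can trigger a SIGBUS again
 * and overflow the queue. An address we cannot attribute to any window is fatal. */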
static void mmap_cache_process_sigbus(MMapCache *m) {
        bool found = false;
        MMapFileDescriptor *f;
        Iterator i;
        int r;

        assert(m);

        /* Iterate through all triggered pages and mark their files as
         * invalidated */
        for (;;) {
                bool ours;
                void *addr;

                r = sigbus_pop(&addr);
                if (_likely_(r == 0))
                        break;
                if (r < 0) {
                        log_error_errno(r, "SIGBUS handling failed: %m");
                        abort();
                }

                ours = false;
                HASHMAP_FOREACH(f, m->fds, i) {
                        Window *w;

                        LIST_FOREACH(by_fd, w, f->windows) {
                                if ((uint8_t*) addr >= (uint8_t*) w->ptr &&
                                    (uint8_t*) addr < (uint8_t*) w->ptr + w->size) {
                                        found = ours = f->sigbus = true;
                                        break;
                                }
                        }

                        if (ours)
                                break;
                }

                /* Didn't find a matching window, give up */
                if (!ours) {
                        log_error("Unknown SIGBUS page, aborting.");
                        abort();
                }
        }

        /* The list of triggered pages is now empty. Now, let's remap
         * all windows of the triggered file to anonymous maps, so
         * that no page of the file in question is triggered again, so
         * that we can be sure not to hit the queue size limit. */
        if (_likely_(!found))
                return;

        HASHMAP_FOREACH(f, m->fds, i) {
                Window *w;

                if (!f->sigbus)
                        continue;

                LIST_FOREACH(by_fd, w, f->windows)
                        window_invalidate(w);
        }
}

bool mmap_cache_got_sigbus(MMapCache *m, MMapFileDescriptor *f) {
        assert(m);
        assert(f);

        mmap_cache_process_sigbus(m);

        return f->sigbus;
}

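/* Look up or create the MMapFileDescriptor wrapper for a file descriptor and
 * register it in the cache's fd hashmap. Returns NULL on allocation failure.
 * The fd itself is neither duplicated nor closed by the cache. */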
MMapFileDescriptor* mmap_cache_add_fd(MMapCache *m, int fd) {
        MMapFileDescriptor *f;
        int r;

        assert(m);
        assert(fd >= 0);

        f = hashmap_get(m->fds, FD_TO_PTR(fd));
        if (f)
                return f;

        r = hashmap_ensure_allocated(&m->fds, NULL);
        if (r < 0)
                return NULL;

        f = new0(MMapFileDescriptor, 1);
        if (!f)
                return NULL;

        f->cache = m;
        f->fd = fd;

        r = hashmap_put(m->fds, FD_TO_PTR(fd), f);
        if (r < 0)
                return mfree(f);

        return f;
}

void mmap_cache_free_fd(MMapCache *m, MMapFileDescriptor *f) {
        assert(m);
        assert(f);

        /* Make sure that any queued SIGBUS are first dispatched, so
         * that we don't end up with a SIGBUS entry we cannot relate
         * to any existing memory map */

        mmap_cache_process_sigbus(m);

        while (f->windows)
                window_free(f->windows);

        if (f->cache)
                assert_se(hashmap_remove(f->cache->fds, FD_TO_PTR(f->fd)));

        free(f);
}