]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/libsystemd/sd-journal/mmap-cache.c
2c2f6c677d35763c381b443c6ef7d29b302be82e
[thirdparty/systemd.git] / src / libsystemd / sd-journal / mmap-cache.c
1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
2
3 #include <stdint.h>
4 #include <stdlib.h>
5 #include <sys/mman.h>
5
6 #include "alloc-util.h"
7 #include "bitfield.h"
8 #include "errno-util.h"
9 #include "fd-util.h"
10 #include "hashmap.h"
11 #include "list.h"
12 #include "log.h"
13 #include "memory-util.h"
14 #include "mmap-cache.h"
15 #include "sigbus.h"
16
17 typedef struct Window Window;
18
typedef enum WindowFlags {
        /* The lowest _MMAP_CACHE_CATEGORY_MAX bits record which categories currently reference the
         * window; the flags below are allocated above those per-category bits. */
        WINDOW_KEEP_ALWAYS = 1u << (_MMAP_CACHE_CATEGORY_MAX + 0), /* never reclaim this window */
        WINDOW_IN_UNUSED = 1u << (_MMAP_CACHE_CATEGORY_MAX + 1),   /* linked into MMapCache.unused */
        WINDOW_INVALIDATED = 1u << (_MMAP_CACHE_CATEGORY_MAX + 2), /* remapped to anonymous pages after SIGBUS */

        _WINDOW_USED_MASK = WINDOW_IN_UNUSED - 1, /* The mask contains all bits that indicate the window
                                                   * is currently in use. Covers all the category bits
                                                   * and the additional WINDOW_KEEP_ALWAYS flag. */
} WindowFlags;
28
29 #define WINDOW_IS_UNUSED(w) (((w)->flags & _WINDOW_USED_MASK) == 0)
30
/* One contiguous mmap()ed view into a file, shared by all categories that need this range. */
struct Window {
        MMapFileDescriptor *fd;  /* owning file descriptor object */

        WindowFlags flags;       /* per-category use bits plus the WINDOW_* flags above */

        void *ptr;               /* start of the mapping (NULL if not mapped) */
        uint64_t offset;         /* file offset the mapping starts at (page-aligned) */
        size_t size;             /* size of the mapping in bytes (page-aligned) */

        LIST_FIELDS(Window, windows);  /* membership in fd->windows */
        LIST_FIELDS(Window, unused);   /* membership in MMapCache.unused, iff WINDOW_IN_UNUSED */
};
43
/* Per-file state: a file descriptor registered with the cache and the windows mapped from it. */
struct MMapFileDescriptor {
        MMapCache *cache;  /* holds a reference on the cache (see mmap_cache_add_fd()) */

        int fd;            /* the file descriptor, not owned/closed by this object */
        int prot;          /* mmap() protection flags used for all windows of this fd */
        bool sigbus;       /* true once a SIGBUS was attributed to this file */

        LIST_HEAD(Window, windows);
};
53
struct MMapCache {
        unsigned n_ref;      /* reference count; each registered fd holds one reference */
        unsigned n_windows;  /* total number of windows across all fds */

        /* Statistics, reported by mmap_cache_stats_log_debug(). */
        unsigned n_category_cache_hit;
        unsigned n_window_list_hit;
        unsigned n_missed;

        Hashmap *fds;        /* FD_TO_PTR(fd) → MMapFileDescriptor */

        /* Windows currently referenced by no category. New entries are prepended, hence the head is
         * the most recently released window and last_unused (the tail) the best reclaim candidate. */
        LIST_HEAD(Window, unused);
        Window *last_unused;
        unsigned n_unused;

        Window *windows_by_category[_MMAP_CACHE_CATEGORY_MAX]; /* most recently used window per category */
};
70
/* Don't start reusing windows until at least this many exist / are unused. */
#define WINDOWS_MIN 64
#define UNUSED_MIN 4

#if ENABLE_DEBUG_MMAP_CACHE
/* Tiny windows increase mmap activity and the chance of exposing unsafe use. */
#  define WINDOW_SIZE (page_size())
#else
/* Default minimum window size: 8 MiB. */
#  define WINDOW_SIZE ((size_t) (UINT64_C(8) * UINT64_C(1024) * UINT64_C(1024)))
#endif
80
81 MMapCache* mmap_cache_new(void) {
82 MMapCache *m;
83
84 m = new(MMapCache, 1);
85 if (!m)
86 return NULL;
87
88 *m = (MMapCache) {
89 .n_ref = 1,
90 };
91
92 return m;
93 }
94
/* Detaches a window from everything: its mapping, the cache's unused list, any category slots that
 * point at it, and its fd's window list. The Window object itself is NOT freed — the caller may
 * reuse it (window_add()) or free it (window_free()). */
static Window* window_unlink(Window *w) {
        assert(w);

        MMapCache *m = mmap_cache_fd_cache(w->fd);

        if (w->ptr)
                munmap(w->ptr, w->size);

        if (FLAGS_SET(w->flags, WINDOW_IN_UNUSED)) {
                /* Keep last_unused pointing at the list tail when we remove the current tail. */
                if (m->last_unused == w)
                        m->last_unused = w->unused_prev;
                LIST_REMOVE(unused, m->unused, w);
                m->n_unused--;
        }

        /* Clear every category slot still referencing this window; the per-category flag bit and the
         * slot must agree, hence the assert_se(). */
        for (unsigned i = 0; i < _MMAP_CACHE_CATEGORY_MAX; i++)
                if (BIT_SET(w->flags, i))
                        assert_se(TAKE_PTR(m->windows_by_category[i]) == w);

        return LIST_REMOVE(windows, w->fd->windows, w);
}
116
/* Atomically replaces the window's file-backed mapping with anonymous zero pages at the same
 * address, so further accesses to the range can no longer raise SIGBUS. Idempotent. */
static void window_invalidate(Window *w) {
        assert(w);
        assert(w->fd);

        if (FLAGS_SET(w->flags, WINDOW_INVALIDATED))
                return;

        /* Replace the window with anonymous pages. This is useful when we hit a SIGBUS and want to make sure
         * the file cannot trigger any further SIGBUS, possibly overrunning the sigbus queue. */

        /* MAP_FIXED over the existing mapping cannot fail for this usage, hence assert_se(). */
        assert_se(mmap(w->ptr, w->size, w->fd->prot, MAP_PRIVATE|MAP_ANONYMOUS|MAP_FIXED, -1, 0) == w->ptr);
        w->flags |= WINDOW_INVALIDATED;
}
130
131 static Window* window_free(Window *w) {
132 if (!w)
133 return NULL;
134
135 window_unlink(w);
136 w->fd->cache->n_windows--;
137
138 return mfree(w);
139 }
140
141 static bool window_matches(Window *w, MMapFileDescriptor *f, uint64_t offset, size_t size) {
142 assert(size > 0);
143
144 return
145 w &&
146 f == w->fd &&
147 offset >= w->offset &&
148 offset + size <= w->offset + w->size;
149 }
150
151 static bool window_matches_by_addr(Window *w, MMapFileDescriptor *f, void *addr, size_t size) {
152 assert(size > 0);
153
154 return
155 w &&
156 f == w->fd &&
157 (uint8_t*) addr >= (uint8_t*) w->ptr &&
158 (uint8_t*) addr + size <= (uint8_t*) w->ptr + w->size;
159 }
160
161 static Window* window_add(MMapFileDescriptor *f, uint64_t offset, size_t size, void *ptr) {
162 MMapCache *m = mmap_cache_fd_cache(f);
163 Window *w;
164
165 if (!m->last_unused || m->n_windows < WINDOWS_MIN || m->n_unused < UNUSED_MIN) {
166 /* Allocate a new window */
167 w = new(Window, 1);
168 if (!w)
169 return NULL;
170 m->n_windows++;
171 } else
172 /* Reuse an existing one */
173 w = window_unlink(m->last_unused);
174
175 *w = (Window) {
176 .fd = f,
177 .offset = offset,
178 .size = size,
179 .ptr = ptr,
180 };
181
182 return LIST_PREPEND(windows, f->windows, w);
183 }
184
/* Drops category c's reference to its current window, if any. If that was the last reference, the
 * window is either freed immediately (debug builds) or moved to the head of the unused list for
 * later reuse. */
static void category_detach_window(MMapCache *m, MMapCacheCategory c) {
        Window *w;

        assert(m);
        assert(c >= 0 && c < _MMAP_CACHE_CATEGORY_MAX);

        w = TAKE_PTR(m->windows_by_category[c]);
        if (!w)
                return; /* Nothing attached. */

        assert(BIT_SET(w->flags, c));
        w->flags &= ~(1u << c);

        if (WINDOW_IS_UNUSED(w)) {
                /* Not used anymore? */
#if ENABLE_DEBUG_MMAP_CACHE
                /* Unmap unused windows immediately to expose use-after-unmap by SIGSEGV. */
                window_free(w);
#else
                /* Prepend, so last_unused (the tail) stays the least recently released window. */
                LIST_PREPEND(unused, m->unused, w);
                if (!m->last_unused)
                        m->last_unused = w;
                m->n_unused++;
                w->flags |= WINDOW_IN_UNUSED;
#endif
        }
}
212
/* Makes window w the current window of category c, detaching whatever the category pointed at
 * before and pulling w off the unused list if it was parked there. */
static void category_attach_window(MMapCache *m, MMapCacheCategory c, Window *w) {
        assert(m);
        assert(c >= 0 && c < _MMAP_CACHE_CATEGORY_MAX);
        assert(w);

        if (m->windows_by_category[c] == w)
                return; /* Already attached. */

        category_detach_window(m, c);

        if (FLAGS_SET(w->flags, WINDOW_IN_UNUSED)) {
                /* Used again? Keep last_unused pointing at the tail while removing. */
                if (m->last_unused == w)
                        m->last_unused = w->unused_prev;
                LIST_REMOVE(unused, m->unused, w);
                m->n_unused--;
                w->flags &= ~WINDOW_IN_UNUSED;
        }

        m->windows_by_category[c] = w;
        w->flags |= (1u << c);
}
235
/* Destructor, invoked by mmap_cache_unref() when the last reference is dropped. NULL-safe. */
static MMapCache* mmap_cache_free(MMapCache *m) {
        if (!m)
                return NULL;

        /* All windows are owned by fds, and each fd takes a reference of MMapCache. So, when this is called,
         * all fds are already freed, and hence there is no window. */

        assert(hashmap_isempty(m->fds));
        hashmap_free(m->fds);

        assert(!m->unused && m->n_unused == 0);
        assert(m->n_windows == 0);

        return mfree(m);
}
251
252 DEFINE_TRIVIAL_REF_UNREF_FUNC(MMapCache, mmap_cache, mmap_cache_free);
253
254 static int mmap_try_harder(MMapFileDescriptor *f, void *addr, int flags, uint64_t offset, size_t size, void **ret) {
255 MMapCache *m = mmap_cache_fd_cache(f);
256
257 assert(ret);
258
259 for (;;) {
260 void *ptr;
261
262 ptr = mmap(addr, size, f->prot, flags, f->fd, offset);
263 if (ptr != MAP_FAILED) {
264 *ret = ptr;
265 return 0;
266 }
267 if (errno != ENOMEM)
268 return negative_errno();
269
270 /* When failed with ENOMEM, try again after making a room by freeing an unused window. */
271
272 if (!m->last_unused)
273 return -ENOMEM; /* no free window, propagate the original error. */
274
275 window_free(m->last_unused);
276 }
277 }
278
/* Establishes a new window covering at least [offset, offset + size) of fd f's file, rounding to
 * page boundaries, growing small requests to WINDOW_SIZE (roughly centered on the request), and —
 * when 'st' is given — clamping the mapping to the file size. On success stores the new window in
 * *ret and returns 0; returns -EADDRNOTAVAIL for unmappable ranges, or a negative errno from
 * mmap(). */
static int add_mmap(
                MMapFileDescriptor *f,
                uint64_t offset,
                size_t size,
                struct stat *st,
                Window **ret) {

        Window *w;
        void *d;
        int r;

        assert(f);
        assert(size > 0);
        assert(ret);

        /* overflow check: 'size' grows below by the sub-page offset, which must not wrap. */
        if (size > SIZE_MAX - PAGE_OFFSET_U64(offset))
                return -EADDRNOTAVAIL;

        /* Align the range outward to full pages. */
        size = PAGE_ALIGN(size + PAGE_OFFSET_U64(offset));
        offset = PAGE_ALIGN_DOWN_U64(offset);

        if (size < WINDOW_SIZE) {
                uint64_t delta;

                /* Grow the mapping to WINDOW_SIZE with the requested range roughly in the middle;
                 * LESS_BY clamps at offset 0. */
                delta = PAGE_ALIGN((WINDOW_SIZE - size) / 2);
                offset = LESS_BY(offset, delta);
                size = WINDOW_SIZE;
        }

        if (st) {
                /* Memory maps that are larger then the files underneath have undefined behavior. Hence,
                 * clamp things to the file size if we know it */

                if (offset >= (uint64_t) st->st_size)
                        return -EADDRNOTAVAIL;

                if (size > (uint64_t) st->st_size - offset)
                        size = PAGE_ALIGN((uint64_t) st->st_size - offset);
        }

        if (size >= SIZE_MAX)
                return -EADDRNOTAVAIL;

        r = mmap_try_harder(f, NULL, MAP_SHARED, offset, size, &d);
        if (r < 0)
                return r;

        w = window_add(f, offset, size, d);
        if (!w) {
                /* Window bookkeeping failed; drop the mapping again. */
                (void) munmap(d, size);
                return -ENOMEM;
        }

        *ret = w;
        return 0;
}
336
/* Returns (in *ret) a pointer to the file range [offset, offset + size) of fd f, mapping it if
 * necessary. Lookup order: the category's cached window, then all windows of the fd, then a fresh
 * mmap. The winning window becomes category c's current window; with 'keep_always' it is
 * additionally pinned forever. Returns 0 on success, -EIO if the fd already hit a SIGBUS, or a
 * negative errno from add_mmap(). */
int mmap_cache_fd_get(
                MMapFileDescriptor *f,
                MMapCacheCategory c,
                bool keep_always,
                uint64_t offset,
                size_t size,
                struct stat *st,
                void **ret) {

        MMapCache *m = mmap_cache_fd_cache(f);
        Window *w;
        int r;

        assert(size > 0);
        assert(c >= 0 && c < _MMAP_CACHE_CATEGORY_MAX);
        assert(ret);

        if (f->sigbus)
                return -EIO;

        /* Check whether the current category is the right one already */
        if (window_matches(m->windows_by_category[c], f, offset, size)) {
                m->n_category_cache_hit++;
                w = m->windows_by_category[c];
                goto found;
        }

        /* Drop the reference to the window, since it's unnecessary now */
        category_detach_window(m, c);

        /* Search for a matching mmap */
        LIST_FOREACH(windows, i, f->windows)
                if (window_matches(i, f, offset, size)) {
                        m->n_window_list_hit++;
                        w = i;
                        goto found;
                }

        m->n_missed++;

        /* Create a new mmap */
        r = add_mmap(f, offset, size, st, &w);
        if (r < 0)
                return r;

found:
        if (keep_always)
                w->flags |= WINDOW_KEEP_ALWAYS;

        category_attach_window(m, c, w);
        /* Translate the file offset into an address within the (possibly larger) window. */
        *ret = (uint8_t*) w->ptr + (offset - w->offset);
        return 0;
}
390
/* Pins the window that backs the memory range [addr, addr + size) of fd f, so it is not reclaimed
 * while the caller holds the pointer. Returns 0 if the window is permanently kept anyway, 1 if it
 * was attached to the pinning category, -EIO if the fd hit a SIGBUS, or -EADDRNOTAVAIL if no
 * window covers the range. */
int mmap_cache_fd_pin(
                MMapFileDescriptor *f,
                MMapCacheCategory c,
                void *addr,
                size_t size) {

        MMapCache *m = mmap_cache_fd_cache(f);
        Window *w;

        assert(addr);
        assert(c >= 0 && c < _MMAP_CACHE_CATEGORY_MAX);
        assert(size > 0);

        if (f->sigbus)
                return -EIO;

        /* Check if the current category is the right one. */
        if (window_matches_by_addr(m->windows_by_category[c], f, addr, size)) {
                m->n_category_cache_hit++;
                w = m->windows_by_category[c];
                goto found;
        }

        /* Search for a matching mmap. */
        LIST_FOREACH(windows, i, f->windows)
                if (window_matches_by_addr(i, f, addr, size)) {
                        m->n_window_list_hit++;
                        w = i;
                        goto found;
                }

        m->n_missed++;
        return -EADDRNOTAVAIL; /* Not found. */

found:
        if (FLAGS_SET(w->flags, WINDOW_KEEP_ALWAYS))
                return 0; /* The window will never be unmapped. */

        /* Attach the window to the 'pinning' category. */
        category_attach_window(m, MMAP_CACHE_CATEGORY_PIN, w);
        return 1;
}
433
/* Emits the cache's hit/miss counters and current sizes at debug log level. */
void mmap_cache_stats_log_debug(MMapCache *m) {
        assert(m);

        log_debug("mmap cache statistics: %u category cache hit, %u window list hit, %u miss, %u files, %u windows, %u unused",
                  m->n_category_cache_hit, m->n_window_list_hit, m->n_missed, hashmap_size(m->fds), m->n_windows, m->n_unused);
}
440
/* Drains the queue of SIGBUS fault addresses (see sigbus_pop()), attributes each address to the
 * file whose window contains it and marks that file's 'sigbus' flag. Aborts if the queue cannot be
 * read or a fault address belongs to none of our mappings. Finally all windows of affected files
 * are remapped to anonymous pages so no further SIGBUS can be raised for them. */
static void mmap_cache_process_sigbus(MMapCache *m) {
        bool found = false;
        MMapFileDescriptor *f;
        int r;

        assert(m);

        /* Iterate through all triggered pages and mark their files as invalidated. */
        for (;;) {
                bool ours;
                void *addr;

                r = sigbus_pop(&addr);
                if (_likely_(r == 0))
                        break;
                if (r < 0) {
                        log_error_errno(r, "SIGBUS handling failed: %m");
                        abort();
                }

                /* Find the window (and hence the file) this fault address falls into. */
                ours = false;
                HASHMAP_FOREACH(f, m->fds) {
                        LIST_FOREACH(windows, w, f->windows)
                                if (window_matches_by_addr(w, f, addr, 1)) {
                                        found = ours = f->sigbus = true;
                                        break;
                                }

                        if (ours)
                                break;
                }

                /* Didn't find a matching window, give up. */
                if (!ours) {
                        log_error("Unknown SIGBUS page, aborting.");
                        abort();
                }
        }

        /* The list of triggered pages is now empty. Now, let's remap all windows of the triggered file to
         * anonymous maps, so that no page of the file in question is triggered again, so that we can be sure
         * not to hit the queue size limit. */
        if (_likely_(!found))
                return;

        HASHMAP_FOREACH(f, m->fds) {
                if (!f->sigbus)
                        continue;

                LIST_FOREACH(windows, w, f->windows)
                        window_invalidate(w);
        }
}
494
/* Returns whether fd f was hit by a SIGBUS, after first processing any queued SIGBUS events so the
 * flag is up to date. */
bool mmap_cache_fd_got_sigbus(MMapFileDescriptor *f) {
        assert(f);

        mmap_cache_process_sigbus(f->cache);

        return f->sigbus;
}
502
/* Registers file descriptor 'fd' with the cache, using 'prot' for all its future mappings. If the
 * fd is already registered with the same prot, the existing object is returned (in *ret, if given)
 * and 0; with a different prot, -EEXIST. A newly registered fd takes a reference on the cache;
 * returns 1 (or -ENOMEM). The fd itself is not duplicated or owned. */
int mmap_cache_add_fd(MMapCache *m, int fd, int prot, MMapFileDescriptor **ret) {
        _cleanup_free_ MMapFileDescriptor *f = NULL;
        MMapFileDescriptor *existing;
        int r;

        assert(m);
        assert(fd >= 0);

        existing = hashmap_get(m->fds, FD_TO_PTR(fd));
        if (existing) {
                if (existing->prot != prot)
                        return -EEXIST;
                if (ret)
                        *ret = existing;
                return 0;
        }

        f = new(MMapFileDescriptor, 1);
        if (!f)
                return -ENOMEM;

        *f = (MMapFileDescriptor) {
                .fd = fd,
                .prot = prot,
        };

        r = hashmap_ensure_put(&m->fds, NULL, FD_TO_PTR(fd), f);
        if (r < 0)
                return r;
        assert(r > 0);

        f->cache = mmap_cache_ref(m);

        if (ret)
                *ret = f;

        /* Ownership moved into the hashmap; disarm the cleanup handler. */
        TAKE_PTR(f);
        return 1;
}
542
/* Deregisters fd f from its cache, freeing all its windows and dropping the cache reference it
 * held. Does not close the underlying file descriptor. NULL-safe; returns NULL. */
MMapFileDescriptor* mmap_cache_fd_free(MMapFileDescriptor *f) {
        if (!f)
                return NULL;

        /* Make sure that any queued SIGBUS are first dispatched, so that we don't end up with a SIGBUS entry
         * we cannot relate to any existing memory map. */

        mmap_cache_process_sigbus(f->cache);

        while (f->windows)
                window_free(f->windows);

        assert_se(hashmap_remove(f->cache->fds, FD_TO_PTR(f->fd)) == f);

        /* Unref the cache at the end. Otherwise, the assertions in mmap_cache_free() may be triggered. */
        f->cache = mmap_cache_unref(f->cache);

        return mfree(f);
}
562
/* Returns the cache fd f is registered with; asserts it is non-NULL. */
MMapCache* mmap_cache_fd_cache(MMapFileDescriptor *f) {
        assert(f);
        return ASSERT_PTR(f->cache);
}