]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/libsystemd/sd-journal/mmap-cache.c
mmap-cache: merge mmap_cache_fd_get() with try_context() and find_mmap()
[thirdparty/systemd.git] / src / libsystemd / sd-journal / mmap-cache.c
CommitLineData
db9ecf05 1/* SPDX-License-Identifier: LGPL-2.1-or-later */
16e9f408 2
16e9f408
LP
3#include <errno.h>
4#include <stdlib.h>
f8019684 5#include <sys/mman.h>
16e9f408 6
b5efdb8a 7#include "alloc-util.h"
2b2fec7d 8#include "errno-util.h"
23e096cc 9#include "fd-util.h"
f8019684
LP
10#include "hashmap.h"
11#include "list.h"
12#include "log.h"
f8019684 13#include "macro.h"
0a970718 14#include "memory-util.h"
16e9f408 15#include "mmap-cache.h"
cf0fbc49 16#include "sigbus.h"
16e9f408 17
f8019684
LP
18typedef struct Window Window;
19typedef struct Context Context;
84168d80 20
f8019684
LP
21struct Window {
22 MMapCache *cache;
23
739731cd
LP
24 bool invalidated:1;
25 bool keep_always:1;
26 bool in_unused:1;
16e9f408 27
16e9f408
LP
28 void *ptr;
29 uint64_t offset;
f8019684
LP
30 size_t size;
31
be7cdd8e 32 MMapFileDescriptor *fd;
16e9f408 33
f8019684
LP
34 LIST_FIELDS(Window, by_fd);
35 LIST_FIELDS(Window, unused);
36
37 LIST_HEAD(Context, contexts);
38};
16e9f408 39
f8019684 40struct Context {
f8019684 41 Window *window;
16e9f408 42
f8019684
LP
43 LIST_FIELDS(Context, by_window);
44};
45
be7cdd8e 46struct MMapFileDescriptor {
f8019684 47 MMapCache *cache;
16e9f408 48 int fd;
104fc4be 49 int prot;
fa6ac760 50 bool sigbus;
f8019684
LP
51 LIST_HEAD(Window, windows);
52};
16e9f408
LP
53
54struct MMapCache {
cf4b2f99 55 unsigned n_ref;
68667801 56 unsigned n_windows;
16e9f408 57
8fc4d1be
YW
58 unsigned n_context_cache_hit;
59 unsigned n_window_list_hit;
60 unsigned n_missed;
bf807d4d 61
f8019684 62 Hashmap *fds;
16e9f408 63
f8019684
LP
64 LIST_HEAD(Window, unused);
65 Window *last_unused;
7580b0d8
VC
66
67 Context contexts[MMAP_CACHE_MAX_CONTEXTS];
16e9f408
LP
68};
69
/* As long as no more than this many windows are allocated, window_add() allocates a new window instead of
 * recycling an unused one. */
#define WINDOWS_MIN 64

/* Minimum size of a newly created window. */
#if ENABLE_DEBUG_MMAP_CACHE
/* Tiny windows increase mmap activity and the chance of exposing unsafe use. */
#  define WINDOW_SIZE (page_size())
#else
/* 8 MiB */
#  define WINDOW_SIZE (8ULL*1024ULL*1024ULL)
#endif
16e9f408 78
f8019684
LP
79MMapCache* mmap_cache_new(void) {
80 MMapCache *m;
16e9f408 81
397caa81 82 m = new(MMapCache, 1);
f8019684
LP
83 if (!m)
84 return NULL;
16e9f408 85
397caa81
YW
86 *m = (MMapCache) {
87 .n_ref = 1,
88 };
89
f8019684 90 return m;
16e9f408
LP
91}
92
b38977e5 93static Window* window_unlink(Window *w) {
f8019684 94 assert(w);
16e9f408 95
b38977e5
YW
96 MMapCache *m = mmap_cache_fd_cache(w->fd);
97
f8019684
LP
98 if (w->ptr)
99 munmap(w->ptr, w->size);
16e9f408 100
f8019684 101 if (w->in_unused) {
b38977e5
YW
102 if (m->last_unused == w)
103 m->last_unused = w->unused_prev;
16e9f408 104
b38977e5 105 LIST_REMOVE(unused, m->unused, w);
f65425cb 106 }
16e9f408 107
f8019684
LP
108 LIST_FOREACH(by_window, c, w->contexts) {
109 assert(c->window == w);
110 c->window = NULL;
f65425cb 111 }
b38977e5
YW
112
113 return LIST_REMOVE(by_fd, w->fd->windows, w);
16e9f408
LP
114}
115
fa6ac760
LP
116static void window_invalidate(Window *w) {
117 assert(w);
104fc4be 118 assert(w->fd);
fa6ac760
LP
119
120 if (w->invalidated)
121 return;
122
8fc4d1be
YW
123 /* Replace the window with anonymous pages. This is useful when we hit a SIGBUS and want to make sure
124 * the file cannot trigger any further SIGBUS, possibly overrunning the sigbus queue. */
fa6ac760 125
104fc4be 126 assert_se(mmap(w->ptr, w->size, w->fd->prot, MAP_PRIVATE|MAP_ANONYMOUS|MAP_FIXED, -1, 0) == w->ptr);
fa6ac760
LP
127 w->invalidated = true;
128}
129
b38977e5
YW
130static Window* window_free(Window *w) {
131 if (!w)
132 return NULL;
f65425cb 133
f8019684 134 window_unlink(w);
89de6947 135 w->cache->n_windows--;
b38977e5
YW
136
137 return mfree(w);
f8019684 138}
f65425cb 139
81598f5e 140static bool window_matches(Window *w, MMapFileDescriptor *f, uint64_t offset, size_t size) {
f8019684 141 assert(size > 0);
16e9f408 142
f8019684 143 return
81598f5e
YW
144 w &&
145 f == w->fd &&
f8019684
LP
146 offset >= w->offset &&
147 offset + size <= w->offset + w->size;
16e9f408
LP
148}
149
40f5e6a9
YW
150static Window* window_add(MMapFileDescriptor *f, uint64_t offset, size_t size, void *ptr) {
151 MMapCache *m = mmap_cache_fd_cache(f);
f8019684 152 Window *w;
16e9f408 153
f8019684 154 if (!m->last_unused || m->n_windows <= WINDOWS_MIN) {
f8019684 155 /* Allocate a new window */
41ab8c67 156 w = new(Window, 1);
f8019684
LP
157 if (!w)
158 return NULL;
89de6947 159 m->n_windows++;
b38977e5 160 } else
f8019684 161 /* Reuse an existing one */
b38977e5 162 w = window_unlink(m->last_unused);
f8019684 163
41ab8c67
LP
164 *w = (Window) {
165 .cache = m,
166 .fd = f,
41ab8c67
LP
167 .offset = offset,
168 .size = size,
169 .ptr = ptr,
170 };
6a491490 171
40f5e6a9 172 return LIST_PREPEND(by_fd, f->windows, w);
16e9f408
LP
173}
174
7580b0d8 175static void context_detach_window(MMapCache *m, Context *c) {
f8019684 176 Window *w;
16e9f408 177
7580b0d8 178 assert(m);
f8019684 179 assert(c);
16e9f408 180
f8019684 181 if (!c->window)
16e9f408
LP
182 return;
183
ae2a15bc 184 w = TAKE_PTR(c->window);
71fda00f 185 LIST_REMOVE(by_window, w->contexts, c);
16e9f408 186
1b8951e5 187 if (!w->contexts && !w->keep_always) {
f8019684 188 /* Not used anymore? */
349cc4a5 189#if ENABLE_DEBUG_MMAP_CACHE
8fc4d1be 190 /* Unmap unused windows immediately to expose use-after-unmap by SIGSEGV. */
fad5a6c6
MS
191 window_free(w);
192#else
7580b0d8
VC
193 LIST_PREPEND(unused, m->unused, w);
194 if (!m->last_unused)
195 m->last_unused = w;
16e9f408 196
f8019684 197 w->in_unused = true;
fad5a6c6 198#endif
f8019684 199 }
16e9f408
LP
200}
201
7580b0d8
VC
202static void context_attach_window(MMapCache *m, Context *c, Window *w) {
203 assert(m);
f8019684
LP
204 assert(c);
205 assert(w);
16e9f408 206
f8019684 207 if (c->window == w)
16e9f408
LP
208 return;
209
7580b0d8 210 context_detach_window(m, c);
16e9f408 211
e18021f7 212 if (w->in_unused) {
f8019684 213 /* Used again? */
7580b0d8
VC
214 if (m->last_unused == w)
215 m->last_unused = w->unused_prev;
216 LIST_REMOVE(unused, m->unused, w);
16e9f408 217
f8019684
LP
218 w->in_unused = false;
219 }
f65425cb 220
f8019684 221 c->window = w;
71fda00f 222 LIST_PREPEND(by_window, w->contexts, c);
16e9f408
LP
223}
224
b38977e5
YW
225static MMapCache* mmap_cache_free(MMapCache *m) {
226 if (!m)
227 return NULL;
16e9f408 228
b38977e5
YW
229 /* All windows are owned by fds, and each fd takes a reference of MMapCache. So, when this is called,
230 * all fds are already freed, and hence there is no window. */
8e6d9397 231
b38977e5 232 assert(hashmap_isempty(m->fds));
8e6d9397
GM
233 hashmap_free(m->fds);
234
b38977e5
YW
235 assert(!m->unused);
236 assert(m->n_windows == 0);
f8019684 237
8301aa0b 238 return mfree(m);
16e9f408
LP
239}
240
8301aa0b 241DEFINE_TRIVIAL_REF_UNREF_FUNC(MMapCache, mmap_cache, mmap_cache_free);
16e9f408 242
1ed867d3
YW
243static int mmap_try_harder(MMapFileDescriptor *f, void *addr, int flags, uint64_t offset, size_t size, void **ret) {
244 MMapCache *m = mmap_cache_fd_cache(f);
db87967e 245
1ed867d3 246 assert(ret);
db87967e
VC
247
248 for (;;) {
1ed867d3 249 void *ptr;
db87967e 250
104fc4be 251 ptr = mmap(addr, size, f->prot, flags, f->fd, offset);
1ed867d3
YW
252 if (ptr != MAP_FAILED) {
253 *ret = ptr;
254 return 0;
255 }
db87967e 256 if (errno != ENOMEM)
3f0083a2 257 return negative_errno();
db87967e 258
1ed867d3
YW
259 /* When failed with ENOMEM, try again after making a room by freeing an unused window. */
260
261 if (!m->last_unused)
262 return -ENOMEM; /* no free window, propagate the original error. */
db87967e 263
1ed867d3
YW
264 window_free(m->last_unused);
265 }
db87967e
VC
266}
267
f8019684 268static int add_mmap(
be7cdd8e 269 MMapFileDescriptor *f,
16e9f408 270 uint64_t offset,
f8019684 271 size_t size,
fcde2389 272 struct stat *st,
40f5e6a9 273 Window **ret) {
16e9f408 274
16e9f408 275 uint64_t woffset, wsize;
f8019684
LP
276 Window *w;
277 void *d;
16e9f408
LP
278 int r;
279
be7cdd8e 280 assert(f);
16e9f408 281 assert(size > 0);
1b8951e5 282 assert(ret);
16e9f408
LP
283
284 woffset = offset & ~((uint64_t) page_size() - 1ULL);
285 wsize = size + (offset - woffset);
286 wsize = PAGE_ALIGN(wsize);
287
288 if (wsize < WINDOW_SIZE) {
289 uint64_t delta;
290
beec0085 291 delta = PAGE_ALIGN((WINDOW_SIZE - wsize) / 2);
16e9f408
LP
292
293 if (delta > offset)
294 woffset = 0;
295 else
296 woffset -= delta;
297
298 wsize = WINDOW_SIZE;
299 }
300
fcde2389 301 if (st) {
8fc4d1be
YW
302 /* Memory maps that are larger then the files underneath have undefined behavior. Hence,
303 * clamp things to the file size if we know it */
fcde2389
LP
304
305 if (woffset >= (uint64_t) st->st_size)
306 return -EADDRNOTAVAIL;
307
308 if (woffset + wsize > (uint64_t) st->st_size)
309 wsize = PAGE_ALIGN(st->st_size - woffset);
310 }
311
1da2c4ce 312 r = mmap_try_harder(f, NULL, MAP_SHARED, woffset, wsize, &d);
db87967e
VC
313 if (r < 0)
314 return r;
16e9f408 315
40f5e6a9
YW
316 w = window_add(f, woffset, wsize, d);
317 if (!w) {
318 (void) munmap(d, wsize);
319 return -ENOMEM;
320 }
b67ddc7b 321
40f5e6a9
YW
322 *ret = w;
323 return 0;
16e9f408
LP
324}
325
c3bd54bf 326int mmap_cache_fd_get(
be7cdd8e 327 MMapFileDescriptor *f,
16e9f408 328 unsigned context,
fcde2389 329 bool keep_always,
16e9f408 330 uint64_t offset,
f8019684 331 size_t size,
fcde2389 332 struct stat *st,
258190a0 333 void **ret) {
16e9f408 334
40f5e6a9 335 MMapCache *m = mmap_cache_fd_cache(f);
7580b0d8 336 Context *c;
40f5e6a9 337 Window *w;
16e9f408
LP
338 int r;
339
40f5e6a9 340 assert(context < MMAP_CACHE_MAX_CONTEXTS);
16e9f408 341 assert(size > 0);
1b8951e5 342 assert(ret);
40f5e6a9
YW
343
344 if (f->sigbus)
345 return -EIO;
16e9f408 346
7580b0d8
VC
347 c = &f->cache->contexts[context];
348
f8019684 349 /* Check whether the current context is the right one already */
40f5e6a9
YW
350 if (window_matches(c->window, f, offset, size)) {
351 m->n_context_cache_hit++;
352 w = c->window;
353 goto found;
354 }
355
356 /* Drop the reference to the window, since it's unnecessary now */
357 context_detach_window(m, c);
16e9f408 358
f8019684 359 /* Search for a matching mmap */
40f5e6a9
YW
360 LIST_FOREACH(by_fd, i, f->windows)
361 if (window_matches(i, f, offset, size)) {
362 m->n_window_list_hit++;
363 w = i;
364 goto found;
365 }
bf807d4d 366
40f5e6a9 367 m->n_missed++;
16e9f408 368
f8019684 369 /* Create a new mmap */
40f5e6a9
YW
370 r = add_mmap(f, offset, size, st, &w);
371 if (r < 0)
372 return r;
373
374found:
375 w->keep_always = w->keep_always || keep_always;
376 context_attach_window(m, c, w);
377 *ret = (uint8_t*) w->ptr + (offset - w->offset);
378 return 0;
ae97089d
ZJS
379}
380
3a595c59 381void mmap_cache_stats_log_debug(MMapCache *m) {
fa6ac760
LP
382 assert(m);
383
8fc4d1be
YW
384 log_debug("mmap cache statistics: %u context cache hit, %u window list hit, %u miss",
385 m->n_context_cache_hit, m->n_window_list_hit, m->n_missed);
fa6ac760
LP
386}
387
388static void mmap_cache_process_sigbus(MMapCache *m) {
389 bool found = false;
be7cdd8e 390 MMapFileDescriptor *f;
fa6ac760 391 int r;
16e9f408
LP
392
393 assert(m);
16e9f408 394
8fc4d1be 395 /* Iterate through all triggered pages and mark their files as invalidated. */
fa6ac760
LP
396 for (;;) {
397 bool ours;
398 void *addr;
399
400 r = sigbus_pop(&addr);
401 if (_likely_(r == 0))
402 break;
403 if (r < 0) {
404 log_error_errno(r, "SIGBUS handling failed: %m");
405 abort();
406 }
407
408 ours = false;
90e74a66 409 HASHMAP_FOREACH(f, m->fds) {
fa6ac760
LP
410 LIST_FOREACH(by_fd, w, f->windows) {
411 if ((uint8_t*) addr >= (uint8_t*) w->ptr &&
412 (uint8_t*) addr < (uint8_t*) w->ptr + w->size) {
413 found = ours = f->sigbus = true;
414 break;
415 }
416 }
417
418 if (ours)
419 break;
420 }
421
8fc4d1be 422 /* Didn't find a matching window, give up. */
fa6ac760
LP
423 if (!ours) {
424 log_error("Unknown SIGBUS page, aborting.");
425 abort();
426 }
427 }
428
8fc4d1be
YW
429 /* The list of triggered pages is now empty. Now, let's remap all windows of the triggered file to
430 * anonymous maps, so that no page of the file in question is triggered again, so that we can be sure
431 * not to hit the queue size limit. */
fa6ac760 432 if (_likely_(!found))
16e9f408 433 return;
16e9f408 434
90e74a66 435 HASHMAP_FOREACH(f, m->fds) {
fa6ac760
LP
436 if (!f->sigbus)
437 continue;
438
439 LIST_FOREACH(by_fd, w, f->windows)
440 window_invalidate(w);
441 }
f8019684 442}
16e9f408 443
c3bd54bf 444bool mmap_cache_fd_got_sigbus(MMapFileDescriptor *f) {
be7cdd8e 445 assert(f);
bf807d4d 446
1da2c4ce 447 mmap_cache_process_sigbus(f->cache);
fa6ac760 448
fa6ac760 449 return f->sigbus;
bf807d4d
LP
450}
451
8926a6a4
YW
452int mmap_cache_add_fd(MMapCache *m, int fd, int prot, MMapFileDescriptor **ret) {
453 _cleanup_free_ MMapFileDescriptor *f = NULL;
454 MMapFileDescriptor *existing;
be7cdd8e 455 int r;
fa6ac760 456
bf807d4d 457 assert(m);
fa6ac760 458 assert(fd >= 0);
bf807d4d 459
8926a6a4
YW
460 existing = hashmap_get(m->fds, FD_TO_PTR(fd));
461 if (existing) {
8ff0f36e
YW
462 if (existing->prot != prot)
463 return -EEXIST;
8926a6a4
YW
464 if (ret)
465 *ret = existing;
466 return 0;
467 }
be7cdd8e 468
8926a6a4 469 f = new(MMapFileDescriptor, 1);
be7cdd8e 470 if (!f)
8926a6a4
YW
471 return -ENOMEM;
472
473 *f = (MMapFileDescriptor) {
474 .fd = fd,
475 .prot = prot,
476 };
be7cdd8e 477
8926a6a4 478 r = hashmap_ensure_put(&m->fds, NULL, FD_TO_PTR(fd), f);
be7cdd8e 479 if (r < 0)
8926a6a4
YW
480 return r;
481 assert(r > 0);
be7cdd8e 482
fd9ac6c3 483 f->cache = mmap_cache_ref(m);
fd9ac6c3 484
8926a6a4
YW
485 if (ret)
486 *ret = f;
487
488 TAKE_PTR(f);
489 return 1;
be7cdd8e
VC
490}
491
b38977e5
YW
492MMapFileDescriptor* mmap_cache_fd_free(MMapFileDescriptor *f) {
493 if (!f)
494 return NULL;
be7cdd8e 495
8fc4d1be
YW
496 /* Make sure that any queued SIGBUS are first dispatched, so that we don't end up with a SIGBUS entry
497 * we cannot relate to any existing memory map. */
fa6ac760 498
1da2c4ce 499 mmap_cache_process_sigbus(f->cache);
fa6ac760 500
be7cdd8e
VC
501 while (f->windows)
502 window_free(f->windows);
503
b38977e5 504 assert_se(hashmap_remove(f->cache->fds, FD_TO_PTR(f->fd)) == f);
fa6ac760 505
b38977e5
YW
506 /* Unref the cache at the end. Otherwise, the assertions in mmap_cache_free() may be triggered. */
507 f->cache = mmap_cache_unref(f->cache);
508
509 return mfree(f);
bf807d4d 510}
176bf8b8
VC
511
512MMapCache* mmap_cache_fd_cache(MMapFileDescriptor *f) {
513 assert(f);
b38977e5 514 return ASSERT_PTR(f->cache);
176bf8b8 515}