/* src/libsystemd/sd-journal/mmap-cache.c */
1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
2
3 #include <errno.h>
4 #include <stdlib.h>
5 #include <sys/mman.h>
6
7 #include "alloc-util.h"
8 #include "errno-util.h"
9 #include "fd-util.h"
10 #include "hashmap.h"
11 #include "list.h"
12 #include "log.h"
13 #include "macro.h"
14 #include "memory-util.h"
15 #include "mmap-cache.h"
16 #include "sigbus.h"
17
typedef struct Window Window;
typedef struct Context Context;

/* One mmap()ed view into a file. A window is owned by its MMapFileDescriptor
 * (linked in fd->windows) and, when no context references it anymore, may
 * additionally be queued on the cache-wide "unused" list for later reuse. */
struct Window {
        MMapFileDescriptor *fd;   /* the file this window maps */

        bool invalidated:1;       /* mapping was replaced by anonymous pages after a SIGBUS */
        bool keep_always:1;       /* pinned: never moved to the unused list */
        bool in_unused:1;         /* currently linked in the cache's unused list */

        void *ptr;                /* start address of the mapping */
        uint64_t offset;          /* file offset the mapping begins at */
        size_t size;              /* length of the mapping in bytes */

        LIST_FIELDS(Window, windows);   /* linkage in fd->windows */
        LIST_FIELDS(Window, unused);    /* linkage in MMapCache's unused list */

        LIST_HEAD(Context, contexts);   /* all contexts currently pointing at this window */
};
37
/* A per-context cursor: remembers the window served to this context last, so
 * repeated accesses to the same region skip the window-list search entirely
 * (the "context cache hit" path in mmap_cache_fd_get()). */
struct Context {
        Window *window;           /* window this context is attached to, or NULL */

        LIST_FIELDS(Context, by_window);  /* linkage in window->contexts */
};
43
/* Per-file state. Each registered fd owns its windows and holds a reference
 * on the cache — taken in mmap_cache_add_fd(), dropped in mmap_cache_fd_free(). */
struct MMapFileDescriptor {
        MMapCache *cache;
        int fd;
        int prot;                 /* protection flags used for all mmap() calls on this fd */
        bool sigbus;              /* a SIGBUS was triggered by a page of this file */
        LIST_HEAD(Window, windows);
};
51
struct MMapCache {
        unsigned n_ref;           /* reference counter, see DEFINE_TRIVIAL_REF_UNREF_FUNC below */
        unsigned n_windows;       /* number of currently allocated Window objects */

        /* Statistics, reported via mmap_cache_stats_log_debug() */
        unsigned n_context_cache_hit;
        unsigned n_window_list_hit;
        unsigned n_missed;

        Hashmap *fds;             /* maps fd (packed via FD_TO_PTR()) → MMapFileDescriptor */

        LIST_HEAD(Window, unused);  /* windows not referenced by any context */
        Window *last_unused;        /* tail of the unused list — recycled/freed first */

        Context contexts[MMAP_CACHE_MAX_CONTEXTS];
};
67
/* Keep allocating fresh windows until at least this many exist; only beyond
 * this threshold does window_add() start recycling unused ones. */
#define WINDOWS_MIN 64

#if ENABLE_DEBUG_MMAP_CACHE
/* Tiny windows increase mmap activity and the chance of exposing unsafe use. */
# define WINDOW_SIZE (page_size())
#else
/* Default window size: 8 MiB per mapping. */
# define WINDOW_SIZE (8ULL*1024ULL*1024ULL)
#endif
76
77 MMapCache* mmap_cache_new(void) {
78 MMapCache *m;
79
80 m = new(MMapCache, 1);
81 if (!m)
82 return NULL;
83
84 *m = (MMapCache) {
85 .n_ref = 1,
86 };
87
88 return m;
89 }
90
/* Detaches a window from everything that references it: unmaps its memory,
 * dequeues it from the unused list (if queued), clears it from every context
 * pointing at it, and removes it from its fd's window list. The Window object
 * itself stays allocated — callers either free it (window_free()) or reuse it
 * (window_add()). */
static Window* window_unlink(Window *w) {
        assert(w);

        MMapCache *m = mmap_cache_fd_cache(w->fd);

        if (w->ptr)
                munmap(w->ptr, w->size);

        if (w->in_unused) {
                /* Keep last_unused pointing at the list tail. */
                if (m->last_unused == w)
                        m->last_unused = w->unused_prev;

                LIST_REMOVE(unused, m->unused, w);
        }

        /* Any context still attached to us must forget us. */
        LIST_FOREACH(by_window, c, w->contexts) {
                assert(c->window == w);
                c->window = NULL;
        }

        return LIST_REMOVE(windows, w->fd->windows, w);
}
113
/* Replaces the window's mapping with anonymous zero pages, in place. */
static void window_invalidate(Window *w) {
        assert(w);
        assert(w->fd);

        if (w->invalidated)
                return;

        /* Replace the window with anonymous pages. This is useful when we hit a SIGBUS and want to make sure
         * the file cannot trigger any further SIGBUS, possibly overrunning the sigbus queue. */

        /* MAP_FIXED over the existing range cannot fail here except for kernel
         * resource exhaustion, hence assert_se(). */
        assert_se(mmap(w->ptr, w->size, w->fd->prot, MAP_PRIVATE|MAP_ANONYMOUS|MAP_FIXED, -1, 0) == w->ptr);
        w->invalidated = true;
}
127
128 static Window* window_free(Window *w) {
129 if (!w)
130 return NULL;
131
132 window_unlink(w);
133 w->fd->cache->n_windows--;
134
135 return mfree(w);
136 }
137
138 static bool window_matches(Window *w, MMapFileDescriptor *f, uint64_t offset, size_t size) {
139 assert(size > 0);
140
141 return
142 w &&
143 f == w->fd &&
144 offset >= w->offset &&
145 offset + size <= w->offset + w->size;
146 }
147
/* Wraps an already established mapping (ptr, covering [offset, offset+size)
 * of f) in a Window object and prepends it to f's window list. Below the
 * WINDOWS_MIN threshold a fresh Window is allocated; above it, the least
 * recently used unused window is recycled instead. Returns NULL on OOM. */
static Window* window_add(MMapFileDescriptor *f, uint64_t offset, size_t size, void *ptr) {
        MMapCache *m = mmap_cache_fd_cache(f);
        Window *w;

        if (!m->last_unused || m->n_windows <= WINDOWS_MIN) {
                /* Allocate a new window */
                w = new(Window, 1);
                if (!w)
                        return NULL;
                m->n_windows++;
        } else
                /* Reuse an existing one (window_unlink() leaves the object allocated) */
                w = window_unlink(m->last_unused);

        *w = (Window) {
                .fd = f,
                .offset = offset,
                .size = size,
                .ptr = ptr,
        };

        return LIST_PREPEND(windows, f->windows, w);
}
171
/* Drops the context's reference to its current window, if any. If this was
 * the last context referencing the window (and the window is not pinned via
 * keep_always), the window is queued on the unused list for later reuse —
 * or, in debug builds, freed immediately. */
static void context_detach_window(MMapCache *m, Context *c) {
        Window *w;

        assert(m);
        assert(c);

        if (!c->window)
                return;

        w = TAKE_PTR(c->window);
        LIST_REMOVE(by_window, w->contexts, c);

        if (!w->contexts && !w->keep_always) {
                /* Not used anymore? */
#if ENABLE_DEBUG_MMAP_CACHE
                /* Unmap unused windows immediately to expose use-after-unmap by SIGSEGV. */
                window_free(w);
#else
                /* Prepend to the unused list; last_unused tracks the tail. */
                LIST_PREPEND(unused, m->unused, w);
                if (!m->last_unused)
                        m->last_unused = w;

                w->in_unused = true;
#endif
        }
}
198
/* Points the context at window w, detaching it from its previous window
 * first. If w was sitting on the unused list it is rescued from there, since
 * it now has a user again. No-op if the context is already attached to w. */
static void context_attach_window(MMapCache *m, Context *c, Window *w) {
        assert(m);
        assert(c);
        assert(w);

        if (c->window == w)
                return;

        context_detach_window(m, c);

        if (w->in_unused) {
                /* Used again? Take it off the unused list, fixing up the tail pointer. */
                if (m->last_unused == w)
                        m->last_unused = w->unused_prev;
                LIST_REMOVE(unused, m->unused, w);

                w->in_unused = false;
        }

        c->window = w;
        LIST_PREPEND(by_window, w->contexts, c);
}
221
/* Destroys the cache. Only reachable once the last reference is dropped,
 * which — since every registered fd holds a reference — implies all fds, and
 * with them all windows, are already gone. Accepts NULL; returns NULL. */
static MMapCache* mmap_cache_free(MMapCache *m) {
        if (!m)
                return NULL;

        /* All windows are owned by fds, and each fd takes a reference of MMapCache. So, when this is called,
         * all fds are already freed, and hence there is no window. */

        assert(hashmap_isempty(m->fds));
        hashmap_free(m->fds);

        assert(!m->unused);
        assert(m->n_windows == 0);

        return mfree(m);
}

/* Generates mmap_cache_ref() and mmap_cache_unref() around n_ref. */
DEFINE_TRIVIAL_REF_UNREF_FUNC(MMapCache, mmap_cache, mmap_cache_free);
239
/* mmap()s the given range of f, retrying on ENOMEM after freeing one unused
 * window per iteration to make room. Returns 0 on success (mapping stored in
 * *ret) or a negative errno. */
static int mmap_try_harder(MMapFileDescriptor *f, void *addr, int flags, uint64_t offset, size_t size, void **ret) {
        MMapCache *m = mmap_cache_fd_cache(f);

        assert(ret);

        for (;;) {
                void *ptr;

                ptr = mmap(addr, size, f->prot, flags, f->fd, offset);
                if (ptr != MAP_FAILED) {
                        *ret = ptr;
                        return 0;
                }
                if (errno != ENOMEM)
                        return negative_errno();

                /* When failed with ENOMEM, try again after making a room by freeing an unused window. */

                if (!m->last_unused)
                        return -ENOMEM; /* no free window, propagate the original error. */

                window_free(m->last_unused);
        }
}
264
/* Establishes a new window covering at least [offset, offset+size) of f.
 *
 * The requested range is expanded to page boundaries, then grown to
 * WINDOW_SIZE with the extra space split roughly evenly before and after the
 * request, and finally clamped to the file size when st is given. Returns 0
 * and the new window in *ret, -EADDRNOTAVAIL if offset lies past the end of
 * the file, or another negative errno on mmap/allocation failure. */
static int add_mmap(
                MMapFileDescriptor *f,
                uint64_t offset,
                size_t size,
                struct stat *st,
                Window **ret) {

        uint64_t woffset, wsize;
        Window *w;
        void *d;
        int r;

        assert(f);
        assert(size > 0);
        assert(ret);

        /* Round the start down to a page boundary, and the length up. */
        woffset = offset & ~((uint64_t) page_size() - 1ULL);
        wsize = size + (offset - woffset);
        wsize = PAGE_ALIGN(wsize);

        if (wsize < WINDOW_SIZE) {
                uint64_t delta;

                /* Grow the window to WINDOW_SIZE, centering the requested range in it. */
                delta = PAGE_ALIGN((WINDOW_SIZE - wsize) / 2);

                if (delta > offset)
                        woffset = 0;
                else
                        woffset -= delta;

                wsize = WINDOW_SIZE;
        }

        if (st) {
                /* Memory maps that are larger then the files underneath have undefined behavior. Hence,
                 * clamp things to the file size if we know it */

                if (woffset >= (uint64_t) st->st_size)
                        return -EADDRNOTAVAIL;

                if (woffset + wsize > (uint64_t) st->st_size)
                        wsize = PAGE_ALIGN(st->st_size - woffset);
        }

        r = mmap_try_harder(f, NULL, MAP_SHARED, woffset, wsize, &d);
        if (r < 0)
                return r;

        w = window_add(f, woffset, wsize, d);
        if (!w) {
                (void) munmap(d, wsize);
                return -ENOMEM;
        }

        *ret = w;
        return 0;
}
322
/* Returns, in *ret, a pointer to the requested byte range [offset,
 * offset+size) of f, establishing a mapping if necessary.
 *
 * Lookup order: the context's cached window first, then all of f's existing
 * windows, then a fresh mmap via add_mmap(). keep_always pins the resulting
 * window so it is never recycled. Returns 0 on success, -EIO if the file
 * previously triggered a SIGBUS, or a negative errno from add_mmap(). The
 * returned pointer stays valid while the window is attached or pinned. */
int mmap_cache_fd_get(
                MMapFileDescriptor *f,
                unsigned context,
                bool keep_always,
                uint64_t offset,
                size_t size,
                struct stat *st,
                void **ret) {

        MMapCache *m = mmap_cache_fd_cache(f);
        Context *c;
        Window *w;
        int r;

        assert(context < MMAP_CACHE_MAX_CONTEXTS);
        assert(size > 0);
        assert(ret);

        if (f->sigbus)
                return -EIO;

        c = &f->cache->contexts[context];

        /* Check whether the current context is the right one already */
        if (window_matches(c->window, f, offset, size)) {
                m->n_context_cache_hit++;
                w = c->window;
                goto found;
        }

        /* Drop the reference to the window, since it's unnecessary now */
        context_detach_window(m, c);

        /* Search for a matching mmap */
        LIST_FOREACH(windows, i, f->windows)
                if (window_matches(i, f, offset, size)) {
                        m->n_window_list_hit++;
                        w = i;
                        goto found;
                }

        m->n_missed++;

        /* Create a new mmap */
        r = add_mmap(f, offset, size, st, &w);
        if (r < 0)
                return r;

found:
        /* keep_always is sticky: once pinned, a window stays pinned. */
        w->keep_always = w->keep_always || keep_always;
        context_attach_window(m, c, w);
        *ret = (uint8_t*) w->ptr + (offset - w->offset);
        return 0;
}
377
378 void mmap_cache_stats_log_debug(MMapCache *m) {
379 assert(m);
380
381 log_debug("mmap cache statistics: %u context cache hit, %u window list hit, %u miss",
382 m->n_context_cache_hit, m->n_window_list_hit, m->n_missed);
383 }
384
/* Drains the queue of SIGBUS-faulted addresses (filled by the sigbus
 * handler), marks the owning files' sigbus flag, and then remaps every
 * window of each affected file to anonymous pages so the file cannot fault
 * again. Aborts on addresses that match no window, or on queue errors. */
static void mmap_cache_process_sigbus(MMapCache *m) {
        bool found = false;
        MMapFileDescriptor *f;
        int r;

        assert(m);

        /* Iterate through all triggered pages and mark their files as invalidated. */
        for (;;) {
                bool ours;
                void *addr;

                r = sigbus_pop(&addr);
                if (_likely_(r == 0))
                        break;
                if (r < 0) {
                        log_error_errno(r, "SIGBUS handling failed: %m");
                        abort();
                }

                /* Find the window containing the faulting address. */
                ours = false;
                HASHMAP_FOREACH(f, m->fds) {
                        LIST_FOREACH(windows, w, f->windows) {
                                if ((uint8_t*) addr >= (uint8_t*) w->ptr &&
                                    (uint8_t*) addr < (uint8_t*) w->ptr + w->size) {
                                        found = ours = f->sigbus = true;
                                        break;
                                }
                        }

                        if (ours)
                                break;
                }

                /* Didn't find a matching window, give up. */
                if (!ours) {
                        log_error("Unknown SIGBUS page, aborting.");
                        abort();
                }
        }

        /* The list of triggered pages is now empty. Now, let's remap all windows of the triggered file to
         * anonymous maps, so that no page of the file in question is triggered again, so that we can be sure
         * not to hit the queue size limit. */
        if (_likely_(!found))
                return;

        HASHMAP_FOREACH(f, m->fds) {
                if (!f->sigbus)
                        continue;

                LIST_FOREACH(windows, w, f->windows)
                        window_invalidate(w);
        }
}
440
441 bool mmap_cache_fd_got_sigbus(MMapFileDescriptor *f) {
442 assert(f);
443
444 mmap_cache_process_sigbus(f->cache);
445
446 return f->sigbus;
447 }
448
/* Registers fd with the cache. Returns 1 if a new entry was created, 0 if
 * the fd was already registered with the same protection flags, -EEXIST if
 * it was registered with different flags, or another negative errno on
 * allocation failure. On success *ret (if given) points to the entry, which
 * is owned by the cache and released via mmap_cache_fd_free(). The fd itself
 * is not duplicated; the caller keeps ownership of it. */
int mmap_cache_add_fd(MMapCache *m, int fd, int prot, MMapFileDescriptor **ret) {
        _cleanup_free_ MMapFileDescriptor *f = NULL;
        MMapFileDescriptor *existing;
        int r;

        assert(m);
        assert(fd >= 0);

        existing = hashmap_get(m->fds, FD_TO_PTR(fd));
        if (existing) {
                if (existing->prot != prot)
                        return -EEXIST;
                if (ret)
                        *ret = existing;
                return 0;
        }

        f = new(MMapFileDescriptor, 1);
        if (!f)
                return -ENOMEM;

        *f = (MMapFileDescriptor) {
                .fd = fd,
                .prot = prot,
        };

        r = hashmap_ensure_put(&m->fds, NULL, FD_TO_PTR(fd), f);
        if (r < 0)
                return r;
        assert(r > 0);

        /* Each registered fd pins the cache. */
        f->cache = mmap_cache_ref(m);

        if (ret)
                *ret = f;

        /* Ownership moved into the hashmap; disarm the cleanup. */
        TAKE_PTR(f);
        return 1;
}
488
/* Unregisters and frees a file descriptor entry: drains pending SIGBUS
 * events, frees all of its windows, removes it from the cache's fd hashmap
 * and drops its cache reference. Accepts NULL; returns NULL. The underlying
 * OS fd is not closed here. */
MMapFileDescriptor* mmap_cache_fd_free(MMapFileDescriptor *f) {
        if (!f)
                return NULL;

        /* Make sure that any queued SIGBUS are first dispatched, so that we don't end up with a SIGBUS entry
         * we cannot relate to any existing memory map. */

        mmap_cache_process_sigbus(f->cache);

        while (f->windows)
                window_free(f->windows);

        assert_se(hashmap_remove(f->cache->fds, FD_TO_PTR(f->fd)) == f);

        /* Unref the cache at the end. Otherwise, the assertions in mmap_cache_free() may be triggered. */
        f->cache = mmap_cache_unref(f->cache);

        return mfree(f);
}
508
/* Returns the cache owning this fd entry. Asserts both the entry and its
 * cache pointer are non-NULL. */
MMapCache* mmap_cache_fd_cache(MMapFileDescriptor *f) {
        assert(f);
        return ASSERT_PTR(f->cache);
}