]>
Commit | Line | Data |
---|---|---|
db9ecf05 | 1 | /* SPDX-License-Identifier: LGPL-2.1-or-later */ |
16e9f408 | 2 | |
16e9f408 LP |
3 | #include <errno.h> |
4 | #include <stdlib.h> | |
f8019684 | 5 | #include <sys/mman.h> |
16e9f408 | 6 | |
b5efdb8a | 7 | #include "alloc-util.h" |
2b2fec7d | 8 | #include "errno-util.h" |
23e096cc | 9 | #include "fd-util.h" |
f8019684 LP |
10 | #include "hashmap.h" |
11 | #include "list.h" | |
12 | #include "log.h" | |
f8019684 | 13 | #include "macro.h" |
0a970718 | 14 | #include "memory-util.h" |
16e9f408 | 15 | #include "mmap-cache.h" |
cf0fbc49 | 16 | #include "sigbus.h" |
16e9f408 | 17 | |
f8019684 LP |
18 | typedef struct Window Window; |
19 | typedef struct Context Context; | |
84168d80 | 20 | |
f8019684 LP |
21 | struct Window { |
22 | MMapCache *cache; | |
23 | ||
739731cd LP |
24 | bool invalidated:1; |
25 | bool keep_always:1; | |
26 | bool in_unused:1; | |
16e9f408 | 27 | |
16e9f408 LP |
28 | void *ptr; |
29 | uint64_t offset; | |
f8019684 LP |
30 | size_t size; |
31 | ||
be7cdd8e | 32 | MMapFileDescriptor *fd; |
16e9f408 | 33 | |
f8019684 LP |
34 | LIST_FIELDS(Window, by_fd); |
35 | LIST_FIELDS(Window, unused); | |
36 | ||
37 | LIST_HEAD(Context, contexts); | |
38 | }; | |
16e9f408 | 39 | |
f8019684 | 40 | struct Context { |
f8019684 | 41 | Window *window; |
16e9f408 | 42 | |
f8019684 LP |
43 | LIST_FIELDS(Context, by_window); |
44 | }; | |
45 | ||
be7cdd8e | 46 | struct MMapFileDescriptor { |
f8019684 | 47 | MMapCache *cache; |
16e9f408 | 48 | int fd; |
104fc4be | 49 | int prot; |
fa6ac760 | 50 | bool sigbus; |
f8019684 LP |
51 | LIST_HEAD(Window, windows); |
52 | }; | |
16e9f408 LP |
53 | |
54 | struct MMapCache { | |
cf4b2f99 | 55 | unsigned n_ref; |
68667801 | 56 | unsigned n_windows; |
16e9f408 | 57 | |
8fc4d1be YW |
58 | unsigned n_context_cache_hit; |
59 | unsigned n_window_list_hit; | |
60 | unsigned n_missed; | |
bf807d4d | 61 | |
f8019684 | 62 | Hashmap *fds; |
16e9f408 | 63 | |
f8019684 LP |
64 | LIST_HEAD(Window, unused); |
65 | Window *last_unused; | |
7580b0d8 VC |
66 | |
67 | Context contexts[MMAP_CACHE_MAX_CONTEXTS]; | |
16e9f408 LP |
68 | }; |
69 | ||
/* As long as no more than this many windows exist, window_add() allocates a fresh window instead of
 * recycling an unused one. */
#define WINDOWS_MIN 64

/* Minimum size of a newly created window; smaller requests are widened to this size by add_mmap(). */
#if ENABLE_DEBUG_MMAP_CACHE
/* Tiny windows increase mmap activity and the chance of exposing unsafe use. */
#  define WINDOW_SIZE (page_size())
#else
#  define WINDOW_SIZE (8ULL*1024ULL*1024ULL)
#endif
16e9f408 | 78 | |
f8019684 LP |
79 | MMapCache* mmap_cache_new(void) { |
80 | MMapCache *m; | |
16e9f408 | 81 | |
397caa81 | 82 | m = new(MMapCache, 1); |
f8019684 LP |
83 | if (!m) |
84 | return NULL; | |
16e9f408 | 85 | |
397caa81 YW |
86 | *m = (MMapCache) { |
87 | .n_ref = 1, | |
88 | }; | |
89 | ||
f8019684 | 90 | return m; |
16e9f408 LP |
91 | } |
92 | ||
b38977e5 | 93 | static Window* window_unlink(Window *w) { |
f8019684 | 94 | assert(w); |
16e9f408 | 95 | |
b38977e5 YW |
96 | MMapCache *m = mmap_cache_fd_cache(w->fd); |
97 | ||
f8019684 LP |
98 | if (w->ptr) |
99 | munmap(w->ptr, w->size); | |
16e9f408 | 100 | |
f8019684 | 101 | if (w->in_unused) { |
b38977e5 YW |
102 | if (m->last_unused == w) |
103 | m->last_unused = w->unused_prev; | |
16e9f408 | 104 | |
b38977e5 | 105 | LIST_REMOVE(unused, m->unused, w); |
f65425cb | 106 | } |
16e9f408 | 107 | |
f8019684 LP |
108 | LIST_FOREACH(by_window, c, w->contexts) { |
109 | assert(c->window == w); | |
110 | c->window = NULL; | |
f65425cb | 111 | } |
b38977e5 YW |
112 | |
113 | return LIST_REMOVE(by_fd, w->fd->windows, w); | |
16e9f408 LP |
114 | } |
115 | ||
fa6ac760 LP |
116 | static void window_invalidate(Window *w) { |
117 | assert(w); | |
104fc4be | 118 | assert(w->fd); |
fa6ac760 LP |
119 | |
120 | if (w->invalidated) | |
121 | return; | |
122 | ||
8fc4d1be YW |
123 | /* Replace the window with anonymous pages. This is useful when we hit a SIGBUS and want to make sure |
124 | * the file cannot trigger any further SIGBUS, possibly overrunning the sigbus queue. */ | |
fa6ac760 | 125 | |
104fc4be | 126 | assert_se(mmap(w->ptr, w->size, w->fd->prot, MAP_PRIVATE|MAP_ANONYMOUS|MAP_FIXED, -1, 0) == w->ptr); |
fa6ac760 LP |
127 | w->invalidated = true; |
128 | } | |
129 | ||
b38977e5 YW |
130 | static Window* window_free(Window *w) { |
131 | if (!w) | |
132 | return NULL; | |
f65425cb | 133 | |
f8019684 | 134 | window_unlink(w); |
89de6947 | 135 | w->cache->n_windows--; |
b38977e5 YW |
136 | |
137 | return mfree(w); | |
f8019684 | 138 | } |
f65425cb | 139 | |
81598f5e | 140 | static bool window_matches(Window *w, MMapFileDescriptor *f, uint64_t offset, size_t size) { |
f8019684 | 141 | assert(size > 0); |
16e9f408 | 142 | |
f8019684 | 143 | return |
81598f5e YW |
144 | w && |
145 | f == w->fd && | |
f8019684 LP |
146 | offset >= w->offset && |
147 | offset + size <= w->offset + w->size; | |
16e9f408 LP |
148 | } |
149 | ||
40f5e6a9 YW |
150 | static Window* window_add(MMapFileDescriptor *f, uint64_t offset, size_t size, void *ptr) { |
151 | MMapCache *m = mmap_cache_fd_cache(f); | |
f8019684 | 152 | Window *w; |
16e9f408 | 153 | |
f8019684 | 154 | if (!m->last_unused || m->n_windows <= WINDOWS_MIN) { |
f8019684 | 155 | /* Allocate a new window */ |
41ab8c67 | 156 | w = new(Window, 1); |
f8019684 LP |
157 | if (!w) |
158 | return NULL; | |
89de6947 | 159 | m->n_windows++; |
b38977e5 | 160 | } else |
f8019684 | 161 | /* Reuse an existing one */ |
b38977e5 | 162 | w = window_unlink(m->last_unused); |
f8019684 | 163 | |
41ab8c67 LP |
164 | *w = (Window) { |
165 | .cache = m, | |
166 | .fd = f, | |
41ab8c67 LP |
167 | .offset = offset, |
168 | .size = size, | |
169 | .ptr = ptr, | |
170 | }; | |
6a491490 | 171 | |
40f5e6a9 | 172 | return LIST_PREPEND(by_fd, f->windows, w); |
16e9f408 LP |
173 | } |
174 | ||
7580b0d8 | 175 | static void context_detach_window(MMapCache *m, Context *c) { |
f8019684 | 176 | Window *w; |
16e9f408 | 177 | |
7580b0d8 | 178 | assert(m); |
f8019684 | 179 | assert(c); |
16e9f408 | 180 | |
f8019684 | 181 | if (!c->window) |
16e9f408 LP |
182 | return; |
183 | ||
ae2a15bc | 184 | w = TAKE_PTR(c->window); |
71fda00f | 185 | LIST_REMOVE(by_window, w->contexts, c); |
16e9f408 | 186 | |
1b8951e5 | 187 | if (!w->contexts && !w->keep_always) { |
f8019684 | 188 | /* Not used anymore? */ |
349cc4a5 | 189 | #if ENABLE_DEBUG_MMAP_CACHE |
8fc4d1be | 190 | /* Unmap unused windows immediately to expose use-after-unmap by SIGSEGV. */ |
fad5a6c6 MS |
191 | window_free(w); |
192 | #else | |
7580b0d8 VC |
193 | LIST_PREPEND(unused, m->unused, w); |
194 | if (!m->last_unused) | |
195 | m->last_unused = w; | |
16e9f408 | 196 | |
f8019684 | 197 | w->in_unused = true; |
fad5a6c6 | 198 | #endif |
f8019684 | 199 | } |
16e9f408 LP |
200 | } |
201 | ||
7580b0d8 VC |
202 | static void context_attach_window(MMapCache *m, Context *c, Window *w) { |
203 | assert(m); | |
f8019684 LP |
204 | assert(c); |
205 | assert(w); | |
16e9f408 | 206 | |
f8019684 | 207 | if (c->window == w) |
16e9f408 LP |
208 | return; |
209 | ||
7580b0d8 | 210 | context_detach_window(m, c); |
16e9f408 | 211 | |
e18021f7 | 212 | if (w->in_unused) { |
f8019684 | 213 | /* Used again? */ |
7580b0d8 VC |
214 | if (m->last_unused == w) |
215 | m->last_unused = w->unused_prev; | |
216 | LIST_REMOVE(unused, m->unused, w); | |
16e9f408 | 217 | |
f8019684 LP |
218 | w->in_unused = false; |
219 | } | |
f65425cb | 220 | |
f8019684 | 221 | c->window = w; |
71fda00f | 222 | LIST_PREPEND(by_window, w->contexts, c); |
16e9f408 LP |
223 | } |
224 | ||
b38977e5 YW |
225 | static MMapCache* mmap_cache_free(MMapCache *m) { |
226 | if (!m) | |
227 | return NULL; | |
16e9f408 | 228 | |
b38977e5 YW |
229 | /* All windows are owned by fds, and each fd takes a reference of MMapCache. So, when this is called, |
230 | * all fds are already freed, and hence there is no window. */ | |
8e6d9397 | 231 | |
b38977e5 | 232 | assert(hashmap_isempty(m->fds)); |
8e6d9397 GM |
233 | hashmap_free(m->fds); |
234 | ||
b38977e5 YW |
235 | assert(!m->unused); |
236 | assert(m->n_windows == 0); | |
f8019684 | 237 | |
8301aa0b | 238 | return mfree(m); |
16e9f408 LP |
239 | } |
240 | ||
8301aa0b | 241 | DEFINE_TRIVIAL_REF_UNREF_FUNC(MMapCache, mmap_cache, mmap_cache_free); |
16e9f408 | 242 | |
1ed867d3 YW |
243 | static int mmap_try_harder(MMapFileDescriptor *f, void *addr, int flags, uint64_t offset, size_t size, void **ret) { |
244 | MMapCache *m = mmap_cache_fd_cache(f); | |
db87967e | 245 | |
1ed867d3 | 246 | assert(ret); |
db87967e VC |
247 | |
248 | for (;;) { | |
1ed867d3 | 249 | void *ptr; |
db87967e | 250 | |
104fc4be | 251 | ptr = mmap(addr, size, f->prot, flags, f->fd, offset); |
1ed867d3 YW |
252 | if (ptr != MAP_FAILED) { |
253 | *ret = ptr; | |
254 | return 0; | |
255 | } | |
db87967e | 256 | if (errno != ENOMEM) |
3f0083a2 | 257 | return negative_errno(); |
db87967e | 258 | |
1ed867d3 YW |
259 | /* When failed with ENOMEM, try again after making a room by freeing an unused window. */ |
260 | ||
261 | if (!m->last_unused) | |
262 | return -ENOMEM; /* no free window, propagate the original error. */ | |
db87967e | 263 | |
1ed867d3 YW |
264 | window_free(m->last_unused); |
265 | } | |
db87967e VC |
266 | } |
267 | ||
f8019684 | 268 | static int add_mmap( |
be7cdd8e | 269 | MMapFileDescriptor *f, |
16e9f408 | 270 | uint64_t offset, |
f8019684 | 271 | size_t size, |
fcde2389 | 272 | struct stat *st, |
40f5e6a9 | 273 | Window **ret) { |
16e9f408 | 274 | |
16e9f408 | 275 | uint64_t woffset, wsize; |
f8019684 LP |
276 | Window *w; |
277 | void *d; | |
16e9f408 LP |
278 | int r; |
279 | ||
be7cdd8e | 280 | assert(f); |
16e9f408 | 281 | assert(size > 0); |
1b8951e5 | 282 | assert(ret); |
16e9f408 LP |
283 | |
284 | woffset = offset & ~((uint64_t) page_size() - 1ULL); | |
285 | wsize = size + (offset - woffset); | |
286 | wsize = PAGE_ALIGN(wsize); | |
287 | ||
288 | if (wsize < WINDOW_SIZE) { | |
289 | uint64_t delta; | |
290 | ||
beec0085 | 291 | delta = PAGE_ALIGN((WINDOW_SIZE - wsize) / 2); |
16e9f408 LP |
292 | |
293 | if (delta > offset) | |
294 | woffset = 0; | |
295 | else | |
296 | woffset -= delta; | |
297 | ||
298 | wsize = WINDOW_SIZE; | |
299 | } | |
300 | ||
fcde2389 | 301 | if (st) { |
8fc4d1be YW |
302 | /* Memory maps that are larger then the files underneath have undefined behavior. Hence, |
303 | * clamp things to the file size if we know it */ | |
fcde2389 LP |
304 | |
305 | if (woffset >= (uint64_t) st->st_size) | |
306 | return -EADDRNOTAVAIL; | |
307 | ||
308 | if (woffset + wsize > (uint64_t) st->st_size) | |
309 | wsize = PAGE_ALIGN(st->st_size - woffset); | |
310 | } | |
311 | ||
1da2c4ce | 312 | r = mmap_try_harder(f, NULL, MAP_SHARED, woffset, wsize, &d); |
db87967e VC |
313 | if (r < 0) |
314 | return r; | |
16e9f408 | 315 | |
40f5e6a9 YW |
316 | w = window_add(f, woffset, wsize, d); |
317 | if (!w) { | |
318 | (void) munmap(d, wsize); | |
319 | return -ENOMEM; | |
320 | } | |
b67ddc7b | 321 | |
40f5e6a9 YW |
322 | *ret = w; |
323 | return 0; | |
16e9f408 LP |
324 | } |
325 | ||
c3bd54bf | 326 | int mmap_cache_fd_get( |
be7cdd8e | 327 | MMapFileDescriptor *f, |
16e9f408 | 328 | unsigned context, |
fcde2389 | 329 | bool keep_always, |
16e9f408 | 330 | uint64_t offset, |
f8019684 | 331 | size_t size, |
fcde2389 | 332 | struct stat *st, |
258190a0 | 333 | void **ret) { |
16e9f408 | 334 | |
40f5e6a9 | 335 | MMapCache *m = mmap_cache_fd_cache(f); |
7580b0d8 | 336 | Context *c; |
40f5e6a9 | 337 | Window *w; |
16e9f408 LP |
338 | int r; |
339 | ||
40f5e6a9 | 340 | assert(context < MMAP_CACHE_MAX_CONTEXTS); |
16e9f408 | 341 | assert(size > 0); |
1b8951e5 | 342 | assert(ret); |
40f5e6a9 YW |
343 | |
344 | if (f->sigbus) | |
345 | return -EIO; | |
16e9f408 | 346 | |
7580b0d8 VC |
347 | c = &f->cache->contexts[context]; |
348 | ||
f8019684 | 349 | /* Check whether the current context is the right one already */ |
40f5e6a9 YW |
350 | if (window_matches(c->window, f, offset, size)) { |
351 | m->n_context_cache_hit++; | |
352 | w = c->window; | |
353 | goto found; | |
354 | } | |
355 | ||
356 | /* Drop the reference to the window, since it's unnecessary now */ | |
357 | context_detach_window(m, c); | |
16e9f408 | 358 | |
f8019684 | 359 | /* Search for a matching mmap */ |
40f5e6a9 YW |
360 | LIST_FOREACH(by_fd, i, f->windows) |
361 | if (window_matches(i, f, offset, size)) { | |
362 | m->n_window_list_hit++; | |
363 | w = i; | |
364 | goto found; | |
365 | } | |
bf807d4d | 366 | |
40f5e6a9 | 367 | m->n_missed++; |
16e9f408 | 368 | |
f8019684 | 369 | /* Create a new mmap */ |
40f5e6a9 YW |
370 | r = add_mmap(f, offset, size, st, &w); |
371 | if (r < 0) | |
372 | return r; | |
373 | ||
374 | found: | |
375 | w->keep_always = w->keep_always || keep_always; | |
376 | context_attach_window(m, c, w); | |
377 | *ret = (uint8_t*) w->ptr + (offset - w->offset); | |
378 | return 0; | |
ae97089d ZJS |
379 | } |
380 | ||
3a595c59 | 381 | void mmap_cache_stats_log_debug(MMapCache *m) { |
fa6ac760 LP |
382 | assert(m); |
383 | ||
8fc4d1be YW |
384 | log_debug("mmap cache statistics: %u context cache hit, %u window list hit, %u miss", |
385 | m->n_context_cache_hit, m->n_window_list_hit, m->n_missed); | |
fa6ac760 LP |
386 | } |
387 | ||
388 | static void mmap_cache_process_sigbus(MMapCache *m) { | |
389 | bool found = false; | |
be7cdd8e | 390 | MMapFileDescriptor *f; |
fa6ac760 | 391 | int r; |
16e9f408 LP |
392 | |
393 | assert(m); | |
16e9f408 | 394 | |
8fc4d1be | 395 | /* Iterate through all triggered pages and mark their files as invalidated. */ |
fa6ac760 LP |
396 | for (;;) { |
397 | bool ours; | |
398 | void *addr; | |
399 | ||
400 | r = sigbus_pop(&addr); | |
401 | if (_likely_(r == 0)) | |
402 | break; | |
403 | if (r < 0) { | |
404 | log_error_errno(r, "SIGBUS handling failed: %m"); | |
405 | abort(); | |
406 | } | |
407 | ||
408 | ours = false; | |
90e74a66 | 409 | HASHMAP_FOREACH(f, m->fds) { |
fa6ac760 LP |
410 | LIST_FOREACH(by_fd, w, f->windows) { |
411 | if ((uint8_t*) addr >= (uint8_t*) w->ptr && | |
412 | (uint8_t*) addr < (uint8_t*) w->ptr + w->size) { | |
413 | found = ours = f->sigbus = true; | |
414 | break; | |
415 | } | |
416 | } | |
417 | ||
418 | if (ours) | |
419 | break; | |
420 | } | |
421 | ||
8fc4d1be | 422 | /* Didn't find a matching window, give up. */ |
fa6ac760 LP |
423 | if (!ours) { |
424 | log_error("Unknown SIGBUS page, aborting."); | |
425 | abort(); | |
426 | } | |
427 | } | |
428 | ||
8fc4d1be YW |
429 | /* The list of triggered pages is now empty. Now, let's remap all windows of the triggered file to |
430 | * anonymous maps, so that no page of the file in question is triggered again, so that we can be sure | |
431 | * not to hit the queue size limit. */ | |
fa6ac760 | 432 | if (_likely_(!found)) |
16e9f408 | 433 | return; |
16e9f408 | 434 | |
90e74a66 | 435 | HASHMAP_FOREACH(f, m->fds) { |
fa6ac760 LP |
436 | if (!f->sigbus) |
437 | continue; | |
438 | ||
439 | LIST_FOREACH(by_fd, w, f->windows) | |
440 | window_invalidate(w); | |
441 | } | |
f8019684 | 442 | } |
16e9f408 | 443 | |
c3bd54bf | 444 | bool mmap_cache_fd_got_sigbus(MMapFileDescriptor *f) { |
be7cdd8e | 445 | assert(f); |
bf807d4d | 446 | |
1da2c4ce | 447 | mmap_cache_process_sigbus(f->cache); |
fa6ac760 | 448 | |
fa6ac760 | 449 | return f->sigbus; |
bf807d4d LP |
450 | } |
451 | ||
8926a6a4 YW |
452 | int mmap_cache_add_fd(MMapCache *m, int fd, int prot, MMapFileDescriptor **ret) { |
453 | _cleanup_free_ MMapFileDescriptor *f = NULL; | |
454 | MMapFileDescriptor *existing; | |
be7cdd8e | 455 | int r; |
fa6ac760 | 456 | |
bf807d4d | 457 | assert(m); |
fa6ac760 | 458 | assert(fd >= 0); |
bf807d4d | 459 | |
8926a6a4 YW |
460 | existing = hashmap_get(m->fds, FD_TO_PTR(fd)); |
461 | if (existing) { | |
8ff0f36e YW |
462 | if (existing->prot != prot) |
463 | return -EEXIST; | |
8926a6a4 YW |
464 | if (ret) |
465 | *ret = existing; | |
466 | return 0; | |
467 | } | |
be7cdd8e | 468 | |
8926a6a4 | 469 | f = new(MMapFileDescriptor, 1); |
be7cdd8e | 470 | if (!f) |
8926a6a4 YW |
471 | return -ENOMEM; |
472 | ||
473 | *f = (MMapFileDescriptor) { | |
474 | .fd = fd, | |
475 | .prot = prot, | |
476 | }; | |
be7cdd8e | 477 | |
8926a6a4 | 478 | r = hashmap_ensure_put(&m->fds, NULL, FD_TO_PTR(fd), f); |
be7cdd8e | 479 | if (r < 0) |
8926a6a4 YW |
480 | return r; |
481 | assert(r > 0); | |
be7cdd8e | 482 | |
fd9ac6c3 | 483 | f->cache = mmap_cache_ref(m); |
fd9ac6c3 | 484 | |
8926a6a4 YW |
485 | if (ret) |
486 | *ret = f; | |
487 | ||
488 | TAKE_PTR(f); | |
489 | return 1; | |
be7cdd8e VC |
490 | } |
491 | ||
b38977e5 YW |
492 | MMapFileDescriptor* mmap_cache_fd_free(MMapFileDescriptor *f) { |
493 | if (!f) | |
494 | return NULL; | |
be7cdd8e | 495 | |
8fc4d1be YW |
496 | /* Make sure that any queued SIGBUS are first dispatched, so that we don't end up with a SIGBUS entry |
497 | * we cannot relate to any existing memory map. */ | |
fa6ac760 | 498 | |
1da2c4ce | 499 | mmap_cache_process_sigbus(f->cache); |
fa6ac760 | 500 | |
be7cdd8e VC |
501 | while (f->windows) |
502 | window_free(f->windows); | |
503 | ||
b38977e5 | 504 | assert_se(hashmap_remove(f->cache->fds, FD_TO_PTR(f->fd)) == f); |
fa6ac760 | 505 | |
b38977e5 YW |
506 | /* Unref the cache at the end. Otherwise, the assertions in mmap_cache_free() may be triggered. */ |
507 | f->cache = mmap_cache_unref(f->cache); | |
508 | ||
509 | return mfree(f); | |
bf807d4d | 510 | } |
176bf8b8 VC |
511 | |
512 | MMapCache* mmap_cache_fd_cache(MMapFileDescriptor *f) { | |
513 | assert(f); | |
b38977e5 | 514 | return ASSERT_PTR(f->cache); |
176bf8b8 | 515 | } |