]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/core/namespace.c
service: add the ability for units to join other unit's PrivateNetwork= and PrivateTm...
[thirdparty/systemd.git] / src / core / namespace.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <errno.h>
23 #include <sys/mount.h>
24 #include <string.h>
25 #include <stdio.h>
26 #include <unistd.h>
27 #include <sys/stat.h>
28 #include <sys/types.h>
29 #include <sched.h>
30 #include <sys/syscall.h>
31 #include <limits.h>
32 #include <linux/fs.h>
33 #include <sys/file.h>
34
35 #include "strv.h"
36 #include "util.h"
37 #include "path-util.h"
38 #include "namespace.h"
39 #include "missing.h"
40 #include "execute.h"
41 #include "loopback-setup.h"
42
/* How a path is treated inside the private mount namespace.
 *
 * This is ordered by priority! When the same path is listed with several
 * modes, mount_path_compare() sorts the numerically smaller mode first and
 * drop_duplicates() keeps only that first entry. */
typedef enum MountMode {
        INACCESSIBLE    = 0, /* over-mounted with /run/systemd/inaccessible */
        READONLY        = 1, /* bind-mounted onto itself, then remounted read-only */
        PRIVATE_TMP     = 2, /* the private tmp directory is mounted here */
        PRIVATE_VAR_TMP = 3, /* the private var-tmp directory is mounted here */
        READWRITE       = 4  /* bind-mounted onto itself, left writable */
} MountMode;
51
52 typedef struct BindMount {
53 const char *path;
54 MountMode mode;
55 bool done;
56 bool ignore;
57 } BindMount;
58
59 static int append_mounts(BindMount **p, char **strv, MountMode mode) {
60 char **i;
61
62 assert(p);
63
64 STRV_FOREACH(i, strv) {
65
66 (*p)->ignore = false;
67
68 if ((mode == INACCESSIBLE || mode == READONLY) && (*i)[0] == '-') {
69 (*p)->ignore = true;
70 (*i)++;
71 }
72
73 if (!path_is_absolute(*i))
74 return -EINVAL;
75
76 (*p)->path = *i;
77 (*p)->mode = mode;
78 (*p)++;
79 }
80
81 return 0;
82 }
83
84 static int mount_path_compare(const void *a, const void *b) {
85 const BindMount *p = a, *q = b;
86
87 if (path_equal(p->path, q->path)) {
88
89 /* If the paths are equal, check the mode */
90 if (p->mode < q->mode)
91 return -1;
92
93 if (p->mode > q->mode)
94 return 1;
95
96 return 0;
97 }
98
99 /* If the paths are not equal, then order prefixes first */
100 if (path_startswith(p->path, q->path))
101 return 1;
102
103 if (path_startswith(q->path, p->path))
104 return -1;
105
106 return 0;
107 }
108
109 static void drop_duplicates(BindMount *m, unsigned *n) {
110 BindMount *f, *t, *previous;
111
112 assert(m);
113 assert(n);
114
115 for (f = m, t = m, previous = NULL; f < m+*n; f++) {
116
117 /* The first one wins */
118 if (previous && path_equal(f->path, previous->path))
119 continue;
120
121 t->path = f->path;
122 t->mode = f->mode;
123
124 previous = t;
125
126 t++;
127 }
128
129 *n = t - m;
130 }
131
132 static int apply_mount(
133 BindMount *m,
134 const char *tmp_dir,
135 const char *var_tmp_dir) {
136
137 const char *what;
138 int r;
139
140 assert(m);
141
142 switch (m->mode) {
143
144 case INACCESSIBLE:
145 what = "/run/systemd/inaccessible";
146 break;
147
148 case READONLY:
149 case READWRITE:
150 what = m->path;
151 break;
152
153 case PRIVATE_TMP:
154 what = tmp_dir;
155 break;
156
157 case PRIVATE_VAR_TMP:
158 what = var_tmp_dir;
159 break;
160
161 default:
162 assert_not_reached("Unknown mode");
163 }
164
165 assert(what);
166
167 r = mount(what, m->path, NULL, MS_BIND|MS_REC, NULL);
168 if (r >= 0)
169 log_debug("Successfully mounted %s to %s", what, m->path);
170 else if (m->ignore && errno == ENOENT)
171 r = 0;
172
173 return r;
174 }
175
176 static int make_read_only(BindMount *m) {
177 int r;
178
179 assert(m);
180
181 if (m->mode != INACCESSIBLE && m->mode != READONLY)
182 return 0;
183
184 r = mount(NULL, m->path, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL);
185 if (r < 0 && !(m->ignore && errno == ENOENT))
186 return -errno;
187
188 return 0;
189 }
190
191 int setup_namespace(
192 char** read_write_dirs,
193 char** read_only_dirs,
194 char** inaccessible_dirs,
195 char* tmp_dir,
196 char* var_tmp_dir,
197 unsigned mount_flags) {
198
199 BindMount *m, *mounts = NULL;
200 unsigned n;
201 int r = 0;
202
203 if (mount_flags == 0)
204 mount_flags = MS_SHARED;
205
206 if (unshare(CLONE_NEWNS) < 0)
207 return -errno;
208
209 n = !!tmp_dir + !!var_tmp_dir +
210 strv_length(read_write_dirs) +
211 strv_length(read_only_dirs) +
212 strv_length(inaccessible_dirs);
213
214 if (n > 0) {
215 m = mounts = (BindMount *) alloca(n * sizeof(BindMount));
216 r = append_mounts(&m, read_write_dirs, READWRITE);
217 if (r < 0)
218 return r;
219
220 r = append_mounts(&m, read_only_dirs, READONLY);
221 if (r < 0)
222 return r;
223
224 r = append_mounts(&m, inaccessible_dirs, INACCESSIBLE);
225 if (r < 0)
226 return r;
227
228 if (tmp_dir) {
229 m->path = "/tmp";
230 m->mode = PRIVATE_TMP;
231 m++;
232 }
233
234 if (var_tmp_dir) {
235 m->path = "/var/tmp";
236 m->mode = PRIVATE_VAR_TMP;
237 m++;
238 }
239
240 assert(mounts + n == m);
241
242 qsort(mounts, n, sizeof(BindMount), mount_path_compare);
243 drop_duplicates(mounts, &n);
244 }
245
246 /* Remount / as SLAVE so that nothing now mounted in the namespace
247 shows up in the parent */
248 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0)
249 return -errno;
250
251 for (m = mounts; m < mounts + n; ++m) {
252 r = apply_mount(m, tmp_dir, var_tmp_dir);
253 if (r < 0)
254 goto fail;
255 }
256
257 for (m = mounts; m < mounts + n; ++m) {
258 r = make_read_only(m);
259 if (r < 0)
260 goto fail;
261 }
262
263 /* Remount / as the desired mode */
264 if (mount(NULL, "/", NULL, mount_flags | MS_REC, NULL) < 0) {
265 r = -errno;
266 goto fail;
267 }
268
269 return 0;
270
271 fail:
272 for (m = mounts; m < mounts + n; ++m)
273 if (m->done)
274 umount2(m->path, MNT_DETACH);
275
276 return r;
277 }
278
279 static int setup_one_tmp_dir(const char *id, const char *prefix, char **path) {
280 _cleanup_free_ char *x = NULL;
281
282 assert(id);
283 assert(prefix);
284 assert(path);
285
286 x = strjoin(prefix, "/systemd-", id, "-XXXXXX", NULL);
287 if (!x)
288 return -ENOMEM;
289
290 RUN_WITH_UMASK(0077)
291 if (!mkdtemp(x))
292 return -errno;
293
294 RUN_WITH_UMASK(0000) {
295 char *y;
296
297 y = strappenda(x, "/tmp");
298
299 if (mkdir(y, 0777 | S_ISVTX) < 0)
300 return -errno;
301 }
302
303 *path = x;
304 x = NULL;
305
306 return 0;
307 }
308
/* Creates the pair of private temporary directories for 'id', one below
 * /tmp and one below /var/tmp, via setup_one_tmp_dir().
 *
 * On success 0 is returned and '*tmp_dir' and '*var_tmp_dir' receive the
 * two newly allocated directory names. If the second directory cannot be
 * created, the first one is removed again and the error is returned;
 * the output parameters are not touched in that case. */
int setup_tmp_dirs(const char *id, char **tmp_dir, char **var_tmp_dir) {
        char *private_tmp, *private_var_tmp, *inner;
        int r;

        assert(id);
        assert(tmp_dir);
        assert(var_tmp_dir);

        r = setup_one_tmp_dir(id, "/tmp", &private_tmp);
        if (r < 0)
                return r;

        r = setup_one_tmp_dir(id, "/var/tmp", &private_var_tmp);
        if (r >= 0) {
                *tmp_dir = private_tmp;
                *var_tmp_dir = private_var_tmp;

                return 0;
        }

        /* Undo the first directory: inner "tmp" first, then its parent */
        inner = strappenda(private_tmp, "/tmp");
        rmdir(inner);
        rmdir(private_tmp);

        free(private_tmp);
        return r;
}
338
339 int setup_netns(int netns_storage_socket[2]) {
340 _cleanup_close_ int netns = -1;
341 union {
342 struct cmsghdr cmsghdr;
343 uint8_t buf[CMSG_SPACE(sizeof(int))];
344 } control = {};
345 struct msghdr mh = {
346 .msg_control = &control,
347 .msg_controllen = sizeof(control),
348 };
349 struct cmsghdr *cmsg;
350 int r;
351
352 assert(netns_storage_socket);
353 assert(netns_storage_socket[0] >= 0);
354 assert(netns_storage_socket[1] >= 0);
355
356 /* We use the passed socketpair as a storage buffer for our
357 * namespace socket. Whatever process runs this first shall
358 * create a new namespace, all others should just join it. To
359 * serialize that we use a file lock on the socket pair.
360 *
361 * It's a bit crazy, but hey, works great! */
362
363 if (lockf(netns_storage_socket[0], F_LOCK, 0) < 0)
364 return -errno;
365
366 if (recvmsg(netns_storage_socket[0], &mh, MSG_DONTWAIT|MSG_CMSG_CLOEXEC) < 0) {
367 if (errno != EAGAIN) {
368 r = -errno;
369 goto fail;
370 }
371
372 /* Nothing stored yet, so let's create a new namespace */
373
374 if (unshare(CLONE_NEWNET) < 0) {
375 r = -errno;
376 goto fail;
377 }
378
379 loopback_setup();
380
381 netns = open("/proc/self/ns/net", O_RDONLY|O_CLOEXEC|O_NOCTTY);
382 if (netns < 0) {
383 r = -errno;
384 goto fail;
385 }
386
387 r = 1;
388 } else {
389 /* Yay, found something, so let's join the namespace */
390
391 for (cmsg = CMSG_FIRSTHDR(&mh); cmsg; cmsg = CMSG_NXTHDR(&mh, cmsg)) {
392 if (cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == SCM_RIGHTS) {
393 assert(cmsg->cmsg_len == CMSG_LEN(sizeof(int)));
394 netns = *(int*) CMSG_DATA(cmsg);
395 }
396 }
397
398 if (setns(netns, CLONE_NEWNET) < 0) {
399 r = -errno;
400 goto fail;
401 }
402
403 r = 0;
404 }
405
406 cmsg = CMSG_FIRSTHDR(&mh);
407 cmsg->cmsg_level = SOL_SOCKET;
408 cmsg->cmsg_type = SCM_RIGHTS;
409 cmsg->cmsg_len = CMSG_LEN(sizeof(int));
410 memcpy(CMSG_DATA(cmsg), &netns, sizeof(int));
411 mh.msg_controllen = cmsg->cmsg_len;
412
413 if (sendmsg(netns_storage_socket[1], &mh, MSG_DONTWAIT|MSG_NOSIGNAL) < 0) {
414 r = -errno;
415 goto fail;
416 }
417
418 fail:
419 lockf(netns_storage_socket[0], F_ULOCK, 0);
420
421 return r;
422 }