]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/basic/namespace-util.c
a87a875943c7dfcd8dba681dd7cea48e2d7f0d4a
[thirdparty/systemd.git] / src / basic / namespace-util.c
1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
2
3 #include <fcntl.h>
4 #include <sys/ioctl.h>
5 #include <sys/mount.h>
6
7 #include "errno-util.h"
8 #include "fd-util.h"
9 #include "fileio.h"
10 #include "missing_fs.h"
11 #include "missing_magic.h"
12 #include "missing_sched.h"
13 #include "namespace-util.h"
14 #include "process-util.h"
15 #include "stat-util.h"
16 #include "stdio-util.h"
17 #include "user-util.h"
18
19 const struct namespace_info namespace_info[] = {
20 [NAMESPACE_CGROUP] = { "cgroup", "ns/cgroup", CLONE_NEWCGROUP, },
21 [NAMESPACE_IPC] = { "ipc", "ns/ipc", CLONE_NEWIPC, },
22 [NAMESPACE_NET] = { "net", "ns/net", CLONE_NEWNET, },
23 /* So, the mount namespace flag is called CLONE_NEWNS for historical
24 * reasons. Let's expose it here under a more explanatory name: "mnt".
25 * This is in-line with how the kernel exposes namespaces in /proc/$PID/ns. */
26 [NAMESPACE_MOUNT] = { "mnt", "ns/mnt", CLONE_NEWNS, },
27 [NAMESPACE_PID] = { "pid", "ns/pid", CLONE_NEWPID, },
28 [NAMESPACE_USER] = { "user", "ns/user", CLONE_NEWUSER, },
29 [NAMESPACE_UTS] = { "uts", "ns/uts", CLONE_NEWUTS, },
30 [NAMESPACE_TIME] = { "time", "ns/time", CLONE_NEWTIME, },
31 { /* Allow callers to iterate over the array without using _NAMESPACE_TYPE_MAX. */ },
32 };
33
/* Expands to procfs_file_alloca() for the given process, using the proc_path that namespace_info[]
 * records for the given namespace type.
 * NOTE(review): going by its name, procfs_file_alloca() places the string on the caller's stack, so
 * assign the result to a local variable rather than using it inside a function argument list — confirm. */
#define pid_namespace_path(pid, type) procfs_file_alloca(pid, namespace_info[(type)].proc_path)
35
36 int namespace_open(pid_t pid, int *pidns_fd, int *mntns_fd, int *netns_fd, int *userns_fd, int *root_fd) {
37 _cleanup_close_ int pidnsfd = -1, mntnsfd = -1, netnsfd = -1, usernsfd = -1;
38 int rfd = -1;
39
40 assert(pid >= 0);
41
42 if (mntns_fd) {
43 const char *mntns;
44
45 mntns = pid_namespace_path(pid, NAMESPACE_MOUNT);
46 mntnsfd = open(mntns, O_RDONLY|O_NOCTTY|O_CLOEXEC);
47 if (mntnsfd < 0)
48 return -errno;
49 }
50
51 if (pidns_fd) {
52 const char *pidns;
53
54 pidns = pid_namespace_path(pid, NAMESPACE_PID);
55 pidnsfd = open(pidns, O_RDONLY|O_NOCTTY|O_CLOEXEC);
56 if (pidnsfd < 0)
57 return -errno;
58 }
59
60 if (netns_fd) {
61 const char *netns;
62
63 netns = pid_namespace_path(pid, NAMESPACE_NET);
64 netnsfd = open(netns, O_RDONLY|O_NOCTTY|O_CLOEXEC);
65 if (netnsfd < 0)
66 return -errno;
67 }
68
69 if (userns_fd) {
70 const char *userns;
71
72 userns = pid_namespace_path(pid, NAMESPACE_USER);
73 usernsfd = open(userns, O_RDONLY|O_NOCTTY|O_CLOEXEC);
74 if (usernsfd < 0 && errno != ENOENT)
75 return -errno;
76 }
77
78 if (root_fd) {
79 const char *root;
80
81 root = procfs_file_alloca(pid, "root");
82 rfd = open(root, O_RDONLY|O_NOCTTY|O_CLOEXEC|O_DIRECTORY);
83 if (rfd < 0)
84 return -errno;
85 }
86
87 if (pidns_fd)
88 *pidns_fd = TAKE_FD(pidnsfd);
89
90 if (mntns_fd)
91 *mntns_fd = TAKE_FD(mntnsfd);
92
93 if (netns_fd)
94 *netns_fd = TAKE_FD(netnsfd);
95
96 if (userns_fd)
97 *userns_fd = TAKE_FD(usernsfd);
98
99 if (root_fd)
100 *root_fd = TAKE_FD(rfd);
101
102 return 0;
103 }
104
105 int namespace_enter(int pidns_fd, int mntns_fd, int netns_fd, int userns_fd, int root_fd) {
106 int r;
107
108 if (userns_fd >= 0) {
109 /* Can't setns to your own userns, since then you could escalate from non-root to root in
110 * your own namespace, so check if namespaces are equal before attempting to enter. */
111
112 r = files_same(FORMAT_PROC_FD_PATH(userns_fd), "/proc/self/ns/user", 0);
113 if (r < 0)
114 return r;
115 if (r)
116 userns_fd = -1;
117 }
118
119 if (pidns_fd >= 0)
120 if (setns(pidns_fd, CLONE_NEWPID) < 0)
121 return -errno;
122
123 if (mntns_fd >= 0)
124 if (setns(mntns_fd, CLONE_NEWNS) < 0)
125 return -errno;
126
127 if (netns_fd >= 0)
128 if (setns(netns_fd, CLONE_NEWNET) < 0)
129 return -errno;
130
131 if (userns_fd >= 0)
132 if (setns(userns_fd, CLONE_NEWUSER) < 0)
133 return -errno;
134
135 if (root_fd >= 0) {
136 if (fchdir(root_fd) < 0)
137 return -errno;
138
139 if (chroot(".") < 0)
140 return -errno;
141 }
142
143 return reset_uid_gid();
144 }
145
146 int fd_is_ns(int fd, unsigned long nsflag) {
147 struct statfs s;
148 int r;
149
150 /* Checks whether the specified file descriptor refers to a namespace created by specifying nsflag in clone().
151 * On old kernels there's no nice way to detect that, hence on those we'll return a recognizable error (EUCLEAN),
152 * so that callers can handle this somewhat nicely.
153 *
154 * This function returns > 0 if the fd definitely refers to a network namespace, 0 if it definitely does not
155 * refer to a network namespace, -EUCLEAN if we can't determine, and other negative error codes on error. */
156
157 if (fstatfs(fd, &s) < 0)
158 return -errno;
159
160 if (!is_fs_type(&s, NSFS_MAGIC)) {
161 /* On really old kernels, there was no "nsfs", and network namespace sockets belonged to procfs
162 * instead. Handle that in a somewhat smart way. */
163
164 if (is_fs_type(&s, PROC_SUPER_MAGIC)) {
165 struct statfs t;
166
167 /* OK, so it is procfs. Let's see if our own network namespace is procfs, too. If so, then the
168 * passed fd might refer to a network namespace, but we can't know for sure. In that case,
169 * return a recognizable error. */
170
171 if (statfs("/proc/self/ns/net", &t) < 0)
172 return -errno;
173
174 if (s.f_type == t.f_type)
175 return -EUCLEAN; /* It's possible, we simply don't know */
176 }
177
178 return 0; /* No! */
179 }
180
181 r = ioctl(fd, NS_GET_NSTYPE);
182 if (r < 0) {
183 if (errno == ENOTTY) /* Old kernels didn't know this ioctl, let's also return a recognizable error in that case */
184 return -EUCLEAN;
185
186 return -errno;
187 }
188
189 return (unsigned long) r == nsflag;
190 }
191
192 int detach_mount_namespace(void) {
193
194 /* Detaches the mount namespace, disabling propagation from our namespace to the host */
195
196 if (unshare(CLONE_NEWNS) < 0)
197 return -errno;
198
199 return RET_NERRNO(mount(NULL, "/", NULL, MS_SLAVE | MS_REC, NULL));
200 }
201
202 int userns_acquire(const char *uid_map, const char *gid_map) {
203 char path[STRLEN("/proc//uid_map") + DECIMAL_STR_MAX(pid_t) + 1];
204 _cleanup_(sigkill_waitp) pid_t pid = 0;
205 _cleanup_close_ int userns_fd = -1;
206 int r;
207
208 assert(uid_map);
209 assert(gid_map);
210
211 /* Forks off a process in a new userns, configures the specified uidmap/gidmap, acquires an fd to it,
212 * and then kills the process again. This way we have a userns fd that is not bound to any
213 * process. We can use that for file system mounts and similar. */
214
215 r = safe_fork("(sd-mkuserns)", FORK_CLOSE_ALL_FDS|FORK_DEATHSIG|FORK_NEW_USERNS, &pid);
216 if (r < 0)
217 return r;
218 if (r == 0)
219 /* Child. We do nothing here, just freeze until somebody kills us. */
220 freeze();
221
222 xsprintf(path, "/proc/" PID_FMT "/uid_map", pid);
223 r = write_string_file(path, uid_map, WRITE_STRING_FILE_DISABLE_BUFFER);
224 if (r < 0)
225 return log_error_errno(r, "Failed to write UID map: %m");
226
227 xsprintf(path, "/proc/" PID_FMT "/gid_map", pid);
228 r = write_string_file(path, gid_map, WRITE_STRING_FILE_DISABLE_BUFFER);
229 if (r < 0)
230 return log_error_errno(r, "Failed to write GID map: %m");
231
232 r = namespace_open(pid, NULL, NULL, NULL, &userns_fd, NULL);
233 if (r < 0)
234 return log_error_errno(r, "Failed to open userns fd: %m");
235
236 return TAKE_FD(userns_fd);
237
238 }
239
240 int in_same_namespace(pid_t pid1, pid_t pid2, NamespaceType type) {
241 const char *ns_path;
242 struct stat ns_st1, ns_st2;
243
244 if (pid1 == 0)
245 pid1 = getpid_cached();
246
247 if (pid2 == 0)
248 pid2 = getpid_cached();
249
250 if (pid1 == pid2)
251 return 1;
252
253 ns_path = pid_namespace_path(pid1, type);
254 if (stat(ns_path, &ns_st1) < 0)
255 return -errno;
256
257 ns_path = pid_namespace_path(pid2, type);
258 if (stat(ns_path, &ns_st2) < 0)
259 return -errno;
260
261 return stat_inode_same(&ns_st1, &ns_st2);
262 }