]> git.ipfire.org Git - people/ms/linux.git/blame - fs/notify/fanotify/fanotify_user.c
fanotify: introduce a generic info record copying helper
[people/ms/linux.git] / fs / notify / fanotify / fanotify_user.c
CommitLineData
b2441318 1// SPDX-License-Identifier: GPL-2.0
33d3dfff 2#include <linux/fanotify.h>
11637e4b 3#include <linux/fcntl.h>
2a3edf86 4#include <linux/file.h>
11637e4b 5#include <linux/fs.h>
52c923dd 6#include <linux/anon_inodes.h>
11637e4b 7#include <linux/fsnotify_backend.h>
2a3edf86 8#include <linux/init.h>
a1014f10 9#include <linux/mount.h>
2a3edf86 10#include <linux/namei.h>
a1014f10 11#include <linux/poll.h>
11637e4b
EP
12#include <linux/security.h>
13#include <linux/syscalls.h>
e4e047a2 14#include <linux/slab.h>
2a3edf86 15#include <linux/types.h>
a1014f10 16#include <linux/uaccess.h>
91c2e0bc 17#include <linux/compat.h>
174cd4b1 18#include <linux/sched/signal.h>
d46eb14b 19#include <linux/memcontrol.h>
a8b13aa2
AG
20#include <linux/statfs.h>
21#include <linux/exportfs.h>
a1014f10
EP
22
23#include <asm/ioctls.h>
11637e4b 24
c63181e6 25#include "../../mount.h"
be77196b 26#include "../fdinfo.h"
7053aee2 27#include "fanotify.h"
c63181e6 28
2529a0df 29#define FANOTIFY_DEFAULT_MAX_EVENTS 16384
5b8fea65
AG
30#define FANOTIFY_OLD_DEFAULT_MAX_MARKS 8192
31#define FANOTIFY_DEFAULT_MAX_GROUPS 128
32
33/*
34 * Legacy fanotify marks limits (8192) is per group and we introduced a tunable
35 * limit of marks per user, similar to inotify. Effectively, the legacy limit
36 * of fanotify marks per user is <max marks per group> * <max groups per user>.
37 * This default limit (1M) also happens to match the increased limit of inotify
38 * max_user_watches since v5.10.
39 */
40#define FANOTIFY_DEFAULT_MAX_USER_MARKS \
41 (FANOTIFY_OLD_DEFAULT_MAX_MARKS * FANOTIFY_DEFAULT_MAX_GROUPS)
42
43/*
44 * Most of the memory cost of adding an inode mark is pinning the marked inode.
45 * The size of the filesystem inode struct is not uniform across filesystems,
46 * so double the size of a VFS inode is used as a conservative approximation.
47 */
48#define INODE_MARK_COST (2 * sizeof(struct inode))
49
50/* configurable via /proc/sys/fs/fanotify/ */
51static int fanotify_max_queued_events __read_mostly;
52
53#ifdef CONFIG_SYSCTL
54
55#include <linux/sysctl.h>
56
57struct ctl_table fanotify_table[] = {
58 {
59 .procname = "max_user_groups",
60 .data = &init_user_ns.ucount_max[UCOUNT_FANOTIFY_GROUPS],
61 .maxlen = sizeof(int),
62 .mode = 0644,
63 .proc_handler = proc_dointvec_minmax,
64 .extra1 = SYSCTL_ZERO,
65 },
66 {
67 .procname = "max_user_marks",
68 .data = &init_user_ns.ucount_max[UCOUNT_FANOTIFY_MARKS],
69 .maxlen = sizeof(int),
70 .mode = 0644,
71 .proc_handler = proc_dointvec_minmax,
72 .extra1 = SYSCTL_ZERO,
73 },
74 {
75 .procname = "max_queued_events",
76 .data = &fanotify_max_queued_events,
77 .maxlen = sizeof(int),
78 .mode = 0644,
79 .proc_handler = proc_dointvec_minmax,
80 .extra1 = SYSCTL_ZERO
81 },
82 { }
83};
84#endif /* CONFIG_SYSCTL */
2529a0df 85
48149e9d
HS
/*
 * The complete set of flags accepted in the event_f_flags parameter of
 * fanotify_init.
 *
 * Field f_flags of struct file holds internal and external open flags
 * together. event_f_flags may only carry external open flags; internal ones
 * such as FMODE_NONOTIFY, FMODE_EXEC and FMODE_NOCMTIME must be excluded.
 */
#define	FANOTIFY_INIT_ALL_EVENT_F_BITS				( \
		O_ACCMODE	| O_APPEND	| O_NONBLOCK	| \
		__O_SYNC	| O_DSYNC	| O_CLOEXEC	| \
		O_LARGEFILE	| O_NOATIME	)
98
33d3dfff 99extern const struct fsnotify_ops fanotify_fsnotify_ops;
11637e4b 100
054c636e 101struct kmem_cache *fanotify_mark_cache __read_mostly;
7088f357
JK
102struct kmem_cache *fanotify_fid_event_cachep __read_mostly;
103struct kmem_cache *fanotify_path_event_cachep __read_mostly;
f083441b 104struct kmem_cache *fanotify_perm_event_cachep __read_mostly;
2a3edf86 105
5e469c83 106#define FANOTIFY_EVENT_ALIGN 4
d3424c9b 107#define FANOTIFY_FID_INFO_HDR_LEN \
44d705b0 108 (sizeof(struct fanotify_event_info_fid) + sizeof(struct file_handle))
5e469c83 109
44d705b0 110static int fanotify_fid_info_len(int fh_len, int name_len)
d766b553 111{
44d705b0
AG
112 int info_len = fh_len;
113
114 if (name_len)
115 info_len += name_len + 1;
116
d3424c9b
MB
117 return roundup(FANOTIFY_FID_INFO_HDR_LEN + info_len,
118 FANOTIFY_EVENT_ALIGN);
d766b553
AG
119}
120
d3424c9b 121static int fanotify_event_info_len(unsigned int info_mode,
929943b3 122 struct fanotify_event *event)
5e469c83 123{
f454fa61
AG
124 struct fanotify_info *info = fanotify_event_info(event);
125 int dir_fh_len = fanotify_event_dir_fh_len(event);
afc894c7 126 int fh_len = fanotify_event_object_fh_len(event);
f454fa61 127 int info_len = 0;
929943b3 128 int dot_len = 0;
f454fa61 129
929943b3 130 if (dir_fh_len) {
f454fa61 131 info_len += fanotify_fid_info_len(dir_fh_len, info->name_len);
d3424c9b
MB
132 } else if ((info_mode & FAN_REPORT_NAME) &&
133 (event->mask & FAN_ONDIR)) {
929943b3
AG
134 /*
135 * With group flag FAN_REPORT_NAME, if name was not recorded in
136 * event on a directory, we will report the name ".".
137 */
138 dot_len = 1;
139 }
afc894c7 140
44d705b0 141 if (fh_len)
929943b3 142 info_len += fanotify_fid_info_len(fh_len, dot_len);
44d705b0 143
44d705b0 144 return info_len;
5e469c83
AG
145}
146
94e00d28
AG
147/*
148 * Remove an hashed event from merge hash table.
149 */
150static void fanotify_unhash_event(struct fsnotify_group *group,
151 struct fanotify_event *event)
152{
153 assert_spin_locked(&group->notification_lock);
154
155 pr_debug("%s: group=%p event=%p bucket=%u\n", __func__,
156 group, event, fanotify_event_hash_bucket(group, event));
157
158 if (WARN_ON_ONCE(hlist_unhashed(&event->merge_list)))
159 return;
160
161 hlist_del_init(&event->merge_list);
162}
163
a1014f10 164/*
7088f357 165 * Get an fanotify notification event if one exists and is small
a1014f10 166 * enough to fit in "count". Return an error pointer if the count
40873284
JK
167 * is not large enough. When permission event is dequeued, its state is
168 * updated accordingly.
a1014f10 169 */
7088f357 170static struct fanotify_event *get_one_event(struct fsnotify_group *group,
a1014f10
EP
171 size_t count)
172{
5e469c83 173 size_t event_size = FAN_EVENT_METADATA_LEN;
7088f357 174 struct fanotify_event *event = NULL;
6f73171e 175 struct fsnotify_event *fsn_event;
0aca67bb 176 unsigned int info_mode = FAN_GROUP_FLAG(group, FANOTIFY_INFO_MODES);
a1014f10
EP
177
178 pr_debug("%s: group=%p count=%zd\n", __func__, group, count);
179
8c554466 180 spin_lock(&group->notification_lock);
6f73171e
AG
181 fsn_event = fsnotify_peek_first_event(group);
182 if (!fsn_event)
8c554466 183 goto out;
a1014f10 184
6f73171e 185 event = FANOTIFY_E(fsn_event);
0aca67bb
MB
186 if (info_mode)
187 event_size += fanotify_event_info_len(info_mode, event);
5e469c83 188
8c554466 189 if (event_size > count) {
7088f357 190 event = ERR_PTR(-EINVAL);
8c554466
JK
191 goto out;
192 }
6f73171e
AG
193
194 /*
195 * Held the notification_lock the whole time, so this is the
196 * same event we peeked above.
197 */
198 fsnotify_remove_first_event(group);
7088f357
JK
199 if (fanotify_is_perm_event(event->mask))
200 FANOTIFY_PERM(event)->state = FAN_EVENT_REPORTED;
94e00d28
AG
201 if (fanotify_is_hashed_event(event->mask))
202 fanotify_unhash_event(group, event);
8c554466
JK
203out:
204 spin_unlock(&group->notification_lock);
7088f357 205 return event;
a1014f10
EP
206}
207
a741c2fe 208static int create_fd(struct fsnotify_group *group, struct path *path,
7053aee2 209 struct file **file)
a1014f10
EP
210{
211 int client_fd;
a1014f10
EP
212 struct file *new_file;
213
0b37e097 214 client_fd = get_unused_fd_flags(group->fanotify_data.f_flags);
a1014f10
EP
215 if (client_fd < 0)
216 return client_fd;
217
a1014f10
EP
218 /*
219 * we need a new file handle for the userspace program so it can read even if it was
220 * originally opened O_WRONLY.
221 */
a741c2fe
JK
222 new_file = dentry_open(path,
223 group->fanotify_data.f_flags | FMODE_NONOTIFY,
224 current_cred());
a1014f10
EP
225 if (IS_ERR(new_file)) {
226 /*
227 * we still send an event even if we can't open the file. this
228 * can happen when say tasks are gone and we try to open their
229 * /proc files or we try to open a WRONLY file like in sysfs
230 * we just send the errno to userspace since there isn't much
231 * else we can do.
232 */
233 put_unused_fd(client_fd);
234 client_fd = PTR_ERR(new_file);
235 } else {
352e3b24 236 *file = new_file;
a1014f10
EP
237 }
238
22aa425d 239 return client_fd;
a1014f10
EP
240}
241
40873284
JK
242/*
243 * Finish processing of permission event by setting it to ANSWERED state and
244 * drop group->notification_lock.
245 */
246static void finish_permission_event(struct fsnotify_group *group,
247 struct fanotify_perm_event *event,
248 unsigned int response)
249 __releases(&group->notification_lock)
250{
fabf7f29
JK
251 bool destroy = false;
252
40873284
JK
253 assert_spin_locked(&group->notification_lock);
254 event->response = response;
fabf7f29
JK
255 if (event->state == FAN_EVENT_CANCELED)
256 destroy = true;
257 else
258 event->state = FAN_EVENT_ANSWERED;
40873284 259 spin_unlock(&group->notification_lock);
fabf7f29
JK
260 if (destroy)
261 fsnotify_destroy_event(group, &event->fae.fse);
40873284
JK
262}
263
b2d87909
EP
264static int process_access_response(struct fsnotify_group *group,
265 struct fanotify_response *response_struct)
266{
33913997 267 struct fanotify_perm_event *event;
f083441b
JK
268 int fd = response_struct->fd;
269 int response = response_struct->response;
b2d87909
EP
270
271 pr_debug("%s: group=%p fd=%d response=%d\n", __func__, group,
272 fd, response);
273 /*
274 * make sure the response is valid, if invalid we do nothing and either
25985edc 275 * userspace can send a valid response or we will clean it up after the
b2d87909
EP
276 * timeout
277 */
de8cd83e 278 switch (response & ~FAN_AUDIT) {
b2d87909
EP
279 case FAN_ALLOW:
280 case FAN_DENY:
281 break;
282 default:
283 return -EINVAL;
284 }
285
286 if (fd < 0)
287 return -EINVAL;
288
96a71f21 289 if ((response & FAN_AUDIT) && !FAN_GROUP_FLAG(group, FAN_ENABLE_AUDIT))
de8cd83e
SG
290 return -EINVAL;
291
af6a5113
JK
292 spin_lock(&group->notification_lock);
293 list_for_each_entry(event, &group->fanotify_data.access_list,
294 fae.fse.list) {
295 if (event->fd != fd)
296 continue;
b2d87909 297
af6a5113 298 list_del_init(&event->fae.fse.list);
40873284 299 finish_permission_event(group, event, response);
af6a5113
JK
300 wake_up(&group->fanotify_data.access_waitq);
301 return 0;
302 }
303 spin_unlock(&group->notification_lock);
b2d87909 304
af6a5113 305 return -ENOENT;
b2d87909 306}
b2d87909 307
d3424c9b
MB
308static int copy_fid_info_to_user(__kernel_fsid_t *fsid, struct fanotify_fh *fh,
309 int info_type, const char *name,
310 size_t name_len,
311 char __user *buf, size_t count)
5e469c83
AG
312{
313 struct fanotify_event_info_fid info = { };
314 struct file_handle handle = { };
afc894c7 315 unsigned char bounce[FANOTIFY_INLINE_FH_LEN], *fh_buf;
cacfb956 316 size_t fh_len = fh ? fh->len : 0;
44d705b0
AG
317 size_t info_len = fanotify_fid_info_len(fh_len, name_len);
318 size_t len = info_len;
5e469c83 319
44d705b0
AG
320 pr_debug("%s: fh_len=%zu name_len=%zu, info_len=%zu, count=%zu\n",
321 __func__, fh_len, name_len, info_len, count);
322
83b7a598 323 if (!fh_len)
5e469c83
AG
324 return 0;
325
44d705b0 326 if (WARN_ON_ONCE(len < sizeof(info) || len > count))
5e469c83
AG
327 return -EFAULT;
328
44d705b0
AG
329 /*
330 * Copy event info fid header followed by variable sized file handle
331 * and optionally followed by variable sized filename.
332 */
83b7a598
AG
333 switch (info_type) {
334 case FAN_EVENT_INFO_TYPE_FID:
335 case FAN_EVENT_INFO_TYPE_DFID:
336 if (WARN_ON_ONCE(name_len))
337 return -EFAULT;
338 break;
339 case FAN_EVENT_INFO_TYPE_DFID_NAME:
340 if (WARN_ON_ONCE(!name || !name_len))
341 return -EFAULT;
342 break;
343 default:
344 return -EFAULT;
345 }
346
347 info.hdr.info_type = info_type;
5e469c83 348 info.hdr.len = len;
d766b553 349 info.fsid = *fsid;
5e469c83
AG
350 if (copy_to_user(buf, &info, sizeof(info)))
351 return -EFAULT;
352
353 buf += sizeof(info);
354 len -= sizeof(info);
44d705b0
AG
355 if (WARN_ON_ONCE(len < sizeof(handle)))
356 return -EFAULT;
357
afc894c7 358 handle.handle_type = fh->type;
5e469c83
AG
359 handle.handle_bytes = fh_len;
360 if (copy_to_user(buf, &handle, sizeof(handle)))
361 return -EFAULT;
362
363 buf += sizeof(handle);
364 len -= sizeof(handle);
44d705b0
AG
365 if (WARN_ON_ONCE(len < fh_len))
366 return -EFAULT;
367
b2d22b6b 368 /*
44d705b0
AG
369 * For an inline fh and inline file name, copy through stack to exclude
370 * the copy from usercopy hardening protections.
b2d22b6b 371 */
afc894c7 372 fh_buf = fanotify_fh_buf(fh);
b2d22b6b 373 if (fh_len <= FANOTIFY_INLINE_FH_LEN) {
afc894c7
JK
374 memcpy(bounce, fh_buf, fh_len);
375 fh_buf = bounce;
b2d22b6b 376 }
afc894c7 377 if (copy_to_user(buf, fh_buf, fh_len))
5e469c83
AG
378 return -EFAULT;
379
5e469c83
AG
380 buf += fh_len;
381 len -= fh_len;
44d705b0
AG
382
383 if (name_len) {
384 /* Copy the filename with terminating null */
385 name_len++;
386 if (WARN_ON_ONCE(len < name_len))
387 return -EFAULT;
388
389 if (copy_to_user(buf, name, name_len))
390 return -EFAULT;
391
392 buf += name_len;
393 len -= name_len;
394 }
395
396 /* Pad with 0's */
5e469c83
AG
397 WARN_ON_ONCE(len < 0 || len >= FANOTIFY_EVENT_ALIGN);
398 if (len > 0 && clear_user(buf, len))
399 return -EFAULT;
400
44d705b0 401 return info_len;
5e469c83
AG
402}
403
0aca67bb
MB
404static int copy_info_records_to_user(struct fanotify_event *event,
405 struct fanotify_info *info,
406 unsigned int info_mode,
407 char __user *buf, size_t count)
408{
409 int ret, total_bytes = 0, info_type = 0;
410 unsigned int fid_mode = info_mode & FANOTIFY_FID_BITS;
411
412 /*
413 * Event info records order is as follows: dir fid + name, child fid.
414 */
415 if (fanotify_event_dir_fh_len(event)) {
416 info_type = info->name_len ? FAN_EVENT_INFO_TYPE_DFID_NAME :
417 FAN_EVENT_INFO_TYPE_DFID;
418 ret = copy_fid_info_to_user(fanotify_event_fsid(event),
419 fanotify_info_dir_fh(info),
420 info_type,
421 fanotify_info_name(info),
422 info->name_len, buf, count);
423 if (ret < 0)
424 return ret;
425
426 buf += ret;
427 count -= ret;
428 total_bytes += ret;
429 }
430
431 if (fanotify_event_object_fh_len(event)) {
432 const char *dot = NULL;
433 int dot_len = 0;
434
435 if (fid_mode == FAN_REPORT_FID || info_type) {
436 /*
437 * With only group flag FAN_REPORT_FID only type FID is
438 * reported. Second info record type is always FID.
439 */
440 info_type = FAN_EVENT_INFO_TYPE_FID;
441 } else if ((fid_mode & FAN_REPORT_NAME) &&
442 (event->mask & FAN_ONDIR)) {
443 /*
444 * With group flag FAN_REPORT_NAME, if name was not
445 * recorded in an event on a directory, report the name
446 * "." with info type DFID_NAME.
447 */
448 info_type = FAN_EVENT_INFO_TYPE_DFID_NAME;
449 dot = ".";
450 dot_len = 1;
451 } else if ((event->mask & ALL_FSNOTIFY_DIRENT_EVENTS) ||
452 (event->mask & FAN_ONDIR)) {
453 /*
454 * With group flag FAN_REPORT_DIR_FID, a single info
455 * record has type DFID for directory entry modification
456 * event and for event on a directory.
457 */
458 info_type = FAN_EVENT_INFO_TYPE_DFID;
459 } else {
460 /*
461 * With group flags FAN_REPORT_DIR_FID|FAN_REPORT_FID,
462 * a single info record has type FID for event on a
463 * non-directory, when there is no directory to report.
464 * For example, on FAN_DELETE_SELF event.
465 */
466 info_type = FAN_EVENT_INFO_TYPE_FID;
467 }
468
469 ret = copy_fid_info_to_user(fanotify_event_fsid(event),
470 fanotify_event_object_fh(event),
471 info_type, dot, dot_len,
472 buf, count);
473 if (ret < 0)
474 return ret;
475
476 buf += ret;
477 count -= ret;
478 total_bytes += ret;
479 }
480
481 return total_bytes;
482}
483
a1014f10 484static ssize_t copy_event_to_user(struct fsnotify_group *group,
7088f357 485 struct fanotify_event *event,
5b03a472 486 char __user *buf, size_t count)
a1014f10 487{
bb2f7b45 488 struct fanotify_event_metadata metadata;
7088f357 489 struct path *path = fanotify_event_path(event);
f454fa61 490 struct fanotify_info *info = fanotify_event_info(event);
0aca67bb 491 unsigned int info_mode = FAN_GROUP_FLAG(group, FANOTIFY_INFO_MODES);
bb2f7b45 492 struct file *f = NULL;
e9e0c890 493 int ret, fd = FAN_NOFD;
a1014f10 494
7088f357 495 pr_debug("%s: group=%p event=%p\n", __func__, group, event);
a1014f10 496
44d705b0 497 metadata.event_len = FAN_EVENT_METADATA_LEN +
0aca67bb 498 fanotify_event_info_len(info_mode, event);
bb2f7b45
AG
499 metadata.metadata_len = FAN_EVENT_METADATA_LEN;
500 metadata.vers = FANOTIFY_METADATA_VERSION;
501 metadata.reserved = 0;
502 metadata.mask = event->mask & FANOTIFY_OUTGOING_EVENTS;
503 metadata.pid = pid_vnr(event->pid);
7cea2a3c
AG
504 /*
505 * For an unprivileged listener, event->pid can be used to identify the
506 * events generated by the listener process itself, without disclosing
507 * the pids of other processes.
508 */
a8b98c80 509 if (FAN_GROUP_FLAG(group, FANOTIFY_UNPRIV) &&
7cea2a3c
AG
510 task_tgid(current) != event->pid)
511 metadata.pid = 0;
bb2f7b45 512
a8b98c80
AG
513 /*
514 * For now, fid mode is required for an unprivileged listener and
515 * fid mode does not report fd in events. Keep this check anyway
516 * for safety in case fid mode requirement is relaxed in the future
517 * to allow unprivileged listener to get events with no fd and no fid.
518 */
519 if (!FAN_GROUP_FLAG(group, FANOTIFY_UNPRIV) &&
520 path && path->mnt && path->dentry) {
afc894c7
JK
521 fd = create_fd(group, path, &f);
522 if (fd < 0)
523 return fd;
bb2f7b45
AG
524 }
525 metadata.fd = fd;
b2d87909 526
b2d87909 527 ret = -EFAULT;
5b03a472
KC
528 /*
529 * Sanity check copy size in case get_one_event() and
c5e443cb 530 * event_len sizes ever get out of sync.
5b03a472 531 */
bb2f7b45 532 if (WARN_ON_ONCE(metadata.event_len > count))
5b03a472 533 goto out_close_fd;
bb2f7b45 534
5e469c83 535 if (copy_to_user(buf, &metadata, FAN_EVENT_METADATA_LEN))
352e3b24
AV
536 goto out_close_fd;
537
44d705b0
AG
538 buf += FAN_EVENT_METADATA_LEN;
539 count -= FAN_EVENT_METADATA_LEN;
540
bb2f7b45 541 if (fanotify_is_perm_event(event->mask))
7088f357 542 FANOTIFY_PERM(event)->fd = fd;
a1014f10 543
44d705b0 544 if (f)
3587b1b0 545 fd_install(fd, f);
44d705b0 546
0aca67bb
MB
547 if (info_mode) {
548 ret = copy_info_records_to_user(event, info, info_mode,
549 buf, count);
44d705b0 550 if (ret < 0)
f644bc44 551 goto out_close_fd;
5e469c83
AG
552 }
553
bb2f7b45 554 return metadata.event_len;
b2d87909 555
b2d87909 556out_close_fd:
352e3b24
AV
557 if (fd != FAN_NOFD) {
558 put_unused_fd(fd);
559 fput(f);
560 }
b2d87909 561 return ret;
a1014f10
EP
562}
563
564/* intofiy userspace file descriptor functions */
076ccb76 565static __poll_t fanotify_poll(struct file *file, poll_table *wait)
a1014f10
EP
566{
567 struct fsnotify_group *group = file->private_data;
076ccb76 568 __poll_t ret = 0;
a1014f10
EP
569
570 poll_wait(file, &group->notification_waitq, wait);
c21dbe20 571 spin_lock(&group->notification_lock);
a1014f10 572 if (!fsnotify_notify_queue_is_empty(group))
a9a08845 573 ret = EPOLLIN | EPOLLRDNORM;
c21dbe20 574 spin_unlock(&group->notification_lock);
a1014f10
EP
575
576 return ret;
577}
578
579static ssize_t fanotify_read(struct file *file, char __user *buf,
580 size_t count, loff_t *pos)
581{
582 struct fsnotify_group *group;
7088f357 583 struct fanotify_event *event;
a1014f10
EP
584 char __user *start;
585 int ret;
536ebe9c 586 DEFINE_WAIT_FUNC(wait, woken_wake_function);
a1014f10
EP
587
588 start = buf;
589 group = file->private_data;
590
591 pr_debug("%s: group=%p\n", __func__, group);
592
536ebe9c 593 add_wait_queue(&group->notification_waitq, &wait);
a1014f10 594 while (1) {
47aaabde
JK
595 /*
596 * User can supply arbitrarily large buffer. Avoid softlockups
597 * in case there are lots of available events.
598 */
599 cond_resched();
7088f357
JK
600 event = get_one_event(group, count);
601 if (IS_ERR(event)) {
602 ret = PTR_ERR(event);
d8aaab4f
JK
603 break;
604 }
605
7088f357 606 if (!event) {
d8aaab4f
JK
607 ret = -EAGAIN;
608 if (file->f_flags & O_NONBLOCK)
a1014f10 609 break;
d8aaab4f
JK
610
611 ret = -ERESTARTSYS;
612 if (signal_pending(current))
613 break;
614
615 if (start != buf)
a1014f10 616 break;
536ebe9c
PZ
617
618 wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
a1014f10
EP
619 continue;
620 }
621
7088f357 622 ret = copy_event_to_user(group, event, buf, count);
4ff33aaf
AG
623 if (unlikely(ret == -EOPENSTALE)) {
624 /*
625 * We cannot report events with stale fd so drop it.
626 * Setting ret to 0 will continue the event loop and
627 * do the right thing if there are no more events to
628 * read (i.e. return bytes read, -EAGAIN or wait).
629 */
630 ret = 0;
631 }
632
d8aaab4f
JK
633 /*
634 * Permission events get queued to wait for response. Other
635 * events can be destroyed now.
636 */
7088f357
JK
637 if (!fanotify_is_perm_event(event->mask)) {
638 fsnotify_destroy_event(group, &event->fse);
d507816b 639 } else {
4ff33aaf 640 if (ret <= 0) {
40873284
JK
641 spin_lock(&group->notification_lock);
642 finish_permission_event(group,
7088f357 643 FANOTIFY_PERM(event), FAN_DENY);
d507816b 644 wake_up(&group->fanotify_data.access_waitq);
4ff33aaf
AG
645 } else {
646 spin_lock(&group->notification_lock);
7088f357 647 list_add_tail(&event->fse.list,
4ff33aaf
AG
648 &group->fanotify_data.access_list);
649 spin_unlock(&group->notification_lock);
d507816b 650 }
d507816b 651 }
4ff33aaf
AG
652 if (ret < 0)
653 break;
d8aaab4f
JK
654 buf += ret;
655 count -= ret;
a1014f10 656 }
536ebe9c 657 remove_wait_queue(&group->notification_waitq, &wait);
a1014f10 658
a1014f10
EP
659 if (start != buf && ret != -EFAULT)
660 ret = buf - start;
661 return ret;
662}
663
b2d87909
EP
664static ssize_t fanotify_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
665{
b2d87909
EP
666 struct fanotify_response response = { .fd = -1, .response = -1 };
667 struct fsnotify_group *group;
668 int ret;
669
6685df31
MS
670 if (!IS_ENABLED(CONFIG_FANOTIFY_ACCESS_PERMISSIONS))
671 return -EINVAL;
672
b2d87909
EP
673 group = file->private_data;
674
5e23663b
FF
675 if (count < sizeof(response))
676 return -EINVAL;
677
678 count = sizeof(response);
b2d87909
EP
679
680 pr_debug("%s: group=%p count=%zu\n", __func__, group, count);
681
682 if (copy_from_user(&response, buf, count))
683 return -EFAULT;
684
685 ret = process_access_response(group, &response);
686 if (ret < 0)
687 count = ret;
688
689 return count;
b2d87909
EP
690}
691
52c923dd
EP
692static int fanotify_release(struct inode *ignored, struct file *file)
693{
694 struct fsnotify_group *group = file->private_data;
6f73171e 695 struct fsnotify_event *fsn_event;
19ba54f4 696
5838d444 697 /*
96d41019
JK
698 * Stop new events from arriving in the notification queue. since
699 * userspace cannot use fanotify fd anymore, no event can enter or
700 * leave access_list by now either.
5838d444 701 */
96d41019 702 fsnotify_group_stop_queueing(group);
2eebf582 703
96d41019
JK
704 /*
705 * Process all permission events on access_list and notification queue
706 * and simulate reply from userspace.
707 */
073f6552 708 spin_lock(&group->notification_lock);
ca6f8699 709 while (!list_empty(&group->fanotify_data.access_list)) {
7088f357
JK
710 struct fanotify_perm_event *event;
711
ca6f8699
JK
712 event = list_first_entry(&group->fanotify_data.access_list,
713 struct fanotify_perm_event, fae.fse.list);
f083441b 714 list_del_init(&event->fae.fse.list);
40873284
JK
715 finish_permission_event(group, event, FAN_ALLOW);
716 spin_lock(&group->notification_lock);
2eebf582 717 }
2eebf582 718
5838d444 719 /*
96d41019
JK
720 * Destroy all non-permission events. For permission events just
721 * dequeue them and set the response. They will be freed once the
722 * response is consumed and fanotify_get_response() returns.
5838d444 723 */
6f73171e
AG
724 while ((fsn_event = fsnotify_remove_first_event(group))) {
725 struct fanotify_event *event = FANOTIFY_E(fsn_event);
7088f357 726
7088f357 727 if (!(event->mask & FANOTIFY_PERM_EVENTS)) {
c21dbe20 728 spin_unlock(&group->notification_lock);
6f73171e 729 fsnotify_destroy_event(group, fsn_event);
6685df31 730 } else {
7088f357 731 finish_permission_event(group, FANOTIFY_PERM(event),
40873284 732 FAN_ALLOW);
6685df31 733 }
40873284 734 spin_lock(&group->notification_lock);
96d41019 735 }
c21dbe20 736 spin_unlock(&group->notification_lock);
96d41019
JK
737
738 /* Response for all permission events it set, wakeup waiters */
2eebf582 739 wake_up(&group->fanotify_data.access_waitq);
0a6b6bd5 740
52c923dd 741 /* matches the fanotify_init->fsnotify_alloc_group */
d8153d4d 742 fsnotify_destroy_group(group);
52c923dd
EP
743
744 return 0;
745}
746
a1014f10
EP
747static long fanotify_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
748{
749 struct fsnotify_group *group;
7053aee2 750 struct fsnotify_event *fsn_event;
a1014f10
EP
751 void __user *p;
752 int ret = -ENOTTY;
753 size_t send_len = 0;
754
755 group = file->private_data;
756
757 p = (void __user *) arg;
758
759 switch (cmd) {
760 case FIONREAD:
c21dbe20 761 spin_lock(&group->notification_lock);
7053aee2 762 list_for_each_entry(fsn_event, &group->notification_list, list)
a1014f10 763 send_len += FAN_EVENT_METADATA_LEN;
c21dbe20 764 spin_unlock(&group->notification_lock);
a1014f10
EP
765 ret = put_user(send_len, (int __user *) p);
766 break;
767 }
768
769 return ret;
770}
771
52c923dd 772static const struct file_operations fanotify_fops = {
be77196b 773 .show_fdinfo = fanotify_show_fdinfo,
a1014f10
EP
774 .poll = fanotify_poll,
775 .read = fanotify_read,
b2d87909 776 .write = fanotify_write,
52c923dd
EP
777 .fasync = NULL,
778 .release = fanotify_release,
a1014f10 779 .unlocked_ioctl = fanotify_ioctl,
1832f2d8 780 .compat_ioctl = compat_ptr_ioctl,
6038f373 781 .llseek = noop_llseek,
52c923dd
EP
782};
783
2a3edf86 784static int fanotify_find_path(int dfd, const char __user *filename,
ac5656d8
AG
785 struct path *path, unsigned int flags, __u64 mask,
786 unsigned int obj_type)
2a3edf86
EP
787{
788 int ret;
789
790 pr_debug("%s: dfd=%d filename=%p flags=%x\n", __func__,
791 dfd, filename, flags);
792
793 if (filename == NULL) {
2903ff01 794 struct fd f = fdget(dfd);
2a3edf86
EP
795
796 ret = -EBADF;
2903ff01 797 if (!f.file)
2a3edf86
EP
798 goto out;
799
800 ret = -ENOTDIR;
801 if ((flags & FAN_MARK_ONLYDIR) &&
496ad9aa 802 !(S_ISDIR(file_inode(f.file)->i_mode))) {
2903ff01 803 fdput(f);
2a3edf86
EP
804 goto out;
805 }
806
2903ff01 807 *path = f.file->f_path;
2a3edf86 808 path_get(path);
2903ff01 809 fdput(f);
2a3edf86
EP
810 } else {
811 unsigned int lookup_flags = 0;
812
813 if (!(flags & FAN_MARK_DONT_FOLLOW))
814 lookup_flags |= LOOKUP_FOLLOW;
815 if (flags & FAN_MARK_ONLYDIR)
816 lookup_flags |= LOOKUP_DIRECTORY;
817
818 ret = user_path_at(dfd, filename, lookup_flags, path);
819 if (ret)
820 goto out;
821 }
822
823 /* you can only watch an inode if you have read permissions on it */
02f92b38 824 ret = path_permission(path, MAY_READ);
ac5656d8
AG
825 if (ret) {
826 path_put(path);
827 goto out;
828 }
829
830 ret = security_path_notify(path, mask, obj_type);
2a3edf86
EP
831 if (ret)
832 path_put(path);
ac5656d8 833
2a3edf86
EP
834out:
835 return ret;
836}
837
b9e4e3bd 838static __u32 fanotify_mark_remove_from_mask(struct fsnotify_mark *fsn_mark,
4ed6814a
AG
839 __u32 mask, unsigned int flags,
840 __u32 umask, int *destroy)
088b09b0 841{
d2c1874c 842 __u32 oldmask = 0;
088b09b0 843
4ed6814a
AG
844 /* umask bits cannot be removed by user */
845 mask &= ~umask;
088b09b0 846 spin_lock(&fsn_mark->lock);
b9e4e3bd
EP
847 if (!(flags & FAN_MARK_IGNORED_MASK)) {
848 oldmask = fsn_mark->mask;
a72fd224 849 fsn_mark->mask &= ~mask;
b9e4e3bd 850 } else {
a72fd224 851 fsn_mark->ignored_mask &= ~mask;
b9e4e3bd 852 }
4ed6814a
AG
853 /*
854 * We need to keep the mark around even if remaining mask cannot
855 * result in any events (e.g. mask == FAN_ONDIR) to support incremenal
856 * changes to the mask.
857 * Destroy mark when only umask bits remain.
858 */
859 *destroy = !((fsn_mark->mask | fsn_mark->ignored_mask) & ~umask);
088b09b0
AG
860 spin_unlock(&fsn_mark->lock);
861
088b09b0
AG
862 return mask & oldmask;
863}
864
eaa2c6b0
AG
865static int fanotify_remove_mark(struct fsnotify_group *group,
866 fsnotify_connp_t *connp, __u32 mask,
4ed6814a 867 unsigned int flags, __u32 umask)
88826276
EP
868{
869 struct fsnotify_mark *fsn_mark = NULL;
088b09b0 870 __u32 removed;
6dfbd149 871 int destroy_mark;
88826276 872
7b18527c 873 mutex_lock(&group->mark_mutex);
eaa2c6b0 874 fsn_mark = fsnotify_find_mark(connp, group);
7b18527c
LS
875 if (!fsn_mark) {
876 mutex_unlock(&group->mark_mutex);
f3640192 877 return -ENOENT;
7b18527c 878 }
88826276 879
6dfbd149 880 removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags,
4ed6814a 881 umask, &destroy_mark);
3ac70bfc
AG
882 if (removed & fsnotify_conn_mask(fsn_mark->connector))
883 fsnotify_recalc_mask(fsn_mark->connector);
6dfbd149 884 if (destroy_mark)
4712e722 885 fsnotify_detach_mark(fsn_mark);
7b18527c 886 mutex_unlock(&group->mark_mutex);
4712e722
JK
887 if (destroy_mark)
888 fsnotify_free_mark(fsn_mark);
6dfbd149 889
eaa2c6b0 890 /* matches the fsnotify_find_mark() */
f3640192 891 fsnotify_put_mark(fsn_mark);
f3640192
AG
892 return 0;
893}
2a3edf86 894
eaa2c6b0
AG
895static int fanotify_remove_vfsmount_mark(struct fsnotify_group *group,
896 struct vfsmount *mnt, __u32 mask,
4ed6814a 897 unsigned int flags, __u32 umask)
eaa2c6b0
AG
898{
899 return fanotify_remove_mark(group, &real_mount(mnt)->mnt_fsnotify_marks,
4ed6814a 900 mask, flags, umask);
eaa2c6b0
AG
901}
902
d54f4fba 903static int fanotify_remove_sb_mark(struct fsnotify_group *group,
4ed6814a
AG
904 struct super_block *sb, __u32 mask,
905 unsigned int flags, __u32 umask)
d54f4fba 906{
4ed6814a
AG
907 return fanotify_remove_mark(group, &sb->s_fsnotify_marks, mask,
908 flags, umask);
d54f4fba
AG
909}
910
f3640192 911static int fanotify_remove_inode_mark(struct fsnotify_group *group,
b9e4e3bd 912 struct inode *inode, __u32 mask,
4ed6814a 913 unsigned int flags, __u32 umask)
f3640192 914{
eaa2c6b0 915 return fanotify_remove_mark(group, &inode->i_fsnotify_marks, mask,
4ed6814a 916 flags, umask);
2a3edf86
EP
917}
918
b9e4e3bd
EP
919static __u32 fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark,
920 __u32 mask,
921 unsigned int flags)
912ee394 922{
192ca4d1 923 __u32 oldmask = -1;
912ee394
AG
924
925 spin_lock(&fsn_mark->lock);
b9e4e3bd
EP
926 if (!(flags & FAN_MARK_IGNORED_MASK)) {
927 oldmask = fsn_mark->mask;
a72fd224 928 fsn_mark->mask |= mask;
b9e4e3bd 929 } else {
a72fd224 930 fsn_mark->ignored_mask |= mask;
c9778a98
EP
931 if (flags & FAN_MARK_IGNORED_SURV_MODIFY)
932 fsn_mark->flags |= FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY;
b9e4e3bd 933 }
912ee394
AG
934 spin_unlock(&fsn_mark->lock);
935
936 return mask & ~oldmask;
937}
938
5e9c070c 939static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group,
b812a9f5 940 fsnotify_connp_t *connp,
77115225
AG
941 unsigned int type,
942 __kernel_fsid_t *fsid)
5e9c070c 943{
5b8fea65 944 struct ucounts *ucounts = group->fanotify_data.ucounts;
5e9c070c
LS
945 struct fsnotify_mark *mark;
946 int ret;
947
5b8fea65
AG
948 /*
949 * Enforce per user marks limits per user in all containing user ns.
950 * A group with FAN_UNLIMITED_MARKS does not contribute to mark count
951 * in the limited groups account.
952 */
953 if (!FAN_GROUP_FLAG(group, FAN_UNLIMITED_MARKS) &&
954 !inc_ucount(ucounts->ns, ucounts->uid, UCOUNT_FANOTIFY_MARKS))
5e9c070c
LS
955 return ERR_PTR(-ENOSPC);
956
957 mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL);
5b8fea65
AG
958 if (!mark) {
959 ret = -ENOMEM;
960 goto out_dec_ucounts;
961 }
5e9c070c 962
054c636e 963 fsnotify_init_mark(mark, group);
77115225 964 ret = fsnotify_add_mark_locked(mark, connp, type, 0, fsid);
5e9c070c
LS
965 if (ret) {
966 fsnotify_put_mark(mark);
5b8fea65 967 goto out_dec_ucounts;
5e9c070c
LS
968 }
969
970 return mark;
5b8fea65
AG
971
972out_dec_ucounts:
973 if (!FAN_GROUP_FLAG(group, FAN_UNLIMITED_MARKS))
974 dec_ucount(ucounts, UCOUNT_FANOTIFY_MARKS);
975 return ERR_PTR(ret);
5e9c070c
LS
976}
977
978
eaa2c6b0
AG
979static int fanotify_add_mark(struct fsnotify_group *group,
980 fsnotify_connp_t *connp, unsigned int type,
77115225
AG
981 __u32 mask, unsigned int flags,
982 __kernel_fsid_t *fsid)
2a3edf86
EP
983{
984 struct fsnotify_mark *fsn_mark;
912ee394 985 __u32 added;
2a3edf86 986
7b18527c 987 mutex_lock(&group->mark_mutex);
b812a9f5 988 fsn_mark = fsnotify_find_mark(connp, group);
88826276 989 if (!fsn_mark) {
77115225 990 fsn_mark = fanotify_add_new_mark(group, connp, type, fsid);
5e9c070c 991 if (IS_ERR(fsn_mark)) {
7b18527c 992 mutex_unlock(&group->mark_mutex);
5e9c070c 993 return PTR_ERR(fsn_mark);
7b18527c 994 }
88826276 995 }
b9e4e3bd 996 added = fanotify_mark_add_to_mask(fsn_mark, mask, flags);
3ac70bfc
AG
997 if (added & ~fsnotify_conn_mask(fsn_mark->connector))
998 fsnotify_recalc_mask(fsn_mark->connector);
c9747640 999 mutex_unlock(&group->mark_mutex);
5e9c070c 1000
fa218ab9 1001 fsnotify_put_mark(fsn_mark);
5e9c070c 1002 return 0;
88826276
EP
1003}
1004
eaa2c6b0
AG
1005static int fanotify_add_vfsmount_mark(struct fsnotify_group *group,
1006 struct vfsmount *mnt, __u32 mask,
77115225 1007 unsigned int flags, __kernel_fsid_t *fsid)
eaa2c6b0
AG
1008{
1009 return fanotify_add_mark(group, &real_mount(mnt)->mnt_fsnotify_marks,
77115225 1010 FSNOTIFY_OBJ_TYPE_VFSMOUNT, mask, flags, fsid);
eaa2c6b0
AG
1011}
1012
d54f4fba 1013static int fanotify_add_sb_mark(struct fsnotify_group *group,
77115225
AG
1014 struct super_block *sb, __u32 mask,
1015 unsigned int flags, __kernel_fsid_t *fsid)
d54f4fba
AG
1016{
1017 return fanotify_add_mark(group, &sb->s_fsnotify_marks,
77115225 1018 FSNOTIFY_OBJ_TYPE_SB, mask, flags, fsid);
d54f4fba
AG
1019}
1020
52202dfb 1021static int fanotify_add_inode_mark(struct fsnotify_group *group,
b9e4e3bd 1022 struct inode *inode, __u32 mask,
77115225 1023 unsigned int flags, __kernel_fsid_t *fsid)
88826276 1024{
88826276 1025 pr_debug("%s: group=%p inode=%p\n", __func__, group, inode);
2a3edf86 1026
5322a59f
EP
1027 /*
1028 * If some other task has this inode open for write we should not add
1029 * an ignored mark, unless that ignored mark is supposed to survive
1030 * modification changes anyway.
1031 */
1032 if ((flags & FAN_MARK_IGNORED_MASK) &&
1033 !(flags & FAN_MARK_IGNORED_SURV_MODIFY) &&
ac9498d6 1034 inode_is_open_for_write(inode))
5322a59f
EP
1035 return 0;
1036
eaa2c6b0 1037 return fanotify_add_mark(group, &inode->i_fsnotify_marks,
77115225 1038 FSNOTIFY_OBJ_TYPE_INODE, mask, flags, fsid);
88826276 1039}
2a3edf86 1040
b8a6c3a2
AG
1041static struct fsnotify_event *fanotify_alloc_overflow_event(void)
1042{
1043 struct fanotify_event *oevent;
1044
1045 oevent = kmalloc(sizeof(*oevent), GFP_KERNEL_ACCOUNT);
1046 if (!oevent)
1047 return NULL;
1048
1049 fanotify_init_event(oevent, 0, FS_Q_OVERFLOW);
1050 oevent->type = FANOTIFY_EVENT_TYPE_OVERFLOW;
1051
1052 return &oevent->fse;
1053}
1054
94e00d28
AG
1055static struct hlist_head *fanotify_alloc_merge_hash(void)
1056{
1057 struct hlist_head *hash;
1058
1059 hash = kmalloc(sizeof(struct hlist_head) << FANOTIFY_HTABLE_BITS,
1060 GFP_KERNEL_ACCOUNT);
1061 if (!hash)
1062 return NULL;
1063
1064 __hash_init(hash, FANOTIFY_HTABLE_SIZE);
1065
1066 return hash;
1067}
1068
52c923dd 1069/* fanotify syscalls */
08ae8938 1070SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
11637e4b 1071{
52c923dd
EP
1072 struct fsnotify_group *group;
1073 int f_flags, fd;
83b7a598
AG
1074 unsigned int fid_mode = flags & FANOTIFY_FID_BITS;
1075 unsigned int class = flags & FANOTIFY_CLASS_BITS;
a8b98c80 1076 unsigned int internal_flags = 0;
52c923dd 1077
96a71f21
AG
1078 pr_debug("%s: flags=%x event_f_flags=%x\n",
1079 __func__, flags, event_f_flags);
52c923dd 1080
7cea2a3c
AG
1081 if (!capable(CAP_SYS_ADMIN)) {
1082 /*
1083 * An unprivileged user can setup an fanotify group with
1084 * limited functionality - an unprivileged group is limited to
1085 * notification events with file handles and it cannot use
1086 * unlimited queue/marks.
1087 */
1088 if ((flags & FANOTIFY_ADMIN_INIT_FLAGS) || !fid_mode)
1089 return -EPERM;
a8b98c80
AG
1090
1091 /*
1092 * Setting the internal flag FANOTIFY_UNPRIV on the group
1093 * prevents setting mount/filesystem marks on this group and
1094 * prevents reporting pid and open fd in events.
1095 */
1096 internal_flags |= FANOTIFY_UNPRIV;
7cea2a3c 1097 }
52c923dd 1098
de8cd83e 1099#ifdef CONFIG_AUDITSYSCALL
23c9deeb 1100 if (flags & ~(FANOTIFY_INIT_FLAGS | FAN_ENABLE_AUDIT))
de8cd83e 1101#else
23c9deeb 1102 if (flags & ~FANOTIFY_INIT_FLAGS)
de8cd83e 1103#endif
52c923dd
EP
1104 return -EINVAL;
1105
48149e9d
HS
1106 if (event_f_flags & ~FANOTIFY_INIT_ALL_EVENT_F_BITS)
1107 return -EINVAL;
1108
1109 switch (event_f_flags & O_ACCMODE) {
1110 case O_RDONLY:
1111 case O_RDWR:
1112 case O_WRONLY:
1113 break;
1114 default:
1115 return -EINVAL;
1116 }
1117
83b7a598 1118 if (fid_mode && class != FAN_CLASS_NOTIF)
a8b13aa2
AG
1119 return -EINVAL;
1120
929943b3 1121 /*
929943b3 1122 * Child name is reported with parent fid so requires dir fid.
691d9763 1123 * We can report both child fid and dir fid with or without name.
929943b3 1124 */
691d9763 1125 if ((fid_mode & FAN_REPORT_NAME) && !(fid_mode & FAN_REPORT_DIR_FID))
83b7a598 1126 return -EINVAL;
83b7a598 1127
b2d87909 1128 f_flags = O_RDWR | FMODE_NONOTIFY;
52c923dd
EP
1129 if (flags & FAN_CLOEXEC)
1130 f_flags |= O_CLOEXEC;
1131 if (flags & FAN_NONBLOCK)
1132 f_flags |= O_NONBLOCK;
1133
1134 /* fsnotify_alloc_group takes a ref. Dropped in fanotify_release */
ac7b79fd 1135 group = fsnotify_alloc_user_group(&fanotify_fsnotify_ops);
26379198 1136 if (IS_ERR(group)) {
52c923dd 1137 return PTR_ERR(group);
26379198 1138 }
52c923dd 1139
5b8fea65
AG
1140 /* Enforce groups limits per user in all containing user ns */
1141 group->fanotify_data.ucounts = inc_ucount(current_user_ns(),
1142 current_euid(),
1143 UCOUNT_FANOTIFY_GROUPS);
1144 if (!group->fanotify_data.ucounts) {
1145 fd = -EMFILE;
1146 goto out_destroy_group;
1147 }
1148
a8b98c80 1149 group->fanotify_data.flags = flags | internal_flags;
d46eb14b 1150 group->memcg = get_mem_cgroup_from_mm(current->mm);
4afeff85 1151
94e00d28
AG
1152 group->fanotify_data.merge_hash = fanotify_alloc_merge_hash();
1153 if (!group->fanotify_data.merge_hash) {
1154 fd = -ENOMEM;
1155 goto out_destroy_group;
1156 }
1157
b8a6c3a2
AG
1158 group->overflow_event = fanotify_alloc_overflow_event();
1159 if (unlikely(!group->overflow_event)) {
ff57cd58
JK
1160 fd = -ENOMEM;
1161 goto out_destroy_group;
1162 }
ff57cd58 1163
1e2ee49f
WW
1164 if (force_o_largefile())
1165 event_f_flags |= O_LARGEFILE;
80af2588 1166 group->fanotify_data.f_flags = event_f_flags;
9e66e423
EP
1167 init_waitqueue_head(&group->fanotify_data.access_waitq);
1168 INIT_LIST_HEAD(&group->fanotify_data.access_list);
83b7a598 1169 switch (class) {
4231a235
EP
1170 case FAN_CLASS_NOTIF:
1171 group->priority = FS_PRIO_0;
1172 break;
1173 case FAN_CLASS_CONTENT:
1174 group->priority = FS_PRIO_1;
1175 break;
1176 case FAN_CLASS_PRE_CONTENT:
1177 group->priority = FS_PRIO_2;
1178 break;
1179 default:
1180 fd = -EINVAL;
d8153d4d 1181 goto out_destroy_group;
4231a235 1182 }
cb2d429f 1183
5dd03f55
EP
1184 if (flags & FAN_UNLIMITED_QUEUE) {
1185 fd = -EPERM;
1186 if (!capable(CAP_SYS_ADMIN))
d8153d4d 1187 goto out_destroy_group;
5dd03f55
EP
1188 group->max_events = UINT_MAX;
1189 } else {
5b8fea65 1190 group->max_events = fanotify_max_queued_events;
5dd03f55 1191 }
2529a0df 1192
ac7e22dc
EP
1193 if (flags & FAN_UNLIMITED_MARKS) {
1194 fd = -EPERM;
1195 if (!capable(CAP_SYS_ADMIN))
d8153d4d 1196 goto out_destroy_group;
ac7e22dc 1197 }
e7099d8a 1198
de8cd83e
SG
1199 if (flags & FAN_ENABLE_AUDIT) {
1200 fd = -EPERM;
1201 if (!capable(CAP_AUDIT_WRITE))
1202 goto out_destroy_group;
de8cd83e
SG
1203 }
1204
52c923dd
EP
1205 fd = anon_inode_getfd("[fanotify]", &fanotify_fops, group, f_flags);
1206 if (fd < 0)
d8153d4d 1207 goto out_destroy_group;
52c923dd
EP
1208
1209 return fd;
1210
d8153d4d
LS
1211out_destroy_group:
1212 fsnotify_destroy_group(group);
52c923dd 1213 return fd;
11637e4b 1214}
bbaa4168 1215
a8b13aa2 1216/* Check if filesystem can encode a unique fid */
73072283 1217static int fanotify_test_fid(struct path *path, __kernel_fsid_t *fsid)
a8b13aa2 1218{
73072283 1219 __kernel_fsid_t root_fsid;
a8b13aa2
AG
1220 int err;
1221
1222 /*
1223 * Make sure path is not in filesystem with zero fsid (e.g. tmpfs).
1224 */
73072283 1225 err = vfs_get_fsid(path->dentry, fsid);
a8b13aa2
AG
1226 if (err)
1227 return err;
1228
73072283 1229 if (!fsid->val[0] && !fsid->val[1])
a8b13aa2
AG
1230 return -ENODEV;
1231
1232 /*
1233 * Make sure path is not inside a filesystem subvolume (e.g. btrfs)
1234 * which uses a different fsid than sb root.
1235 */
73072283 1236 err = vfs_get_fsid(path->dentry->d_sb->s_root, &root_fsid);
a8b13aa2
AG
1237 if (err)
1238 return err;
1239
73072283
AG
1240 if (root_fsid.val[0] != fsid->val[0] ||
1241 root_fsid.val[1] != fsid->val[1])
a8b13aa2
AG
1242 return -EXDEV;
1243
1244 /*
1245 * We need to make sure that the file system supports at least
1246 * encoding a file handle so user can use name_to_handle_at() to
1247 * compare fid returned with event to the file handle of watched
1248 * objects. However, name_to_handle_at() requires that the
1249 * filesystem also supports decoding file handles.
1250 */
1251 if (!path->dentry->d_sb->s_export_op ||
1252 !path->dentry->d_sb->s_export_op->fh_to_dentry)
1253 return -EOPNOTSUPP;
1254
1255 return 0;
1256}
1257
0b3b094a
JK
1258static int fanotify_events_supported(struct path *path, __u64 mask)
1259{
1260 /*
1261 * Some filesystems such as 'proc' acquire unusual locks when opening
1262 * files. For them fanotify permission events have high chances of
1263 * deadlocking the system - open done when reporting fanotify event
1264 * blocks on this "unusual" lock while another process holding the lock
1265 * waits for fanotify permission event to be answered. Just disallow
1266 * permission events for such filesystems.
1267 */
1268 if (mask & FANOTIFY_PERM_EVENTS &&
1269 path->mnt->mnt_sb->s_type->fs_flags & FS_DISALLOW_NOTIFY_PERM)
1270 return -EINVAL;
1271 return 0;
1272}
1273
183caa3c
DB
1274static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
1275 int dfd, const char __user *pathname)
bbaa4168 1276{
0ff21db9
EP
1277 struct inode *inode = NULL;
1278 struct vfsmount *mnt = NULL;
2a3edf86 1279 struct fsnotify_group *group;
2903ff01 1280 struct fd f;
2a3edf86 1281 struct path path;
73072283 1282 __kernel_fsid_t __fsid, *fsid = NULL;
bdd5a46f 1283 u32 valid_mask = FANOTIFY_EVENTS | FANOTIFY_EVENT_FLAGS;
23c9deeb 1284 unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS;
3ef86653 1285 bool ignored = flags & FAN_MARK_IGNORED_MASK;
d809daf1 1286 unsigned int obj_type, fid_mode;
85af5d92 1287 u32 umask = 0;
2903ff01 1288 int ret;
2a3edf86
EP
1289
1290 pr_debug("%s: fanotify_fd=%d flags=%x dfd=%d pathname=%p mask=%llx\n",
1291 __func__, fanotify_fd, flags, dfd, pathname, mask);
1292
1293 /* we only use the lower 32 bits as of right now. */
22d483b9 1294 if (upper_32_bits(mask))
2a3edf86
EP
1295 return -EINVAL;
1296
23c9deeb 1297 if (flags & ~FANOTIFY_MARK_FLAGS)
88380fe6 1298 return -EINVAL;
d54f4fba
AG
1299
1300 switch (mark_type) {
1301 case FAN_MARK_INODE:
ac5656d8
AG
1302 obj_type = FSNOTIFY_OBJ_TYPE_INODE;
1303 break;
d54f4fba 1304 case FAN_MARK_MOUNT:
ac5656d8
AG
1305 obj_type = FSNOTIFY_OBJ_TYPE_VFSMOUNT;
1306 break;
d54f4fba 1307 case FAN_MARK_FILESYSTEM:
ac5656d8 1308 obj_type = FSNOTIFY_OBJ_TYPE_SB;
d54f4fba
AG
1309 break;
1310 default:
1311 return -EINVAL;
1312 }
1313
4d92604c 1314 switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE | FAN_MARK_FLUSH)) {
df561f66 1315 case FAN_MARK_ADD:
88380fe6 1316 case FAN_MARK_REMOVE:
1734dee4
LS
1317 if (!mask)
1318 return -EINVAL;
cc299a98 1319 break;
4d92604c 1320 case FAN_MARK_FLUSH:
23c9deeb 1321 if (flags & ~(FANOTIFY_MARK_TYPE_BITS | FAN_MARK_FLUSH))
cc299a98 1322 return -EINVAL;
88380fe6
AG
1323 break;
1324 default:
1325 return -EINVAL;
1326 }
8fcd6528 1327
6685df31 1328 if (IS_ENABLED(CONFIG_FANOTIFY_ACCESS_PERMISSIONS))
23c9deeb 1329 valid_mask |= FANOTIFY_PERM_EVENTS;
6685df31
MS
1330
1331 if (mask & ~valid_mask)
2a3edf86
EP
1332 return -EINVAL;
1333
3ef86653
AG
1334 /* Event flags (ONDIR, ON_CHILD) are meaningless in ignored mask */
1335 if (ignored)
1336 mask &= ~FANOTIFY_EVENT_FLAGS;
1337
2903ff01
AV
1338 f = fdget(fanotify_fd);
1339 if (unlikely(!f.file))
2a3edf86
EP
1340 return -EBADF;
1341
1342 /* verify that this is indeed an fanotify instance */
1343 ret = -EINVAL;
2903ff01 1344 if (unlikely(f.file->f_op != &fanotify_fops))
2a3edf86 1345 goto fput_and_out;
2903ff01 1346 group = f.file->private_data;
4231a235 1347
7cea2a3c 1348 /*
a8b98c80
AG
1349 * An unprivileged user is not allowed to setup mount nor filesystem
1350 * marks. This also includes setting up such marks by a group that
1351 * was initialized by an unprivileged user.
7cea2a3c
AG
1352 */
1353 ret = -EPERM;
a8b98c80
AG
1354 if ((!capable(CAP_SYS_ADMIN) ||
1355 FAN_GROUP_FLAG(group, FANOTIFY_UNPRIV)) &&
7cea2a3c
AG
1356 mark_type != FAN_MARK_INODE)
1357 goto fput_and_out;
1358
4231a235
EP
1359 /*
1360 * group->priority == FS_PRIO_0 == FAN_CLASS_NOTIF. These are not
1361 * allowed to set permissions events.
1362 */
1363 ret = -EINVAL;
23c9deeb 1364 if (mask & FANOTIFY_PERM_EVENTS &&
4231a235
EP
1365 group->priority == FS_PRIO_0)
1366 goto fput_and_out;
2a3edf86 1367
235328d1
AG
1368 /*
1369 * Events with data type inode do not carry enough information to report
1370 * event->fd, so we do not allow setting a mask for inode events unless
1371 * group supports reporting fid.
1372 * inode events are not supported on a mount mark, because they do not
1373 * carry enough information (i.e. path) to be filtered by mount point.
1374 */
d809daf1 1375 fid_mode = FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS);
235328d1 1376 if (mask & FANOTIFY_INODE_EVENTS &&
d809daf1 1377 (!fid_mode || mark_type == FAN_MARK_MOUNT))
235328d1
AG
1378 goto fput_and_out;
1379
0a8dd2db
HS
1380 if (flags & FAN_MARK_FLUSH) {
1381 ret = 0;
d54f4fba 1382 if (mark_type == FAN_MARK_MOUNT)
0a8dd2db 1383 fsnotify_clear_vfsmount_marks_by_group(group);
d54f4fba
AG
1384 else if (mark_type == FAN_MARK_FILESYSTEM)
1385 fsnotify_clear_sb_marks_by_group(group);
0a8dd2db
HS
1386 else
1387 fsnotify_clear_inode_marks_by_group(group);
1388 goto fput_and_out;
1389 }
1390
ac5656d8
AG
1391 ret = fanotify_find_path(dfd, pathname, &path, flags,
1392 (mask & ALL_FSNOTIFY_EVENTS), obj_type);
2a3edf86
EP
1393 if (ret)
1394 goto fput_and_out;
1395
0b3b094a
JK
1396 if (flags & FAN_MARK_ADD) {
1397 ret = fanotify_events_supported(&path, mask);
1398 if (ret)
1399 goto path_put_and_out;
1400 }
1401
d809daf1 1402 if (fid_mode) {
73072283 1403 ret = fanotify_test_fid(&path, &__fsid);
a8b13aa2
AG
1404 if (ret)
1405 goto path_put_and_out;
77115225 1406
73072283 1407 fsid = &__fsid;
a8b13aa2
AG
1408 }
1409
2a3edf86 1410 /* inode held in place by reference to path; group by fget on fd */
d54f4fba 1411 if (mark_type == FAN_MARK_INODE)
0ff21db9
EP
1412 inode = path.dentry->d_inode;
1413 else
1414 mnt = path.mnt;
2a3edf86 1415
85af5d92
AG
1416 /* Mask out FAN_EVENT_ON_CHILD flag for sb/mount/non-dir marks */
1417 if (mnt || !S_ISDIR(inode->i_mode)) {
1418 mask &= ~FAN_EVENT_ON_CHILD;
1419 umask = FAN_EVENT_ON_CHILD;
51280637
AG
1420 /*
1421 * If group needs to report parent fid, register for getting
1422 * events with parent/name info for non-directory.
1423 */
1424 if ((fid_mode & FAN_REPORT_DIR_FID) &&
1425 (flags & FAN_MARK_ADD) && !ignored)
1426 mask |= FAN_EVENT_ON_CHILD;
85af5d92
AG
1427 }
1428
2a3edf86 1429 /* create/update an inode mark */
0a8dd2db 1430 switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE)) {
c6223f46 1431 case FAN_MARK_ADD:
d54f4fba 1432 if (mark_type == FAN_MARK_MOUNT)
77115225
AG
1433 ret = fanotify_add_vfsmount_mark(group, mnt, mask,
1434 flags, fsid);
d54f4fba 1435 else if (mark_type == FAN_MARK_FILESYSTEM)
77115225
AG
1436 ret = fanotify_add_sb_mark(group, mnt->mnt_sb, mask,
1437 flags, fsid);
0ff21db9 1438 else
77115225
AG
1439 ret = fanotify_add_inode_mark(group, inode, mask,
1440 flags, fsid);
c6223f46
AG
1441 break;
1442 case FAN_MARK_REMOVE:
d54f4fba 1443 if (mark_type == FAN_MARK_MOUNT)
77115225 1444 ret = fanotify_remove_vfsmount_mark(group, mnt, mask,
85af5d92 1445 flags, umask);
d54f4fba 1446 else if (mark_type == FAN_MARK_FILESYSTEM)
77115225 1447 ret = fanotify_remove_sb_mark(group, mnt->mnt_sb, mask,
85af5d92 1448 flags, umask);
f3640192 1449 else
77115225 1450 ret = fanotify_remove_inode_mark(group, inode, mask,
85af5d92 1451 flags, umask);
c6223f46
AG
1452 break;
1453 default:
1454 ret = -EINVAL;
1455 }
2a3edf86 1456
a8b13aa2 1457path_put_and_out:
2a3edf86
EP
1458 path_put(&path);
1459fput_and_out:
2903ff01 1460 fdput(f);
2a3edf86
EP
1461 return ret;
1462}
1463
2ca408d9 1464#ifndef CONFIG_ARCH_SPLIT_ARG64
183caa3c
DB
1465SYSCALL_DEFINE5(fanotify_mark, int, fanotify_fd, unsigned int, flags,
1466 __u64, mask, int, dfd,
1467 const char __user *, pathname)
1468{
1469 return do_fanotify_mark(fanotify_fd, flags, mask, dfd, pathname);
1470}
2ca408d9 1471#endif
183caa3c 1472
#if defined(CONFIG_ARCH_SPLIT_ARG64) || defined(CONFIG_COMPAT)
/*
 * fanotify_mark() entry point for callers that pass @mask through
 * SC_ARG64(); the value is reassembled with SC_VAL64() and handed to
 * do_fanotify_mark(). Built for CONFIG_ARCH_SPLIT_ARG64 or CONFIG_COMPAT.
 */
SYSCALL32_DEFINE6(fanotify_mark,
		  int, fanotify_fd, unsigned int, flags,
		  SC_ARG64(mask), int, dfd,
		  const char __user *, pathname)
{
	__u64 full_mask = SC_VAL64(__u64, mask);

	return do_fanotify_mark(fanotify_fd, flags, full_mask, dfd, pathname);
}
#endif
1483
2a3edf86 1484/*
ae0e47f0 1485 * fanotify_user_setup - Our initialization function. Note that we cannot return
2a3edf86
EP
1486 * error because we have compiled-in VFS hooks. So an (unlikely) failure here
1487 * must result in panic().
1488 */
1489static int __init fanotify_user_setup(void)
1490{
5b8fea65
AG
1491 struct sysinfo si;
1492 int max_marks;
1493
1494 si_meminfo(&si);
1495 /*
1496 * Allow up to 1% of addressable memory to be accounted for per user
1497 * marks limited to the range [8192, 1048576]. mount and sb marks are
1498 * a lot cheaper than inode marks, but there is no reason for a user
1499 * to have many of those, so calculate by the cost of inode marks.
1500 */
1501 max_marks = (((si.totalram - si.totalhigh) / 100) << PAGE_SHIFT) /
1502 INODE_MARK_COST;
1503 max_marks = clamp(max_marks, FANOTIFY_OLD_DEFAULT_MAX_MARKS,
1504 FANOTIFY_DEFAULT_MAX_USER_MARKS);
1505
a8b98c80 1506 BUILD_BUG_ON(FANOTIFY_INIT_FLAGS & FANOTIFY_INTERNAL_GROUP_FLAGS);
929943b3 1507 BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 10);
bdd5a46f
AG
1508 BUILD_BUG_ON(HWEIGHT32(FANOTIFY_MARK_FLAGS) != 9);
1509
d46eb14b
SB
1510 fanotify_mark_cache = KMEM_CACHE(fsnotify_mark,
1511 SLAB_PANIC|SLAB_ACCOUNT);
7088f357
JK
1512 fanotify_fid_event_cachep = KMEM_CACHE(fanotify_fid_event,
1513 SLAB_PANIC);
1514 fanotify_path_event_cachep = KMEM_CACHE(fanotify_path_event,
1515 SLAB_PANIC);
6685df31
MS
1516 if (IS_ENABLED(CONFIG_FANOTIFY_ACCESS_PERMISSIONS)) {
1517 fanotify_perm_event_cachep =
33913997 1518 KMEM_CACHE(fanotify_perm_event, SLAB_PANIC);
6685df31 1519 }
2a3edf86 1520
5b8fea65
AG
1521 fanotify_max_queued_events = FANOTIFY_DEFAULT_MAX_EVENTS;
1522 init_user_ns.ucount_max[UCOUNT_FANOTIFY_GROUPS] =
1523 FANOTIFY_DEFAULT_MAX_GROUPS;
1524 init_user_ns.ucount_max[UCOUNT_FANOTIFY_MARKS] = max_marks;
1525
2a3edf86 1526 return 0;
bbaa4168 1527}
2a3edf86 1528device_initcall(fanotify_user_setup);