// SPDX-License-Identifier: GPL-2.0-only
/*
 * fs/kernfs/file.c - kernfs file implementation
 *
 * Copyright (c) 2001-3 Patrick Mochel
 * Copyright (c) 2007 SUSE Linux Products GmbH
 * Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org>
 */

#include <linux/fs.h>
#include <linux/seq_file.h>
#include <linux/slab.h>
#include <linux/poll.h>
#include <linux/pagemap.h>
#include <linux/sched/mm.h>
#include <linux/fsnotify.h>
#include <linux/uio.h>

#include "kernfs-internal.h"

struct kernfs_open_node {
	struct rcu_head		rcu_head;
	atomic_t		event;
	wait_queue_head_t	poll;
	struct list_head	files; /* goes through kernfs_open_file.list */
};

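/*
 * Note: each kernfs_node has at most one kernfs_open_node, created on
 * first open and freed via kfree_rcu() once the last open file is
 * unlinked from ->files (see kernfs_get_open_node() and
 * kernfs_unlink_open_file() below).
 */
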
/*
 * kernfs_notify() may be called from any context and bounces notifications
 * through a work item. To minimize space overhead in kernfs_node, the
 * pending queue is implemented as a singly linked list of kernfs_nodes.
 * The list is terminated with the self pointer so that whether a
 * kernfs_node is on the list or not can be determined by testing the next
 * pointer for NULL.
 */
#define KERNFS_NOTIFY_EOL			((void *)&kernfs_notify_list)
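
/*
 * For illustration: with nodes A and B pending (B queued after A), the
 * list looks like
 *
 *	kernfs_notify_list -> B -> A -> KERNFS_NOTIFY_EOL
 *
 * while a node that is not queued has ->attr.notify_next == NULL.
 */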

static DEFINE_SPINLOCK(kernfs_notify_lock);
static struct kernfs_node *kernfs_notify_list = KERNFS_NOTIFY_EOL;

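/*
 * To reduce contention, the open-file paths do not share one global lock;
 * kernfs_locks->open_file_mutex is a fixed array of mutexes and each
 * kernfs_node is hashed onto one of them by pointer value below. Nodes
 * that collide in the hash share a mutex, which is harmless for
 * correctness.
 */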
static inline struct mutex *kernfs_open_file_mutex_ptr(struct kernfs_node *kn)
{
	int idx = hash_ptr(kn, NR_KERNFS_LOCK_BITS);

	return &kernfs_locks->open_file_mutex[idx];
}

static inline struct mutex *kernfs_open_file_mutex_lock(struct kernfs_node *kn)
{
	struct mutex *lock;

	lock = kernfs_open_file_mutex_ptr(kn);

	mutex_lock(lock);

	return lock;
}

/**
 * kernfs_deref_open_node - Get kernfs_open_node corresponding to @kn.
 *
 * @of: associated kernfs_open_file instance.
 * @kn: target kernfs_node.
 *
 * Fetch and return ->attr.open of @kn if @of->list is non-empty.
 * If @of->list is non-empty we can safely assume that @of is on
 * @kn->attr.open->files list and this guarantees that @kn->attr.open
 * will not vanish, i.e. dereferencing outside the RCU read-side critical
 * section is safe here.
 *
 * The caller needs to make sure that @of->list is not empty.
 */
static struct kernfs_open_node *
kernfs_deref_open_node(struct kernfs_open_file *of, struct kernfs_node *kn)
{
	struct kernfs_open_node *on;

	on = rcu_dereference_check(kn->attr.open, !list_empty(&of->list));

	return on;
}

/**
 * kernfs_deref_open_node_protected - Get kernfs_open_node corresponding to @kn
 *
 * @kn: target kernfs_node.
 *
 * Fetch and return ->attr.open of @kn when the caller holds
 * kernfs_open_file_mutex_ptr(kn).
 *
 * Update of ->attr.open happens under kernfs_open_file_mutex_ptr(kn). So when
 * the caller guarantees that this mutex is being held, other updaters can't
 * change ->attr.open and this means that we can safely deref ->attr.open
 * outside the RCU read-side critical section.
 *
 * The caller needs to make sure that kernfs_open_file_mutex is held.
 */
static struct kernfs_open_node *
kernfs_deref_open_node_protected(struct kernfs_node *kn)
{
	return rcu_dereference_protected(kn->attr.open,
				lockdep_is_held(kernfs_open_file_mutex_ptr(kn)));
}

static struct kernfs_open_file *kernfs_of(struct file *file)
{
	return ((struct seq_file *)file->private_data)->private;
}

/*
 * Determine the kernfs_ops for the given kernfs_node. This function must
 * be called while holding an active reference.
 */
static const struct kernfs_ops *kernfs_ops(struct kernfs_node *kn)
{
	if (kn->flags & KERNFS_LOCKDEP)
		lockdep_assert_held(kn);
	return kn->attr.ops;
}

/*
 * As kernfs_seq_stop() is also called after kernfs_seq_start() or
 * kernfs_seq_next() failure, it needs to distinguish whether it's stopping
 * a seq_file iteration which is fully initialized with an active reference
 * or an aborted kernfs_seq_start() due to get_active failure. The
 * position pointer is the only context for each seq_file iteration and
 * thus the stop condition should be encoded in it. As the return value is
 * directly visible to userland, ERR_PTR(-ENODEV) is the only acceptable
 * choice to indicate get_active failure.
 *
 * Unfortunately, this is complicated due to the optional custom seq_file
 * operations which may return ERR_PTR(-ENODEV) too. kernfs_seq_stop()
 * can't distinguish whether ERR_PTR(-ENODEV) is from get_active failure or
 * custom seq_file operations and thus can't decide whether put_active
 * should be performed or not only on ERR_PTR(-ENODEV).
 *
 * This is worked around by factoring out the custom seq_stop() and
 * put_active part into kernfs_seq_stop_active(), skipping it from
 * kernfs_seq_stop() if ERR_PTR(-ENODEV) while invoking it directly after
 * custom seq_file operations fail with ERR_PTR(-ENODEV) - this ensures
 * that kernfs_seq_stop_active() is skipped only after get_active failure.
 */
static void kernfs_seq_stop_active(struct seq_file *sf, void *v)
{
	struct kernfs_open_file *of = sf->private;
	const struct kernfs_ops *ops = kernfs_ops(of->kn);

	if (ops->seq_stop)
		ops->seq_stop(sf, v);
	kernfs_put_active(of->kn);
}

static void *kernfs_seq_start(struct seq_file *sf, loff_t *ppos)
{
	struct kernfs_open_file *of = sf->private;
	const struct kernfs_ops *ops;

	/*
	 * @of->mutex nests outside active ref and is primarily to ensure that
	 * the ops aren't called concurrently for the same open file.
	 */
	mutex_lock(&of->mutex);
	if (!kernfs_get_active(of->kn))
		return ERR_PTR(-ENODEV);

	ops = kernfs_ops(of->kn);
	if (ops->seq_start) {
		void *next = ops->seq_start(sf, ppos);
		/* see the comment above kernfs_seq_stop_active() */
		if (next == ERR_PTR(-ENODEV))
			kernfs_seq_stop_active(sf, next);
		return next;
	}
	return single_start(sf, ppos);
}

static void *kernfs_seq_next(struct seq_file *sf, void *v, loff_t *ppos)
{
	struct kernfs_open_file *of = sf->private;
	const struct kernfs_ops *ops = kernfs_ops(of->kn);

	if (ops->seq_next) {
		void *next = ops->seq_next(sf, v, ppos);
		/* see the comment above kernfs_seq_stop_active() */
		if (next == ERR_PTR(-ENODEV))
			kernfs_seq_stop_active(sf, next);
		return next;
	} else {
		/*
		 * The same behavior and code as single_open(), always
		 * terminate after the initial read.
		 */
		++*ppos;
		return NULL;
	}
}

static void kernfs_seq_stop(struct seq_file *sf, void *v)
{
	struct kernfs_open_file *of = sf->private;

	if (v != ERR_PTR(-ENODEV))
		kernfs_seq_stop_active(sf, v);
	mutex_unlock(&of->mutex);
}

static int kernfs_seq_show(struct seq_file *sf, void *v)
{
	struct kernfs_open_file *of = sf->private;
	struct kernfs_open_node *on = kernfs_deref_open_node(of, of->kn);

	if (!on)
		return -EINVAL;

	of->event = atomic_read(&on->event);

	return of->kn->attr.ops->seq_show(sf, v);
}

static const struct seq_operations kernfs_seq_ops = {
	.start = kernfs_seq_start,
	.next = kernfs_seq_next,
	.stop = kernfs_seq_stop,
	.show = kernfs_seq_show,
};

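/*
 * For a seq_show-backed attribute, a read(2) thus flows through
 * kernfs_fop_read_iter() -> seq_read_iter() -> kernfs_seq_start() ->
 * kernfs_seq_show() -> kernfs_seq_stop(), with the active reference and
 * @of->mutex held from start to stop.
 */
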
/*
 * As reading a bin file can have side-effects, the exact offset and bytes
 * specified in read(2) call should be passed to the read callback making
 * it difficult to use seq_file. Implement simplistic custom buffering for
 * bin files.
 */
static ssize_t kernfs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
{
	struct kernfs_open_file *of = kernfs_of(iocb->ki_filp);
	ssize_t len = min_t(size_t, iov_iter_count(iter), PAGE_SIZE);
	const struct kernfs_ops *ops;
	struct kernfs_open_node *on;
	char *buf;

	buf = of->prealloc_buf;
	if (buf)
		mutex_lock(&of->prealloc_mutex);
	else
		buf = kmalloc(len, GFP_KERNEL);
	if (!buf)
		return -ENOMEM;

	/*
	 * @of->mutex nests outside active ref and is used to ensure that
	 * the ops aren't called concurrently for the same open file.
	 */
	mutex_lock(&of->mutex);
	if (!kernfs_get_active(of->kn)) {
		len = -ENODEV;
		mutex_unlock(&of->mutex);
		goto out_free;
	}

	on = kernfs_deref_open_node(of, of->kn);
	if (!on) {
		len = -EINVAL;
		mutex_unlock(&of->mutex);
		goto out_free;
	}

	of->event = atomic_read(&on->event);

	ops = kernfs_ops(of->kn);
	if (ops->read)
		len = ops->read(of, buf, len, iocb->ki_pos);
	else
		len = -EINVAL;

	kernfs_put_active(of->kn);
	mutex_unlock(&of->mutex);

	if (len < 0)
		goto out_free;

	if (copy_to_iter(buf, len, iter) != len) {
		len = -EFAULT;
		goto out_free;
	}

	iocb->ki_pos += len;

 out_free:
	if (buf == of->prealloc_buf)
		mutex_unlock(&of->prealloc_mutex);
	else
		kfree(buf);
	return len;
}

static ssize_t kernfs_fop_read_iter(struct kiocb *iocb, struct iov_iter *iter)
{
	if (kernfs_of(iocb->ki_filp)->kn->flags & KERNFS_HAS_SEQ_SHOW)
		return seq_read_iter(iocb, iter);
	return kernfs_file_read_iter(iocb, iter);
}

/*
 * Copy data in from userland and pass it to the matching kernfs write
 * operation.
 *
 * There is no easy way for us to know if userspace is only doing a partial
 * write, so we don't support partial writes. We expect the entire buffer
 * to come on the first write. Hint: if you're writing a value, first read
 * the file, modify only the value you're changing, then write the entire
 * buffer back.
 */
static ssize_t kernfs_fop_write_iter(struct kiocb *iocb, struct iov_iter *iter)
{
	struct kernfs_open_file *of = kernfs_of(iocb->ki_filp);
	ssize_t len = iov_iter_count(iter);
	const struct kernfs_ops *ops;
	char *buf;

	if (of->atomic_write_len) {
		if (len > of->atomic_write_len)
			return -E2BIG;
	} else {
		len = min_t(size_t, len, PAGE_SIZE);
	}

	buf = of->prealloc_buf;
	if (buf)
		mutex_lock(&of->prealloc_mutex);
	else
		buf = kmalloc(len + 1, GFP_KERNEL);
	if (!buf)
		return -ENOMEM;

	if (copy_from_iter(buf, len, iter) != len) {
		len = -EFAULT;
		goto out_free;
	}
	buf[len] = '\0';	/* guarantee string termination */

	/*
	 * @of->mutex nests outside active ref and is used to ensure that
	 * the ops aren't called concurrently for the same open file.
	 */
	mutex_lock(&of->mutex);
	if (!kernfs_get_active(of->kn)) {
		mutex_unlock(&of->mutex);
		len = -ENODEV;
		goto out_free;
	}

	ops = kernfs_ops(of->kn);
	if (ops->write)
		len = ops->write(of, buf, len, iocb->ki_pos);
	else
		len = -EINVAL;

	kernfs_put_active(of->kn);
	mutex_unlock(&of->mutex);

	if (len > 0)
		iocb->ki_pos += len;

out_free:
	if (buf == of->prealloc_buf)
		mutex_unlock(&of->prealloc_mutex);
	else
		kfree(buf);
	return len;
}

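/*
 * Illustrative userspace counterpart of the read-modify-write hint above
 * (not part of this file):
 *
 *	char buf[4096];
 *	ssize_t n = pread(fd, buf, sizeof(buf) - 1, 0);
 *	// ...edit the value in place...
 *	pwrite(fd, buf, n, 0);	// write the whole buffer back at offset 0
 */
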
static void kernfs_vma_open(struct vm_area_struct *vma)
{
	struct file *file = vma->vm_file;
	struct kernfs_open_file *of = kernfs_of(file);

	if (!of->vm_ops)
		return;

	if (!kernfs_get_active(of->kn))
		return;

	if (of->vm_ops->open)
		of->vm_ops->open(vma);

	kernfs_put_active(of->kn);
}

static vm_fault_t kernfs_vma_fault(struct vm_fault *vmf)
{
	struct file *file = vmf->vma->vm_file;
	struct kernfs_open_file *of = kernfs_of(file);
	vm_fault_t ret;

	if (!of->vm_ops)
		return VM_FAULT_SIGBUS;

	if (!kernfs_get_active(of->kn))
		return VM_FAULT_SIGBUS;

	ret = VM_FAULT_SIGBUS;
	if (of->vm_ops->fault)
		ret = of->vm_ops->fault(vmf);

	kernfs_put_active(of->kn);
	return ret;
}

static vm_fault_t kernfs_vma_page_mkwrite(struct vm_fault *vmf)
{
	struct file *file = vmf->vma->vm_file;
	struct kernfs_open_file *of = kernfs_of(file);
	vm_fault_t ret;

	if (!of->vm_ops)
		return VM_FAULT_SIGBUS;

	if (!kernfs_get_active(of->kn))
		return VM_FAULT_SIGBUS;

	ret = 0;
	if (of->vm_ops->page_mkwrite)
		ret = of->vm_ops->page_mkwrite(vmf);
	else
		file_update_time(file);

	kernfs_put_active(of->kn);
	return ret;
}

static int kernfs_vma_access(struct vm_area_struct *vma, unsigned long addr,
			     void *buf, int len, int write)
{
	struct file *file = vma->vm_file;
	struct kernfs_open_file *of = kernfs_of(file);
	int ret;

	if (!of->vm_ops)
		return -EINVAL;

	if (!kernfs_get_active(of->kn))
		return -EINVAL;

	ret = -EINVAL;
	if (of->vm_ops->access)
		ret = of->vm_ops->access(vma, addr, buf, len, write);

	kernfs_put_active(of->kn);
	return ret;
}

#ifdef CONFIG_NUMA
static int kernfs_vma_set_policy(struct vm_area_struct *vma,
				 struct mempolicy *new)
{
	struct file *file = vma->vm_file;
	struct kernfs_open_file *of = kernfs_of(file);
	int ret;

	if (!of->vm_ops)
		return 0;

	if (!kernfs_get_active(of->kn))
		return -EINVAL;

	ret = 0;
	if (of->vm_ops->set_policy)
		ret = of->vm_ops->set_policy(vma, new);

	kernfs_put_active(of->kn);
	return ret;
}

static struct mempolicy *kernfs_vma_get_policy(struct vm_area_struct *vma,
					       unsigned long addr)
{
	struct file *file = vma->vm_file;
	struct kernfs_open_file *of = kernfs_of(file);
	struct mempolicy *pol;

	if (!of->vm_ops)
		return vma->vm_policy;

	if (!kernfs_get_active(of->kn))
		return vma->vm_policy;

	pol = vma->vm_policy;
	if (of->vm_ops->get_policy)
		pol = of->vm_ops->get_policy(vma, addr);

	kernfs_put_active(of->kn);
	return pol;
}

#endif

static const struct vm_operations_struct kernfs_vm_ops = {
	.open		= kernfs_vma_open,
	.fault		= kernfs_vma_fault,
	.page_mkwrite	= kernfs_vma_page_mkwrite,
	.access		= kernfs_vma_access,
#ifdef CONFIG_NUMA
	.set_policy	= kernfs_vma_set_policy,
	.get_policy	= kernfs_vma_get_policy,
#endif
};

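/*
 * Each wrapper above follows the same pattern: bail out if the
 * implementation never provided its own vm_ops, pin the node with
 * kernfs_get_active() for the duration of the call, and forward to the
 * vm_ops that kernfs_fop_mmap() stashed in @of->vm_ops.
 */
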
static int kernfs_fop_mmap(struct file *file, struct vm_area_struct *vma)
{
	struct kernfs_open_file *of = kernfs_of(file);
	const struct kernfs_ops *ops;
	int rc;

	/*
	 * mmap path and of->mutex are prone to triggering spurious lockdep
	 * warnings and we don't want to add spurious locking dependency
	 * between the two. Check whether mmap is actually implemented
	 * without grabbing @of->mutex by testing HAS_MMAP flag. See the
	 * comment in kernfs_fop_open() for more details.
	 */
	if (!(of->kn->flags & KERNFS_HAS_MMAP))
		return -ENODEV;

	mutex_lock(&of->mutex);

	rc = -ENODEV;
	if (!kernfs_get_active(of->kn))
		goto out_unlock;

	ops = kernfs_ops(of->kn);
	rc = ops->mmap(of, vma);
	if (rc)
		goto out_put;

	/*
	 * PowerPC's pci_mmap of legacy_mem uses shmem_zero_setup()
	 * to satisfy versions of X which crash if the mmap fails: that
	 * substitutes a new vm_file, and we don't then want bin_vm_ops.
	 */
	if (vma->vm_file != file)
		goto out_put;

	rc = -EINVAL;
	if (of->mmapped && of->vm_ops != vma->vm_ops)
		goto out_put;

	/*
	 * It is not possible to successfully wrap close.
	 * So error if someone is trying to use close.
	 */
	if (vma->vm_ops && vma->vm_ops->close)
		goto out_put;

	rc = 0;
	of->mmapped = true;
	of->vm_ops = vma->vm_ops;
	vma->vm_ops = &kernfs_vm_ops;
out_put:
	kernfs_put_active(of->kn);
out_unlock:
	mutex_unlock(&of->mutex);

	return rc;
}

/**
 * kernfs_get_open_node - get or create kernfs_open_node
 * @kn: target kernfs_node
 * @of: kernfs_open_file for this instance of open
 *
 * If @kn->attr.open exists, chain @of to its files list; otherwise,
 * create a kernfs_open_node and chain @of to it.
 *
 * LOCKING:
 * Kernel thread context (may sleep).
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
static int kernfs_get_open_node(struct kernfs_node *kn,
				struct kernfs_open_file *of)
{
	struct kernfs_open_node *on, *new_on = NULL;
	struct mutex *mutex = NULL;

	mutex = kernfs_open_file_mutex_lock(kn);
	on = kernfs_deref_open_node_protected(kn);

	if (on) {
		list_add_tail(&of->list, &on->files);
		mutex_unlock(mutex);
		return 0;
	} else {
		/* not there, initialize a new one */
		new_on = kmalloc(sizeof(*new_on), GFP_KERNEL);
		if (!new_on) {
			mutex_unlock(mutex);
			return -ENOMEM;
		}
		atomic_set(&new_on->event, 1);
		init_waitqueue_head(&new_on->poll);
		INIT_LIST_HEAD(&new_on->files);
		list_add_tail(&of->list, &new_on->files);
		rcu_assign_pointer(kn->attr.open, new_on);
	}
	mutex_unlock(mutex);

	return 0;
}

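/*
 * Note the publish order above: the new kernfs_open_node is fully
 * initialized before rcu_assign_pointer() makes it visible, so lockless
 * readers such as kernfs_notify() can never observe a half-constructed
 * node.
 */
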
/**
 * kernfs_unlink_open_file - Unlink @of from @kn.
 *
 * @kn: target kernfs_node
 * @of: associated kernfs_open_file
 *
 * Unlink @of from the list of @kn's associated open files. If the list of
 * associated open files becomes empty, disassociate and free the
 * kernfs_open_node.
 *
 * LOCKING:
 * None.
 */
static void kernfs_unlink_open_file(struct kernfs_node *kn,
				    struct kernfs_open_file *of)
{
	struct kernfs_open_node *on;
	struct mutex *mutex = NULL;

	mutex = kernfs_open_file_mutex_lock(kn);

	on = kernfs_deref_open_node_protected(kn);
	if (!on) {
		mutex_unlock(mutex);
		return;
	}

	if (of)
		list_del(&of->list);

	if (list_empty(&on->files)) {
		rcu_assign_pointer(kn->attr.open, NULL);
		kfree_rcu(on, rcu_head);
	}

	mutex_unlock(mutex);
}

static int kernfs_fop_open(struct inode *inode, struct file *file)
{
	struct kernfs_node *kn = inode->i_private;
	struct kernfs_root *root = kernfs_root(kn);
	const struct kernfs_ops *ops;
	struct kernfs_open_file *of;
	bool has_read, has_write, has_mmap;
	int error = -EACCES;

	if (!kernfs_get_active(kn))
		return -ENODEV;

	ops = kernfs_ops(kn);

	has_read = ops->seq_show || ops->read || ops->mmap;
	has_write = ops->write || ops->mmap;
	has_mmap = ops->mmap;

	/* see the flag definition for details */
	if (root->flags & KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK) {
		if ((file->f_mode & FMODE_WRITE) &&
		    (!(inode->i_mode & S_IWUGO) || !has_write))
			goto err_out;

		if ((file->f_mode & FMODE_READ) &&
		    (!(inode->i_mode & S_IRUGO) || !has_read))
			goto err_out;
	}

	/* allocate a kernfs_open_file for the file */
	error = -ENOMEM;
	of = kzalloc(sizeof(struct kernfs_open_file), GFP_KERNEL);
	if (!of)
		goto err_out;

	/*
	 * The following is done to give a different lockdep key to
	 * @of->mutex for files which implement mmap. This is a rather
	 * crude way to avoid false positive lockdep warning around
	 * mm->mmap_lock - mmap nests @of->mutex under mm->mmap_lock and
	 * reading /sys/block/sda/trace/act_mask grabs sr_mutex, under
	 * which mm->mmap_lock nests, while holding @of->mutex. As each
	 * open file has a separate mutex, it's okay as long as those don't
	 * happen on the same file. At this point, we can't easily give
	 * each file a separate locking class. Let's differentiate on
	 * whether the file has mmap or not for now.
	 *
	 * Both paths of the branch look the same. They're supposed to
	 * look that way and give @of->mutex different static lockdep keys.
	 */
	if (has_mmap)
		mutex_init(&of->mutex);
	else
		mutex_init(&of->mutex);

	of->kn = kn;
	of->file = file;

	/*
	 * Write path needs to access atomic_write_len outside the active
	 * reference. Cache it in open_file. See kernfs_fop_write_iter() for
	 * details.
	 */
	of->atomic_write_len = ops->atomic_write_len;

	error = -EINVAL;
	/*
	 * ->seq_show is incompatible with ->prealloc,
	 * as seq_read does its own allocation.
	 * ->read must be used instead.
	 */
	if (ops->prealloc && ops->seq_show)
		goto err_free;
	if (ops->prealloc) {
		int len = of->atomic_write_len ?: PAGE_SIZE;
		of->prealloc_buf = kmalloc(len + 1, GFP_KERNEL);
		error = -ENOMEM;
		if (!of->prealloc_buf)
			goto err_free;
		mutex_init(&of->prealloc_mutex);
	}

	/*
	 * Always instantiate seq_file even if read access doesn't use
	 * seq_file or is not requested. This unifies private data access
	 * and readable regular files are the vast majority anyway.
	 */
	if (ops->seq_show)
		error = seq_open(file, &kernfs_seq_ops);
	else
		error = seq_open(file, NULL);
	if (error)
		goto err_free;

	of->seq_file = file->private_data;
	of->seq_file->private = of;

	/* seq_file clears PWRITE unconditionally, restore it if WRITE */
	if (file->f_mode & FMODE_WRITE)
		file->f_mode |= FMODE_PWRITE;

	/* make sure we have open node struct */
	error = kernfs_get_open_node(kn, of);
	if (error)
		goto err_seq_release;

	if (ops->open) {
		/* nobody has access to @of yet, skip @of->mutex */
		error = ops->open(of);
		if (error)
			goto err_put_node;
	}

	/* open succeeded, put active references */
	kernfs_put_active(kn);
	return 0;

err_put_node:
	kernfs_unlink_open_file(kn, of);
err_seq_release:
	seq_release(inode, file);
err_free:
	kfree(of->prealloc_buf);
	kfree(of);
err_out:
	kernfs_put_active(kn);
	return error;
}

/* used from release/drain to ensure that ->release() is called exactly once */
static void kernfs_release_file(struct kernfs_node *kn,
				struct kernfs_open_file *of)
{
	/*
	 * @of is guaranteed to have no other file operations in flight and
	 * we just want to synchronize release and drain paths.
	 * @kernfs_open_file_mutex_ptr(kn) is enough. @of->mutex can't be used
	 * here because drain path may be called from places which can
	 * cause circular dependency.
	 */
	lockdep_assert_held(kernfs_open_file_mutex_ptr(kn));

	if (!of->released) {
		/*
		 * A file is never detached without being released and we
		 * need to be able to release files which are deactivated
		 * and being drained. Don't use kernfs_ops().
		 */
		kn->attr.ops->release(of);
		of->released = true;
	}
}

static int kernfs_fop_release(struct inode *inode, struct file *filp)
{
	struct kernfs_node *kn = inode->i_private;
	struct kernfs_open_file *of = kernfs_of(filp);
	struct mutex *mutex = NULL;

	if (kn->flags & KERNFS_HAS_RELEASE) {
		mutex = kernfs_open_file_mutex_lock(kn);
		kernfs_release_file(kn, of);
		mutex_unlock(mutex);
	}

	kernfs_unlink_open_file(kn, of);
	seq_release(inode, filp);
	kfree(of->prealloc_buf);
	kfree(of);

	return 0;
}

void kernfs_drain_open_files(struct kernfs_node *kn)
{
	struct kernfs_open_node *on;
	struct kernfs_open_file *of;
	struct mutex *mutex = NULL;

	if (!(kn->flags & (KERNFS_HAS_MMAP | KERNFS_HAS_RELEASE)))
		return;

	/*
	 * The lockless opportunistic check below is safe because no one is
	 * adding to ->attr.open at this point in time. The check allows an
	 * early bail out if ->attr.open is already NULL.
	 * kernfs_unlink_open_file() makes ->attr.open NULL only while holding
	 * kernfs_open_file_mutex, so the check under
	 * kernfs_open_file_mutex_ptr(kn) below ensures we bail out if
	 * ->attr.open became NULL while waiting for the mutex.
	 */
	if (!rcu_access_pointer(kn->attr.open))
		return;

	mutex = kernfs_open_file_mutex_lock(kn);
	on = kernfs_deref_open_node_protected(kn);
	if (!on) {
		mutex_unlock(mutex);
		return;
	}

	list_for_each_entry(of, &on->files, list) {
		struct inode *inode = file_inode(of->file);

		if (kn->flags & KERNFS_HAS_MMAP)
			unmap_mapping_range(inode->i_mapping, 0, 0, 1);

		if (kn->flags & KERNFS_HAS_RELEASE)
			kernfs_release_file(kn, of);
	}

	mutex_unlock(mutex);
}

/*
 * Kernfs attribute files are pollable. The idea is that you read
 * the content and then you use 'poll' or 'select' to wait for
 * the content to change. When the content changes (assuming the
 * manager for the kobject supports notification), poll will
 * return EPOLLERR|EPOLLPRI, and select will return the fd whether
 * it is waiting for read, write, or exceptions.
 * Once poll/select indicates that the value has changed, you
 * need to close and re-open the file, or seek to 0 and read again.
 * Reminder: this only works for attributes which actively support
 * it, and it is not possible to test an attribute from userspace
 * to see if it supports poll (neither 'poll' nor 'select' return
 * an appropriate error code). When in doubt, set a suitable timeout value.
 */
__poll_t kernfs_generic_poll(struct kernfs_open_file *of, poll_table *wait)
{
	struct kernfs_node *kn = kernfs_dentry_node(of->file->f_path.dentry);
	struct kernfs_open_node *on = kernfs_deref_open_node(of, kn);

	if (!on)
		return EPOLLERR;

	poll_wait(of->file, &on->poll, wait);

	if (of->event != atomic_read(&on->event))
		return DEFAULT_POLLMASK|EPOLLERR|EPOLLPRI;

	return DEFAULT_POLLMASK;
}

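/*
 * Illustrative userspace pattern for the above (not part of this file):
 *
 *	struct pollfd pfd = { .fd = fd, .events = POLLPRI };
 *	read(fd, buf, sizeof(buf));	// consume the current value
 *	poll(&pfd, 1, timeout_ms);	// wait for a change
 *	lseek(fd, 0, SEEK_SET);
 *	read(fd, buf, sizeof(buf));	// re-read the updated value
 */
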
static __poll_t kernfs_fop_poll(struct file *filp, poll_table *wait)
{
	struct kernfs_open_file *of = kernfs_of(filp);
	struct kernfs_node *kn = kernfs_dentry_node(filp->f_path.dentry);
	__poll_t ret;

	if (!kernfs_get_active(kn))
		return DEFAULT_POLLMASK|EPOLLERR|EPOLLPRI;

	if (kn->attr.ops->poll)
		ret = kn->attr.ops->poll(of, wait);
	else
		ret = kernfs_generic_poll(of, wait);

	kernfs_put_active(kn);
	return ret;
}

static void kernfs_notify_workfn(struct work_struct *work)
{
	struct kernfs_node *kn;
	struct kernfs_super_info *info;
	struct kernfs_root *root;
repeat:
	/* pop one off the notify_list */
	spin_lock_irq(&kernfs_notify_lock);
	kn = kernfs_notify_list;
	if (kn == KERNFS_NOTIFY_EOL) {
		spin_unlock_irq(&kernfs_notify_lock);
		return;
	}
	kernfs_notify_list = kn->attr.notify_next;
	kn->attr.notify_next = NULL;
	spin_unlock_irq(&kernfs_notify_lock);

	root = kernfs_root(kn);
	/* kick fsnotify */
	down_write(&root->kernfs_rwsem);

	list_for_each_entry(info, &kernfs_root(kn)->supers, node) {
		struct kernfs_node *parent;
		struct inode *p_inode = NULL;
		struct inode *inode;
		struct qstr name;

		/*
		 * We want fsnotify_modify() on @kn but as the
		 * modifications aren't originating from userland, we
		 * don't have the matching @file available. Look up the
		 * inodes and generate the events manually.
		 */
		inode = ilookup(info->sb, kernfs_ino(kn));
		if (!inode)
			continue;

		name = (struct qstr)QSTR_INIT(kn->name, strlen(kn->name));
		parent = kernfs_get_parent(kn);
		if (parent) {
			p_inode = ilookup(info->sb, kernfs_ino(parent));
			if (p_inode) {
				fsnotify(FS_MODIFY | FS_EVENT_ON_CHILD,
					 inode, FSNOTIFY_EVENT_INODE,
					 p_inode, &name, inode, 0);
				iput(p_inode);
			}

			kernfs_put(parent);
		}

		if (!p_inode)
			fsnotify_inode(inode, FS_MODIFY);

		iput(inode);
	}

	up_write(&root->kernfs_rwsem);
	kernfs_put(kn);
	goto repeat;
}

/**
 * kernfs_notify - notify a kernfs file
 * @kn: file to notify
 *
 * Notify @kn such that poll(2) on @kn wakes up. May be called from any
 * context.
 */
void kernfs_notify(struct kernfs_node *kn)
{
	static DECLARE_WORK(kernfs_notify_work, kernfs_notify_workfn);
	unsigned long flags;
	struct kernfs_open_node *on;

	if (WARN_ON(kernfs_type(kn) != KERNFS_FILE))
		return;

	/* kick poll immediately */
	rcu_read_lock();
	on = rcu_dereference(kn->attr.open);
	if (on) {
		atomic_inc(&on->event);
		wake_up_interruptible(&on->poll);
	}
	rcu_read_unlock();

	/* schedule work to kick fsnotify */
	spin_lock_irqsave(&kernfs_notify_lock, flags);
	if (!kn->attr.notify_next) {
		kernfs_get(kn);
		kn->attr.notify_next = kernfs_notify_list;
		kernfs_notify_list = kn;
		schedule_work(&kernfs_notify_work);
	}
	spin_unlock_irqrestore(&kernfs_notify_lock, flags);
}
EXPORT_SYMBOL_GPL(kernfs_notify);

const struct file_operations kernfs_file_fops = {
	.read_iter	= kernfs_fop_read_iter,
	.write_iter	= kernfs_fop_write_iter,
	.llseek		= generic_file_llseek,
	.mmap		= kernfs_fop_mmap,
	.open		= kernfs_fop_open,
	.release	= kernfs_fop_release,
	.poll		= kernfs_fop_poll,
	.fsync		= noop_fsync,
	.splice_read	= generic_file_splice_read,
	.splice_write	= iter_file_splice_write,
};

/**
 * __kernfs_create_file - kernfs internal function to create a file
 * @parent: directory to create the file in
 * @name: name of the file
 * @mode: mode of the file
 * @uid: uid of the file
 * @gid: gid of the file
 * @size: size of the file
 * @ops: kernfs operations for the file
 * @priv: private data for the file
 * @ns: optional namespace tag of the file
 * @key: lockdep key for the file's active_ref, %NULL to disable lockdep
 *
 * Returns the created node on success, ERR_PTR() value on error.
 */
struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent,
					 const char *name,
					 umode_t mode, kuid_t uid, kgid_t gid,
					 loff_t size,
					 const struct kernfs_ops *ops,
					 void *priv, const void *ns,
					 struct lock_class_key *key)
{
	struct kernfs_node *kn;
	unsigned flags;
	int rc;

	flags = KERNFS_FILE;

	kn = kernfs_new_node(parent, name, (mode & S_IALLUGO) | S_IFREG,
			     uid, gid, flags);
	if (!kn)
		return ERR_PTR(-ENOMEM);

	kn->attr.ops = ops;
	kn->attr.size = size;
	kn->ns = ns;
	kn->priv = priv;

#ifdef CONFIG_DEBUG_LOCK_ALLOC
	if (key) {
		lockdep_init_map(&kn->dep_map, "kn->active", key, 0);
		kn->flags |= KERNFS_LOCKDEP;
	}
#endif

	/*
	 * kn->attr.ops is accessible only while holding active ref. We
	 * need to know whether some ops are implemented outside active
	 * ref. Cache their existence in flags.
	 */
	if (ops->seq_show)
		kn->flags |= KERNFS_HAS_SEQ_SHOW;
	if (ops->mmap)
		kn->flags |= KERNFS_HAS_MMAP;
	if (ops->release)
		kn->flags |= KERNFS_HAS_RELEASE;

	rc = kernfs_add_one(kn);
	if (rc) {
		kernfs_put(kn);
		return ERR_PTR(rc);
	}
	return kn;
}