fs/super.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  *  linux/fs/super.c
   4  *
   5  *  Copyright (C) 1991, 1992  Linus Torvalds
   6  *
   7  *  super.c contains code to handle: - mount structures
   8  *                                   - super-block tables
   9  *                                   - filesystem drivers list
  10  *                                   - mount system call
  11  *                                   - umount system call
  12  *                                   - ustat system call
  13  *
  14  * GK 2/5/95  -  Changed to support mounting the root fs via NFS
  15  *
  16  *  Added kerneld support: Jacques Gelinas and Bjorn Ekwall
  17  *  Added change_root: Werner Almesberger & Hans Lermen, Feb '96
  18  *  Added options to /proc/mounts:
  19  *    Torbjörn Lindh (torbjorn.lindh@gopta.se), April 14, 1996.
  20  *  Added devfs support: Richard Gooch <rgooch@atnf.csiro.au>, 13-JAN-1998
  21  *  Heavily rewritten for 'one fs - one tree' dcache architecture. AV, Mar 2000
  22  */
  23
  24 #include <linux/export.h>
  25 #include <linux/slab.h>
  26 #include <linux/blkdev.h>
  27 #include <linux/mount.h>
  28 #include <linux/security.h>
  29 #include <linux/writeback.h>            /* for the emergency remount stuff */
  30 #include <linux/idr.h>
  31 #include <linux/mutex.h>
  32 #include <linux/backing-dev.h>
  33 #include <linux/rculist_bl.h>
  34 #include <linux/fscrypt.h>
  35 #include <linux/fsnotify.h>
  36 #include <linux/lockdep.h>
  37 #include <linux/user_namespace.h>
  38 #include <linux/fs_context.h>
  39 #include <uapi/linux/mount.h>
  40 #include "internal.h"
  41
  42 static int thaw_super_locked(struct super_block *sb, enum freeze_holder who);
  43
  44 static LIST_HEAD(super_blocks);
  45 static DEFINE_SPINLOCK(sb_lock);
  46
  47 static char *sb_writers_name[SB_FREEZE_LEVELS] = {
  48         "sb_writers",
  49         "sb_pagefaults",
  50         "sb_internal",
  51 };
  52
  53 static inline void __super_lock(struct super_block *sb, bool excl)
  54 {
  55         if (excl)
  56                 down_write(&sb->s_umount);
  57         else
  58                 down_read(&sb->s_umount);
  59 }
  60
  61 static inline void super_unlock(struct super_block *sb, bool excl)
  62 {
  63         if (excl)
  64                 up_write(&sb->s_umount);
  65         else
  66                 up_read(&sb->s_umount);
  67 }
  68
  69 static inline void __super_lock_excl(struct super_block *sb)
  70 {
  71         __super_lock(sb, true);
  72 }
  73
  74 static inline void super_unlock_excl(struct super_block *sb)
  75 {
  76         super_unlock(sb, true);
  77 }
  78
  79 static inline void super_unlock_shared(struct super_block *sb)
  80 {
  81         super_unlock(sb, false);
  82 }
  83
  84 static bool super_flags(const struct super_block *sb, unsigned int flags)
  85 {
  86         /*
  87          * Pairs with smp_store_release() in super_wake() and ensures
  88          * that we see @flags after we're woken.
  89          */
  90         return smp_load_acquire(&sb->s_flags) & flags;
  91 }
  92
  93 /**
  94  * super_lock - wait for superblock to become ready and lock it
  95  * @sb: superblock to wait for
  96  * @excl: whether exclusive access is required
  97  *
  98  * If the superblock has neither passed through vfs_get_tree() or
  99  * generic_shutdown_super() yet wait for it to happen. Either superblock
 100  * creation will succeed and SB_BORN is set by vfs_get_tree() or we're
 101  * woken and we'll see SB_DYING.
 102  *
 103  * The caller must have acquired a temporary reference on @sb->s_count.
 104  *
 105  * Return: The function returns true if SB_BORN was set and with
 106  *         s_umount held. The function returns false if SB_DYING was
 107  *         set and without s_umount held.
 108  */
 109 static __must_check bool super_lock(struct super_block *sb, bool excl)
 110 {
 111         lockdep_assert_not_held(&sb->s_umount);
 112
 113         /* wait until the superblock is ready or dying */
 114         wait_var_event(&sb->s_flags, super_flags(sb, SB_BORN | SB_DYING));
 115
 116         /* Don't pointlessly acquire s_umount. */
 117         if (super_flags(sb, SB_DYING))
 118                 return false;
 119
 120         __super_lock(sb, excl);
 121
 122         /*
 123          * Has gone through generic_shutdown_super() in the meantime.
 124          * @sb->s_root is NULL and @sb->s_active is 0. No one needs to
 125          * grab a reference to this. Tell them so.
 126          */
 127         if (sb->s_flags & SB_DYING) {
 128                 super_unlock(sb, excl);
 129                 return false;
 130         }
 131
 132         WARN_ON_ONCE(!(sb->s_flags & SB_BORN));
 133         return true;
 134 }
 135
 136 /* wait and try to acquire read-side of @sb->s_umount */
 137 static inline bool super_lock_shared(struct super_block *sb)
 138 {
 139         return super_lock(sb, false);
 140 }
 141
 142 /* wait and try to acquire write-side of @sb->s_umount */
 143 static inline bool super_lock_excl(struct super_block *sb)
 144 {
 145         return super_lock(sb, true);
 146 }
 147
 148 /* wake waiters */
 149 #define SUPER_WAKE_FLAGS (SB_BORN | SB_DYING | SB_DEAD)
 150 static void super_wake(struct super_block *sb, unsigned int flag)
 151 {
 152         WARN_ON_ONCE((flag & ~SUPER_WAKE_FLAGS));
 153         WARN_ON_ONCE(hweight32(flag & SUPER_WAKE_FLAGS) > 1);
 154
 155         /*
 156          * Pairs with smp_load_acquire() in super_lock() to make sure
 157          * all initializations in the superblock are seen by the user
 158          * seeing SB_BORN sent.
 159          */
 160         smp_store_release(&sb->s_flags, sb->s_flags | flag);
 161         /*
 162          * Pairs with the barrier in prepare_to_wait_event() to make sure
 163          * ___wait_var_event() either sees SB_BORN set or
 164          * waitqueue_active() check in wake_up_var() sees the waiter.
 165          */
 166         smp_mb();
 167         wake_up_var(&sb->s_flags);
 168 }
 169
 170 /*
 171  * One thing we have to be careful of with a per-sb shrinker is that we don't
 172  * drop the last active reference to the superblock from within the shrinker.
 173  * If that happens we could trigger unregistering the shrinker from within the
 174  * shrinker path and that leads to deadlock on the shrinker_mutex. Hence we
 175  * take a passive reference to the superblock to avoid this from occurring.
 176  */
 177 static unsigned long super_cache_scan(struct shrinker *shrink,
 178                                       struct shrink_control *sc)
 179 {
 180         struct super_block *sb;
 181         long    fs_objects = 0;
 182         long    total_objects;
 183         long    freed = 0;
 184         long    dentries;
 185         long    inodes;
 186
 187         sb = shrink->private_data;
 188
 189         /*
 190          * Deadlock avoidance.  We may hold various FS locks, and we don't want
 191          * to recurse into the FS that called us in clear_inode() and friends..
 192          */
 193         if (!(sc->gfp_mask & __GFP_FS))
 194                 return SHRINK_STOP;
 195
 196         if (!super_trylock_shared(sb))
 197                 return SHRINK_STOP;
 198
 199         if (sb->s_op->nr_cached_objects)
 200                 fs_objects = sb->s_op->nr_cached_objects(sb, sc);
 201
 202         inodes = list_lru_shrink_count(&sb->s_inode_lru, sc);
 203         dentries = list_lru_shrink_count(&sb->s_dentry_lru, sc);
 204         total_objects = dentries + inodes + fs_objects + 1;
 205         if (!total_objects)
 206                 total_objects = 1;
 207
 208         /* proportion the scan between the caches */
 209         dentries = mult_frac(sc->nr_to_scan, dentries, total_objects);
 210         inodes = mult_frac(sc->nr_to_scan, inodes, total_objects);
 211         fs_objects = mult_frac(sc->nr_to_scan, fs_objects, total_objects);
 212
 213         /*
 214          * prune the dcache first as the icache is pinned by it, then
 215          * prune the icache, followed by the filesystem specific caches
 216          *
 217          * Ensure that we always scan at least one object - memcg kmem
 218          * accounting uses this to fully empty the caches.
 219          */
 220         sc->nr_to_scan = dentries + 1;
 221         freed = prune_dcache_sb(sb, sc);
 222         sc->nr_to_scan = inodes + 1;
 223         freed += prune_icache_sb(sb, sc);
 224
 225         if (fs_objects) {
 226                 sc->nr_to_scan = fs_objects + 1;
 227                 freed += sb->s_op->free_cached_objects(sb, sc);
 228         }
 229
 230         super_unlock_shared(sb);
 231         return freed;
 232 }
 233
 234 static unsigned long super_cache_count(struct shrinker *shrink,
 235                                        struct shrink_control *sc)
 236 {
 237         struct super_block *sb;
 238         long    total_objects = 0;
 239
 240         sb = shrink->private_data;
 241
 242         /*
 243          * We don't call super_trylock_shared() here as it is a scalability
 244          * bottleneck, so we're exposed to partial setup state. The shrinker
 245          * rwsem does not protect filesystem operations backing
 246          * list_lru_shrink_count() or s_op->nr_cached_objects(). Counts can
 247          * change between super_cache_count and super_cache_scan, so we really
 248          * don't need locks here.
 249          *
 250          * However, if we are currently mounting the superblock, the underlying
 251          * filesystem might be in a state of partial construction and hence it
 252          * is dangerous to access it.  super_trylock_shared() uses a SB_BORN check
 253          * to avoid this situation, so do the same here. The memory barrier is
 254          * matched with the one in mount_fs() as we don't hold locks here.
 255          */
 256         if (!(sb->s_flags & SB_BORN))
 257                 return 0;
 258         smp_rmb();
 259
 260         if (sb->s_op && sb->s_op->nr_cached_objects)
 261                 total_objects = sb->s_op->nr_cached_objects(sb, sc);
 262
 263         total_objects += list_lru_shrink_count(&sb->s_dentry_lru, sc);
 264         total_objects += list_lru_shrink_count(&sb->s_inode_lru, sc);
 265
 266         if (!total_objects)
 267                 return SHRINK_EMPTY;
 268
 269         total_objects = vfs_pressure_ratio(total_objects);
 270         return total_objects;
 271 }
 272
 273 static void destroy_super_work(struct work_struct *work)
 274 {
 275         struct super_block *s = container_of(work, struct super_block,
 276                                                         destroy_work);
 277         security_sb_free(s);
 278         put_user_ns(s->s_user_ns);
 279         kfree(s->s_subtype);
 280         for (int i = 0; i < SB_FREEZE_LEVELS; i++)
 281                 percpu_free_rwsem(&s->s_writers.rw_sem[i]);
 282         kfree(s);
 283 }
 284
 285 static void destroy_super_rcu(struct rcu_head *head)
 286 {
 287         struct super_block *s = container_of(head, struct super_block, rcu);
 288         INIT_WORK(&s->destroy_work, destroy_super_work);
 289         schedule_work(&s->destroy_work);
 290 }
 291
 292 /* Free a superblock that has never been seen by anyone */
 293 static void destroy_unused_super(struct super_block *s)
 294 {
 295         if (!s)
 296                 return;
 297         super_unlock_excl(s);
 298         list_lru_destroy(&s->s_dentry_lru);
 299         list_lru_destroy(&s->s_inode_lru);
 300         shrinker_free(s->s_shrink);
 301         /* no delays needed */
 302         destroy_super_work(&s->destroy_work);
 303 }
 304
 305 /**
 306  *      alloc_super     -       create new superblock
 307  *      @type:  filesystem type superblock should belong to
 308  *      @flags: the mount flags
 309  *      @user_ns: User namespace for the super_block
 310  *
 311  *      Allocates and initializes a new &struct super_block.  alloc_super()
 312  *      returns a pointer new superblock or %NULL if allocation had failed.
 313  */
 314 static struct super_block *alloc_super(struct file_system_type *type, int flags,
 315                                        struct user_namespace *user_ns)
 316 {
 317         struct super_block *s = kzalloc(sizeof(struct super_block), GFP_KERNEL);
 318         static const struct super_operations default_op;
 319         int i;
 320
 321         if (!s)
 322                 return NULL;
 323
 324         INIT_LIST_HEAD(&s->s_mounts);
 325         s->s_user_ns = get_user_ns(user_ns);
 326         init_rwsem(&s->s_umount);
 327         lockdep_set_class(&s->s_umount, &type->s_umount_key);
 328         /*
 329          * sget() can have s_umount recursion.
 330          *
 331          * When it cannot find a suitable sb, it allocates a new
 332          * one (this one), and tries again to find a suitable old
 333          * one.
 334          *
 335          * In case that succeeds, it will acquire the s_umount
 336          * lock of the old one. Since these are clearly distrinct
 337          * locks, and this object isn't exposed yet, there's no
 338          * risk of deadlocks.
 339          *
 340          * Annotate this by putting this lock in a different
 341          * subclass.
 342          */
 343         down_write_nested(&s->s_umount, SINGLE_DEPTH_NESTING);
 344
 345         if (security_sb_alloc(s))
 346                 goto fail;
 347
 348         for (i = 0; i < SB_FREEZE_LEVELS; i++) {
 349                 if (__percpu_init_rwsem(&s->s_writers.rw_sem[i],
 350                                         sb_writers_name[i],
 351                                         &type->s_writers_key[i]))
 352                         goto fail;
 353         }
 354         s->s_bdi = &noop_backing_dev_info;
 355         s->s_flags = flags;
 356         if (s->s_user_ns != &init_user_ns)
 357                 s->s_iflags |= SB_I_NODEV;
 358         INIT_HLIST_NODE(&s->s_instances);
 359         INIT_HLIST_BL_HEAD(&s->s_roots);
 360         mutex_init(&s->s_sync_lock);
 361         INIT_LIST_HEAD(&s->s_inodes);
 362         spin_lock_init(&s->s_inode_list_lock);
 363         INIT_LIST_HEAD(&s->s_inodes_wb);
 364         spin_lock_init(&s->s_inode_wblist_lock);
 365
 366         s->s_count = 1;
 367         atomic_set(&s->s_active, 1);
 368         mutex_init(&s->s_vfs_rename_mutex);
 369         lockdep_set_class(&s->s_vfs_rename_mutex, &type->s_vfs_rename_key);
 370         init_rwsem(&s->s_dquot.dqio_sem);
 371         s->s_maxbytes = MAX_NON_LFS;
 372         s->s_op = &default_op;
 373         s->s_time_gran = 1000000000;
 374         s->s_time_min = TIME64_MIN;
 375         s->s_time_max = TIME64_MAX;
 376
 377         s->s_shrink = shrinker_alloc(SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE,
 378                                      "sb-%s", type->name);
 379         if (!s->s_shrink)
 380                 goto fail;
 381
 382         s->s_shrink->scan_objects = super_cache_scan;
 383         s->s_shrink->count_objects = super_cache_count;
 384         s->s_shrink->batch = 1024;
 385         s->s_shrink->private_data = s;
 386
 387         if (list_lru_init_memcg(&s->s_dentry_lru, s->s_shrink))
 388                 goto fail;
 389         if (list_lru_init_memcg(&s->s_inode_lru, s->s_shrink))
 390                 goto fail;
 391         return s;
 392
 393 fail:
 394         destroy_unused_super(s);
 395         return NULL;
 396 }
 397
 398 /* Superblock refcounting  */
 399
 400 /*
 401  * Drop a superblock's refcount.  The caller must hold sb_lock.
 402  */
 403 static void __put_super(struct super_block *s)
 404 {
 405         if (!--s->s_count) {
 406                 list_del_init(&s->s_list);
 407                 WARN_ON(s->s_dentry_lru.node);
 408                 WARN_ON(s->s_inode_lru.node);
 409                 WARN_ON(!list_empty(&s->s_mounts));
 410                 call_rcu(&s->rcu, destroy_super_rcu);
 411         }
 412 }
 413
 414 /**
 415  *      put_super       -       drop a temporary reference to superblock
 416  *      @sb: superblock in question
 417  *
 418  *      Drops a temporary reference, frees superblock if there's no
 419  *      references left.
 420  */
 421 void put_super(struct super_block *sb)
 422 {
 423         spin_lock(&sb_lock);
 424         __put_super(sb);
 425         spin_unlock(&sb_lock);
 426 }
 427
 428 static void kill_super_notify(struct super_block *sb)
 429 {
 430         lockdep_assert_not_held(&sb->s_umount);
 431
 432         /* already notified earlier */
 433         if (sb->s_flags & SB_DEAD)
 434                 return;
 435
 436         /*
 437          * Remove it from @fs_supers so it isn't found by new
 438          * sget{_fc}() walkers anymore. Any concurrent mounter still
 439          * managing to grab a temporary reference is guaranteed to
 440          * already see SB_DYING and will wait until we notify them about
 441          * SB_DEAD.
 442          */
 443         spin_lock(&sb_lock);
 444         hlist_del_init(&sb->s_instances);
 445         spin_unlock(&sb_lock);
 446
 447         /*
 448          * Let concurrent mounts know that this thing is really dead.
 449          * We don't need @sb->s_umount here as every concurrent caller
 450          * will see SB_DYING and either discard the superblock or wait
 451          * for SB_DEAD.
 452          */
 453         super_wake(sb, SB_DEAD);
 454 }
 455
 456 /**
 457  *      deactivate_locked_super -       drop an active reference to superblock
 458  *      @s: superblock to deactivate
 459  *
 460  *      Drops an active reference to superblock, converting it into a temporary
 461  *      one if there is no other active references left.  In that case we
 462  *      tell fs driver to shut it down and drop the temporary reference we
 463  *      had just acquired.
 464  *
 465  *      Caller holds exclusive lock on superblock; that lock is released.
 466  */
 467 void deactivate_locked_super(struct super_block *s)
 468 {
 469         struct file_system_type *fs = s->s_type;
 470         if (atomic_dec_and_test(&s->s_active)) {
 471                 shrinker_free(s->s_shrink);
 472                 fs->kill_sb(s);
 473
 474                 kill_super_notify(s);
 475
 476                 /*
 477                  * Since list_lru_destroy() may sleep, we cannot call it from
 478                  * put_super(), where we hold the sb_lock. Therefore we destroy
 479                  * the lru lists right now.
 480                  */
 481                 list_lru_destroy(&s->s_dentry_lru);
 482                 list_lru_destroy(&s->s_inode_lru);
 483
 484                 put_filesystem(fs);
 485                 put_super(s);
 486         } else {
 487                 super_unlock_excl(s);
 488         }
 489 }
 490
 491 EXPORT_SYMBOL(deactivate_locked_super);
 492
 493 /**
 494  *      deactivate_super        -       drop an active reference to superblock
 495  *      @s: superblock to deactivate
 496  *
 497  *      Variant of deactivate_locked_super(), except that superblock is *not*
 498  *      locked by caller.  If we are going to drop the final active reference,
 499  *      lock will be acquired prior to that.
 500  */
 501 void deactivate_super(struct super_block *s)
 502 {
 503         if (!atomic_add_unless(&s->s_active, -1, 1)) {
 504                 __super_lock_excl(s);
 505                 deactivate_locked_super(s);
 506         }
 507 }
 508
 509 EXPORT_SYMBOL(deactivate_super);
 510
 511 /**
 512  * grab_super - acquire an active reference to a superblock
 513  * @sb: superblock to acquire
 514  *
 515  * Acquire a temporary reference on a superblock and try to trade it for
 516  * an active reference. This is used in sget{_fc}() to wait for a
 517  * superblock to either become SB_BORN or for it to pass through
 518  * sb->kill() and be marked as SB_DEAD.
 519  *
 520  * Return: This returns true if an active reference could be acquired,
 521  *         false if not.
 522  */
 523 static bool grab_super(struct super_block *sb)
 524 {
 525         bool locked;
 526
 527         sb->s_count++;
 528         spin_unlock(&sb_lock);
 529         locked = super_lock_excl(sb);
 530         if (locked) {
 531                 if (atomic_inc_not_zero(&sb->s_active)) {
 532                         put_super(sb);
 533                         return true;
 534                 }
 535                 super_unlock_excl(sb);
 536         }
 537         wait_var_event(&sb->s_flags, super_flags(sb, SB_DEAD));
 538         put_super(sb);
 539         return false;
 540 }
 541
 542 /*
 543  *      super_trylock_shared - try to grab ->s_umount shared
 544  *      @sb: reference we are trying to grab
 545  *
 546  *      Try to prevent fs shutdown.  This is used in places where we
 547  *      cannot take an active reference but we need to ensure that the
 548  *      filesystem is not shut down while we are working on it. It returns
 549  *      false if we cannot acquire s_umount or if we lose the race and
 550  *      filesystem already got into shutdown, and returns true with the s_umount
 551  *      lock held in read mode in case of success. On successful return,
 552  *      the caller must drop the s_umount lock when done.
 553  *
 554  *      Note that unlike get_super() et.al. this one does *not* bump ->s_count.
 555  *      The reason why it's safe is that we are OK with doing trylock instead
 556  *      of down_read().  There's a couple of places that are OK with that, but
 557  *      it's very much not a general-purpose interface.
 558  */
 559 bool super_trylock_shared(struct super_block *sb)
 560 {
 561         if (down_read_trylock(&sb->s_umount)) {
 562                 if (!(sb->s_flags & SB_DYING) && sb->s_root &&
 563                     (sb->s_flags & SB_BORN))
 564                         return true;
 565                 super_unlock_shared(sb);
 566         }
 567
 568         return false;
 569 }
 570
 571 /**
 572  *      retire_super    -       prevents superblock from being reused
 573  *      @sb: superblock to retire
 574  *
 575  *      The function marks superblock to be ignored in superblock test, which
 576  *      prevents it from being reused for any new mounts.  If the superblock has
 577  *      a private bdi, it also unregisters it, but doesn't reduce the refcount
 578  *      of the superblock to prevent potential races.  The refcount is reduced
 579  *      by generic_shutdown_super().  The function can not be called
 580  *      concurrently with generic_shutdown_super().  It is safe to call the
 581  *      function multiple times, subsequent calls have no effect.
 582  *
 583  *      The marker will affect the re-use only for block-device-based
 584  *      superblocks.  Other superblocks will still get marked if this function
 585  *      is used, but that will not affect their reusability.
 586  */
 587 void retire_super(struct super_block *sb)
 588 {
 589         WARN_ON(!sb->s_bdev);
 590         __super_lock_excl(sb);
 591         if (sb->s_iflags & SB_I_PERSB_BDI) {
 592                 bdi_unregister(sb->s_bdi);
 593                 sb->s_iflags &= ~SB_I_PERSB_BDI;
 594         }
 595         sb->s_iflags |= SB_I_RETIRED;
 596         super_unlock_excl(sb);
 597 }
 598 EXPORT_SYMBOL(retire_super);
 599
 600 /**
 601  *      generic_shutdown_super  -       common helper for ->kill_sb()
 602  *      @sb: superblock to kill
 603  *
 604  *      generic_shutdown_super() does all fs-independent work on superblock
 605  *      shutdown.  Typical ->kill_sb() should pick all fs-specific objects
 606  *      that need destruction out of superblock, call generic_shutdown_super()
 607  *      and release aforementioned objects.  Note: dentries and inodes _are_
 608  *      taken care of and do not need specific handling.
 609  *
 610  *      Upon calling this function, the filesystem may no longer alter or
 611  *      rearrange the set of dentries belonging to this super_block, nor may it
 612  *      change the attachments of dentries to inodes.
 613  */
 614 void generic_shutdown_super(struct super_block *sb)
 615 {
 616         const struct super_operations *sop = sb->s_op;
 617
 618         if (sb->s_root) {
 619                 shrink_dcache_for_umount(sb);
 620                 sync_filesystem(sb);
 621                 sb->s_flags &= ~SB_ACTIVE;
 622
 623                 cgroup_writeback_umount();
 624
 625                 /* Evict all inodes with zero refcount. */
 626                 evict_inodes(sb);
 627
 628                 /*
 629                  * Clean up and evict any inodes that still have references due
 630                  * to fsnotify or the security policy.
 631                  */
 632                 fsnotify_sb_delete(sb);
 633                 security_sb_delete(sb);
 634
 635                 if (sb->s_dio_done_wq) {
 636                         destroy_workqueue(sb->s_dio_done_wq);
 637                         sb->s_dio_done_wq = NULL;
 638                 }
 639
 640                 if (sop->put_super)
 641                         sop->put_super(sb);
 642
 643                 /*
 644                  * Now that all potentially-encrypted inodes have been evicted,
 645                  * the fscrypt keyring can be destroyed.
 646                  */
 647                 fscrypt_destroy_keyring(sb);
 648
 649                 if (CHECK_DATA_CORRUPTION(!list_empty(&sb->s_inodes),
 650                                 "VFS: Busy inodes after unmount of %s (%s)",
 651                                 sb->s_id, sb->s_type->name)) {
 652                         /*
 653                          * Adding a proper bailout path here would be hard, but
 654                          * we can at least make it more likely that a later
 655                          * iput_final() or such crashes cleanly.
 656                          */
 657                         struct inode *inode;
 658
 659                         spin_lock(&sb->s_inode_list_lock);
 660                         list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
 661                                 inode->i_op = VFS_PTR_POISON;
 662                                 inode->i_sb = VFS_PTR_POISON;
 663                                 inode->i_mapping = VFS_PTR_POISON;
 664                         }
 665                         spin_unlock(&sb->s_inode_list_lock);
 666                 }
 667         }
 668         /*
 669          * Broadcast to everyone that grabbed a temporary reference to this
 670          * superblock before we removed it from @fs_supers that the superblock
 671          * is dying. Every walker of @fs_supers outside of sget{_fc}() will now
 672          * discard this superblock and treat it as dead.
 673          *
 674          * We leave the superblock on @fs_supers so it can be found by
 675          * sget{_fc}() until we passed sb->kill_sb().
 676          */
 677         super_wake(sb, SB_DYING);
 678         super_unlock_excl(sb);
 679         if (sb->s_bdi != &noop_backing_dev_info) {
 680                 if (sb->s_iflags & SB_I_PERSB_BDI)
 681                         bdi_unregister(sb->s_bdi);
 682                 bdi_put(sb->s_bdi);
 683                 sb->s_bdi = &noop_backing_dev_info;
 684         }
 685 }
 686
 687 EXPORT_SYMBOL(generic_shutdown_super);
 688
 689 bool mount_capable(struct fs_context *fc)
 690 {
 691         if (!(fc->fs_type->fs_flags & FS_USERNS_MOUNT))
 692                 return capable(CAP_SYS_ADMIN);
 693         else
 694                 return ns_capable(fc->user_ns, CAP_SYS_ADMIN);
 695 }
 696
 697 /**
 698  * sget_fc - Find or create a superblock
 699  * @fc: Filesystem context.
 700  * @test: Comparison callback
 701  * @set: Setup callback
 702  *
 703  * Create a new superblock or find an existing one.
 704  *
 705  * The @test callback is used to find a matching existing superblock.
 706  * Whether or not the requested parameters in @fc are taken into account
 707  * is specific to the @test callback that is used. They may even be
 708  * completely ignored.
 709  *
 710  * If an extant superblock is matched, it will be returned unless:
 711  *
 712  * (1) the namespace the filesystem context @fc and the extant
 713  *     superblock's namespace differ
 714  *
 715  * (2) the filesystem context @fc has requested that reusing an extant
 716  *     superblock is not allowed
 717  *
 718  * In both cases EBUSY will be returned.
 719  *
 720  * If no match is made, a new superblock will be allocated and basic
 721  * initialisation will be performed (s_type, s_fs_info and s_id will be
 722  * set and the @set callback will be invoked), the superblock will be
 723  * published and it will be returned in a partially constructed state
 724  * with SB_BORN and SB_ACTIVE as yet unset.
 725  *
 726  * Return: On success, an extant or newly created superblock is
 727  *         returned. On failure an error pointer is returned.
 728  */
 729 struct super_block *sget_fc(struct fs_context *fc,
 730                             int (*test)(struct super_block *, struct fs_context *),
 731                             int (*set)(struct super_block *, struct fs_context *))
 732 {
 733         struct super_block *s = NULL;
 734         struct super_block *old;
 735         struct user_namespace *user_ns = fc->global ? &init_user_ns : fc->user_ns;
 736         int err;
 737
 738 retry:
 739         spin_lock(&sb_lock);
 740         if (test) {
 741                 hlist_for_each_entry(old, &fc->fs_type->fs_supers, s_instances) {
 742                         if (test(old, fc))
 743                                 goto share_extant_sb;
 744                 }
 745         }
 746         if (!s) {
 747                 spin_unlock(&sb_lock);
 748                 s = alloc_super(fc->fs_type, fc->sb_flags, user_ns);
 749                 if (!s)
 750                         return ERR_PTR(-ENOMEM);
 751                 goto retry;
 752         }
 753
 754         s->s_fs_info = fc->s_fs_info;
 755         err = set(s, fc);
 756         if (err) {
 757                 s->s_fs_info = NULL;
 758                 spin_unlock(&sb_lock);
 759                 destroy_unused_super(s);
 760                 return ERR_PTR(err);
 761         }
 762         fc->s_fs_info = NULL;
 763         s->s_type = fc->fs_type;
 764         s->s_iflags |= fc->s_iflags;
 765         strscpy(s->s_id, s->s_type->name, sizeof(s->s_id));
 766         /*
 767          * Make the superblock visible on @super_blocks and @fs_supers.
 768          * It's in a nascent state and users should wait on SB_BORN or
 769          * SB_DYING to be set.
 770          */
 771         list_add_tail(&s->s_list, &super_blocks);
 772         hlist_add_head(&s->s_instances, &s->s_type->fs_supers);
 773         spin_unlock(&sb_lock);
 774         get_filesystem(s->s_type);
 775         shrinker_register(s->s_shrink);
 776         return s;
 777
 778 share_extant_sb:
 779         if (user_ns != old->s_user_ns || fc->exclusive) {
 780                 spin_unlock(&sb_lock);
 781                 destroy_unused_super(s);
 782                 if (fc->exclusive)
 783                         warnfc(fc, "reusing existing filesystem not allowed");
 784                 else
 785                         warnfc(fc, "reusing existing filesystem in another namespace not allowed");
 786                 return ERR_PTR(-EBUSY);
 787         }
 788         if (!grab_super(old))
 789                 goto retry;
 790         destroy_unused_super(s);
 791         return old;
 792 }
 793 EXPORT_SYMBOL(sget_fc);
 794
 795 /**
 796  *      sget    -       find or create a superblock
 797  *      @type:    filesystem type superblock should belong to
 798  *      @test:    comparison callback
 799  *      @set:     setup callback
 800  *      @flags:   mount flags
 801  *      @data:    argument to each of them
 802  */
 803 struct super_block *sget(struct file_system_type *type,
 804                         int (*test)(struct super_block *,void *),
 805                         int (*set)(struct super_block *,void *),
 806                         int flags,
 807                         void *data)
 808 {
 809         struct user_namespace *user_ns = current_user_ns();
 810         struct super_block *s = NULL;
 811         struct super_block *old;
 812         int err;
 813
 814         /* We don't yet pass the user namespace of the parent
 815          * mount through to here so always use &init_user_ns
 816          * until that changes.
 817          */
 818         if (flags & SB_SUBMOUNT)
 819                 user_ns = &init_user_ns;
 820
 821 retry:
 822         spin_lock(&sb_lock);
 823         if (test) {
 824                 hlist_for_each_entry(old, &type->fs_supers, s_instances) {
 825                         if (!test(old, data))
 826                                 continue;
 827                         if (user_ns != old->s_user_ns) {
 828                                 spin_unlock(&sb_lock);
 829                                 destroy_unused_super(s);
 830                                 return ERR_PTR(-EBUSY);
 831                         }
 832                         if (!grab_super(old))
 833                                 goto retry;
 834                         destroy_unused_super(s);
 835                         return old;
 836                 }
 837         }
 838         if (!s) {
 839                 spin_unlock(&sb_lock);
 840                 s = alloc_super(type, (flags & ~SB_SUBMOUNT), user_ns);
 841                 if (!s)
 842                         return ERR_PTR(-ENOMEM);
 843                 goto retry;
 844         }
 845
 846         err = set(s, data);
 847         if (err) {
 848                 spin_unlock(&sb_lock);
 849                 destroy_unused_super(s);
 850                 return ERR_PTR(err);
 851         }
 852         s->s_type = type;
 853         strscpy(s->s_id, type->name, sizeof(s->s_id));
 854         list_add_tail(&s->s_list, &super_blocks);
 855         hlist_add_head(&s->s_instances, &type->fs_supers);
 856         spin_unlock(&sb_lock);
 857         get_filesystem(type);
 858         shrinker_register(s->s_shrink);
 859         return s;
 860 }
 861 EXPORT_SYMBOL(sget);
 862
 863 void drop_super(struct super_block *sb)
 864 {
 865         super_unlock_shared(sb);
 866         put_super(sb);
 867 }
 868
 869 EXPORT_SYMBOL(drop_super);
 870
 871 void drop_super_exclusive(struct super_block *sb)
 872 {
 873         super_unlock_excl(sb);
 874         put_super(sb);
 875 }
 876 EXPORT_SYMBOL(drop_super_exclusive);
 877
 878 static void __iterate_supers(void (*f)(struct super_block *))
 879 {
 880         struct super_block *sb, *p = NULL;
 881
 882         spin_lock(&sb_lock);
 883         list_for_each_entry(sb, &super_blocks, s_list) {
 884                 if (super_flags(sb, SB_DYING))
 885                         continue;
 886                 sb->s_count++;
 887                 spin_unlock(&sb_lock);
 888
 889                 f(sb);
 890
 891                 spin_lock(&sb_lock);
 892                 if (p)
 893                         __put_super(p);
 894                 p = sb;
 895         }
 896         if (p)
 897                 __put_super(p);
 898         spin_unlock(&sb_lock);
 899 }
 900 /**
 901  *      iterate_supers - call function for all active superblocks
 902  *      @f: function to call
 903  *      @arg: argument to pass to it
 904  *
 905  *      Scans the superblock list and calls given function, passing it
 906  *      locked superblock and given argument.
 907  */
 908 void iterate_supers(void (*f)(struct super_block *, void *), void *arg)
 909 {
 910         struct super_block *sb, *p = NULL;
 911
 912         spin_lock(&sb_lock);
 913         list_for_each_entry(sb, &super_blocks, s_list) {
 914                 bool locked;
 915
 916                 sb->s_count++;
 917                 spin_unlock(&sb_lock);
 918
 919                 locked = super_lock_shared(sb);
 920                 if (locked) {
 921                         if (sb->s_root)
 922                                 f(sb, arg);
 923                         super_unlock_shared(sb);
 924                 }
 925
 926                 spin_lock(&sb_lock);
 927                 if (p)
 928                         __put_super(p);
 929                 p = sb;
 930         }
 931         if (p)
 932                 __put_super(p);
 933         spin_unlock(&sb_lock);
 934 }
 935
 936 /**
 937  *      iterate_supers_type - call function for superblocks of given type
 938  *      @type: fs type
 939  *      @f: function to call
 940  *      @arg: argument to pass to it
 941  *
 942  *      Scans the superblock list and calls given function, passing it
 943  *      locked superblock and given argument.
 944  */
 945 void iterate_supers_type(struct file_system_type *type,
 946         void (*f)(struct super_block *, void *), void *arg)
 947 {
 948         struct super_block *sb, *p = NULL;
 949
 950         spin_lock(&sb_lock);
 951         hlist_for_each_entry(sb, &type->fs_supers, s_instances) {
 952                 bool locked;
 953
 954                 sb->s_count++;
 955                 spin_unlock(&sb_lock);
 956
 957                 locked = super_lock_shared(sb);
 958                 if (locked) {
 959                         if (sb->s_root)
 960                                 f(sb, arg);
 961                         super_unlock_shared(sb);
 962                 }
 963
 964                 spin_lock(&sb_lock);
 965                 if (p)
 966                         __put_super(p);
 967                 p = sb;
 968         }
 969         if (p)
 970                 __put_super(p);
 971         spin_unlock(&sb_lock);
 972 }
 973
 974 EXPORT_SYMBOL(iterate_supers_type);
 975
 976 struct super_block *user_get_super(dev_t dev, bool excl)
 977 {
 978         struct super_block *sb;
 979
 980         spin_lock(&sb_lock);
 981         list_for_each_entry(sb, &super_blocks, s_list) {
 982                 if (sb->s_dev ==  dev) {
 983                         bool locked;
 984
 985                         sb->s_count++;
 986                         spin_unlock(&sb_lock);
 987                         /* still alive? */
 988                         locked = super_lock(sb, excl);
 989                         if (locked) {
 990                                 if (sb->s_root)
 991                                         return sb;
 992                                 super_unlock(sb, excl);
 993                         }
 994                         /* nope, got unmounted */
 995                         spin_lock(&sb_lock);
 996                         __put_super(sb);
 997                         break;
 998                 }
 999         }
1000         spin_unlock(&sb_lock);
1001         return NULL;
1002 }
1003
1004 /**
1005  * reconfigure_super - asks filesystem to change superblock parameters
1006  * @fc: The superblock and configuration
1007  *
1008  * Alters the configuration parameters of a live superblock.
1009  */
1010 int reconfigure_super(struct fs_context *fc)
1011 {
1012         struct super_block *sb = fc->root->d_sb;
1013         int retval;
1014         bool remount_ro = false;
1015         bool remount_rw = false;
1016         bool force = fc->sb_flags & SB_FORCE;
1017
1018         if (fc->sb_flags_mask & ~MS_RMT_MASK)
1019                 return -EINVAL;
1020         if (sb->s_writers.frozen != SB_UNFROZEN)
1021                 return -EBUSY;
1022
1023         retval = security_sb_remount(sb, fc->security);
1024         if (retval)
1025                 return retval;
1026
1027         if (fc->sb_flags_mask & SB_RDONLY) {
1028 #ifdef CONFIG_BLOCK
1029                 if (!(fc->sb_flags & SB_RDONLY) && sb->s_bdev &&
1030                     bdev_read_only(sb->s_bdev))
1031                         return -EACCES;
1032 #endif
1033                 remount_rw = !(fc->sb_flags & SB_RDONLY) && sb_rdonly(sb);
1034                 remount_ro = (fc->sb_flags & SB_RDONLY) && !sb_rdonly(sb);
1035         }
1036
1037         if (remount_ro) {
1038                 if (!hlist_empty(&sb->s_pins)) {
1039                         super_unlock_excl(sb);
1040                         group_pin_kill(&sb->s_pins);
1041                         __super_lock_excl(sb);
1042                         if (!sb->s_root)
1043                                 return 0;
1044                         if (sb->s_writers.frozen != SB_UNFROZEN)
1045                                 return -EBUSY;
1046                         remount_ro = !sb_rdonly(sb);
1047                 }
1048         }
1049         shrink_dcache_sb(sb);
1050
1051         /* If we are reconfiguring to RDONLY and current sb is read/write,
1052          * make sure there are no files open for writing.
1053          */
1054         if (remount_ro) {
1055                 if (force) {
1056                         sb_start_ro_state_change(sb);
1057                 } else {
1058                         retval = sb_prepare_remount_readonly(sb);
1059                         if (retval)
1060                                 return retval;
1061                 }
1062         } else if (remount_rw) {
1063                 /*
1064                  * Protect filesystem's reconfigure code from writes from
1065                  * userspace until reconfigure finishes.
1066                  */
1067                 sb_start_ro_state_change(sb);
1068         }
1069
1070         if (fc->ops->reconfigure) {
1071                 retval = fc->ops->reconfigure(fc);
1072                 if (retval) {
1073                         if (!force)
1074                                 goto cancel_readonly;
1075                         /* If forced remount, go ahead despite any errors */
1076                         WARN(1, "forced remount of a %s fs returned %i\n",
1077                              sb->s_type->name, retval);
1078                 }
1079         }
1080
1081         WRITE_ONCE(sb->s_flags, ((sb->s_flags & ~fc->sb_flags_mask) |
1082                                  (fc->sb_flags & fc->sb_flags_mask)));
1083         sb_end_ro_state_change(sb);
1084
1085         /*
1086          * Some filesystems modify their metadata via some other path than the
1087          * bdev buffer cache (eg. use a private mapping, or directories in
1088          * pagecache, etc). Also file data modifications go via their own
1089          * mappings. So If we try to mount readonly then copy the filesystem
1090          * from bdev, we could get stale data, so invalidate it to give a best
1091          * effort at coherency.
1092          */
1093         if (remount_ro && sb->s_bdev)
1094                 invalidate_bdev(sb->s_bdev);
1095         return 0;
1096
1097 cancel_readonly:
1098         sb_end_ro_state_change(sb);
1099         return retval;
1100 }
1101
1102 static void do_emergency_remount_callback(struct super_block *sb)
1103 {
1104         bool locked = super_lock_excl(sb);
1105
1106         if (locked && sb->s_root && sb->s_bdev && !sb_rdonly(sb)) {
1107                 struct fs_context *fc;
1108
1109                 fc = fs_context_for_reconfigure(sb->s_root,
1110                                         SB_RDONLY | SB_FORCE, SB_RDONLY);
1111                 if (!IS_ERR(fc)) {
1112                         if (parse_monolithic_mount_data(fc, NULL) == 0)
1113                                 (void)reconfigure_super(fc);
1114                         put_fs_context(fc);
1115                 }
1116         }
1117         if (locked)
1118                 super_unlock_excl(sb);
1119 }
1120
1121 static void do_emergency_remount(struct work_struct *work)
1122 {
1123         __iterate_supers(do_emergency_remount_callback);
1124         kfree(work);
1125         printk("Emergency Remount complete\n");
1126 }
1127
1128 void emergency_remount(void)
1129 {
1130         struct work_struct *work;
1131
1132         work = kmalloc(sizeof(*work), GFP_ATOMIC);
1133         if (work) {
1134                 INIT_WORK(work, do_emergency_remount);
1135                 schedule_work(work);
1136         }
1137 }
1138
1139 static void do_thaw_all_callback(struct super_block *sb)
1140 {
1141         bool locked = super_lock_excl(sb);
1142
1143         if (locked && sb->s_root) {
1144                 if (IS_ENABLED(CONFIG_BLOCK))
1145                         while (sb->s_bdev && !bdev_thaw(sb->s_bdev))
1146                                 pr_warn("Emergency Thaw on %pg\n", sb->s_bdev);
1147                 thaw_super_locked(sb, FREEZE_HOLDER_USERSPACE);
1148                 return;
1149         }
1150         if (locked)
1151                 super_unlock_excl(sb);
1152 }
1153
1154 static void do_thaw_all(struct work_struct *work)
1155 {
1156         __iterate_supers(do_thaw_all_callback);
1157         kfree(work);
1158         printk(KERN_WARNING "Emergency Thaw complete\n");
1159 }
1160
1161 /**
1162  * emergency_thaw_all -- forcibly thaw every frozen filesystem
1163  *
1164  * Used for emergency unfreeze of all filesystems via SysRq
1165  */
1166 void emergency_thaw_all(void)
1167 {
1168         struct work_struct *work;
1169
1170         work = kmalloc(sizeof(*work), GFP_ATOMIC);
1171         if (work) {
1172                 INIT_WORK(work, do_thaw_all);
1173                 schedule_work(work);
1174         }
1175 }
1176
1177 static DEFINE_IDA(unnamed_dev_ida);
1178
1179 /**
1180  * get_anon_bdev - Allocate a block device for filesystems which don't have one.
1181  * @p: Pointer to a dev_t.
1182  *
1183  * Filesystems which don't use real block devices can call this function
1184  * to allocate a virtual block device.
1185  *
1186  * Context: Any context.  Frequently called while holding sb_lock.
1187  * Return: 0 on success, -EMFILE if there are no anonymous bdevs left
1188  * or -ENOMEM if memory allocation failed.
1189  */
1190 int get_anon_bdev(dev_t *p)
1191 {
1192         int dev;
1193
1194         /*
1195          * Many userspace utilities consider an FSID of 0 invalid.
1196          * Always return at least 1 from get_anon_bdev.
1197          */
1198         dev = ida_alloc_range(&unnamed_dev_ida, 1, (1 << MINORBITS) - 1,
1199                         GFP_ATOMIC);
1200         if (dev == -ENOSPC)
1201                 dev = -EMFILE;
1202         if (dev < 0)
1203                 return dev;
1204
1205         *p = MKDEV(0, dev);
1206         return 0;
1207 }
1208 EXPORT_SYMBOL(get_anon_bdev);
1209
1210 void free_anon_bdev(dev_t dev)
1211 {
1212         ida_free(&unnamed_dev_ida, MINOR(dev));
1213 }
1214 EXPORT_SYMBOL(free_anon_bdev);
1215
1216 int set_anon_super(struct super_block *s, void *data)
1217 {
1218         return get_anon_bdev(&s->s_dev);
1219 }
1220 EXPORT_SYMBOL(set_anon_super);
1221
1222 void kill_anon_super(struct super_block *sb)
1223 {
1224         dev_t dev = sb->s_dev;
1225         generic_shutdown_super(sb);
1226         kill_super_notify(sb);
1227         free_anon_bdev(dev);
1228 }
1229 EXPORT_SYMBOL(kill_anon_super);
1230
1231 void kill_litter_super(struct super_block *sb)
1232 {
1233         if (sb->s_root)
1234                 d_genocide(sb->s_root);
1235         kill_anon_super(sb);
1236 }
1237 EXPORT_SYMBOL(kill_litter_super);
1238
1239 int set_anon_super_fc(struct super_block *sb, struct fs_context *fc)
1240 {
1241         return set_anon_super(sb, NULL);
1242 }
1243 EXPORT_SYMBOL(set_anon_super_fc);
1244
1245 static int test_keyed_super(struct super_block *sb, struct fs_context *fc)
1246 {
1247         return sb->s_fs_info == fc->s_fs_info;
1248 }
1249
/*
 * sget_fc() test callback for single-instance filesystems: every
 * superblock offered is reported as a match.
 */
static int test_single_super(struct super_block *s, struct fs_context *fc)
{
	return 1;
}
1254
1255 static int vfs_get_super(struct fs_context *fc,
1256                 int (*test)(struct super_block *, struct fs_context *),
1257                 int (*fill_super)(struct super_block *sb,
1258                                   struct fs_context *fc))
1259 {
1260         struct super_block *sb;
1261         int err;
1262
1263         sb = sget_fc(fc, test, set_anon_super_fc);
1264         if (IS_ERR(sb))
1265                 return PTR_ERR(sb);
1266
1267         if (!sb->s_root) {
1268                 err = fill_super(sb, fc);
1269                 if (err)
1270                         goto error;
1271
1272                 sb->s_flags |= SB_ACTIVE;
1273         }
1274
1275         fc->root = dget(sb->s_root);
1276         return 0;
1277
1278 error:
1279         deactivate_locked_super(sb);
1280         return err;
1281 }
1282
1283 int get_tree_nodev(struct fs_context *fc,
1284                   int (*fill_super)(struct super_block *sb,
1285                                     struct fs_context *fc))
1286 {
1287         return vfs_get_super(fc, NULL, fill_super);
1288 }
1289 EXPORT_SYMBOL(get_tree_nodev);
1290
1291 int get_tree_single(struct fs_context *fc,
1292                   int (*fill_super)(struct super_block *sb,
1293                                     struct fs_context *fc))
1294 {
1295         return vfs_get_super(fc, test_single_super, fill_super);
1296 }
1297 EXPORT_SYMBOL(get_tree_single);
1298
1299 int get_tree_keyed(struct fs_context *fc,
1300                   int (*fill_super)(struct super_block *sb,
1301                                     struct fs_context *fc),
1302                 void *key)
1303 {
1304         fc->s_fs_info = key;
1305         return vfs_get_super(fc, test_keyed_super, fill_super);
1306 }
1307 EXPORT_SYMBOL(get_tree_keyed);
1308
1309 static int set_bdev_super(struct super_block *s, void *data)
1310 {
1311         s->s_dev = *(dev_t *)data;
1312         return 0;
1313 }
1314
1315 static int super_s_dev_set(struct super_block *s, struct fs_context *fc)
1316 {
1317         return set_bdev_super(s, fc->sget_key);
1318 }
1319
1320 static int super_s_dev_test(struct super_block *s, struct fs_context *fc)
1321 {
1322         return !(s->s_iflags & SB_I_RETIRED) &&
1323                 s->s_dev == *(dev_t *)fc->sget_key;
1324 }
1325
1326 /**
1327  * sget_dev - Find or create a superblock by device number
1328  * @fc: Filesystem context.
1329  * @dev: device number
1330  *
1331  * Find or create a superblock using the provided device number that
1332  * will be stored in fc->sget_key.
1333  *
1334  * If an extant superblock is matched, then that will be returned with
1335  * an elevated reference count that the caller must transfer or discard.
1336  *
1337  * If no match is made, a new superblock will be allocated and basic
1338  * initialisation will be performed (s_type, s_fs_info, s_id, s_dev will
1339  * be set). The superblock will be published and it will be returned in
1340  * a partially constructed state with SB_BORN and SB_ACTIVE as yet
1341  * unset.
1342  *
1343  * Return: an existing or newly created superblock on success, an error
1344  *         pointer on failure.
1345  */
1346 struct super_block *sget_dev(struct fs_context *fc, dev_t dev)
1347 {
1348         fc->sget_key = &dev;
1349         return sget_fc(fc, super_s_dev_test, super_s_dev_set);
1350 }
1351 EXPORT_SYMBOL(sget_dev);
1352
1353 #ifdef CONFIG_BLOCK
1354 /*
1355  * Lock the superblock that is holder of the bdev. Returns the superblock
1356  * pointer if we successfully locked the superblock and it is alive. Otherwise
1357  * we return NULL and just unlock bdev->bd_holder_lock.
1358  *
1359  * The function must be called with bdev->bd_holder_lock and releases it.
1360  */
1361 static struct super_block *bdev_super_lock(struct block_device *bdev, bool excl)
1362         __releases(&bdev->bd_holder_lock)
1363 {
1364         struct super_block *sb = bdev->bd_holder;
1365         bool locked;
1366
1367         lockdep_assert_held(&bdev->bd_holder_lock);
1368         lockdep_assert_not_held(&sb->s_umount);
1369         lockdep_assert_not_held(&bdev->bd_disk->open_mutex);
1370
1371         /* Make sure sb doesn't go away from under us */
1372         spin_lock(&sb_lock);
1373         sb->s_count++;
1374         spin_unlock(&sb_lock);
1375
1376         mutex_unlock(&bdev->bd_holder_lock);
1377
1378         locked = super_lock(sb, excl);
1379
1380         /*
1381          * If the superblock wasn't already SB_DYING then we hold
1382          * s_umount and can safely drop our temporary reference.
1383          */
1384         put_super(sb);
1385
1386         if (!locked)
1387                 return NULL;
1388
1389         if (!sb->s_root || !(sb->s_flags & SB_ACTIVE)) {
1390                 super_unlock(sb, excl);
1391                 return NULL;
1392         }
1393
1394         return sb;
1395 }
1396
1397 static void fs_bdev_mark_dead(struct block_device *bdev, bool surprise)
1398 {
1399         struct super_block *sb;
1400
1401         sb = bdev_super_lock(bdev, false);
1402         if (!sb)
1403                 return;
1404
1405         if (!surprise)
1406                 sync_filesystem(sb);
1407         shrink_dcache_sb(sb);
1408         invalidate_inodes(sb);
1409         if (sb->s_op->shutdown)
1410                 sb->s_op->shutdown(sb);
1411
1412         super_unlock_shared(sb);
1413 }
1414
1415 static void fs_bdev_sync(struct block_device *bdev)
1416 {
1417         struct super_block *sb;
1418
1419         sb = bdev_super_lock(bdev, false);
1420         if (!sb)
1421                 return;
1422
1423         sync_filesystem(sb);
1424         super_unlock_shared(sb);
1425 }
1426
1427 static struct super_block *get_bdev_super(struct block_device *bdev)
1428 {
1429         bool active = false;
1430         struct super_block *sb;
1431
1432         sb = bdev_super_lock(bdev, true);
1433         if (sb) {
1434                 active = atomic_inc_not_zero(&sb->s_active);
1435                 super_unlock_excl(sb);
1436         }
1437         if (!active)
1438                 return NULL;
1439         return sb;
1440 }
1441
1442 /**
1443  * fs_bdev_freeze - freeze owning filesystem of block device
1444  * @bdev: block device
1445  *
1446  * Freeze the filesystem that owns this block device if it is still
1447  * active.
1448  *
1449  * A filesystem that owns multiple block devices may be frozen from each
1450  * block device and won't be unfrozen until all block devices are
1451  * unfrozen. Each block device can only freeze the filesystem once as we
1452  * nest freezes for block devices in the block layer.
1453  *
1454  * Return: If the freeze was successful zero is returned. If the freeze
1455  *         failed a negative error code is returned.
1456  */
1457 static int fs_bdev_freeze(struct block_device *bdev)
1458 {
1459         struct super_block *sb;
1460         int error = 0;
1461
1462         lockdep_assert_held(&bdev->bd_fsfreeze_mutex);
1463
1464         sb = get_bdev_super(bdev);
1465         if (!sb)
1466                 return -EINVAL;
1467
1468         if (sb->s_op->freeze_super)
1469                 error = sb->s_op->freeze_super(sb,
1470                                 FREEZE_MAY_NEST | FREEZE_HOLDER_USERSPACE);
1471         else
1472                 error = freeze_super(sb,
1473                                 FREEZE_MAY_NEST | FREEZE_HOLDER_USERSPACE);
1474         if (!error)
1475                 error = sync_blockdev(bdev);
1476         deactivate_super(sb);
1477         return error;
1478 }
1479
1480 /**
1481  * fs_bdev_thaw - thaw owning filesystem of block device
1482  * @bdev: block device
1483  *
1484  * Thaw the filesystem that owns this block device.
1485  *
1486  * A filesystem that owns multiple block devices may be frozen from each
1487  * block device and won't be unfrozen until all block devices are
1488  * unfrozen. Each block device can only freeze the filesystem once as we
1489  * nest freezes for block devices in the block layer.
1490  *
1491  * Return: If the thaw was successful zero is returned. If the thaw
1492  *         failed a negative error code is returned. If this function
1493  *         returns zero it doesn't mean that the filesystem is unfrozen
1494  *         as it may have been frozen multiple times (kernel may hold a
1495  *         freeze or might be frozen from other block devices).
1496  */
1497 static int fs_bdev_thaw(struct block_device *bdev)
1498 {
1499         struct super_block *sb;
1500         int error;
1501
1502         lockdep_assert_held(&bdev->bd_fsfreeze_mutex);
1503
1504         sb = get_bdev_super(bdev);
1505         if (WARN_ON_ONCE(!sb))
1506                 return -EINVAL;
1507
1508         if (sb->s_op->thaw_super)
1509                 error = sb->s_op->thaw_super(sb,
1510                                 FREEZE_MAY_NEST | FREEZE_HOLDER_USERSPACE);
1511         else
1512                 error = thaw_super(sb,
1513                                 FREEZE_MAY_NEST | FREEZE_HOLDER_USERSPACE);
1514         deactivate_super(sb);
1515         return error;
1516 }
1517
1518 const struct blk_holder_ops fs_holder_ops = {
1519         .mark_dead              = fs_bdev_mark_dead,
1520         .sync                   = fs_bdev_sync,
1521         .freeze                 = fs_bdev_freeze,
1522         .thaw                   = fs_bdev_thaw,
1523 };
1524 EXPORT_SYMBOL_GPL(fs_holder_ops);
1525
1526 int setup_bdev_super(struct super_block *sb, int sb_flags,
1527                 struct fs_context *fc)
1528 {
1529         blk_mode_t mode = sb_open_mode(sb_flags);
1530         struct file *bdev_file;
1531         struct block_device *bdev;
1532
1533         bdev_file = bdev_file_open_by_dev(sb->s_dev, mode, sb, &fs_holder_ops);
1534         if (IS_ERR(bdev_file)) {
1535                 if (fc)
1536                         errorf(fc, "%s: Can't open blockdev", fc->source);
1537                 return PTR_ERR(bdev_file);
1538         }
1539         bdev = file_bdev(bdev_file);
1540
1541         /*
1542          * This really should be in blkdev_get_by_dev, but right now can't due
1543          * to legacy issues that require us to allow opening a block device node
1544          * writable from userspace even for a read-only block device.
1545          */
1546         if ((mode & BLK_OPEN_WRITE) && bdev_read_only(bdev)) {
1547                 fput(bdev_file);
1548                 return -EACCES;
1549         }
1550
1551         /*
1552          * It is enough to check bdev was not frozen before we set
1553          * s_bdev as freezing will wait until SB_BORN is set.
1554          */
1555         if (atomic_read(&bdev->bd_fsfreeze_count) > 0) {
1556                 if (fc)
1557                         warnf(fc, "%pg: Can't mount, blockdev is frozen", bdev);
1558                 fput(bdev_file);
1559                 return -EBUSY;
1560         }
1561         spin_lock(&sb_lock);
1562         sb->s_bdev_file = bdev_file;
1563         sb->s_bdev = bdev;
1564         sb->s_bdi = bdi_get(bdev->bd_disk->bdi);
1565         if (bdev_stable_writes(bdev))
1566                 sb->s_iflags |= SB_I_STABLE_WRITES;
1567         spin_unlock(&sb_lock);
1568
1569         snprintf(sb->s_id, sizeof(sb->s_id), "%pg", bdev);
1570         shrinker_debugfs_rename(sb->s_shrink, "sb-%s:%s", sb->s_type->name,
1571                                 sb->s_id);
1572         sb_set_blocksize(sb, block_size(bdev));
1573         return 0;
1574 }
1575 EXPORT_SYMBOL_GPL(setup_bdev_super);
1576
1577 /**
1578  * get_tree_bdev - Get a superblock based on a single block device
1579  * @fc: The filesystem context holding the parameters
1580  * @fill_super: Helper to initialise a new superblock
1581  */
1582 int get_tree_bdev(struct fs_context *fc,
1583                 int (*fill_super)(struct super_block *,
1584                                   struct fs_context *))
1585 {
1586         struct super_block *s;
1587         int error = 0;
1588         dev_t dev;
1589
1590         if (!fc->source)
1591                 return invalf(fc, "No source specified");
1592
1593         error = lookup_bdev(fc->source, &dev);
1594         if (error) {
1595                 errorf(fc, "%s: Can't lookup blockdev", fc->source);
1596                 return error;
1597         }
1598
1599         fc->sb_flags |= SB_NOSEC;
1600         s = sget_dev(fc, dev);
1601         if (IS_ERR(s))
1602                 return PTR_ERR(s);
1603
1604         if (s->s_root) {
1605                 /* Don't summarily change the RO/RW state. */
1606                 if ((fc->sb_flags ^ s->s_flags) & SB_RDONLY) {
1607                         warnf(fc, "%pg: Can't mount, would change RO state", s->s_bdev);
1608                         deactivate_locked_super(s);
1609                         return -EBUSY;
1610                 }
1611         } else {
1612                 error = setup_bdev_super(s, fc->sb_flags, fc);
1613                 if (!error)
1614                         error = fill_super(s, fc);
1615                 if (error) {
1616                         deactivate_locked_super(s);
1617                         return error;
1618                 }
1619                 s->s_flags |= SB_ACTIVE;
1620         }
1621
1622         BUG_ON(fc->root);
1623         fc->root = dget(s->s_root);
1624         return 0;
1625 }
1626 EXPORT_SYMBOL(get_tree_bdev);
1627
1628 static int test_bdev_super(struct super_block *s, void *data)
1629 {
1630         return !(s->s_iflags & SB_I_RETIRED) && s->s_dev == *(dev_t *)data;
1631 }
1632
1633 struct dentry *mount_bdev(struct file_system_type *fs_type,
1634         int flags, const char *dev_name, void *data,
1635         int (*fill_super)(struct super_block *, void *, int))
1636 {
1637         struct super_block *s;
1638         int error;
1639         dev_t dev;
1640
1641         error = lookup_bdev(dev_name, &dev);
1642         if (error)
1643                 return ERR_PTR(error);
1644
1645         flags |= SB_NOSEC;
1646         s = sget(fs_type, test_bdev_super, set_bdev_super, flags, &dev);
1647         if (IS_ERR(s))
1648                 return ERR_CAST(s);
1649
1650         if (s->s_root) {
1651                 if ((flags ^ s->s_flags) & SB_RDONLY) {
1652                         deactivate_locked_super(s);
1653                         return ERR_PTR(-EBUSY);
1654                 }
1655         } else {
1656                 error = setup_bdev_super(s, flags, NULL);
1657                 if (!error)
1658                         error = fill_super(s, data, flags & SB_SILENT ? 1 : 0);
1659                 if (error) {
1660                         deactivate_locked_super(s);
1661                         return ERR_PTR(error);
1662                 }
1663
1664                 s->s_flags |= SB_ACTIVE;
1665         }
1666
1667         return dget(s->s_root);
1668 }
1669 EXPORT_SYMBOL(mount_bdev);
1670
1671 void kill_block_super(struct super_block *sb)
1672 {
1673         struct block_device *bdev = sb->s_bdev;
1674
1675         generic_shutdown_super(sb);
1676         if (bdev) {
1677                 sync_blockdev(bdev);
1678                 fput(sb->s_bdev_file);
1679         }
1680 }
1681
1682 EXPORT_SYMBOL(kill_block_super);
1683 #endif
1684
1685 struct dentry *mount_nodev(struct file_system_type *fs_type,
1686         int flags, void *data,
1687         int (*fill_super)(struct super_block *, void *, int))
1688 {
1689         int error;
1690         struct super_block *s = sget(fs_type, NULL, set_anon_super, flags, NULL);
1691
1692         if (IS_ERR(s))
1693                 return ERR_CAST(s);
1694
1695         error = fill_super(s, data, flags & SB_SILENT ? 1 : 0);
1696         if (error) {
1697                 deactivate_locked_super(s);
1698                 return ERR_PTR(error);
1699         }
1700         s->s_flags |= SB_ACTIVE;
1701         return dget(s->s_root);
1702 }
1703 EXPORT_SYMBOL(mount_nodev);
1704
1705 /*
1706  * Reconfigure the existing superblock @s from legacy mount @flags and @data.
1707  * Returns reconfigure_super()'s result, or the context setup/parsing error.
1708  */
1709 int reconfigure_single(struct super_block *s, int flags, void *data)
1710 {
1711         struct fs_context *ctx;
1712         int err;
1713
1714         /* The caller really needs to be passing fc down into mount_single(),
1715          * then a chunk of this can be removed.  [Bollocks -- AV]
1716          * Better yet, reconfiguration shouldn't happen, but rather the second
1717          * mount should be rejected if the parameters are not compatible.
1718          */
1719         ctx = fs_context_for_reconfigure(s->s_root, flags, MS_RMT_MASK);
1720         if (IS_ERR(ctx))
1721                 return PTR_ERR(ctx);
1722
1723         err = parse_monolithic_mount_data(ctx, data);
1724         if (err >= 0)
1725                 err = reconfigure_super(ctx);
1726         put_fs_context(ctx);    /* released on success and failure alike */
1727         return err;
1728 }
1729
1730 static int compare_single(struct super_block *s, void *p)
1731 {
1732         return 1;       /* unconditional: @s and @p are never examined */
1733 }
1734
1735 /* Fill the sb returned by sget() if it has no root yet, else reconfigure it. */
1736 struct dentry *mount_single(struct file_system_type *fs_type, int flags,
1737         void *data, int (*fill_super)(struct super_block *, void *, int))
1738 {
1739         struct super_block *sb;
1740         int err;
1741
1742         sb = sget(fs_type, compare_single, set_anon_super, flags, NULL);
1743         if (IS_ERR(sb))
1744                 return ERR_CAST(sb);
1745         if (sb->s_root) {
1746                 err = reconfigure_single(sb, flags, data);
1747         } else {
1748                 err = fill_super(sb, data, (flags & SB_SILENT) ? 1 : 0);
1749                 if (!err)
1750                         sb->s_flags |= SB_ACTIVE;
1751         }
1752         if (unlikely(err)) {
1753                 deactivate_locked_super(sb);
1754                 return ERR_PTR(err);
1755         }
1756         return dget(sb->s_root);
1757 }
1758 EXPORT_SYMBOL(mount_single);
1759
1760 /**
1761  * vfs_get_tree - Get the mountable root
1762  * @fc: The superblock configuration context.
1763  *
1764  * The filesystem's ->get_tree() is invoked to get or create a superblock that
1765  * can later be mounted.  It must place the root to be mounted in @fc->root.
1766  *
1767  * Return: 0 on success, -EBUSY if @fc->root is already set, otherwise the
1768  * error from ->get_tree() or from security_sb_set_mnt_opts().
1769  */
1770 int vfs_get_tree(struct fs_context *fc)
1771 {
1772         struct super_block *sb;
1773         int ret;
1774
1775         if (fc->root)
1776                 return -EBUSY;
1777
1778         /* Expect fc->root back, with a ref on the root and on the superblock. */
1779         ret = fc->ops->get_tree(fc);
1780         if (ret < 0)
1781                 return ret;
1782
1783         if (!fc->root) {
1784                 /* We can't tell what the locking state of the superblock is -
1785                  * or whether there even is a superblock.
1786                  */
1787                 pr_err("Filesystem %s get_tree() didn't set fc->root\n",
1788                        fc->fs_type->name);
1789                 BUG();
1790         }
1791
1792         sb = fc->root->d_sb;
1793         WARN_ON(!sb->s_bdi);
1794
1795         /*
1796          * super_wake() contains a memory barrier which also takes care of
1797          * ordering for super_cache_count().  The barrier sits before SB_BORN
1798          * is set because the data dependency between the two functions is the
1799          * superblock structure contents that we just set up, not the SB_BORN
1800          * flag.
1801          */
1802         super_wake(sb, SB_BORN);
1803
1804         ret = security_sb_set_mnt_opts(sb, fc->security, 0, NULL);
1805         if (unlikely(ret)) {
1806                 fc_drop_locked(fc);
1807                 return ret;
1808         }
1809
1810         /*
1811          * Filesystems should never set s_maxbytes larger than MAX_LFS_FILESIZE,
1812          * but s_maxbytes was an unsigned long long for many releases.  Keep
1813          * this warning around for a little while to catch filesystems that
1814          * break the rule.
1815          */
1816         WARN((sb->s_maxbytes < 0), "%s set sb->s_maxbytes to "
1817                 "negative value (%lld)\n", fc->fs_type->name, sb->s_maxbytes);
1818
1819         return 0;
1820 }
1821 EXPORT_SYMBOL(vfs_get_tree);
1822
1823 /*
1824  * Allocate a private BDI for @sb and register it under the printf-style name
1825  * @fmt.  It gets automatically cleaned up in generic_shutdown_super().
1826  */
1827 int super_setup_bdi_name(struct super_block *sb, char *fmt, ...)
1828 {
1829         struct backing_dev_info *new_bdi = bdi_alloc(NUMA_NO_NODE);
1830         va_list ap;
1831         int ret;
1832
1833         if (!new_bdi)
1834                 return -ENOMEM;
1835
1836         va_start(ap, fmt);
1837         ret = bdi_register_va(new_bdi, fmt, ap);
1838         va_end(ap);
1839         if (ret) {
1840                 /* Registration failed: give the allocation back. */
1841                 bdi_put(new_bdi);
1842                 return ret;
1843         }
1844
1845         WARN_ON(sb->s_bdi != &noop_backing_dev_info);
1846         sb->s_bdi = new_bdi;
1847         sb->s_iflags |= SB_I_PERSB_BDI;
1848         return 0;
1849 }
1850 EXPORT_SYMBOL(super_setup_bdi_name);
1851
1852 /*
1853  * Set up a private BDI for @sb named "<fs type name, max 28 chars>-<seq>",
1854  * where <seq> is taken from a function-static atomic counter.
1855  */
1856 int super_setup_bdi(struct super_block *sb)
1857 {
1858         static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0);
1859         long seq = atomic_long_inc_return(&bdi_seq);
1860
1861         return super_setup_bdi_name(sb, "%.28s-%ld", sb->s_type->name, seq);
1862 }
1863 EXPORT_SYMBOL(super_setup_bdi);
1864
1865 /**
1866  * sb_wait_write - wait until all writers to given file system finish
1867  * @sb: the super for which we wait
1868  * @level: 1-based freeze level (SB_FREEZE_*) whose writers we wait for
1869  *
1870  * Takes the percpu rwsem at index @level - 1 of s_writers.rw_sem for writing,
1871  * which waits until there are no writers of that type left.
1872  */
1873 static void sb_wait_write(struct super_block *sb, int level)
1874 {
1875         percpu_down_write(&sb->s_writers.rw_sem[level - 1]);
1876 }
1877
1878 /*
1879  * Tell lockdep we no longer own the freeze rwsems: we are about to return to
1880  * userspace, and whoever calls thaw_super() will be the one to unlock them.
1881  */
1882 static void lockdep_sb_freeze_release(struct super_block *sb)
1883 {
1884         int idx = SB_FREEZE_LEVELS;
1885
1886         while (idx-- > 0)       /* highest level first */
1887                 percpu_rwsem_release(&sb->s_writers.rw_sem[idx], 0, _THIS_IP_);
1888 }
1889
1890 /*
1891  * Before ->unfreeze_fs(sb) runs, let lockdep know we hold the freeze rwsems.
1892  */
1893 static void lockdep_sb_freeze_acquire(struct super_block *sb)
1894 {
1895         int idx;
1896
1897         for (idx = 0; idx < SB_FREEZE_LEVELS; idx++)    /* lowest level first */
1898                 percpu_rwsem_acquire(&sb->s_writers.rw_sem[idx], 0, _THIS_IP_);
1899 }
1900
1901 static void sb_freeze_unlock(struct super_block *sb, int level)
1902 {
1903         while (level-- > 0)     /* from index @level - 1 down to 0 */
1904                 percpu_up_write(&sb->s_writers.rw_sem[level]);
1905 }
1906
1907 /* Wait until a freeze/thaw in progress reaches UNFROZEN or FREEZE_COMPLETE. */
1908 static int wait_for_partially_frozen(struct super_block *sb)
1909 {
1910         for (;;) {
1911                 unsigned short seen = sb->s_writers.frozen;
1912                 int err;
1913
1914                 up_write(&sb->s_umount);        /* not held while sleeping */
1915                 err = wait_var_event_killable(&sb->s_writers.frozen,
1916                                                sb->s_writers.frozen != seen);
1917                 down_write(&sb->s_umount);      /* retaken even on error */
1918
1919                 if (err || sb->s_writers.frozen == SB_UNFROZEN ||
1920                     sb->s_writers.frozen == SB_FREEZE_COMPLETE)
1921                         return err;
1922         }
1923 }
1924
1925 #define FREEZE_HOLDERS (FREEZE_HOLDER_KERNEL | FREEZE_HOLDER_USERSPACE) /* holder bits */
1926 #define FREEZE_FLAGS (FREEZE_HOLDERS | FREEZE_MAY_NEST) /* every bit accepted in @who */
1927
1928 /* Bump the freeze count of the holder in @who; returns kcount + ucount. */
1929 static inline int freeze_inc(struct super_block *sb, enum freeze_holder who)
1930 {
1931         WARN_ON_ONCE((who & ~FREEZE_FLAGS));
1932         WARN_ON_ONCE(hweight32(who & FREEZE_HOLDERS) > 1);
1933         if (who & FREEZE_HOLDER_KERNEL)
1934                 sb->s_writers.freeze_kcount++;
1935         if (who & FREEZE_HOLDER_USERSPACE)
1936                 sb->s_writers.freeze_ucount++;
1937         return sb->s_writers.freeze_kcount + sb->s_writers.freeze_ucount;
1938 }
1939
1940 /* Drop one freeze of @who's holder (a zero count is left alone); returns the sum. */
1941 static inline int freeze_dec(struct super_block *sb, enum freeze_holder who)
1942 {
1943         WARN_ON_ONCE((who & ~FREEZE_FLAGS));
1944         WARN_ON_ONCE(hweight32(who & FREEZE_HOLDERS) > 1);
1945         if ((who & FREEZE_HOLDER_KERNEL) && sb->s_writers.freeze_kcount != 0)
1946                 sb->s_writers.freeze_kcount--;
1947         if ((who & FREEZE_HOLDER_USERSPACE) && sb->s_writers.freeze_ucount != 0)
1948                 sb->s_writers.freeze_ucount--;
1949         return sb->s_writers.freeze_kcount + sb->s_writers.freeze_ucount;
1950 }
1951
1952 /* May @who add a freeze?  Yes if nesting is allowed or its count is zero. */
1953 static inline bool may_freeze(struct super_block *sb, enum freeze_holder who)
1954 {
1955         bool nest_ok = who & FREEZE_MAY_NEST;
1956
1957         WARN_ON_ONCE((who & ~FREEZE_FLAGS));
1958         WARN_ON_ONCE(hweight32(who & FREEZE_HOLDERS) > 1);
1959         if (who & FREEZE_HOLDER_KERNEL)
1960                 return nest_ok || !sb->s_writers.freeze_kcount;
1961         if (who & FREEZE_HOLDER_USERSPACE)
1962                 return nest_ok || !sb->s_writers.freeze_ucount;
1963         return false;   /* no holder bit set in @who */
1964 }
1965
1966 /**
1967  * freeze_super - lock the filesystem and force it into a consistent state
1968  * @sb: the super to lock
1969  * @who: context that wants to freeze
1970  *
1971  * Syncs the super to make sure the filesystem is consistent and calls the fs's
1972  * freeze_fs.  Subsequent calls to this without first thawing the fs may return
1973  * -EBUSY.
1974  *
1975  * @who should be:
1976  * * %FREEZE_HOLDER_USERSPACE if userspace wants to freeze the fs;
1977  * * %FREEZE_HOLDER_KERNEL if the kernel wants to freeze the fs.
1978  * * %FREEZE_MAY_NEST whether nesting freeze and thaw requests is allowed.
1979  *
1980  * The @who argument distinguishes between the kernel and userspace trying to
1981  * freeze the filesystem.  Without %FREEZE_MAY_NEST there cannot be multiple
1982  * kernel freezes or multiple userspace freezes in effect at any given time,
1983  * but the kernel and userspace can both hold a filesystem frozen.  The
1984  * filesystem remains frozen until there are no freezes of either kind left.
1985  *
1986  * A filesystem may hold multiple devices and thus a filesystem may be
1987  * frozen through the block layer via multiple block devices. In this
1988  * case the request is marked as being allowed to nest by passing
1989  * FREEZE_MAY_NEST. The filesystem remains frozen until all block
1990  * devices are unfrozen. If multiple freezes are attempted without
1991  * FREEZE_MAY_NEST -EBUSY will be returned.
1992  *
1993  * During this function, sb->s_writers.frozen goes through these values:
1994  *
1995  * SB_UNFROZEN: File system is normal, all writes progress as usual.
1996  *
1997  * SB_FREEZE_WRITE: The file system is in the process of being frozen.  New
1998  * writes should be blocked, though page faults are still allowed. We wait for
1999  * all writes to complete and then proceed to the next stage.
2000  *
2001  * SB_FREEZE_PAGEFAULT: Freezing continues. Now also page faults are blocked
2002  * but internal fs threads can still modify the filesystem (although they
2003  * should not dirty new pages or inodes), writeback can run etc. After waiting
2004  * for all running page faults we sync the filesystem which will clean all
2005  * dirty pages and inodes (no new dirty pages or inodes can be created when
2006  * sync is running).
2007  *
2008  * SB_FREEZE_FS: The file system is frozen. Now all internal sources of fs
2009  * modification are blocked (e.g. XFS preallocation truncation on inode
2010  * reclaim). This is usually implemented by blocking new transactions for
2011  * filesystems that have them and need this additional guard. After all
2012  * internal writers are finished we call ->freeze_fs() to finish filesystem
2013  * freezing. Then we transition to SB_FREEZE_COMPLETE state. This state is
2014  * mostly auxiliary for filesystems to verify they do not modify frozen fs.
2015  *
2016  * sb->s_writers.frozen is protected by sb->s_umount.
2017  *
2018  * Return: If the freeze was successful zero is returned. If the freeze
2019  *         failed a negative error code is returned.
2020  */
2021 int freeze_super(struct super_block *sb, enum freeze_holder who)
2022 {
2023         int ret;
2024
2025         if (!super_lock_excl(sb)) {
2026                 WARN_ON_ONCE("Dying superblock while freezing!");
2027                 return -EINVAL;
2028         }
2029         atomic_inc(&sb->s_active);      /* kept only if we freeze it ourselves */
2030
2031 retry:
2032         if (sb->s_writers.frozen == SB_FREEZE_COMPLETE) {
2033                 if (may_freeze(sb, who))
2034                         ret = !!WARN_ON_ONCE(freeze_inc(sb, who) == 1);
2035                 else
2036                         ret = -EBUSY;
2037                 /* All freezers share a single active reference, drop ours. */
2038                 deactivate_locked_super(sb);
2039                 return ret;
2040         }
2041
2042         if (sb->s_writers.frozen != SB_UNFROZEN) {
2043                 ret = wait_for_partially_frozen(sb);
2044                 if (ret) {
2045                         deactivate_locked_super(sb);
2046                         return ret;
2047                 }
2048                 /* s_umount was dropped while waiting, so start over. */
2049                 goto retry;
2050         }
2051
2052         if (sb_rdonly(sb)) {
2053                 /* Read-only: nothing to do really, just mark it frozen. */
2054                 WARN_ON_ONCE(freeze_inc(sb, who) > 1);
2055                 sb->s_writers.frozen = SB_FREEZE_COMPLETE;
2056                 wake_up_var(&sb->s_writers.frozen);
2057                 super_unlock_excl(sb);
2058                 return 0;
2059         }
2060
2061         sb->s_writers.frozen = SB_FREEZE_WRITE;
2062         /* Release s_umount to preserve sb_start_write -> s_umount ordering */
2063         super_unlock_excl(sb);
2064         sb_wait_write(sb, SB_FREEZE_WRITE);
2065         __super_lock_excl(sb);
2066
2067         /* Now we go and block page faults... */
2068         sb->s_writers.frozen = SB_FREEZE_PAGEFAULT;
2069         sb_wait_write(sb, SB_FREEZE_PAGEFAULT);
2070
2071         /* All writers are done so after syncing there won't be dirty data */
2072         ret = sync_filesystem(sb);
2073         if (ret) {
2074                 sb->s_writers.frozen = SB_UNFROZEN;
2075                 sb_freeze_unlock(sb, SB_FREEZE_PAGEFAULT); /* levels taken so far */
2076                 wake_up_var(&sb->s_writers.frozen);
2077                 deactivate_locked_super(sb);
2078                 return ret;
2079         }
2080
2081         /* Now wait for internal filesystem counter */
2082         sb->s_writers.frozen = SB_FREEZE_FS;
2083         sb_wait_write(sb, SB_FREEZE_FS);
2084
2085         if (sb->s_op->freeze_fs) {
2086                 ret = sb->s_op->freeze_fs(sb);
2087                 if (ret) {
2088                         printk(KERN_ERR
2089                                 "VFS:Filesystem freeze failed\n");
2090                         sb->s_writers.frozen = SB_UNFROZEN;
2091                         sb_freeze_unlock(sb, SB_FREEZE_FS);
2092                         wake_up_var(&sb->s_writers.frozen);
2093                         deactivate_locked_super(sb);
2094                         return ret;
2095                 }
2096         }
2097         /*
2098          * For debugging purposes so that fs can warn if it sees write activity
2099          * when frozen is set to SB_FREEZE_COMPLETE, and for thaw_super().
2100          */
2101         WARN_ON_ONCE(freeze_inc(sb, who) > 1);
2102         sb->s_writers.frozen = SB_FREEZE_COMPLETE;
2103         wake_up_var(&sb->s_writers.frozen);
2104         lockdep_sb_freeze_release(sb);  /* the thaw_super() caller unlocks */
2105         super_unlock_excl(sb);
2106         return 0;
2107 }
2108 EXPORT_SYMBOL(freeze_super);
2109
2110 /*
2111  * Undoes the effect of a freeze_super() call.  If the filesystem is frozen
2112  * both by userspace and the kernel, a thaw call from either source removes
2113  * that state without releasing the other state or unlocking the filesystem.
2114  * Every return path ends in super_unlock_excl() or deactivate_locked_super().
2115  */
2116 static int thaw_super_locked(struct super_block *sb, enum freeze_holder who)
2117 {
2118         int error = -EINVAL;
2119
2120         if (sb->s_writers.frozen != SB_FREEZE_COMPLETE)
2121                 goto out_unlock;
2122
2123         /*
2124          * All freezers share a single active reference, so just unlock if
2125          * any are left.  NOTE: this path still returns the initial -EINVAL.
2126          */
2127         if (freeze_dec(sb, who))
2128                 goto out_unlock;
2129
2130         if (sb_rdonly(sb)) {
2131                 sb->s_writers.frozen = SB_UNFROZEN;
2132                 wake_up_var(&sb->s_writers.frozen);
2133                 goto out_deactivate;
2134         }
2135
2136         lockdep_sb_freeze_acquire(sb);
2137
2138         if (sb->s_op->unfreeze_fs) {
2139                 error = sb->s_op->unfreeze_fs(sb);
2140                 if (error) {
2141                         pr_err("VFS: Filesystem thaw failed\n");
2142                         freeze_inc(sb, who);    /* undo the freeze_dec() above */
2143                         lockdep_sb_freeze_release(sb);
2144                         goto out_unlock;
2145                 }
2146         }
2147
2148         sb->s_writers.frozen = SB_UNFROZEN;
2149         wake_up_var(&sb->s_writers.frozen);
2150         sb_freeze_unlock(sb, SB_FREEZE_FS);
2151 out_deactivate:
2152         deactivate_locked_super(sb);
2153         return 0;
2154
2155 out_unlock:
2156         super_unlock_excl(sb);
2157         return error;
2158 }
2159
2160 /**
2161  * thaw_super -- unlock filesystem
2162  * @sb: the super to thaw
2163  * @who: context that wants to thaw
2164  *
2165  * Drops @who's freeze; once none remain the fs is unlocked and writeable again.
2166  *
2167  * @who should be:
2168  * * %FREEZE_HOLDER_USERSPACE if userspace wants to thaw the fs;
2169  * * %FREEZE_HOLDER_KERNEL if the kernel wants to thaw the fs.
2170  * * %FREEZE_MAY_NEST whether nesting freeze and thaw requests is allowed
2171  *
2172  * A filesystem frozen through the block layer via several of its block
2173  * devices stays frozen until all of them are unfrozen.
2174  *
2175  * Return: -EINVAL if super_lock_excl() fails, else thaw_super_locked()'s result.
2176  */
2177 int thaw_super(struct super_block *sb, enum freeze_holder who)
2178 {
2179         if (super_lock_excl(sb))
2180                 return thaw_super_locked(sb, who);
2181
2182         WARN_ON_ONCE("Dying superblock while thawing!");
2183         return -EINVAL;
2184 }
2185 EXPORT_SYMBOL(thaw_super);
2186
2187 /*
2188  * Create the workqueue for deferred direct IO completions.  It is allocated
2189  * lazily, on first need, so that filesystems which never use it don't get one
2190  * and so that s_id can be included in the workqueue's name.
2191  * Returns 0 (also when another caller won the race) or -ENOMEM.
2192  */
2193 int sb_init_dio_done_wq(struct super_block *sb)
2194 {
2195         struct workqueue_struct *fresh;
2196
2197         fresh = alloc_workqueue("dio/%s", WQ_MEM_RECLAIM, 0, sb->s_id);
2198         if (!fresh)
2199                 return -ENOMEM;
2200
2201         /*
2202          * Several DIOs may race to create the workqueue, so install ours
2203          * atomically and only if the slot is still NULL.  If somebody else
2204          * got there first, free ours.
2205          */
2206         if (cmpxchg(&sb->s_dio_done_wq, NULL, fresh))
2207                 destroy_workqueue(fresh);
2208         return 0;
2209 }
2210 EXPORT_SYMBOL_GPL(sb_init_dio_done_wq);