fs/overlayfs/inode.c

   1 /*
   2  *
   3  * Copyright (C) 2011 Novell Inc.
   4  *
   5  * This program is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 as published by
   7  * the Free Software Foundation.
   8  */
   9
  10 #include <linux/fs.h>
  11 #include <linux/slab.h>
  12 #include <linux/cred.h>
  13 #include <linux/xattr.h>
  14 #include <linux/posix_acl.h>
  15 #include <linux/ratelimit.h>
  16 #include "overlayfs.h"
  17
  18
  19 int ovl_setattr(struct dentry *dentry, struct iattr *attr)
  20 {
  21         int err;
  22         bool full_copy_up = false;
  23         struct dentry *upperdentry;
  24         const struct cred *old_cred;
  25
  26         err = setattr_prepare(dentry, attr);
  27         if (err)
  28                 return err;
  29
  30         err = ovl_want_write(dentry);
  31         if (err)
  32                 goto out;
  33
  34         if (attr->ia_valid & ATTR_SIZE) {
  35                 struct inode *realinode = d_inode(ovl_dentry_real(dentry));
  36
  37                 err = -ETXTBSY;
  38                 if (atomic_read(&realinode->i_writecount) < 0)
  39                         goto out_drop_write;
  40
  41                 /* Truncate should trigger data copy up as well */
  42                 full_copy_up = true;
  43         }
  44
  45         if (!full_copy_up)
  46                 err = ovl_copy_up(dentry);
  47         else
  48                 err = ovl_copy_up_with_data(dentry);
  49         if (!err) {
  50                 struct inode *winode = NULL;
  51
  52                 upperdentry = ovl_dentry_upper(dentry);
  53
  54                 if (attr->ia_valid & ATTR_SIZE) {
  55                         winode = d_inode(upperdentry);
  56                         err = get_write_access(winode);
  57                         if (err)
  58                                 goto out_drop_write;
  59                 }
  60
  61                 if (attr->ia_valid & (ATTR_KILL_SUID|ATTR_KILL_SGID))
  62                         attr->ia_valid &= ~ATTR_MODE;
  63
  64                 inode_lock(upperdentry->d_inode);
  65                 old_cred = ovl_override_creds(dentry->d_sb);
  66                 err = notify_change(upperdentry, attr, NULL);
  67                 revert_creds(old_cred);
  68                 if (!err)
  69                         ovl_copyattr(upperdentry->d_inode, dentry->d_inode);
  70                 inode_unlock(upperdentry->d_inode);
  71
  72                 if (winode)
  73                         put_write_access(winode);
  74         }
  75 out_drop_write:
  76         ovl_drop_write(dentry);
  77 out:
  78         return err;
  79 }
  80
  81 static int ovl_map_dev_ino(struct dentry *dentry, struct kstat *stat,
  82                            struct ovl_layer *lower_layer)
  83 {
  84         bool samefs = ovl_same_sb(dentry->d_sb);
  85         unsigned int xinobits = ovl_xino_bits(dentry->d_sb);
  86
  87         if (samefs) {
  88                 /*
  89                  * When all layers are on the same fs, all real inode
  90                  * number are unique, so we use the overlay st_dev,
  91                  * which is friendly to du -x.
  92                  */
  93                 stat->dev = dentry->d_sb->s_dev;
  94                 return 0;
  95         } else if (xinobits) {
  96                 unsigned int shift = 64 - xinobits;
  97                 /*
  98                  * All inode numbers of underlying fs should not be using the
  99                  * high xinobits, so we use high xinobits to partition the
 100                  * overlay st_ino address space. The high bits holds the fsid
 101                  * (upper fsid is 0). This way overlay inode numbers are unique
 102                  * and all inodes use overlay st_dev. Inode numbers are also
 103                  * persistent for a given layer configuration.
 104                  */
 105                 if (stat->ino >> shift) {
 106                         pr_warn_ratelimited("overlayfs: inode number too big (%pd2, ino=%llu, xinobits=%d)\n",
 107                                             dentry, stat->ino, xinobits);
 108                 } else {
 109                         if (lower_layer)
 110                                 stat->ino |= ((u64)lower_layer->fsid) << shift;
 111
 112                         stat->dev = dentry->d_sb->s_dev;
 113                         return 0;
 114                 }
 115         }
 116
 117         /* The inode could not be mapped to a unified st_ino address space */
 118         if (S_ISDIR(dentry->d_inode->i_mode)) {
 119                 /*
 120                  * Always use the overlay st_dev for directories, so 'find
 121                  * -xdev' will scan the entire overlay mount and won't cross the
 122                  * overlay mount boundaries.
 123                  *
 124                  * If not all layers are on the same fs the pair {real st_ino;
 125                  * overlay st_dev} is not unique, so use the non persistent
 126                  * overlay st_ino for directories.
 127                  */
 128                 stat->dev = dentry->d_sb->s_dev;
 129                 stat->ino = dentry->d_inode->i_ino;
 130         } else if (lower_layer && lower_layer->fsid) {
 131                 /*
 132                  * For non-samefs setup, if we cannot map all layers st_ino
 133                  * to a unified address space, we need to make sure that st_dev
 134                  * is unique per lower fs. Upper layer uses real st_dev and
 135                  * lower layers use the unique anonymous bdev assigned to the
 136                  * lower fs.
 137                  */
 138                 stat->dev = lower_layer->fs->pseudo_dev;
 139         }
 140
 141         return 0;
 142 }
 143
 144 int ovl_getattr(const struct path *path, struct kstat *stat,
 145                 u32 request_mask, unsigned int flags)
 146 {
 147         struct dentry *dentry = path->dentry;
 148         enum ovl_path_type type;
 149         struct path realpath;
 150         const struct cred *old_cred;
 151         bool is_dir = S_ISDIR(dentry->d_inode->i_mode);
 152         bool samefs = ovl_same_sb(dentry->d_sb);
 153         struct ovl_layer *lower_layer = NULL;
 154         int err;
 155         bool metacopy_blocks = false;
 156
 157         metacopy_blocks = ovl_is_metacopy_dentry(dentry);
 158
 159         type = ovl_path_real(dentry, &realpath);
 160         old_cred = ovl_override_creds(dentry->d_sb);
 161         err = vfs_getattr(&realpath, stat, request_mask, flags);
 162         if (err)
 163                 goto out;
 164
 165         /*
 166          * For non-dir or same fs, we use st_ino of the copy up origin.
 167          * This guaranties constant st_dev/st_ino across copy up.
 168          * With xino feature and non-samefs, we use st_ino of the copy up
 169          * origin masked with high bits that represent the layer id.
 170          *
 171          * If lower filesystem supports NFS file handles, this also guaranties
 172          * persistent st_ino across mount cycle.
 173          */
 174         if (!is_dir || samefs || ovl_xino_bits(dentry->d_sb)) {
 175                 if (!OVL_TYPE_UPPER(type)) {
 176                         lower_layer = ovl_layer_lower(dentry);
 177                 } else if (OVL_TYPE_ORIGIN(type)) {
 178                         struct kstat lowerstat;
 179                         u32 lowermask = STATX_INO | STATX_BLOCKS |
 180                                         (!is_dir ? STATX_NLINK : 0);
 181
 182                         ovl_path_lower(dentry, &realpath);
 183                         err = vfs_getattr(&realpath, &lowerstat,
 184                                           lowermask, flags);
 185                         if (err)
 186                                 goto out;
 187
 188                         /*
 189                          * Lower hardlinks may be broken on copy up to different
 190                          * upper files, so we cannot use the lower origin st_ino
 191                          * for those different files, even for the same fs case.
 192                          *
 193                          * Similarly, several redirected dirs can point to the
 194                          * same dir on a lower layer. With the "verify_lower"
 195                          * feature, we do not use the lower origin st_ino, if
 196                          * we haven't verified that this redirect is unique.
 197                          *
 198                          * With inodes index enabled, it is safe to use st_ino
 199                          * of an indexed origin. The index validates that the
 200                          * upper hardlink is not broken and that a redirected
 201                          * dir is the only redirect to that origin.
 202                          */
 203                         if (ovl_test_flag(OVL_INDEX, d_inode(dentry)) ||
 204                             (!ovl_verify_lower(dentry->d_sb) &&
 205                              (is_dir || lowerstat.nlink == 1))) {
 206                                 stat->ino = lowerstat.ino;
 207                                 lower_layer = ovl_layer_lower(dentry);
 208                         }
 209
 210                         /*
 211                          * If we are querying a metacopy dentry and lower
 212                          * dentry is data dentry, then use the blocks we
 213                          * queried just now. We don't have to do additional
 214                          * vfs_getattr(). If lower itself is metacopy, then
 215                          * additional vfs_getattr() is unavoidable.
 216                          */
 217                         if (metacopy_blocks &&
 218                             realpath.dentry == ovl_dentry_lowerdata(dentry)) {
 219                                 stat->blocks = lowerstat.blocks;
 220                                 metacopy_blocks = false;
 221                         }
 222                 }
 223
 224                 if (metacopy_blocks) {
 225                         /*
 226                          * If lower is not same as lowerdata or if there was
 227                          * no origin on upper, we can end up here.
 228                          */
 229                         struct kstat lowerdatastat;
 230                         u32 lowermask = STATX_BLOCKS;
 231
 232                         ovl_path_lowerdata(dentry, &realpath);
 233                         err = vfs_getattr(&realpath, &lowerdatastat,
 234                                           lowermask, flags);
 235                         if (err)
 236                                 goto out;
 237                         stat->blocks = lowerdatastat.blocks;
 238                 }
 239         }
 240
 241         err = ovl_map_dev_ino(dentry, stat, lower_layer);
 242         if (err)
 243                 goto out;
 244
 245         /*
 246          * It's probably not worth it to count subdirs to get the
 247          * correct link count.  nlink=1 seems to pacify 'find' and
 248          * other utilities.
 249          */
 250         if (is_dir && OVL_TYPE_MERGE(type))
 251                 stat->nlink = 1;
 252
 253         /*
 254          * Return the overlay inode nlinks for indexed upper inodes.
 255          * Overlay inode nlink counts the union of the upper hardlinks
 256          * and non-covered lower hardlinks. It does not include the upper
 257          * index hardlink.
 258          */
 259         if (!is_dir && ovl_test_flag(OVL_INDEX, d_inode(dentry)))
 260                 stat->nlink = dentry->d_inode->i_nlink;
 261
 262 out:
 263         revert_creds(old_cred);
 264
 265         return err;
 266 }
 267
 268 int ovl_permission(struct inode *inode, int mask)
 269 {
 270         struct inode *upperinode = ovl_inode_upper(inode);
 271         struct inode *realinode = upperinode ?: ovl_inode_lower(inode);
 272         const struct cred *old_cred;
 273         int err;
 274
 275         /* Careful in RCU walk mode */
 276         if (!realinode) {
 277                 WARN_ON(!(mask & MAY_NOT_BLOCK));
 278                 return -ECHILD;
 279         }
 280
 281         /*
 282          * Check overlay inode with the creds of task and underlying inode
 283          * with creds of mounter
 284          */
 285         err = generic_permission(inode, mask);
 286         if (err)
 287                 return err;
 288
 289         /* No need to do any access on underlying for special files */
 290         if (special_file(realinode->i_mode))
 291                 return 0;
 292
 293         /* No need to access underlying for execute */
 294         mask &= ~MAY_EXEC;
 295         if ((mask & (MAY_READ | MAY_WRITE)) == 0)
 296                 return 0;
 297
 298         /* Lower files get copied up, so turn write access into read */
 299         if (!upperinode && mask & MAY_WRITE) {
 300                 mask &= ~(MAY_WRITE | MAY_APPEND);
 301                 mask |= MAY_READ;
 302         }
 303
 304         old_cred = ovl_override_creds(inode->i_sb);
 305         err = inode_permission(realinode, mask);
 306         revert_creds(old_cred);
 307
 308         return err;
 309 }
 310
 311 static const char *ovl_get_link(struct dentry *dentry,
 312                                 struct inode *inode,
 313                                 struct delayed_call *done)
 314 {
 315         const struct cred *old_cred;
 316         const char *p;
 317
 318         if (!dentry)
 319                 return ERR_PTR(-ECHILD);
 320
 321         old_cred = ovl_override_creds(dentry->d_sb);
 322         p = vfs_get_link(ovl_dentry_real(dentry), done);
 323         revert_creds(old_cred);
 324         return p;
 325 }
 326
 327 bool ovl_is_private_xattr(const char *name)
 328 {
 329         return strncmp(name, OVL_XATTR_PREFIX,
 330                        sizeof(OVL_XATTR_PREFIX) - 1) == 0;
 331 }
 332
 333 int ovl_xattr_set(struct dentry *dentry, struct inode *inode, const char *name,
 334                   const void *value, size_t size, int flags)
 335 {
 336         int err;
 337         struct dentry *upperdentry = ovl_i_dentry_upper(inode);
 338         struct dentry *realdentry = upperdentry ?: ovl_dentry_lower(dentry);
 339         const struct cred *old_cred;
 340
 341         err = ovl_want_write(dentry);
 342         if (err)
 343                 goto out;
 344
 345         if (!value && !upperdentry) {
 346                 err = vfs_getxattr(realdentry, name, NULL, 0);
 347                 if (err < 0)
 348                         goto out_drop_write;
 349         }
 350
 351         if (!upperdentry) {
 352                 err = ovl_copy_up(dentry);
 353                 if (err)
 354                         goto out_drop_write;
 355
 356                 realdentry = ovl_dentry_upper(dentry);
 357         }
 358
 359         old_cred = ovl_override_creds(dentry->d_sb);
 360         if (value)
 361                 err = vfs_setxattr(realdentry, name, value, size, flags);
 362         else {
 363                 WARN_ON(flags != XATTR_REPLACE);
 364                 err = vfs_removexattr(realdentry, name);
 365         }
 366         revert_creds(old_cred);
 367
 368         /* copy c/mtime */
 369         ovl_copyattr(d_inode(realdentry), inode);
 370
 371 out_drop_write:
 372         ovl_drop_write(dentry);
 373 out:
 374         return err;
 375 }
 376
 377 int ovl_xattr_get(struct dentry *dentry, struct inode *inode, const char *name,
 378                   void *value, size_t size)
 379 {
 380         ssize_t res;
 381         const struct cred *old_cred;
 382         struct dentry *realdentry =
 383                 ovl_i_dentry_upper(inode) ?: ovl_dentry_lower(dentry);
 384
 385         old_cred = ovl_override_creds(dentry->d_sb);
 386         res = vfs_getxattr(realdentry, name, value, size);
 387         revert_creds(old_cred);
 388         return res;
 389 }
 390
 391 static bool ovl_can_list(const char *s)
 392 {
 393         /* List all non-trusted xatts */
 394         if (strncmp(s, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) != 0)
 395                 return true;
 396
 397         /* Never list trusted.overlay, list other trusted for superuser only */
 398         return !ovl_is_private_xattr(s) && capable(CAP_SYS_ADMIN);
 399 }
 400
 401 ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size)
 402 {
 403         struct dentry *realdentry = ovl_dentry_real(dentry);
 404         ssize_t res;
 405         size_t len;
 406         char *s;
 407         const struct cred *old_cred;
 408
 409         old_cred = ovl_override_creds(dentry->d_sb);
 410         res = vfs_listxattr(realdentry, list, size);
 411         revert_creds(old_cred);
 412         if (res <= 0 || size == 0)
 413                 return res;
 414
 415         /* filter out private xattrs */
 416         for (s = list, len = res; len;) {
 417                 size_t slen = strnlen(s, len) + 1;
 418
 419                 /* underlying fs providing us with an broken xattr list? */
 420                 if (WARN_ON(slen > len))
 421                         return -EIO;
 422
 423                 len -= slen;
 424                 if (!ovl_can_list(s)) {
 425                         res -= slen;
 426                         memmove(s, s + slen, len);
 427                 } else {
 428                         s += slen;
 429                 }
 430         }
 431
 432         return res;
 433 }
 434
 435 struct posix_acl *ovl_get_acl(struct inode *inode, int type)
 436 {
 437         struct inode *realinode = ovl_inode_real(inode);
 438         const struct cred *old_cred;
 439         struct posix_acl *acl;
 440
 441         if (!IS_ENABLED(CONFIG_FS_POSIX_ACL) || !IS_POSIXACL(realinode))
 442                 return NULL;
 443
 444         old_cred = ovl_override_creds(inode->i_sb);
 445         acl = get_acl(realinode, type);
 446         revert_creds(old_cred);
 447
 448         return acl;
 449 }
 450
 451 int ovl_update_time(struct inode *inode, struct timespec64 *ts, int flags)
 452 {
 453         if (flags & S_ATIME) {
 454                 struct ovl_fs *ofs = inode->i_sb->s_fs_info;
 455                 struct path upperpath = {
 456                         .mnt = ofs->upper_mnt,
 457                         .dentry = ovl_upperdentry_dereference(OVL_I(inode)),
 458                 };
 459
 460                 if (upperpath.dentry) {
 461                         touch_atime(&upperpath);
 462                         inode->i_atime = d_inode(upperpath.dentry)->i_atime;
 463                 }
 464         }
 465         return 0;
 466 }
 467
 468 static int ovl_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 469                       u64 start, u64 len)
 470 {
 471         int err;
 472         struct inode *realinode = ovl_inode_real(inode);
 473         const struct cred *old_cred;
 474
 475         if (!realinode->i_op->fiemap)
 476                 return -EOPNOTSUPP;
 477
 478         old_cred = ovl_override_creds(inode->i_sb);
 479
 480         if (fieinfo->fi_flags & FIEMAP_FLAG_SYNC)
 481                 filemap_write_and_wait(realinode->i_mapping);
 482
 483         err = realinode->i_op->fiemap(realinode, fieinfo, start, len);
 484         revert_creds(old_cred);
 485
 486         return err;
 487 }
 488
 489 static const struct inode_operations ovl_file_inode_operations = {
 490         .setattr        = ovl_setattr,
 491         .permission     = ovl_permission,
 492         .getattr        = ovl_getattr,
 493         .listxattr      = ovl_listxattr,
 494         .get_acl        = ovl_get_acl,
 495         .update_time    = ovl_update_time,
 496         .fiemap         = ovl_fiemap,
 497 };
 498
 499 static const struct inode_operations ovl_symlink_inode_operations = {
 500         .setattr        = ovl_setattr,
 501         .get_link       = ovl_get_link,
 502         .getattr        = ovl_getattr,
 503         .listxattr      = ovl_listxattr,
 504         .update_time    = ovl_update_time,
 505 };
 506
 507 static const struct inode_operations ovl_special_inode_operations = {
 508         .setattr        = ovl_setattr,
 509         .permission     = ovl_permission,
 510         .getattr        = ovl_getattr,
 511         .listxattr      = ovl_listxattr,
 512         .get_acl        = ovl_get_acl,
 513         .update_time    = ovl_update_time,
 514 };
 515
  516 static const struct address_space_operations ovl_aops = {
              /*
               * Installed as i_mapping->a_ops of S_IFREG overlay inodes by
               * ovl_fill_inode().  Only ->direct_IO is provided.
               */
  517         /* For O_DIRECT dentry_open() checks f_mapping->a_ops->direct_IO */
  518         .direct_IO              = noop_direct_IO,
  519 };
 520
 521 /*
 522  * It is possible to stack overlayfs instance on top of another
  523  * overlayfs instance as lower layer. We need to annotate the
 524  * stackable i_mutex locks according to stack level of the super
 525  * block instance. An overlayfs instance can never be in stack
 526  * depth 0 (there is always a real fs below it).  An overlayfs
  527  * inode lock will use the lockdep annotation ovl_i_mutex_key[depth].
 528  *
 529  * For example, here is a snip from /proc/lockdep_chains after
 530  * dir_iterate of nested overlayfs:
 531  *
 532  * [...] &ovl_i_mutex_dir_key[depth]   (stack_depth=2)
 533  * [...] &ovl_i_mutex_dir_key[depth]#2 (stack_depth=1)
 534  * [...] &type->i_mutex_dir_key        (stack_depth=0)
 535  */
 536 #define OVL_MAX_NESTING FILESYSTEM_MAX_STACK_DEPTH
 537
 538 static inline void ovl_lockdep_annotate_inode_mutex_key(struct inode *inode)
 539 {
 540 #ifdef CONFIG_LOCKDEP
 541         static struct lock_class_key ovl_i_mutex_key[OVL_MAX_NESTING];
 542         static struct lock_class_key ovl_i_mutex_dir_key[OVL_MAX_NESTING];
 543         static struct lock_class_key ovl_i_lock_key[OVL_MAX_NESTING];
 544
 545         int depth = inode->i_sb->s_stack_depth - 1;
 546
 547         if (WARN_ON_ONCE(depth < 0 || depth >= OVL_MAX_NESTING))
 548                 depth = 0;
 549
 550         if (S_ISDIR(inode->i_mode))
 551                 lockdep_set_class(&inode->i_rwsem, &ovl_i_mutex_dir_key[depth]);
 552         else
 553                 lockdep_set_class(&inode->i_rwsem, &ovl_i_mutex_key[depth]);
 554
 555         lockdep_set_class(&OVL_I(inode)->lock, &ovl_i_lock_key[depth]);
 556 #endif
 557 }
 558
 559 static void ovl_fill_inode(struct inode *inode, umode_t mode, dev_t rdev,
 560                            unsigned long ino, int fsid)
 561 {
 562         int xinobits = ovl_xino_bits(inode->i_sb);
 563
 564         /*
 565          * When NFS export is enabled and d_ino is consistent with st_ino
 566          * (samefs or i_ino has enough bits to encode layer), set the same
 567          * value used for d_ino to i_ino, because nfsd readdirplus compares
 568          * d_ino values to i_ino values of child entries. When called from
 569          * ovl_new_inode(), ino arg is 0, so i_ino will be updated to real
 570          * upper inode i_ino on ovl_inode_init() or ovl_inode_update().
 571          */
 572         if (inode->i_sb->s_export_op &&
 573             (ovl_same_sb(inode->i_sb) || xinobits)) {
 574                 inode->i_ino = ino;
 575                 if (xinobits && fsid && !(ino >> (64 - xinobits)))
 576                         inode->i_ino |= (unsigned long)fsid << (64 - xinobits);
 577         } else {
 578                 inode->i_ino = get_next_ino();
 579         }
 580         inode->i_mode = mode;
 581         inode->i_flags |= S_NOCMTIME;
 582 #ifdef CONFIG_FS_POSIX_ACL
 583         inode->i_acl = inode->i_default_acl = ACL_DONT_CACHE;
 584 #endif
 585
 586         ovl_lockdep_annotate_inode_mutex_key(inode);
 587
 588         switch (mode & S_IFMT) {
 589         case S_IFREG:
 590                 inode->i_op = &ovl_file_inode_operations;
 591                 inode->i_fop = &ovl_file_operations;
 592                 inode->i_mapping->a_ops = &ovl_aops;
 593                 break;
 594
 595         case S_IFDIR:
 596                 inode->i_op = &ovl_dir_inode_operations;
 597                 inode->i_fop = &ovl_dir_operations;
 598                 break;
 599
 600         case S_IFLNK:
 601                 inode->i_op = &ovl_symlink_inode_operations;
 602                 break;
 603
 604         default:
 605                 inode->i_op = &ovl_special_inode_operations;
 606                 init_special_inode(inode, mode, rdev);
 607                 break;
 608         }
 609 }
 610
  611 /*
  612  * With inodes index enabled, an overlay inode nlink counts the union of upper
  613  * hardlinks and non-covered lower hardlinks. During the lifetime of a non-pure
  614  * upper inode, the following nlink modifying operations can happen:
  615  *
  616  * 1. Lower hardlink copy up
  617  * 2. Upper hardlink created, unlinked or renamed over
  618  * 3. Lower hardlink whiteout or renamed over
  619  *
  620  * For the first, copy up case, the union nlink does not change, whether the
  621  * operation succeeds or fails, but the upper inode nlink may change.
  622  * Therefore, before copy up, we store the union nlink value relative to the
  623  * lower inode nlink in the index inode xattr trusted.overlay.nlink.
  624  *
  625  * For the second, upper hardlink case, the union nlink should be incremented
  626  * or decremented IFF the operation succeeds, aligned with nlink change of the
  627  * upper inode. Therefore, before link/unlink/rename, we store the union nlink
  628  * value relative to the upper inode nlink in the index inode.
  629  *
  630  * For the last, lower cover up case, we simplify things by preceding the
  631  * whiteout or cover up with copy up. This makes sure that there is an index
  632  * upper inode where the nlink xattr can be stored before the copied up upper
  633  * entry is unlinked.
       *
       * OVL_NLINK_ADD_UPPER is a single flag bit (bit 0).
       * NOTE(review): nothing in the visible part of this file references
       * it - confirm whether it still has users before relying on it.
  634  */
  635 #define OVL_NLINK_ADD_UPPER     (1 << 0)
 636
 637 /*
 638  * On-disk format for indexed nlink:
 639  *
 640  * nlink relative to the upper inode - "U[+-]NUM"
 641  * nlink relative to the lower inode - "L[+-]NUM"
 642  */
 643
 644 static int ovl_set_nlink_common(struct dentry *dentry,
 645                                 struct dentry *realdentry, const char *format)
 646 {
 647         struct inode *inode = d_inode(dentry);
 648         struct inode *realinode = d_inode(realdentry);
 649         char buf[13];
 650         int len;
 651
 652         len = snprintf(buf, sizeof(buf), format,
 653                        (int) (inode->i_nlink - realinode->i_nlink));
 654
 655         if (WARN_ON(len >= sizeof(buf)))
 656                 return -EIO;
 657
 658         return ovl_do_setxattr(ovl_dentry_upper(dentry),
 659                                OVL_XATTR_NLINK, buf, len, 0);
 660 }
 661
 662 int ovl_set_nlink_upper(struct dentry *dentry)
 663 {
 664         return ovl_set_nlink_common(dentry, ovl_dentry_upper(dentry), "U%+i");
 665 }
 666
 667 int ovl_set_nlink_lower(struct dentry *dentry)
 668 {
 669         return ovl_set_nlink_common(dentry, ovl_dentry_lower(dentry), "L%+i");
 670 }
 671
 672 unsigned int ovl_get_nlink(struct dentry *lowerdentry,
 673                            struct dentry *upperdentry,
 674                            unsigned int fallback)
 675 {
 676         int nlink_diff;
 677         int nlink;
 678         char buf[13];
 679         int err;
 680
 681         if (!lowerdentry || !upperdentry || d_inode(lowerdentry)->i_nlink == 1)
 682                 return fallback;
 683
 684         err = vfs_getxattr(upperdentry, OVL_XATTR_NLINK, &buf, sizeof(buf) - 1);
 685         if (err < 0)
 686                 goto fail;
 687
 688         buf[err] = '\0';
 689         if ((buf[0] != 'L' && buf[0] != 'U') ||
 690             (buf[1] != '+' && buf[1] != '-'))
 691                 goto fail;
 692
 693         err = kstrtoint(buf + 1, 10, &nlink_diff);
 694         if (err < 0)
 695                 goto fail;
 696
 697         nlink = d_inode(buf[0] == 'L' ? lowerdentry : upperdentry)->i_nlink;
 698         nlink += nlink_diff;
 699
 700         if (nlink <= 0)
 701                 goto fail;
 702
 703         return nlink;
 704
 705 fail:
 706         pr_warn_ratelimited("overlayfs: failed to get index nlink (%pd2, err=%i)\n",
 707                             upperdentry, err);
 708         return fallback;
 709 }
 710
 711 struct inode *ovl_new_inode(struct super_block *sb, umode_t mode, dev_t rdev)
 712 {
 713         struct inode *inode;
 714
 715         inode = new_inode(sb);
 716         if (inode)
 717                 ovl_fill_inode(inode, mode, rdev, 0, 0);
 718
 719         return inode;
 720 }
 721
 722 static int ovl_inode_test(struct inode *inode, void *data)
 723 {
 724         return inode->i_private == data;
 725 }
 726
 727 static int ovl_inode_set(struct inode *inode, void *data)
 728 {
 729         inode->i_private = data;
 730         return 0;
 731 }
 732
 733 static bool ovl_verify_inode(struct inode *inode, struct dentry *lowerdentry,
 734                              struct dentry *upperdentry, bool strict)
 735 {
 736         /*
 737          * For directories, @strict verify from lookup path performs consistency
 738          * checks, so NULL lower/upper in dentry must match NULL lower/upper in
 739          * inode. Non @strict verify from NFS handle decode path passes NULL for
 740          * 'unknown' lower/upper.
 741          */
 742         if (S_ISDIR(inode->i_mode) && strict) {
 743                 /* Real lower dir moved to upper layer under us? */
 744                 if (!lowerdentry && ovl_inode_lower(inode))
 745                         return false;
 746
 747                 /* Lookup of an uncovered redirect origin? */
 748                 if (!upperdentry && ovl_inode_upper(inode))
 749                         return false;
 750         }
 751
 752         /*
 753          * Allow non-NULL lower inode in ovl_inode even if lowerdentry is NULL.
 754          * This happens when finding a copied up overlay inode for a renamed
 755          * or hardlinked overlay dentry and lower dentry cannot be followed
 756          * by origin because lower fs does not support file handles.
 757          */
 758         if (lowerdentry && ovl_inode_lower(inode) != d_inode(lowerdentry))
 759                 return false;
 760
 761         /*
 762          * Allow non-NULL __upperdentry in inode even if upperdentry is NULL.
 763          * This happens when finding a lower alias for a copied up hard link.
 764          */
 765         if (upperdentry && ovl_inode_upper(inode) != d_inode(upperdentry))
 766                 return false;
 767
 768         return true;
 769 }
 770
 771 struct inode *ovl_lookup_inode(struct super_block *sb, struct dentry *real,
 772                                bool is_upper)
 773 {
 774         struct inode *inode, *key = d_inode(real);
 775
 776         inode = ilookup5(sb, (unsigned long) key, ovl_inode_test, key);
 777         if (!inode)
 778                 return NULL;
 779
 780         if (!ovl_verify_inode(inode, is_upper ? NULL : real,
 781                               is_upper ? real : NULL, false)) {
 782                 iput(inode);
 783                 return ERR_PTR(-ESTALE);
 784         }
 785
 786         return inode;
 787 }
 788
 789 /*
 790  * Does overlay inode need to be hashed by lower inode?
 791  */
 792 static bool ovl_hash_bylower(struct super_block *sb, struct dentry *upper,
 793                              struct dentry *lower, struct dentry *index)
 794 {
 795         struct ovl_fs *ofs = sb->s_fs_info;
 796
 797         /* No, if pure upper */
 798         if (!lower)
 799                 return false;
 800
 801         /* Yes, if already indexed */
 802         if (index)
 803                 return true;
 804
 805         /* Yes, if won't be copied up */
 806         if (!ofs->upper_mnt)
 807                 return true;
 808
 809         /* No, if lower hardlink is or will be broken on copy up */
 810         if ((upper || !ovl_indexdir(sb)) &&
 811             !d_is_dir(lower) && d_inode(lower)->i_nlink > 1)
 812                 return false;
 813
 814         /* No, if non-indexed upper with NFS export */
 815         if (sb->s_export_op && upper)
 816                 return false;
 817
 818         /* Otherwise, hash by lower inode for fsnotify */
 819         return true;
 820 }
 821
 822 static struct inode *ovl_iget5(struct super_block *sb, struct inode *newinode,
 823                                struct inode *key)
 824 {
 825         return newinode ? inode_insert5(newinode, (unsigned long) key,
 826                                          ovl_inode_test, ovl_inode_set, key) :
 827                           iget5_locked(sb, (unsigned long) key,
 828                                        ovl_inode_test, ovl_inode_set, key);
 829 }
 830
 831 struct inode *ovl_get_inode(struct super_block *sb,
 832                             struct ovl_inode_params *oip)
 833 {
 834         struct dentry *upperdentry = oip->upperdentry;
 835         struct ovl_path *lowerpath = oip->lowerpath;
 836         struct inode *realinode = upperdentry ? d_inode(upperdentry) : NULL;
 837         struct inode *inode;
 838         struct dentry *lowerdentry = lowerpath ? lowerpath->dentry : NULL;
 839         bool bylower = ovl_hash_bylower(sb, upperdentry, lowerdentry,
 840                                         oip->index);
 841         int fsid = bylower ? oip->lowerpath->layer->fsid : 0;
 842         bool is_dir, metacopy = false;
 843         unsigned long ino = 0;
 844         int err = -ENOMEM;
 845
 846         if (!realinode)
 847                 realinode = d_inode(lowerdentry);
 848
 849         /*
 850          * Copy up origin (lower) may exist for non-indexed upper, but we must
 851          * not use lower as hash key if this is a broken hardlink.
 852          */
 853         is_dir = S_ISDIR(realinode->i_mode);
 854         if (upperdentry || bylower) {
 855                 struct inode *key = d_inode(bylower ? lowerdentry :
 856                                                       upperdentry);
 857                 unsigned int nlink = is_dir ? 1 : realinode->i_nlink;
 858
 859                 inode = ovl_iget5(sb, oip->newinode, key);
 860                 if (!inode)
 861                         goto out_err;
 862                 if (!(inode->i_state & I_NEW)) {
 863                         /*
 864                          * Verify that the underlying files stored in the inode
 865                          * match those in the dentry.
 866                          */
 867                         if (!ovl_verify_inode(inode, lowerdentry, upperdentry,
 868                                               true)) {
 869                                 iput(inode);
 870                                 err = -ESTALE;
 871                                 goto out_err;
 872                         }
 873
 874                         dput(upperdentry);
 875                         kfree(oip->redirect);
 876                         goto out;
 877                 }
 878
 879                 /* Recalculate nlink for non-dir due to indexing */
 880                 if (!is_dir)
 881                         nlink = ovl_get_nlink(lowerdentry, upperdentry, nlink);
 882                 set_nlink(inode, nlink);
 883                 ino = key->i_ino;
 884         } else {
 885                 /* Lower hardlink that will be broken on copy up */
 886                 inode = new_inode(sb);
 887                 if (!inode) {
 888                         err = -ENOMEM;
 889                         goto out_err;
 890                 }
 891         }
 892         ovl_fill_inode(inode, realinode->i_mode, realinode->i_rdev, ino, fsid);
 893         ovl_inode_init(inode, upperdentry, lowerdentry, oip->lowerdata);
 894
 895         if (upperdentry && ovl_is_impuredir(upperdentry))
 896                 ovl_set_flag(OVL_IMPURE, inode);
 897
 898         if (oip->index)
 899                 ovl_set_flag(OVL_INDEX, inode);
 900
 901         if (upperdentry) {
 902                 err = ovl_check_metacopy_xattr(upperdentry);
 903                 if (err < 0)
 904                         goto out_err;
 905                 metacopy = err;
 906                 if (!metacopy)
 907                         ovl_set_flag(OVL_UPPERDATA, inode);
 908         }
 909
 910         OVL_I(inode)->redirect = oip->redirect;
 911
 912         if (bylower)
 913                 ovl_set_flag(OVL_CONST_INO, inode);
 914
 915         /* Check for non-merge dir that may have whiteouts */
 916         if (is_dir) {
 917                 if (((upperdentry && lowerdentry) || oip->numlower > 1) ||
 918                     ovl_check_origin_xattr(upperdentry ?: lowerdentry)) {
 919                         ovl_set_flag(OVL_WHITEOUTS, inode);
 920                 }
 921         }
 922
 923         if (inode->i_state & I_NEW)
 924                 unlock_new_inode(inode);
 925 out:
 926         return inode;
 927
 928 out_err:
 929         inode = ERR_PTR(err);
 930         goto out;
 931 }