--- /dev/null
+From 5974ea7ce0f9a5987fc8cf5e08ad6e3e70bb542e Mon Sep 17 00:00:00 2001
+From: Sungup Moon <sungup.moon@samsung.com>
+Date: Mon, 14 Mar 2022 20:05:45 +0900
+Subject: nvme: allow duplicate NSIDs for private namespaces
+
+From: Sungup Moon <sungup.moon@samsung.com>
+
+commit 5974ea7ce0f9a5987fc8cf5e08ad6e3e70bb542e upstream.
+
+An NVMe subsystem with multiple controllers can have private namespaces
+that use the same NSID under some conditions:
+
+ "If Namespace Management, ANA Reporting, or NVM Sets are supported, the
+ NSIDs shall be unique within the NVM subsystem. If the Namespace
+ Management, ANA Reporting, and NVM Sets are not supported, then NSIDs:
+ a) for shared namespace shall be unique; and
+ b) for private namespace are not required to be unique."
+
+Reference: Section 6.1.6 NSID and Namespace Usage; NVM Express 1.4c spec.
+
+Make sure this specific setup is supported in Linux.
+
+Fixes: 9ad1927a3bc2 ("nvme: always search for namespace head")
+Signed-off-by: Sungup Moon <sungup.moon@samsung.com>
+[hch: refactored and fixed the controller vs subsystem based naming
+ conflict]
+Signed-off-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/nvme/host/core.c | 15 ++++++++++-----
+ drivers/nvme/host/multipath.c | 7 ++++---
+ drivers/nvme/host/nvme.h | 19 +++++++++++++++++++
+ include/linux/nvme.h | 1 +
+ 4 files changed, 34 insertions(+), 8 deletions(-)
+
+--- a/drivers/nvme/host/core.c
++++ b/drivers/nvme/host/core.c
+@@ -3510,15 +3510,20 @@ static const struct attribute_group *nvm
+ NULL,
+ };
+
+-static struct nvme_ns_head *nvme_find_ns_head(struct nvme_subsystem *subsys,
++static struct nvme_ns_head *nvme_find_ns_head(struct nvme_ctrl *ctrl,
+ unsigned nsid)
+ {
+ struct nvme_ns_head *h;
+
+- lockdep_assert_held(&subsys->lock);
++ lockdep_assert_held(&ctrl->subsys->lock);
+
+- list_for_each_entry(h, &subsys->nsheads, entry) {
+- if (h->ns_id != nsid)
++ list_for_each_entry(h, &ctrl->subsys->nsheads, entry) {
++ /*
++ * Private namespaces can share NSIDs under some conditions.
++ * In that case we can't use the same ns_head for namespaces
++ * with the same NSID.
++ */
++ if (h->ns_id != nsid || !nvme_is_unique_nsid(ctrl, h))
+ continue;
+ if (!list_empty(&h->list) && nvme_tryget_ns_head(h))
+ return h;
+@@ -3686,7 +3691,7 @@ static int nvme_init_ns_head(struct nvme
+ int ret = 0;
+
+ mutex_lock(&ctrl->subsys->lock);
+- head = nvme_find_ns_head(ctrl->subsys, nsid);
++ head = nvme_find_ns_head(ctrl, nsid);
+ if (!head) {
+ head = nvme_alloc_ns_head(ctrl, nsid, ids);
+ if (IS_ERR(head)) {
+--- a/drivers/nvme/host/multipath.c
++++ b/drivers/nvme/host/multipath.c
+@@ -462,10 +462,11 @@ int nvme_mpath_alloc_disk(struct nvme_ct
+
+ /*
+ * Add a multipath node if the subsystems supports multiple controllers.
+- * We also do this for private namespaces as the namespace sharing data could
+- * change after a rescan.
++ * We also do this for private namespaces as the namespace sharing flag
++ * could change after a rescan.
+ */
+- if (!(ctrl->subsys->cmic & NVME_CTRL_CMIC_MULTI_CTRL) || !multipath)
++ if (!(ctrl->subsys->cmic & NVME_CTRL_CMIC_MULTI_CTRL) ||
++ !nvme_is_unique_nsid(ctrl, head) || !multipath)
+ return 0;
+
+ head->disk = blk_alloc_disk(ctrl->numa_node);
+--- a/drivers/nvme/host/nvme.h
++++ b/drivers/nvme/host/nvme.h
+@@ -693,6 +693,25 @@ static inline bool nvme_check_ready(stru
+ return true;
+ return __nvme_check_ready(ctrl, rq, queue_live);
+ }
++
++/*
++ * NSID shall be unique for all shared namespaces, or if at least one of the
++ * following conditions is met:
++ * 1. Namespace Management is supported by the controller
++ * 2. ANA is supported by the controller
++ * 3. NVM Set are supported by the controller
++ *
++ * In other case, private namespace are not required to report a unique NSID.
++ */
++static inline bool nvme_is_unique_nsid(struct nvme_ctrl *ctrl,
++ struct nvme_ns_head *head)
++{
++ return head->shared ||
++ (ctrl->oacs & NVME_CTRL_OACS_NS_MNGT_SUPP) ||
++ (ctrl->subsys->cmic & NVME_CTRL_CMIC_ANA) ||
++ (ctrl->ctratt & NVME_CTRL_CTRATT_NVM_SETS);
++}
++
+ int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
+ void *buf, unsigned bufflen);
+ int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
+--- a/include/linux/nvme.h
++++ b/include/linux/nvme.h
+@@ -322,6 +322,7 @@ enum {
+ NVME_CTRL_ONCS_TIMESTAMP = 1 << 6,
+ NVME_CTRL_VWC_PRESENT = 1 << 0,
+ NVME_CTRL_OACS_SEC_SUPP = 1 << 0,
++ NVME_CTRL_OACS_NS_MNGT_SUPP = 1 << 3,
+ NVME_CTRL_OACS_DIRECTIVES = 1 << 5,
+ NVME_CTRL_OACS_DBBUF_SUPP = 1 << 8,
+ NVME_CTRL_LPA_CMD_EFFECTS_LOG = 1 << 1,
--- /dev/null
+From 726be2c72efc0a64c206e854b8996ad3ab9c7507 Mon Sep 17 00:00:00 2001
+From: Pankaj Raghav <p.raghav@samsung.com>
+Date: Tue, 22 Mar 2022 10:20:48 +0100
+Subject: nvme: fix the read-only state for zoned namespaces with unsupposed features
+
+From: Pankaj Raghav <p.raghav@samsung.com>
+
+commit 726be2c72efc0a64c206e854b8996ad3ab9c7507 upstream.
+
+commit 2f4c9ba23b88 ("nvme: export zoned namespaces without Zone Append
+support read-only") marks zoned namespaces without append support
+read-only. It does so by setting NVME_NS_FORCE_RO in ns->flags in
+nvme_update_zone_info and checking for that flag later in
+nvme_update_disk_info to mark the disk as read-only.
+
+But commit 73d90386b559 ("nvme: cleanup zone information initialization")
+rearranged nvme_update_disk_info to be called before
+nvme_update_zone_info and thus not marking the disk as read-only.
+The call order cannot be just reverted because nvme_update_zone_info sets
+certain queue parameters such as zone_write_granularity that depend on the
+prior call to nvme_update_disk_info.
+
+Remove the call to set_disk_ro in nvme_update_disk_info and call
+set_disk_ro after nvme_update_zone_info and nvme_update_disk_info to set
+the permission for ZNS drives correctly. The same applies to the
+multipath disk path.
+
+Fixes: 73d90386b559 ("nvme: cleanup zone information initialization")
+Signed-off-by: Pankaj Raghav <p.raghav@samsung.com>
+Signed-off-by: Christoph Hellwig <hch@lst.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/nvme/host/core.c | 8 +++++---
+ 1 file changed, 5 insertions(+), 3 deletions(-)
+
+--- a/drivers/nvme/host/core.c
++++ b/drivers/nvme/host/core.c
+@@ -1838,9 +1838,6 @@ static void nvme_update_disk_info(struct
+ nvme_config_discard(disk, ns);
+ blk_queue_max_write_zeroes_sectors(disk->queue,
+ ns->ctrl->max_zeroes_sectors);
+-
+- set_disk_ro(disk, (id->nsattr & NVME_NS_ATTR_RO) ||
+- test_bit(NVME_NS_FORCE_RO, &ns->flags));
+ }
+
+ static inline bool nvme_first_scan(struct gendisk *disk)
+@@ -1901,6 +1898,8 @@ static int nvme_update_ns_info(struct nv
+ goto out_unfreeze;
+ }
+
++ set_disk_ro(ns->disk, (id->nsattr & NVME_NS_ATTR_RO) ||
++ test_bit(NVME_NS_FORCE_RO, &ns->flags));
+ set_bit(NVME_NS_READY, &ns->flags);
+ blk_mq_unfreeze_queue(ns->disk->queue);
+
+@@ -1913,6 +1912,9 @@ static int nvme_update_ns_info(struct nv
+ if (nvme_ns_head_multipath(ns->head)) {
+ blk_mq_freeze_queue(ns->head->disk->queue);
+ nvme_update_disk_info(ns->head->disk, ns, id);
++ set_disk_ro(ns->head->disk,
++ (id->nsattr & NVME_NS_ATTR_RO) ||
++ test_bit(NVME_NS_FORCE_RO, &ns->flags));
+ nvme_mpath_revalidate_paths(ns);
+ blk_stack_limits(&ns->head->disk->queue->limits,
+ &ns->queue->limits, 0);
kvm-prevent-module-exit-until-all-vms-are-freed.patch
kvm-x86-fix-sending-pv-ipi.patch
kvm-svm-fix-panic-on-out-of-bounds-guest-irq.patch
+ubifs-rename_whiteout-fix-double-free-for-whiteout_ui-data.patch
+ubifs-fix-deadlock-in-concurrent-rename-whiteout-and-inode-writeback.patch
+ubifs-add-missing-iput-if-do_tmpfile-failed-in-rename-whiteout.patch
+ubifs-rename-whiteout-atomically.patch
+ubifs-fix-ui-dirty-race-between-do_tmpfile-and-writeback-work.patch
+ubifs-rectify-space-amount-budget-for-mkdir-tmpfile-operations.patch
+ubifs-setflags-make-dirtied_ino_d-8-bytes-aligned.patch
+ubifs-fix-read-out-of-bounds-in-ubifs_wbuf_write_nolock.patch
+ubifs-fix-to-add-refcount-once-page-is-set-private.patch
+ubifs-rename_whiteout-correct-old_dir-size-computing.patch
+nvme-allow-duplicate-nsids-for-private-namespaces.patch
+nvme-fix-the-read-only-state-for-zoned-namespaces-with-unsupposed-features.patch
--- /dev/null
+From 716b4573026bcbfa7b58ed19fe15554bac66b082 Mon Sep 17 00:00:00 2001
+From: Zhihao Cheng <chengzhihao1@huawei.com>
+Date: Mon, 27 Dec 2021 11:22:35 +0800
+Subject: ubifs: Add missing iput if do_tmpfile() failed in rename whiteout
+
+From: Zhihao Cheng <chengzhihao1@huawei.com>
+
+commit 716b4573026bcbfa7b58ed19fe15554bac66b082 upstream.
+
+whiteout inode should be put when do_tmpfile() failed if inode has been
+initialized. Otherwise we will get following warning during umount:
+ UBIFS error (ubi0:0 pid 1494): ubifs_assert_failed [ubifs]: UBIFS
+ assert failed: c->bi.dd_growth == 0, in fs/ubifs/super.c:1930
+ VFS: Busy inodes after unmount of ubifs. Self-destruct in 5 seconds.
+
+Fixes: 9e0a1fff8db56ea ("ubifs: Implement RENAME_WHITEOUT")
+Signed-off-by: Zhihao Cheng <chengzhihao1@huawei.com>
+Suggested-by: Sascha Hauer <s.hauer@pengutronix.de>
+Signed-off-by: Richard Weinberger <richard@nod.at>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/ubifs/dir.c | 2 ++
+ 1 file changed, 2 insertions(+)
+
+--- a/fs/ubifs/dir.c
++++ b/fs/ubifs/dir.c
+@@ -432,6 +432,8 @@ out_inode:
+ make_bad_inode(inode);
+ if (!instantiated)
+ iput(inode);
++ else if (whiteout)
++ iput(*whiteout);
+ out_budg:
+ ubifs_release_budget(c, &req);
+ if (!instantiated)
--- /dev/null
+From afd427048047e8efdedab30e8888044e2be5aa9c Mon Sep 17 00:00:00 2001
+From: Zhihao Cheng <chengzhihao1@huawei.com>
+Date: Mon, 27 Dec 2021 11:22:33 +0800
+Subject: ubifs: Fix deadlock in concurrent rename whiteout and inode writeback
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Zhihao Cheng <chengzhihao1@huawei.com>
+
+commit afd427048047e8efdedab30e8888044e2be5aa9c upstream.
+
+Following hung tasks:
+[ 77.028764] task:kworker/u8:4 state:D stack: 0 pid: 132
+[ 77.028820] Call Trace:
+[ 77.029027] schedule+0x8c/0x1b0
+[ 77.029067] mutex_lock+0x50/0x60
+[ 77.029074] ubifs_write_inode+0x68/0x1f0 [ubifs]
+[ 77.029117] __writeback_single_inode+0x43c/0x570
+[ 77.029128] writeback_sb_inodes+0x259/0x740
+[ 77.029148] wb_writeback+0x107/0x4d0
+[ 77.029163] wb_workfn+0x162/0x7b0
+
+[ 92.390442] task:aa state:D stack: 0 pid: 1506
+[ 92.390448] Call Trace:
+[ 92.390458] schedule+0x8c/0x1b0
+[ 92.390461] wb_wait_for_completion+0x82/0xd0
+[ 92.390469] __writeback_inodes_sb_nr+0xb2/0x110
+[ 92.390472] writeback_inodes_sb_nr+0x14/0x20
+[ 92.390476] ubifs_budget_space+0x705/0xdd0 [ubifs]
+[ 92.390503] do_rename.cold+0x7f/0x187 [ubifs]
+[ 92.390549] ubifs_rename+0x8b/0x180 [ubifs]
+[ 92.390571] vfs_rename+0xdb2/0x1170
+[ 92.390580] do_renameat2+0x554/0x770
+
+, are caused by concurrent rename whiteout and inode writeback processes:
+ rename_whiteout(Thread 1) wb_workfn(Thread2)
+ubifs_rename
+ do_rename
+ lock_4_inodes (Hold ui_mutex)
+ ubifs_budget_space
+ make_free_space
+ shrink_liability
+ __writeback_inodes_sb_nr
+ bdi_split_work_to_wbs (Queue new wb work)
+ wb_do_writeback(wb work)
+ __writeback_single_inode
+ ubifs_write_inode
+ LOCK(ui_mutex)
+ ↑
+ wb_wait_for_completion (Wait wb work) <-- deadlock!
+
+Reproducer (Detail program in [Link]):
+ 1. SYS_renameat2("/mp/dir/file", "/mp/dir/whiteout", RENAME_WHITEOUT)
+ 2. Consume out of space before kernel(mdelay) doing budget for whiteout
+
+Fix it by doing whiteout space budget before locking ubifs inodes.
+BTW, it also fixes wrong goto tag 'out_release' in whiteout budget
+error handling path(It should at least recover dir i_size and unlock
+4 ubifs inodes).
+
+Fixes: 9e0a1fff8db56ea ("ubifs: Implement RENAME_WHITEOUT")
+Link: https://bugzilla.kernel.org/show_bug.cgi?id=214733
+Signed-off-by: Zhihao Cheng <chengzhihao1@huawei.com>
+Signed-off-by: Richard Weinberger <richard@nod.at>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/ubifs/dir.c | 25 +++++++++++++++----------
+ 1 file changed, 15 insertions(+), 10 deletions(-)
+
+--- a/fs/ubifs/dir.c
++++ b/fs/ubifs/dir.c
+@@ -1324,6 +1324,7 @@ static int do_rename(struct inode *old_d
+
+ if (flags & RENAME_WHITEOUT) {
+ union ubifs_dev_desc *dev = NULL;
++ struct ubifs_budget_req wht_req;
+
+ dev = kmalloc(sizeof(union ubifs_dev_desc), GFP_NOFS);
+ if (!dev) {
+@@ -1345,6 +1346,20 @@ static int do_rename(struct inode *old_d
+ whiteout_ui->data = dev;
+ whiteout_ui->data_len = ubifs_encode_dev(dev, MKDEV(0, 0));
+ ubifs_assert(c, !whiteout_ui->dirty);
++
++ memset(&wht_req, 0, sizeof(struct ubifs_budget_req));
++ wht_req.dirtied_ino = 1;
++ wht_req.dirtied_ino_d = ALIGN(whiteout_ui->data_len, 8);
++ /*
++ * To avoid deadlock between space budget (holds ui_mutex and
++ * waits wb work) and writeback work(waits ui_mutex), do space
++ * budget before ubifs inodes locked.
++ */
++ err = ubifs_budget_space(c, &wht_req);
++ if (err) {
++ iput(whiteout);
++ goto out_release;
++ }
+ }
+
+ lock_4_inodes(old_dir, new_dir, new_inode, whiteout);
+@@ -1419,16 +1434,6 @@ static int do_rename(struct inode *old_d
+ }
+
+ if (whiteout) {
+- struct ubifs_budget_req wht_req = { .dirtied_ino = 1,
+- .dirtied_ino_d = \
+- ALIGN(ubifs_inode(whiteout)->data_len, 8) };
+-
+- err = ubifs_budget_space(c, &wht_req);
+- if (err) {
+- iput(whiteout);
+- goto out_release;
+- }
+-
+ inc_nlink(whiteout);
+ mark_inode_dirty(whiteout);
+
--- /dev/null
+From 4f2262a334641e05f645364d5ade1f565c85f20b Mon Sep 17 00:00:00 2001
+From: Zhihao Cheng <chengzhihao1@huawei.com>
+Date: Mon, 27 Dec 2021 11:22:40 +0800
+Subject: ubifs: Fix read out-of-bounds in ubifs_wbuf_write_nolock()
+
+From: Zhihao Cheng <chengzhihao1@huawei.com>
+
+commit 4f2262a334641e05f645364d5ade1f565c85f20b upstream.
+
+Function ubifs_wbuf_write_nolock() may access buf out of bounds in
+following process:
+
+ubifs_wbuf_write_nolock():
+ aligned_len = ALIGN(len, 8); // Assume len = 4089, aligned_len = 4096
+ if (aligned_len <= wbuf->avail) ... // Not satisfy
+ if (wbuf->used) {
+ ubifs_leb_write() // Fill some data in avail wbuf
+ len -= wbuf->avail; // len is still not 8-bytes aligned
+ aligned_len -= wbuf->avail;
+ }
+ n = aligned_len >> c->max_write_shift;
+ if (n) {
+ n <<= c->max_write_shift;
+ err = ubifs_leb_write(c, wbuf->lnum, buf + written,
+ wbuf->offs, n);
+ // n > len, read out of bounds less than 8(n-len) bytes
+ }
+
+, which can be caught by KASAN:
+ =========================================================
+ BUG: KASAN: slab-out-of-bounds in ecc_sw_hamming_calculate+0x1dc/0x7d0
+ Read of size 4 at addr ffff888105594ff8 by task kworker/u8:4/128
+ Workqueue: writeback wb_workfn (flush-ubifs_0_0)
+ Call Trace:
+ kasan_report.cold+0x81/0x165
+ nand_write_page_swecc+0xa9/0x160
+ ubifs_leb_write+0xf2/0x1b0 [ubifs]
+ ubifs_wbuf_write_nolock+0x421/0x12c0 [ubifs]
+ write_head+0xdc/0x1c0 [ubifs]
+ ubifs_jnl_write_inode+0x627/0x960 [ubifs]
+ wb_workfn+0x8af/0xb80
+
+Function ubifs_wbuf_write_nolock() accepts that parameter 'len' is not 8
+bytes aligned, the 'len' represents the true length of buf (which is
+allocated in 'ubifs_jnl_xxx', eg. ubifs_jnl_write_inode), so
+ubifs_wbuf_write_nolock() must handle the length read from 'buf' carefully
+to write leb safely.
+
+Fetch a reproducer in [Link].
+
+Fixes: 1e51764a3c2ac0 ("UBIFS: add new flash file system")
+Link: https://bugzilla.kernel.org/show_bug.cgi?id=214785
+Reported-by: Chengsong Ke <kechengsong@huawei.com>
+Signed-off-by: Zhihao Cheng <chengzhihao1@huawei.com>
+Signed-off-by: Richard Weinberger <richard@nod.at>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/ubifs/io.c | 34 ++++++++++++++++++++++++++++++----
+ 1 file changed, 30 insertions(+), 4 deletions(-)
+
+--- a/fs/ubifs/io.c
++++ b/fs/ubifs/io.c
+@@ -833,16 +833,42 @@ int ubifs_wbuf_write_nolock(struct ubifs
+ */
+ n = aligned_len >> c->max_write_shift;
+ if (n) {
+- n <<= c->max_write_shift;
++ int m = n - 1;
++
+ dbg_io("write %d bytes to LEB %d:%d", n, wbuf->lnum,
+ wbuf->offs);
+- err = ubifs_leb_write(c, wbuf->lnum, buf + written,
+- wbuf->offs, n);
++
++ if (m) {
++ /* '(n-1)<<c->max_write_shift < len' is always true. */
++ m <<= c->max_write_shift;
++ err = ubifs_leb_write(c, wbuf->lnum, buf + written,
++ wbuf->offs, m);
++ if (err)
++ goto out;
++ wbuf->offs += m;
++ aligned_len -= m;
++ len -= m;
++ written += m;
++ }
++
++ /*
++ * The non-written len of buf may be less than 'n' because
++ * parameter 'len' is not 8 bytes aligned, so here we read
++ * min(len, n) bytes from buf.
++ */
++ n = 1 << c->max_write_shift;
++ memcpy(wbuf->buf, buf + written, min(len, n));
++ if (n > len) {
++ ubifs_assert(c, n - len < 8);
++ ubifs_pad(c, wbuf->buf + len, n - len);
++ }
++
++ err = ubifs_leb_write(c, wbuf->lnum, wbuf->buf, wbuf->offs, n);
+ if (err)
+ goto out;
+ wbuf->offs += n;
+ aligned_len -= n;
+- len -= n;
++ len -= min(len, n);
+ written += n;
+ }
+
--- /dev/null
+From 3b67db8a6ca83e6ff90b756d3da0c966f61cd37b Mon Sep 17 00:00:00 2001
+From: Zhihao Cheng <chengzhihao1@huawei.com>
+Date: Mon, 27 Dec 2021 11:22:41 +0800
+Subject: ubifs: Fix to add refcount once page is set private
+
+From: Zhihao Cheng <chengzhihao1@huawei.com>
+
+commit 3b67db8a6ca83e6ff90b756d3da0c966f61cd37b upstream.
+
+MM defined the rule [1] very clearly that once page was set with PG_private
+flag, we should increment the refcount in that page, also main flows like
+pageout(), migrate_page() will assume there is one additional page
+reference count if page_has_private() returns true. Otherwise, we may
+get a BUG in page migration:
+
+ page:0000000080d05b9d refcount:-1 mapcount:0 mapping:000000005f4d82a8
+ index:0xe2 pfn:0x14c12
+ aops:ubifs_file_address_operations [ubifs] ino:8f1 dentry name:"f30e"
+ flags: 0x1fffff80002405(locked|uptodate|owner_priv_1|private|node=0|
+ zone=1|lastcpupid=0x1fffff)
+ page dumped because: VM_BUG_ON_PAGE(page_count(page) != 0)
+ ------------[ cut here ]------------
+ kernel BUG at include/linux/page_ref.h:184!
+ invalid opcode: 0000 [#1] SMP
+ CPU: 3 PID: 38 Comm: kcompactd0 Not tainted 5.15.0-rc5
+ RIP: 0010:migrate_page_move_mapping+0xac3/0xe70
+ Call Trace:
+ ubifs_migrate_page+0x22/0xc0 [ubifs]
+ move_to_new_page+0xb4/0x600
+ migrate_pages+0x1523/0x1cc0
+ compact_zone+0x8c5/0x14b0
+ kcompactd+0x2bc/0x560
+ kthread+0x18c/0x1e0
+ ret_from_fork+0x1f/0x30
+
+Before going further, we should clarify one concept: what the refcount means
+for a page obtained from grab_cache_page_write_begin(). There are 2 situations:
+Situation 1: refcount is 3, page is created by __page_cache_alloc.
+ TYPE_A - the write process is using this page
+ TYPE_B - page is assigned to one certain mapping by calling
+ __add_to_page_cache_locked()
+ TYPE_C - page is added into pagevec list corresponding current cpu by
+ calling lru_cache_add()
+Situation 2: refcount is 2, page is gotten from the mapping's tree
+ TYPE_B - page has been assigned to one certain mapping
+ TYPE_A - the write process is using this page (by calling
+ page_cache_get_speculative())
+Filesystem releases one refcount by calling put_page() in xxx_write_end(),
+the released refcount corresponds to TYPE_A (write task is using it). If
+there are any processes using a page, page migration process will skip the
+page by judging whether expected_page_refs() equals to page refcount.
+
+The BUG is caused by following process:
+ PA(cpu 0) kcompactd(cpu 1)
+ compact_zone
+ubifs_write_begin
+ page_a = grab_cache_page_write_begin
+ add_to_page_cache_lru
+ lru_cache_add
+ pagevec_add // put page into cpu 0's pagevec
+ (refcnt = 3, for page creation process)
+ubifs_write_end
+ SetPagePrivate(page_a) // doesn't increase page count !
+ unlock_page(page_a)
+ put_page(page_a) // refcnt = 2
+ [...]
+
+ PB(cpu 0)
+filemap_read
+ filemap_get_pages
+ add_to_page_cache_lru
+ lru_cache_add
+ __pagevec_lru_add // traverse all pages in cpu 0's pagevec
+ __pagevec_lru_add_fn
+ SetPageLRU(page_a)
+ isolate_migratepages
+ isolate_migratepages_block
+ get_page_unless_zero(page_a)
+ // refcnt = 3
+ list_add(page_a, from_list)
+ migrate_pages(from_list)
+ __unmap_and_move
+ move_to_new_page
+ ubifs_migrate_page(page_a)
+ migrate_page_move_mapping
+ expected_page_refs get 3
+ (migration[1] + mapping[1] + private[1])
+ release_pages
+ put_page_testzero(page_a) // refcnt = 3
+ page_ref_freeze // refcnt = 0
+ page_ref_dec_and_test(0 - 1 = -1)
+ page_ref_unfreeze
+ VM_BUG_ON_PAGE(-1 != 0, page)
+
+UBIFS doesn't increase the page refcount after setting private flag, which
+leads to page migration task believes the page is not used by any other
+processes, so the page is migrated. This causes concurrent accessing on
+page refcount between put_page() called by other process(eg. read process
+calls lru_cache_add) and page_ref_unfreeze() called by migration task.
+
+Actually zhangjun has tried to fix this problem [2] by recalculating page
+refcnt in ubifs_migrate_page(). It's better to follow MM rules [1], because
+just like Kirill suggested in [2], we need to check all users of
+page_has_private() helper. Like f2fs does in [3], fix it by adding/deleting
+refcount when setting/clearing private for a page. BTW, according to [4],
+we set 'page->private' as 1 because ubifs just simply SetPagePrivate().
+And, [5] provided a common helper to set/clear page private, ubifs can
+use this helper following the example of iomap, afs, btrfs, etc.
+
+Jump [6] to find a reproducer.
+
+[1] https://lore.kernel.org/lkml/2b19b3c4-2bc4-15fa-15cc-27a13e5c7af1@aol.com
+[2] https://www.spinics.net/lists/linux-mtd/msg04018.html
+[3] http://lkml.iu.edu/hypermail/linux/kernel/1903.0/03313.html
+[4] https://lore.kernel.org/linux-f2fs-devel/20210422154705.GO3596236@casper.infradead.org
+[5] https://lore.kernel.org/all/20200517214718.468-1-guoqing.jiang@cloud.ionos.com
+[6] https://bugzilla.kernel.org/show_bug.cgi?id=214961
+
+Fixes: 1e51764a3c2ac0 ("UBIFS: add new flash file system")
+Signed-off-by: Zhihao Cheng <chengzhihao1@huawei.com>
+Signed-off-by: Richard Weinberger <richard@nod.at>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/ubifs/file.c | 14 +++++++-------
+ 1 file changed, 7 insertions(+), 7 deletions(-)
+
+--- a/fs/ubifs/file.c
++++ b/fs/ubifs/file.c
+@@ -570,7 +570,7 @@ static int ubifs_write_end(struct file *
+ }
+
+ if (!PagePrivate(page)) {
+- SetPagePrivate(page);
++ attach_page_private(page, (void *)1);
+ atomic_long_inc(&c->dirty_pg_cnt);
+ __set_page_dirty_nobuffers(page);
+ }
+@@ -947,7 +947,7 @@ static int do_writepage(struct page *pag
+ release_existing_page_budget(c);
+
+ atomic_long_dec(&c->dirty_pg_cnt);
+- ClearPagePrivate(page);
++ detach_page_private(page);
+ ClearPageChecked(page);
+
+ kunmap(page);
+@@ -1304,7 +1304,7 @@ static void ubifs_invalidatepage(struct
+ release_existing_page_budget(c);
+
+ atomic_long_dec(&c->dirty_pg_cnt);
+- ClearPagePrivate(page);
++ detach_page_private(page);
+ ClearPageChecked(page);
+ }
+
+@@ -1471,8 +1471,8 @@ static int ubifs_migrate_page(struct add
+ return rc;
+
+ if (PagePrivate(page)) {
+- ClearPagePrivate(page);
+- SetPagePrivate(newpage);
++ detach_page_private(page);
++ attach_page_private(newpage, (void *)1);
+ }
+
+ if (mode != MIGRATE_SYNC_NO_COPY)
+@@ -1496,7 +1496,7 @@ static int ubifs_releasepage(struct page
+ return 0;
+ ubifs_assert(c, PagePrivate(page));
+ ubifs_assert(c, 0);
+- ClearPagePrivate(page);
++ detach_page_private(page);
+ ClearPageChecked(page);
+ return 1;
+ }
+@@ -1567,7 +1567,7 @@ static vm_fault_t ubifs_vm_page_mkwrite(
+ else {
+ if (!PageChecked(page))
+ ubifs_convert_page_budget(c);
+- SetPagePrivate(page);
++ attach_page_private(page, (void *)1);
+ atomic_long_inc(&c->dirty_pg_cnt);
+ __set_page_dirty_nobuffers(page);
+ }
--- /dev/null
+From 60eb3b9c9f11206996f57cb89521824304b305ad Mon Sep 17 00:00:00 2001
+From: Zhihao Cheng <chengzhihao1@huawei.com>
+Date: Mon, 27 Dec 2021 11:22:37 +0800
+Subject: ubifs: Fix 'ui->dirty' race between do_tmpfile() and writeback work
+
+From: Zhihao Cheng <chengzhihao1@huawei.com>
+
+commit 60eb3b9c9f11206996f57cb89521824304b305ad upstream.
+
+'ui->dirty' is not protected by 'ui_mutex' in function do_tmpfile() which
+may race with ubifs_write_inode[wb_workfn] to access/update 'ui->dirty',
+finally dirty space is released twice.
+
+ open(O_TMPFILE) wb_workfn
+do_tmpfile
+ ubifs_budget_space(ino_req = { .dirtied_ino = 1})
+ d_tmpfile // mark inode(tmpfile) dirty
+ ubifs_jnl_update // without holding tmpfile's ui_mutex
+ mark_inode_clean(ui)
+ if (ui->dirty)
+ ubifs_release_dirty_inode_budget(ui) // release first time
+ ubifs_write_inode
+ mutex_lock(&ui->ui_mutex)
+ ubifs_release_dirty_inode_budget(ui)
+ // release second time
+ mutex_unlock(&ui->ui_mutex)
+ ui->dirty = 0
+
+Run generic/476 can reproduce following message easily
+(See reproducer in [Link]):
+
+ UBIFS error (ubi0:0 pid 2578): ubifs_assert_failed [ubifs]: UBIFS assert
+ failed: c->bi.dd_growth >= 0, in fs/ubifs/budget.c:554
+ UBIFS warning (ubi0:0 pid 2578): ubifs_ro_mode [ubifs]: switched to
+ read-only mode, error -22
+ Workqueue: writeback wb_workfn (flush-ubifs_0_0)
+ Call Trace:
+ ubifs_ro_mode+0x54/0x60 [ubifs]
+ ubifs_assert_failed+0x4b/0x80 [ubifs]
+ ubifs_release_budget+0x468/0x5a0 [ubifs]
+ ubifs_release_dirty_inode_budget+0x53/0x80 [ubifs]
+ ubifs_write_inode+0x121/0x1f0 [ubifs]
+ ...
+ wb_workfn+0x283/0x7b0
+
+Fix it by holding tmpfile ubifs inode lock during ubifs_jnl_update().
+Similar problem exists in whiteout renaming, but previous fix("ubifs:
+Rename whiteout atomically") has solved the problem.
+
+Fixes: 474b93704f32163 ("ubifs: Implement O_TMPFILE")
+Link: https://bugzilla.kernel.org/show_bug.cgi?id=214765
+Signed-off-by: Zhihao Cheng <chengzhihao1@huawei.com>
+Signed-off-by: Richard Weinberger <richard@nod.at>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/ubifs/dir.c | 60 ++++++++++++++++++++++++++++-----------------------------
+ 1 file changed, 30 insertions(+), 30 deletions(-)
+
+--- a/fs/ubifs/dir.c
++++ b/fs/ubifs/dir.c
+@@ -397,6 +397,32 @@ out_free:
+ return ERR_PTR(err);
+ }
+
++/**
++ * lock_2_inodes - a wrapper for locking two UBIFS inodes.
++ * @inode1: first inode
++ * @inode2: second inode
++ *
++ * We do not implement any tricks to guarantee strict lock ordering, because
++ * VFS has already done it for us on the @i_mutex. So this is just a simple
++ * wrapper function.
++ */
++static void lock_2_inodes(struct inode *inode1, struct inode *inode2)
++{
++ mutex_lock_nested(&ubifs_inode(inode1)->ui_mutex, WB_MUTEX_1);
++ mutex_lock_nested(&ubifs_inode(inode2)->ui_mutex, WB_MUTEX_2);
++}
++
++/**
++ * unlock_2_inodes - a wrapper for unlocking two UBIFS inodes.
++ * @inode1: first inode
++ * @inode2: second inode
++ */
++static void unlock_2_inodes(struct inode *inode1, struct inode *inode2)
++{
++ mutex_unlock(&ubifs_inode(inode2)->ui_mutex);
++ mutex_unlock(&ubifs_inode(inode1)->ui_mutex);
++}
++
+ static int ubifs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
+ {
+@@ -404,7 +430,7 @@ static int ubifs_tmpfile(struct user_nam
+ struct ubifs_info *c = dir->i_sb->s_fs_info;
+ struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1};
+ struct ubifs_budget_req ino_req = { .dirtied_ino = 1 };
+- struct ubifs_inode *ui, *dir_ui = ubifs_inode(dir);
++ struct ubifs_inode *ui;
+ int err, instantiated = 0;
+ struct fscrypt_name nm;
+
+@@ -452,18 +478,18 @@ static int ubifs_tmpfile(struct user_nam
+ instantiated = 1;
+ mutex_unlock(&ui->ui_mutex);
+
+- mutex_lock(&dir_ui->ui_mutex);
++ lock_2_inodes(dir, inode);
+ err = ubifs_jnl_update(c, dir, &nm, inode, 1, 0);
+ if (err)
+ goto out_cancel;
+- mutex_unlock(&dir_ui->ui_mutex);
++ unlock_2_inodes(dir, inode);
+
+ ubifs_release_budget(c, &req);
+
+ return 0;
+
+ out_cancel:
+- mutex_unlock(&dir_ui->ui_mutex);
++ unlock_2_inodes(dir, inode);
+ out_inode:
+ make_bad_inode(inode);
+ if (!instantiated)
+@@ -690,32 +716,6 @@ static int ubifs_dir_release(struct inod
+ return 0;
+ }
+
+-/**
+- * lock_2_inodes - a wrapper for locking two UBIFS inodes.
+- * @inode1: first inode
+- * @inode2: second inode
+- *
+- * We do not implement any tricks to guarantee strict lock ordering, because
+- * VFS has already done it for us on the @i_mutex. So this is just a simple
+- * wrapper function.
+- */
+-static void lock_2_inodes(struct inode *inode1, struct inode *inode2)
+-{
+- mutex_lock_nested(&ubifs_inode(inode1)->ui_mutex, WB_MUTEX_1);
+- mutex_lock_nested(&ubifs_inode(inode2)->ui_mutex, WB_MUTEX_2);
+-}
+-
+-/**
+- * unlock_2_inodes - a wrapper for unlocking two UBIFS inodes.
+- * @inode1: first inode
+- * @inode2: second inode
+- */
+-static void unlock_2_inodes(struct inode *inode1, struct inode *inode2)
+-{
+- mutex_unlock(&ubifs_inode(inode2)->ui_mutex);
+- mutex_unlock(&ubifs_inode(inode1)->ui_mutex);
+-}
+-
+ static int ubifs_link(struct dentry *old_dentry, struct inode *dir,
+ struct dentry *dentry)
+ {
--- /dev/null
+From a6dab6607d4681d227905d5198710b575dbdb519 Mon Sep 17 00:00:00 2001
+From: Zhihao Cheng <chengzhihao1@huawei.com>
+Date: Mon, 27 Dec 2021 11:22:38 +0800
+Subject: ubifs: Rectify space amount budget for mkdir/tmpfile operations
+
+From: Zhihao Cheng <chengzhihao1@huawei.com>
+
+commit a6dab6607d4681d227905d5198710b575dbdb519 upstream.
+
+UBIFS should make sure the flash has enough space to store dirty (Data
+that is newer than disk) data (in memory), space budget is exactly
+designed to do that. If space budget calculates less data than we need,
+'make_reservation()' will do more work(return -ENOSPC if no free space
+left, sometimes we can see "cannot reserve xxx bytes in jhead xxx, error
+-28" in ubifs error messages) with ubifs inodes locked, which may affect
+other syscalls.
+
+A simple way to decide how much space do we need when make a budget:
+See how much space is needed by 'make_reservation()' in ubifs_jnl_xxx()
+function according to corresponding operation.
+
+It's better to report ENOSPC in ubifs_budget_space(), as early as we can.
+
+Fixes: 474b93704f32163 ("ubifs: Implement O_TMPFILE")
+Fixes: 1e51764a3c2ac05 ("UBIFS: add new flash file system")
+Signed-off-by: Zhihao Cheng <chengzhihao1@huawei.com>
+Signed-off-by: Richard Weinberger <richard@nod.at>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/ubifs/dir.c | 12 ++++++++----
+ 1 file changed, 8 insertions(+), 4 deletions(-)
+
+--- a/fs/ubifs/dir.c
++++ b/fs/ubifs/dir.c
+@@ -428,15 +428,18 @@ static int ubifs_tmpfile(struct user_nam
+ {
+ struct inode *inode;
+ struct ubifs_info *c = dir->i_sb->s_fs_info;
+- struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1};
++ struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1,
++ .dirtied_ino = 1};
+ struct ubifs_budget_req ino_req = { .dirtied_ino = 1 };
+ struct ubifs_inode *ui;
+ int err, instantiated = 0;
+ struct fscrypt_name nm;
+
+ /*
+- * Budget request settings: new dirty inode, new direntry,
+- * budget for dirtied inode will be released via writeback.
++ * Budget request settings: new inode, new direntry, changing the
++ * parent directory inode.
++ * Allocate budget separately for new dirtied inode, the budget will
++ * be released via writeback.
+ */
+
+ dbg_gen("dent '%pd', mode %#hx in dir ino %lu",
+@@ -979,7 +982,8 @@ static int ubifs_mkdir(struct user_names
+ struct ubifs_inode *dir_ui = ubifs_inode(dir);
+ struct ubifs_info *c = dir->i_sb->s_fs_info;
+ int err, sz_change;
+- struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1 };
++ struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1,
++ .dirtied_ino = 1};
+ struct fscrypt_name nm;
+
+ /*
--- /dev/null
+From 278d9a243635f26c05ad95dcf9c5a593b9e04dc6 Mon Sep 17 00:00:00 2001
+From: Zhihao Cheng <chengzhihao1@huawei.com>
+Date: Mon, 27 Dec 2021 11:22:36 +0800
+Subject: ubifs: Rename whiteout atomically
+
+From: Zhihao Cheng <chengzhihao1@huawei.com>
+
+commit 278d9a243635f26c05ad95dcf9c5a593b9e04dc6 upstream.
+
+Currently, rename whiteout has 3 steps:
+ 1. create tmpfile(which associates old dentry to tmpfile inode) for
+ whiteout, and store tmpfile to disk
+ 2. link whiteout, associate whiteout inode to old dentry again and
+ store old dentry, old inode, new dentry on disk
+ 3. writeback dirty whiteout inode to disk
+
+A sudden power-cut or an error (e.g. ENOSPC returned by budget, memory
+allocation failure) during the above steps may cause several kinds of problems:
+ Problem 1: ENOSPC returned by whiteout space budget (before step 2),
+ old dentry will disappear after rename syscall, whiteout file
+ cannot be found either.
+
+ ls dir // we get file, whiteout
+ rename(dir/file, dir/whiteout, RENAME_WHITEOUT)
+ ENOSPC = ubifs_budget_space(&wht_req) // return
+ ls dir // empty (no file, no whiteout)
+ Problem 2: Power-cut happens before step 3, whiteout inode with 'nlink=1'
+ is not stored on disk, whiteout dentry(old dentry) is written
+ on disk, whiteout file is lost on next mount (We get "dead
+ directory entry" after executing 'ls -l' on whiteout file).
+
+Now, we use following 3 steps to finish rename whiteout:
+ 1. create an in-mem inode with 'nlink = 1' as whiteout
+ 2. ubifs_jnl_rename (Write on disk to finish associating old dentry to
+ whiteout inode, associating new dentry with old inode)
+ 3. iput(whiteout)
+
+Rely on ubifs_jnl_rename() writing the in-mem inode to disk to finish rename
+whiteout, which avoids intermediate on-disk states caused by a sudden
+power-cut or an error.
+
+Fixes: 9e0a1fff8db56ea ("ubifs: Implement RENAME_WHITEOUT")
+Signed-off-by: Zhihao Cheng <chengzhihao1@huawei.com>
+Signed-off-by: Richard Weinberger <richard@nod.at>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/ubifs/dir.c | 144 +++++++++++++++++++++++++++++++++--------------------
+ fs/ubifs/journal.c | 52 ++++++++++++++++---
+ 2 files changed, 136 insertions(+), 60 deletions(-)
+
+--- a/fs/ubifs/dir.c
++++ b/fs/ubifs/dir.c
+@@ -349,8 +349,56 @@ out_budg:
+ return err;
+ }
+
+-static int do_tmpfile(struct inode *dir, struct dentry *dentry,
+- umode_t mode, struct inode **whiteout)
++static struct inode *create_whiteout(struct inode *dir, struct dentry *dentry)
++{
++ int err;
++ umode_t mode = S_IFCHR | WHITEOUT_MODE;
++ struct inode *inode;
++ struct ubifs_info *c = dir->i_sb->s_fs_info;
++ struct fscrypt_name nm;
++
++ /*
++ * Create an inode('nlink = 1') for whiteout without updating journal,
++ * let ubifs_jnl_rename() store it on flash to complete rename whiteout
++ * atomically.
++ */
++
++ dbg_gen("dent '%pd', mode %#hx in dir ino %lu",
++ dentry, mode, dir->i_ino);
++
++ err = fscrypt_setup_filename(dir, &dentry->d_name, 0, &nm);
++ if (err)
++ return ERR_PTR(err);
++
++ inode = ubifs_new_inode(c, dir, mode);
++ if (IS_ERR(inode)) {
++ err = PTR_ERR(inode);
++ goto out_free;
++ }
++
++ init_special_inode(inode, inode->i_mode, WHITEOUT_DEV);
++ ubifs_assert(c, inode->i_op == &ubifs_file_inode_operations);
++
++ err = ubifs_init_security(dir, inode, &dentry->d_name);
++ if (err)
++ goto out_inode;
++
++ /* The dir size is updated by do_rename. */
++ insert_inode_hash(inode);
++
++ return inode;
++
++out_inode:
++ make_bad_inode(inode);
++ iput(inode);
++out_free:
++ fscrypt_free_filename(&nm);
++ ubifs_err(c, "cannot create whiteout file, error %d", err);
++ return ERR_PTR(err);
++}
++
++static int ubifs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
++ struct dentry *dentry, umode_t mode)
+ {
+ struct inode *inode;
+ struct ubifs_info *c = dir->i_sb->s_fs_info;
+@@ -392,25 +440,13 @@ static int do_tmpfile(struct inode *dir,
+ }
+ ui = ubifs_inode(inode);
+
+- if (whiteout) {
+- init_special_inode(inode, inode->i_mode, WHITEOUT_DEV);
+- ubifs_assert(c, inode->i_op == &ubifs_file_inode_operations);
+- }
+-
+ err = ubifs_init_security(dir, inode, &dentry->d_name);
+ if (err)
+ goto out_inode;
+
+ mutex_lock(&ui->ui_mutex);
+ insert_inode_hash(inode);
+-
+- if (whiteout) {
+- mark_inode_dirty(inode);
+- drop_nlink(inode);
+- *whiteout = inode;
+- } else {
+- d_tmpfile(dentry, inode);
+- }
++ d_tmpfile(dentry, inode);
+ ubifs_assert(c, ui->dirty);
+
+ instantiated = 1;
+@@ -432,8 +468,6 @@ out_inode:
+ make_bad_inode(inode);
+ if (!instantiated)
+ iput(inode);
+- else if (whiteout)
+- iput(*whiteout);
+ out_budg:
+ ubifs_release_budget(c, &req);
+ if (!instantiated)
+@@ -443,12 +477,6 @@ out_budg:
+ return err;
+ }
+
+-static int ubifs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
+- struct dentry *dentry, umode_t mode)
+-{
+- return do_tmpfile(dir, dentry, mode, NULL);
+-}
+-
+ /**
+ * vfs_dent_type - get VFS directory entry type.
+ * @type: UBIFS directory entry type
+@@ -1266,17 +1294,19 @@ static int do_rename(struct inode *old_d
+ .dirtied_ino = 3 };
+ struct ubifs_budget_req ino_req = { .dirtied_ino = 1,
+ .dirtied_ino_d = ALIGN(old_inode_ui->data_len, 8) };
++ struct ubifs_budget_req wht_req;
+ struct timespec64 time;
+ unsigned int saved_nlink;
+ struct fscrypt_name old_nm, new_nm;
+
+ /*
+- * Budget request settings: deletion direntry, new direntry, removing
+- * the old inode, and changing old and new parent directory inodes.
++ * Budget request settings:
++ * req: deletion direntry, new direntry, removing the old inode,
++ * and changing old and new parent directory inodes.
++ *
++ * wht_req: new whiteout inode for RENAME_WHITEOUT.
+ *
+- * However, this operation also marks the target inode as dirty and
+- * does not write it, so we allocate budget for the target inode
+- * separately.
++ * ino_req: marks the target inode as dirty and does not write it.
+ */
+
+ dbg_gen("dent '%pd' ino %lu in dir ino %lu to dent '%pd' in dir ino %lu flags 0x%x",
+@@ -1326,7 +1356,6 @@ static int do_rename(struct inode *old_d
+
+ if (flags & RENAME_WHITEOUT) {
+ union ubifs_dev_desc *dev = NULL;
+- struct ubifs_budget_req wht_req;
+
+ dev = kmalloc(sizeof(union ubifs_dev_desc), GFP_NOFS);
+ if (!dev) {
+@@ -1334,24 +1363,26 @@ static int do_rename(struct inode *old_d
+ goto out_release;
+ }
+
+- err = do_tmpfile(old_dir, old_dentry, S_IFCHR | WHITEOUT_MODE, &whiteout);
+- if (err) {
++ /*
++ * The whiteout inode without dentry is pinned in memory,
++ * umount won't happen during rename process because we
++ * got parent dentry.
++ */
++ whiteout = create_whiteout(old_dir, old_dentry);
++ if (IS_ERR(whiteout)) {
++ err = PTR_ERR(whiteout);
+ kfree(dev);
+ goto out_release;
+ }
+
+- spin_lock(&whiteout->i_lock);
+- whiteout->i_state |= I_LINKABLE;
+- spin_unlock(&whiteout->i_lock);
+-
+ whiteout_ui = ubifs_inode(whiteout);
+ whiteout_ui->data = dev;
+ whiteout_ui->data_len = ubifs_encode_dev(dev, MKDEV(0, 0));
+ ubifs_assert(c, !whiteout_ui->dirty);
+
+ memset(&wht_req, 0, sizeof(struct ubifs_budget_req));
+- wht_req.dirtied_ino = 1;
+- wht_req.dirtied_ino_d = ALIGN(whiteout_ui->data_len, 8);
++ wht_req.new_ino = 1;
++ wht_req.new_ino_d = ALIGN(whiteout_ui->data_len, 8);
+ /*
+ * To avoid deadlock between space budget (holds ui_mutex and
+ * waits wb work) and writeback work(waits ui_mutex), do space
+@@ -1359,6 +1390,11 @@ static int do_rename(struct inode *old_d
+ */
+ err = ubifs_budget_space(c, &wht_req);
+ if (err) {
++ /*
++ * Whiteout inode can not be written on flash by
++ * ubifs_jnl_write_inode(), because it's neither
++ * dirty nor zero-nlink.
++ */
+ iput(whiteout);
+ goto out_release;
+ }
+@@ -1433,17 +1469,11 @@ static int do_rename(struct inode *old_d
+ sync = IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir);
+ if (unlink && IS_SYNC(new_inode))
+ sync = 1;
+- }
+-
+- if (whiteout) {
+- inc_nlink(whiteout);
+- mark_inode_dirty(whiteout);
+-
+- spin_lock(&whiteout->i_lock);
+- whiteout->i_state &= ~I_LINKABLE;
+- spin_unlock(&whiteout->i_lock);
+-
+- iput(whiteout);
++ /*
++ * S_SYNC flag of whiteout inherits from the old_dir, and we
++ * have already checked the old dir inode. So there is no need
++ * to check whiteout.
++ */
+ }
+
+ err = ubifs_jnl_rename(c, old_dir, old_inode, &old_nm, new_dir,
+@@ -1454,6 +1484,11 @@ static int do_rename(struct inode *old_d
+ unlock_4_inodes(old_dir, new_dir, new_inode, whiteout);
+ ubifs_release_budget(c, &req);
+
++ if (whiteout) {
++ ubifs_release_budget(c, &wht_req);
++ iput(whiteout);
++ }
++
+ mutex_lock(&old_inode_ui->ui_mutex);
+ release = old_inode_ui->dirty;
+ mark_inode_dirty_sync(old_inode);
+@@ -1462,11 +1497,16 @@ static int do_rename(struct inode *old_d
+ if (release)
+ ubifs_release_budget(c, &ino_req);
+ if (IS_SYNC(old_inode))
+- err = old_inode->i_sb->s_op->write_inode(old_inode, NULL);
++ /*
++ * Rename finished here. Although old inode cannot be updated
++ * on flash, old ctime is not a big problem, don't return err
++ * code to userspace.
++ */
++ old_inode->i_sb->s_op->write_inode(old_inode, NULL);
+
+ fscrypt_free_filename(&old_nm);
+ fscrypt_free_filename(&new_nm);
+- return err;
++ return 0;
+
+ out_cancel:
+ if (unlink) {
+@@ -1487,11 +1527,11 @@ out_cancel:
+ inc_nlink(old_dir);
+ }
+ }
++ unlock_4_inodes(old_dir, new_dir, new_inode, whiteout);
+ if (whiteout) {
+- drop_nlink(whiteout);
++ ubifs_release_budget(c, &wht_req);
+ iput(whiteout);
+ }
+- unlock_4_inodes(old_dir, new_dir, new_inode, whiteout);
+ out_release:
+ ubifs_release_budget(c, &ino_req);
+ ubifs_release_budget(c, &req);
+--- a/fs/ubifs/journal.c
++++ b/fs/ubifs/journal.c
+@@ -1207,9 +1207,9 @@ out_free:
+ * @sync: non-zero if the write-buffer has to be synchronized
+ *
+ * This function implements the re-name operation which may involve writing up
+- * to 4 inodes and 2 directory entries. It marks the written inodes as clean
+- * and returns zero on success. In case of failure, a negative error code is
+- * returned.
++ * to 4 inodes(new inode, whiteout inode, old and new parent directory inodes)
++ * and 2 directory entries. It marks the written inodes as clean and returns
++ * zero on success. In case of failure, a negative error code is returned.
+ */
+ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir,
+ const struct inode *old_inode,
+@@ -1222,14 +1222,15 @@ int ubifs_jnl_rename(struct ubifs_info *
+ void *p;
+ union ubifs_key key;
+ struct ubifs_dent_node *dent, *dent2;
+- int err, dlen1, dlen2, ilen, lnum, offs, len, orphan_added = 0;
++ int err, dlen1, dlen2, ilen, wlen, lnum, offs, len, orphan_added = 0;
+ int aligned_dlen1, aligned_dlen2, plen = UBIFS_INO_NODE_SZ;
+ int last_reference = !!(new_inode && new_inode->i_nlink == 0);
+ int move = (old_dir != new_dir);
+- struct ubifs_inode *new_ui;
++ struct ubifs_inode *new_ui, *whiteout_ui;
+ u8 hash_old_dir[UBIFS_HASH_ARR_SZ];
+ u8 hash_new_dir[UBIFS_HASH_ARR_SZ];
+ u8 hash_new_inode[UBIFS_HASH_ARR_SZ];
++ u8 hash_whiteout_inode[UBIFS_HASH_ARR_SZ];
+ u8 hash_dent1[UBIFS_HASH_ARR_SZ];
+ u8 hash_dent2[UBIFS_HASH_ARR_SZ];
+
+@@ -1249,9 +1250,20 @@ int ubifs_jnl_rename(struct ubifs_info *
+ } else
+ ilen = 0;
+
++ if (whiteout) {
++ whiteout_ui = ubifs_inode(whiteout);
++ ubifs_assert(c, mutex_is_locked(&whiteout_ui->ui_mutex));
++ ubifs_assert(c, whiteout->i_nlink == 1);
++ ubifs_assert(c, !whiteout_ui->dirty);
++ wlen = UBIFS_INO_NODE_SZ;
++ wlen += whiteout_ui->data_len;
++ } else
++ wlen = 0;
++
+ aligned_dlen1 = ALIGN(dlen1, 8);
+ aligned_dlen2 = ALIGN(dlen2, 8);
+- len = aligned_dlen1 + aligned_dlen2 + ALIGN(ilen, 8) + ALIGN(plen, 8);
++ len = aligned_dlen1 + aligned_dlen2 + ALIGN(ilen, 8) +
++ ALIGN(wlen, 8) + ALIGN(plen, 8);
+ if (move)
+ len += plen;
+
+@@ -1313,6 +1325,15 @@ int ubifs_jnl_rename(struct ubifs_info *
+ p += ALIGN(ilen, 8);
+ }
+
++ if (whiteout) {
++ pack_inode(c, p, whiteout, 0);
++ err = ubifs_node_calc_hash(c, p, hash_whiteout_inode);
++ if (err)
++ goto out_release;
++
++ p += ALIGN(wlen, 8);
++ }
++
+ if (!move) {
+ pack_inode(c, p, old_dir, 1);
+ err = ubifs_node_calc_hash(c, p, hash_old_dir);
+@@ -1352,6 +1373,9 @@ int ubifs_jnl_rename(struct ubifs_info *
+ if (new_inode)
+ ubifs_wbuf_add_ino_nolock(&c->jheads[BASEHD].wbuf,
+ new_inode->i_ino);
++ if (whiteout)
++ ubifs_wbuf_add_ino_nolock(&c->jheads[BASEHD].wbuf,
++ whiteout->i_ino);
+ }
+ release_head(c, BASEHD);
+
+@@ -1368,8 +1392,6 @@ int ubifs_jnl_rename(struct ubifs_info *
+ err = ubifs_tnc_add_nm(c, &key, lnum, offs, dlen2, hash_dent2, old_nm);
+ if (err)
+ goto out_ro;
+-
+- ubifs_delete_orphan(c, whiteout->i_ino);
+ } else {
+ err = ubifs_add_dirt(c, lnum, dlen2);
+ if (err)
+@@ -1390,6 +1412,15 @@ int ubifs_jnl_rename(struct ubifs_info *
+ offs += ALIGN(ilen, 8);
+ }
+
++ if (whiteout) {
++ ino_key_init(c, &key, whiteout->i_ino);
++ err = ubifs_tnc_add(c, &key, lnum, offs, wlen,
++ hash_whiteout_inode);
++ if (err)
++ goto out_ro;
++ offs += ALIGN(wlen, 8);
++ }
++
+ ino_key_init(c, &key, old_dir->i_ino);
+ err = ubifs_tnc_add(c, &key, lnum, offs, plen, hash_old_dir);
+ if (err)
+@@ -1410,6 +1441,11 @@ int ubifs_jnl_rename(struct ubifs_info *
+ new_ui->synced_i_size = new_ui->ui_size;
+ spin_unlock(&new_ui->ui_lock);
+ }
++ /*
++ * No need to mark whiteout inode clean.
++ * Whiteout doesn't have non-zero size, no need to update
++ * synced_i_size for whiteout_ui.
++ */
+ mark_inode_clean(c, ubifs_inode(old_dir));
+ if (move)
+ mark_inode_clean(c, ubifs_inode(new_dir));
--- /dev/null
+From 705757274599e2e064dd3054aabc74e8af31a095 Mon Sep 17 00:00:00 2001
+From: Baokun Li <libaokun1@huawei.com>
+Date: Tue, 15 Feb 2022 12:07:36 +0800
+Subject: ubifs: rename_whiteout: correct old_dir size computing
+
+From: Baokun Li <libaokun1@huawei.com>
+
+commit 705757274599e2e064dd3054aabc74e8af31a095 upstream.
+
+When renaming the whiteout file, the old whiteout file is not deleted.
+Therefore, we add the old dentry size to the old dir like XFS.
+Otherwise, an error may be reported due to `fscki->calc_sz != fscki->size`
+in check_inodes().
+
+Fixes: 9e0a1fff8db56ea ("ubifs: Implement RENAME_WHITEOUT")
+Reported-by: Zhihao Cheng <chengzhihao1@huawei.com>
+Signed-off-by: Baokun Li <libaokun1@huawei.com>
+Signed-off-by: Richard Weinberger <richard@nod.at>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/ubifs/dir.c | 3 +++
+ 1 file changed, 3 insertions(+)
+
+--- a/fs/ubifs/dir.c
++++ b/fs/ubifs/dir.c
+@@ -1402,6 +1402,9 @@ static int do_rename(struct inode *old_d
+ iput(whiteout);
+ goto out_release;
+ }
++
++ /* Add the old_dentry size to the old_dir size. */
++ old_sz -= CALC_DENT_SIZE(fname_len(&old_nm));
+ }
+
+ lock_4_inodes(old_dir, new_dir, new_inode, whiteout);
--- /dev/null
+From 40a8f0d5e7b3999f096570edab71c345da812e3e Mon Sep 17 00:00:00 2001
+From: Zhihao Cheng <chengzhihao1@huawei.com>
+Date: Mon, 27 Dec 2021 11:22:32 +0800
+Subject: ubifs: rename_whiteout: Fix double free for whiteout_ui->data
+
+From: Zhihao Cheng <chengzhihao1@huawei.com>
+
+commit 40a8f0d5e7b3999f096570edab71c345da812e3e upstream.
+
+'whiteout_ui->data' will be freed twice if space budget fail for
+rename whiteout operation as following process:
+
+rename_whiteout
+ dev = kmalloc
+ whiteout_ui->data = dev
+ kfree(whiteout_ui->data) // Free first time
+ iput(whiteout)
+ ubifs_free_inode
+ kfree(ui->data) // Double free!
+
+KASAN reports:
+==================================================================
+BUG: KASAN: double-free or invalid-free in ubifs_free_inode+0x4f/0x70
+Call Trace:
+ kfree+0x117/0x490
+ ubifs_free_inode+0x4f/0x70 [ubifs]
+ i_callback+0x30/0x60
+ rcu_do_batch+0x366/0xac0
+ __do_softirq+0x133/0x57f
+
+Allocated by task 1506:
+ kmem_cache_alloc_trace+0x3c2/0x7a0
+ do_rename+0x9b7/0x1150 [ubifs]
+ ubifs_rename+0x106/0x1f0 [ubifs]
+ do_syscall_64+0x35/0x80
+
+Freed by task 1506:
+ kfree+0x117/0x490
+ do_rename.cold+0x53/0x8a [ubifs]
+ ubifs_rename+0x106/0x1f0 [ubifs]
+ do_syscall_64+0x35/0x80
+
+The buggy address belongs to the object at ffff88810238bed8 which
+belongs to the cache kmalloc-8 of size 8
+==================================================================
+
+Let ubifs_free_inode() free 'whiteout_ui->data'. BTW, delete unused
+assignment 'whiteout_ui->data_len = 0', process 'ubifs_evict_inode()
+-> ubifs_jnl_delete_inode() -> ubifs_jnl_write_inode()' doesn't need it
+(because 'inc_nlink(whiteout)' won't be executed by 'goto out_release',
+ and the nlink of whiteout inode is 0).
+
+Fixes: 9e0a1fff8db56ea ("ubifs: Implement RENAME_WHITEOUT")
+Signed-off-by: Zhihao Cheng <chengzhihao1@huawei.com>
+Signed-off-by: Richard Weinberger <richard@nod.at>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/ubifs/dir.c | 2 --
+ 1 file changed, 2 deletions(-)
+
+--- a/fs/ubifs/dir.c
++++ b/fs/ubifs/dir.c
+@@ -1425,8 +1425,6 @@ static int do_rename(struct inode *old_d
+
+ err = ubifs_budget_space(c, &wht_req);
+ if (err) {
+- kfree(whiteout_ui->data);
+- whiteout_ui->data_len = 0;
+ iput(whiteout);
+ goto out_release;
+ }
--- /dev/null
+From 1b83ec057db16b4d0697dc21ef7a9743b6041f72 Mon Sep 17 00:00:00 2001
+From: Zhihao Cheng <chengzhihao1@huawei.com>
+Date: Mon, 27 Dec 2021 11:22:39 +0800
+Subject: ubifs: setflags: Make dirtied_ino_d 8 bytes aligned
+
+From: Zhihao Cheng <chengzhihao1@huawei.com>
+
+commit 1b83ec057db16b4d0697dc21ef7a9743b6041f72 upstream.
+
+Make 'ui->data_len' aligned with 8 bytes before it is assigned to
+dirtied_ino_d. Since 8871d84c8f8b0c6b ("ubifs: convert to fileattr")
+applied, 'setflags()' only affects regular files and directories, only
+xattr inode, symlink inode and special inode (pipe/char_dev/block_dev)
+have non-zero 'ui->data_len' field, so assertion
+'!(req->dirtied_ino_d & 7)' cannot fail in ubifs_budget_space().
+To avoid assertion failures in future evolution (e.g. setflags can operate
+on special inodes), it's better to make dirtied_ino_d 8 bytes aligned,
+after all aligned size is still zero for regular files.
+
+Fixes: 1e51764a3c2ac05a ("UBIFS: add new flash file system")
+Signed-off-by: Zhihao Cheng <chengzhihao1@huawei.com>
+Signed-off-by: Richard Weinberger <richard@nod.at>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/ubifs/ioctl.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/fs/ubifs/ioctl.c
++++ b/fs/ubifs/ioctl.c
+@@ -108,7 +108,7 @@ static int setflags(struct inode *inode,
+ struct ubifs_inode *ui = ubifs_inode(inode);
+ struct ubifs_info *c = inode->i_sb->s_fs_info;
+ struct ubifs_budget_req req = { .dirtied_ino = 1,
+- .dirtied_ino_d = ui->data_len };
++ .dirtied_ino_d = ALIGN(ui->data_len, 8) };
+
+ err = ubifs_budget_space(c, &req);
+ if (err)