--- /dev/null
+From 4be2ce739fb3c1ad0fbc2337b07b33a326009677 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 4 Jul 2022 17:26:29 -0400
+Subject: __follow_mount_rcu(): verify that mount_lock remains unchanged
+
+From: Al Viro <viro@zeniv.linux.org.uk>
+
+[ Upstream commit 20aac6c60981f5bfacd66661d090d907bf1482f0 ]
+
+Validate mount_lock seqcount as soon as we cross into mount in RCU
+mode. Sure, ->mnt_root is pinned and will remain so until we
+do rcu_read_unlock() anyway, and we will eventually fail to unlazy if
+the mount_lock had been touched, but we might run into a hard error
+(e.g. -ENOENT) before trying to unlazy. And it's possible to end
+up with RCU pathwalk racing with rename() and umount() in a way
+that would fail with -ENOENT while non-RCU pathwalk would've
+succeeded with any timings.
+
+Once upon a time we hadn't needed that, but the analysis had been subtle,
+brittle and went out the window as soon as RENAME_EXCHANGE had been
+added.
+
+It's narrow, hard to hit and won't get you anything other than a
+stray -ENOENT that could be arranged in a much easier way with the
+same privileges, but it's a bug all the same.
+
+Cc: stable@kernel.org
+X-sky-is-falling: unlikely
+Fixes: da1ce0670c14 ("vfs: add cross-rename")
+Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
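+As a side note on the pattern being used: the fix is the classic seqcount
+read/retry idiom - sample the sequence count, read, and discard the result
+if the count changed in the meantime. The self-contained userspace C sketch
+below models that idiom with a toy sequence counter and one writer thread;
+it is only an illustration, not the kernel's seqlock_t/mount_lock
+implementation, and all names in it are made up.
+
+  #include <pthread.h>
+  #include <stdatomic.h>
+  #include <stdio.h>
+
+  static atomic_uint seq;                 /* even: stable, odd: write in progress */
+  static atomic_int data_a, data_b = 1;   /* invariant: data_b == data_a + 1 */
+
+  static void *writer(void *arg)
+  {
+      for (int i = 1; i <= 100000; i++) {
+          atomic_fetch_add(&seq, 1);      /* begin write: count goes odd */
+          atomic_store(&data_a, i);
+          atomic_store(&data_b, i + 1);
+          atomic_fetch_add(&seq, 1);      /* end write: count even again */
+      }
+      return arg;
+  }
+
+  int main(void)
+  {
+      pthread_t t;
+      pthread_create(&t, NULL, writer, NULL);
+
+      for (int i = 0; i < 100000; i++) {
+          unsigned start;
+          int a, b;
+          do {
+              do {                        /* like read_seqbegin() */
+                  start = atomic_load(&seq);
+              } while (start & 1);        /* writer in progress, spin */
+              a = atomic_load(&data_a);
+              b = atomic_load(&data_b);
+          } while (atomic_load(&seq) != start);   /* like read_seqretry() */
+
+          if (b != a + 1)                 /* never true thanks to the retry */
+              printf("torn read: a=%d b=%d\n", a, b);
+      }
+      pthread_join(t, NULL);
+      puts("done");
+      return 0;
+  }
+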
+ fs/namei.c | 2 ++
+ 1 file changed, 2 insertions(+)
+
+diff --git a/fs/namei.c b/fs/namei.c
+index 740a40802780..2fa412c5a082 100644
+--- a/fs/namei.c
++++ b/fs/namei.c
+@@ -1511,6 +1511,8 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
+ * becoming unpinned.
+ */
+ flags = dentry->d_flags;
++ if (read_seqretry(&mount_lock, nd->m_seq))
++ return false;
+ continue;
+ }
+ if (read_seqretry(&mount_lock, nd->m_seq))
+--
+2.35.1
+
--- /dev/null
+From 5f735daa405bd9da9301cbe524cf0d5239a6082d Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 21 Jul 2022 19:41:10 +0200
+Subject: ACPI: CPPC: Do not prevent CPPC from working in the future
+
+From: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+
+[ Upstream commit 4f4179fcf420873002035cf1941d844c9e0e7cb3 ]
+
+There is a problem with the current revision checks in
+is_cppc_supported(): they essentially prevent the CPPC support from
+working if a new _CPC package format revision that is a proper superset
+of v3 and only causes _CPC to return a package with more entries (while
+retaining the types and meaning of the entries defined by v3) is
+introduced in the future and used by the platform firmware.
+
+In that case, as long as the number of entries in the _CPC return
+package is at least CPPC_V3_NUM_ENT, it should be perfectly fine to
+use the v3 support code and disregard the additional package entries
+added by the new package format revision.
+
+For this reason, drop is_cppc_supported() altogether, put the revision
+checks directly into acpi_cppc_processor_probe() so they are easier to
+follow and rework them to take the case mentioned above into account.
+
+Fixes: 4773e77cdc9b ("ACPI / CPPC: Add support for CPPC v3")
+Cc: 4.18+ <stable@vger.kernel.org> # 4.18+
+Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
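+As a quick illustration of the reworked acceptance rule (exact v2/v3 entry
+counts, or any later revision with more than the v3 entries, which is then
+treated as v3), here is a small self-contained C sketch of the decision
+logic. The test values and the CPPC_V3_NUM_ENT value of 23 are assumptions
+for illustration only; see include/acpi/cppc_acpi.h for the real ones.
+
+  #include <stdbool.h>
+  #include <stdio.h>
+
+  #define CPPC_V2_REV      2
+  #define CPPC_V3_REV      3
+  #define CPPC_V2_NUM_ENT  21
+  #define CPPC_V3_NUM_ENT  23   /* assumed, see cppc_acpi.h */
+
+  /* Returns true if the _CPC package is usable; later revisions are
+   * clamped down to the known v3 layout. */
+  static bool cpc_usable(int rev, int num_ent, int *use_rev, int *use_num)
+  {
+      if (rev < CPPC_V2_REV)
+          return false;
+      if ((rev == CPPC_V2_REV && num_ent != CPPC_V2_NUM_ENT) ||
+          (rev == CPPC_V3_REV && num_ent != CPPC_V3_NUM_ENT) ||
+          (rev > CPPC_V3_REV && num_ent <= CPPC_V3_NUM_ENT))
+          return false;
+      if (rev > CPPC_V3_REV) {
+          rev = CPPC_V3_REV;
+          num_ent = CPPC_V3_NUM_ENT;
+      }
+      *use_rev = rev;
+      *use_num = num_ent;
+      return true;
+  }
+
+  int main(void)
+  {
+      const struct { int rev, num; } c[] = {
+          { 2, 21 }, { 3, 23 }, { 3, 21 }, { 4, 30 }, { 4, 20 }, { 1, 12 },
+      };
+
+      for (unsigned i = 0; i < sizeof(c) / sizeof(c[0]); i++) {
+          int r, n;
+
+          if (cpc_usable(c[i].rev, c[i].num, &r, &n))
+              printf("rev %d, %d entries -> use rev %d, %d entries\n",
+                     c[i].rev, c[i].num, r, n);
+          else
+              printf("rev %d, %d entries -> rejected\n", c[i].rev, c[i].num);
+      }
+      return 0;
+  }
+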
+ drivers/acpi/cppc_acpi.c | 54 ++++++++++++++++++----------------------
+ include/acpi/cppc_acpi.h | 2 +-
+ 2 files changed, 25 insertions(+), 31 deletions(-)
+
+diff --git a/drivers/acpi/cppc_acpi.c b/drivers/acpi/cppc_acpi.c
+index b8e26b6b5523..35d894674eba 100644
+--- a/drivers/acpi/cppc_acpi.c
++++ b/drivers/acpi/cppc_acpi.c
+@@ -600,33 +600,6 @@ static int pcc_data_alloc(int pcc_ss_id)
+ return 0;
+ }
+
+-/* Check if CPPC revision + num_ent combination is supported */
+-static bool is_cppc_supported(int revision, int num_ent)
+-{
+- int expected_num_ent;
+-
+- switch (revision) {
+- case CPPC_V2_REV:
+- expected_num_ent = CPPC_V2_NUM_ENT;
+- break;
+- case CPPC_V3_REV:
+- expected_num_ent = CPPC_V3_NUM_ENT;
+- break;
+- default:
+- pr_debug("Firmware exports unsupported CPPC revision: %d\n",
+- revision);
+- return false;
+- }
+-
+- if (expected_num_ent != num_ent) {
+- pr_debug("Firmware exports %d entries. Expected: %d for CPPC rev:%d\n",
+- num_ent, expected_num_ent, revision);
+- return false;
+- }
+-
+- return true;
+-}
+-
+ /*
+ * An example CPC table looks like the following.
+ *
+@@ -715,7 +688,6 @@ int acpi_cppc_processor_probe(struct acpi_processor *pr)
+ cpc_obj->type, pr->id);
+ goto out_free;
+ }
+- cpc_ptr->num_entries = num_ent;
+
+ /* Second entry should be revision. */
+ cpc_obj = &out_obj->package.elements[1];
+@@ -726,10 +698,32 @@ int acpi_cppc_processor_probe(struct acpi_processor *pr)
+ cpc_obj->type, pr->id);
+ goto out_free;
+ }
+- cpc_ptr->version = cpc_rev;
+
+- if (!is_cppc_supported(cpc_rev, num_ent))
++ if (cpc_rev < CPPC_V2_REV) {
++ pr_debug("Unsupported _CPC Revision (%d) for CPU:%d\n", cpc_rev,
++ pr->id);
++ goto out_free;
++ }
++
++ /*
++ * Disregard _CPC if the number of entries in the return pachage is not
++ * as expected, but support future revisions being proper supersets of
++ * the v3 and only causing more entries to be returned by _CPC.
++ */
++ if ((cpc_rev == CPPC_V2_REV && num_ent != CPPC_V2_NUM_ENT) ||
++ (cpc_rev == CPPC_V3_REV && num_ent != CPPC_V3_NUM_ENT) ||
++ (cpc_rev > CPPC_V3_REV && num_ent <= CPPC_V3_NUM_ENT)) {
++ pr_debug("Unexpected number of _CPC return package entries (%d) for CPU:%d\n",
++ num_ent, pr->id);
+ goto out_free;
++ }
++ if (cpc_rev > CPPC_V3_REV) {
++ num_ent = CPPC_V3_NUM_ENT;
++ cpc_rev = CPPC_V3_REV;
++ }
++
++ cpc_ptr->num_entries = num_ent;
++ cpc_ptr->version = cpc_rev;
+
+ /* Iterate through remaining entries in _CPC */
+ for (i = 2; i < num_ent; i++) {
+diff --git a/include/acpi/cppc_acpi.h b/include/acpi/cppc_acpi.h
+index 181907349b49..a76f8c6b732d 100644
+--- a/include/acpi/cppc_acpi.h
++++ b/include/acpi/cppc_acpi.h
+@@ -17,7 +17,7 @@
+ #include <acpi/pcc.h>
+ #include <acpi/processor.h>
+
+-/* Support CPPCv2 and CPPCv3 */
++/* CPPCv2 and CPPCv3 support */
+ #define CPPC_V2_REV 2
+ #define CPPC_V3_REV 3
+ #define CPPC_V2_NUM_ENT 21
+--
+2.35.1
+
--- /dev/null
+From 6e6c54f719b05010a877a1d77f3b5cab7c585471 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Sun, 24 Jul 2022 19:16:50 -0400
+Subject: batman-adv: tracing: Use the new __vstring() helper
+
+From: Steven Rostedt (Google) <rostedt@goodmis.org>
+
+[ Upstream commit 9abc291812d784bd4a26c01af4ebdbf9f2dbf0bb ]
+
+Instead of open coding a __dynamic_array() with a fixed length (which
+defeats the purpose of the dynamic array in the first place), use the new
+__vstring() helper, which takes a va_list and only writes as much of the
+string into the ring buffer as is needed.
+
+Link: https://lkml.kernel.org/r/20220724191650.236b1355@rorschach.local.home
+
+Cc: Marek Lindner <mareklindner@neomailbox.ch>
+Cc: Ingo Molnar <mingo@kernel.org>
+Cc: Andrew Morton <akpm@linux-foundation.org>
+Cc: Simon Wunderlich <sw@simonwunderlich.de>
+Cc: Antonio Quartulli <a@unstable.cc>
+Cc: "David S. Miller" <davem@davemloft.net>
+Cc: Eric Dumazet <edumazet@google.com>
+Cc: Jakub Kicinski <kuba@kernel.org>
+Cc: Paolo Abeni <pabeni@redhat.com>
+Cc: b.a.t.m.a.n@lists.open-mesh.org
+Cc: netdev@vger.kernel.org
+Acked-by: Sven Eckelmann <sven@narfation.org>
+Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
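+The point of __vstring()/__assign_vstr() is to size the ring-buffer entry
+from the va_list itself instead of reserving a fixed 256-byte array. A
+rough userspace analogue of that two-pass idea, using vsnprintf() to
+measure first and write second; this is only a sketch of the concept, not
+the tracing macros, and the helper name is invented:
+
+  #include <stdarg.h>
+  #include <stdio.h>
+  #include <stdlib.h>
+  #include <string.h>
+
+  /* Allocate exactly enough room for the formatted message. */
+  static char *vstr_dup(const char *fmt, ...)
+  {
+      va_list ap, ap2;
+      char *buf;
+      int len;
+
+      va_start(ap, fmt);
+      va_copy(ap2, ap);
+      len = vsnprintf(NULL, 0, fmt, ap);      /* pass 1: measure */
+      va_end(ap);
+      if (len < 0) {
+          va_end(ap2);
+          return NULL;
+      }
+      buf = malloc(len + 1);
+      if (buf)
+          vsnprintf(buf, len + 1, fmt, ap2);  /* pass 2: write */
+      va_end(ap2);
+      return buf;
+  }
+
+  int main(void)
+  {
+      char *msg = vstr_dup("batadv: %s: seqno %u", "bat0", 42u);
+
+      if (msg) {
+          printf("%s (%zu bytes used)\n", msg, strlen(msg) + 1);
+          free(msg);
+      }
+      return 0;
+  }
+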
+ net/batman-adv/trace.h | 9 ++-------
+ 1 file changed, 2 insertions(+), 7 deletions(-)
+
+diff --git a/net/batman-adv/trace.h b/net/batman-adv/trace.h
+index d673ebdd0426..31c8f922651d 100644
+--- a/net/batman-adv/trace.h
++++ b/net/batman-adv/trace.h
+@@ -28,8 +28,6 @@
+
+ #endif /* CONFIG_BATMAN_ADV_TRACING */
+
+-#define BATADV_MAX_MSG_LEN 256
+-
+ TRACE_EVENT(batadv_dbg,
+
+ TP_PROTO(struct batadv_priv *bat_priv,
+@@ -40,16 +38,13 @@ TRACE_EVENT(batadv_dbg,
+ TP_STRUCT__entry(
+ __string(device, bat_priv->soft_iface->name)
+ __string(driver, KBUILD_MODNAME)
+- __dynamic_array(char, msg, BATADV_MAX_MSG_LEN)
++ __vstring(msg, vaf->fmt, vaf->va)
+ ),
+
+ TP_fast_assign(
+ __assign_str(device, bat_priv->soft_iface->name);
+ __assign_str(driver, KBUILD_MODNAME);
+- WARN_ON_ONCE(vsnprintf(__get_dynamic_array(msg),
+- BATADV_MAX_MSG_LEN,
+- vaf->fmt,
+- *vaf->va) >= BATADV_MAX_MSG_LEN);
++ __assign_vstr(msg, vaf->fmt, vaf->va);
+ ),
+
+ TP_printk(
+--
+2.35.1
+
--- /dev/null
+From dd6495e668dadb91e946e5f118bc7f72cc8bed13 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 15 Apr 2022 06:52:46 +0200
+Subject: block: add a bdev_max_zone_append_sectors helper
+
+From: Christoph Hellwig <hch@lst.de>
+
+[ Upstream commit 2aba0d19f4d8c8929b4b3b94a9cfde2aa20e6ee2 ]
+
+Add a helper to check the max supported sectors for zone append based on
+the block_device instead of having to poke into the block layer internal
+request_queue.
+
+Signed-off-by: Christoph Hellwig <hch@lst.de>
+Acked-by: Damien Le Moal <damien.lemoal@opensource.wdc.com>
+Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
+Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
+Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
+Link: https://lore.kernel.org/r/20220415045258.199825-16-hch@lst.de
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/nvme/target/zns.c | 3 +--
+ fs/zonefs/super.c | 3 +--
+ include/linux/blkdev.h | 6 ++++++
+ 3 files changed, 8 insertions(+), 4 deletions(-)
+
+diff --git a/drivers/nvme/target/zns.c b/drivers/nvme/target/zns.c
+index e34718b09550..82b61acf7a72 100644
+--- a/drivers/nvme/target/zns.c
++++ b/drivers/nvme/target/zns.c
+@@ -34,8 +34,7 @@ static int validate_conv_zones_cb(struct blk_zone *z,
+
+ bool nvmet_bdev_zns_enable(struct nvmet_ns *ns)
+ {
+- struct request_queue *q = ns->bdev->bd_disk->queue;
+- u8 zasl = nvmet_zasl(queue_max_zone_append_sectors(q));
++ u8 zasl = nvmet_zasl(bdev_max_zone_append_sectors(ns->bdev));
+ struct gendisk *bd_disk = ns->bdev->bd_disk;
+ int ret;
+
+diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
+index 15a4c7c07a3b..b68798a572fc 100644
+--- a/fs/zonefs/super.c
++++ b/fs/zonefs/super.c
+@@ -723,13 +723,12 @@ static ssize_t zonefs_file_dio_append(struct kiocb *iocb, struct iov_iter *from)
+ struct inode *inode = file_inode(iocb->ki_filp);
+ struct zonefs_inode_info *zi = ZONEFS_I(inode);
+ struct block_device *bdev = inode->i_sb->s_bdev;
+- unsigned int max;
++ unsigned int max = bdev_max_zone_append_sectors(bdev);
+ struct bio *bio;
+ ssize_t size;
+ int nr_pages;
+ ssize_t ret;
+
+- max = queue_max_zone_append_sectors(bdev_get_queue(bdev));
+ max = ALIGN_DOWN(max << SECTOR_SHIFT, inode->i_sb->s_blocksize);
+ iov_iter_truncate(from, max);
+
+diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
+index cc6b24a5098f..34f2b88dfd6e 100644
+--- a/include/linux/blkdev.h
++++ b/include/linux/blkdev.h
+@@ -1190,6 +1190,12 @@ static inline unsigned int queue_max_zone_append_sectors(const struct request_qu
+ return min(l->max_zone_append_sectors, l->max_sectors);
+ }
+
++static inline unsigned int
++bdev_max_zone_append_sectors(struct block_device *bdev)
++{
++ return queue_max_zone_append_sectors(bdev_get_queue(bdev));
++}
++
+ static inline unsigned queue_logical_block_size(const struct request_queue *q)
+ {
+ int retval = 512;
+--
+2.35.1
+
--- /dev/null
+From 065935c8b7fbf75e3eb0c7a9d9f88ff921b1c9a4 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Sat, 9 Jul 2022 08:18:38 +0900
+Subject: block: add bdev_max_segments() helper
+
+From: Naohiro Aota <naohiro.aota@wdc.com>
+
+[ Upstream commit 65ea1b66482f415d51cd46515b02477257330339 ]
+
+Add bdev_max_segments() like other queue parameters.
+
+Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
+Reviewed-by: Jens Axboe <axboe@kernel.dk>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ include/linux/blkdev.h | 5 +++++
+ 1 file changed, 5 insertions(+)
+
+diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
+index 34f2b88dfd6e..7927480b9cf7 100644
+--- a/include/linux/blkdev.h
++++ b/include/linux/blkdev.h
+@@ -1196,6 +1196,11 @@ bdev_max_zone_append_sectors(struct block_device *bdev)
+ return queue_max_zone_append_sectors(bdev_get_queue(bdev));
+ }
+
++static inline unsigned int bdev_max_segments(struct block_device *bdev)
++{
++ return queue_max_segments(bdev_get_queue(bdev));
++}
++
+ static inline unsigned queue_logical_block_size(const struct request_queue *q)
+ {
+ int retval = 512;
+--
+2.35.1
+
--- /dev/null
+From 9d6969d9e591d57389ab123fdc51c860fd939781 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 20 Jul 2022 17:36:16 +0800
+Subject: block: don't allow the same type rq_qos add more than once
+
+From: Jinke Han <hanjinke.666@bytedance.com>
+
+[ Upstream commit 14a6e2eb7df5c7897c15b109cba29ab0c4a791b6 ]
+
+In our test of iocost, we encountered some list add/del corruptions of
+inner_walk list in ioc_timer_fn.
+
+The reason can be described as follows:
+
+cpu 0 cpu 1
+ioc_qos_write ioc_qos_write
+
+ioc = q_to_ioc(queue);
+if (!ioc) {
+ ioc = kzalloc();
+ ioc = q_to_ioc(queue);
+ if (!ioc) {
+ ioc = kzalloc();
+ ...
+ rq_qos_add(q, rqos);
+ }
+ ...
+ rq_qos_add(q, rqos);
+ ...
+}
+
+When the io.cost.qos file is written by two cpus concurrently, rq_qos may
+be added to one disk twice. In that case, there will be two iocs enabled
+and running on one disk. They own different iocgs on their active list. In
+the ioc_timer_fn function, because the iocgs from the two iocs have the
+same root iocg, the root iocg's walk_list may be overwritten by each other
+and this leads to list add/del corruptions in building or destroying the
+inner_walk list.
+
+And so far, the blk-rq-qos framework only works with one rq_qos instance
+of a given type per queue by default. This patch makes this explicit and
+also fixes the crash above.
+
+Signed-off-by: Jinke Han <hanjinke.666@bytedance.com>
+Reviewed-by: Muchun Song <songmuchun@bytedance.com>
+Acked-by: Tejun Heo <tj@kernel.org>
+Cc: <stable@vger.kernel.org>
+Link: https://lore.kernel.org/r/20220720093616.70584-1-hanjinke.666@bytedance.com
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
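+The essence of the fix is to make the lookup and the insertion one atomic
+step with respect to q->queue_lock, so a second rq_qos of the same id gets
+-EBUSY instead of being linked in twice. A minimal single-threaded C sketch
+of that check-then-insert pattern on a toy list (illustrative types only,
+not the block layer structures; the kernel relies on the queue lock to keep
+the check and the insert together):
+
+  #include <errno.h>
+  #include <stddef.h>
+  #include <stdio.h>
+
+  struct qos {
+      int id;
+      struct qos *next;
+  };
+
+  static struct qos *qos_list;
+
+  static struct qos *qos_find(int id)
+  {
+      for (struct qos *q = qos_list; q; q = q->next)
+          if (q->id == id)
+              return q;
+      return NULL;
+  }
+
+  /* Like the patched rq_qos_add(): refuse a duplicate of the same id. */
+  static int qos_add(struct qos *q)
+  {
+      if (qos_find(q->id))
+          return -EBUSY;
+      q->next = qos_list;
+      qos_list = q;
+      return 0;
+  }
+
+  int main(void)
+  {
+      struct qos iocost_a = { .id = 1 }, iocost_b = { .id = 1 };
+
+      printf("first add:  %d\n", qos_add(&iocost_a));  /* 0 */
+      printf("second add: %d\n", qos_add(&iocost_b));  /* -EBUSY */
+      return 0;
+  }
+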
+ block/blk-iocost.c | 20 +++++++++++++-------
+ block/blk-iolatency.c | 18 +++++++++++-------
+ block/blk-rq-qos.h | 11 ++++++++++-
+ block/blk-wbt.c | 12 +++++++++++-
+ 4 files changed, 45 insertions(+), 16 deletions(-)
+
+diff --git a/block/blk-iocost.c b/block/blk-iocost.c
+index 16705fbd0699..a19f2db4eeb2 100644
+--- a/block/blk-iocost.c
++++ b/block/blk-iocost.c
+@@ -2893,15 +2893,21 @@ static int blk_iocost_init(struct request_queue *q)
+ * called before policy activation completion, can't assume that the
+ * target bio has an iocg associated and need to test for NULL iocg.
+ */
+- rq_qos_add(q, rqos);
++ ret = rq_qos_add(q, rqos);
++ if (ret)
++ goto err_free_ioc;
++
+ ret = blkcg_activate_policy(q, &blkcg_policy_iocost);
+- if (ret) {
+- rq_qos_del(q, rqos);
+- free_percpu(ioc->pcpu_stat);
+- kfree(ioc);
+- return ret;
+- }
++ if (ret)
++ goto err_del_qos;
+ return 0;
++
++err_del_qos:
++ rq_qos_del(q, rqos);
++err_free_ioc:
++ free_percpu(ioc->pcpu_stat);
++ kfree(ioc);
++ return ret;
+ }
+
+ static struct blkcg_policy_data *ioc_cpd_alloc(gfp_t gfp)
+diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c
+index 9568bf8dfe82..7845dca5fcfd 100644
+--- a/block/blk-iolatency.c
++++ b/block/blk-iolatency.c
+@@ -773,19 +773,23 @@ int blk_iolatency_init(struct request_queue *q)
+ rqos->ops = &blkcg_iolatency_ops;
+ rqos->q = q;
+
+- rq_qos_add(q, rqos);
+-
++ ret = rq_qos_add(q, rqos);
++ if (ret)
++ goto err_free;
+ ret = blkcg_activate_policy(q, &blkcg_policy_iolatency);
+- if (ret) {
+- rq_qos_del(q, rqos);
+- kfree(blkiolat);
+- return ret;
+- }
++ if (ret)
++ goto err_qos_del;
+
+ timer_setup(&blkiolat->timer, blkiolatency_timer_fn, 0);
+ INIT_WORK(&blkiolat->enable_work, blkiolatency_enable_work_fn);
+
+ return 0;
++
++err_qos_del:
++ rq_qos_del(q, rqos);
++err_free:
++ kfree(blkiolat);
++ return ret;
+ }
+
+ static void iolatency_set_min_lat_nsec(struct blkcg_gq *blkg, u64 val)
+diff --git a/block/blk-rq-qos.h b/block/blk-rq-qos.h
+index 0e46052b018a..08b856570ad1 100644
+--- a/block/blk-rq-qos.h
++++ b/block/blk-rq-qos.h
+@@ -86,7 +86,7 @@ static inline void rq_wait_init(struct rq_wait *rq_wait)
+ init_waitqueue_head(&rq_wait->wait);
+ }
+
+-static inline void rq_qos_add(struct request_queue *q, struct rq_qos *rqos)
++static inline int rq_qos_add(struct request_queue *q, struct rq_qos *rqos)
+ {
+ /*
+ * No IO can be in-flight when adding rqos, so freeze queue, which
+@@ -98,6 +98,8 @@ static inline void rq_qos_add(struct request_queue *q, struct rq_qos *rqos)
+ blk_mq_freeze_queue(q);
+
+ spin_lock_irq(&q->queue_lock);
++ if (rq_qos_id(q, rqos->id))
++ goto ebusy;
+ rqos->next = q->rq_qos;
+ q->rq_qos = rqos;
+ spin_unlock_irq(&q->queue_lock);
+@@ -109,6 +111,13 @@ static inline void rq_qos_add(struct request_queue *q, struct rq_qos *rqos)
+ blk_mq_debugfs_register_rqos(rqos);
+ mutex_unlock(&q->debugfs_mutex);
+ }
++
++ return 0;
++ebusy:
++ spin_unlock_irq(&q->queue_lock);
++ blk_mq_unfreeze_queue(q);
++ return -EBUSY;
++
+ }
+
+ static inline void rq_qos_del(struct request_queue *q, struct rq_qos *rqos)
+diff --git a/block/blk-wbt.c b/block/blk-wbt.c
+index 0c119be0e813..ae6ea0b54579 100644
+--- a/block/blk-wbt.c
++++ b/block/blk-wbt.c
+@@ -820,6 +820,7 @@ int wbt_init(struct request_queue *q)
+ {
+ struct rq_wb *rwb;
+ int i;
++ int ret;
+
+ rwb = kzalloc(sizeof(*rwb), GFP_KERNEL);
+ if (!rwb)
+@@ -846,7 +847,10 @@ int wbt_init(struct request_queue *q)
+ /*
+ * Assign rwb and add the stats callback.
+ */
+- rq_qos_add(q, &rwb->rqos);
++ ret = rq_qos_add(q, &rwb->rqos);
++ if (ret)
++ goto err_free;
++
+ blk_stat_add_callback(q, rwb->cb);
+
+ rwb->min_lat_nsec = wbt_default_latency_nsec(q);
+@@ -855,4 +859,10 @@ int wbt_init(struct request_queue *q)
+ wbt_set_write_cache(q, test_bit(QUEUE_FLAG_WC, &q->queue_flags));
+
+ return 0;
++
++err_free:
++ blk_stat_free_callback(rwb->cb);
++ kfree(rwb);
++ return ret;
++
+ }
+--
+2.35.1
+
--- /dev/null
+From de8ba1b6410a2cffe26c1f98058fd39d39dc0e62 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 14 Jun 2022 09:48:25 +0200
+Subject: block: serialize all debugfs operations using q->debugfs_mutex
+
+From: Christoph Hellwig <hch@lst.de>
+
+[ Upstream commit 5cf9c91ba927119fc6606b938b1895bb2459d3bc ]
+
+Various places like I/O schedulers or the QOS infrastructure try to
+register debugfs files on demand, which can race with creating and
+removing the main queue debugfs directory. Use the existing
+debugfs_mutex to serialize all debugfs operations that rely on
+q->debugfs_dir or the directories hanging off it.
+
+To make the teardown code a little simpler, declare all debugfs dentry
+pointers, and not just the main one, unconditionally in blkdev.h.
+
+Move debugfs_mutex next to the dentries that it protects and document
+what it is used for.
+
+Signed-off-by: Christoph Hellwig <hch@lst.de>
+Link: https://lore.kernel.org/r/20220614074827.458955-3-hch@lst.de
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
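+The rule after this change is simple: anything that creates or removes
+entries under q->debugfs_dir does so with q->debugfs_mutex held, and the
+unregister paths tolerate the parent directory already being gone. A small
+userspace sketch of that discipline, with a pthread mutex and plain
+pointers standing in for the dentries (all names invented; this is not the
+block layer code):
+
+  #include <pthread.h>
+  #include <stdio.h>
+  #include <stdlib.h>
+  #include <string.h>
+
+  static pthread_mutex_t debugfs_mutex = PTHREAD_MUTEX_INITIALIZER;
+  static char *debugfs_dir;   /* stands in for q->debugfs_dir */
+  static char *sched_dir;     /* stands in for q->sched_debugfs_dir */
+
+  static void register_sched(void)
+  {
+      pthread_mutex_lock(&debugfs_mutex);
+      if (debugfs_dir)              /* parent may not exist yet */
+          sched_dir = strdup("sched");
+      pthread_mutex_unlock(&debugfs_mutex);
+  }
+
+  static void unregister_sched(void)
+  {
+      pthread_mutex_lock(&debugfs_mutex);
+      free(sched_dir);              /* like debugfs_remove_recursive() */
+      sched_dir = NULL;
+      pthread_mutex_unlock(&debugfs_mutex);
+  }
+
+  int main(void)
+  {
+      pthread_mutex_lock(&debugfs_mutex);
+      debugfs_dir = strdup("block/nvme0n1");
+      pthread_mutex_unlock(&debugfs_mutex);
+
+      register_sched();
+      unregister_sched();
+
+      pthread_mutex_lock(&debugfs_mutex);
+      free(debugfs_dir);
+      debugfs_dir = NULL;
+      pthread_mutex_unlock(&debugfs_mutex);
+
+      puts("all debugfs-style updates went through one mutex");
+      return 0;
+  }
+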
+ block/blk-mq-debugfs.c | 25 ++++++++++++++++++++-----
+ block/blk-mq-debugfs.h | 5 -----
+ block/blk-mq-sched.c | 11 +++++++++++
+ block/blk-rq-qos.c | 2 ++
+ block/blk-rq-qos.h | 7 ++++++-
+ block/blk-sysfs.c | 20 +++++++++-----------
+ include/linux/blkdev.h | 8 ++++----
+ kernel/trace/blktrace.c | 3 ---
+ 8 files changed, 52 insertions(+), 29 deletions(-)
+
+diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
+index 34bee263936c..d491b6eb0ab9 100644
+--- a/block/blk-mq-debugfs.c
++++ b/block/blk-mq-debugfs.c
+@@ -713,11 +713,6 @@ void blk_mq_debugfs_register(struct request_queue *q)
+ }
+ }
+
+-void blk_mq_debugfs_unregister(struct request_queue *q)
+-{
+- q->sched_debugfs_dir = NULL;
+-}
+-
+ static void blk_mq_debugfs_register_ctx(struct blk_mq_hw_ctx *hctx,
+ struct blk_mq_ctx *ctx)
+ {
+@@ -751,6 +746,8 @@ void blk_mq_debugfs_register_hctx(struct request_queue *q,
+
+ void blk_mq_debugfs_unregister_hctx(struct blk_mq_hw_ctx *hctx)
+ {
++ if (!hctx->queue->debugfs_dir)
++ return;
+ debugfs_remove_recursive(hctx->debugfs_dir);
+ hctx->sched_debugfs_dir = NULL;
+ hctx->debugfs_dir = NULL;
+@@ -778,6 +775,8 @@ void blk_mq_debugfs_register_sched(struct request_queue *q)
+ {
+ struct elevator_type *e = q->elevator->type;
+
++ lockdep_assert_held(&q->debugfs_mutex);
++
+ /*
+ * If the parent directory has not been created yet, return, we will be
+ * called again later on and the directory/files will be created then.
+@@ -795,6 +794,8 @@ void blk_mq_debugfs_register_sched(struct request_queue *q)
+
+ void blk_mq_debugfs_unregister_sched(struct request_queue *q)
+ {
++ lockdep_assert_held(&q->debugfs_mutex);
++
+ debugfs_remove_recursive(q->sched_debugfs_dir);
+ q->sched_debugfs_dir = NULL;
+ }
+@@ -816,6 +817,10 @@ static const char *rq_qos_id_to_name(enum rq_qos_id id)
+
+ void blk_mq_debugfs_unregister_rqos(struct rq_qos *rqos)
+ {
++ lockdep_assert_held(&rqos->q->debugfs_mutex);
++
++ if (!rqos->q->debugfs_dir)
++ return;
+ debugfs_remove_recursive(rqos->debugfs_dir);
+ rqos->debugfs_dir = NULL;
+ }
+@@ -825,6 +830,8 @@ void blk_mq_debugfs_register_rqos(struct rq_qos *rqos)
+ struct request_queue *q = rqos->q;
+ const char *dir_name = rq_qos_id_to_name(rqos->id);
+
++ lockdep_assert_held(&q->debugfs_mutex);
++
+ if (rqos->debugfs_dir || !rqos->ops->debugfs_attrs)
+ return;
+
+@@ -840,6 +847,8 @@ void blk_mq_debugfs_register_rqos(struct rq_qos *rqos)
+
+ void blk_mq_debugfs_unregister_queue_rqos(struct request_queue *q)
+ {
++ lockdep_assert_held(&q->debugfs_mutex);
++
+ debugfs_remove_recursive(q->rqos_debugfs_dir);
+ q->rqos_debugfs_dir = NULL;
+ }
+@@ -849,6 +858,8 @@ void blk_mq_debugfs_register_sched_hctx(struct request_queue *q,
+ {
+ struct elevator_type *e = q->elevator->type;
+
++ lockdep_assert_held(&q->debugfs_mutex);
++
+ /*
+ * If the parent debugfs directory has not been created yet, return;
+ * We will be called again later on with appropriate parent debugfs
+@@ -868,6 +879,10 @@ void blk_mq_debugfs_register_sched_hctx(struct request_queue *q,
+
+ void blk_mq_debugfs_unregister_sched_hctx(struct blk_mq_hw_ctx *hctx)
+ {
++ lockdep_assert_held(&hctx->queue->debugfs_mutex);
++
++ if (!hctx->queue->debugfs_dir)
++ return;
+ debugfs_remove_recursive(hctx->sched_debugfs_dir);
+ hctx->sched_debugfs_dir = NULL;
+ }
+diff --git a/block/blk-mq-debugfs.h b/block/blk-mq-debugfs.h
+index 69918f4170d6..771d45832878 100644
+--- a/block/blk-mq-debugfs.h
++++ b/block/blk-mq-debugfs.h
+@@ -21,7 +21,6 @@ int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq);
+ int blk_mq_debugfs_rq_show(struct seq_file *m, void *v);
+
+ void blk_mq_debugfs_register(struct request_queue *q);
+-void blk_mq_debugfs_unregister(struct request_queue *q);
+ void blk_mq_debugfs_register_hctx(struct request_queue *q,
+ struct blk_mq_hw_ctx *hctx);
+ void blk_mq_debugfs_unregister_hctx(struct blk_mq_hw_ctx *hctx);
+@@ -42,10 +41,6 @@ static inline void blk_mq_debugfs_register(struct request_queue *q)
+ {
+ }
+
+-static inline void blk_mq_debugfs_unregister(struct request_queue *q)
+-{
+-}
+-
+ static inline void blk_mq_debugfs_register_hctx(struct request_queue *q,
+ struct blk_mq_hw_ctx *hctx)
+ {
+diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
+index 9e56a69422b6..e84bec39fd3a 100644
+--- a/block/blk-mq-sched.c
++++ b/block/blk-mq-sched.c
+@@ -593,7 +593,9 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
+ if (ret)
+ goto err_free_map_and_rqs;
+
++ mutex_lock(&q->debugfs_mutex);
+ blk_mq_debugfs_register_sched(q);
++ mutex_unlock(&q->debugfs_mutex);
+
+ queue_for_each_hw_ctx(q, hctx, i) {
+ if (e->ops.init_hctx) {
+@@ -606,7 +608,9 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
+ return ret;
+ }
+ }
++ mutex_lock(&q->debugfs_mutex);
+ blk_mq_debugfs_register_sched_hctx(q, hctx);
++ mutex_unlock(&q->debugfs_mutex);
+ }
+
+ return 0;
+@@ -647,14 +651,21 @@ void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e)
+ unsigned int flags = 0;
+
+ queue_for_each_hw_ctx(q, hctx, i) {
++ mutex_lock(&q->debugfs_mutex);
+ blk_mq_debugfs_unregister_sched_hctx(hctx);
++ mutex_unlock(&q->debugfs_mutex);
++
+ if (e->type->ops.exit_hctx && hctx->sched_data) {
+ e->type->ops.exit_hctx(hctx, i);
+ hctx->sched_data = NULL;
+ }
+ flags = hctx->flags;
+ }
++
++ mutex_lock(&q->debugfs_mutex);
+ blk_mq_debugfs_unregister_sched(q);
++ mutex_unlock(&q->debugfs_mutex);
++
+ if (e->type->ops.exit_sched)
+ e->type->ops.exit_sched(e);
+ blk_mq_sched_tags_teardown(q, flags);
+diff --git a/block/blk-rq-qos.c b/block/blk-rq-qos.c
+index e83af7bc7591..249a6f05dd3b 100644
+--- a/block/blk-rq-qos.c
++++ b/block/blk-rq-qos.c
+@@ -294,7 +294,9 @@ void rq_qos_wait(struct rq_wait *rqw, void *private_data,
+
+ void rq_qos_exit(struct request_queue *q)
+ {
++ mutex_lock(&q->debugfs_mutex);
+ blk_mq_debugfs_unregister_queue_rqos(q);
++ mutex_unlock(&q->debugfs_mutex);
+
+ while (q->rq_qos) {
+ struct rq_qos *rqos = q->rq_qos;
+diff --git a/block/blk-rq-qos.h b/block/blk-rq-qos.h
+index 68267007da1c..0e46052b018a 100644
+--- a/block/blk-rq-qos.h
++++ b/block/blk-rq-qos.h
+@@ -104,8 +104,11 @@ static inline void rq_qos_add(struct request_queue *q, struct rq_qos *rqos)
+
+ blk_mq_unfreeze_queue(q);
+
+- if (rqos->ops->debugfs_attrs)
++ if (rqos->ops->debugfs_attrs) {
++ mutex_lock(&q->debugfs_mutex);
+ blk_mq_debugfs_register_rqos(rqos);
++ mutex_unlock(&q->debugfs_mutex);
++ }
+ }
+
+ static inline void rq_qos_del(struct request_queue *q, struct rq_qos *rqos)
+@@ -129,7 +132,9 @@ static inline void rq_qos_del(struct request_queue *q, struct rq_qos *rqos)
+
+ blk_mq_unfreeze_queue(q);
+
++ mutex_lock(&q->debugfs_mutex);
+ blk_mq_debugfs_unregister_rqos(rqos);
++ mutex_unlock(&q->debugfs_mutex);
+ }
+
+ typedef bool (acquire_inflight_cb_t)(struct rq_wait *rqw, void *private_data);
+diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
+index 88bd41d4cb59..6e4801b217a7 100644
+--- a/block/blk-sysfs.c
++++ b/block/blk-sysfs.c
+@@ -779,14 +779,13 @@ static void blk_release_queue(struct kobject *kobj)
+ if (queue_is_mq(q))
+ blk_mq_release(q);
+
+- blk_trace_shutdown(q);
+ mutex_lock(&q->debugfs_mutex);
++ blk_trace_shutdown(q);
+ debugfs_remove_recursive(q->debugfs_dir);
++ q->debugfs_dir = NULL;
++ q->sched_debugfs_dir = NULL;
+ mutex_unlock(&q->debugfs_mutex);
+
+- if (queue_is_mq(q))
+- blk_mq_debugfs_unregister(q);
+-
+ bioset_exit(&q->bio_split);
+
+ if (blk_queue_has_srcu(q))
+@@ -836,17 +835,16 @@ int blk_register_queue(struct gendisk *disk)
+ goto unlock;
+ }
+
++ if (queue_is_mq(q))
++ __blk_mq_register_dev(dev, q);
++ mutex_lock(&q->sysfs_lock);
++
+ mutex_lock(&q->debugfs_mutex);
+ q->debugfs_dir = debugfs_create_dir(kobject_name(q->kobj.parent),
+ blk_debugfs_root);
+- mutex_unlock(&q->debugfs_mutex);
+-
+- if (queue_is_mq(q)) {
+- __blk_mq_register_dev(dev, q);
++ if (queue_is_mq(q))
+ blk_mq_debugfs_register(q);
+- }
+-
+- mutex_lock(&q->sysfs_lock);
++ mutex_unlock(&q->debugfs_mutex);
+
+ ret = disk_register_independent_access_ranges(disk, NULL);
+ if (ret)
+diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
+index 108e3d114bfc..cc6b24a5098f 100644
+--- a/include/linux/blkdev.h
++++ b/include/linux/blkdev.h
+@@ -466,7 +466,6 @@ struct request_queue {
+ #endif /* CONFIG_BLK_DEV_ZONED */
+
+ int node;
+- struct mutex debugfs_mutex;
+ #ifdef CONFIG_BLK_DEV_IO_TRACE
+ struct blk_trace __rcu *blk_trace;
+ #endif
+@@ -510,11 +509,12 @@ struct request_queue {
+ struct bio_set bio_split;
+
+ struct dentry *debugfs_dir;
+-
+-#ifdef CONFIG_BLK_DEBUG_FS
+ struct dentry *sched_debugfs_dir;
+ struct dentry *rqos_debugfs_dir;
+-#endif
++ /*
++ * Serializes all debugfs metadata operations using the above dentries.
++ */
++ struct mutex debugfs_mutex;
+
+ bool mq_sysfs_init_done;
+
+diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
+index f22219495541..f0500b5cfefe 100644
+--- a/kernel/trace/blktrace.c
++++ b/kernel/trace/blktrace.c
+@@ -770,14 +770,11 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
+ **/
+ void blk_trace_shutdown(struct request_queue *q)
+ {
+- mutex_lock(&q->debugfs_mutex);
+ if (rcu_dereference_protected(q->blk_trace,
+ lockdep_is_held(&q->debugfs_mutex))) {
+ __blk_trace_startstop(q, 0);
+ __blk_trace_remove(q);
+ }
+-
+- mutex_unlock(&q->debugfs_mutex);
+ }
+
+ #ifdef CONFIG_BLK_CGROUP
+--
+2.35.1
+
--- /dev/null
+From e082d91f540e4ca12f8ed8aad09d7fc71a7d45de Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 21 Jun 2022 15:40:59 +0900
+Subject: btrfs: ensure pages are unlocked on cow_file_range() failure
+
+From: Naohiro Aota <naohiro.aota@wdc.com>
+
+[ Upstream commit 9ce7466f372d83054c7494f6b3e4b9abaf3f0355 ]
+
+There is a hung_task report on zoned btrfs like below.
+
+https://github.com/naota/linux/issues/59
+
+ [726.328648] INFO: task rocksdb:high0:11085 blocked for more than 241 seconds.
+ [726.329839] Not tainted 5.16.0-rc1+ #1
+ [726.330484] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
+ [726.331603] task:rocksdb:high0 state:D stack: 0 pid:11085 ppid: 11082 flags:0x00000000
+ [726.331608] Call Trace:
+ [726.331611] <TASK>
+ [726.331614] __schedule+0x2e5/0x9d0
+ [726.331622] schedule+0x58/0xd0
+ [726.331626] io_schedule+0x3f/0x70
+ [726.331629] __folio_lock+0x125/0x200
+ [726.331634] ? find_get_entries+0x1bc/0x240
+ [726.331638] ? filemap_invalidate_unlock_two+0x40/0x40
+ [726.331642] truncate_inode_pages_range+0x5b2/0x770
+ [726.331649] truncate_inode_pages_final+0x44/0x50
+ [726.331653] btrfs_evict_inode+0x67/0x480
+ [726.331658] evict+0xd0/0x180
+ [726.331661] iput+0x13f/0x200
+ [726.331664] do_unlinkat+0x1c0/0x2b0
+ [726.331668] __x64_sys_unlink+0x23/0x30
+ [726.331670] do_syscall_64+0x3b/0xc0
+ [726.331674] entry_SYSCALL_64_after_hwframe+0x44/0xae
+ [726.331677] RIP: 0033:0x7fb9490a171b
+ [726.331681] RSP: 002b:00007fb943ffac68 EFLAGS: 00000246 ORIG_RAX: 0000000000000057
+ [726.331684] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007fb9490a171b
+ [726.331686] RDX: 00007fb943ffb040 RSI: 000055a6bbe6ec20 RDI: 00007fb94400d300
+ [726.331687] RBP: 00007fb943ffad00 R08: 0000000000000000 R09: 0000000000000000
+ [726.331688] R10: 0000000000000031 R11: 0000000000000246 R12: 00007fb943ffb000
+ [726.331690] R13: 00007fb943ffb040 R14: 0000000000000000 R15: 00007fb943ffd260
+ [726.331693] </TASK>
+
+While debugging the issue, we found that running fstests generic/551 on a
+5GB non-zoned null_blk device in the emulated zoned mode also hit a
+similar hang.
+
+Also, we can reproduce the same symptom with an error injected
+cow_file_range() setup.
+
+The hang occurs when cow_file_range() fails in the middle of
+allocation. cow_file_range() called from do_allocation_zoned() can
+split the given region ([start, end]) for allocation depending on the
+current block group usage. When btrfs can allocate bytes for one part
+of the split regions but fails for the other region (e.g. because of
+-ENOSPC), we return the error leaving the pages in the successful regions
+locked. Technically, this occurs only when @unlock == 0. Otherwise, we
+unlock the pages in an allocated region after creating an ordered
+extent.
+
+Considering the callers of cow_file_range(unlock=0) won't write out
+the pages, we can unlock the pages on error exit from
+cow_file_range(). So, we can ensure all the pages except @locked_page
+are unlocked in the error case.
+
+In summary, cow_file_range now behaves like this:
+
+- page_started == 1 (return value)
+ - All the pages are unlocked. IO is started.
+- unlock == 1
+ - All the pages except @locked_page are unlocked in any case
+- unlock == 0
+ - On success, all the pages are locked for writing them out
+ - On failure, all the pages except @locked_page are unlocked
+
+Fixes: 42c011000963 ("btrfs: zoned: introduce dedicated data write path for zoned filesystems")
+CC: stable@vger.kernel.org # 5.12+
+Reviewed-by: Filipe Manana <fdmanana@suse.com>
+Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
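+A concrete (made-up) walk through the three cleanup cases: suppose
+cow_file_range() is asked to cow [0, 4M) with unlock == 0, creates an
+ordered extent for the first 1M, reserves the next 1M but fails to create
+its ordered extent, and never reaches the last 2M. Region (1) is then
+[0, 1M): its ordered extent is left for btrfs_cleanup_ordered_extents(),
+but its pages (except @locked_page) still have to be unlocked here.
+Region (2) is [1M, 2M): its reservation already moved to bytes_reserved,
+so it is cleaned up without EXTENT_CLEAR_DATA_RESV. Region (3) is
+[2M, 4M): it was never touched, so EXTENT_CLEAR_DATA_RESV is added to give
+back the data reservation taken in btrfs_check_data_free_space().
+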
+ fs/btrfs/inode.c | 72 ++++++++++++++++++++++++++++++++++++++++++------
+ 1 file changed, 64 insertions(+), 8 deletions(-)
+
+diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
+index 5d15e374d032..54afa9e538c5 100644
+--- a/fs/btrfs/inode.c
++++ b/fs/btrfs/inode.c
+@@ -1097,6 +1097,28 @@ static u64 get_extent_allocation_hint(struct btrfs_inode *inode, u64 start,
+ * *page_started is set to one if we unlock locked_page and do everything
+ * required to start IO on it. It may be clean and already done with
+ * IO when we return.
++ *
++ * When unlock == 1, we unlock the pages in successfully allocated regions.
++ * When unlock == 0, we leave them locked for writing them out.
++ *
++ * However, we unlock all the pages except @locked_page in case of failure.
++ *
++ * In summary, page locking state will be as follow:
++ *
++ * - page_started == 1 (return value)
++ * - All the pages are unlocked. IO is started.
++ * - Note that this can happen only on success
++ * - unlock == 1
++ * - All the pages except @locked_page are unlocked in any case
++ * - unlock == 0
++ * - On success, all the pages are locked for writing out them
++ * - On failure, all the pages except @locked_page are unlocked
++ *
++ * When a failure happens in the second or later iteration of the
++ * while-loop, the ordered extents created in previous iterations are kept
++ * intact. So, the caller must clean them up by calling
++ * btrfs_cleanup_ordered_extents(). See btrfs_run_delalloc_range() for
++ * example.
+ */
+ static noinline int cow_file_range(struct btrfs_inode *inode,
+ struct page *locked_page,
+@@ -1106,6 +1128,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
+ struct btrfs_root *root = inode->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ u64 alloc_hint = 0;
++ u64 orig_start = start;
+ u64 num_bytes;
+ unsigned long ram_size;
+ u64 cur_alloc_size = 0;
+@@ -1293,18 +1316,44 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
+ btrfs_dec_block_group_reservations(fs_info, ins.objectid);
+ btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
+ out_unlock:
++ /*
++ * Now, we have three regions to clean up:
++ *
++ * |-------(1)----|---(2)---|-------------(3)----------|
++ * `- orig_start `- start `- start + cur_alloc_size `- end
++ *
++ * We process each region below.
++ */
++
+ clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
+ EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV;
+ page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK;
++
+ /*
+- * If we reserved an extent for our delalloc range (or a subrange) and
+- * failed to create the respective ordered extent, then it means that
+- * when we reserved the extent we decremented the extent's size from
+- * the data space_info's bytes_may_use counter and incremented the
+- * space_info's bytes_reserved counter by the same amount. We must make
+- * sure extent_clear_unlock_delalloc() does not try to decrement again
+- * the data space_info's bytes_may_use counter, therefore we do not pass
+- * it the flag EXTENT_CLEAR_DATA_RESV.
++ * For the range (1). We have already instantiated the ordered extents
++ * for this region. They are cleaned up by
++ * btrfs_cleanup_ordered_extents() in e.g,
++ * btrfs_run_delalloc_range(). EXTENT_LOCKED | EXTENT_DELALLOC are
++ * already cleared in the above loop. And, EXTENT_DELALLOC_NEW |
++ * EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV are handled by the cleanup
++ * function.
++ *
++ * However, in case of unlock == 0, we still need to unlock the pages
++ * (except @locked_page) to ensure all the pages are unlocked.
++ */
++ if (!unlock && orig_start < start)
++ extent_clear_unlock_delalloc(inode, orig_start, start - 1,
++ locked_page, 0, page_ops);
++
++ /*
++ * For the range (2). If we reserved an extent for our delalloc range
++ * (or a subrange) and failed to create the respective ordered extent,
++ * then it means that when we reserved the extent we decremented the
++ * extent's size from the data space_info's bytes_may_use counter and
++ * incremented the space_info's bytes_reserved counter by the same
++ * amount. We must make sure extent_clear_unlock_delalloc() does not try
++ * to decrement again the data space_info's bytes_may_use counter,
++ * therefore we do not pass it the flag EXTENT_CLEAR_DATA_RESV.
+ */
+ if (extent_reserved) {
+ extent_clear_unlock_delalloc(inode, start,
+@@ -1316,6 +1365,13 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
+ if (start >= end)
+ goto out;
+ }
++
++ /*
++ * For the range (3). We never touched the region. In addition to the
++ * clear_bits above, we add EXTENT_CLEAR_DATA_RESV to release the data
++ * space_info's bytes_may_use counter, reserved in
++ * btrfs_check_data_free_space().
++ */
+ extent_clear_unlock_delalloc(inode, start, end, locked_page,
+ clear_bits | EXTENT_CLEAR_DATA_RESV,
+ page_ops);
+--
+2.35.1
+
--- /dev/null
+From db0a5d9ef124f104269150d76fb2bcbc29e5293a Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 21 Jun 2022 15:41:01 +0900
+Subject: btrfs: fix error handling of fallback uncompress write
+
+From: Naohiro Aota <naohiro.aota@wdc.com>
+
+[ Upstream commit 71aa147b4d9d81fa65afa6016f50d7818b64a54f ]
+
+When cow_file_range() fails in the middle of the allocation loop, it
+unlocks the pages but leaves the ordered extents intact. Thus, we need
+to call btrfs_cleanup_ordered_extents() to finish the created ordered
+extents.
+
+Also, we need to call end_extent_writepage() if locked_page is available
+because btrfs_cleanup_ordered_extents() never processes the region on
+the locked_page.
+
+Furthermore, we need to set the mapping as error if locked_page is
+unavailable before unlocking the pages, so that the errno is properly
+propagated to the user space.
+
+CC: stable@vger.kernel.org # 5.18+
+Reviewed-by: Filipe Manana <fdmanana@suse.com>
+Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/btrfs/inode.c | 17 +++++++++++++++--
+ 1 file changed, 15 insertions(+), 2 deletions(-)
+
+diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
+index 54afa9e538c5..1e404476fe6a 100644
+--- a/fs/btrfs/inode.c
++++ b/fs/btrfs/inode.c
+@@ -891,8 +891,18 @@ static int submit_uncompressed_range(struct btrfs_inode *inode,
+ goto out;
+ }
+ if (ret < 0) {
+- if (locked_page)
++ btrfs_cleanup_ordered_extents(inode, locked_page, start, end - start + 1);
++ if (locked_page) {
++ const u64 page_start = page_offset(locked_page);
++ const u64 page_end = page_start + PAGE_SIZE - 1;
++
++ btrfs_page_set_error(inode->root->fs_info, locked_page,
++ page_start, PAGE_SIZE);
++ set_page_writeback(locked_page);
++ end_page_writeback(locked_page);
++ end_extent_writepage(locked_page, ret, page_start, page_end);
+ unlock_page(locked_page);
++ }
+ goto out;
+ }
+
+@@ -1341,9 +1351,12 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
+ * However, in case of unlock == 0, we still need to unlock the pages
+ * (except @locked_page) to ensure all the pages are unlocked.
+ */
+- if (!unlock && orig_start < start)
++ if (!unlock && orig_start < start) {
++ if (!locked_page)
++ mapping_set_error(inode->vfs_inode.i_mapping, ret);
+ extent_clear_unlock_delalloc(inode, orig_start, start - 1,
+ locked_page, 0, page_ops);
++ }
+
+ /*
+ * For the range (2). If we reserved an extent for our delalloc range
+--
+2.35.1
+
--- /dev/null
+From d95e54fdfd9bad3d0327ac42599359dc8c90ef75 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Sat, 9 Jul 2022 08:18:43 +0900
+Subject: btrfs: let can_allocate_chunk return error
+
+From: Naohiro Aota <naohiro.aota@wdc.com>
+
+[ Upstream commit bb9950d3df7169a673c594d38fb74e241ed4fb2a ]
+
+In preparation for a later patch, convert the return type from bool to int
+and return errors. No functional changes.
+
+Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
+Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/btrfs/extent-tree.c | 15 ++++++++-------
+ 1 file changed, 8 insertions(+), 7 deletions(-)
+
+diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
+index f45ecd939a2c..8bdcbc0c6d60 100644
+--- a/fs/btrfs/extent-tree.c
++++ b/fs/btrfs/extent-tree.c
+@@ -3985,12 +3985,12 @@ static void found_extent(struct find_free_extent_ctl *ffe_ctl,
+ }
+ }
+
+-static bool can_allocate_chunk(struct btrfs_fs_info *fs_info,
+- struct find_free_extent_ctl *ffe_ctl)
++static int can_allocate_chunk(struct btrfs_fs_info *fs_info,
++ struct find_free_extent_ctl *ffe_ctl)
+ {
+ switch (ffe_ctl->policy) {
+ case BTRFS_EXTENT_ALLOC_CLUSTERED:
+- return true;
++ return 0;
+ case BTRFS_EXTENT_ALLOC_ZONED:
+ /*
+ * If we have enough free space left in an already
+@@ -4000,8 +4000,8 @@ static bool can_allocate_chunk(struct btrfs_fs_info *fs_info,
+ */
+ if (ffe_ctl->max_extent_size >= ffe_ctl->min_alloc_size &&
+ !btrfs_can_activate_zone(fs_info->fs_devices, ffe_ctl->flags))
+- return false;
+- return true;
++ return -ENOSPC;
++ return 0;
+ default:
+ BUG();
+ }
+@@ -4083,8 +4083,9 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
+ int exist = 0;
+
+ /*Check if allocation policy allows to create a new chunk */
+- if (!can_allocate_chunk(fs_info, ffe_ctl))
+- return -ENOSPC;
++ ret = can_allocate_chunk(fs_info, ffe_ctl);
++ if (ret)
++ return ret;
+
+ trans = current->journal_info;
+ if (trans)
+--
+2.35.1
+
--- /dev/null
+From e4ae7bab98014a3d07e59efe09a829cb15cb518f Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 29 Mar 2022 01:56:06 -0700
+Subject: btrfs: make the bg_reclaim_threshold per-space info
+
+From: Josef Bacik <josef@toxicpanda.com>
+
+[ Upstream commit bb5a098d9791f184899499531ff4411089e2a5e0 ]
+
+For non-zoned file systems it's useful to have the auto reclaim feature;
+however, there are different use cases for non-zoned, for example we may
+never want to reclaim metadata chunks, only data chunks. Move this
+sysfs flag to per-space_info. This won't affect current users because
+this tunable only ever did anything for zoned, and that is currently
+hidden behind BTRFS_CONFIG_DEBUG.
+
+Tested-by: Pankaj Raghav <p.raghav@samsung.com>
+Signed-off-by: Josef Bacik <josef@toxicpanda.com>
+[ jth restore global bg_reclaim_threshold ]
+Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
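+The new sysfs knob accepts 0 or a percentage in the (50, 100] range, per
+the check in btrfs_sinfo_bg_reclaim_threshold_store(). A tiny
+self-contained C sketch of that validation, with strtol() standing in for
+kstrtoint() and made-up test inputs:
+
+  #include <errno.h>
+  #include <stdio.h>
+  #include <stdlib.h>
+
+  /* 0 or a value in (50, 100]; mirrors the store callback's check. */
+  static int parse_bg_reclaim_threshold(const char *buf, int *out)
+  {
+      char *end;
+      long thresh = strtol(buf, &end, 10);
+
+      if (end == buf || *end != '\0')
+          return -EINVAL;
+      if (thresh != 0 && (thresh <= 50 || thresh > 100))
+          return -EINVAL;
+      *out = (int)thresh;
+      return 0;
+  }
+
+  int main(void)
+  {
+      const char *inputs[] = { "0", "75", "50", "101", "oops" };
+
+      for (unsigned i = 0; i < sizeof(inputs) / sizeof(inputs[0]); i++) {
+          int thresh, ret = parse_bg_reclaim_threshold(inputs[i], &thresh);
+
+          if (ret)
+              printf("%-4s -> rejected (%d)\n", inputs[i], ret);
+          else
+              printf("%-4s -> threshold %d%%\n", inputs[i], thresh);
+      }
+      return 0;
+  }
+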
+ fs/btrfs/free-space-cache.c | 7 +++++--
+ fs/btrfs/space-info.c | 9 +++++++++
+ fs/btrfs/space-info.h | 6 ++++++
+ fs/btrfs/sysfs.c | 37 +++++++++++++++++++++++++++++++++++++
+ fs/btrfs/zoned.h | 6 +-----
+ 5 files changed, 58 insertions(+), 7 deletions(-)
+
+diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
+index 01a408db5683..ef84bc5030cd 100644
+--- a/fs/btrfs/free-space-cache.c
++++ b/fs/btrfs/free-space-cache.c
+@@ -2630,16 +2630,19 @@ int __btrfs_add_free_space(struct btrfs_block_group *block_group,
+ static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group,
+ u64 bytenr, u64 size, bool used)
+ {
+- struct btrfs_fs_info *fs_info = block_group->fs_info;
++ struct btrfs_space_info *sinfo = block_group->space_info;
+ struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
+ u64 offset = bytenr - block_group->start;
+ u64 to_free, to_unusable;
+- const int bg_reclaim_threshold = READ_ONCE(fs_info->bg_reclaim_threshold);
++ int bg_reclaim_threshold = 0;
+ bool initial = (size == block_group->length);
+ u64 reclaimable_unusable;
+
+ WARN_ON(!initial && offset + size > block_group->zone_capacity);
+
++ if (!initial)
++ bg_reclaim_threshold = READ_ONCE(sinfo->bg_reclaim_threshold);
++
+ spin_lock(&ctl->tree_lock);
+ if (!used)
+ to_free = size;
+diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
+index 56a7c99fc03e..85608acb9557 100644
+--- a/fs/btrfs/space-info.c
++++ b/fs/btrfs/space-info.c
+@@ -181,6 +181,12 @@ void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
+ found->full = 0;
+ }
+
++/*
++ * Block groups with more than this value (percents) of unusable space will be
++ * scheduled for background reclaim.
++ */
++#define BTRFS_DEFAULT_ZONED_RECLAIM_THRESH (75)
++
+ static int create_space_info(struct btrfs_fs_info *info, u64 flags)
+ {
+
+@@ -203,6 +209,9 @@ static int create_space_info(struct btrfs_fs_info *info, u64 flags)
+ INIT_LIST_HEAD(&space_info->priority_tickets);
+ space_info->clamp = 1;
+
++ if (btrfs_is_zoned(info))
++ space_info->bg_reclaim_threshold = BTRFS_DEFAULT_ZONED_RECLAIM_THRESH;
++
+ ret = btrfs_sysfs_add_space_info_type(info, space_info);
+ if (ret)
+ return ret;
+diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h
+index d841fed73492..a803e29bd781 100644
+--- a/fs/btrfs/space-info.h
++++ b/fs/btrfs/space-info.h
+@@ -24,6 +24,12 @@ struct btrfs_space_info {
+ the space info if we had an ENOSPC in the
+ allocator. */
+
++ /*
++ * Once a block group drops below this threshold (percents) we'll
++ * schedule it for reclaim.
++ */
++ int bg_reclaim_threshold;
++
+ int clamp; /* Used to scale our threshold for preemptive
+ flushing. The value is >> clamp, so turns
+ out to be a 2^clamp divisor. */
+diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
+index ba78ca5aabbb..43845cae0c74 100644
+--- a/fs/btrfs/sysfs.c
++++ b/fs/btrfs/sysfs.c
+@@ -722,6 +722,42 @@ SPACE_INFO_ATTR(bytes_zone_unusable);
+ SPACE_INFO_ATTR(disk_used);
+ SPACE_INFO_ATTR(disk_total);
+
++static ssize_t btrfs_sinfo_bg_reclaim_threshold_show(struct kobject *kobj,
++ struct kobj_attribute *a,
++ char *buf)
++{
++ struct btrfs_space_info *space_info = to_space_info(kobj);
++ ssize_t ret;
++
++ ret = sysfs_emit(buf, "%d\n", READ_ONCE(space_info->bg_reclaim_threshold));
++
++ return ret;
++}
++
++static ssize_t btrfs_sinfo_bg_reclaim_threshold_store(struct kobject *kobj,
++ struct kobj_attribute *a,
++ const char *buf, size_t len)
++{
++ struct btrfs_space_info *space_info = to_space_info(kobj);
++ int thresh;
++ int ret;
++
++ ret = kstrtoint(buf, 10, &thresh);
++ if (ret)
++ return ret;
++
++ if (thresh != 0 && (thresh <= 50 || thresh > 100))
++ return -EINVAL;
++
++ WRITE_ONCE(space_info->bg_reclaim_threshold, thresh);
++
++ return len;
++}
++
++BTRFS_ATTR_RW(space_info, bg_reclaim_threshold,
++ btrfs_sinfo_bg_reclaim_threshold_show,
++ btrfs_sinfo_bg_reclaim_threshold_store);
++
+ /*
+ * Allocation information about block group types.
+ *
+@@ -738,6 +774,7 @@ static struct attribute *space_info_attrs[] = {
+ BTRFS_ATTR_PTR(space_info, bytes_zone_unusable),
+ BTRFS_ATTR_PTR(space_info, disk_used),
+ BTRFS_ATTR_PTR(space_info, disk_total),
++ BTRFS_ATTR_PTR(space_info, bg_reclaim_threshold),
+ NULL,
+ };
+ ATTRIBUTE_GROUPS(space_info);
+diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h
+index c424417e19bb..199b69670fa2 100644
+--- a/fs/btrfs/zoned.h
++++ b/fs/btrfs/zoned.h
+@@ -10,11 +10,7 @@
+ #include "block-group.h"
+ #include "btrfs_inode.h"
+
+-/*
+- * Block groups with more than this value (percents) of unusable space will be
+- * scheduled for background reclaim.
+- */
+-#define BTRFS_DEFAULT_RECLAIM_THRESH 75
++#define BTRFS_DEFAULT_RECLAIM_THRESH (75)
+
+ struct btrfs_zoned_device_info {
+ /*
+--
+2.35.1
+
--- /dev/null
+From 2317054fdc824202a5af22cdb2b8b2f0fcc792d6 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 23 Jun 2022 10:55:47 +0300
+Subject: btrfs: properly flag filesystem with
+ BTRFS_FEATURE_INCOMPAT_BIG_METADATA
+
+From: Nikolay Borisov <nborisov@suse.com>
+
+[ Upstream commit e26b04c4c91925dba57324db177a24e18e2d0013 ]
+
+Commit 6f93e834fa7c seemingly inadvertently moved the code responsible
+for flagging the filesystem as having BIG_METADATA to a place where
+setting the flag was essentially lost. This means that
+filesystems created with kernels containing this bug (starting with 5.15)
+can potentially be mounted by older (pre-3.4) kernels. In reality
+chances for this happening are low because there are other incompat
+flags introduced in the mean time. Still the correct behavior is to set
+INCOMPAT_BIG_METADATA flag and persist this in the superblock.
+
+Fixes: 6f93e834fa7c ("btrfs: fix upper limit for max_inline for page size 64K")
+CC: stable@vger.kernel.org # 5.4+
+Reviewed-by: Qu Wenruo <wqu@suse.com>
+Signed-off-by: Nikolay Borisov <nborisov@suse.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/btrfs/disk-io.c | 21 +++++++++++----------
+ 1 file changed, 11 insertions(+), 10 deletions(-)
+
+diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
+index f45470798022..34cd57d799e4 100644
+--- a/fs/btrfs/disk-io.c
++++ b/fs/btrfs/disk-io.c
+@@ -3577,16 +3577,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
+ */
+ fs_info->compress_type = BTRFS_COMPRESS_ZLIB;
+
+- /*
+- * Flag our filesystem as having big metadata blocks if they are bigger
+- * than the page size.
+- */
+- if (btrfs_super_nodesize(disk_super) > PAGE_SIZE) {
+- if (!(features & BTRFS_FEATURE_INCOMPAT_BIG_METADATA))
+- btrfs_info(fs_info,
+- "flagging fs with big metadata feature");
+- features |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA;
+- }
+
+ /* Set up fs_info before parsing mount options */
+ nodesize = btrfs_super_nodesize(disk_super);
+@@ -3627,6 +3617,17 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
+ if (features & BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA)
+ btrfs_info(fs_info, "has skinny extents");
+
++ /*
++ * Flag our filesystem as having big metadata blocks if they are bigger
++ * than the page size.
++ */
++ if (btrfs_super_nodesize(disk_super) > PAGE_SIZE) {
++ if (!(features & BTRFS_FEATURE_INCOMPAT_BIG_METADATA))
++ btrfs_info(fs_info,
++ "flagging fs with big metadata feature");
++ features |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA;
++ }
++
+ /*
+ * mixed block groups end up with duplicate but slightly offset
+ * extent buffers for the same range. It leads to corruptions
+--
+2.35.1
+
--- /dev/null
+From dcb75d1d2ed081e90e672a228bd75205ce484c3e Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Sat, 9 Jul 2022 08:18:40 +0900
+Subject: btrfs: replace BTRFS_MAX_EXTENT_SIZE with fs_info->max_extent_size
+
+From: Naohiro Aota <naohiro.aota@wdc.com>
+
+[ Upstream commit f7b12a62f008a3041f42f2426983e59a6a0a3c59 ]
+
+On a zoned filesystem, data write out is limited by max_zone_append_size,
+and a large ordered extent is split according to the size of a bio. OTOH,
+the number of extents to be written is calculated using
+BTRFS_MAX_EXTENT_SIZE, and that estimated number is used to reserve the
+metadata bytes to update and/or create the metadata items.
+
+The metadata reservation is done at e.g. btrfs_buffered_write() and then
+released according to the estimation changes. Thus, if the number of extents
+increases massively, the reserved metadata can run out.
+
+Such an increase of the number of extents easily occurs on a zoned filesystem
+if BTRFS_MAX_EXTENT_SIZE > max_zone_append_size, and it causes the
+following warning on a small RAM environment with metadata
+over-commit disabled (as done in the following patch).
+
+[75721.498492] ------------[ cut here ]------------
+[75721.505624] BTRFS: block rsv 1 returned -28
+[75721.512230] WARNING: CPU: 24 PID: 2327559 at fs/btrfs/block-rsv.c:537 btrfs_use_block_rsv+0x560/0x760 [btrfs]
+[75721.581854] CPU: 24 PID: 2327559 Comm: kworker/u64:10 Kdump: loaded Tainted: G W 5.18.0-rc2-BTRFS-ZNS+ #109
+[75721.597200] Hardware name: Supermicro Super Server/H12SSL-NT, BIOS 2.0 02/22/2021
+[75721.607310] Workqueue: btrfs-endio-write btrfs_work_helper [btrfs]
+[75721.616209] RIP: 0010:btrfs_use_block_rsv+0x560/0x760 [btrfs]
+[75721.646649] RSP: 0018:ffffc9000fbdf3e0 EFLAGS: 00010286
+[75721.654126] RAX: 0000000000000000 RBX: 0000000000004000 RCX: 0000000000000000
+[75721.663524] RDX: 0000000000000004 RSI: 0000000000000008 RDI: fffff52001f7be6e
+[75721.672921] RBP: ffffc9000fbdf420 R08: 0000000000000001 R09: ffff889f8d1fc6c7
+[75721.682493] R10: ffffed13f1a3f8d8 R11: 0000000000000001 R12: ffff88980a3c0e28
+[75721.692284] R13: ffff889b66590000 R14: ffff88980a3c0e40 R15: ffff88980a3c0e8a
+[75721.701878] FS: 0000000000000000(0000) GS:ffff889f8d000000(0000) knlGS:0000000000000000
+[75721.712601] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+[75721.720726] CR2: 000055d12e05c018 CR3: 0000800193594000 CR4: 0000000000350ee0
+[75721.730499] Call Trace:
+[75721.735166] <TASK>
+[75721.739886] btrfs_alloc_tree_block+0x1e1/0x1100 [btrfs]
+[75721.747545] ? btrfs_alloc_logged_file_extent+0x550/0x550 [btrfs]
+[75721.756145] ? btrfs_get_32+0xea/0x2d0 [btrfs]
+[75721.762852] ? btrfs_get_32+0xea/0x2d0 [btrfs]
+[75721.769520] ? push_leaf_left+0x420/0x620 [btrfs]
+[75721.776431] ? memcpy+0x4e/0x60
+[75721.781931] split_leaf+0x433/0x12d0 [btrfs]
+[75721.788392] ? btrfs_get_token_32+0x580/0x580 [btrfs]
+[75721.795636] ? push_for_double_split.isra.0+0x420/0x420 [btrfs]
+[75721.803759] ? leaf_space_used+0x15d/0x1a0 [btrfs]
+[75721.811156] btrfs_search_slot+0x1bc3/0x2790 [btrfs]
+[75721.818300] ? lock_downgrade+0x7c0/0x7c0
+[75721.824411] ? free_extent_buffer.part.0+0x107/0x200 [btrfs]
+[75721.832456] ? split_leaf+0x12d0/0x12d0 [btrfs]
+[75721.839149] ? free_extent_buffer.part.0+0x14f/0x200 [btrfs]
+[75721.846945] ? free_extent_buffer+0x13/0x20 [btrfs]
+[75721.853960] ? btrfs_release_path+0x4b/0x190 [btrfs]
+[75721.861429] btrfs_csum_file_blocks+0x85c/0x1500 [btrfs]
+[75721.869313] ? rcu_read_lock_sched_held+0x16/0x80
+[75721.876085] ? lock_release+0x552/0xf80
+[75721.881957] ? btrfs_del_csums+0x8c0/0x8c0 [btrfs]
+[75721.888886] ? __kasan_check_write+0x14/0x20
+[75721.895152] ? do_raw_read_unlock+0x44/0x80
+[75721.901323] ? _raw_write_lock_irq+0x60/0x80
+[75721.907983] ? btrfs_global_root+0xb9/0xe0 [btrfs]
+[75721.915166] ? btrfs_csum_root+0x12b/0x180 [btrfs]
+[75721.921918] ? btrfs_get_global_root+0x820/0x820 [btrfs]
+[75721.929166] ? _raw_write_unlock+0x23/0x40
+[75721.935116] ? unpin_extent_cache+0x1e3/0x390 [btrfs]
+[75721.942041] btrfs_finish_ordered_io.isra.0+0xa0c/0x1dc0 [btrfs]
+[75721.949906] ? try_to_wake_up+0x30/0x14a0
+[75721.955700] ? btrfs_unlink_subvol+0xda0/0xda0 [btrfs]
+[75721.962661] ? rcu_read_lock_sched_held+0x16/0x80
+[75721.969111] ? lock_acquire+0x41b/0x4c0
+[75721.974982] finish_ordered_fn+0x15/0x20 [btrfs]
+[75721.981639] btrfs_work_helper+0x1af/0xa80 [btrfs]
+[75721.988184] ? _raw_spin_unlock_irq+0x28/0x50
+[75721.994643] process_one_work+0x815/0x1460
+[75722.000444] ? pwq_dec_nr_in_flight+0x250/0x250
+[75722.006643] ? do_raw_spin_trylock+0xbb/0x190
+[75722.013086] worker_thread+0x59a/0xeb0
+[75722.018511] kthread+0x2ac/0x360
+[75722.023428] ? process_one_work+0x1460/0x1460
+[75722.029431] ? kthread_complete_and_exit+0x30/0x30
+[75722.036044] ret_from_fork+0x22/0x30
+[75722.041255] </TASK>
+[75722.045047] irq event stamp: 0
+[75722.049703] hardirqs last enabled at (0): [<0000000000000000>] 0x0
+[75722.057610] hardirqs last disabled at (0): [<ffffffff8118a94a>] copy_process+0x1c1a/0x66b0
+[75722.067533] softirqs last enabled at (0): [<ffffffff8118a989>] copy_process+0x1c59/0x66b0
+[75722.077423] softirqs last disabled at (0): [<0000000000000000>] 0x0
+[75722.085335] ---[ end trace 0000000000000000 ]---
+
+To fix the estimation, we need to introduce fs_info->max_extent_size to
+replace BTRFS_MAX_EXTENT_SIZE, which allows setting a different size for
+regular vs zoned filesystems.
+
+Set fs_info->max_extent_size to BTRFS_MAX_EXTENT_SIZE by default. On a zoned
+filesystem, it is set to fs_info->max_zone_append_size.
+
+CC: stable@vger.kernel.org # 5.12+
+Fixes: d8e3fb106f39 ("btrfs: zoned: use ZONE_APPEND write for zoned mode")
+Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
+Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
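+A back-of-the-envelope example of the mismatch (numbers are illustrative,
+not taken from the report above): BTRFS_MAX_EXTENT_SIZE is 128M, so for a
+128M delalloc range the reservation side plans for 128M / 128M = 1
+outstanding extent. If the device limits zone append to, say, 1M, the zoned
+write path actually ends up with 128M / 1M = 128 ordered extents, i.e.
+roughly 128 times more metadata items than were reserved for. Basing the
+estimate on fs_info->max_extent_size (set from max_zone_append_size on
+zoned filesystems) makes the two sides agree again.
+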
+ fs/btrfs/ctree.h | 6 ++++++
+ fs/btrfs/disk-io.c | 2 ++
+ fs/btrfs/extent_io.c | 4 +++-
+ fs/btrfs/inode.c | 6 ++++--
+ fs/btrfs/zoned.c | 5 ++++-
+ 5 files changed, 19 insertions(+), 4 deletions(-)
+
+diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
+index 1c377bcfe787..97f5a3d320ff 100644
+--- a/fs/btrfs/ctree.h
++++ b/fs/btrfs/ctree.h
+@@ -1032,6 +1032,12 @@ struct btrfs_fs_info {
+ u32 csums_per_leaf;
+ u32 stripesize;
+
++ /*
++ * Maximum size of an extent. BTRFS_MAX_EXTENT_SIZE on regular
++ * filesystem, on zoned it depends on the device constraints.
++ */
++ u64 max_extent_size;
++
+ /* Block groups and devices containing active swapfiles. */
+ spinlock_t swapfile_pins_lock;
+ struct rb_root swapfile_pins;
+diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
+index 34cd57d799e4..bf5c6ac67e87 100644
+--- a/fs/btrfs/disk-io.c
++++ b/fs/btrfs/disk-io.c
+@@ -3246,6 +3246,8 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
+ fs_info->sectorsize_bits = ilog2(4096);
+ fs_info->stripesize = 4096;
+
++ fs_info->max_extent_size = BTRFS_MAX_EXTENT_SIZE;
++
+ spin_lock_init(&fs_info->swapfile_pins_lock);
+ fs_info->swapfile_pins = RB_ROOT;
+
+diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
+index 68ddd90685d9..bfc7d5b31156 100644
+--- a/fs/btrfs/extent_io.c
++++ b/fs/btrfs/extent_io.c
+@@ -1992,10 +1992,12 @@ noinline_for_stack bool find_lock_delalloc_range(struct inode *inode,
+ struct page *locked_page, u64 *start,
+ u64 *end)
+ {
++ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
+ const u64 orig_start = *start;
+ const u64 orig_end = *end;
+- u64 max_bytes = BTRFS_MAX_EXTENT_SIZE;
++ /* The sanity tests may not set a valid fs_info. */
++ u64 max_bytes = fs_info ? fs_info->max_extent_size : BTRFS_MAX_EXTENT_SIZE;
+ u64 delalloc_start;
+ u64 delalloc_end;
+ bool found;
+diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
+index 1e404476fe6a..c50288d90c66 100644
+--- a/fs/btrfs/inode.c
++++ b/fs/btrfs/inode.c
+@@ -2102,6 +2102,7 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page
+ void btrfs_split_delalloc_extent(struct inode *inode,
+ struct extent_state *orig, u64 split)
+ {
++ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ u64 size;
+
+ /* not delalloc, ignore it */
+@@ -2109,7 +2110,7 @@ void btrfs_split_delalloc_extent(struct inode *inode,
+ return;
+
+ size = orig->end - orig->start + 1;
+- if (size > BTRFS_MAX_EXTENT_SIZE) {
++ if (size > fs_info->max_extent_size) {
+ u32 num_extents;
+ u64 new_size;
+
+@@ -2138,6 +2139,7 @@ void btrfs_split_delalloc_extent(struct inode *inode,
+ void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new,
+ struct extent_state *other)
+ {
++ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ u64 new_size, old_size;
+ u32 num_extents;
+
+@@ -2151,7 +2153,7 @@ void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new,
+ new_size = other->end - new->start + 1;
+
+ /* we're not bigger than the max, unreserve the space and go */
+- if (new_size <= BTRFS_MAX_EXTENT_SIZE) {
++ if (new_size <= fs_info->max_extent_size) {
+ spin_lock(&BTRFS_I(inode)->lock);
+ btrfs_mod_outstanding_extents(BTRFS_I(inode), -1);
+ spin_unlock(&BTRFS_I(inode)->lock);
+diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
+index 1d5b9308f5ef..a0bf2c20fa61 100644
+--- a/fs/btrfs/zoned.c
++++ b/fs/btrfs/zoned.c
+@@ -731,8 +731,11 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info)
+ }
+
+ fs_info->zone_size = zone_size;
+- fs_info->max_zone_append_size = max_zone_append_size;
++ fs_info->max_zone_append_size = ALIGN_DOWN(max_zone_append_size,
++ fs_info->sectorsize);
+ fs_info->fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_ZONED;
++ if (fs_info->max_zone_append_size < fs_info->max_extent_size)
++ fs_info->max_extent_size = fs_info->max_zone_append_size;
+
+ /*
+ * Check mount options here, because we might change fs_info->zoned
+--
+2.35.1
+
--- /dev/null
+From 7b8c917f29d18606d01d4f3ae4aab8f10342c9f7 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 13 Jun 2022 18:31:17 -0400
+Subject: btrfs: reset block group chunk force if we have to wait
+
+From: Josef Bacik <josef@toxicpanda.com>
+
+[ Upstream commit 1314ca78b2c35d3e7d0f097268a2ee6dc0d369ef ]
+
+If you try to force a chunk allocation, but you race with another chunk
+allocation, you will end up waiting on the chunk allocation that just
+occurred and then allocate another chunk. If you have many threads all
+doing this at once you can way over-allocate chunks.
+
+Fix this by resetting force to NO_FORCE: that way, if we think we need to
+allocate we still can, but we don't force another chunk allocation if one
+is already happening.
+
+Reviewed-by: Filipe Manana <fdmanana@suse.com>
+CC: stable@vger.kernel.org # 5.4+
+Signed-off-by: Josef Bacik <josef@toxicpanda.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/btrfs/block-group.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
+index 667b7025d503..1deca5164c23 100644
+--- a/fs/btrfs/block-group.c
++++ b/fs/btrfs/block-group.c
+@@ -3724,6 +3724,7 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
+ * attempt.
+ */
+ wait_for_alloc = true;
++ force = CHUNK_ALLOC_NO_FORCE;
+ spin_unlock(&space_info->lock);
+ mutex_lock(&fs_info->chunk_mutex);
+ mutex_unlock(&fs_info->chunk_mutex);
+--
+2.35.1
+
--- /dev/null
+From 43c91308d645899ddb58951c23e4338cb43cb48c Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 8 Feb 2022 11:31:20 -0800
+Subject: btrfs: store chunk size in space-info struct
+
+From: Stefan Roesch <shr@fb.com>
+
+[ Upstream commit f6fca3917b4d99d8c13901738afec35f570a3c2f ]
+
+The chunk size is stored in the btrfs_space_info structure. It is
+initialized when the space info is created and used from then on.
+
+A new API is added to update the current chunk size. It will be used to
+expose the chunk_size as a sysfs setting.
+
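+The intended consumer is a sysfs knob added separately; as a rough sketch
+(handler shape and names are hypothetical, not taken from this patch), a
+store handler would parse the value and call the new helper:
+
+    u64 val;
+
+    /* reject malformed or zero sizes before updating the space_info */
+    if (kstrtou64(buf, 10, &val) || val == 0)
+            return -EINVAL;
+    btrfs_update_space_info_chunk_size(space_info, val);
+    return len;
+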
+Signed-off-by: Stefan Roesch <shr@fb.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+[ rename and merge helpers, switch atomic type to u64, style fixes ]
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/btrfs/space-info.c | 32 ++++++++++++++++++++++++++++++++
+ fs/btrfs/space-info.h | 4 ++++
+ fs/btrfs/volumes.c | 28 +++++++++-------------------
+ 3 files changed, 45 insertions(+), 19 deletions(-)
+
+diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
+index 85608acb9557..98a84b523be6 100644
+--- a/fs/btrfs/space-info.c
++++ b/fs/btrfs/space-info.c
+@@ -187,6 +187,37 @@ void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
+ */
+ #define BTRFS_DEFAULT_ZONED_RECLAIM_THRESH (75)
+
++/*
++ * Calculate chunk size depending on volume type (regular or zoned).
++ */
++static u64 calc_chunk_size(const struct btrfs_fs_info *fs_info, u64 flags)
++{
++ if (btrfs_is_zoned(fs_info))
++ return fs_info->zone_size;
++
++ ASSERT(flags & BTRFS_BLOCK_GROUP_TYPE_MASK);
++
++ if (flags & BTRFS_BLOCK_GROUP_DATA)
++ return SZ_1G;
++ else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
++ return SZ_32M;
++
++ /* Handle BTRFS_BLOCK_GROUP_METADATA */
++ if (fs_info->fs_devices->total_rw_bytes > 50ULL * SZ_1G)
++ return SZ_1G;
++
++ return SZ_256M;
++}
++
++/*
++ * Update default chunk size.
++ */
++void btrfs_update_space_info_chunk_size(struct btrfs_space_info *space_info,
++ u64 chunk_size)
++{
++ WRITE_ONCE(space_info->chunk_size, chunk_size);
++}
++
+ static int create_space_info(struct btrfs_fs_info *info, u64 flags)
+ {
+
+@@ -208,6 +239,7 @@ static int create_space_info(struct btrfs_fs_info *info, u64 flags)
+ INIT_LIST_HEAD(&space_info->tickets);
+ INIT_LIST_HEAD(&space_info->priority_tickets);
+ space_info->clamp = 1;
++ btrfs_update_space_info_chunk_size(space_info, calc_chunk_size(info, flags));
+
+ if (btrfs_is_zoned(info))
+ space_info->bg_reclaim_threshold = BTRFS_DEFAULT_ZONED_RECLAIM_THRESH;
+diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h
+index a803e29bd781..137206b8049f 100644
+--- a/fs/btrfs/space-info.h
++++ b/fs/btrfs/space-info.h
+@@ -23,6 +23,8 @@ struct btrfs_space_info {
+ u64 max_extent_size; /* This will hold the maximum extent size of
+ the space info if we had an ENOSPC in the
+ allocator. */
++ /* Chunk size in bytes */
++ u64 chunk_size;
+
+ /*
+ * Once a block group drops below this threshold (percents) we'll
+@@ -121,6 +123,8 @@ void btrfs_update_space_info(struct btrfs_fs_info *info, u64 flags,
+ u64 total_bytes, u64 bytes_used,
+ u64 bytes_readonly, u64 bytes_zone_unusable,
+ struct btrfs_space_info **space_info);
++void btrfs_update_space_info_chunk_size(struct btrfs_space_info *space_info,
++ u64 chunk_size);
+ struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info,
+ u64 flags);
+ u64 __pure btrfs_space_info_used(struct btrfs_space_info *s_info,
+diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
+index 659575526e9f..4bc97e7d8e46 100644
+--- a/fs/btrfs/volumes.c
++++ b/fs/btrfs/volumes.c
+@@ -5091,26 +5091,16 @@ static void init_alloc_chunk_ctl_policy_regular(
+ struct btrfs_fs_devices *fs_devices,
+ struct alloc_chunk_ctl *ctl)
+ {
+- u64 type = ctl->type;
++ struct btrfs_space_info *space_info;
+
+- if (type & BTRFS_BLOCK_GROUP_DATA) {
+- ctl->max_stripe_size = SZ_1G;
+- ctl->max_chunk_size = BTRFS_MAX_DATA_CHUNK_SIZE;
+- } else if (type & BTRFS_BLOCK_GROUP_METADATA) {
+- /* For larger filesystems, use larger metadata chunks */
+- if (fs_devices->total_rw_bytes > 50ULL * SZ_1G)
+- ctl->max_stripe_size = SZ_1G;
+- else
+- ctl->max_stripe_size = SZ_256M;
+- ctl->max_chunk_size = ctl->max_stripe_size;
+- } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
+- ctl->max_stripe_size = SZ_32M;
+- ctl->max_chunk_size = 2 * ctl->max_stripe_size;
+- ctl->devs_max = min_t(int, ctl->devs_max,
+- BTRFS_MAX_DEVS_SYS_CHUNK);
+- } else {
+- BUG();
+- }
++ space_info = btrfs_find_space_info(fs_devices->fs_info, ctl->type);
++ ASSERT(space_info);
++
++ ctl->max_chunk_size = READ_ONCE(space_info->chunk_size);
++ ctl->max_stripe_size = ctl->max_chunk_size;
++
++ if (ctl->type & BTRFS_BLOCK_GROUP_SYSTEM)
++ ctl->devs_max = min_t(int, ctl->devs_max, BTRFS_MAX_DEVS_SYS_CHUNK);
+
+ /* We don't want a chunk larger than 10% of writable space */
+ ctl->max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
+--
+2.35.1
+
--- /dev/null
+From f844f5c9f2598302a23d546bfe15914ee71b9c29 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 13 Jun 2022 15:09:48 -0400
+Subject: btrfs: tree-log: make the return value for log syncing consistent
+
+From: Josef Bacik <josef@toxicpanda.com>
+
+[ Upstream commit f31f09f6be1c6c1a673e0566e258281a7bbaaa51 ]
+
+Currently we will return 1 or -EAGAIN if we decide we need to commit
+the transaction rather than sync the log. In practice this doesn't
+really matter, as we interpret any !0 and !BTRFS_NO_LOG_SYNC as needing to
+commit the transaction. However, this makes it hard to figure out what
+the correct thing to do is.
+
+Fix this up by defining BTRFS_LOG_FORCE_COMMIT and using this in all the
+places where we want to force the transaction to be committed.
+
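+As a simplified sketch of the caller-side pattern (condensed, not verbatim
+from the fsync path):
+
+    ret = btrfs_sync_log(trans, root, &ctx);
+    if (ret == BTRFS_LOG_FORCE_COMMIT) {
+            /* the log cannot be used, fall back to a full transaction commit */
+            ret = btrfs_commit_transaction(trans);
+    }
+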
+CC: stable@vger.kernel.org # 5.15+
+Reviewed-by: Filipe Manana <fdmanana@suse.com>
+Signed-off-by: Josef Bacik <josef@toxicpanda.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/btrfs/file.c | 2 +-
+ fs/btrfs/tree-log.c | 18 +++++++++---------
+ fs/btrfs/tree-log.h | 3 +++
+ 3 files changed, 13 insertions(+), 10 deletions(-)
+
+diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
+index 153920acd226..2d24f2dcc0ea 100644
+--- a/fs/btrfs/file.c
++++ b/fs/btrfs/file.c
+@@ -2344,7 +2344,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
+ btrfs_release_log_ctx_extents(&ctx);
+ if (ret < 0) {
+ /* Fallthrough and commit/free transaction. */
+- ret = 1;
++ ret = BTRFS_LOG_FORCE_COMMIT;
+ }
+
+ /* we've logged all the items and now have a consistent
+diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
+index e65633686378..08917069a125 100644
+--- a/fs/btrfs/tree-log.c
++++ b/fs/btrfs/tree-log.c
+@@ -171,7 +171,7 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
+ int index = (root->log_transid + 1) % 2;
+
+ if (btrfs_need_log_full_commit(trans)) {
+- ret = -EAGAIN;
++ ret = BTRFS_LOG_FORCE_COMMIT;
+ goto out;
+ }
+
+@@ -194,7 +194,7 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
+ * writing.
+ */
+ if (zoned && !created) {
+- ret = -EAGAIN;
++ ret = BTRFS_LOG_FORCE_COMMIT;
+ goto out;
+ }
+
+@@ -3122,7 +3122,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
+
+ /* bail out if we need to do a full commit */
+ if (btrfs_need_log_full_commit(trans)) {
+- ret = -EAGAIN;
++ ret = BTRFS_LOG_FORCE_COMMIT;
+ mutex_unlock(&root->log_mutex);
+ goto out;
+ }
+@@ -3223,7 +3223,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
+ }
+ btrfs_wait_tree_log_extents(log, mark);
+ mutex_unlock(&log_root_tree->log_mutex);
+- ret = -EAGAIN;
++ ret = BTRFS_LOG_FORCE_COMMIT;
+ goto out;
+ }
+
+@@ -3262,7 +3262,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
+ blk_finish_plug(&plug);
+ btrfs_wait_tree_log_extents(log, mark);
+ mutex_unlock(&log_root_tree->log_mutex);
+- ret = -EAGAIN;
++ ret = BTRFS_LOG_FORCE_COMMIT;
+ goto out_wake_log_root;
+ }
+
+@@ -5849,7 +5849,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
+ inode_only == LOG_INODE_ALL &&
+ inode->last_unlink_trans >= trans->transid) {
+ btrfs_set_log_full_commit(trans);
+- ret = 1;
++ ret = BTRFS_LOG_FORCE_COMMIT;
+ goto out_unlock;
+ }
+
+@@ -6563,12 +6563,12 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
+ bool log_dentries = false;
+
+ if (btrfs_test_opt(fs_info, NOTREELOG)) {
+- ret = 1;
++ ret = BTRFS_LOG_FORCE_COMMIT;
+ goto end_no_trans;
+ }
+
+ if (btrfs_root_refs(&root->root_item) == 0) {
+- ret = 1;
++ ret = BTRFS_LOG_FORCE_COMMIT;
+ goto end_no_trans;
+ }
+
+@@ -6666,7 +6666,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
+ end_trans:
+ if (ret < 0) {
+ btrfs_set_log_full_commit(trans);
+- ret = 1;
++ ret = BTRFS_LOG_FORCE_COMMIT;
+ }
+
+ if (ret)
+diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
+index 1620f8170629..57ab5f3b8dc7 100644
+--- a/fs/btrfs/tree-log.h
++++ b/fs/btrfs/tree-log.h
+@@ -12,6 +12,9 @@
+ /* return value for btrfs_log_dentry_safe that means we don't need to log it at all */
+ #define BTRFS_NO_LOG_SYNC 256
+
++/* We can't use the tree log for whatever reason, force a transaction commit */
++#define BTRFS_LOG_FORCE_COMMIT (1)
++
+ struct btrfs_log_ctx {
+ int log_ret;
+ int log_transid;
+--
+2.35.1
+
--- /dev/null
+From d866ac5585bc1afadfcc6bb5649813cd1f0f82ab Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Sat, 9 Jul 2022 08:18:47 +0900
+Subject: btrfs: zoned: activate metadata block group on flush_space
+
+From: Naohiro Aota <naohiro.aota@wdc.com>
+
+[ Upstream commit b0931513913633044ed6e3800334c28433c007b0 ]
+
+For metadata space on a zoned filesystem, reaching ALLOC_CHUNK{,_FORCE}
+means we don't have enough space left in the active_total_bytes. Before
+allocating a new chunk, we can try to activate an existing block group
+in this case.
+
+Also, allocating a chunk is not enough to grant a ticket for metadata
+space on a zoned filesystem; we need to activate the block group to
+increase the active_total_bytes.
+
+btrfs_zoned_activate_one_bg() implements the activation. It activates a
+block group, if necessary by finishing another block group first, and it
+gives up when no block group can be finished.
+
+CC: stable@vger.kernel.org # 5.16+
+Fixes: afba2bc036b0 ("btrfs: zoned: implement active zone tracking")
+Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/btrfs/space-info.c | 30 ++++++++++++++++++++++++
+ fs/btrfs/zoned.c | 53 +++++++++++++++++++++++++++++++++++++++++++
+ fs/btrfs/zoned.h | 10 ++++++++
+ 3 files changed, 93 insertions(+)
+
+diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
+index 4867199cf983..104cbc901c0e 100644
+--- a/fs/btrfs/space-info.c
++++ b/fs/btrfs/space-info.c
+@@ -9,6 +9,7 @@
+ #include "ordered-data.h"
+ #include "transaction.h"
+ #include "block-group.h"
++#include "zoned.h"
+
+ /*
+ * HOW DOES SPACE RESERVATION WORK
+@@ -724,6 +725,18 @@ static void flush_space(struct btrfs_fs_info *fs_info,
+ break;
+ case ALLOC_CHUNK:
+ case ALLOC_CHUNK_FORCE:
++ /*
++ * For metadata space on zoned filesystem, reaching here means we
++ * don't have enough space left in active_total_bytes. Try to
++ * activate a block group first, because we may have inactive
++ * block group already allocated.
++ */
++ ret = btrfs_zoned_activate_one_bg(fs_info, space_info, false);
++ if (ret < 0)
++ break;
++ else if (ret == 1)
++ break;
++
+ trans = btrfs_join_transaction(root);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+@@ -734,6 +747,23 @@ static void flush_space(struct btrfs_fs_info *fs_info,
+ (state == ALLOC_CHUNK) ? CHUNK_ALLOC_NO_FORCE :
+ CHUNK_ALLOC_FORCE);
+ btrfs_end_transaction(trans);
++
++ /*
++ * For metadata space on zoned filesystem, allocating a new chunk
++ * is not enough. We still need to activate the block * group.
++ * Active the newly allocated block group by (maybe) finishing
++ * a block group.
++ */
++ if (ret == 1) {
++ ret = btrfs_zoned_activate_one_bg(fs_info, space_info, true);
++ /*
++ * Revert to the original ret regardless we could finish
++ * one block group or not.
++ */
++ if (ret >= 0)
++ ret = 1;
++ }
++
+ if (ret > 0 || ret == -ENOSPC)
+ ret = 0;
+ break;
+diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
+index 2ffc6d50d20d..0c2d81b0e3d3 100644
+--- a/fs/btrfs/zoned.c
++++ b/fs/btrfs/zoned.c
+@@ -2222,3 +2222,56 @@ int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info)
+
+ return ret < 0 ? ret : 1;
+ }
++
++int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info,
++ struct btrfs_space_info *space_info,
++ bool do_finish)
++{
++ struct btrfs_block_group *bg;
++ int index;
++
++ if (!btrfs_is_zoned(fs_info) || (space_info->flags & BTRFS_BLOCK_GROUP_DATA))
++ return 0;
++
++ /* No more block groups to activate */
++ if (space_info->active_total_bytes == space_info->total_bytes)
++ return 0;
++
++ for (;;) {
++ int ret;
++ bool need_finish = false;
++
++ down_read(&space_info->groups_sem);
++ for (index = 0; index < BTRFS_NR_RAID_TYPES; index++) {
++ list_for_each_entry(bg, &space_info->block_groups[index],
++ list) {
++ if (!spin_trylock(&bg->lock))
++ continue;
++ if (btrfs_zoned_bg_is_full(bg) || bg->zone_is_active) {
++ spin_unlock(&bg->lock);
++ continue;
++ }
++ spin_unlock(&bg->lock);
++
++ if (btrfs_zone_activate(bg)) {
++ up_read(&space_info->groups_sem);
++ return 1;
++ }
++
++ need_finish = true;
++ }
++ }
++ up_read(&space_info->groups_sem);
++
++ if (!do_finish || !need_finish)
++ break;
++
++ ret = btrfs_zone_finish_one_bg(fs_info);
++ if (ret == 0)
++ break;
++ if (ret < 0)
++ return ret;
++ }
++
++ return 0;
++}
+diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h
+index 0740458894ac..1cac32266276 100644
+--- a/fs/btrfs/zoned.h
++++ b/fs/btrfs/zoned.h
+@@ -80,6 +80,8 @@ void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info);
+ void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info, u64 logical,
+ u64 length);
+ int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info);
++int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info,
++ struct btrfs_space_info *space_info, bool do_finish);
+ #else /* CONFIG_BLK_DEV_ZONED */
+ static inline int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos,
+ struct blk_zone *zone)
+@@ -250,6 +252,14 @@ static inline int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info)
+ return 1;
+ }
+
++static inline int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info,
++ struct btrfs_space_info *space_info,
++ bool do_finish)
++{
++ /* Consider all the block groups are active */
++ return 0;
++}
++
+ #endif
+
+ static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos)
+--
+2.35.1
+
--- /dev/null
+From eae937137d623d9fd942a7525307563e157bc79a Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Sat, 9 Jul 2022 08:18:48 +0900
+Subject: btrfs: zoned: activate necessary block group
+
+From: Naohiro Aota <naohiro.aota@wdc.com>
+
+[ Upstream commit b6a98021e4019c562a23ad151a7e40adfa9f91e5 ]
+
+There are two places that try to ensure space by allocating a chunk, but
+where allocating a chunk alone is not enough. To meet the condition on
+active_total_bytes, we also need to activate a block group there.
+
+CC: stable@vger.kernel.org # 5.16+
+Fixes: afba2bc036b0 ("btrfs: zoned: implement active zone tracking")
+Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/btrfs/block-group.c | 16 ++++++++++++++++
+ 1 file changed, 16 insertions(+)
+
+diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
+index 88f59a2e4113..0c7fe3142d7c 100644
+--- a/fs/btrfs/block-group.c
++++ b/fs/btrfs/block-group.c
+@@ -2659,6 +2659,14 @@ int btrfs_inc_block_group_ro(struct btrfs_block_group *cache,
+ ret = btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
+ if (ret < 0)
+ goto out;
++ /*
++ * We have allocated a new chunk. We also need to activate that chunk to
++ * grant metadata tickets for zoned filesystem.
++ */
++ ret = btrfs_zoned_activate_one_bg(fs_info, cache->space_info, true);
++ if (ret < 0)
++ goto out;
++
+ ret = inc_block_group_ro(cache, 0);
+ if (ret == -ETXTBSY)
+ goto unlock_out;
+@@ -3853,6 +3861,14 @@ static void reserve_chunk_space(struct btrfs_trans_handle *trans,
+ if (IS_ERR(bg)) {
+ ret = PTR_ERR(bg);
+ } else {
++ /*
++ * We have a new chunk. We also need to activate it for
++ * zoned filesystem.
++ */
++ ret = btrfs_zoned_activate_one_bg(fs_info, info, true);
++ if (ret < 0)
++ return;
++
+ /*
+ * If we fail to add the chunk item here, we end up
+ * trying again at phase 2 of chunk allocation, at
+--
+2.35.1
+
--- /dev/null
+From 28f8ac17eaf7d79fae9a491d13a968e609237c54 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Sat, 9 Jul 2022 08:18:46 +0900
+Subject: btrfs: zoned: disable metadata overcommit for zoned
+
+From: Naohiro Aota <naohiro.aota@wdc.com>
+
+[ Upstream commit 79417d040f4f77b19c701bccc23013b9cdac358d ]
+
+The metadata overcommit makes the space reservation flexible but it is also
+harmful to active zone tracking. Since we cannot finish a block group from
+the metadata allocation context, we might not activate a new block group
+and might not be able to actually write out the overcommit reservations.
+
+So, disable metadata overcommit for zoned filesystems. We will ensure
+the reservations are under active_total_bytes in the following patches.
+
+CC: stable@vger.kernel.org # 5.16+
+Fixes: afba2bc036b0 ("btrfs: zoned: implement active zone tracking")
+Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
+Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/btrfs/space-info.c | 5 ++++-
+ 1 file changed, 4 insertions(+), 1 deletion(-)
+
+diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
+index b87931a458eb..56a7c99fc03e 100644
+--- a/fs/btrfs/space-info.c
++++ b/fs/btrfs/space-info.c
+@@ -340,7 +340,10 @@ int btrfs_can_overcommit(struct btrfs_fs_info *fs_info,
+ return 0;
+
+ used = btrfs_space_info_used(space_info, true);
+- avail = calc_available_free_space(fs_info, space_info, flush);
++ if (btrfs_is_zoned(fs_info) && (space_info->flags & BTRFS_BLOCK_GROUP_METADATA))
++ avail = 0;
++ else
++ avail = calc_available_free_space(fs_info, space_info, flush);
+
+ if (used + bytes < space_info->total_bytes + avail)
+ return 1;
+--
+2.35.1
+
--- /dev/null
+From fe36ff205f09ab9fb8e3d7405b25826217ca9aec Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Sat, 9 Jul 2022 08:18:44 +0900
+Subject: btrfs: zoned: finish least available block group on data bg
+ allocation
+
+From: Naohiro Aota <naohiro.aota@wdc.com>
+
+[ Upstream commit 393f646e34c18b85d0f41272bfcbd475ae3a0d34 ]
+
+When we run out of active zones and not enough space is left in any
+block group, we need to finish one block group to make room to activate a
+new one.
+
+However, we cannot do this for metadata block groups because we can cause a
+deadlock by waiting for a running transaction commit. So, do that only for
+a data block group.
+
+Furthermore, the block group to be finished has two requirements. First,
+the block group must not have reserved bytes left. Having reserved bytes
+means we have an allocated region but did not yet send bios for it. If that
+region is allocated by the thread calling btrfs_zone_finish(), it results
+in a deadlock.
+
+Second, the block group to be finished must not be a SYSTEM block
+group. Finishing a SYSTEM block group easily breaks further chunk
+allocation by nullifying the SYSTEM free space.
+
+In certain cases, we cannot find any zone finish candidate, or
+btrfs_zone_finish() may fail. In that case, we fall back to splitting the
+allocation bytes and filling the last spaces left in the block groups.
+
+CC: stable@vger.kernel.org # 5.16+
+Fixes: afba2bc036b0 ("btrfs: zoned: implement active zone tracking")
+Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
+Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/btrfs/extent-tree.c | 50 +++++++++++++++++++++++++++++++++---------
+ fs/btrfs/zoned.c | 40 +++++++++++++++++++++++++++++++++
+ fs/btrfs/zoned.h | 7 ++++++
+ 3 files changed, 87 insertions(+), 10 deletions(-)
+
+diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
+index 8bdcbc0c6d60..bdebd77f31b4 100644
+--- a/fs/btrfs/extent-tree.c
++++ b/fs/btrfs/extent-tree.c
+@@ -3985,6 +3985,45 @@ static void found_extent(struct find_free_extent_ctl *ffe_ctl,
+ }
+ }
+
++static int can_allocate_chunk_zoned(struct btrfs_fs_info *fs_info,
++ struct find_free_extent_ctl *ffe_ctl)
++{
++ /* If we can activate new zone, just allocate a chunk and use it */
++ if (btrfs_can_activate_zone(fs_info->fs_devices, ffe_ctl->flags))
++ return 0;
++
++ /*
++ * We already reached the max active zones. Try to finish one block
++ * group to make a room for a new block group. This is only possible
++ * for a data block group because btrfs_zone_finish() may need to wait
++ * for a running transaction which can cause a deadlock for metadata
++ * allocation.
++ */
++ if (ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA) {
++ int ret = btrfs_zone_finish_one_bg(fs_info);
++
++ if (ret == 1)
++ return 0;
++ else if (ret < 0)
++ return ret;
++ }
++
++ /*
++ * If we have enough free space left in an already active block group
++ * and we can't activate any other zone now, do not allow allocating a
++ * new chunk and let find_free_extent() retry with a smaller size.
++ */
++ if (ffe_ctl->max_extent_size >= ffe_ctl->min_alloc_size)
++ return -ENOSPC;
++
++ /*
++ * We cannot activate a new block group and no enough space left in any
++ * block groups. So, allocating a new block group may not help. But,
++ * there is nothing to do anyway, so let's go with it.
++ */
++ return 0;
++}
++
+ static int can_allocate_chunk(struct btrfs_fs_info *fs_info,
+ struct find_free_extent_ctl *ffe_ctl)
+ {
+@@ -3992,16 +4031,7 @@ static int can_allocate_chunk(struct btrfs_fs_info *fs_info,
+ case BTRFS_EXTENT_ALLOC_CLUSTERED:
+ return 0;
+ case BTRFS_EXTENT_ALLOC_ZONED:
+- /*
+- * If we have enough free space left in an already
+- * active block group and we can't activate any other
+- * zone now, do not allow allocating a new chunk and
+- * let find_free_extent() retry with a smaller size.
+- */
+- if (ffe_ctl->max_extent_size >= ffe_ctl->min_alloc_size &&
+- !btrfs_can_activate_zone(fs_info->fs_devices, ffe_ctl->flags))
+- return -ENOSPC;
+- return 0;
++ return can_allocate_chunk_zoned(fs_info, ffe_ctl);
+ default:
+ BUG();
+ }
+diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
+index a0bf2c20fa61..0a6a3d6f5af7 100644
+--- a/fs/btrfs/zoned.c
++++ b/fs/btrfs/zoned.c
+@@ -2176,3 +2176,43 @@ void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info, u64 logica
+ spin_unlock(&block_group->lock);
+ btrfs_put_block_group(block_group);
+ }
++
++int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info)
++{
++ struct btrfs_block_group *block_group;
++ struct btrfs_block_group *min_bg = NULL;
++ u64 min_avail = U64_MAX;
++ int ret;
++
++ spin_lock(&fs_info->zone_active_bgs_lock);
++ list_for_each_entry(block_group, &fs_info->zone_active_bgs,
++ active_bg_list) {
++ u64 avail;
++
++ spin_lock(&block_group->lock);
++ if (block_group->reserved ||
++ (block_group->flags & BTRFS_BLOCK_GROUP_SYSTEM)) {
++ spin_unlock(&block_group->lock);
++ continue;
++ }
++
++ avail = block_group->zone_capacity - block_group->alloc_offset;
++ if (min_avail > avail) {
++ if (min_bg)
++ btrfs_put_block_group(min_bg);
++ min_bg = block_group;
++ min_avail = avail;
++ btrfs_get_block_group(min_bg);
++ }
++ spin_unlock(&block_group->lock);
++ }
++ spin_unlock(&fs_info->zone_active_bgs_lock);
++
++ if (!min_bg)
++ return 0;
++
++ ret = btrfs_zone_finish(min_bg);
++ btrfs_put_block_group(min_bg);
++
++ return ret < 0 ? ret : 1;
++}
+diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h
+index 2d6da8f4b55a..c424417e19bb 100644
+--- a/fs/btrfs/zoned.h
++++ b/fs/btrfs/zoned.h
+@@ -83,6 +83,7 @@ void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg);
+ void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info);
+ void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info, u64 logical,
+ u64 length);
++int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info);
+ #else /* CONFIG_BLK_DEV_ZONED */
+ static inline int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos,
+ struct blk_zone *zone)
+@@ -247,6 +248,12 @@ static inline void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info) { }
+
+ static inline void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info,
+ u64 logical, u64 length) { }
++
++static inline int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info)
++{
++ return 1;
++}
++
+ #endif
+
+ static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos)
+--
+2.35.1
+
--- /dev/null
+From 3672acc7c01efcf0c957c7949f4ff6dcaffde3f1 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 3 May 2022 17:48:50 -0700
+Subject: btrfs: zoned: introduce btrfs_zoned_bg_is_full
+
+From: Naohiro Aota <naohiro.aota@wdc.com>
+
+[ Upstream commit 1bfd476754a2d63f899ef9c3e253b17766b8fb73 ]
+
+Introduce a wrapper to check if all the space in a block group is
+allocated or not.
+
+Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
+Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/btrfs/extent-tree.c | 3 +--
+ fs/btrfs/zoned.c | 2 +-
+ fs/btrfs/zoned.h | 6 ++++++
+ 3 files changed, 8 insertions(+), 3 deletions(-)
+
+diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
+index bdebd77f31b4..56185541e188 100644
+--- a/fs/btrfs/extent-tree.c
++++ b/fs/btrfs/extent-tree.c
+@@ -3803,8 +3803,7 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group,
+
+ /* Check RO and no space case before trying to activate it */
+ spin_lock(&block_group->lock);
+- if (block_group->ro ||
+- block_group->alloc_offset == block_group->zone_capacity) {
++ if (block_group->ro || btrfs_zoned_bg_is_full(block_group)) {
+ ret = 1;
+ /*
+ * May need to clear fs_info->{treelog,data_reloc}_bg.
+diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
+index 0a6a3d6f5af7..170681797283 100644
+--- a/fs/btrfs/zoned.c
++++ b/fs/btrfs/zoned.c
+@@ -1859,7 +1859,7 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group)
+ }
+
+ /* No space left */
+- if (block_group->alloc_offset == block_group->zone_capacity) {
++ if (btrfs_zoned_bg_is_full(block_group)) {
+ ret = false;
+ goto out_unlock;
+ }
+diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h
+index 199b69670fa2..0740458894ac 100644
+--- a/fs/btrfs/zoned.h
++++ b/fs/btrfs/zoned.h
+@@ -384,4 +384,10 @@ static inline void btrfs_zoned_data_reloc_unlock(struct btrfs_inode *inode)
+ mutex_unlock(&root->fs_info->zoned_data_reloc_io_lock);
+ }
+
++static inline bool btrfs_zoned_bg_is_full(const struct btrfs_block_group *bg)
++{
++ ASSERT(btrfs_is_zoned(bg->fs_info));
++ return (bg->alloc_offset == bg->zone_capacity);
++}
++
+ #endif
+--
+2.35.1
+
--- /dev/null
+From 24698349d4bc8dc722b1db3ab3979252e0b38fe3 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Sat, 9 Jul 2022 08:18:45 +0900
+Subject: btrfs: zoned: introduce space_info->active_total_bytes
+
+From: Naohiro Aota <naohiro.aota@wdc.com>
+
+[ Upstream commit 6a921de589926a350634e6e279f43fa5b9dbf5ba ]
+
+The active_total_bytes, like the total_bytes, accounts for the total bytes
+of active block groups in the space_info.
+
+With the introduction of active_total_bytes, we can check if the reserved
+bytes can be written to the block groups without activating a new block
+group. The check is necessary for metadata allocation on a zoned
+filesystem. We cannot finish a block group from the metadata allocation
+context, as that may require waiting for the current transaction.
+Instead, we need to ensure the ongoing allocation (reserved bytes) fits
+in active block groups.
+
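+The core of the check, condensed from the hunks below (sketch only):
+
+    /* on zoned filesystems only active block groups are writable for metadata */
+    if (!btrfs_is_zoned(fs_info) || (space_info->flags & BTRFS_BLOCK_GROUP_DATA))
+            total = space_info->total_bytes;
+    else
+            total = space_info->active_total_bytes;
+
+    /* a reservation is allowed only if it fits in the writable total */
+    ok = (used + bytes < total + avail);
+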
+Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/btrfs/block-group.c | 12 +++++++++---
+ fs/btrfs/space-info.c | 41 ++++++++++++++++++++++++++++++++---------
+ fs/btrfs/space-info.h | 4 +++-
+ fs/btrfs/zoned.c | 6 ++++++
+ 4 files changed, 50 insertions(+), 13 deletions(-)
+
+diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
+index 1deca5164c23..88f59a2e4113 100644
+--- a/fs/btrfs/block-group.c
++++ b/fs/btrfs/block-group.c
+@@ -1033,8 +1033,13 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
+ < block_group->zone_unusable);
+ WARN_ON(block_group->space_info->disk_total
+ < block_group->length * factor);
++ WARN_ON(block_group->zone_is_active &&
++ block_group->space_info->active_total_bytes
++ < block_group->length);
+ }
+ block_group->space_info->total_bytes -= block_group->length;
++ if (block_group->zone_is_active)
++ block_group->space_info->active_total_bytes -= block_group->length;
+ block_group->space_info->bytes_readonly -=
+ (block_group->length - block_group->zone_unusable);
+ block_group->space_info->bytes_zone_unusable -=
+@@ -2102,7 +2107,8 @@ static int read_one_block_group(struct btrfs_fs_info *info,
+ trace_btrfs_add_block_group(info, cache, 0);
+ btrfs_update_space_info(info, cache->flags, cache->length,
+ cache->used, cache->bytes_super,
+- cache->zone_unusable, &space_info);
++ cache->zone_unusable, cache->zone_is_active,
++ &space_info);
+
+ cache->space_info = space_info;
+
+@@ -2172,7 +2178,7 @@ static int fill_dummy_bgs(struct btrfs_fs_info *fs_info)
+ }
+
+ btrfs_update_space_info(fs_info, bg->flags, em->len, em->len,
+- 0, 0, &space_info);
++ 0, 0, false, &space_info);
+ bg->space_info = space_info;
+ link_block_group(bg);
+
+@@ -2553,7 +2559,7 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran
+ trace_btrfs_add_block_group(fs_info, cache, 1);
+ btrfs_update_space_info(fs_info, cache->flags, size, bytes_used,
+ cache->bytes_super, cache->zone_unusable,
+- &cache->space_info);
++ cache->zone_is_active, &cache->space_info);
+ btrfs_update_global_block_rsv(fs_info);
+
+ link_block_group(cache);
+diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
+index 98a84b523be6..4867199cf983 100644
+--- a/fs/btrfs/space-info.c
++++ b/fs/btrfs/space-info.c
+@@ -295,7 +295,7 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
+ void btrfs_update_space_info(struct btrfs_fs_info *info, u64 flags,
+ u64 total_bytes, u64 bytes_used,
+ u64 bytes_readonly, u64 bytes_zone_unusable,
+- struct btrfs_space_info **space_info)
++ bool active, struct btrfs_space_info **space_info)
+ {
+ struct btrfs_space_info *found;
+ int factor;
+@@ -306,6 +306,8 @@ void btrfs_update_space_info(struct btrfs_fs_info *info, u64 flags,
+ ASSERT(found);
+ spin_lock(&found->lock);
+ found->total_bytes += total_bytes;
++ if (active)
++ found->active_total_bytes += total_bytes;
+ found->disk_total += total_bytes * factor;
+ found->bytes_used += bytes_used;
+ found->disk_used += bytes_used * factor;
+@@ -369,6 +371,22 @@ static u64 calc_available_free_space(struct btrfs_fs_info *fs_info,
+ return avail;
+ }
+
++static inline u64 writable_total_bytes(struct btrfs_fs_info *fs_info,
++ struct btrfs_space_info *space_info)
++{
++ /*
++ * On regular filesystem, all total_bytes are always writable. On zoned
++ * filesystem, there may be a limitation imposed by max_active_zones.
++ * For metadata allocation, we cannot finish an existing active block
++ * group to avoid a deadlock. Thus, we need to consider only the active
++ * groups to be writable for metadata space.
++ */
++ if (!btrfs_is_zoned(fs_info) || (space_info->flags & BTRFS_BLOCK_GROUP_DATA))
++ return space_info->total_bytes;
++
++ return space_info->active_total_bytes;
++}
++
+ int btrfs_can_overcommit(struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *space_info, u64 bytes,
+ enum btrfs_reserve_flush_enum flush)
+@@ -386,7 +404,7 @@ int btrfs_can_overcommit(struct btrfs_fs_info *fs_info,
+ else
+ avail = calc_available_free_space(fs_info, space_info, flush);
+
+- if (used + bytes < space_info->total_bytes + avail)
++ if (used + bytes < writable_total_bytes(fs_info, space_info) + avail)
+ return 1;
+ return 0;
+ }
+@@ -422,7 +440,7 @@ void btrfs_try_granting_tickets(struct btrfs_fs_info *fs_info,
+ ticket = list_first_entry(head, struct reserve_ticket, list);
+
+ /* Check and see if our ticket can be satisfied now. */
+- if ((used + ticket->bytes <= space_info->total_bytes) ||
++ if ((used + ticket->bytes <= writable_total_bytes(fs_info, space_info)) ||
+ btrfs_can_overcommit(fs_info, space_info, ticket->bytes,
+ flush)) {
+ btrfs_space_info_update_bytes_may_use(fs_info,
+@@ -753,6 +771,7 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
+ {
+ u64 used;
+ u64 avail;
++ u64 total;
+ u64 to_reclaim = space_info->reclaim_size;
+
+ lockdep_assert_held(&space_info->lock);
+@@ -767,8 +786,9 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
+ * space. If that's the case add in our overage so we make sure to put
+ * appropriate pressure on the flushing state machine.
+ */
+- if (space_info->total_bytes + avail < used)
+- to_reclaim += used - (space_info->total_bytes + avail);
++ total = writable_total_bytes(fs_info, space_info);
++ if (total + avail < used)
++ to_reclaim += used - (total + avail);
+
+ return to_reclaim;
+ }
+@@ -778,9 +798,12 @@ static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info,
+ {
+ u64 global_rsv_size = fs_info->global_block_rsv.reserved;
+ u64 ordered, delalloc;
+- u64 thresh = div_factor_fine(space_info->total_bytes, 90);
++ u64 total = writable_total_bytes(fs_info, space_info);
++ u64 thresh;
+ u64 used;
+
++ thresh = div_factor_fine(total, 90);
++
+ lockdep_assert_held(&space_info->lock);
+
+ /* If we're just plain full then async reclaim just slows us down. */
+@@ -842,8 +865,8 @@ static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info,
+ BTRFS_RESERVE_FLUSH_ALL);
+ used = space_info->bytes_used + space_info->bytes_reserved +
+ space_info->bytes_readonly + global_rsv_size;
+- if (used < space_info->total_bytes)
+- thresh += space_info->total_bytes - used;
++ if (used < total)
++ thresh += total - used;
+ thresh >>= space_info->clamp;
+
+ used = space_info->bytes_pinned;
+@@ -1560,7 +1583,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info,
+ * can_overcommit() to ensure we can overcommit to continue.
+ */
+ if (!pending_tickets &&
+- ((used + orig_bytes <= space_info->total_bytes) ||
++ ((used + orig_bytes <= writable_total_bytes(fs_info, space_info)) ||
+ btrfs_can_overcommit(fs_info, space_info, orig_bytes, flush))) {
+ btrfs_space_info_update_bytes_may_use(fs_info, space_info,
+ orig_bytes);
+diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h
+index 137206b8049f..b8cee27df213 100644
+--- a/fs/btrfs/space-info.h
++++ b/fs/btrfs/space-info.h
+@@ -17,6 +17,8 @@ struct btrfs_space_info {
+ u64 bytes_may_use; /* number of bytes that may be used for
+ delalloc/allocations */
+ u64 bytes_readonly; /* total bytes that are read only */
++ /* Total bytes in the space, but only accounts active block groups. */
++ u64 active_total_bytes;
+ u64 bytes_zone_unusable; /* total bytes that are unusable until
+ resetting the device zone */
+
+@@ -122,7 +124,7 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info);
+ void btrfs_update_space_info(struct btrfs_fs_info *info, u64 flags,
+ u64 total_bytes, u64 bytes_used,
+ u64 bytes_readonly, u64 bytes_zone_unusable,
+- struct btrfs_space_info **space_info);
++ bool active, struct btrfs_space_info **space_info);
+ void btrfs_update_space_info_chunk_size(struct btrfs_space_info *space_info,
+ u64 chunk_size);
+ struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info,
+diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
+index 170681797283..2ffc6d50d20d 100644
+--- a/fs/btrfs/zoned.c
++++ b/fs/btrfs/zoned.c
+@@ -1841,6 +1841,7 @@ struct btrfs_device *btrfs_zoned_get_device(struct btrfs_fs_info *fs_info,
+ bool btrfs_zone_activate(struct btrfs_block_group *block_group)
+ {
+ struct btrfs_fs_info *fs_info = block_group->fs_info;
++ struct btrfs_space_info *space_info = block_group->space_info;
+ struct map_lookup *map;
+ struct btrfs_device *device;
+ u64 physical;
+@@ -1852,6 +1853,7 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group)
+
+ map = block_group->physical_map;
+
++ spin_lock(&space_info->lock);
+ spin_lock(&block_group->lock);
+ if (block_group->zone_is_active) {
+ ret = true;
+@@ -1880,7 +1882,10 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group)
+
+ /* Successfully activated all the zones */
+ block_group->zone_is_active = 1;
++ space_info->active_total_bytes += block_group->length;
+ spin_unlock(&block_group->lock);
++ btrfs_try_granting_tickets(fs_info, space_info);
++ spin_unlock(&space_info->lock);
+
+ /* For the active block group list */
+ btrfs_get_block_group(block_group);
+@@ -1893,6 +1898,7 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group)
+
+ out_unlock:
+ spin_unlock(&block_group->lock);
++ spin_unlock(&space_info->lock);
+ return ret;
+ }
+
+--
+2.35.1
+
--- /dev/null
+From 33903083ab3fd9bee96c3d8ade9c045e2b61060a Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Sat, 9 Jul 2022 08:18:39 +0900
+Subject: btrfs: zoned: revive max_zone_append_bytes
+
+From: Naohiro Aota <naohiro.aota@wdc.com>
+
+[ Upstream commit c2ae7b772ef4e86c5ddf3fd47bf59045ae96a414 ]
+
+This patch is basically a revert of commit 5a80d1c6a270 ("btrfs: zoned:
+remove max_zone_append_size logic"), but without the unnecessary ASSERT and
+check. The max_zone_append_size will be used as a hint to estimate the
+number of extents needed to cover a delalloc/writeback region in later
+commits.
+
+The size of a ZONE APPEND bio is also limited by queue_max_segments(), so
+this commit considers it to calculate max_zone_append_size. Technically, a
+bio can be larger than queue_max_segments() * PAGE_SIZE if the pages are
+contiguous. But, it is safe to consider "queue_max_segments() * PAGE_SIZE"
+as an upper limit of an extent size to calculate the number of extents
+needed to write data.
+
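+As a worked example with illustrative numbers (not from the patch): a
+device reporting bdev_max_zone_append_sectors() of 1024 (512 KiB) and
+bdev_max_segments() of 64 with 4 KiB pages gets
+min(512 KiB, 64 * 4 KiB) = 256 KiB for max_zone_append_size, so a 1 MiB
+delalloc range is counted as four extents rather than one.
+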
+Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
+Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/btrfs/ctree.h | 2 ++
+ fs/btrfs/zoned.c | 17 +++++++++++++++++
+ fs/btrfs/zoned.h | 1 +
+ 3 files changed, 20 insertions(+)
+
+diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
+index 077c95e9baa5..1c377bcfe787 100644
+--- a/fs/btrfs/ctree.h
++++ b/fs/btrfs/ctree.h
+@@ -1050,6 +1050,8 @@ struct btrfs_fs_info {
+ u64 zoned;
+ };
+
++ /* Max size to emit ZONE_APPEND write command */
++ u64 max_zone_append_size;
+ struct mutex zoned_meta_io_lock;
+ spinlock_t treelog_bg_lock;
+ u64 treelog_bg;
+diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
+index 84b6d39509bd..1d5b9308f5ef 100644
+--- a/fs/btrfs/zoned.c
++++ b/fs/btrfs/zoned.c
+@@ -407,6 +407,16 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
+ nr_sectors = bdev_nr_sectors(bdev);
+ zone_info->zone_size_shift = ilog2(zone_info->zone_size);
+ zone_info->nr_zones = nr_sectors >> ilog2(zone_sectors);
++ /*
++ * We limit max_zone_append_size also by max_segments *
++ * PAGE_SIZE. Technically, we can have multiple pages per segment. But,
++ * since btrfs adds the pages one by one to a bio, and btrfs cannot
++ * increase the metadata reservation even if it increases the number of
++ * extents, it is safe to stick with the limit.
++ */
++ zone_info->max_zone_append_size =
++ min_t(u64, (u64)bdev_max_zone_append_sectors(bdev) << SECTOR_SHIFT,
++ (u64)bdev_max_segments(bdev) << PAGE_SHIFT);
+ if (!IS_ALIGNED(nr_sectors, zone_sectors))
+ zone_info->nr_zones++;
+
+@@ -632,6 +642,7 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info)
+ u64 zoned_devices = 0;
+ u64 nr_devices = 0;
+ u64 zone_size = 0;
++ u64 max_zone_append_size = 0;
+ const bool incompat_zoned = btrfs_fs_incompat(fs_info, ZONED);
+ int ret = 0;
+
+@@ -666,6 +677,11 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info)
+ ret = -EINVAL;
+ goto out;
+ }
++ if (!max_zone_append_size ||
++ (zone_info->max_zone_append_size &&
++ zone_info->max_zone_append_size < max_zone_append_size))
++ max_zone_append_size =
++ zone_info->max_zone_append_size;
+ }
+ nr_devices++;
+ }
+@@ -715,6 +731,7 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info)
+ }
+
+ fs_info->zone_size = zone_size;
++ fs_info->max_zone_append_size = max_zone_append_size;
+ fs_info->fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_ZONED;
+
+ /*
+diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h
+index cf6320feef46..2d6da8f4b55a 100644
+--- a/fs/btrfs/zoned.h
++++ b/fs/btrfs/zoned.h
+@@ -23,6 +23,7 @@ struct btrfs_zoned_device_info {
+ */
+ u64 zone_size;
+ u8 zone_size_shift;
++ u64 max_zone_append_size;
+ u32 nr_zones;
+ unsigned int max_active_zones;
+ atomic_t active_zones_left;
+--
+2.35.1
+
--- /dev/null
+From bfdbe30121dfbc58093980adb193543c02b438ce Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Sat, 9 Jul 2022 08:18:50 +0900
+Subject: btrfs: zoned: wait until zone is finished when allocation didn't
+ progress
+
+From: Naohiro Aota <naohiro.aota@wdc.com>
+
+[ Upstream commit 2ce543f478433a0eec0f72090d7e814f1d53d456 ]
+
+When the allocated position doesn't progress, we cannot submit IOs to
+finish a block group, but there should be ongoing IOs that will finish a
+block group. So, in that case, we wait for a zone to be finished and retry
+the allocation after that.
+
+Introduce a new flag BTRFS_FS_NEED_ZONE_FINISH for fs_info->flags to
+indicate that we need a zone finish in order to proceed. The flag is set
+when the allocator detects it cannot activate a new block group, and it is
+cleared once a zone is finished.
+
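+Condensed, the resulting handshake looks like this (sketch; the actual
+hunks follow below):
+
+    /* allocator: no zone could be activated */
+    set_bit(BTRFS_FS_NEED_ZONE_FINISH, &fs_info->flags);
+
+    /* writer: block until a zone finish clears the flag */
+    wait_var_event(&fs_info->zone_finish_wait,
+                   !test_bit(BTRFS_FS_NEED_ZONE_FINISH, &fs_info->flags));
+
+    /* zone finish: allow new activations again and wake up waiters */
+    clear_bit(BTRFS_FS_NEED_ZONE_FINISH, &fs_info->flags);
+    wake_up_all(&fs_info->zone_finish_wait);
+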
+CC: stable@vger.kernel.org # 5.16+
+Fixes: afba2bc036b0 ("btrfs: zoned: implement active zone tracking")
+Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/btrfs/ctree.h | 5 +++++
+ fs/btrfs/disk-io.c | 1 +
+ fs/btrfs/inode.c | 9 +++++++--
+ fs/btrfs/zoned.c | 6 ++++++
+ 4 files changed, 19 insertions(+), 2 deletions(-)
+
+diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
+index 97f5a3d320ff..76fbe4cf2a28 100644
+--- a/fs/btrfs/ctree.h
++++ b/fs/btrfs/ctree.h
+@@ -635,6 +635,9 @@ enum {
+ /* Indicate we have half completed snapshot deletions pending. */
+ BTRFS_FS_UNFINISHED_DROPS,
+
++ /* Indicate we have to finish a zone to do next allocation. */
++ BTRFS_FS_NEED_ZONE_FINISH,
++
+ #if BITS_PER_LONG == 32
+ /* Indicate if we have error/warn message printed on 32bit systems */
+ BTRFS_FS_32BIT_ERROR,
+@@ -1074,6 +1077,8 @@ struct btrfs_fs_info {
+
+ spinlock_t zone_active_bgs_lock;
+ struct list_head zone_active_bgs;
++ /* Waiters when BTRFS_FS_NEED_ZONE_FINISH is set */
++ wait_queue_head_t zone_finish_wait;
+
+ #ifdef CONFIG_BTRFS_FS_REF_VERIFY
+ spinlock_t ref_verify_lock;
+diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
+index bf5c6ac67e87..59fa7bf3a2e5 100644
+--- a/fs/btrfs/disk-io.c
++++ b/fs/btrfs/disk-io.c
+@@ -3239,6 +3239,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
+ init_waitqueue_head(&fs_info->transaction_blocked_wait);
+ init_waitqueue_head(&fs_info->async_submit_wait);
+ init_waitqueue_head(&fs_info->delayed_iputs_wait);
++ init_waitqueue_head(&fs_info->zone_finish_wait);
+
+ /* Usable values until the real ones are cached from the superblock */
+ fs_info->nodesize = 4096;
+diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
+index 9753fc47e488..64d310ecbb84 100644
+--- a/fs/btrfs/inode.c
++++ b/fs/btrfs/inode.c
+@@ -1606,8 +1606,13 @@ static noinline int run_delalloc_zoned(struct btrfs_inode *inode,
+ if (ret == 0)
+ done_offset = end;
+
+- if (done_offset == start)
+- return -ENOSPC;
++ if (done_offset == start) {
++ struct btrfs_fs_info *info = inode->root->fs_info;
++
++ wait_var_event(&info->zone_finish_wait,
++ !test_bit(BTRFS_FS_NEED_ZONE_FINISH, &info->flags));
++ continue;
++ }
+
+ if (!locked_page_done) {
+ __set_page_dirty_nobuffers(locked_page);
+diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
+index 0c2d81b0e3d3..45e29b8c705c 100644
+--- a/fs/btrfs/zoned.c
++++ b/fs/btrfs/zoned.c
+@@ -1993,6 +1993,9 @@ int btrfs_zone_finish(struct btrfs_block_group *block_group)
+ /* For active_bg_list */
+ btrfs_put_block_group(block_group);
+
++ clear_bit(BTRFS_FS_NEED_ZONE_FINISH, &fs_info->flags);
++ wake_up_all(&fs_info->zone_finish_wait);
++
+ return 0;
+ }
+
+@@ -2021,6 +2024,9 @@ bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags)
+ }
+ mutex_unlock(&fs_info->chunk_mutex);
+
++ if (!ret)
++ set_bit(BTRFS_FS_NEED_ZONE_FINISH, &fs_info->flags);
++
+ return ret;
+ }
+
+--
+2.35.1
+
--- /dev/null
+From e3d143c47bd2fd49137af62c599571f2ab7620d6 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Sat, 9 Jul 2022 08:18:49 +0900
+Subject: btrfs: zoned: write out partially allocated region
+
+From: Naohiro Aota <naohiro.aota@wdc.com>
+
+[ Upstream commit 898793d992c23dac6126a6a94ad893eae1a2c9df ]
+
+cow_file_range() works in an all-or-nothing way: if it fails to allocate an
+extent for a part of the given region, it gives up the whole region,
+including the successfully allocated parts. On top of cow_file_range(),
+run_delalloc_zoned() writes data for the region only when it successfully
+allocates the whole region.
+
+This all-or-nothing allocation and write-out is problematic when the
+available space in all the block groups gets tight with the active zone
+restriction. btrfs_reserve_extent() tries hard to use the space left in
+the active block groups, then finally gives up and fails with
+-ENOSPC. However, if we send IOs for the successfully allocated region, we
+can finish a zone and continue the rest of the allocation on a newly
+allocated block group.
+
+This patch implements the partial write-out for run_delalloc_zoned(). With
+this patch applied, cow_file_range() returns -EAGAIN to tell the caller to
+do something to make further allocation progress, and reports the
+successfully allocated region via done_offset. Furthermore, the zoned
+extent allocator returns -EAGAIN to tell cow_file_range() to go back to
+the caller.
+
+Actually, we still need to wait for an IO to complete to continue the
+allocation. The next patch implements that part.
+
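+Condensed shape of the new run_delalloc_zoned() loop (sketch; the full
+hunks are below):
+
+    while (start <= end) {
+            ret = cow_file_range(inode, locked_page, start, end, page_started,
+                                 nr_written, 0, &done_offset);
+            if (ret && ret != -EAGAIN)
+                    return ret;
+            /* write out what was allocated, then retry the remainder */
+            extent_write_locked_range(&inode->vfs_inode, start, done_offset);
+            start = done_offset + 1;
+    }
+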
+CC: stable@vger.kernel.org # 5.16+
+Fixes: afba2bc036b0 ("btrfs: zoned: implement active zone tracking")
+Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/btrfs/extent-tree.c | 10 +++++++
+ fs/btrfs/inode.c | 63 ++++++++++++++++++++++++++++++++----------
+ 2 files changed, 59 insertions(+), 14 deletions(-)
+
+diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
+index 56185541e188..eee68a6f2be7 100644
+--- a/fs/btrfs/extent-tree.c
++++ b/fs/btrfs/extent-tree.c
+@@ -4015,6 +4015,16 @@ static int can_allocate_chunk_zoned(struct btrfs_fs_info *fs_info,
+ if (ffe_ctl->max_extent_size >= ffe_ctl->min_alloc_size)
+ return -ENOSPC;
+
++ /*
++ * Even min_alloc_size is not left in any block groups. Since we cannot
++ * activate a new block group, allocating it may not help. Let's tell a
++ * caller to try again and hope it progress something by writing some
++ * parts of the region. That is only possible for data block groups,
++ * where a part of the region can be written.
++ */
++ if (ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA)
++ return -EAGAIN;
++
+ /*
+ * We cannot activate a new block group and no enough space left in any
+ * block groups. So, allocating a new block group may not help. But,
+diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
+index c50288d90c66..9753fc47e488 100644
+--- a/fs/btrfs/inode.c
++++ b/fs/btrfs/inode.c
+@@ -92,7 +92,8 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent);
+ static noinline int cow_file_range(struct btrfs_inode *inode,
+ struct page *locked_page,
+ u64 start, u64 end, int *page_started,
+- unsigned long *nr_written, int unlock);
++ unsigned long *nr_written, int unlock,
++ u64 *done_offset);
+ static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start,
+ u64 len, u64 orig_start, u64 block_start,
+ u64 block_len, u64 orig_block_len,
+@@ -884,7 +885,7 @@ static int submit_uncompressed_range(struct btrfs_inode *inode,
+ * can directly submit them without interruption.
+ */
+ ret = cow_file_range(inode, locked_page, start, end, &page_started,
+- &nr_written, 0);
++ &nr_written, 0, NULL);
+ /* Inline extent inserted, page gets unlocked and everything is done */
+ if (page_started) {
+ ret = 0;
+@@ -1133,7 +1134,8 @@ static u64 get_extent_allocation_hint(struct btrfs_inode *inode, u64 start,
+ static noinline int cow_file_range(struct btrfs_inode *inode,
+ struct page *locked_page,
+ u64 start, u64 end, int *page_started,
+- unsigned long *nr_written, int unlock)
++ unsigned long *nr_written, int unlock,
++ u64 *done_offset)
+ {
+ struct btrfs_root *root = inode->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+@@ -1326,6 +1328,21 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
+ btrfs_dec_block_group_reservations(fs_info, ins.objectid);
+ btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
+ out_unlock:
++ /*
++ * If done_offset is non-NULL and ret == -EAGAIN, we expect the
++ * caller to write out the successfully allocated region and retry.
++ */
++ if (done_offset && ret == -EAGAIN) {
++ if (orig_start < start)
++ *done_offset = start - 1;
++ else
++ *done_offset = start;
++ return ret;
++ } else if (ret == -EAGAIN) {
++ /* Convert to -ENOSPC since the caller cannot retry. */
++ ret = -ENOSPC;
++ }
++
+ /*
+ * Now, we have three regions to clean up:
+ *
+@@ -1571,19 +1588,37 @@ static noinline int run_delalloc_zoned(struct btrfs_inode *inode,
+ u64 end, int *page_started,
+ unsigned long *nr_written)
+ {
++ u64 done_offset = end;
+ int ret;
++ bool locked_page_done = false;
+
+- ret = cow_file_range(inode, locked_page, start, end, page_started,
+- nr_written, 0);
+- if (ret)
+- return ret;
++ while (start <= end) {
++ ret = cow_file_range(inode, locked_page, start, end, page_started,
++ nr_written, 0, &done_offset);
++ if (ret && ret != -EAGAIN)
++ return ret;
+
+- if (*page_started)
+- return 0;
++ if (*page_started) {
++ ASSERT(ret == 0);
++ return 0;
++ }
++
++ if (ret == 0)
++ done_offset = end;
++
++ if (done_offset == start)
++ return -ENOSPC;
++
++ if (!locked_page_done) {
++ __set_page_dirty_nobuffers(locked_page);
++ account_page_redirty(locked_page);
++ }
++ locked_page_done = true;
++ extent_write_locked_range(&inode->vfs_inode, start, done_offset);
++
++ start = done_offset + 1;
++ }
+
+- __set_page_dirty_nobuffers(locked_page);
+- account_page_redirty(locked_page);
+- extent_write_locked_range(&inode->vfs_inode, start, end);
+ *page_started = 1;
+
+ return 0;
+@@ -1675,7 +1710,7 @@ static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page,
+ }
+
+ return cow_file_range(inode, locked_page, start, end, page_started,
+- nr_written, 1);
++ nr_written, 1, NULL);
+ }
+
+ /*
+@@ -2086,7 +2121,7 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page
+ page_started, nr_written);
+ else
+ ret = cow_file_range(inode, locked_page, start, end,
+- page_started, nr_written, 1);
++ page_started, nr_written, 1, NULL);
+ } else {
+ set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &inode->runtime_flags);
+ ret = cow_file_range_async(inode, wbc, locked_page, start, end,
+--
+2.35.1
+
--- /dev/null
+From 808c1dca59fc32bd267c25d387bbc56a55144ccb Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Sat, 28 May 2022 21:44:07 +0200
+Subject: crypto: blake2s - remove shash module
+
+From: Jason A. Donenfeld <Jason@zx2c4.com>
+
+[ Upstream commit 2d16803c562ecc644803d42ba98a8e0aef9c014e ]
+
+BLAKE2s has no currently known use as an shash. Just remove all of this
+unnecessary plumbing. Removing this shash was something we talked about
+back when we were making BLAKE2s a built-in, but I simply never got
+around to doing it. So this completes that project.
+
+Importantly, this fixes a bug in which the lib code depends on
+crypto_simd_disabled_for_test, causing linker errors.
+
+Also add more alignment tests to the selftests and compare SIMD and
+non-SIMD compression functions, to make up for what we lose from
+testmgr.c.
+
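+For reference, a minimal sketch of how callers keep using the remaining
+library interface (this assumes the one-shot blake2s() helper from
+<crypto/blake2s.h>; the wrapper function below is made up purely for
+illustration):
+
+	#include <crypto/blake2s.h>
+
+	static void blake2s_demo(const u8 *data, size_t len)
+	{
+		u8 digest[BLAKE2S_256_HASH_SIZE];
+
+		/* Unkeyed, one-shot hash through the library API; no shash tfm. */
+		blake2s(digest, data, NULL, sizeof(digest), len, 0);
+	}
+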
+Reported-by: gaochao <gaochao49@huawei.com>
+Cc: Eric Biggers <ebiggers@kernel.org>
+Cc: Ard Biesheuvel <ardb@kernel.org>
+Cc: stable@vger.kernel.org
+Fixes: 6048fdcc5f26 ("lib/crypto: blake2s: include as built-in")
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/arm/crypto/Kconfig | 2 +-
+ arch/arm/crypto/Makefile | 4 +-
+ arch/arm/crypto/blake2s-shash.c | 75 -----------
+ arch/x86/crypto/Makefile | 4 +-
+ arch/x86/crypto/blake2s-glue.c | 3 +-
+ arch/x86/crypto/blake2s-shash.c | 77 -----------
+ crypto/Kconfig | 20 +--
+ crypto/Makefile | 1 -
+ crypto/blake2s_generic.c | 75 -----------
+ crypto/tcrypt.c | 12 --
+ crypto/testmgr.c | 24 ----
+ crypto/testmgr.h | 217 ------------------------------
+ include/crypto/internal/blake2s.h | 108 ---------------
+ lib/crypto/blake2s-selftest.c | 41 ++++++
+ lib/crypto/blake2s.c | 37 ++++-
+ 15 files changed, 76 insertions(+), 624 deletions(-)
+ delete mode 100644 arch/arm/crypto/blake2s-shash.c
+ delete mode 100644 arch/x86/crypto/blake2s-shash.c
+ delete mode 100644 crypto/blake2s_generic.c
+
+diff --git a/arch/arm/crypto/Kconfig b/arch/arm/crypto/Kconfig
+index e4dba5461cb3..149a5bd6b88c 100644
+--- a/arch/arm/crypto/Kconfig
++++ b/arch/arm/crypto/Kconfig
+@@ -63,7 +63,7 @@ config CRYPTO_SHA512_ARM
+ using optimized ARM assembler and NEON, when available.
+
+ config CRYPTO_BLAKE2S_ARM
+- tristate "BLAKE2s digest algorithm (ARM)"
++ bool "BLAKE2s digest algorithm (ARM)"
+ select CRYPTO_ARCH_HAVE_LIB_BLAKE2S
+ help
+ BLAKE2s digest algorithm optimized with ARM scalar instructions. This
+diff --git a/arch/arm/crypto/Makefile b/arch/arm/crypto/Makefile
+index 0274f81cc8ea..971e74546fb1 100644
+--- a/arch/arm/crypto/Makefile
++++ b/arch/arm/crypto/Makefile
+@@ -9,8 +9,7 @@ obj-$(CONFIG_CRYPTO_SHA1_ARM) += sha1-arm.o
+ obj-$(CONFIG_CRYPTO_SHA1_ARM_NEON) += sha1-arm-neon.o
+ obj-$(CONFIG_CRYPTO_SHA256_ARM) += sha256-arm.o
+ obj-$(CONFIG_CRYPTO_SHA512_ARM) += sha512-arm.o
+-obj-$(CONFIG_CRYPTO_BLAKE2S_ARM) += blake2s-arm.o
+-obj-$(if $(CONFIG_CRYPTO_BLAKE2S_ARM),y) += libblake2s-arm.o
++obj-$(CONFIG_CRYPTO_BLAKE2S_ARM) += libblake2s-arm.o
+ obj-$(CONFIG_CRYPTO_BLAKE2B_NEON) += blake2b-neon.o
+ obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha-neon.o
+ obj-$(CONFIG_CRYPTO_POLY1305_ARM) += poly1305-arm.o
+@@ -32,7 +31,6 @@ sha256-arm-neon-$(CONFIG_KERNEL_MODE_NEON) := sha256_neon_glue.o
+ sha256-arm-y := sha256-core.o sha256_glue.o $(sha256-arm-neon-y)
+ sha512-arm-neon-$(CONFIG_KERNEL_MODE_NEON) := sha512-neon-glue.o
+ sha512-arm-y := sha512-core.o sha512-glue.o $(sha512-arm-neon-y)
+-blake2s-arm-y := blake2s-shash.o
+ libblake2s-arm-y:= blake2s-core.o blake2s-glue.o
+ blake2b-neon-y := blake2b-neon-core.o blake2b-neon-glue.o
+ sha1-arm-ce-y := sha1-ce-core.o sha1-ce-glue.o
+diff --git a/arch/arm/crypto/blake2s-shash.c b/arch/arm/crypto/blake2s-shash.c
+deleted file mode 100644
+index 763c73beea2d..000000000000
+--- a/arch/arm/crypto/blake2s-shash.c
++++ /dev/null
+@@ -1,75 +0,0 @@
+-// SPDX-License-Identifier: GPL-2.0-or-later
+-/*
+- * BLAKE2s digest algorithm, ARM scalar implementation
+- *
+- * Copyright 2020 Google LLC
+- */
+-
+-#include <crypto/internal/blake2s.h>
+-#include <crypto/internal/hash.h>
+-
+-#include <linux/module.h>
+-
+-static int crypto_blake2s_update_arm(struct shash_desc *desc,
+- const u8 *in, unsigned int inlen)
+-{
+- return crypto_blake2s_update(desc, in, inlen, false);
+-}
+-
+-static int crypto_blake2s_final_arm(struct shash_desc *desc, u8 *out)
+-{
+- return crypto_blake2s_final(desc, out, false);
+-}
+-
+-#define BLAKE2S_ALG(name, driver_name, digest_size) \
+- { \
+- .base.cra_name = name, \
+- .base.cra_driver_name = driver_name, \
+- .base.cra_priority = 200, \
+- .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY, \
+- .base.cra_blocksize = BLAKE2S_BLOCK_SIZE, \
+- .base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx), \
+- .base.cra_module = THIS_MODULE, \
+- .digestsize = digest_size, \
+- .setkey = crypto_blake2s_setkey, \
+- .init = crypto_blake2s_init, \
+- .update = crypto_blake2s_update_arm, \
+- .final = crypto_blake2s_final_arm, \
+- .descsize = sizeof(struct blake2s_state), \
+- }
+-
+-static struct shash_alg blake2s_arm_algs[] = {
+- BLAKE2S_ALG("blake2s-128", "blake2s-128-arm", BLAKE2S_128_HASH_SIZE),
+- BLAKE2S_ALG("blake2s-160", "blake2s-160-arm", BLAKE2S_160_HASH_SIZE),
+- BLAKE2S_ALG("blake2s-224", "blake2s-224-arm", BLAKE2S_224_HASH_SIZE),
+- BLAKE2S_ALG("blake2s-256", "blake2s-256-arm", BLAKE2S_256_HASH_SIZE),
+-};
+-
+-static int __init blake2s_arm_mod_init(void)
+-{
+- return IS_REACHABLE(CONFIG_CRYPTO_HASH) ?
+- crypto_register_shashes(blake2s_arm_algs,
+- ARRAY_SIZE(blake2s_arm_algs)) : 0;
+-}
+-
+-static void __exit blake2s_arm_mod_exit(void)
+-{
+- if (IS_REACHABLE(CONFIG_CRYPTO_HASH))
+- crypto_unregister_shashes(blake2s_arm_algs,
+- ARRAY_SIZE(blake2s_arm_algs));
+-}
+-
+-module_init(blake2s_arm_mod_init);
+-module_exit(blake2s_arm_mod_exit);
+-
+-MODULE_DESCRIPTION("BLAKE2s digest algorithm, ARM scalar implementation");
+-MODULE_LICENSE("GPL");
+-MODULE_AUTHOR("Eric Biggers <ebiggers@google.com>");
+-MODULE_ALIAS_CRYPTO("blake2s-128");
+-MODULE_ALIAS_CRYPTO("blake2s-128-arm");
+-MODULE_ALIAS_CRYPTO("blake2s-160");
+-MODULE_ALIAS_CRYPTO("blake2s-160-arm");
+-MODULE_ALIAS_CRYPTO("blake2s-224");
+-MODULE_ALIAS_CRYPTO("blake2s-224-arm");
+-MODULE_ALIAS_CRYPTO("blake2s-256");
+-MODULE_ALIAS_CRYPTO("blake2s-256-arm");
+diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
+index 2831685adf6f..8ed4597fdf6a 100644
+--- a/arch/x86/crypto/Makefile
++++ b/arch/x86/crypto/Makefile
+@@ -61,9 +61,7 @@ sha256-ssse3-$(CONFIG_AS_SHA256_NI) += sha256_ni_asm.o
+ obj-$(CONFIG_CRYPTO_SHA512_SSSE3) += sha512-ssse3.o
+ sha512-ssse3-y := sha512-ssse3-asm.o sha512-avx-asm.o sha512-avx2-asm.o sha512_ssse3_glue.o
+
+-obj-$(CONFIG_CRYPTO_BLAKE2S_X86) += blake2s-x86_64.o
+-blake2s-x86_64-y := blake2s-shash.o
+-obj-$(if $(CONFIG_CRYPTO_BLAKE2S_X86),y) += libblake2s-x86_64.o
++obj-$(CONFIG_CRYPTO_BLAKE2S_X86) += libblake2s-x86_64.o
+ libblake2s-x86_64-y := blake2s-core.o blake2s-glue.o
+
+ obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o
+diff --git a/arch/x86/crypto/blake2s-glue.c b/arch/x86/crypto/blake2s-glue.c
+index 69853c13e8fb..aaba21230528 100644
+--- a/arch/x86/crypto/blake2s-glue.c
++++ b/arch/x86/crypto/blake2s-glue.c
+@@ -4,7 +4,6 @@
+ */
+
+ #include <crypto/internal/blake2s.h>
+-#include <crypto/internal/simd.h>
+
+ #include <linux/types.h>
+ #include <linux/jump_label.h>
+@@ -33,7 +32,7 @@ void blake2s_compress(struct blake2s_state *state, const u8 *block,
+ /* SIMD disables preemption, so relax after processing each page. */
+ BUILD_BUG_ON(SZ_4K / BLAKE2S_BLOCK_SIZE < 8);
+
+- if (!static_branch_likely(&blake2s_use_ssse3) || !crypto_simd_usable()) {
++ if (!static_branch_likely(&blake2s_use_ssse3) || !may_use_simd()) {
+ blake2s_compress_generic(state, block, nblocks, inc);
+ return;
+ }
+diff --git a/arch/x86/crypto/blake2s-shash.c b/arch/x86/crypto/blake2s-shash.c
+deleted file mode 100644
+index 59ae28abe35c..000000000000
+--- a/arch/x86/crypto/blake2s-shash.c
++++ /dev/null
+@@ -1,77 +0,0 @@
+-// SPDX-License-Identifier: GPL-2.0 OR MIT
+-/*
+- * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+- */
+-
+-#include <crypto/internal/blake2s.h>
+-#include <crypto/internal/simd.h>
+-#include <crypto/internal/hash.h>
+-
+-#include <linux/types.h>
+-#include <linux/kernel.h>
+-#include <linux/module.h>
+-#include <linux/sizes.h>
+-
+-#include <asm/cpufeature.h>
+-#include <asm/processor.h>
+-
+-static int crypto_blake2s_update_x86(struct shash_desc *desc,
+- const u8 *in, unsigned int inlen)
+-{
+- return crypto_blake2s_update(desc, in, inlen, false);
+-}
+-
+-static int crypto_blake2s_final_x86(struct shash_desc *desc, u8 *out)
+-{
+- return crypto_blake2s_final(desc, out, false);
+-}
+-
+-#define BLAKE2S_ALG(name, driver_name, digest_size) \
+- { \
+- .base.cra_name = name, \
+- .base.cra_driver_name = driver_name, \
+- .base.cra_priority = 200, \
+- .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY, \
+- .base.cra_blocksize = BLAKE2S_BLOCK_SIZE, \
+- .base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx), \
+- .base.cra_module = THIS_MODULE, \
+- .digestsize = digest_size, \
+- .setkey = crypto_blake2s_setkey, \
+- .init = crypto_blake2s_init, \
+- .update = crypto_blake2s_update_x86, \
+- .final = crypto_blake2s_final_x86, \
+- .descsize = sizeof(struct blake2s_state), \
+- }
+-
+-static struct shash_alg blake2s_algs[] = {
+- BLAKE2S_ALG("blake2s-128", "blake2s-128-x86", BLAKE2S_128_HASH_SIZE),
+- BLAKE2S_ALG("blake2s-160", "blake2s-160-x86", BLAKE2S_160_HASH_SIZE),
+- BLAKE2S_ALG("blake2s-224", "blake2s-224-x86", BLAKE2S_224_HASH_SIZE),
+- BLAKE2S_ALG("blake2s-256", "blake2s-256-x86", BLAKE2S_256_HASH_SIZE),
+-};
+-
+-static int __init blake2s_mod_init(void)
+-{
+- if (IS_REACHABLE(CONFIG_CRYPTO_HASH) && boot_cpu_has(X86_FEATURE_SSSE3))
+- return crypto_register_shashes(blake2s_algs, ARRAY_SIZE(blake2s_algs));
+- return 0;
+-}
+-
+-static void __exit blake2s_mod_exit(void)
+-{
+- if (IS_REACHABLE(CONFIG_CRYPTO_HASH) && boot_cpu_has(X86_FEATURE_SSSE3))
+- crypto_unregister_shashes(blake2s_algs, ARRAY_SIZE(blake2s_algs));
+-}
+-
+-module_init(blake2s_mod_init);
+-module_exit(blake2s_mod_exit);
+-
+-MODULE_ALIAS_CRYPTO("blake2s-128");
+-MODULE_ALIAS_CRYPTO("blake2s-128-x86");
+-MODULE_ALIAS_CRYPTO("blake2s-160");
+-MODULE_ALIAS_CRYPTO("blake2s-160-x86");
+-MODULE_ALIAS_CRYPTO("blake2s-224");
+-MODULE_ALIAS_CRYPTO("blake2s-224-x86");
+-MODULE_ALIAS_CRYPTO("blake2s-256");
+-MODULE_ALIAS_CRYPTO("blake2s-256-x86");
+-MODULE_LICENSE("GPL v2");
+diff --git a/crypto/Kconfig b/crypto/Kconfig
+index b4e00a7a046b..38601a072b99 100644
+--- a/crypto/Kconfig
++++ b/crypto/Kconfig
+@@ -692,26 +692,8 @@ config CRYPTO_BLAKE2B
+
+ See https://blake2.net for further information.
+
+-config CRYPTO_BLAKE2S
+- tristate "BLAKE2s digest algorithm"
+- select CRYPTO_LIB_BLAKE2S_GENERIC
+- select CRYPTO_HASH
+- help
+- Implementation of cryptographic hash function BLAKE2s
+- optimized for 8-32bit platforms and can produce digests of any size
+- between 1 to 32. The keyed hash is also implemented.
+-
+- This module provides the following algorithms:
+-
+- - blake2s-128
+- - blake2s-160
+- - blake2s-224
+- - blake2s-256
+-
+- See https://blake2.net for further information.
+-
+ config CRYPTO_BLAKE2S_X86
+- tristate "BLAKE2s digest algorithm (x86 accelerated version)"
++ bool "BLAKE2s digest algorithm (x86 accelerated version)"
+ depends on X86 && 64BIT
+ select CRYPTO_LIB_BLAKE2S_GENERIC
+ select CRYPTO_ARCH_HAVE_LIB_BLAKE2S
+diff --git a/crypto/Makefile b/crypto/Makefile
+index a40e6d5fb2c8..dbfa53567c92 100644
+--- a/crypto/Makefile
++++ b/crypto/Makefile
+@@ -83,7 +83,6 @@ obj-$(CONFIG_CRYPTO_STREEBOG) += streebog_generic.o
+ obj-$(CONFIG_CRYPTO_WP512) += wp512.o
+ CFLAGS_wp512.o := $(call cc-option,-fno-schedule-insns) # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=79149
+ obj-$(CONFIG_CRYPTO_BLAKE2B) += blake2b_generic.o
+-obj-$(CONFIG_CRYPTO_BLAKE2S) += blake2s_generic.o
+ obj-$(CONFIG_CRYPTO_GF128MUL) += gf128mul.o
+ obj-$(CONFIG_CRYPTO_ECB) += ecb.o
+ obj-$(CONFIG_CRYPTO_CBC) += cbc.o
+diff --git a/crypto/blake2s_generic.c b/crypto/blake2s_generic.c
+deleted file mode 100644
+index 5f96a21f8788..000000000000
+--- a/crypto/blake2s_generic.c
++++ /dev/null
+@@ -1,75 +0,0 @@
+-// SPDX-License-Identifier: GPL-2.0 OR MIT
+-/*
+- * shash interface to the generic implementation of BLAKE2s
+- *
+- * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+- */
+-
+-#include <crypto/internal/blake2s.h>
+-#include <crypto/internal/hash.h>
+-
+-#include <linux/types.h>
+-#include <linux/kernel.h>
+-#include <linux/module.h>
+-
+-static int crypto_blake2s_update_generic(struct shash_desc *desc,
+- const u8 *in, unsigned int inlen)
+-{
+- return crypto_blake2s_update(desc, in, inlen, true);
+-}
+-
+-static int crypto_blake2s_final_generic(struct shash_desc *desc, u8 *out)
+-{
+- return crypto_blake2s_final(desc, out, true);
+-}
+-
+-#define BLAKE2S_ALG(name, driver_name, digest_size) \
+- { \
+- .base.cra_name = name, \
+- .base.cra_driver_name = driver_name, \
+- .base.cra_priority = 100, \
+- .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY, \
+- .base.cra_blocksize = BLAKE2S_BLOCK_SIZE, \
+- .base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx), \
+- .base.cra_module = THIS_MODULE, \
+- .digestsize = digest_size, \
+- .setkey = crypto_blake2s_setkey, \
+- .init = crypto_blake2s_init, \
+- .update = crypto_blake2s_update_generic, \
+- .final = crypto_blake2s_final_generic, \
+- .descsize = sizeof(struct blake2s_state), \
+- }
+-
+-static struct shash_alg blake2s_algs[] = {
+- BLAKE2S_ALG("blake2s-128", "blake2s-128-generic",
+- BLAKE2S_128_HASH_SIZE),
+- BLAKE2S_ALG("blake2s-160", "blake2s-160-generic",
+- BLAKE2S_160_HASH_SIZE),
+- BLAKE2S_ALG("blake2s-224", "blake2s-224-generic",
+- BLAKE2S_224_HASH_SIZE),
+- BLAKE2S_ALG("blake2s-256", "blake2s-256-generic",
+- BLAKE2S_256_HASH_SIZE),
+-};
+-
+-static int __init blake2s_mod_init(void)
+-{
+- return crypto_register_shashes(blake2s_algs, ARRAY_SIZE(blake2s_algs));
+-}
+-
+-static void __exit blake2s_mod_exit(void)
+-{
+- crypto_unregister_shashes(blake2s_algs, ARRAY_SIZE(blake2s_algs));
+-}
+-
+-subsys_initcall(blake2s_mod_init);
+-module_exit(blake2s_mod_exit);
+-
+-MODULE_ALIAS_CRYPTO("blake2s-128");
+-MODULE_ALIAS_CRYPTO("blake2s-128-generic");
+-MODULE_ALIAS_CRYPTO("blake2s-160");
+-MODULE_ALIAS_CRYPTO("blake2s-160-generic");
+-MODULE_ALIAS_CRYPTO("blake2s-224");
+-MODULE_ALIAS_CRYPTO("blake2s-224-generic");
+-MODULE_ALIAS_CRYPTO("blake2s-256");
+-MODULE_ALIAS_CRYPTO("blake2s-256-generic");
+-MODULE_LICENSE("GPL v2");
+diff --git a/crypto/tcrypt.c b/crypto/tcrypt.c
+index 2bacf8384f59..66b7ca1ccb23 100644
+--- a/crypto/tcrypt.c
++++ b/crypto/tcrypt.c
+@@ -1669,10 +1669,6 @@ static int do_test(const char *alg, u32 type, u32 mask, int m, u32 num_mb)
+ ret += tcrypt_test("rmd160");
+ break;
+
+- case 41:
+- ret += tcrypt_test("blake2s-256");
+- break;
+-
+ case 42:
+ ret += tcrypt_test("blake2b-512");
+ break;
+@@ -2240,10 +2236,6 @@ static int do_test(const char *alg, u32 type, u32 mask, int m, u32 num_mb)
+ test_hash_speed("rmd160", sec, generic_hash_speed_template);
+ if (mode > 300 && mode < 400) break;
+ fallthrough;
+- case 316:
+- test_hash_speed("blake2s-256", sec, generic_hash_speed_template);
+- if (mode > 300 && mode < 400) break;
+- fallthrough;
+ case 317:
+ test_hash_speed("blake2b-512", sec, generic_hash_speed_template);
+ if (mode > 300 && mode < 400) break;
+@@ -2352,10 +2344,6 @@ static int do_test(const char *alg, u32 type, u32 mask, int m, u32 num_mb)
+ test_ahash_speed("rmd160", sec, generic_hash_speed_template);
+ if (mode > 400 && mode < 500) break;
+ fallthrough;
+- case 416:
+- test_ahash_speed("blake2s-256", sec, generic_hash_speed_template);
+- if (mode > 400 && mode < 500) break;
+- fallthrough;
+ case 417:
+ test_ahash_speed("blake2b-512", sec, generic_hash_speed_template);
+ if (mode > 400 && mode < 500) break;
+diff --git a/crypto/testmgr.c b/crypto/testmgr.c
+index 4948201065cc..56facdb63843 100644
+--- a/crypto/testmgr.c
++++ b/crypto/testmgr.c
+@@ -4324,30 +4324,6 @@ static const struct alg_test_desc alg_test_descs[] = {
+ .suite = {
+ .hash = __VECS(blake2b_512_tv_template)
+ }
+- }, {
+- .alg = "blake2s-128",
+- .test = alg_test_hash,
+- .suite = {
+- .hash = __VECS(blakes2s_128_tv_template)
+- }
+- }, {
+- .alg = "blake2s-160",
+- .test = alg_test_hash,
+- .suite = {
+- .hash = __VECS(blakes2s_160_tv_template)
+- }
+- }, {
+- .alg = "blake2s-224",
+- .test = alg_test_hash,
+- .suite = {
+- .hash = __VECS(blakes2s_224_tv_template)
+- }
+- }, {
+- .alg = "blake2s-256",
+- .test = alg_test_hash,
+- .suite = {
+- .hash = __VECS(blakes2s_256_tv_template)
+- }
+ }, {
+ .alg = "cbc(aes)",
+ .test = alg_test_skcipher,
+diff --git a/crypto/testmgr.h b/crypto/testmgr.h
+index 4d7449fc6a65..c29658337d96 100644
+--- a/crypto/testmgr.h
++++ b/crypto/testmgr.h
+@@ -34034,221 +34034,4 @@ static const struct hash_testvec blake2b_512_tv_template[] = {{
+ 0xae, 0x15, 0x81, 0x15, 0xd0, 0x88, 0xa0, 0x3c, },
+ }};
+
+-static const struct hash_testvec blakes2s_128_tv_template[] = {{
+- .digest = (u8[]){ 0x64, 0x55, 0x0d, 0x6f, 0xfe, 0x2c, 0x0a, 0x01,
+- 0xa1, 0x4a, 0xba, 0x1e, 0xad, 0xe0, 0x20, 0x0c, },
+-}, {
+- .plaintext = blake2_ordered_sequence,
+- .psize = 64,
+- .digest = (u8[]){ 0xdc, 0x66, 0xca, 0x8f, 0x03, 0x86, 0x58, 0x01,
+- 0xb0, 0xff, 0xe0, 0x6e, 0xd8, 0xa1, 0xa9, 0x0e, },
+-}, {
+- .ksize = 16,
+- .key = blake2_ordered_sequence,
+- .plaintext = blake2_ordered_sequence,
+- .psize = 1,
+- .digest = (u8[]){ 0x88, 0x1e, 0x42, 0xe7, 0xbb, 0x35, 0x80, 0x82,
+- 0x63, 0x7c, 0x0a, 0x0f, 0xd7, 0xec, 0x6c, 0x2f, },
+-}, {
+- .ksize = 32,
+- .key = blake2_ordered_sequence,
+- .plaintext = blake2_ordered_sequence,
+- .psize = 7,
+- .digest = (u8[]){ 0xcf, 0x9e, 0x07, 0x2a, 0xd5, 0x22, 0xf2, 0xcd,
+- 0xa2, 0xd8, 0x25, 0x21, 0x80, 0x86, 0x73, 0x1c, },
+-}, {
+- .ksize = 1,
+- .key = "B",
+- .plaintext = blake2_ordered_sequence,
+- .psize = 15,
+- .digest = (u8[]){ 0xf6, 0x33, 0x5a, 0x2c, 0x22, 0xa0, 0x64, 0xb2,
+- 0xb6, 0x3f, 0xeb, 0xbc, 0xd1, 0xc3, 0xe5, 0xb2, },
+-}, {
+- .ksize = 16,
+- .key = blake2_ordered_sequence,
+- .plaintext = blake2_ordered_sequence,
+- .psize = 247,
+- .digest = (u8[]){ 0x72, 0x66, 0x49, 0x60, 0xf9, 0x4a, 0xea, 0xbe,
+- 0x1f, 0xf4, 0x60, 0xce, 0xb7, 0x81, 0xcb, 0x09, },
+-}, {
+- .ksize = 32,
+- .key = blake2_ordered_sequence,
+- .plaintext = blake2_ordered_sequence,
+- .psize = 256,
+- .digest = (u8[]){ 0xd5, 0xa4, 0x0e, 0xc3, 0x16, 0xc7, 0x51, 0xa6,
+- 0x3c, 0xd0, 0xd9, 0x11, 0x57, 0xfa, 0x1e, 0xbb, },
+-}};
+-
+-static const struct hash_testvec blakes2s_160_tv_template[] = {{
+- .plaintext = blake2_ordered_sequence,
+- .psize = 7,
+- .digest = (u8[]){ 0xb4, 0xf2, 0x03, 0x49, 0x37, 0xed, 0xb1, 0x3e,
+- 0x5b, 0x2a, 0xca, 0x64, 0x82, 0x74, 0xf6, 0x62,
+- 0xe3, 0xf2, 0x84, 0xff, },
+-}, {
+- .plaintext = blake2_ordered_sequence,
+- .psize = 256,
+- .digest = (u8[]){ 0xaa, 0x56, 0x9b, 0xdc, 0x98, 0x17, 0x75, 0xf2,
+- 0xb3, 0x68, 0x83, 0xb7, 0x9b, 0x8d, 0x48, 0xb1,
+- 0x9b, 0x2d, 0x35, 0x05, },
+-}, {
+- .ksize = 1,
+- .key = "B",
+- .digest = (u8[]){ 0x50, 0x16, 0xe7, 0x0c, 0x01, 0xd0, 0xd3, 0xc3,
+- 0xf4, 0x3e, 0xb1, 0x6e, 0x97, 0xa9, 0x4e, 0xd1,
+- 0x79, 0x65, 0x32, 0x93, },
+-}, {
+- .ksize = 32,
+- .key = blake2_ordered_sequence,
+- .plaintext = blake2_ordered_sequence,
+- .psize = 1,
+- .digest = (u8[]){ 0x1c, 0x2b, 0xcd, 0x9a, 0x68, 0xca, 0x8c, 0x71,
+- 0x90, 0x29, 0x6c, 0x54, 0xfa, 0x56, 0x4a, 0xef,
+- 0xa2, 0x3a, 0x56, 0x9c, },
+-}, {
+- .ksize = 16,
+- .key = blake2_ordered_sequence,
+- .plaintext = blake2_ordered_sequence,
+- .psize = 15,
+- .digest = (u8[]){ 0x36, 0xc3, 0x5f, 0x9a, 0xdc, 0x7e, 0xbf, 0x19,
+- 0x68, 0xaa, 0xca, 0xd8, 0x81, 0xbf, 0x09, 0x34,
+- 0x83, 0x39, 0x0f, 0x30, },
+-}, {
+- .ksize = 1,
+- .key = "B",
+- .plaintext = blake2_ordered_sequence,
+- .psize = 64,
+- .digest = (u8[]){ 0x86, 0x80, 0x78, 0xa4, 0x14, 0xec, 0x03, 0xe5,
+- 0xb6, 0x9a, 0x52, 0x0e, 0x42, 0xee, 0x39, 0x9d,
+- 0xac, 0xa6, 0x81, 0x63, },
+-}, {
+- .ksize = 32,
+- .key = blake2_ordered_sequence,
+- .plaintext = blake2_ordered_sequence,
+- .psize = 247,
+- .digest = (u8[]){ 0x2d, 0xd8, 0xd2, 0x53, 0x66, 0xfa, 0xa9, 0x01,
+- 0x1c, 0x9c, 0xaf, 0xa3, 0xe2, 0x9d, 0x9b, 0x10,
+- 0x0a, 0xf6, 0x73, 0xe8, },
+-}};
+-
+-static const struct hash_testvec blakes2s_224_tv_template[] = {{
+- .plaintext = blake2_ordered_sequence,
+- .psize = 1,
+- .digest = (u8[]){ 0x61, 0xb9, 0x4e, 0xc9, 0x46, 0x22, 0xa3, 0x91,
+- 0xd2, 0xae, 0x42, 0xe6, 0x45, 0x6c, 0x90, 0x12,
+- 0xd5, 0x80, 0x07, 0x97, 0xb8, 0x86, 0x5a, 0xfc,
+- 0x48, 0x21, 0x97, 0xbb, },
+-}, {
+- .plaintext = blake2_ordered_sequence,
+- .psize = 247,
+- .digest = (u8[]){ 0x9e, 0xda, 0xc7, 0x20, 0x2c, 0xd8, 0x48, 0x2e,
+- 0x31, 0x94, 0xab, 0x46, 0x6d, 0x94, 0xd8, 0xb4,
+- 0x69, 0xcd, 0xae, 0x19, 0x6d, 0x9e, 0x41, 0xcc,
+- 0x2b, 0xa4, 0xd5, 0xf6, },
+-}, {
+- .ksize = 16,
+- .key = blake2_ordered_sequence,
+- .digest = (u8[]){ 0x32, 0xc0, 0xac, 0xf4, 0x3b, 0xd3, 0x07, 0x9f,
+- 0xbe, 0xfb, 0xfa, 0x4d, 0x6b, 0x4e, 0x56, 0xb3,
+- 0xaa, 0xd3, 0x27, 0xf6, 0x14, 0xbf, 0xb9, 0x32,
+- 0xa7, 0x19, 0xfc, 0xb8, },
+-}, {
+- .ksize = 1,
+- .key = "B",
+- .plaintext = blake2_ordered_sequence,
+- .psize = 7,
+- .digest = (u8[]){ 0x73, 0xad, 0x5e, 0x6d, 0xb9, 0x02, 0x8e, 0x76,
+- 0xf2, 0x66, 0x42, 0x4b, 0x4c, 0xfa, 0x1f, 0xe6,
+- 0x2e, 0x56, 0x40, 0xe5, 0xa2, 0xb0, 0x3c, 0xe8,
+- 0x7b, 0x45, 0xfe, 0x05, },
+-}, {
+- .ksize = 32,
+- .key = blake2_ordered_sequence,
+- .plaintext = blake2_ordered_sequence,
+- .psize = 15,
+- .digest = (u8[]){ 0x16, 0x60, 0xfb, 0x92, 0x54, 0xb3, 0x6e, 0x36,
+- 0x81, 0xf4, 0x16, 0x41, 0xc3, 0x3d, 0xd3, 0x43,
+- 0x84, 0xed, 0x10, 0x6f, 0x65, 0x80, 0x7a, 0x3e,
+- 0x25, 0xab, 0xc5, 0x02, },
+-}, {
+- .ksize = 16,
+- .key = blake2_ordered_sequence,
+- .plaintext = blake2_ordered_sequence,
+- .psize = 64,
+- .digest = (u8[]){ 0xca, 0xaa, 0x39, 0x67, 0x9c, 0xf7, 0x6b, 0xc7,
+- 0xb6, 0x82, 0xca, 0x0e, 0x65, 0x36, 0x5b, 0x7c,
+- 0x24, 0x00, 0xfa, 0x5f, 0xda, 0x06, 0x91, 0x93,
+- 0x6a, 0x31, 0x83, 0xb5, },
+-}, {
+- .ksize = 1,
+- .key = "B",
+- .plaintext = blake2_ordered_sequence,
+- .psize = 256,
+- .digest = (u8[]){ 0x90, 0x02, 0x26, 0xb5, 0x06, 0x9c, 0x36, 0x86,
+- 0x94, 0x91, 0x90, 0x1e, 0x7d, 0x2a, 0x71, 0xb2,
+- 0x48, 0xb5, 0xe8, 0x16, 0xfd, 0x64, 0x33, 0x45,
+- 0xb3, 0xd7, 0xec, 0xcc, },
+-}};
+-
+-static const struct hash_testvec blakes2s_256_tv_template[] = {{
+- .plaintext = blake2_ordered_sequence,
+- .psize = 15,
+- .digest = (u8[]){ 0xd9, 0x7c, 0x82, 0x8d, 0x81, 0x82, 0xa7, 0x21,
+- 0x80, 0xa0, 0x6a, 0x78, 0x26, 0x83, 0x30, 0x67,
+- 0x3f, 0x7c, 0x4e, 0x06, 0x35, 0x94, 0x7c, 0x04,
+- 0xc0, 0x23, 0x23, 0xfd, 0x45, 0xc0, 0xa5, 0x2d, },
+-}, {
+- .ksize = 32,
+- .key = blake2_ordered_sequence,
+- .digest = (u8[]){ 0x48, 0xa8, 0x99, 0x7d, 0xa4, 0x07, 0x87, 0x6b,
+- 0x3d, 0x79, 0xc0, 0xd9, 0x23, 0x25, 0xad, 0x3b,
+- 0x89, 0xcb, 0xb7, 0x54, 0xd8, 0x6a, 0xb7, 0x1a,
+- 0xee, 0x04, 0x7a, 0xd3, 0x45, 0xfd, 0x2c, 0x49, },
+-}, {
+- .ksize = 1,
+- .key = "B",
+- .plaintext = blake2_ordered_sequence,
+- .psize = 1,
+- .digest = (u8[]){ 0x22, 0x27, 0xae, 0xaa, 0x6e, 0x81, 0x56, 0x03,
+- 0xa7, 0xe3, 0xa1, 0x18, 0xa5, 0x9a, 0x2c, 0x18,
+- 0xf4, 0x63, 0xbc, 0x16, 0x70, 0xf1, 0xe7, 0x4b,
+- 0x00, 0x6d, 0x66, 0x16, 0xae, 0x9e, 0x74, 0x4e, },
+-}, {
+- .ksize = 16,
+- .key = blake2_ordered_sequence,
+- .plaintext = blake2_ordered_sequence,
+- .psize = 7,
+- .digest = (u8[]){ 0x58, 0x5d, 0xa8, 0x60, 0x1c, 0xa4, 0xd8, 0x03,
+- 0x86, 0x86, 0x84, 0x64, 0xd7, 0xa0, 0x8e, 0x15,
+- 0x2f, 0x05, 0xa2, 0x1b, 0xbc, 0xef, 0x7a, 0x34,
+- 0xb3, 0xc5, 0xbc, 0x4b, 0xf0, 0x32, 0xeb, 0x12, },
+-}, {
+- .ksize = 32,
+- .key = blake2_ordered_sequence,
+- .plaintext = blake2_ordered_sequence,
+- .psize = 64,
+- .digest = (u8[]){ 0x89, 0x75, 0xb0, 0x57, 0x7f, 0xd3, 0x55, 0x66,
+- 0xd7, 0x50, 0xb3, 0x62, 0xb0, 0x89, 0x7a, 0x26,
+- 0xc3, 0x99, 0x13, 0x6d, 0xf0, 0x7b, 0xab, 0xab,
+- 0xbd, 0xe6, 0x20, 0x3f, 0xf2, 0x95, 0x4e, 0xd4, },
+-}, {
+- .ksize = 1,
+- .key = "B",
+- .plaintext = blake2_ordered_sequence,
+- .psize = 247,
+- .digest = (u8[]){ 0x2e, 0x74, 0x1c, 0x1d, 0x03, 0xf4, 0x9d, 0x84,
+- 0x6f, 0xfc, 0x86, 0x32, 0x92, 0x49, 0x7e, 0x66,
+- 0xd7, 0xc3, 0x10, 0x88, 0xfe, 0x28, 0xb3, 0xe0,
+- 0xbf, 0x50, 0x75, 0xad, 0x8e, 0xa4, 0xe6, 0xb2, },
+-}, {
+- .ksize = 16,
+- .key = blake2_ordered_sequence,
+- .plaintext = blake2_ordered_sequence,
+- .psize = 256,
+- .digest = (u8[]){ 0xb9, 0xd2, 0x81, 0x0e, 0x3a, 0xb1, 0x62, 0x9b,
+- 0xad, 0x44, 0x05, 0xf4, 0x92, 0x2e, 0x99, 0xc1,
+- 0x4a, 0x47, 0xbb, 0x5b, 0x6f, 0xb2, 0x96, 0xed,
+- 0xd5, 0x06, 0xb5, 0x3a, 0x7c, 0x7a, 0x65, 0x1d, },
+-}};
+-
+ #endif /* _CRYPTO_TESTMGR_H */
+diff --git a/include/crypto/internal/blake2s.h b/include/crypto/internal/blake2s.h
+index 52363eee2b20..506d56530ca9 100644
+--- a/include/crypto/internal/blake2s.h
++++ b/include/crypto/internal/blake2s.h
+@@ -8,7 +8,6 @@
+ #define _CRYPTO_INTERNAL_BLAKE2S_H
+
+ #include <crypto/blake2s.h>
+-#include <crypto/internal/hash.h>
+ #include <linux/string.h>
+
+ void blake2s_compress_generic(struct blake2s_state *state, const u8 *block,
+@@ -19,111 +18,4 @@ void blake2s_compress(struct blake2s_state *state, const u8 *block,
+
+ bool blake2s_selftest(void);
+
+-static inline void blake2s_set_lastblock(struct blake2s_state *state)
+-{
+- state->f[0] = -1;
+-}
+-
+-/* Helper functions for BLAKE2s shared by the library and shash APIs */
+-
+-static __always_inline void
+-__blake2s_update(struct blake2s_state *state, const u8 *in, size_t inlen,
+- bool force_generic)
+-{
+- const size_t fill = BLAKE2S_BLOCK_SIZE - state->buflen;
+-
+- if (unlikely(!inlen))
+- return;
+- if (inlen > fill) {
+- memcpy(state->buf + state->buflen, in, fill);
+- if (force_generic)
+- blake2s_compress_generic(state, state->buf, 1,
+- BLAKE2S_BLOCK_SIZE);
+- else
+- blake2s_compress(state, state->buf, 1,
+- BLAKE2S_BLOCK_SIZE);
+- state->buflen = 0;
+- in += fill;
+- inlen -= fill;
+- }
+- if (inlen > BLAKE2S_BLOCK_SIZE) {
+- const size_t nblocks = DIV_ROUND_UP(inlen, BLAKE2S_BLOCK_SIZE);
+- /* Hash one less (full) block than strictly possible */
+- if (force_generic)
+- blake2s_compress_generic(state, in, nblocks - 1,
+- BLAKE2S_BLOCK_SIZE);
+- else
+- blake2s_compress(state, in, nblocks - 1,
+- BLAKE2S_BLOCK_SIZE);
+- in += BLAKE2S_BLOCK_SIZE * (nblocks - 1);
+- inlen -= BLAKE2S_BLOCK_SIZE * (nblocks - 1);
+- }
+- memcpy(state->buf + state->buflen, in, inlen);
+- state->buflen += inlen;
+-}
+-
+-static __always_inline void
+-__blake2s_final(struct blake2s_state *state, u8 *out, bool force_generic)
+-{
+- blake2s_set_lastblock(state);
+- memset(state->buf + state->buflen, 0,
+- BLAKE2S_BLOCK_SIZE - state->buflen); /* Padding */
+- if (force_generic)
+- blake2s_compress_generic(state, state->buf, 1, state->buflen);
+- else
+- blake2s_compress(state, state->buf, 1, state->buflen);
+- cpu_to_le32_array(state->h, ARRAY_SIZE(state->h));
+- memcpy(out, state->h, state->outlen);
+-}
+-
+-/* Helper functions for shash implementations of BLAKE2s */
+-
+-struct blake2s_tfm_ctx {
+- u8 key[BLAKE2S_KEY_SIZE];
+- unsigned int keylen;
+-};
+-
+-static inline int crypto_blake2s_setkey(struct crypto_shash *tfm,
+- const u8 *key, unsigned int keylen)
+-{
+- struct blake2s_tfm_ctx *tctx = crypto_shash_ctx(tfm);
+-
+- if (keylen == 0 || keylen > BLAKE2S_KEY_SIZE)
+- return -EINVAL;
+-
+- memcpy(tctx->key, key, keylen);
+- tctx->keylen = keylen;
+-
+- return 0;
+-}
+-
+-static inline int crypto_blake2s_init(struct shash_desc *desc)
+-{
+- const struct blake2s_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm);
+- struct blake2s_state *state = shash_desc_ctx(desc);
+- unsigned int outlen = crypto_shash_digestsize(desc->tfm);
+-
+- __blake2s_init(state, outlen, tctx->key, tctx->keylen);
+- return 0;
+-}
+-
+-static inline int crypto_blake2s_update(struct shash_desc *desc,
+- const u8 *in, unsigned int inlen,
+- bool force_generic)
+-{
+- struct blake2s_state *state = shash_desc_ctx(desc);
+-
+- __blake2s_update(state, in, inlen, force_generic);
+- return 0;
+-}
+-
+-static inline int crypto_blake2s_final(struct shash_desc *desc, u8 *out,
+- bool force_generic)
+-{
+- struct blake2s_state *state = shash_desc_ctx(desc);
+-
+- __blake2s_final(state, out, force_generic);
+- return 0;
+-}
+-
+ #endif /* _CRYPTO_INTERNAL_BLAKE2S_H */
+diff --git a/lib/crypto/blake2s-selftest.c b/lib/crypto/blake2s-selftest.c
+index 409e4b728770..66f505220f43 100644
+--- a/lib/crypto/blake2s-selftest.c
++++ b/lib/crypto/blake2s-selftest.c
+@@ -4,6 +4,8 @@
+ */
+
+ #include <crypto/internal/blake2s.h>
++#include <linux/kernel.h>
++#include <linux/random.h>
+ #include <linux/string.h>
+
+ /*
+@@ -587,5 +589,44 @@ bool __init blake2s_selftest(void)
+ }
+ }
+
++ for (i = 0; i < 32; ++i) {
++ enum { TEST_ALIGNMENT = 16 };
++ u8 unaligned_block[BLAKE2S_BLOCK_SIZE + TEST_ALIGNMENT - 1]
++ __aligned(TEST_ALIGNMENT);
++ u8 blocks[BLAKE2S_BLOCK_SIZE * 3];
++ struct blake2s_state state1, state2;
++
++ get_random_bytes(blocks, sizeof(blocks));
++ get_random_bytes(&state, sizeof(state));
++
++#if defined(CONFIG_CRYPTO_LIB_BLAKE2S_GENERIC) && \
++ defined(CONFIG_CRYPTO_ARCH_HAVE_LIB_BLAKE2S)
++ memcpy(&state1, &state, sizeof(state1));
++ memcpy(&state2, &state, sizeof(state2));
++ blake2s_compress(&state1, blocks, 3, BLAKE2S_BLOCK_SIZE);
++ blake2s_compress_generic(&state2, blocks, 3, BLAKE2S_BLOCK_SIZE);
++ if (memcmp(&state1, &state2, sizeof(state1))) {
++ pr_err("blake2s random compress self-test %d: FAIL\n",
++ i + 1);
++ success = false;
++ }
++#endif
++
++ memcpy(&state1, &state, sizeof(state1));
++ blake2s_compress(&state1, blocks, 1, BLAKE2S_BLOCK_SIZE);
++ for (l = 1; l < TEST_ALIGNMENT; ++l) {
++ memcpy(unaligned_block + l, blocks,
++ BLAKE2S_BLOCK_SIZE);
++ memcpy(&state2, &state, sizeof(state2));
++ blake2s_compress(&state2, unaligned_block + l, 1,
++ BLAKE2S_BLOCK_SIZE);
++ if (memcmp(&state1, &state2, sizeof(state1))) {
++ pr_err("blake2s random compress align %d self-test %d: FAIL\n",
++ l, i + 1);
++ success = false;
++ }
++ }
++ }
++
+ return success;
+ }
+diff --git a/lib/crypto/blake2s.c b/lib/crypto/blake2s.c
+index c71c09621c09..98e688c6d891 100644
+--- a/lib/crypto/blake2s.c
++++ b/lib/crypto/blake2s.c
+@@ -16,16 +16,44 @@
+ #include <linux/init.h>
+ #include <linux/bug.h>
+
++static inline void blake2s_set_lastblock(struct blake2s_state *state)
++{
++ state->f[0] = -1;
++}
++
+ void blake2s_update(struct blake2s_state *state, const u8 *in, size_t inlen)
+ {
+- __blake2s_update(state, in, inlen, false);
++ const size_t fill = BLAKE2S_BLOCK_SIZE - state->buflen;
++
++ if (unlikely(!inlen))
++ return;
++ if (inlen > fill) {
++ memcpy(state->buf + state->buflen, in, fill);
++ blake2s_compress(state, state->buf, 1, BLAKE2S_BLOCK_SIZE);
++ state->buflen = 0;
++ in += fill;
++ inlen -= fill;
++ }
++ if (inlen > BLAKE2S_BLOCK_SIZE) {
++ const size_t nblocks = DIV_ROUND_UP(inlen, BLAKE2S_BLOCK_SIZE);
++ blake2s_compress(state, in, nblocks - 1, BLAKE2S_BLOCK_SIZE);
++ in += BLAKE2S_BLOCK_SIZE * (nblocks - 1);
++ inlen -= BLAKE2S_BLOCK_SIZE * (nblocks - 1);
++ }
++ memcpy(state->buf + state->buflen, in, inlen);
++ state->buflen += inlen;
+ }
+ EXPORT_SYMBOL(blake2s_update);
+
+ void blake2s_final(struct blake2s_state *state, u8 *out)
+ {
+ WARN_ON(IS_ENABLED(DEBUG) && !out);
+- __blake2s_final(state, out, false);
++ blake2s_set_lastblock(state);
++ memset(state->buf + state->buflen, 0,
++ BLAKE2S_BLOCK_SIZE - state->buflen); /* Padding */
++ blake2s_compress(state, state->buf, 1, state->buflen);
++ cpu_to_le32_array(state->h, ARRAY_SIZE(state->h));
++ memcpy(out, state->h, state->outlen);
+ memzero_explicit(state, sizeof(*state));
+ }
+ EXPORT_SYMBOL(blake2s_final);
+@@ -38,12 +66,7 @@ static int __init blake2s_mod_init(void)
+ return 0;
+ }
+
+-static void __exit blake2s_mod_exit(void)
+-{
+-}
+-
+ module_init(blake2s_mod_init);
+-module_exit(blake2s_mod_exit);
+ MODULE_LICENSE("GPL v2");
+ MODULE_DESCRIPTION("BLAKE2s hash function");
+ MODULE_AUTHOR("Jason A. Donenfeld <Jason@zx2c4.com>");
+--
+2.35.1
+
--- /dev/null
+From 737fa72cb41c624004f0ab3e93689a3a9cfa1b17 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Sun, 24 Jul 2022 14:33:52 -0400
+Subject: dm raid: fix address sanitizer warning in raid_resume
+
+From: Mikulas Patocka <mpatocka@redhat.com>
+
+[ Upstream commit 7dad24db59d2d2803576f2e3645728866a056dab ]
+
+There is a KASAN warning in raid_resume when running the lvm test
+lvconvert-raid.sh. The reason for the warning is that mddev->raid_disks
+is greater than rs->raid_disks, so the loop touches one entry beyond
+the allocated length.
+
+Cc: stable@vger.kernel.org
+Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
+Signed-off-by: Mike Snitzer <snitzer@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/md/dm-raid.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
+index 92e6b731f9d6..a55d6f6f294b 100644
+--- a/drivers/md/dm-raid.c
++++ b/drivers/md/dm-raid.c
+@@ -3824,7 +3824,7 @@ static void attempt_restore_of_faulty_devices(struct raid_set *rs)
+
+ memset(cleared_failed_devices, 0, sizeof(cleared_failed_devices));
+
+- for (i = 0; i < mddev->raid_disks; i++) {
++ for (i = 0; i < rs->raid_disks; i++) {
+ r = &rs->dev[i].rdev;
+ /* HM FIXME: enhance journal device recovery processing */
+ if (test_bit(Journal, &r->flags))
+--
+2.35.1
+
--- /dev/null
+From 78121652de18db6e02f29395b05f1f73c1cf3fd2 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Sun, 24 Jul 2022 14:31:35 -0400
+Subject: dm raid: fix address sanitizer warning in raid_status
+
+From: Mikulas Patocka <mpatocka@redhat.com>
+
+[ Upstream commit 1fbeea217d8f297fe0e0956a1516d14ba97d0396 ]
+
+There is this warning when using a kernel with the address sanitizer
+and running this testsuite:
+https://gitlab.com/cki-project/kernel-tests/-/tree/main/storage/swraid/scsi_raid
+
+==================================================================
+BUG: KASAN: slab-out-of-bounds in raid_status+0x1747/0x2820 [dm_raid]
+Read of size 4 at addr ffff888079d2c7e8 by task lvcreate/13319
+CPU: 0 PID: 13319 Comm: lvcreate Not tainted 5.18.0-0.rc3.<snip> #1
+Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011
+Call Trace:
+ <TASK>
+ dump_stack_lvl+0x6a/0x9c
+ print_address_description.constprop.0+0x1f/0x1e0
+ print_report.cold+0x55/0x244
+ kasan_report+0xc9/0x100
+ raid_status+0x1747/0x2820 [dm_raid]
+ dm_ima_measure_on_table_load+0x4b8/0xca0 [dm_mod]
+ table_load+0x35c/0x630 [dm_mod]
+ ctl_ioctl+0x411/0x630 [dm_mod]
+ dm_ctl_ioctl+0xa/0x10 [dm_mod]
+ __x64_sys_ioctl+0x12a/0x1a0
+ do_syscall_64+0x5b/0x80
+
+The warning is caused by reading conf->max_nr_stripes in raid_status. The
+code in raid_status reads mddev->private, casts it to struct r5conf and
+reads the entry max_nr_stripes.
+
+However, if we have a different raid type than 4/5/6, mddev->private
+doesn't point to struct r5conf; it may point to struct r0conf, struct
+r1conf, struct r10conf or struct mpconf. If we cast a pointer to one
+of these structs to struct r5conf, we will be reading invalid memory
+and KASAN warns about it.
+
+Fix this bug by reading struct r5conf only if raid type is 4, 5 or 6.
+
+Cc: stable@vger.kernel.org
+Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
+Signed-off-by: Mike Snitzer <snitzer@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/md/dm-raid.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
+index e362a7471512..92e6b731f9d6 100644
+--- a/drivers/md/dm-raid.c
++++ b/drivers/md/dm-raid.c
+@@ -3514,7 +3514,7 @@ static void raid_status(struct dm_target *ti, status_type_t type,
+ {
+ struct raid_set *rs = ti->private;
+ struct mddev *mddev = &rs->md;
+- struct r5conf *conf = mddev->private;
++ struct r5conf *conf = rs_is_raid456(rs) ? mddev->private : NULL;
+ int i, max_nr_stripes = conf ? conf->max_nr_stripes : 0;
+ unsigned long recovery;
+ unsigned int raid_param_cnt = 1; /* at least 1 for chunksize */
+--
+2.35.1
+
--- /dev/null
+From 185d3911adcea01b2082c14635a2a8071134384b Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 14 Jul 2022 19:28:25 +0800
+Subject: dm thin: fix use-after-free crash in
+ dm_sm_register_threshold_callback
+
+From: Luo Meng <luomeng12@huawei.com>
+
+[ Upstream commit 3534e5a5ed2997ca1b00f44a0378a075bd05e8a3 ]
+
+Fault inject on pool metadata device reports:
+ BUG: KASAN: use-after-free in dm_pool_register_metadata_threshold+0x40/0x80
+ Read of size 8 at addr ffff8881b9d50068 by task dmsetup/950
+
+ CPU: 7 PID: 950 Comm: dmsetup Tainted: G W 5.19.0-rc6 #1
+ Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-1.fc33 04/01/2014
+ Call Trace:
+ <TASK>
+ dump_stack_lvl+0x34/0x44
+ print_address_description.constprop.0.cold+0xeb/0x3f4
+ kasan_report.cold+0xe6/0x147
+ dm_pool_register_metadata_threshold+0x40/0x80
+ pool_ctr+0xa0a/0x1150
+ dm_table_add_target+0x2c8/0x640
+ table_load+0x1fd/0x430
+ ctl_ioctl+0x2c4/0x5a0
+ dm_ctl_ioctl+0xa/0x10
+ __x64_sys_ioctl+0xb3/0xd0
+ do_syscall_64+0x35/0x80
+ entry_SYSCALL_64_after_hwframe+0x46/0xb0
+
+This can be easily reproduced using:
+ echo offline > /sys/block/sda/device/state
+ dd if=/dev/zero of=/dev/mapper/thin bs=4k count=10
+ dmsetup load pool --table "0 20971520 thin-pool /dev/sda /dev/sdb 128 0 0"
+
+If a metadata commit fails, the transaction will be aborted and the
+metadata space maps will be destroyed. If a DM table reload then
+happens for this failed thin-pool, a use-after-free will occur in
+dm_sm_register_threshold_callback (called from
+dm_pool_register_metadata_threshold).
+
+Fix this in dm_pool_register_metadata_threshold() by returning the
+-EINVAL error if the thin-pool is in fail mode. Also fail pool_ctr()
+with a new error message: "Error registering metadata threshold".
+
+Fixes: ac8c3f3df65e4 ("dm thin: generate event when metadata threshold passed")
+Cc: stable@vger.kernel.org
+Reported-by: Hulk Robot <hulkci@huawei.com>
+Signed-off-by: Luo Meng <luomeng12@huawei.com>
+Signed-off-by: Mike Snitzer <snitzer@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/md/dm-thin-metadata.c | 7 +++++--
+ drivers/md/dm-thin.c | 4 +++-
+ 2 files changed, 8 insertions(+), 3 deletions(-)
+
+diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c
+index 2db7030aba00..a27395c8621f 100644
+--- a/drivers/md/dm-thin-metadata.c
++++ b/drivers/md/dm-thin-metadata.c
+@@ -2045,10 +2045,13 @@ int dm_pool_register_metadata_threshold(struct dm_pool_metadata *pmd,
+ dm_sm_threshold_fn fn,
+ void *context)
+ {
+- int r;
++ int r = -EINVAL;
+
+ pmd_write_lock_in_core(pmd);
+- r = dm_sm_register_threshold_callback(pmd->metadata_sm, threshold, fn, context);
++ if (!pmd->fail_io) {
++ r = dm_sm_register_threshold_callback(pmd->metadata_sm,
++ threshold, fn, context);
++ }
+ pmd_write_unlock(pmd);
+
+ return r;
+diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
+index 4d25d0e27031..53ac6ae870ac 100644
+--- a/drivers/md/dm-thin.c
++++ b/drivers/md/dm-thin.c
+@@ -3382,8 +3382,10 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
+ calc_metadata_threshold(pt),
+ metadata_low_callback,
+ pool);
+- if (r)
++ if (r) {
++ ti->error = "Error registering metadata threshold";
+ goto out_flags_changed;
++ }
+
+ dm_pool_register_pre_commit_callback(pool->pmd,
+ metadata_pre_commit_callback, pool);
+--
+2.35.1
+
--- /dev/null
+From 36bc440738ef1c764c40ac40f0a1d699ee8d1d3f Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 13 Jul 2022 07:09:04 -0400
+Subject: dm writecache: set a default MAX_WRITEBACK_JOBS
+
+From: Mikulas Patocka <mpatocka@redhat.com>
+
+[ Upstream commit ca7dc242e358e46d963b32f9d9dd829785a9e957 ]
+
+dm-writecache has the capability to limit the number of writeback jobs
+in progress. However, this feature was off by default. As such there
+were some out-of-memory crashes observed when lowering the low
+watermark while the cache is full.
+
+This commit enables the writeback limit by default. It is set to 256 MiB or
+1/16 of total system memory, whichever is smaller.
+
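+For example, assuming the usual 4 KiB PAGE_SIZE, the new default works
+out to min(0x10000000 / 4096, totalram_pages() / 16) = min(65536,
+totalram_pages() / 16), i.e. a cap of 65536 page-sized units (256 MiB)
+on any machine with at least 4 GiB of RAM, and 1/16 of memory below
+that.
+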
+Cc: stable@vger.kernel.org
+Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
+Signed-off-by: Mike Snitzer <snitzer@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/md/dm-writecache.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c
+index e5acb393f70b..27557b852c94 100644
+--- a/drivers/md/dm-writecache.c
++++ b/drivers/md/dm-writecache.c
+@@ -22,7 +22,7 @@
+
+ #define HIGH_WATERMARK 50
+ #define LOW_WATERMARK 45
+-#define MAX_WRITEBACK_JOBS 0
++#define MAX_WRITEBACK_JOBS min(0x10000000 / PAGE_SIZE, totalram_pages() / 16)
+ #define ENDIO_LATENCY 16
+ #define WRITEBACK_LATENCY 64
+ #define AUTOCOMMIT_BLOCKS_SSD 65536
+--
+2.35.1
+
--- /dev/null
+From cb546eb4a3db40281eb2d2b70ec132d20dba301e Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 15 Jul 2022 09:49:24 -0400
+Subject: drivers/base: fix userspace break from using bin_attributes for
+ cpumap and cpulist
+
+From: Phil Auld <pauld@redhat.com>
+
+[ Upstream commit 7ee951acd31a88f941fd6535fbdee3a1567f1d63 ]
+
+Using bin_attributes with a 0 size causes fstat and friends to return that
+0 size. This breaks userspace code that retrieves the size before reading
+the file. Rather than reverting 75bd50fa841 ("drivers/base/node.c: use
+bin_attribute to break the size limitation of cpumap ABI") let's put in a
+size value at compile time.
+
+For cpulist the maximum size is on the order of
+ NR_CPUS * (ceil(log10(NR_CPUS)) + 1)/2
+
+which for 8192 is (8192 * 5)/2 = 20480. In order to get near that you'd need
+a system with every other CPU on one node. For example: (0,2,4,8, ... ).
+To simplify the math and support larger NR_CPUS in the future we are using
+(NR_CPUS * 7)/2. We also set it to a min of PAGE_SIZE to retain the older
+behavior for smaller NR_CPUS.
+
+For the cpumap file the size works out to be NR_CPUS/4 + NR_CPUS/32 - 1
+(or NR_CPUS * 9/32 - 1) including the ","s.
+
+Add a set of macros for these values to cpumask.h so they can be used in
+multiple places. Apply these to the handful of such files in
+drivers/base/topology.c as well as node.c.
+
+As an example, on an 80 cpu 4-node system (NR_CPUS == 8192):
+
+before:
+
+-r--r--r--. 1 root root 0 Jul 12 14:08 system/node/node0/cpulist
+-r--r--r--. 1 root root 0 Jul 11 17:25 system/node/node0/cpumap
+
+after:
+
+-r--r--r--. 1 root root 28672 Jul 13 11:32 system/node/node0/cpulist
+-r--r--r--. 1 root root 4096 Jul 13 11:31 system/node/node0/cpumap
+
+CONFIG_NR_CPUS = 16384
+-r--r--r--. 1 root root 57344 Jul 13 14:03 system/node/node0/cpulist
+-r--r--r--. 1 root root 4607 Jul 13 14:02 system/node/node0/cpumap
+
+The actual number of cpus doesn't matter for the reported size since they
+are based on NR_CPUS.
+
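+As a rough cross-check of the sizes listed above, the following
+stand-alone snippet just re-evaluates the two macros in userspace for
+NR_CPUS = 8192 and a 4 KiB PAGE_SIZE (constants redefined here purely
+for illustration; in the kernel they come from the config and from
+asm/page.h):
+
+	#include <stdio.h>
+
+	#define NR_CPUS   8192
+	#define PAGE_SIZE 4096
+
+	#define CPUMAP_FILE_MAX_BYTES  ((((NR_CPUS * 9)/32 - 1) > PAGE_SIZE) \
+	                                ? (NR_CPUS * 9)/32 - 1 : PAGE_SIZE)
+	#define CPULIST_FILE_MAX_BYTES (((NR_CPUS * 7)/2 > PAGE_SIZE) \
+	                                ? (NR_CPUS * 7)/2 : PAGE_SIZE)
+
+	int main(void)
+	{
+		/* Expect 4096 (PAGE_SIZE floor) and 28672, matching the listing above. */
+		printf("cpumap  max bytes: %d\n", CPUMAP_FILE_MAX_BYTES);
+		printf("cpulist max bytes: %d\n", CPULIST_FILE_MAX_BYTES);
+		return 0;
+	}
+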
+Fixes: 75bd50fa841d ("drivers/base/node.c: use bin_attribute to break the size limitation of cpumap ABI")
+Fixes: bb9ec13d156e ("topology: use bin_attribute to break the size limitation of cpumap ABI")
+Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+Cc: "Rafael J. Wysocki" <rafael@kernel.org>
+Cc: Yury Norov <yury.norov@gmail.com>
+Cc: stable@vger.kernel.org
+Acked-by: Yury Norov <yury.norov@gmail.com> (for include/linux/cpumask.h)
+Signed-off-by: Phil Auld <pauld@redhat.com>
+Link: https://lore.kernel.org/r/20220715134924.3466194-1-pauld@redhat.com
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/base/node.c | 4 ++--
+ drivers/base/topology.c | 32 ++++++++++++++++----------------
+ include/linux/cpumask.h | 18 ++++++++++++++++++
+ 3 files changed, 36 insertions(+), 18 deletions(-)
+
+diff --git a/drivers/base/node.c b/drivers/base/node.c
+index 0ac6376ef7a1..eb0f43784c2b 100644
+--- a/drivers/base/node.c
++++ b/drivers/base/node.c
+@@ -45,7 +45,7 @@ static inline ssize_t cpumap_read(struct file *file, struct kobject *kobj,
+ return n;
+ }
+
+-static BIN_ATTR_RO(cpumap, 0);
++static BIN_ATTR_RO(cpumap, CPUMAP_FILE_MAX_BYTES);
+
+ static inline ssize_t cpulist_read(struct file *file, struct kobject *kobj,
+ struct bin_attribute *attr, char *buf,
+@@ -66,7 +66,7 @@ static inline ssize_t cpulist_read(struct file *file, struct kobject *kobj,
+ return n;
+ }
+
+-static BIN_ATTR_RO(cpulist, 0);
++static BIN_ATTR_RO(cpulist, CPULIST_FILE_MAX_BYTES);
+
+ /**
+ * struct node_access_nodes - Access class device to hold user visible
+diff --git a/drivers/base/topology.c b/drivers/base/topology.c
+index ac6ad9ab67f9..89f98be5c5b9 100644
+--- a/drivers/base/topology.c
++++ b/drivers/base/topology.c
+@@ -62,47 +62,47 @@ define_id_show_func(ppin, "0x%llx");
+ static DEVICE_ATTR_ADMIN_RO(ppin);
+
+ define_siblings_read_func(thread_siblings, sibling_cpumask);
+-static BIN_ATTR_RO(thread_siblings, 0);
+-static BIN_ATTR_RO(thread_siblings_list, 0);
++static BIN_ATTR_RO(thread_siblings, CPUMAP_FILE_MAX_BYTES);
++static BIN_ATTR_RO(thread_siblings_list, CPULIST_FILE_MAX_BYTES);
+
+ define_siblings_read_func(core_cpus, sibling_cpumask);
+-static BIN_ATTR_RO(core_cpus, 0);
+-static BIN_ATTR_RO(core_cpus_list, 0);
++static BIN_ATTR_RO(core_cpus, CPUMAP_FILE_MAX_BYTES);
++static BIN_ATTR_RO(core_cpus_list, CPULIST_FILE_MAX_BYTES);
+
+ define_siblings_read_func(core_siblings, core_cpumask);
+-static BIN_ATTR_RO(core_siblings, 0);
+-static BIN_ATTR_RO(core_siblings_list, 0);
++static BIN_ATTR_RO(core_siblings, CPUMAP_FILE_MAX_BYTES);
++static BIN_ATTR_RO(core_siblings_list, CPULIST_FILE_MAX_BYTES);
+
+ #ifdef TOPOLOGY_CLUSTER_SYSFS
+ define_siblings_read_func(cluster_cpus, cluster_cpumask);
+-static BIN_ATTR_RO(cluster_cpus, 0);
+-static BIN_ATTR_RO(cluster_cpus_list, 0);
++static BIN_ATTR_RO(cluster_cpus, CPUMAP_FILE_MAX_BYTES);
++static BIN_ATTR_RO(cluster_cpus_list, CPULIST_FILE_MAX_BYTES);
+ #endif
+
+ #ifdef TOPOLOGY_DIE_SYSFS
+ define_siblings_read_func(die_cpus, die_cpumask);
+-static BIN_ATTR_RO(die_cpus, 0);
+-static BIN_ATTR_RO(die_cpus_list, 0);
++static BIN_ATTR_RO(die_cpus, CPUMAP_FILE_MAX_BYTES);
++static BIN_ATTR_RO(die_cpus_list, CPULIST_FILE_MAX_BYTES);
+ #endif
+
+ define_siblings_read_func(package_cpus, core_cpumask);
+-static BIN_ATTR_RO(package_cpus, 0);
+-static BIN_ATTR_RO(package_cpus_list, 0);
++static BIN_ATTR_RO(package_cpus, CPUMAP_FILE_MAX_BYTES);
++static BIN_ATTR_RO(package_cpus_list, CPULIST_FILE_MAX_BYTES);
+
+ #ifdef TOPOLOGY_BOOK_SYSFS
+ define_id_show_func(book_id, "%d");
+ static DEVICE_ATTR_RO(book_id);
+ define_siblings_read_func(book_siblings, book_cpumask);
+-static BIN_ATTR_RO(book_siblings, 0);
+-static BIN_ATTR_RO(book_siblings_list, 0);
++static BIN_ATTR_RO(book_siblings, CPUMAP_FILE_MAX_BYTES);
++static BIN_ATTR_RO(book_siblings_list, CPULIST_FILE_MAX_BYTES);
+ #endif
+
+ #ifdef TOPOLOGY_DRAWER_SYSFS
+ define_id_show_func(drawer_id, "%d");
+ static DEVICE_ATTR_RO(drawer_id);
+ define_siblings_read_func(drawer_siblings, drawer_cpumask);
+-static BIN_ATTR_RO(drawer_siblings, 0);
+-static BIN_ATTR_RO(drawer_siblings_list, 0);
++static BIN_ATTR_RO(drawer_siblings, CPUMAP_FILE_MAX_BYTES);
++static BIN_ATTR_RO(drawer_siblings_list, CPULIST_FILE_MAX_BYTES);
+ #endif
+
+ static struct bin_attribute *bin_attrs[] = {
+diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
+index fe29ac7cc469..4592d0845941 100644
+--- a/include/linux/cpumask.h
++++ b/include/linux/cpumask.h
+@@ -1071,4 +1071,22 @@ cpumap_print_list_to_buf(char *buf, const struct cpumask *mask,
+ [0] = 1UL \
+ } }
+
++/*
++ * Provide a valid theoretical max size for cpumap and cpulist sysfs files
++ * to avoid breaking userspace which may allocate a buffer based on the size
++ * reported by e.g. fstat.
++ *
++ * for cpumap NR_CPUS * 9/32 - 1 should be an exact length.
++ *
++ * For cpulist 7 is (ceil(log10(NR_CPUS)) + 1) allowing for NR_CPUS to be up
++ * to 2 orders of magnitude larger than 8192. And then we divide by 2 to
++ * cover a worst-case of every other cpu being on one of two nodes for a
++ * very large NR_CPUS.
++ *
++ * Use PAGE_SIZE as a minimum for smaller configurations.
++ */
++#define CPUMAP_FILE_MAX_BYTES ((((NR_CPUS * 9)/32 - 1) > PAGE_SIZE) \
++ ? (NR_CPUS * 9)/32 - 1 : PAGE_SIZE)
++#define CPULIST_FILE_MAX_BYTES (((NR_CPUS * 7)/2 > PAGE_SIZE) ? (NR_CPUS * 7)/2 : PAGE_SIZE)
++
+ #endif /* __LINUX_CPUMASK_H */
+--
+2.35.1
+
--- /dev/null
+From c73a45d0946910af4186cf5c6152e0792eaaabd2 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 14 Jun 2022 12:45:37 +0300
+Subject: drm/dp/mst: Read the extended DPCD capabilities during system resume
+
+From: Imre Deak <imre.deak@intel.com>
+
+[ Upstream commit 7a710a8bc909313951eb9252d8419924c771d7c2 ]
+
+The WD22TB4 Thunderbolt dock, at least, will revert its DP_MAX_LINK_RATE
+from HBR3 to HBR2 after system suspend/resume if the DP_DP13_DPCD_REV
+registers are not also read afterwards, as required.
+
+Fix this by reading DP_DP13_DPCD_REV registers as well, matching what is
+done during connector detection. While at it also fix up the same call
+in drm_dp_mst_dump_topology().
+
+Cc: Lyude Paul <lyude@redhat.com>
+Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/5292
+Signed-off-by: Imre Deak <imre.deak@intel.com>
+Reviewed-by: Jani Nikula <jani.nikula@intel.com>
+Cc: <stable@vger.kernel.org> # v5.14+
+Reviewed-by: Lyude Paul <lyude@redhat.com>
+Link: https://patchwork.freedesktop.org/patch/msgid/20220614094537.885472-1-imre.deak@intel.com
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/gpu/drm/dp/drm_dp_mst_topology.c | 7 ++-----
+ 1 file changed, 2 insertions(+), 5 deletions(-)
+
+diff --git a/drivers/gpu/drm/dp/drm_dp_mst_topology.c b/drivers/gpu/drm/dp/drm_dp_mst_topology.c
+index 7a7cc44686f9..96869875390f 100644
+--- a/drivers/gpu/drm/dp/drm_dp_mst_topology.c
++++ b/drivers/gpu/drm/dp/drm_dp_mst_topology.c
+@@ -3861,9 +3861,7 @@ int drm_dp_mst_topology_mgr_resume(struct drm_dp_mst_topology_mgr *mgr,
+ if (!mgr->mst_primary)
+ goto out_fail;
+
+- ret = drm_dp_dpcd_read(mgr->aux, DP_DPCD_REV, mgr->dpcd,
+- DP_RECEIVER_CAP_SIZE);
+- if (ret != DP_RECEIVER_CAP_SIZE) {
++ if (drm_dp_read_dpcd_caps(mgr->aux, mgr->dpcd) < 0) {
+ drm_dbg_kms(mgr->dev, "dpcd read failed - undocked during suspend?\n");
+ goto out_fail;
+ }
+@@ -4912,8 +4910,7 @@ void drm_dp_mst_dump_topology(struct seq_file *m,
+ u8 buf[DP_PAYLOAD_TABLE_SIZE];
+ int ret;
+
+- ret = drm_dp_dpcd_read(mgr->aux, DP_DPCD_REV, buf, DP_RECEIVER_CAP_SIZE);
+- if (ret) {
++ if (drm_dp_read_dpcd_caps(mgr->aux, buf) < 0) {
+ seq_printf(m, "dpcd read failed\n");
+ goto out;
+ }
+--
+2.35.1
+
--- /dev/null
+From 72b19d4c277cf659673dc5b4b02f03f6ea4a746e Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 20 May 2022 10:00:06 +0800
+Subject: drm/mediatek: Keep dsi as LP00 before dcs cmds transfer
+
+From: Jitao Shi <jitao.shi@mediatek.com>
+
+[ Upstream commit 39e8d062b03c3dc257d880d82bd55cdd9e185a3b ]
+
+To comply with the panel sequence, hold the mipi signal to LP00 before
+the dcs cmds transmission, and pull the mipi signal high from LP00 to
+LP11 until the start of the dcs cmds transmission.
+
+The normal panel timing is :
+(1) pp1800 DC pull up
+(2) avdd & avee AC pull high
+(3) lcm_reset pull high -> pull low -> pull high
+(4) Pull MIPI signal high (LP11) -> initial code -> send video data
+ (HS mode)
+
+The power-off sequence is reversed.
+If dsi is not in cmd mode, then dsi will pull the mipi signal high in
+the mtk_output_dsi_enable function. The delay in lane_ready func is
+the reaction time of dsi_rx after pulling up the mipi signal.
+
+Fixes: 2dd8075d2185 ("drm/mediatek: mtk_dsi: Use the drm_panel_bridge API")
+
+Link: https://patchwork.kernel.org/project/linux-mediatek/patch/1653012007-11854-4-git-send-email-xinlei.lee@mediatek.com/
+Cc: <stable@vger.kernel.org> # 5.10.x: 7f6335c6a258: drm/mediatek: Modify dsi funcs to atomic operations
+Cc: <stable@vger.kernel.org> # 5.10.x: cde7e2e35c28: drm/mediatek: Separate poweron/poweroff from enable/disable and define new funcs
+Cc: <stable@vger.kernel.org> # 5.10.x
+Signed-off-by: Jitao Shi <jitao.shi@mediatek.com>
+Signed-off-by: Xinlei Lee <xinlei.lee@mediatek.com>
+Reviewed-by: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
+Reviewed-by: Rex-BC Chen <rex-bc.chen@mediatek.com>
+Signed-off-by: Chun-Kuang Hu <chunkuang.hu@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/gpu/drm/mediatek/mtk_dsi.c | 28 +++++++++++++++++++++-------
+ 1 file changed, 21 insertions(+), 7 deletions(-)
+
+diff --git a/drivers/gpu/drm/mediatek/mtk_dsi.c b/drivers/gpu/drm/mediatek/mtk_dsi.c
+index f0f523bdafb8..e0a2d5ea40af 100644
+--- a/drivers/gpu/drm/mediatek/mtk_dsi.c
++++ b/drivers/gpu/drm/mediatek/mtk_dsi.c
+@@ -203,6 +203,7 @@ struct mtk_dsi {
+ struct mtk_phy_timing phy_timing;
+ int refcount;
+ bool enabled;
++ bool lanes_ready;
+ u32 irq_data;
+ wait_queue_head_t irq_wait_queue;
+ const struct mtk_dsi_driver_data *driver_data;
+@@ -649,18 +650,11 @@ static int mtk_dsi_poweron(struct mtk_dsi *dsi)
+ mtk_dsi_reset_engine(dsi);
+ mtk_dsi_phy_timconfig(dsi);
+
+- mtk_dsi_rxtx_control(dsi);
+- usleep_range(30, 100);
+- mtk_dsi_reset_dphy(dsi);
+ mtk_dsi_ps_control_vact(dsi);
+ mtk_dsi_set_vm_cmd(dsi);
+ mtk_dsi_config_vdo_timing(dsi);
+ mtk_dsi_set_interrupt_enable(dsi);
+
+- mtk_dsi_clk_ulp_mode_leave(dsi);
+- mtk_dsi_lane0_ulp_mode_leave(dsi);
+- mtk_dsi_clk_hs_mode(dsi, 0);
+-
+ return 0;
+ err_disable_engine_clk:
+ clk_disable_unprepare(dsi->engine_clk);
+@@ -691,6 +685,23 @@ static void mtk_dsi_poweroff(struct mtk_dsi *dsi)
+ clk_disable_unprepare(dsi->digital_clk);
+
+ phy_power_off(dsi->phy);
++
++ dsi->lanes_ready = false;
++}
++
++static void mtk_dsi_lane_ready(struct mtk_dsi *dsi)
++{
++ if (!dsi->lanes_ready) {
++ dsi->lanes_ready = true;
++ mtk_dsi_rxtx_control(dsi);
++ usleep_range(30, 100);
++ mtk_dsi_reset_dphy(dsi);
++ mtk_dsi_clk_ulp_mode_leave(dsi);
++ mtk_dsi_lane0_ulp_mode_leave(dsi);
++ mtk_dsi_clk_hs_mode(dsi, 0);
++ msleep(20);
++ /* The reaction time after pulling up the mipi signal for dsi_rx */
++ }
+ }
+
+ static void mtk_output_dsi_enable(struct mtk_dsi *dsi)
+@@ -698,6 +709,7 @@ static void mtk_output_dsi_enable(struct mtk_dsi *dsi)
+ if (dsi->enabled)
+ return;
+
++ mtk_dsi_lane_ready(dsi);
+ mtk_dsi_set_mode(dsi);
+ mtk_dsi_clk_hs_mode(dsi, 1);
+
+@@ -1007,6 +1019,8 @@ static ssize_t mtk_dsi_host_transfer(struct mipi_dsi_host *host,
+ if (MTK_DSI_HOST_IS_READ(msg->type))
+ irq_flag |= LPRX_RD_RDY_INT_FLAG;
+
++ mtk_dsi_lane_ready(dsi);
++
+ ret = mtk_dsi_host_send_cmd(dsi, msg, irq_flag);
+ if (ret)
+ goto restore_dsi_mode;
+--
+2.35.1
+
--- /dev/null
+From eb33b7125fa369b35e05a9779b1ee772cfe928cb Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 13 Jun 2022 16:47:28 +0200
+Subject: drm/vc4: drv: Adopt the dma configuration from the HVS or V3D
+ component
+
+From: Dave Stevenson <dave.stevenson@raspberrypi.com>
+
+[ Upstream commit da8e393e23efb60eba8959856c7df88f9859f6eb ]
+
+vc4_drv isn't necessarily under the /soc node in DT as it is a
+virtual device, but it is the one that does the allocations.
+The DMA addresses are consumed primarily by the HVS or V3D, and
+those require VideoCore cache alias address mapping, and so will be
+under /soc.
+
+During probe, find a suitable device node for the HVS or V3D,
+and adopt the DMA configuration of that node.
+
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Dave Stevenson <dave.stevenson@raspberrypi.com>
+Link: https://lore.kernel.org/r/20220613144800.326124-2-maxime@cerno.tech
+Signed-off-by: Maxime Ripard <maxime@cerno.tech>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/gpu/drm/vc4/vc4_drv.c | 19 +++++++++++++++++++
+ 1 file changed, 19 insertions(+)
+
+diff --git a/drivers/gpu/drm/vc4/vc4_drv.c b/drivers/gpu/drm/vc4/vc4_drv.c
+index 162bc18e7497..14a7d529144d 100644
+--- a/drivers/gpu/drm/vc4/vc4_drv.c
++++ b/drivers/gpu/drm/vc4/vc4_drv.c
+@@ -209,6 +209,15 @@ static void vc4_match_add_drivers(struct device *dev,
+ }
+ }
+
++const struct of_device_id vc4_dma_range_matches[] = {
++ { .compatible = "brcm,bcm2711-hvs" },
++ { .compatible = "brcm,bcm2835-hvs" },
++ { .compatible = "brcm,bcm2835-v3d" },
++ { .compatible = "brcm,cygnus-v3d" },
++ { .compatible = "brcm,vc4-v3d" },
++ {}
++};
++
+ static int vc4_drm_bind(struct device *dev)
+ {
+ struct platform_device *pdev = to_platform_device(dev);
+@@ -227,6 +236,16 @@ static int vc4_drm_bind(struct device *dev)
+ vc4_drm_driver.driver_features &= ~DRIVER_RENDER;
+ of_node_put(node);
+
++ node = of_find_matching_node_and_match(NULL, vc4_dma_range_matches,
++ NULL);
++ if (node) {
++ ret = of_dma_configure(dev, node, true);
++ of_node_put(node);
++
++ if (ret)
++ return ret;
++ }
++
+ vc4 = devm_drm_dev_alloc(dev, &vc4_drm_driver, struct vc4_dev, base);
+ if (IS_ERR(vc4))
+ return PTR_ERR(vc4);
+--
+2.35.1
+
--- /dev/null
+From cc80213fb3767c2c9e67a78719c06cb0b2f32c15 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 16 Jun 2022 10:13:55 +0800
+Subject: ext4: add EXT4_INODE_HAS_XATTR_SPACE macro in xattr.h
+
+From: Baokun Li <libaokun1@huawei.com>
+
+[ Upstream commit 179b14152dcb6a24c3415200603aebca70ff13af ]
+
+When adding an xattr to an inode, we must ensure that the inode_size is
+not less than EXT4_GOOD_OLD_INODE_SIZE + extra_isize + pad. Otherwise,
+the end position may be greater than the start position, resulting in UAF.
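+
+As a rough worked example (illustrative sizes; assuming the usual 4-byte
+ibody header and 4-byte EXT4_XATTR_PAD): with a 256-byte on-disk inode and
+i_extra_isize = 32, 128 + 32 + 4 + 4 = 168 <= 256, so there is room for an
+in-inode xattr; with i_extra_isize = 128, 128 + 128 + 4 + 4 = 264 > 256 and
+the new EXT4_INODE_HAS_XATTR_SPACE() check reports no space.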
+
+Signed-off-by: Baokun Li <libaokun1@huawei.com>
+Reviewed-by: Jan Kara <jack@suse.cz>
+Reviewed-by: Ritesh Harjani (IBM) <ritesh.list@gmail.com>
+Link: https://lore.kernel.org/r/20220616021358.2504451-2-libaokun1@huawei.com
+Signed-off-by: Theodore Ts'o <tytso@mit.edu>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/ext4/xattr.h | 13 +++++++++++++
+ 1 file changed, 13 insertions(+)
+
+diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
+index 77efb9a627ad..f885f362add4 100644
+--- a/fs/ext4/xattr.h
++++ b/fs/ext4/xattr.h
+@@ -95,6 +95,19 @@ struct ext4_xattr_entry {
+
+ #define EXT4_ZERO_XATTR_VALUE ((void *)-1)
+
++/*
++ * If we want to add an xattr to the inode, we should make sure that
++ * i_extra_isize is not 0 and that the inode size is not less than
++ * EXT4_GOOD_OLD_INODE_SIZE + extra_isize + pad.
++ * EXT4_GOOD_OLD_INODE_SIZE extra_isize header entry pad data
++ * |--------------------------|------------|------|---------|---|-------|
++ */
++#define EXT4_INODE_HAS_XATTR_SPACE(inode) \
++ ((EXT4_I(inode)->i_extra_isize != 0) && \
++ (EXT4_GOOD_OLD_INODE_SIZE + EXT4_I(inode)->i_extra_isize + \
++ sizeof(struct ext4_xattr_ibody_header) + EXT4_XATTR_PAD <= \
++ EXT4_INODE_SIZE((inode)->i_sb)))
++
+ struct ext4_xattr_info {
+ const char *name;
+ const void *value;
+--
+2.35.1
+
--- /dev/null
+From 6770bf434d6a397d0ac1762555133b5cf6e7a3e8 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 4 Jul 2022 16:27:20 +0200
+Subject: ext4: check if directory block is within i_size
+
+From: Lukas Czerner <lczerner@redhat.com>
+
+[ Upstream commit 65f8ea4cd57dbd46ea13b41dc8bac03176b04233 ]
+
+Currently ext4 directory handling code implicitly assumes that the
+directory blocks are always within the i_size. In fact ext4_append()
+will attempt to allocate the next directory block based solely on i_size and
+the i_size is then appropriately increased after a successful
+allocation.
+
+However, for this to work it requires i_size to be correct. If, for any
+reason, the directory inode i_size is corrupted in a way that the
+directory tree refers to a valid directory block past i_size, we could
+end up corrupting parts of the directory tree structure by overwriting
+already used directory blocks when modifying the directory.
+
+Fix it by catching the corruption early in __ext4_read_dirblock().
+
+Addresses Red-Hat-Bugzilla: #2070205
+CVE: CVE-2022-1184
+Signed-off-by: Lukas Czerner <lczerner@redhat.com>
+Cc: stable@vger.kernel.org
+Reviewed-by: Andreas Dilger <adilger@dilger.ca>
+Link: https://lore.kernel.org/r/20220704142721.157985-1-lczerner@redhat.com
+Signed-off-by: Theodore Ts'o <tytso@mit.edu>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/ext4/namei.c | 7 +++++++
+ 1 file changed, 7 insertions(+)
+
+diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
+index 4f0420b1ff3e..2bc3e4b27204 100644
+--- a/fs/ext4/namei.c
++++ b/fs/ext4/namei.c
+@@ -110,6 +110,13 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode,
+ struct ext4_dir_entry *dirent;
+ int is_dx_block = 0;
+
++ if (block >= inode->i_size) {
++ ext4_error_inode(inode, func, line, block,
++ "Attempting to read directory block (%u) that is past i_size (%llu)",
++ block, inode->i_size);
++ return ERR_PTR(-EFSCORRUPTED);
++ }
++
+ if (ext4_simulate_fail(inode->i_sb, EXT4_SIM_DIRBLOCK_EIO))
+ bh = ERR_PTR(-EIO);
+ else
+--
+2.35.1
+
--- /dev/null
+From e4206366d1ea91bfb593c5f1c297f97d95cc09b7 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 16 Jun 2022 10:13:57 +0800
+Subject: ext4: correct max_inline_xattr_value_size computing
+
+From: Baokun Li <libaokun1@huawei.com>
+
+[ Upstream commit c9fd167d57133c5b748d16913c4eabc55e531c73 ]
+
+If the ext4 inode does not have xattr space, return 0 from
+get_max_inline_xattr_value_size(). Otherwise, the function can return a
+negative value when the inode does not have EXT4_STATE_XATTR set.
+
+Cc: stable@kernel.org
+Signed-off-by: Baokun Li <libaokun1@huawei.com>
+Reviewed-by: Ritesh Harjani (IBM) <ritesh.list@gmail.com>
+Reviewed-by: Jan Kara <jack@suse.cz>
+Link: https://lore.kernel.org/r/20220616021358.2504451-4-libaokun1@huawei.com
+Signed-off-by: Theodore Ts'o <tytso@mit.edu>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/ext4/inline.c | 3 +++
+ 1 file changed, 3 insertions(+)
+
+diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
+index e9ef5cf30969..84fcd06a8e8a 100644
+--- a/fs/ext4/inline.c
++++ b/fs/ext4/inline.c
+@@ -35,6 +35,9 @@ static int get_max_inline_xattr_value_size(struct inode *inode,
+ struct ext4_inode *raw_inode;
+ int free, min_offs;
+
++ if (!EXT4_INODE_HAS_XATTR_SPACE(inode))
++ return 0;
++
+ min_offs = EXT4_SB(inode->i_sb)->s_inode_size -
+ EXT4_GOOD_OLD_INODE_SIZE -
+ EXT4_I(inode)->i_extra_isize -
+--
+2.35.1
+
--- /dev/null
+From bb87c08d8958bcedb747138758192ad8471a7f14 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 16 Jun 2022 10:13:58 +0800
+Subject: ext4: correct the misjudgment in ext4_iget_extra_inode
+
+From: Baokun Li <libaokun1@huawei.com>
+
+[ Upstream commit fd7e672ea98b95b9d4c9dae316639f03c16a749d ]
+
+Use the EXT4_INODE_HAS_XATTR_SPACE macro to more accurately
+determine whether the inode has xattr space.
+
+Cc: stable@kernel.org
+Signed-off-by: Baokun Li <libaokun1@huawei.com>
+Reviewed-by: Ritesh Harjani (IBM) <ritesh.list@gmail.com>
+Reviewed-by: Jan Kara <jack@suse.cz>
+Link: https://lore.kernel.org/r/20220616021358.2504451-5-libaokun1@huawei.com
+Signed-off-by: Theodore Ts'o <tytso@mit.edu>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/ext4/inode.c | 3 +--
+ 1 file changed, 1 insertion(+), 2 deletions(-)
+
+diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
+index 826e2deb10f8..e478cac3b8f2 100644
+--- a/fs/ext4/inode.c
++++ b/fs/ext4/inode.c
+@@ -4681,8 +4681,7 @@ static inline int ext4_iget_extra_inode(struct inode *inode,
+ __le32 *magic = (void *)raw_inode +
+ EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize;
+
+- if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize + sizeof(__le32) <=
+- EXT4_INODE_SIZE(inode->i_sb) &&
++ if (EXT4_INODE_HAS_XATTR_SPACE(inode) &&
+ *magic == cpu_to_le32(EXT4_XATTR_MAGIC)) {
+ ext4_set_inode_state(inode, EXT4_STATE_XATTR);
+ return ext4_find_inline_data_nolock(inode);
+--
+2.35.1
+
--- /dev/null
+From dc4a02f5902dad0bd2ac7063ad9cc82a29f579f2 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 15 Jun 2022 12:05:30 -0400
+Subject: ext4: fix extent status tree race in writeback error recovery path
+
+From: Eric Whitney <enwlinux@gmail.com>
+
+[ Upstream commit 7f0d8e1d607c1a4fa9a27362a108921d82230874 ]
+
+A race can occur in the unlikely event ext4 is unable to allocate a
+physical cluster for a delayed allocation in a bigalloc file system
+during writeback. Failure to allocate a cluster forces error recovery
+that includes a call to mpage_release_unused_pages(). That function
+removes any corresponding delayed allocated blocks from the extent
+status tree. If a new delayed write is in progress on the same cluster
+simultaneously, resulting in the addition of a new extent containing
+one or more blocks in that cluster to the extent status tree, delayed
+block accounting can be thrown off if that delayed write then encounters
+a similar cluster allocation failure during future writeback.
+
+Write lock the i_data_sem in mpage_release_unused_pages() to fix this
+problem. Ext4's block/cluster accounting code for bigalloc relies on
+i_data_sem for mutual exclusion, as is found in the delayed write path,
+and the locking in mpage_release_unused_pages() is missing.
+
+Cc: stable@kernel.org
+Reported-by: Ye Bin <yebin10@huawei.com>
+Signed-off-by: Eric Whitney <enwlinux@gmail.com>
+Link: https://lore.kernel.org/r/20220615160530.1928801-1-enwlinux@gmail.com
+Signed-off-by: Theodore Ts'o <tytso@mit.edu>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/ext4/inode.c | 7 +++++++
+ 1 file changed, 7 insertions(+)
+
+diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
+index beed9e32571c..826e2deb10f8 100644
+--- a/fs/ext4/inode.c
++++ b/fs/ext4/inode.c
+@@ -1559,7 +1559,14 @@ static void mpage_release_unused_pages(struct mpage_da_data *mpd,
+ ext4_lblk_t start, last;
+ start = index << (PAGE_SHIFT - inode->i_blkbits);
+ last = end << (PAGE_SHIFT - inode->i_blkbits);
++
++ /*
++ * avoid racing with extent status tree scans made by
++ * ext4_insert_delayed_block()
++ */
++ down_write(&EXT4_I(inode)->i_data_sem);
+ ext4_es_remove_extent(inode, start, last - start + 1);
++ up_write(&EXT4_I(inode)->i_data_sem);
+ }
+
+ pagevec_init(&pvec);
+--
+2.35.1
+
--- /dev/null
+From 7e96f9358fc2312891b448cce6dd758e684e2b80 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 12 Jul 2022 12:54:24 +0200
+Subject: ext4: fix race when reusing xattr blocks
+
+From: Jan Kara <jack@suse.cz>
+
+[ Upstream commit 65f8b80053a1b2fd602daa6814e62d6fa90e5e9b ]
+
+When ext4_xattr_block_set() decides to remove xattr block the following
+race can happen:
+
+CPU1 CPU2
+ext4_xattr_block_set() ext4_xattr_release_block()
+ new_bh = ext4_xattr_block_cache_find()
+
+ lock_buffer(bh);
+ ref = le32_to_cpu(BHDR(bh)->h_refcount);
+ if (ref == 1) {
+ ...
+ mb_cache_entry_delete();
+ unlock_buffer(bh);
+ ext4_free_blocks();
+ ...
+ ext4_forget(..., bh, ...);
+ jbd2_journal_revoke(..., bh);
+
+ ext4_journal_get_write_access(..., new_bh, ...)
+ do_get_write_access()
+ jbd2_journal_cancel_revoke(..., new_bh);
+
+Later the code in ext4_xattr_block_set() finds out the block got freed
+and cancels reusal of the block but the revoke stays canceled and so in
+case of block reuse and journal replay the filesystem can get corrupted.
+If the race works out slightly differently, we can also hit assertions
+in the jbd2 code.
+
+Fix the problem by making sure that once a matching mbcache entry is
+found, the code dropping the last xattr block reference (or trying to modify
+xattr block in place) waits until the mbcache entry reference is
+dropped. This way code trying to reuse xattr block is protected from
+someone trying to drop the last reference to xattr block.
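+
+A minimal sketch of the wait-for-unused pattern the fix relies on (the mbcache
+calls are the ones used in the diff below; cache/hash/blocknr are placeholders
+and error handling is omitted):
+
+        struct mb_cache_entry *oe;
+
+        /* either the entry is gone, or we got a reference to the old entry */
+        while ((oe = mb_cache_entry_delete_or_get(cache, hash, blocknr))) {
+                mb_cache_entry_wait_unused(oe); /* wait for reusers to drop their refs */
+                mb_cache_entry_put(cache, oe);  /* drop our reference and retry */
+        }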
+
+Reported-and-tested-by: Ritesh Harjani <ritesh.list@gmail.com>
+CC: stable@vger.kernel.org
+Fixes: 82939d7999df ("ext4: convert to mbcache2")
+Signed-off-by: Jan Kara <jack@suse.cz>
+Link: https://lore.kernel.org/r/20220712105436.32204-5-jack@suse.cz
+Signed-off-by: Theodore Ts'o <tytso@mit.edu>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/ext4/xattr.c | 67 +++++++++++++++++++++++++++++++++----------------
+ 1 file changed, 45 insertions(+), 22 deletions(-)
+
+diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
+index a25942a74929..533216e80fa2 100644
+--- a/fs/ext4/xattr.c
++++ b/fs/ext4/xattr.c
+@@ -439,9 +439,16 @@ static int ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino,
+ /* Remove entry from mbcache when EA inode is getting evicted */
+ void ext4_evict_ea_inode(struct inode *inode)
+ {
+- if (EA_INODE_CACHE(inode))
+- mb_cache_entry_delete(EA_INODE_CACHE(inode),
+- ext4_xattr_inode_get_hash(inode), inode->i_ino);
++ struct mb_cache_entry *oe;
++
++ if (!EA_INODE_CACHE(inode))
++ return;
++ /* Wait for entry to get unused so that we can remove it */
++ while ((oe = mb_cache_entry_delete_or_get(EA_INODE_CACHE(inode),
++ ext4_xattr_inode_get_hash(inode), inode->i_ino))) {
++ mb_cache_entry_wait_unused(oe);
++ mb_cache_entry_put(EA_INODE_CACHE(inode), oe);
++ }
+ }
+
+ static int
+@@ -1229,6 +1236,7 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
+ if (error)
+ goto out;
+
++retry_ref:
+ lock_buffer(bh);
+ hash = le32_to_cpu(BHDR(bh)->h_hash);
+ ref = le32_to_cpu(BHDR(bh)->h_refcount);
+@@ -1238,9 +1246,18 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
+ * This must happen under buffer lock for
+ * ext4_xattr_block_set() to reliably detect freed block
+ */
+- if (ea_block_cache)
+- mb_cache_entry_delete(ea_block_cache, hash,
+- bh->b_blocknr);
++ if (ea_block_cache) {
++ struct mb_cache_entry *oe;
++
++ oe = mb_cache_entry_delete_or_get(ea_block_cache, hash,
++ bh->b_blocknr);
++ if (oe) {
++ unlock_buffer(bh);
++ mb_cache_entry_wait_unused(oe);
++ mb_cache_entry_put(ea_block_cache, oe);
++ goto retry_ref;
++ }
++ }
+ get_bh(bh);
+ unlock_buffer(bh);
+
+@@ -1867,9 +1884,20 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
+ * ext4_xattr_block_set() to reliably detect modified
+ * block
+ */
+- if (ea_block_cache)
+- mb_cache_entry_delete(ea_block_cache, hash,
+- bs->bh->b_blocknr);
++ if (ea_block_cache) {
++ struct mb_cache_entry *oe;
++
++ oe = mb_cache_entry_delete_or_get(ea_block_cache,
++ hash, bs->bh->b_blocknr);
++ if (oe) {
++ /*
++ * Xattr block is getting reused. Leave
++ * it alone.
++ */
++ mb_cache_entry_put(ea_block_cache, oe);
++ goto clone_block;
++ }
++ }
+ ea_bdebug(bs->bh, "modifying in-place");
+ error = ext4_xattr_set_entry(i, s, handle, inode,
+ true /* is_block */);
+@@ -1885,6 +1913,7 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
+ goto cleanup;
+ goto inserted;
+ }
++clone_block:
+ unlock_buffer(bs->bh);
+ ea_bdebug(bs->bh, "cloning");
+ s->base = kmemdup(BHDR(bs->bh), bs->bh->b_size, GFP_NOFS);
+@@ -1990,18 +2019,13 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
+ lock_buffer(new_bh);
+ /*
+ * We have to be careful about races with
+- * freeing, rehashing or adding references to
+- * xattr block. Once we hold buffer lock xattr
+- * block's state is stable so we can check
+- * whether the block got freed / rehashed or
+- * not. Since we unhash mbcache entry under
+- * buffer lock when freeing / rehashing xattr
+- * block, checking whether entry is still
+- * hashed is reliable. Same rules hold for
+- * e_reusable handling.
++ * adding references to xattr block. Once we
++ * hold buffer lock xattr block's state is
++ * stable so we can check the additional
++ * reference fits.
+ */
+- if (hlist_bl_unhashed(&ce->e_hash_list) ||
+- !ce->e_reusable) {
++ ref = le32_to_cpu(BHDR(new_bh)->h_refcount) + 1;
++ if (ref > EXT4_XATTR_REFCOUNT_MAX) {
+ /*
+ * Undo everything and check mbcache
+ * again.
+@@ -2016,9 +2040,8 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
+ new_bh = NULL;
+ goto inserted;
+ }
+- ref = le32_to_cpu(BHDR(new_bh)->h_refcount) + 1;
+ BHDR(new_bh)->h_refcount = cpu_to_le32(ref);
+- if (ref >= EXT4_XATTR_REFCOUNT_MAX)
++ if (ref == EXT4_XATTR_REFCOUNT_MAX)
+ ce->e_reusable = 0;
+ ea_bdebug(new_bh, "reusing; refcount now=%d",
+ ref);
+--
+2.35.1
+
--- /dev/null
+From a366d09886e9e7ed9fa6ffb4207af76a69c861ce Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 16 Jun 2022 10:13:56 +0800
+Subject: ext4: fix use-after-free in ext4_xattr_set_entry
+
+From: Baokun Li <libaokun1@huawei.com>
+
+[ Upstream commit 67d7d8ad99beccd9fe92d585b87f1760dc9018e3 ]
+
+Hulk Robot reported an issue:
+==================================================================
+BUG: KASAN: use-after-free in ext4_xattr_set_entry+0x18ab/0x3500
+Write of size 4105 at addr ffff8881675ef5f4 by task syz-executor.0/7092
+
+CPU: 1 PID: 7092 Comm: syz-executor.0 Not tainted 4.19.90-dirty #17
+Call Trace:
+[...]
+ memcpy+0x34/0x50 mm/kasan/kasan.c:303
+ ext4_xattr_set_entry+0x18ab/0x3500 fs/ext4/xattr.c:1747
+ ext4_xattr_ibody_inline_set+0x86/0x2a0 fs/ext4/xattr.c:2205
+ ext4_xattr_set_handle+0x940/0x1300 fs/ext4/xattr.c:2386
+ ext4_xattr_set+0x1da/0x300 fs/ext4/xattr.c:2498
+ __vfs_setxattr+0x112/0x170 fs/xattr.c:149
+ __vfs_setxattr_noperm+0x11b/0x2a0 fs/xattr.c:180
+ __vfs_setxattr_locked+0x17b/0x250 fs/xattr.c:238
+ vfs_setxattr+0xed/0x270 fs/xattr.c:255
+ setxattr+0x235/0x330 fs/xattr.c:520
+ path_setxattr+0x176/0x190 fs/xattr.c:539
+ __do_sys_lsetxattr fs/xattr.c:561 [inline]
+ __se_sys_lsetxattr fs/xattr.c:557 [inline]
+ __x64_sys_lsetxattr+0xc2/0x160 fs/xattr.c:557
+ do_syscall_64+0xdf/0x530 arch/x86/entry/common.c:298
+ entry_SYSCALL_64_after_hwframe+0x44/0xa9
+RIP: 0033:0x459fe9
+RSP: 002b:00007fa5e54b4c08 EFLAGS: 00000246 ORIG_RAX: 00000000000000bd
+RAX: ffffffffffffffda RBX: 000000000051bf60 RCX: 0000000000459fe9
+RDX: 00000000200003c0 RSI: 0000000020000180 RDI: 0000000020000140
+RBP: 000000000051bf60 R08: 0000000000000001 R09: 0000000000000000
+R10: 0000000000001009 R11: 0000000000000246 R12: 0000000000000000
+R13: 00007ffc73c93fc0 R14: 000000000051bf60 R15: 00007fa5e54b4d80
+[...]
+==================================================================
+
+Above issue may happen as follows:
+-------------------------------------
+ext4_xattr_set
+ ext4_xattr_set_handle
+ ext4_xattr_ibody_find
+ >> s->end < s->base
+ >> no EXT4_STATE_XATTR
+ >> xattr_check_inode is not executed
+ ext4_xattr_ibody_set
+ ext4_xattr_set_entry
+ >> size_t min_offs = s->end - s->base
+ >> UAF in memcpy
+
+We can easily reproduce this problem with the following commands:
+ mkfs.ext4 -F /dev/sda
+ mount -o debug_want_extra_isize=128 /dev/sda /mnt
+ touch /mnt/file
+ setfattr -n user.cat -v `seq -s z 4096|tr -d '[:digit:]'` /mnt/file
+
+In ext4_xattr_ibody_find, we have the following assignment logic:
+ header = IHDR(inode, raw_inode)
+ = raw_inode + EXT4_GOOD_OLD_INODE_SIZE + i_extra_isize
+ is->s.base = IFIRST(header)
+ = header + sizeof(struct ext4_xattr_ibody_header)
+ is->s.end = raw_inode + s_inode_size
+
+In ext4_xattr_set_entry
+ min_offs = s->end - s->base
+ = s_inode_size - EXT4_GOOD_OLD_INODE_SIZE - i_extra_isize -
+ sizeof(struct ext4_xattr_ibody_header)
+ last = s->first
+ free = min_offs - ((void *)last - s->base) - sizeof(__u32)
+ = s_inode_size - EXT4_GOOD_OLD_INODE_SIZE - i_extra_isize -
+ sizeof(struct ext4_xattr_ibody_header) - sizeof(__u32)
+
+In the calculation formula, all values except s_inode_size and
+i_extra_isize are fixed values. When i_extra_isize is the maximum value
+s_inode_size - EXT4_GOOD_OLD_INODE_SIZE, min_offs is -4 and free is -8.
+The value overflows. As a result, the preceding issue is triggered when
+memcpy is executed.
+
+Therefore, when finding or setting an xattr, check whether there is
+space for storing the xattr in the inode to resolve this issue.
+
+Cc: stable@kernel.org
+Reported-by: Hulk Robot <hulkci@huawei.com>
+Signed-off-by: Baokun Li <libaokun1@huawei.com>
+Reviewed-by: Ritesh Harjani (IBM) <ritesh.list@gmail.com>
+Reviewed-by: Jan Kara <jack@suse.cz>
+Link: https://lore.kernel.org/r/20220616021358.2504451-3-libaokun1@huawei.com
+Signed-off-by: Theodore Ts'o <tytso@mit.edu>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/ext4/xattr.c | 6 ++++--
+ 1 file changed, 4 insertions(+), 2 deletions(-)
+
+diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
+index 042325349098..c3c3194f3ee1 100644
+--- a/fs/ext4/xattr.c
++++ b/fs/ext4/xattr.c
+@@ -2176,8 +2176,9 @@ int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i,
+ struct ext4_inode *raw_inode;
+ int error;
+
+- if (EXT4_I(inode)->i_extra_isize == 0)
++ if (!EXT4_INODE_HAS_XATTR_SPACE(inode))
+ return 0;
++
+ raw_inode = ext4_raw_inode(&is->iloc);
+ header = IHDR(inode, raw_inode);
+ is->s.base = is->s.first = IFIRST(header);
+@@ -2205,8 +2206,9 @@ int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode,
+ struct ext4_xattr_search *s = &is->s;
+ int error;
+
+- if (EXT4_I(inode)->i_extra_isize == 0)
++ if (!EXT4_INODE_HAS_XATTR_SPACE(inode))
+ return -ENOSPC;
++
+ error = ext4_xattr_set_entry(i, s, handle, inode, false /* is_block */);
+ if (error)
+ return error;
+--
+2.35.1
+
--- /dev/null
+From 122a6fffdeebf14f28ad593406efaa4e52613ea6 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 17 Jun 2022 09:39:35 +0800
+Subject: ext4: fix warning in ext4_iomap_begin as race between bmap and write
+
+From: Ye Bin <yebin10@huawei.com>
+
+[ Upstream commit 51ae846cff568c8c29921b1b28eb2dfbcd4ac12d ]
+
+We got issue as follows:
+------------[ cut here ]------------
+WARNING: CPU: 3 PID: 9310 at fs/ext4/inode.c:3441 ext4_iomap_begin+0x182/0x5d0
+RIP: 0010:ext4_iomap_begin+0x182/0x5d0
+RSP: 0018:ffff88812460fa08 EFLAGS: 00010293
+RAX: ffff88811f168000 RBX: 0000000000000000 RCX: ffffffff97793c12
+RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000003
+RBP: ffff88812c669160 R08: ffff88811f168000 R09: ffffed10258cd20f
+R10: ffff88812c669077 R11: ffffed10258cd20e R12: 0000000000000001
+R13: 00000000000000a4 R14: 000000000000000c R15: ffff88812c6691ee
+FS: 00007fd0d6ff3740(0000) GS:ffff8883af180000(0000) knlGS:0000000000000000
+CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+CR2: 00007fd0d6dda290 CR3: 0000000104a62000 CR4: 00000000000006e0
+DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
+DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
+Call Trace:
+ iomap_apply+0x119/0x570
+ iomap_bmap+0x124/0x150
+ ext4_bmap+0x14f/0x250
+ bmap+0x55/0x80
+ do_vfs_ioctl+0x952/0xbd0
+ __x64_sys_ioctl+0xc6/0x170
+ do_syscall_64+0x33/0x40
+ entry_SYSCALL_64_after_hwframe+0x44/0xa9
+
+Above issue may happen as follows:
+ bmap write
+bmap
+ ext4_bmap
+ iomap_bmap
+ ext4_iomap_begin
+ ext4_file_write_iter
+ ext4_buffered_write_iter
+ generic_perform_write
+ ext4_da_write_begin
+ ext4_da_write_inline_data_begin
+ ext4_prepare_inline_data
+ ext4_create_inline_data
+ ext4_set_inode_flag(inode,
+ EXT4_INODE_INLINE_DATA);
+ if (WARN_ON_ONCE(ext4_has_inline_data(inode))) ->trigger bug_on
+
+To solve the above issue, hold the inode lock in ext4_bmap().
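+
+A minimal sketch of the intended serialization (mirroring the diff below;
+illustrative only, error handling trimmed):
+
+        inode_lock_shared(inode);       /* excludes writers switching to inline data */
+        if (!ext4_has_inline_data(inode))
+                ret = iomap_bmap(mapping, block, &ext4_iomap_ops);
+        inode_unlock_shared(inode);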
+
+Signed-off-by: Ye Bin <yebin10@huawei.com>
+Link: https://lore.kernel.org/r/20220617013935.397596-1-yebin10@huawei.com
+Signed-off-by: Theodore Ts'o <tytso@mit.edu>
+Cc: stable@kernel.org
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/ext4/inode.c | 12 +++++++++---
+ 1 file changed, 9 insertions(+), 3 deletions(-)
+
+diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
+index e478cac3b8f2..9ef6f41a5250 100644
+--- a/fs/ext4/inode.c
++++ b/fs/ext4/inode.c
+@@ -3137,13 +3137,15 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
+ {
+ struct inode *inode = mapping->host;
+ journal_t *journal;
++ sector_t ret = 0;
+ int err;
+
++ inode_lock_shared(inode);
+ /*
+ * We can get here for an inline file via the FIBMAP ioctl
+ */
+ if (ext4_has_inline_data(inode))
+- return 0;
++ goto out;
+
+ if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) &&
+ test_opt(inode->i_sb, DELALLOC)) {
+@@ -3182,10 +3184,14 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
+ jbd2_journal_unlock_updates(journal);
+
+ if (err)
+- return 0;
++ goto out;
+ }
+
+- return iomap_bmap(mapping, block, &ext4_iomap_ops);
++ ret = iomap_bmap(mapping, block, &ext4_iomap_ops);
++
++out:
++ inode_unlock_shared(inode);
++ return ret;
+ }
+
+ static int ext4_readpage(struct file *file, struct page *page)
+--
+2.35.1
+
--- /dev/null
+From 103d5a38ce71f77d0a0ab8577c7e93d52421b67c Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 4 Jul 2022 16:27:21 +0200
+Subject: ext4: make sure ext4_append() always allocates new block
+
+From: Lukas Czerner <lczerner@redhat.com>
+
+[ Upstream commit b8a04fe77ef1360fbf73c80fddbdfeaa9407ed1b ]
+
+ext4_append() must always allocate a new block, otherwise we run the
+risk of overwriting an existing directory block, corrupting the directory
+tree in the process and resulting in all manner of problems later on.
+
+Add a sanity check to see if the logical block is already allocated and
+error out if it is.
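+
+A minimal sketch of the lookup-only probe the sanity check is built on
+(matching the calls in the diff below; illustrative, error handling trimmed):
+
+        struct ext4_map_blocks map = { .m_lblk = *block, .m_len = 1 };
+        int err;
+
+        /* NULL handle and no flags: a pure lookup that never allocates */
+        err = ext4_map_blocks(NULL, inode, &map, 0);
+        if (err < 0)
+                return ERR_PTR(err);
+        if (err > 0)    /* block already mapped: the directory is corrupted */
+                return ERR_PTR(-EFSCORRUPTED);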
+
+Cc: stable@kernel.org
+Signed-off-by: Lukas Czerner <lczerner@redhat.com>
+Reviewed-by: Andreas Dilger <adilger@dilger.ca>
+Link: https://lore.kernel.org/r/20220704142721.157985-2-lczerner@redhat.com
+Signed-off-by: Theodore Ts'o <tytso@mit.edu>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/ext4/namei.c | 16 ++++++++++++++++
+ 1 file changed, 16 insertions(+)
+
+diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
+index 2bc3e4b27204..13b6265848c2 100644
+--- a/fs/ext4/namei.c
++++ b/fs/ext4/namei.c
+@@ -54,6 +54,7 @@ static struct buffer_head *ext4_append(handle_t *handle,
+ struct inode *inode,
+ ext4_lblk_t *block)
+ {
++ struct ext4_map_blocks map;
+ struct buffer_head *bh;
+ int err;
+
+@@ -63,6 +64,21 @@ static struct buffer_head *ext4_append(handle_t *handle,
+ return ERR_PTR(-ENOSPC);
+
+ *block = inode->i_size >> inode->i_sb->s_blocksize_bits;
++ map.m_lblk = *block;
++ map.m_len = 1;
++
++ /*
++ * We're appending new directory block. Make sure the block is not
++ * allocated yet, otherwise we will end up corrupting the
++ * directory.
++ */
++ err = ext4_map_blocks(NULL, inode, &map, 0);
++ if (err < 0)
++ return ERR_PTR(err);
++ if (err) {
++ EXT4_ERROR_INODE(inode, "Logical block already allocated");
++ return ERR_PTR(-EFSCORRUPTED);
++ }
+
+ bh = ext4_bread(handle, inode, *block, EXT4_GET_BLOCKS_CREATE);
+ if (IS_ERR(bh))
+--
+2.35.1
+
--- /dev/null
+From 61a993fc39a5ec6129c04ea49d65c2173d1071a4 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 12 Jul 2022 12:54:22 +0200
+Subject: ext4: remove EA inode entry from mbcache on inode eviction
+
+From: Jan Kara <jack@suse.cz>
+
+[ Upstream commit 6bc0d63dad7f9f54d381925ee855b402f652fa39 ]
+
+Currently we remove EA inode from mbcache as soon as its xattr refcount
+drops to zero. However there can be pending attempts to reuse the inode
+and thus refcount handling code has to handle the situation when
+refcount increases from zero anyway. So save some work and just keep EA
+inode in mbcache until it is getting evicted. At that moment we are sure
+following iget() of EA inode will fail anyway (or wait for eviction to
+finish and load things from the disk again) and so removing mbcache
+entry at that moment is fine and simplifies the code a bit.
+
+CC: stable@vger.kernel.org
+Fixes: 82939d7999df ("ext4: convert to mbcache2")
+Signed-off-by: Jan Kara <jack@suse.cz>
+Link: https://lore.kernel.org/r/20220712105436.32204-3-jack@suse.cz
+Signed-off-by: Theodore Ts'o <tytso@mit.edu>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/ext4/inode.c | 2 ++
+ fs/ext4/xattr.c | 24 ++++++++----------------
+ fs/ext4/xattr.h | 1 +
+ 3 files changed, 11 insertions(+), 16 deletions(-)
+
+diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
+index 9ef6f41a5250..e94ec798dce1 100644
+--- a/fs/ext4/inode.c
++++ b/fs/ext4/inode.c
+@@ -178,6 +178,8 @@ void ext4_evict_inode(struct inode *inode)
+
+ trace_ext4_evict_inode(inode);
+
++ if (EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)
++ ext4_evict_ea_inode(inode);
+ if (inode->i_nlink) {
+ /*
+ * When journalling data dirty buffers are tracked only in the
+diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
+index c3c3194f3ee1..b57fd07fbdba 100644
+--- a/fs/ext4/xattr.c
++++ b/fs/ext4/xattr.c
+@@ -436,6 +436,14 @@ static int ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino,
+ return err;
+ }
+
++/* Remove entry from mbcache when EA inode is getting evicted */
++void ext4_evict_ea_inode(struct inode *inode)
++{
++ if (EA_INODE_CACHE(inode))
++ mb_cache_entry_delete(EA_INODE_CACHE(inode),
++ ext4_xattr_inode_get_hash(inode), inode->i_ino);
++}
++
+ static int
+ ext4_xattr_inode_verify_hashes(struct inode *ea_inode,
+ struct ext4_xattr_entry *entry, void *buffer,
+@@ -976,10 +984,8 @@ int __ext4_xattr_set_credits(struct super_block *sb, struct inode *inode,
+ static int ext4_xattr_inode_update_ref(handle_t *handle, struct inode *ea_inode,
+ int ref_change)
+ {
+- struct mb_cache *ea_inode_cache = EA_INODE_CACHE(ea_inode);
+ struct ext4_iloc iloc;
+ s64 ref_count;
+- u32 hash;
+ int ret;
+
+ inode_lock(ea_inode);
+@@ -1002,14 +1008,6 @@ static int ext4_xattr_inode_update_ref(handle_t *handle, struct inode *ea_inode,
+
+ set_nlink(ea_inode, 1);
+ ext4_orphan_del(handle, ea_inode);
+-
+- if (ea_inode_cache) {
+- hash = ext4_xattr_inode_get_hash(ea_inode);
+- mb_cache_entry_create(ea_inode_cache,
+- GFP_NOFS, hash,
+- ea_inode->i_ino,
+- true /* reusable */);
+- }
+ }
+ } else {
+ WARN_ONCE(ref_count < 0, "EA inode %lu ref_count=%lld",
+@@ -1022,12 +1020,6 @@ static int ext4_xattr_inode_update_ref(handle_t *handle, struct inode *ea_inode,
+
+ clear_nlink(ea_inode);
+ ext4_orphan_add(handle, ea_inode);
+-
+- if (ea_inode_cache) {
+- hash = ext4_xattr_inode_get_hash(ea_inode);
+- mb_cache_entry_delete(ea_inode_cache, hash,
+- ea_inode->i_ino);
+- }
+ }
+ }
+
+diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
+index f885f362add4..e5e36bd11f05 100644
+--- a/fs/ext4/xattr.h
++++ b/fs/ext4/xattr.h
+@@ -191,6 +191,7 @@ extern void ext4_xattr_inode_array_free(struct ext4_xattr_inode_array *array);
+
+ extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
+ struct ext4_inode *raw_inode, handle_t *handle);
++extern void ext4_evict_ea_inode(struct inode *inode);
+
+ extern const struct xattr_handler *ext4_xattr_handlers[];
+
+--
+2.35.1
+
--- /dev/null
+From 08077db477ddbef197b91281d9078043da9adb08 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 12 Jul 2022 12:54:23 +0200
+Subject: ext4: unindent codeblock in ext4_xattr_block_set()
+
+From: Jan Kara <jack@suse.cz>
+
+[ Upstream commit fd48e9acdf26d0cbd80051de07d4a735d05d29b2 ]
+
+Remove unnecessary else (and thus indentation level) from a code block
+in ext4_xattr_block_set(). It will also make following code changes
+easier. No functional changes.
+
+CC: stable@vger.kernel.org
+Fixes: 82939d7999df ("ext4: convert to mbcache2")
+Signed-off-by: Jan Kara <jack@suse.cz>
+Link: https://lore.kernel.org/r/20220712105436.32204-4-jack@suse.cz
+Signed-off-by: Theodore Ts'o <tytso@mit.edu>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/ext4/xattr.c | 77 ++++++++++++++++++++++++-------------------------
+ 1 file changed, 38 insertions(+), 39 deletions(-)
+
+diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
+index d92d50de5a01..a25942a74929 100644
+--- a/fs/ext4/xattr.c
++++ b/fs/ext4/xattr.c
+@@ -1850,6 +1850,8 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
+ #define header(x) ((struct ext4_xattr_header *)(x))
+
+ if (s->base) {
++ int offset = (char *)s->here - bs->bh->b_data;
++
+ BUFFER_TRACE(bs->bh, "get_write_access");
+ error = ext4_journal_get_write_access(handle, sb, bs->bh,
+ EXT4_JTR_NONE);
+@@ -1882,49 +1884,46 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
+ if (error)
+ goto cleanup;
+ goto inserted;
+- } else {
+- int offset = (char *)s->here - bs->bh->b_data;
++ }
++ unlock_buffer(bs->bh);
++ ea_bdebug(bs->bh, "cloning");
++ s->base = kmemdup(BHDR(bs->bh), bs->bh->b_size, GFP_NOFS);
++ error = -ENOMEM;
++ if (s->base == NULL)
++ goto cleanup;
++ s->first = ENTRY(header(s->base)+1);
++ header(s->base)->h_refcount = cpu_to_le32(1);
++ s->here = ENTRY(s->base + offset);
++ s->end = s->base + bs->bh->b_size;
+
+- unlock_buffer(bs->bh);
+- ea_bdebug(bs->bh, "cloning");
+- s->base = kmemdup(BHDR(bs->bh), bs->bh->b_size, GFP_NOFS);
+- error = -ENOMEM;
+- if (s->base == NULL)
++ /*
++ * If existing entry points to an xattr inode, we need
++ * to prevent ext4_xattr_set_entry() from decrementing
++ * ref count on it because the reference belongs to the
++ * original block. In this case, make the entry look
++ * like it has an empty value.
++ */
++ if (!s->not_found && s->here->e_value_inum) {
++ ea_ino = le32_to_cpu(s->here->e_value_inum);
++ error = ext4_xattr_inode_iget(inode, ea_ino,
++ le32_to_cpu(s->here->e_hash),
++ &tmp_inode);
++ if (error)
+ goto cleanup;
+- s->first = ENTRY(header(s->base)+1);
+- header(s->base)->h_refcount = cpu_to_le32(1);
+- s->here = ENTRY(s->base + offset);
+- s->end = s->base + bs->bh->b_size;
+
+- /*
+- * If existing entry points to an xattr inode, we need
+- * to prevent ext4_xattr_set_entry() from decrementing
+- * ref count on it because the reference belongs to the
+- * original block. In this case, make the entry look
+- * like it has an empty value.
+- */
+- if (!s->not_found && s->here->e_value_inum) {
+- ea_ino = le32_to_cpu(s->here->e_value_inum);
+- error = ext4_xattr_inode_iget(inode, ea_ino,
+- le32_to_cpu(s->here->e_hash),
+- &tmp_inode);
+- if (error)
+- goto cleanup;
+-
+- if (!ext4_test_inode_state(tmp_inode,
+- EXT4_STATE_LUSTRE_EA_INODE)) {
+- /*
+- * Defer quota free call for previous
+- * inode until success is guaranteed.
+- */
+- old_ea_inode_quota = le32_to_cpu(
+- s->here->e_value_size);
+- }
+- iput(tmp_inode);
+-
+- s->here->e_value_inum = 0;
+- s->here->e_value_size = 0;
++ if (!ext4_test_inode_state(tmp_inode,
++ EXT4_STATE_LUSTRE_EA_INODE)) {
++ /*
++ * Defer quota free call for previous
++ * inode until success is guaranteed.
++ */
++ old_ea_inode_quota = le32_to_cpu(
++ s->here->e_value_size);
+ }
++ iput(tmp_inode);
++
++ s->here->e_value_inum = 0;
++ s->here->e_value_size = 0;
+ }
+ } else {
+ /* Allocate a buffer where we construct the new block. */
+--
+2.35.1
+
--- /dev/null
+From 17944868e9c621e876595423dbbf4c4660b651fd Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 29 Jun 2022 00:00:25 -0400
+Subject: ext4: update s_overhead_clusters in the superblock during an on-line
+ resize
+
+From: Theodore Ts'o <tytso@mit.edu>
+
+[ Upstream commit de394a86658ffe4e89e5328fd4993abfe41b7435 ]
+
+When doing an online resize, the on-disk superblock wasn't updated.
+This means that when the file system is unmounted and remounted, and
+the on-disk overhead value is non-zero, the results of statfs(2) would
+be incorrect.
+
+This was partially fixed by Commits 10b01ee92df5 ("ext4: fix overhead
+calculation to account for the reserved gdt blocks"), 85d825dbf489
+("ext4: force overhead calculation if the s_overhead_cluster makes no
+sense"), and eb7054212eac ("ext4: update the cached overhead value in
+the superblock").
+
+However, since it was too expensive to forcibly recalculate the
+overhead for bigalloc file systems at every mount, this didn't fix the
+problem for bigalloc file systems. This commit should address the
+problem when resizing file systems with the bigalloc feature enabled.
+
+Signed-off-by: Theodore Ts'o <tytso@mit.edu>
+Cc: stable@kernel.org
+Reviewed-by: Andreas Dilger <adilger@dilger.ca>
+Link: https://lore.kernel.org/r/20220629040026.112371-1-tytso@mit.edu
+Signed-off-by: Theodore Ts'o <tytso@mit.edu>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/ext4/resize.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
+index 8b70a4701293..e5c2713aa11a 100644
+--- a/fs/ext4/resize.c
++++ b/fs/ext4/resize.c
+@@ -1484,6 +1484,7 @@ static void ext4_update_super(struct super_block *sb,
+ * Update the fs overhead information
+ */
+ ext4_calculate_overhead(sb);
++ es->s_overhead_clusters = cpu_to_le32(sbi->s_overhead);
+
+ if (test_opt(sb, DEBUG))
+ printk(KERN_DEBUG "EXT4-fs: added group %u:"
+--
+2.35.1
+
--- /dev/null
+From 7f40ec1ab39f18dae16aba7df11a741b3bc968a1 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 25 May 2022 11:01:20 +0800
+Subject: ext4: use kmemdup() to replace kmalloc + memcpy
+
+From: Shuqi Zhang <zhangshuqi3@huawei.com>
+
+[ Upstream commit 4efd9f0d120c55b08852ee5605dbb02a77089a5d ]
+
+Replace kmalloc + memcpy with kmemdup()
+
+Signed-off-by: Shuqi Zhang <zhangshuqi3@huawei.com>
+Reviewed-by: Ritesh Harjani <ritesh.list@gmail.com>
+Link: https://lore.kernel.org/r/20220525030120.803330-1-zhangshuqi3@huawei.com
+Signed-off-by: Theodore Ts'o <tytso@mit.edu>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/ext4/xattr.c | 3 +--
+ 1 file changed, 1 insertion(+), 2 deletions(-)
+
+diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
+index b57fd07fbdba..d92d50de5a01 100644
+--- a/fs/ext4/xattr.c
++++ b/fs/ext4/xattr.c
+@@ -1887,11 +1887,10 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
+
+ unlock_buffer(bs->bh);
+ ea_bdebug(bs->bh, "cloning");
+- s->base = kmalloc(bs->bh->b_size, GFP_NOFS);
++ s->base = kmemdup(BHDR(bs->bh), bs->bh->b_size, GFP_NOFS);
+ error = -ENOMEM;
+ if (s->base == NULL)
+ goto cleanup;
+- memcpy(s->base, BHDR(bs->bh), bs->bh->b_size);
+ s->first = ENTRY(header(s->base)+1);
+ header(s->base)->h_refcount = cpu_to_le32(1);
+ s->here = ENTRY(s->base + offset);
+--
+2.35.1
+
--- /dev/null
+From 20eafffdcdc724c0cbbd8582b9bdedc47de0f785 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 1 Jul 2022 17:03:10 +0100
+Subject: firmware: arm_scpi: Ensure scpi_info is not assigned if the probe
+ fails
+
+From: Sudeep Holla <sudeep.holla@arm.com>
+
+[ Upstream commit 689640efc0a2c4e07e6f88affe6d42cd40cc3f85 ]
+
+When the scpi probe fails, at any point, we need to ensure that scpi_info
+is not set and remains NULL until the probe succeeds. If this is not taken
+care of, it could result in a use-after-free, as the value is exported via
+get_scpi_ops() and could refer to memory allocated via devm_kzalloc() but
+freed when the probe fails.
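+
+For context, a minimal sketch of how consumers obtain the handle (assuming
+the usual NULL check on get_scpi_ops(); the -EPROBE_DEFER return is
+illustrative):
+
+        struct scpi_ops *ops = get_scpi_ops();
+
+        if (!ops)       /* SCPI not (successfully) probed yet */
+                return -EPROBE_DEFER;
+        /* a stale scpi_info here would hand the consumer freed devm memory */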
+
+Link: https://lore.kernel.org/r/20220701160310.148344-1-sudeep.holla@arm.com
+Cc: stable@vger.kernel.org # 4.19+
+Reported-by: huhai <huhai@kylinos.cn>
+Reviewed-by: Jackie Liu <liuyun01@kylinos.cn>
+Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/firmware/arm_scpi.c | 61 +++++++++++++++++++++----------------
+ 1 file changed, 35 insertions(+), 26 deletions(-)
+
+diff --git a/drivers/firmware/arm_scpi.c b/drivers/firmware/arm_scpi.c
+index ddf0b9ff9e15..435d0e2658a4 100644
+--- a/drivers/firmware/arm_scpi.c
++++ b/drivers/firmware/arm_scpi.c
+@@ -815,7 +815,7 @@ static int scpi_init_versions(struct scpi_drvinfo *info)
+ info->firmware_version = le32_to_cpu(caps.platform_version);
+ }
+ /* Ignore error if not implemented */
+- if (scpi_info->is_legacy && ret == -EOPNOTSUPP)
++ if (info->is_legacy && ret == -EOPNOTSUPP)
+ return 0;
+
+ return ret;
+@@ -913,13 +913,14 @@ static int scpi_probe(struct platform_device *pdev)
+ struct resource res;
+ struct device *dev = &pdev->dev;
+ struct device_node *np = dev->of_node;
++ struct scpi_drvinfo *scpi_drvinfo;
+
+- scpi_info = devm_kzalloc(dev, sizeof(*scpi_info), GFP_KERNEL);
+- if (!scpi_info)
++ scpi_drvinfo = devm_kzalloc(dev, sizeof(*scpi_drvinfo), GFP_KERNEL);
++ if (!scpi_drvinfo)
+ return -ENOMEM;
+
+ if (of_match_device(legacy_scpi_of_match, &pdev->dev))
+- scpi_info->is_legacy = true;
++ scpi_drvinfo->is_legacy = true;
+
+ count = of_count_phandle_with_args(np, "mboxes", "#mbox-cells");
+ if (count < 0) {
+@@ -927,19 +928,19 @@ static int scpi_probe(struct platform_device *pdev)
+ return -ENODEV;
+ }
+
+- scpi_info->channels = devm_kcalloc(dev, count, sizeof(struct scpi_chan),
+- GFP_KERNEL);
+- if (!scpi_info->channels)
++ scpi_drvinfo->channels =
++ devm_kcalloc(dev, count, sizeof(struct scpi_chan), GFP_KERNEL);
++ if (!scpi_drvinfo->channels)
+ return -ENOMEM;
+
+- ret = devm_add_action(dev, scpi_free_channels, scpi_info);
++ ret = devm_add_action(dev, scpi_free_channels, scpi_drvinfo);
+ if (ret)
+ return ret;
+
+- for (; scpi_info->num_chans < count; scpi_info->num_chans++) {
++ for (; scpi_drvinfo->num_chans < count; scpi_drvinfo->num_chans++) {
+ resource_size_t size;
+- int idx = scpi_info->num_chans;
+- struct scpi_chan *pchan = scpi_info->channels + idx;
++ int idx = scpi_drvinfo->num_chans;
++ struct scpi_chan *pchan = scpi_drvinfo->channels + idx;
+ struct mbox_client *cl = &pchan->cl;
+ struct device_node *shmem = of_parse_phandle(np, "shmem", idx);
+
+@@ -986,45 +987,53 @@ static int scpi_probe(struct platform_device *pdev)
+ return ret;
+ }
+
+- scpi_info->commands = scpi_std_commands;
++ scpi_drvinfo->commands = scpi_std_commands;
+
+- platform_set_drvdata(pdev, scpi_info);
++ platform_set_drvdata(pdev, scpi_drvinfo);
+
+- if (scpi_info->is_legacy) {
++ if (scpi_drvinfo->is_legacy) {
+ /* Replace with legacy variants */
+ scpi_ops.clk_set_val = legacy_scpi_clk_set_val;
+- scpi_info->commands = scpi_legacy_commands;
++ scpi_drvinfo->commands = scpi_legacy_commands;
+
+ /* Fill priority bitmap */
+ for (idx = 0; idx < ARRAY_SIZE(legacy_hpriority_cmds); idx++)
+ set_bit(legacy_hpriority_cmds[idx],
+- scpi_info->cmd_priority);
++ scpi_drvinfo->cmd_priority);
+ }
+
+- ret = scpi_init_versions(scpi_info);
++ scpi_info = scpi_drvinfo;
++
++ ret = scpi_init_versions(scpi_drvinfo);
+ if (ret) {
+ dev_err(dev, "incorrect or no SCP firmware found\n");
++ scpi_info = NULL;
+ return ret;
+ }
+
+- if (scpi_info->is_legacy && !scpi_info->protocol_version &&
+- !scpi_info->firmware_version)
++ if (scpi_drvinfo->is_legacy && !scpi_drvinfo->protocol_version &&
++ !scpi_drvinfo->firmware_version)
+ dev_info(dev, "SCP Protocol legacy pre-1.0 firmware\n");
+ else
+ dev_info(dev, "SCP Protocol %lu.%lu Firmware %lu.%lu.%lu version\n",
+ FIELD_GET(PROTO_REV_MAJOR_MASK,
+- scpi_info->protocol_version),
++ scpi_drvinfo->protocol_version),
+ FIELD_GET(PROTO_REV_MINOR_MASK,
+- scpi_info->protocol_version),
++ scpi_drvinfo->protocol_version),
+ FIELD_GET(FW_REV_MAJOR_MASK,
+- scpi_info->firmware_version),
++ scpi_drvinfo->firmware_version),
+ FIELD_GET(FW_REV_MINOR_MASK,
+- scpi_info->firmware_version),
++ scpi_drvinfo->firmware_version),
+ FIELD_GET(FW_REV_PATCH_MASK,
+- scpi_info->firmware_version));
+- scpi_info->scpi_ops = &scpi_ops;
++ scpi_drvinfo->firmware_version));
++
++ scpi_drvinfo->scpi_ops = &scpi_ops;
+
+- return devm_of_platform_populate(dev);
++ ret = devm_of_platform_populate(dev);
++ if (ret)
++ scpi_info = NULL;
++
++ return ret;
+ }
+
+ static const struct of_device_id scpi_of_match[] = {
+--
+2.35.1
+
--- /dev/null
+From e881657d8ff0377cfd861e664e7e198f7d0ca102 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 26 Jul 2022 10:18:51 -0400
+Subject: ftrace/x86: Add back ftrace_expected assignment
+
+From: Steven Rostedt (Google) <rostedt@goodmis.org>
+
+[ Upstream commit ac6c1b2ca77e722a1e5d651f12f437f2f237e658 ]
+
+When a ftrace_bug happens (where ftrace fails to modify a location) it is
+helpful to have what was at that location as well as what was expected to
+be there.
+
+But with the conversion to text_poke() the assignment of the variable that
+holds the expected contents for debugging was dropped. Unfortunately, I
+noticed this when I needed it. Add it back.
+
+Link: https://lkml.kernel.org/r/20220726101851.069d2e70@gandalf.local.home
+
+Cc: "x86@kernel.org" <x86@kernel.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Cc: Ingo Molnar <mingo@kernel.org>
+Cc: Borislav Petkov <bp@alien8.de>
+Cc: "H. Peter Anvin" <hpa@zytor.com>
+Cc: Andrew Morton <akpm@linux-foundation.org>
+Cc: stable@vger.kernel.org
+Fixes: 768ae4406a5c ("x86/ftrace: Use text_poke()")
+Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/kernel/ftrace.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
+index 6892ca67d9c6..b6d7ece7bf51 100644
+--- a/arch/x86/kernel/ftrace.c
++++ b/arch/x86/kernel/ftrace.c
+@@ -93,6 +93,7 @@ static int ftrace_verify_code(unsigned long ip, const char *old_code)
+
+ /* Make sure it is what we expect it to be */
+ if (memcmp(cur_code, old_code, MCOUNT_INSN_SIZE) != 0) {
++ ftrace_expected = old_code;
+ WARN_ON(1);
+ return -EINVAL;
+ }
+--
+2.35.1
+
--- /dev/null
+From 2cd3814758463e87405049bb89f7940da1dc7c1b Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Sat, 23 Jul 2022 15:38:04 +0800
+Subject: hugetlb_cgroup: fix wrong hugetlb cgroup numa stat
+
+From: Miaohe Lin <linmiaohe@huawei.com>
+
+[ Upstream commit 2727cfe4072a35ce813e3708f74c135de7da8897 ]
+
+We forgot to set cft->private for the numa stat file. As a result, the numa
+stat of hstates[0] is always shown for all hstates. Encode the hstates index
+into cft->private to fix this issue.
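+
+For reference, a minimal sketch of why the index matters on the read side
+(the MEMFILE_* helpers are the ones defined in mm/hugetlb_cgroup.c;
+simplified for illustration):
+
+        /* hugetlb_cgroup_read_numa_stat() resolves the hstate from cft->private */
+        struct hstate *h = &hstates[MEMFILE_IDX(cft->private)];
+
+Without MEMFILE_PRIVATE(idx, 0) the private field stays 0, so the lookup
+always lands on hstates[0].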
+
+Link: https://lkml.kernel.org/r/20220723073804.53035-1-linmiaohe@huawei.com
+Fixes: f47761999052 ("hugetlb: add hugetlb.*.numa_stat file")
+Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
+Acked-by: Muchun Song <songmuchun@bytedance.com>
+Cc: Kees Cook <keescook@chromium.org>
+Cc: Mike Kravetz <mike.kravetz@oracle.com>
+Cc: Mina Almasry <almasrymina@google.com>
+Cc: Shakeel Butt <shakeelb@google.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ mm/hugetlb_cgroup.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c
+index f9942841df18..c86691c431fd 100644
+--- a/mm/hugetlb_cgroup.c
++++ b/mm/hugetlb_cgroup.c
+@@ -772,6 +772,7 @@ static void __init __hugetlb_cgroup_file_dfl_init(int idx)
+ /* Add the numa stat file */
+ cft = &h->cgroup_files_dfl[6];
+ snprintf(cft->name, MAX_CFTYPE_NAME, "%s.numa_stat", buf);
++ cft->private = MEMFILE_PRIVATE(idx, 0);
+ cft->seq_show = hugetlb_cgroup_read_numa_stat;
+ cft->flags = CFTYPE_NOT_ON_ROOT;
+
+--
+2.35.1
+
--- /dev/null
+From 6674953f1a89b966ac5c94dac089f2b80d9e9932 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 2 Aug 2022 15:20:33 +0800
+Subject: Input: gscps2 - check return value of ioremap() in gscps2_probe()
+
+From: Xie Shaowen <studentxswpy@163.com>
+
+[ Upstream commit e61b3125a4f036b3c6b87ffd656fc1ab00440ae9 ]
+
+The function ioremap() in gscps2_probe() can fail, so
+its return value should be checked.
+
+Fixes: 4bdc0d676a643 ("remove ioremap_nocache and devm_ioremap_nocache")
+Cc: <stable@vger.kernel.org> # v5.6+
+Reported-by: Hacash Robot <hacashRobot@santino.com>
+Signed-off-by: Xie Shaowen <studentxswpy@163.com>
+Signed-off-by: Helge Deller <deller@gmx.de>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/input/serio/gscps2.c | 4 ++++
+ 1 file changed, 4 insertions(+)
+
+diff --git a/drivers/input/serio/gscps2.c b/drivers/input/serio/gscps2.c
+index a9065c6ab550..da2c67cb8642 100644
+--- a/drivers/input/serio/gscps2.c
++++ b/drivers/input/serio/gscps2.c
+@@ -350,6 +350,10 @@ static int __init gscps2_probe(struct parisc_device *dev)
+ ps2port->port = serio;
+ ps2port->padev = dev;
+ ps2port->addr = ioremap(hpa, GSC_STATUS + 4);
++ if (!ps2port->addr) {
++ ret = -ENOMEM;
++ goto fail_nomem;
++ }
+ spin_lock_init(&ps2port->lock);
+
+ gscps2_reset(ps2port);
+--
+2.35.1
+
--- /dev/null
+From c13505c36b2642150596f12cc6b74f630ad22b45 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 15 Apr 2022 17:39:51 +0800
+Subject: intel_idle: Add AlderLake support
+
+From: Zhang Rui <rui.zhang@intel.com>
+
+[ Upstream commit d1cf8bbfed1edc5108220342ab39e4544d55fbc3 ]
+
+Similar to SPR, the C1 and C1E states on ADL are mutually exclusive.
+Only one of them can be enabled at a time.
+
+But in contrast to SPR, which usually has a strong latency requirement
+as a Xeon processor, C1E is preferred on ADL for better energy
+efficiency.
+
+Add custom C-state tables for ADL with both C1 and C1E, and
+
+ 1. Enable the "C1E promotion" bit in MSR_IA32_POWER_CTL and mark C1
+ with the CPUIDLE_FLAG_UNUSABLE flag, so C1 is not available by
+ default.
+
+ 2. Add support for the "preferred_cstates" module parameter, so that
+ users can choose to use C1 instead of C1E by booting with
+ "intel_idle.preferred_cstates=2".
+
+Separate custom C-state tables are introduced for the ADL mobile and
+desktop processors, because of the exit latency differences between
+these two variants, especially with respect to PC10.
+
+Signed-off-by: Zhang Rui <rui.zhang@intel.com>
+[ rjw: Changelog edits, code rearrangement ]
+Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/idle/intel_idle.c | 133 ++++++++++++++++++++++++++++++++++++++
+ 1 file changed, 133 insertions(+)
+
+diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
+index 47b68c6071be..907700d1e78e 100644
+--- a/drivers/idle/intel_idle.c
++++ b/drivers/idle/intel_idle.c
+@@ -811,6 +811,106 @@ static struct cpuidle_state icx_cstates[] __initdata = {
+ .enter = NULL }
+ };
+
++/*
++ * On AlderLake C1 has to be disabled if C1E is enabled, and vice versa.
++ * C1E is enabled only if "C1E promotion" bit is set in MSR_IA32_POWER_CTL.
++ * But in this case there is effectively no C1, because C1 requests are
++ * promoted to C1E. If the "C1E promotion" bit is cleared, then both C1
++ * and C1E requests end up with C1, so there is effectively no C1E.
++ *
++ * By default we enable C1E and disable C1 by marking it with
++ * 'CPUIDLE_FLAG_UNUSABLE'.
++ */
++static struct cpuidle_state adl_cstates[] __initdata = {
++ {
++ .name = "C1",
++ .desc = "MWAIT 0x00",
++ .flags = MWAIT2flg(0x00) | CPUIDLE_FLAG_UNUSABLE,
++ .exit_latency = 1,
++ .target_residency = 1,
++ .enter = &intel_idle,
++ .enter_s2idle = intel_idle_s2idle, },
++ {
++ .name = "C1E",
++ .desc = "MWAIT 0x01",
++ .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE,
++ .exit_latency = 2,
++ .target_residency = 4,
++ .enter = &intel_idle,
++ .enter_s2idle = intel_idle_s2idle, },
++ {
++ .name = "C6",
++ .desc = "MWAIT 0x20",
++ .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED,
++ .exit_latency = 220,
++ .target_residency = 600,
++ .enter = &intel_idle,
++ .enter_s2idle = intel_idle_s2idle, },
++ {
++ .name = "C8",
++ .desc = "MWAIT 0x40",
++ .flags = MWAIT2flg(0x40) | CPUIDLE_FLAG_TLB_FLUSHED,
++ .exit_latency = 280,
++ .target_residency = 800,
++ .enter = &intel_idle,
++ .enter_s2idle = intel_idle_s2idle, },
++ {
++ .name = "C10",
++ .desc = "MWAIT 0x60",
++ .flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED,
++ .exit_latency = 680,
++ .target_residency = 2000,
++ .enter = &intel_idle,
++ .enter_s2idle = intel_idle_s2idle, },
++ {
++ .enter = NULL }
++};
++
++static struct cpuidle_state adl_l_cstates[] __initdata = {
++ {
++ .name = "C1",
++ .desc = "MWAIT 0x00",
++ .flags = MWAIT2flg(0x00) | CPUIDLE_FLAG_UNUSABLE,
++ .exit_latency = 1,
++ .target_residency = 1,
++ .enter = &intel_idle,
++ .enter_s2idle = intel_idle_s2idle, },
++ {
++ .name = "C1E",
++ .desc = "MWAIT 0x01",
++ .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE,
++ .exit_latency = 2,
++ .target_residency = 4,
++ .enter = &intel_idle,
++ .enter_s2idle = intel_idle_s2idle, },
++ {
++ .name = "C6",
++ .desc = "MWAIT 0x20",
++ .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED,
++ .exit_latency = 170,
++ .target_residency = 500,
++ .enter = &intel_idle,
++ .enter_s2idle = intel_idle_s2idle, },
++ {
++ .name = "C8",
++ .desc = "MWAIT 0x40",
++ .flags = MWAIT2flg(0x40) | CPUIDLE_FLAG_TLB_FLUSHED,
++ .exit_latency = 200,
++ .target_residency = 600,
++ .enter = &intel_idle,
++ .enter_s2idle = intel_idle_s2idle, },
++ {
++ .name = "C10",
++ .desc = "MWAIT 0x60",
++ .flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED,
++ .exit_latency = 230,
++ .target_residency = 700,
++ .enter = &intel_idle,
++ .enter_s2idle = intel_idle_s2idle, },
++ {
++ .enter = NULL }
++};
++
+ /*
+ * On Sapphire Rapids Xeon C1 has to be disabled if C1E is enabled, and vice
+ * versa. On SPR C1E is enabled only if "C1E promotion" bit is set in
+@@ -1194,6 +1294,14 @@ static const struct idle_cpu idle_cpu_icx __initconst = {
+ .use_acpi = true,
+ };
+
++static const struct idle_cpu idle_cpu_adl __initconst = {
++ .state_table = adl_cstates,
++};
++
++static const struct idle_cpu idle_cpu_adl_l __initconst = {
++ .state_table = adl_l_cstates,
++};
++
+ static const struct idle_cpu idle_cpu_spr __initconst = {
+ .state_table = spr_cstates,
+ .disable_promotion_to_c1e = true,
+@@ -1262,6 +1370,8 @@ static const struct x86_cpu_id intel_idle_ids[] __initconst = {
+ X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_X, &idle_cpu_skx),
+ X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, &idle_cpu_icx),
+ X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, &idle_cpu_icx),
++ X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE, &idle_cpu_adl),
++ X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, &idle_cpu_adl_l),
+ X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, &idle_cpu_spr),
+ X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNL, &idle_cpu_knl),
+ X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNM, &idle_cpu_knl),
+@@ -1620,6 +1730,25 @@ static void __init skx_idle_state_table_update(void)
+ }
+ }
+
++/**
++ * adl_idle_state_table_update - Adjust AlderLake idle states table.
++ */
++static void __init adl_idle_state_table_update(void)
++{
++ /* Check if user prefers C1 over C1E. */
++ if (preferred_states_mask & BIT(1) && !(preferred_states_mask & BIT(2))) {
++ cpuidle_state_table[0].flags &= ~CPUIDLE_FLAG_UNUSABLE;
++ cpuidle_state_table[1].flags |= CPUIDLE_FLAG_UNUSABLE;
++
++ /* Disable C1E by clearing the "C1E promotion" bit. */
++ c1e_promotion = C1E_PROMOTION_DISABLE;
++ return;
++ }
++
++ /* Make sure C1E is enabled by default */
++ c1e_promotion = C1E_PROMOTION_ENABLE;
++}
++
+ /**
+ * spr_idle_state_table_update - Adjust Sapphire Rapids idle states table.
+ */
+@@ -1689,6 +1818,10 @@ static void __init intel_idle_init_cstates_icpu(struct cpuidle_driver *drv)
+ case INTEL_FAM6_SAPPHIRERAPIDS_X:
+ spr_idle_state_table_update();
+ break;
++ case INTEL_FAM6_ALDERLAKE:
++ case INTEL_FAM6_ALDERLAKE_L:
++ adl_idle_state_table_update();
++ break;
+ }
+
+ for (cstate = 0; cstate < CPUIDLE_STATE_MAX; ++cstate) {
+--
+2.35.1
+
--- /dev/null
+From 72d2cfae1dd2c795060d056fcddb70dff941d0da Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Sat, 16 Jul 2022 09:26:55 +0300
+Subject: intel_idle: make SPR C1 and C1E be independent
+
+From: Artem Bityutskiy <artem.bityutskiy@linux.intel.com>
+
+[ Upstream commit 1548fac47a114b42063def551eb152a536ed9697 ]
+
+This patch partially reverts the changes made by the following commit:
+
+da0e58c038e6 intel_idle: add 'preferred_cstates' module argument
+
+As that commit describes, on early Sapphire Rapids Xeon platforms the C1 and
+C1E states were mutually exclusive, so that users could only have either C1 and
+C6, or C1E and C6.
+
+However, Intel firmware engineers managed to remove this limitation and make C1
+and C1E completely independent, just like on previous Xeon platforms.
+
+Therefore, this patch:
+ * Removes commentary describing the old, and now non-existing SPR C1E
+ limitation.
+ * Marks SPR C1E as available by default.
+ * Removes the 'preferred_cstates' parameter handling for SPR. Both C1 and
+ C1E will be available regardless of 'preferred_cstates' value.
+
+We expect that all SPR systems are shipping with new firmware, which includes
+the C1/C1E improvement.
+
+Cc: v5.18+ <stable@vger.kernel.org> # v5.18+
+Signed-off-by: Artem Bityutskiy <artem.bityutskiy@linux.intel.com>
+Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/idle/intel_idle.c | 24 +-----------------------
+ 1 file changed, 1 insertion(+), 23 deletions(-)
+
+diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
+index 907700d1e78e..9515a3146dc9 100644
+--- a/drivers/idle/intel_idle.c
++++ b/drivers/idle/intel_idle.c
+@@ -911,16 +911,6 @@ static struct cpuidle_state adl_l_cstates[] __initdata = {
+ .enter = NULL }
+ };
+
+-/*
+- * On Sapphire Rapids Xeon C1 has to be disabled if C1E is enabled, and vice
+- * versa. On SPR C1E is enabled only if "C1E promotion" bit is set in
+- * MSR_IA32_POWER_CTL. But in this case there effectively no C1, because C1
+- * requests are promoted to C1E. If the "C1E promotion" bit is cleared, then
+- * both C1 and C1E requests end up with C1, so there is effectively no C1E.
+- *
+- * By default we enable C1 and disable C1E by marking it with
+- * 'CPUIDLE_FLAG_UNUSABLE'.
+- */
+ static struct cpuidle_state spr_cstates[] __initdata = {
+ {
+ .name = "C1",
+@@ -933,8 +923,7 @@ static struct cpuidle_state spr_cstates[] __initdata = {
+ {
+ .name = "C1E",
+ .desc = "MWAIT 0x01",
+- .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE |
+- CPUIDLE_FLAG_UNUSABLE,
++ .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE,
+ .exit_latency = 2,
+ .target_residency = 4,
+ .enter = &intel_idle,
+@@ -1756,17 +1745,6 @@ static void __init spr_idle_state_table_update(void)
+ {
+ unsigned long long msr;
+
+- /* Check if user prefers C1E over C1. */
+- if ((preferred_states_mask & BIT(2)) &&
+- !(preferred_states_mask & BIT(1))) {
+- /* Disable C1 and enable C1E. */
+- spr_cstates[0].flags |= CPUIDLE_FLAG_UNUSABLE;
+- spr_cstates[1].flags &= ~CPUIDLE_FLAG_UNUSABLE;
+-
+- /* Enable C1E using the "C1E promotion" bit. */
+- c1e_promotion = C1E_PROMOTION_ENABLE;
+- }
+-
+ /*
+ * By default, the C6 state assumes the worst-case scenario of package
+ * C6. However, if PC6 is disabled, we update the numbers to match
+--
+2.35.1
+
--- /dev/null
+From ca9b34b3b4760f6429dbc8d09097e704d75a7bed Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 5 Jul 2022 11:26:35 +0300
+Subject: intel_th: pci: Add Meteor Lake-P support
+
+From: Alexander Shishkin <alexander.shishkin@linux.intel.com>
+
+[ Upstream commit 802a9a0b1d91274ef10d9fe429b4cc1e8c200aef ]
+
+Add support for the Trace Hub in Meteor Lake-P.
+
+Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
+Cc: stable <stable@kernel.org>
+Signed-off-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
+Link: https://lore.kernel.org/r/20220705082637.59979-5-alexander.shishkin@linux.intel.com
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/hwtracing/intel_th/pci.c | 5 +++++
+ 1 file changed, 5 insertions(+)
+
+diff --git a/drivers/hwtracing/intel_th/pci.c b/drivers/hwtracing/intel_th/pci.c
+index fcd0aca75007..41a31c7f505f 100644
+--- a/drivers/hwtracing/intel_th/pci.c
++++ b/drivers/hwtracing/intel_th/pci.c
+@@ -284,6 +284,11 @@ static const struct pci_device_id intel_th_pci_id_table[] = {
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x54a6),
+ .driver_data = (kernel_ulong_t)&intel_th_2x,
+ },
++ {
++ /* Meteor Lake-P */
++ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7e24),
++ .driver_data = (kernel_ulong_t)&intel_th_2x,
++ },
+ {
+ /* Alder Lake CPU */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x466f),
+--
+2.35.1
+
--- /dev/null
+From 72f0e594ee06a5ab7c5ba3b37bb0774444d0b18e Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 5 Jul 2022 11:26:37 +0300
+Subject: intel_th: pci: Add Raptor Lake-S CPU support
+
+From: Alexander Shishkin <alexander.shishkin@linux.intel.com>
+
+[ Upstream commit ff46a601afc5a66a81c3945b83d0a2caeb88e8bc ]
+
+Add support for the Trace Hub in Raptor Lake-S CPU.
+
+Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
+Cc: stable <stable@kernel.org>
+Signed-off-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
+Link: https://lore.kernel.org/r/20220705082637.59979-7-alexander.shishkin@linux.intel.com
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/hwtracing/intel_th/pci.c | 5 +++++
+ 1 file changed, 5 insertions(+)
+
+diff --git a/drivers/hwtracing/intel_th/pci.c b/drivers/hwtracing/intel_th/pci.c
+index 5b6da26f1b63..147d338c191e 100644
+--- a/drivers/hwtracing/intel_th/pci.c
++++ b/drivers/hwtracing/intel_th/pci.c
+@@ -294,6 +294,11 @@ static const struct pci_device_id intel_th_pci_id_table[] = {
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7a26),
+ .driver_data = (kernel_ulong_t)&intel_th_2x,
+ },
++ {
++ /* Raptor Lake-S CPU */
++ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xa76f),
++ .driver_data = (kernel_ulong_t)&intel_th_2x,
++ },
+ {
+ /* Alder Lake CPU */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x466f),
+--
+2.35.1
+
--- /dev/null
+From 2ea65adef5be4557edb506fdaa6b3ea53411d77f Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 5 Jul 2022 11:26:36 +0300
+Subject: intel_th: pci: Add Raptor Lake-S PCH support
+
+From: Alexander Shishkin <alexander.shishkin@linux.intel.com>
+
+[ Upstream commit 23e2de5826e2fc4dd43e08bab3a2ea1a5338b063 ]
+
+Add support for the Trace Hub in Raptor Lake-S PCH.
+
+Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
+Cc: stable <stable@kernel.org>
+Signed-off-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
+Link: https://lore.kernel.org/r/20220705082637.59979-6-alexander.shishkin@linux.intel.com
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/hwtracing/intel_th/pci.c | 5 +++++
+ 1 file changed, 5 insertions(+)
+
+diff --git a/drivers/hwtracing/intel_th/pci.c b/drivers/hwtracing/intel_th/pci.c
+index 41a31c7f505f..5b6da26f1b63 100644
+--- a/drivers/hwtracing/intel_th/pci.c
++++ b/drivers/hwtracing/intel_th/pci.c
+@@ -289,6 +289,11 @@ static const struct pci_device_id intel_th_pci_id_table[] = {
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7e24),
+ .driver_data = (kernel_ulong_t)&intel_th_2x,
+ },
++ {
++ /* Raptor Lake-S */
++ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7a26),
++ .driver_data = (kernel_ulong_t)&intel_th_2x,
++ },
+ {
+ /* Alder Lake CPU */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x466f),
+--
+2.35.1
+
--- /dev/null
+From 48a418e1d496eadf562380a9947ee715f429e6ca Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 12 Jul 2022 17:38:36 +0200
+Subject: iommu/vt-d: avoid invalid memory access via node_online(NUMA_NO_NODE)
+
+From: Alexander Lobakin <alexandr.lobakin@intel.com>
+
+[ Upstream commit b0b0b77ea611e3088e9523e60860f4f41b62b235 ]
+
+KASAN reports:
+
+[ 4.668325][ T0] BUG: KASAN: wild-memory-access in dmar_parse_one_rhsa (arch/x86/include/asm/bitops.h:214 arch/x86/include/asm/bitops.h:226 include/asm-generic/bitops/instrumented-non-atomic.h:142 include/linux/nodemask.h:415 drivers/iommu/intel/dmar.c:497)
+[ 4.676149][ T0] Read of size 8 at addr 1fffffff85115558 by task swapper/0/0
+[ 4.683454][ T0]
+[ 4.685638][ T0] CPU: 0 PID: 0 Comm: swapper/0 Not tainted 5.19.0-rc3-00004-g0e862838f290 #1
+[ 4.694331][ T0] Hardware name: Supermicro SYS-5018D-FN4T/X10SDV-8C-TLN4F, BIOS 1.1 03/02/2016
+[ 4.703196][ T0] Call Trace:
+[ 4.706334][ T0] <TASK>
+[ 4.709133][ T0] ? dmar_parse_one_rhsa (arch/x86/include/asm/bitops.h:214 arch/x86/include/asm/bitops.h:226 include/asm-generic/bitops/instrumented-non-atomic.h:142 include/linux/nodemask.h:415 drivers/iommu/intel/dmar.c:497)
+
+after converting the type of the first argument (@nr, bit number)
+of arch_test_bit() from `long` to `unsigned long`[0].
+
+Under certain conditions (for example, when ACPI NUMA is disabled
+via the command line), pxm_to_node() can return %NUMA_NO_NODE (-1).
+It is a valid 'magic' NUMA node value, but not a valid bit number to
+use in bitops.
+node_online() eventually descends to test_bit() without checking its
+input, assuming validation is done on the caller side (which might be
+good for perf-critical paths). There, -1 becomes %ULONG_MAX, which
+leads to an insane array index when calculating the bit position in
+memory.
+
+For now, add an explicit check that @node is not %NUMA_NO_NODE
+before calling test_bit(). The actual logic does not change here
+at all.
+
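+A minimal sketch of the resulting guard (it mirrors the one-line fix in the
+hunk below; the surrounding context is only illustrative):
+
+	int node = pxm_to_node(rhsa->proximity_domain);
+
+	/* -1 (NUMA_NO_NODE) must never reach test_bit() via node_online(). */
+	if (node != NUMA_NO_NODE && !node_online(node))
+		node = NUMA_NO_NODE;
+	drhd->iommu->node = node;
+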
+[0] https://github.com/norov/linux/commit/0e862838f290147ea9c16db852d8d494b552d38d
+
+Fixes: ee34b32d8c29 ("dmar: support for parsing Remapping Hardware Static Affinity structure")
+Cc: stable@vger.kernel.org # 2.6.33+
+Reported-by: kernel test robot <oliver.sang@intel.com>
+Signed-off-by: Alexander Lobakin <alexandr.lobakin@intel.com>
+Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
+Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
+Signed-off-by: Yury Norov <yury.norov@gmail.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/iommu/intel/dmar.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/drivers/iommu/intel/dmar.c b/drivers/iommu/intel/dmar.c
+index 497c5bd95caf..2a10c9b54064 100644
+--- a/drivers/iommu/intel/dmar.c
++++ b/drivers/iommu/intel/dmar.c
+@@ -495,7 +495,7 @@ static int dmar_parse_one_rhsa(struct acpi_dmar_header *header, void *arg)
+ if (drhd->reg_base_addr == rhsa->base_address) {
+ int node = pxm_to_node(rhsa->proximity_domain);
+
+- if (!node_online(node))
++ if (node != NUMA_NO_NODE && !node_online(node))
+ node = NUMA_NO_NODE;
+ drhd->iommu->node = node;
+ return 0;
+--
+2.35.1
+
--- /dev/null
+From 0d01ede0c3b718afbf2b4bdb7b91840f25cc0d0e Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 14 Jul 2022 21:40:24 +0800
+Subject: kexec: clean up arch_kexec_kernel_verify_sig
+
+From: Coiby Xu <coxu@redhat.com>
+
+[ Upstream commit 689a71493bd2f31c024f8c0395f85a1fd4b2138e ]
+
+Before commit 105e10e2cf1c ("kexec_file: drop weak attribute from
+functions"), there was already no arch-specific implementation
+of arch_kexec_kernel_verify_sig(). With the weak attribute dropped by
+that commit, arch_kexec_kernel_verify_sig() is completely useless, so
+clean it up.
+
+Note later patches are dependent on this patch so it should be backported
+to the stable tree as well.
+
+Cc: stable@vger.kernel.org
+Suggested-by: Eric W. Biederman <ebiederm@xmission.com>
+Reviewed-by: Michal Suchanek <msuchanek@suse.de>
+Acked-by: Baoquan He <bhe@redhat.com>
+Signed-off-by: Coiby Xu <coxu@redhat.com>
+[zohar@linux.ibm.com: reworded patch description "Note"]
+Link: https://lore.kernel.org/linux-integrity/20220714134027.394370-1-coxu@redhat.com/
+Signed-off-by: Mimi Zohar <zohar@linux.ibm.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ include/linux/kexec.h | 5 -----
+ kernel/kexec_file.c | 33 +++++++++++++--------------------
+ 2 files changed, 13 insertions(+), 25 deletions(-)
+
+diff --git a/include/linux/kexec.h b/include/linux/kexec.h
+index 87c1795297b0..f3e7680befcc 100644
+--- a/include/linux/kexec.h
++++ b/include/linux/kexec.h
+@@ -212,11 +212,6 @@ static inline void *arch_kexec_kernel_image_load(struct kimage *image)
+ }
+ #endif
+
+-#ifdef CONFIG_KEXEC_SIG
+-int arch_kexec_kernel_verify_sig(struct kimage *image, void *buf,
+- unsigned long buf_len);
+-#endif
+-
+ extern int kexec_add_buffer(struct kexec_buf *kbuf);
+ int kexec_locate_mem_hole(struct kexec_buf *kbuf);
+
+diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
+index 925953dfef05..ad005cd184a4 100644
+--- a/kernel/kexec_file.c
++++ b/kernel/kexec_file.c
+@@ -81,24 +81,6 @@ int kexec_image_post_load_cleanup_default(struct kimage *image)
+ return image->fops->cleanup(image->image_loader_data);
+ }
+
+-#ifdef CONFIG_KEXEC_SIG
+-static int kexec_image_verify_sig_default(struct kimage *image, void *buf,
+- unsigned long buf_len)
+-{
+- if (!image->fops || !image->fops->verify_sig) {
+- pr_debug("kernel loader does not support signature verification.\n");
+- return -EKEYREJECTED;
+- }
+-
+- return image->fops->verify_sig(buf, buf_len);
+-}
+-
+-int arch_kexec_kernel_verify_sig(struct kimage *image, void *buf, unsigned long buf_len)
+-{
+- return kexec_image_verify_sig_default(image, buf, buf_len);
+-}
+-#endif
+-
+ /*
+ * Free up memory used by kernel, initrd, and command line. This is temporary
+ * memory allocation which is not needed any more after these buffers have
+@@ -141,13 +123,24 @@ void kimage_file_post_load_cleanup(struct kimage *image)
+ }
+
+ #ifdef CONFIG_KEXEC_SIG
++static int kexec_image_verify_sig(struct kimage *image, void *buf,
++ unsigned long buf_len)
++{
++ if (!image->fops || !image->fops->verify_sig) {
++ pr_debug("kernel loader does not support signature verification.\n");
++ return -EKEYREJECTED;
++ }
++
++ return image->fops->verify_sig(buf, buf_len);
++}
++
+ static int
+ kimage_validate_signature(struct kimage *image)
+ {
+ int ret;
+
+- ret = arch_kexec_kernel_verify_sig(image, image->kernel_buf,
+- image->kernel_buf_len);
++ ret = kexec_image_verify_sig(image, image->kernel_buf,
++ image->kernel_buf_len);
+ if (ret) {
+
+ if (sig_enforce) {
+--
+2.35.1
+
--- /dev/null
+From 2f10017ddc47f38b293b183149fca92892bcfec0 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 14 Jul 2022 21:40:27 +0800
+Subject: kexec, KEYS, s390: Make use of built-in and secondary keyring for
+ signature verification
+
+From: Michal Suchanek <msuchanek@suse.de>
+
+[ Upstream commit 0828c4a39be57768b8788e8cbd0d84683ea757e5 ]
+
+commit e23a8020ce4e ("s390/kexec_file: Signature verification prototype")
+adds support for KEXEC_SIG verification with keys from the platform
+keyring, but the built-in keys and the secondary keyring are not used.
+
+Add support for the built-in keys and the secondary keyring, as x86 does.
+
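+A condensed sketch of the fallback order implemented in the hunk below: try
+the built-in/secondary keyrings first and fall back to the platform keyring
+only when the key is not found there:
+
+	ret = verify_pkcs7_signature(kernel, kernel_len, kernel + kernel_len,
+				     sig_len, VERIFY_USE_SECONDARY_KEYRING,
+				     VERIFYING_MODULE_SIGNATURE, NULL, NULL);
+	if (ret == -ENOKEY && IS_ENABLED(CONFIG_INTEGRITY_PLATFORM_KEYRING))
+		ret = verify_pkcs7_signature(kernel, kernel_len,
+					     kernel + kernel_len, sig_len,
+					     VERIFY_USE_PLATFORM_KEYRING,
+					     VERIFYING_MODULE_SIGNATURE,
+					     NULL, NULL);
+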
+Fixes: e23a8020ce4e ("s390/kexec_file: Signature verification prototype")
+Cc: stable@vger.kernel.org
+Cc: Philipp Rudo <prudo@linux.ibm.com>
+Cc: kexec@lists.infradead.org
+Cc: keyrings@vger.kernel.org
+Cc: linux-security-module@vger.kernel.org
+Signed-off-by: Michal Suchanek <msuchanek@suse.de>
+Reviewed-by: "Lee, Chun-Yi" <jlee@suse.com>
+Acked-by: Baoquan He <bhe@redhat.com>
+Signed-off-by: Coiby Xu <coxu@redhat.com>
+Acked-by: Heiko Carstens <hca@linux.ibm.com>
+Signed-off-by: Mimi Zohar <zohar@linux.ibm.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/s390/kernel/machine_kexec_file.c | 18 +++++++++++++-----
+ 1 file changed, 13 insertions(+), 5 deletions(-)
+
+diff --git a/arch/s390/kernel/machine_kexec_file.c b/arch/s390/kernel/machine_kexec_file.c
+index 8f43575a4dd3..fc6d5f58debe 100644
+--- a/arch/s390/kernel/machine_kexec_file.c
++++ b/arch/s390/kernel/machine_kexec_file.c
+@@ -31,6 +31,7 @@ int s390_verify_sig(const char *kernel, unsigned long kernel_len)
+ const unsigned long marker_len = sizeof(MODULE_SIG_STRING) - 1;
+ struct module_signature *ms;
+ unsigned long sig_len;
++ int ret;
+
+ /* Skip signature verification when not secure IPLed. */
+ if (!ipl_secure_flag)
+@@ -65,11 +66,18 @@ int s390_verify_sig(const char *kernel, unsigned long kernel_len)
+ return -EBADMSG;
+ }
+
+- return verify_pkcs7_signature(kernel, kernel_len,
+- kernel + kernel_len, sig_len,
+- VERIFY_USE_PLATFORM_KEYRING,
+- VERIFYING_MODULE_SIGNATURE,
+- NULL, NULL);
++ ret = verify_pkcs7_signature(kernel, kernel_len,
++ kernel + kernel_len, sig_len,
++ VERIFY_USE_SECONDARY_KEYRING,
++ VERIFYING_MODULE_SIGNATURE,
++ NULL, NULL);
++ if (ret == -ENOKEY && IS_ENABLED(CONFIG_INTEGRITY_PLATFORM_KEYRING))
++ ret = verify_pkcs7_signature(kernel, kernel_len,
++ kernel + kernel_len, sig_len,
++ VERIFY_USE_PLATFORM_KEYRING,
++ VERIFYING_MODULE_SIGNATURE,
++ NULL, NULL);
++ return ret;
+ }
+ #endif /* CONFIG_KEXEC_SIG */
+
+--
+2.35.1
+
--- /dev/null
+From a0103a12b495a7c1698fb02cfa8077d5018aedce Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 1 Jul 2022 13:04:04 +0530
+Subject: kexec_file: drop weak attribute from functions
+
+From: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
+
+[ Upstream commit 65d9a9a60fd71be964effb2e94747a6acb6e7015 ]
+
+As requested
+(http://lkml.kernel.org/r/87ee0q7b92.fsf@email.froward.int.ebiederm.org),
+this series converts weak functions in kexec to use the #ifdef approach.
+
+Quoting the 3e35142ef99fe ("kexec_file: drop weak attribute from
+arch_kexec_apply_relocations[_add]") changelog:
+
+: Since commit d1bcae833b32f1 ("ELF: Don't generate unused section symbols")
+: [1], binutils (v2.36+) started dropping section symbols that it thought
+: were unused. This isn't an issue in general, but with kexec_file.c, gcc
+: is placing kexec_arch_apply_relocations[_add] into a separate
+: .text.unlikely section and the section symbol ".text.unlikely" is being
+: dropped. Due to this, recordmcount is unable to find a non-weak symbol in
+: .text.unlikely to generate a relocation record against.
+
+This patch (of 2):
+
+Drop __weak attribute from functions in kexec_file.c:
+- arch_kexec_kernel_image_probe()
+- arch_kimage_file_post_load_cleanup()
+- arch_kexec_kernel_image_load()
+- arch_kexec_locate_mem_hole()
+- arch_kexec_kernel_verify_sig()
+
+arch_kexec_kernel_image_load() calls into kexec_image_load_default(), so
+drop the static attribute for the latter.
+
+arch_kexec_kernel_verify_sig() is not overridden by any architecture, so
+drop the __weak attribute.
+
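+The override pattern used throughout is sketched below (both halves appear in
+the hunks that follow): an architecture declares its own implementation and
+defines a macro with the same name, which compiles out the generic static
+inline fallback in <linux/kexec.h>:
+
+	/* arch/<arch>/include/asm/kexec.h */
+	int arch_kimage_file_post_load_cleanup(struct kimage *image);
+	#define arch_kimage_file_post_load_cleanup arch_kimage_file_post_load_cleanup
+
+	/* include/linux/kexec.h */
+	#ifndef arch_kimage_file_post_load_cleanup
+	static inline int arch_kimage_file_post_load_cleanup(struct kimage *image)
+	{
+		return kexec_image_post_load_cleanup_default(image);
+	}
+	#endif
+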
+Link: https://lkml.kernel.org/r/cover.1656659357.git.naveen.n.rao@linux.vnet.ibm.com
+Link: https://lkml.kernel.org/r/2cd7ca1fe4d6bb6ca38e3283c717878388ed6788.1656659357.git.naveen.n.rao@linux.vnet.ibm.com
+Signed-off-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
+Suggested-by: Eric Biederman <ebiederm@xmission.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Mimi Zohar <zohar@linux.ibm.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/arm64/include/asm/kexec.h | 4 ++-
+ arch/powerpc/include/asm/kexec.h | 9 +++++++
+ arch/s390/include/asm/kexec.h | 3 +++
+ arch/x86/include/asm/kexec.h | 6 +++++
+ include/linux/kexec.h | 44 +++++++++++++++++++++++++++-----
+ kernel/kexec_file.c | 35 ++-----------------------
+ 6 files changed, 61 insertions(+), 40 deletions(-)
+
+diff --git a/arch/arm64/include/asm/kexec.h b/arch/arm64/include/asm/kexec.h
+index 9839bfc163d7..78d272b26ebd 100644
+--- a/arch/arm64/include/asm/kexec.h
++++ b/arch/arm64/include/asm/kexec.h
+@@ -115,7 +115,9 @@ extern const struct kexec_file_ops kexec_image_ops;
+
+ struct kimage;
+
+-extern int arch_kimage_file_post_load_cleanup(struct kimage *image);
++int arch_kimage_file_post_load_cleanup(struct kimage *image);
++#define arch_kimage_file_post_load_cleanup arch_kimage_file_post_load_cleanup
++
+ extern int load_other_segments(struct kimage *image,
+ unsigned long kernel_load_addr, unsigned long kernel_size,
+ char *initrd, unsigned long initrd_len,
+diff --git a/arch/powerpc/include/asm/kexec.h b/arch/powerpc/include/asm/kexec.h
+index 2aefe14e1442..1e5e9b6ec78d 100644
+--- a/arch/powerpc/include/asm/kexec.h
++++ b/arch/powerpc/include/asm/kexec.h
+@@ -120,6 +120,15 @@ int setup_purgatory(struct kimage *image, const void *slave_code,
+ #ifdef CONFIG_PPC64
+ struct kexec_buf;
+
++int arch_kexec_kernel_image_probe(struct kimage *image, void *buf, unsigned long buf_len);
++#define arch_kexec_kernel_image_probe arch_kexec_kernel_image_probe
++
++int arch_kimage_file_post_load_cleanup(struct kimage *image);
++#define arch_kimage_file_post_load_cleanup arch_kimage_file_post_load_cleanup
++
++int arch_kexec_locate_mem_hole(struct kexec_buf *kbuf);
++#define arch_kexec_locate_mem_hole arch_kexec_locate_mem_hole
++
+ int load_crashdump_segments_ppc64(struct kimage *image,
+ struct kexec_buf *kbuf);
+ int setup_purgatory_ppc64(struct kimage *image, const void *slave_code,
+diff --git a/arch/s390/include/asm/kexec.h b/arch/s390/include/asm/kexec.h
+index 63098df81c9f..d13bd221cd37 100644
+--- a/arch/s390/include/asm/kexec.h
++++ b/arch/s390/include/asm/kexec.h
+@@ -92,5 +92,8 @@ int arch_kexec_apply_relocations_add(struct purgatory_info *pi,
+ const Elf_Shdr *relsec,
+ const Elf_Shdr *symtab);
+ #define arch_kexec_apply_relocations_add arch_kexec_apply_relocations_add
++
++int arch_kimage_file_post_load_cleanup(struct kimage *image);
++#define arch_kimage_file_post_load_cleanup arch_kimage_file_post_load_cleanup
+ #endif
+ #endif /*_S390_KEXEC_H */
+diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h
+index 6ad8d946cd3e..5ec359c1b50c 100644
+--- a/arch/x86/include/asm/kexec.h
++++ b/arch/x86/include/asm/kexec.h
+@@ -193,6 +193,12 @@ int arch_kexec_apply_relocations_add(struct purgatory_info *pi,
+ const Elf_Shdr *relsec,
+ const Elf_Shdr *symtab);
+ #define arch_kexec_apply_relocations_add arch_kexec_apply_relocations_add
++
++void *arch_kexec_kernel_image_load(struct kimage *image);
++#define arch_kexec_kernel_image_load arch_kexec_kernel_image_load
++
++int arch_kimage_file_post_load_cleanup(struct kimage *image);
++#define arch_kimage_file_post_load_cleanup arch_kimage_file_post_load_cleanup
+ #endif
+ #endif
+
+diff --git a/include/linux/kexec.h b/include/linux/kexec.h
+index 8d573baaab29..87c1795297b0 100644
+--- a/include/linux/kexec.h
++++ b/include/linux/kexec.h
+@@ -188,21 +188,53 @@ int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name,
+ void *buf, unsigned int size,
+ bool get_value);
+ void *kexec_purgatory_get_symbol_addr(struct kimage *image, const char *name);
++void *kexec_image_load_default(struct kimage *image);
++
++#ifndef arch_kexec_kernel_image_probe
++static inline int
++arch_kexec_kernel_image_probe(struct kimage *image, void *buf, unsigned long buf_len)
++{
++ return kexec_image_probe_default(image, buf, buf_len);
++}
++#endif
++
++#ifndef arch_kimage_file_post_load_cleanup
++static inline int arch_kimage_file_post_load_cleanup(struct kimage *image)
++{
++ return kexec_image_post_load_cleanup_default(image);
++}
++#endif
++
++#ifndef arch_kexec_kernel_image_load
++static inline void *arch_kexec_kernel_image_load(struct kimage *image)
++{
++ return kexec_image_load_default(image);
++}
++#endif
+
+-/* Architectures may override the below functions */
+-int arch_kexec_kernel_image_probe(struct kimage *image, void *buf,
+- unsigned long buf_len);
+-void *arch_kexec_kernel_image_load(struct kimage *image);
+-int arch_kimage_file_post_load_cleanup(struct kimage *image);
+ #ifdef CONFIG_KEXEC_SIG
+ int arch_kexec_kernel_verify_sig(struct kimage *image, void *buf,
+ unsigned long buf_len);
+ #endif
+-int arch_kexec_locate_mem_hole(struct kexec_buf *kbuf);
+
+ extern int kexec_add_buffer(struct kexec_buf *kbuf);
+ int kexec_locate_mem_hole(struct kexec_buf *kbuf);
+
++#ifndef arch_kexec_locate_mem_hole
++/**
++ * arch_kexec_locate_mem_hole - Find free memory to place the segments.
++ * @kbuf: Parameters for the memory search.
++ *
++ * On success, kbuf->mem will have the start address of the memory region found.
++ *
++ * Return: 0 on success, negative errno on error.
++ */
++static inline int arch_kexec_locate_mem_hole(struct kexec_buf *kbuf)
++{
++ return kexec_locate_mem_hole(kbuf);
++}
++#endif
++
+ /* Alignment required for elf header segment */
+ #define ELF_CORE_HEADER_ALIGN 4096
+
+diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
+index bb0fb63f563c..925953dfef05 100644
+--- a/kernel/kexec_file.c
++++ b/kernel/kexec_file.c
+@@ -62,14 +62,7 @@ int kexec_image_probe_default(struct kimage *image, void *buf,
+ return ret;
+ }
+
+-/* Architectures can provide this probe function */
+-int __weak arch_kexec_kernel_image_probe(struct kimage *image, void *buf,
+- unsigned long buf_len)
+-{
+- return kexec_image_probe_default(image, buf, buf_len);
+-}
+-
+-static void *kexec_image_load_default(struct kimage *image)
++void *kexec_image_load_default(struct kimage *image)
+ {
+ if (!image->fops || !image->fops->load)
+ return ERR_PTR(-ENOEXEC);
+@@ -80,11 +73,6 @@ static void *kexec_image_load_default(struct kimage *image)
+ image->cmdline_buf_len);
+ }
+
+-void * __weak arch_kexec_kernel_image_load(struct kimage *image)
+-{
+- return kexec_image_load_default(image);
+-}
+-
+ int kexec_image_post_load_cleanup_default(struct kimage *image)
+ {
+ if (!image->fops || !image->fops->cleanup)
+@@ -93,11 +81,6 @@ int kexec_image_post_load_cleanup_default(struct kimage *image)
+ return image->fops->cleanup(image->image_loader_data);
+ }
+
+-int __weak arch_kimage_file_post_load_cleanup(struct kimage *image)
+-{
+- return kexec_image_post_load_cleanup_default(image);
+-}
+-
+ #ifdef CONFIG_KEXEC_SIG
+ static int kexec_image_verify_sig_default(struct kimage *image, void *buf,
+ unsigned long buf_len)
+@@ -110,8 +93,7 @@ static int kexec_image_verify_sig_default(struct kimage *image, void *buf,
+ return image->fops->verify_sig(buf, buf_len);
+ }
+
+-int __weak arch_kexec_kernel_verify_sig(struct kimage *image, void *buf,
+- unsigned long buf_len)
++int arch_kexec_kernel_verify_sig(struct kimage *image, void *buf, unsigned long buf_len)
+ {
+ return kexec_image_verify_sig_default(image, buf, buf_len);
+ }
+@@ -621,19 +603,6 @@ int kexec_locate_mem_hole(struct kexec_buf *kbuf)
+ return ret == 1 ? 0 : -EADDRNOTAVAIL;
+ }
+
+-/**
+- * arch_kexec_locate_mem_hole - Find free memory to place the segments.
+- * @kbuf: Parameters for the memory search.
+- *
+- * On success, kbuf->mem will have the start address of the memory region found.
+- *
+- * Return: 0 on success, negative errno on error.
+- */
+-int __weak arch_kexec_locate_mem_hole(struct kexec_buf *kbuf)
+-{
+- return kexec_locate_mem_hole(kbuf);
+-}
+-
+ /**
+ * kexec_add_buffer - place a buffer in a kexec segment
+ * @kbuf: Buffer contents and memory parameters.
+--
+2.35.1
+
--- /dev/null
+From 7537225de80672a829ff983a94b62557441b5bf3 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 28 Jun 2022 11:37:20 +0800
+Subject: KEYS: asymmetric: enforce SM2 signature use pkey algo
+
+From: Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
+
+[ Upstream commit 0815291a8fd66cdcf7db1445d4d99b0d16065829 ]
+
+SM2 signature verification needs to add the Za value and recalculate
+sig->digest, which requires detecting the pkey_algo in
+public_key_verify_signature(). As Eric Biggers pointed out, the
+pkey_algo field in sig is attacker-controlled, so pkey->pkey_algo
+should be used instead of sig->pkey_algo; moreover, a NULL
+sig->pkey_algo would also cause signature verification to fail.
+
+software_key_determine_akcipher() already forces the algorithms to
+match, so the SM3 hash is enforced for SM2 signatures. Although this
+has already been checked, we still avoid using any algorithm
+information from the signature as input.
+
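+A condensed sketch of the resulting checks (both taken from the hunks below):
+
+	/* In cert_sig_digest_update(): SM2 signatures always use SM3. */
+	if (!sig->hash_algo || strcmp(sig->hash_algo, "sm3") != 0)
+		return -EINVAL;
+
+	/* In public_key_verify_signature(): the key's algorithm, not the
+	 * attacker-controlled sig->pkey_algo, decides the SM2 path. */
+	if (strcmp(pkey->pkey_algo, "sm2") == 0 && sig->data_size)
+		ret = cert_sig_digest_update(sig, tfm);
+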
+Fixes: 215525639631 ("X.509: support OSCCA SM2-with-SM3 certificate verification")
+Reported-by: Eric Biggers <ebiggers@google.com>
+Cc: stable@vger.kernel.org # v5.10+
+Signed-off-by: Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
+Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org>
+Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ crypto/asymmetric_keys/public_key.c | 7 +++++--
+ 1 file changed, 5 insertions(+), 2 deletions(-)
+
+diff --git a/crypto/asymmetric_keys/public_key.c b/crypto/asymmetric_keys/public_key.c
+index 7c9e6be35c30..2f8352e88860 100644
+--- a/crypto/asymmetric_keys/public_key.c
++++ b/crypto/asymmetric_keys/public_key.c
+@@ -304,6 +304,10 @@ static int cert_sig_digest_update(const struct public_key_signature *sig,
+
+ BUG_ON(!sig->data);
+
++ /* SM2 signatures always use the SM3 hash algorithm */
++ if (!sig->hash_algo || strcmp(sig->hash_algo, "sm3") != 0)
++ return -EINVAL;
++
+ ret = sm2_compute_z_digest(tfm_pkey, SM2_DEFAULT_USERID,
+ SM2_DEFAULT_USERID_LEN, dgst);
+ if (ret)
+@@ -414,8 +418,7 @@ int public_key_verify_signature(const struct public_key *pkey,
+ if (ret)
+ goto error_free_key;
+
+- if (sig->pkey_algo && strcmp(sig->pkey_algo, "sm2") == 0 &&
+- sig->data_size) {
++ if (strcmp(pkey->pkey_algo, "sm2") == 0 && sig->data_size) {
+ ret = cert_sig_digest_update(sig, tfm);
+ if (ret)
+ goto error_free_key;
+--
+2.35.1
+
--- /dev/null
+From e6b41f4cc7280b1aff9e2ef689f1e8a83a605357 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 16 May 2022 16:22:43 +0900
+Subject: ksmbd: add smbd max io size parameter
+
+From: Namjae Jeon <linkinjeon@kernel.org>
+
+[ Upstream commit 65bb45b97b578c8eed1ffa80caec84708df49729 ]
+
+Add the 'smbd max io size' parameter to adjust the smb-direct maximum
+read/write size.
+
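+The value supplied by user space (via the new ksmbd_startup_request field) is
+clamped to the supported range before use; a condensed view of the helper
+added below:
+
+	void init_smbd_max_io_size(unsigned int sz)
+	{
+		sz = clamp_val(sz, SMBD_MIN_IOSIZE, SMBD_MAX_IOSIZE);
+		smb_direct_max_read_write_size = sz;
+	}
+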
+Signed-off-by: Namjae Jeon <linkinjeon@kernel.org>
+Reviewed-by: Hyunchul Lee <hyc.lee@gmail.com>
+Signed-off-by: Steve French <stfrench@microsoft.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/ksmbd/ksmbd_netlink.h | 3 ++-
+ fs/ksmbd/transport_ipc.c | 3 +++
+ fs/ksmbd/transport_rdma.c | 8 +++++++-
+ fs/ksmbd/transport_rdma.h | 6 ++++++
+ 4 files changed, 18 insertions(+), 2 deletions(-)
+
+diff --git a/fs/ksmbd/ksmbd_netlink.h b/fs/ksmbd/ksmbd_netlink.h
+index ebe6ca08467a..52aa0adeb951 100644
+--- a/fs/ksmbd/ksmbd_netlink.h
++++ b/fs/ksmbd/ksmbd_netlink.h
+@@ -104,7 +104,8 @@ struct ksmbd_startup_request {
+ */
+ __u32 sub_auth[3]; /* Subauth value for Security ID */
+ __u32 smb2_max_credits; /* MAX credits */
+- __u32 reserved[128]; /* Reserved room */
++ __u32 smbd_max_io_size; /* smbd read write size */
++ __u32 reserved[127]; /* Reserved room */
+ __u32 ifc_list_sz; /* interfaces list size */
+ __s8 ____payload[];
+ };
+diff --git a/fs/ksmbd/transport_ipc.c b/fs/ksmbd/transport_ipc.c
+index 3ad6881e0f7e..7cb0eeb07c80 100644
+--- a/fs/ksmbd/transport_ipc.c
++++ b/fs/ksmbd/transport_ipc.c
+@@ -26,6 +26,7 @@
+ #include "mgmt/ksmbd_ida.h"
+ #include "connection.h"
+ #include "transport_tcp.h"
++#include "transport_rdma.h"
+
+ #define IPC_WAIT_TIMEOUT (2 * HZ)
+
+@@ -303,6 +304,8 @@ static int ipc_server_config_on_startup(struct ksmbd_startup_request *req)
+ init_smb2_max_trans_size(req->smb2_max_trans);
+ if (req->smb2_max_credits)
+ init_smb2_max_credits(req->smb2_max_credits);
++ if (req->smbd_max_io_size)
++ init_smbd_max_io_size(req->smbd_max_io_size);
+
+ ret = ksmbd_set_netbios_name(req->netbios_name);
+ ret |= ksmbd_set_server_string(req->server_string);
+diff --git a/fs/ksmbd/transport_rdma.c b/fs/ksmbd/transport_rdma.c
+index b44a5e584bac..afc66b9765e7 100644
+--- a/fs/ksmbd/transport_rdma.c
++++ b/fs/ksmbd/transport_rdma.c
+@@ -80,7 +80,7 @@ static int smb_direct_max_fragmented_recv_size = 1024 * 1024;
+ /* The maximum single-message size which can be received */
+ static int smb_direct_max_receive_size = 8192;
+
+-static int smb_direct_max_read_write_size = 8 * 1024 * 1024;
++static int smb_direct_max_read_write_size = SMBD_DEFAULT_IOSIZE;
+
+ static LIST_HEAD(smb_direct_device_list);
+ static DEFINE_RWLOCK(smb_direct_device_lock);
+@@ -214,6 +214,12 @@ struct smb_direct_rdma_rw_msg {
+ struct scatterlist sg_list[];
+ };
+
++void init_smbd_max_io_size(unsigned int sz)
++{
++ sz = clamp_val(sz, SMBD_MIN_IOSIZE, SMBD_MAX_IOSIZE);
++ smb_direct_max_read_write_size = sz;
++}
++
+ static inline int get_buf_page_count(void *buf, int size)
+ {
+ return DIV_ROUND_UP((uintptr_t)buf + size, PAGE_SIZE) -
+diff --git a/fs/ksmbd/transport_rdma.h b/fs/ksmbd/transport_rdma.h
+index 5567d93a6f96..e7b4e6790fab 100644
+--- a/fs/ksmbd/transport_rdma.h
++++ b/fs/ksmbd/transport_rdma.h
+@@ -7,6 +7,10 @@
+ #ifndef __KSMBD_TRANSPORT_RDMA_H__
+ #define __KSMBD_TRANSPORT_RDMA_H__
+
++#define SMBD_DEFAULT_IOSIZE (8 * 1024 * 1024)
++#define SMBD_MIN_IOSIZE (512 * 1024)
++#define SMBD_MAX_IOSIZE (16 * 1024 * 1024)
++
+ /* SMB DIRECT negotiation request packet [MS-SMBD] 2.2.1 */
+ struct smb_direct_negotiate_req {
+ __le16 min_version;
+@@ -52,10 +56,12 @@ struct smb_direct_data_transfer {
+ int ksmbd_rdma_init(void);
+ void ksmbd_rdma_destroy(void);
+ bool ksmbd_rdma_capable_netdev(struct net_device *netdev);
++void init_smbd_max_io_size(unsigned int sz);
+ #else
+ static inline int ksmbd_rdma_init(void) { return 0; }
+ static inline int ksmbd_rdma_destroy(void) { return 0; }
+ static inline bool ksmbd_rdma_capable_netdev(struct net_device *netdev) { return false; }
++static inline void init_smbd_max_io_size(unsigned int sz) { }
+ #endif
+
+ #endif /* __KSMBD_TRANSPORT_RDMA_H__ */
+--
+2.35.1
+
--- /dev/null
+From 1149a190b27509bf700591539f0cd164592d4ce0 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 16 May 2022 16:23:28 +0900
+Subject: ksmbd: fix wrong smbd max read/write size check
+
+From: Namjae Jeon <linkinjeon@kernel.org>
+
+[ Upstream commit 7a84399e1ce3f5f2fbec3e7dd93459ba25badc2f ]
+
+The smb-direct max read/write size can differ from the smb2 max
+read/write size, so smb2_read() can return an error because of the
+wrong max size check. This patch uses smb_direct_max_read_write_size
+for this check in smb-direct read/write.
+
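+A condensed view of how the per-request limit is now chosen by channel type
+(assembled from the hunks below, error handling elided):
+
+	unsigned int max_read_size = conn->vals->max_read_size;
+
+	if (req->Channel == SMB2_CHANNEL_RDMA_V1_INVALIDATE ||
+	    req->Channel == SMB2_CHANNEL_RDMA_V1)
+		max_read_size = get_smbd_max_read_write_size();
+
+	if (length > max_read_size)
+		err = -EINVAL;
+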
+Signed-off-by: Namjae Jeon <linkinjeon@kernel.org>
+Reviewed-by: Hyunchul Lee <hyc.lee@gmail.com>
+Signed-off-by: Steve French <stfrench@microsoft.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/ksmbd/smb2pdu.c | 39 +++++++++++++++++++++++++--------------
+ fs/ksmbd/transport_rdma.c | 5 +++++
+ fs/ksmbd/transport_rdma.h | 2 ++
+ 3 files changed, 32 insertions(+), 14 deletions(-)
+
+diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c
+index 8f86b8d6765f..6c8dd718b5db 100644
+--- a/fs/ksmbd/smb2pdu.c
++++ b/fs/ksmbd/smb2pdu.c
+@@ -6194,6 +6194,8 @@ int smb2_read(struct ksmbd_work *work)
+ size_t length, mincount;
+ ssize_t nbytes = 0, remain_bytes = 0;
+ int err = 0;
++ bool is_rdma_channel = false;
++ unsigned int max_read_size = conn->vals->max_read_size;
+
+ WORK_BUFFERS(work, req, rsp);
+
+@@ -6205,6 +6207,11 @@ int smb2_read(struct ksmbd_work *work)
+
+ if (req->Channel == SMB2_CHANNEL_RDMA_V1_INVALIDATE ||
+ req->Channel == SMB2_CHANNEL_RDMA_V1) {
++ is_rdma_channel = true;
++ max_read_size = get_smbd_max_read_write_size();
++ }
++
++ if (is_rdma_channel == true) {
+ unsigned int ch_offset = le16_to_cpu(req->ReadChannelInfoOffset);
+
+ if (ch_offset < offsetof(struct smb2_read_req, Buffer)) {
+@@ -6236,9 +6243,9 @@ int smb2_read(struct ksmbd_work *work)
+ length = le32_to_cpu(req->Length);
+ mincount = le32_to_cpu(req->MinimumCount);
+
+- if (length > conn->vals->max_read_size) {
++ if (length > max_read_size) {
+ ksmbd_debug(SMB, "limiting read size to max size(%u)\n",
+- conn->vals->max_read_size);
++ max_read_size);
+ err = -EINVAL;
+ goto out;
+ }
+@@ -6270,8 +6277,7 @@ int smb2_read(struct ksmbd_work *work)
+ ksmbd_debug(SMB, "nbytes %zu, offset %lld mincount %zu\n",
+ nbytes, offset, mincount);
+
+- if (req->Channel == SMB2_CHANNEL_RDMA_V1_INVALIDATE ||
+- req->Channel == SMB2_CHANNEL_RDMA_V1) {
++ if (is_rdma_channel == true) {
+ /* write data to the client using rdma channel */
+ remain_bytes = smb2_read_rdma_channel(work, req,
+ work->aux_payload_buf,
+@@ -6432,8 +6438,9 @@ int smb2_write(struct ksmbd_work *work)
+ size_t length;
+ ssize_t nbytes;
+ char *data_buf;
+- bool writethrough = false;
++ bool writethrough = false, is_rdma_channel = false;
+ int err = 0;
++ unsigned int max_write_size = work->conn->vals->max_write_size;
+
+ WORK_BUFFERS(work, req, rsp);
+
+@@ -6442,8 +6449,17 @@ int smb2_write(struct ksmbd_work *work)
+ return smb2_write_pipe(work);
+ }
+
++ offset = le64_to_cpu(req->Offset);
++ length = le32_to_cpu(req->Length);
++
+ if (req->Channel == SMB2_CHANNEL_RDMA_V1 ||
+ req->Channel == SMB2_CHANNEL_RDMA_V1_INVALIDATE) {
++ is_rdma_channel = true;
++ max_write_size = get_smbd_max_read_write_size();
++ length = le32_to_cpu(req->RemainingBytes);
++ }
++
++ if (is_rdma_channel == true) {
+ unsigned int ch_offset = le16_to_cpu(req->WriteChannelInfoOffset);
+
+ if (req->Length != 0 || req->DataOffset != 0 ||
+@@ -6478,12 +6494,9 @@ int smb2_write(struct ksmbd_work *work)
+ goto out;
+ }
+
+- offset = le64_to_cpu(req->Offset);
+- length = le32_to_cpu(req->Length);
+-
+- if (length > work->conn->vals->max_write_size) {
++ if (length > max_write_size) {
+ ksmbd_debug(SMB, "limiting write size to max size(%u)\n",
+- work->conn->vals->max_write_size);
++ max_write_size);
+ err = -EINVAL;
+ goto out;
+ }
+@@ -6491,8 +6504,7 @@ int smb2_write(struct ksmbd_work *work)
+ if (le32_to_cpu(req->Flags) & SMB2_WRITEFLAG_WRITE_THROUGH)
+ writethrough = true;
+
+- if (req->Channel != SMB2_CHANNEL_RDMA_V1 &&
+- req->Channel != SMB2_CHANNEL_RDMA_V1_INVALIDATE) {
++ if (is_rdma_channel == false) {
+ if ((u64)le16_to_cpu(req->DataOffset) + length >
+ get_rfc1002_len(work->request_buf)) {
+ pr_err("invalid write data offset %u, smb_len %u\n",
+@@ -6518,8 +6530,7 @@ int smb2_write(struct ksmbd_work *work)
+ /* read data from the client using rdma channel, and
+ * write the data.
+ */
+- nbytes = smb2_write_rdma_channel(work, req, fp, offset,
+- le32_to_cpu(req->RemainingBytes),
++ nbytes = smb2_write_rdma_channel(work, req, fp, offset, length,
+ writethrough);
+ if (nbytes < 0) {
+ err = (int)nbytes;
+diff --git a/fs/ksmbd/transport_rdma.c b/fs/ksmbd/transport_rdma.c
+index afc66b9765e7..c6af8d89b7f7 100644
+--- a/fs/ksmbd/transport_rdma.c
++++ b/fs/ksmbd/transport_rdma.c
+@@ -220,6 +220,11 @@ void init_smbd_max_io_size(unsigned int sz)
+ smb_direct_max_read_write_size = sz;
+ }
+
++unsigned int get_smbd_max_read_write_size(void)
++{
++ return smb_direct_max_read_write_size;
++}
++
+ static inline int get_buf_page_count(void *buf, int size)
+ {
+ return DIV_ROUND_UP((uintptr_t)buf + size, PAGE_SIZE) -
+diff --git a/fs/ksmbd/transport_rdma.h b/fs/ksmbd/transport_rdma.h
+index e7b4e6790fab..77aee4e5c9dc 100644
+--- a/fs/ksmbd/transport_rdma.h
++++ b/fs/ksmbd/transport_rdma.h
+@@ -57,11 +57,13 @@ int ksmbd_rdma_init(void);
+ void ksmbd_rdma_destroy(void);
+ bool ksmbd_rdma_capable_netdev(struct net_device *netdev);
+ void init_smbd_max_io_size(unsigned int sz);
++unsigned int get_smbd_max_read_write_size(void);
+ #else
+ static inline int ksmbd_rdma_init(void) { return 0; }
+ static inline int ksmbd_rdma_destroy(void) { return 0; }
+ static inline bool ksmbd_rdma_capable_netdev(struct net_device *netdev) { return false; }
+ static inline void init_smbd_max_io_size(unsigned int sz) { }
++static inline unsigned int get_smbd_max_read_write_size(void) { return 0; }
+ #endif
+
+ #endif /* __KSMBD_TRANSPORT_RDMA_H__ */
+--
+2.35.1
+
--- /dev/null
+From 79288fb9f5ec9fb25c6f827a60c7233dece972c4 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 28 Jul 2022 23:41:51 +0900
+Subject: ksmbd: prevent out of bound read for SMB2_WRITE
+
+From: Hyunchul Lee <hyc.lee@gmail.com>
+
+[ Upstream commit ac60778b87e45576d7bfdbd6f53df902654e6f09 ]
+
+Memory read out of bounds can end up written to a file
+if DataOffset is 0 and Length is too large
+in an SMB2_WRITE request of a compound request.
+
+To prevent this, when checking the length of
+the data area of SMB2_WRITE in smb2_get_data_area_len(),
+require DataOffset to be at least the size of
+the SMB2 header plus the size of the SMB2_WRITE header.
+
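+A condensed sketch of the lower bound now enforced in
+smb2_get_data_area_len() (taken from the hunk below):
+
+	if (((struct smb2_write_req *)hdr)->DataOffset ||
+	    ((struct smb2_write_req *)hdr)->Length) {
+		*off = max_t(unsigned int,
+			     le16_to_cpu(((struct smb2_write_req *)hdr)->DataOffset),
+			     offsetof(struct smb2_write_req, Buffer));
+		*len = le32_to_cpu(((struct smb2_write_req *)hdr)->Length);
+	}
+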
+This bug can lead to an oops looking something like:
+
+[ 798.008715] BUG: KASAN: slab-out-of-bounds in copy_page_from_iter_atomic+0xd3d/0x14b0
+[ 798.008724] Read of size 252 at addr ffff88800f863e90 by task kworker/0:2/2859
+...
+[ 798.008754] Call Trace:
+[ 798.008756] <TASK>
+[ 798.008759] dump_stack_lvl+0x49/0x5f
+[ 798.008764] print_report.cold+0x5e/0x5cf
+[ 798.008768] ? __filemap_get_folio+0x285/0x6d0
+[ 798.008774] ? copy_page_from_iter_atomic+0xd3d/0x14b0
+[ 798.008777] kasan_report+0xaa/0x120
+[ 798.008781] ? copy_page_from_iter_atomic+0xd3d/0x14b0
+[ 798.008784] kasan_check_range+0x100/0x1e0
+[ 798.008788] memcpy+0x24/0x60
+[ 798.008792] copy_page_from_iter_atomic+0xd3d/0x14b0
+[ 798.008795] ? pagecache_get_page+0x53/0x160
+[ 798.008799] ? iov_iter_get_pages_alloc+0x1590/0x1590
+[ 798.008803] ? ext4_write_begin+0xfc0/0xfc0
+[ 798.008807] ? current_time+0x72/0x210
+[ 798.008811] generic_perform_write+0x2c8/0x530
+[ 798.008816] ? filemap_fdatawrite_wbc+0x180/0x180
+[ 798.008820] ? down_write+0xb4/0x120
+[ 798.008824] ? down_write_killable+0x130/0x130
+[ 798.008829] ext4_buffered_write_iter+0x137/0x2c0
+[ 798.008833] ext4_file_write_iter+0x40b/0x1490
+[ 798.008837] ? __fsnotify_parent+0x275/0xb20
+[ 798.008842] ? __fsnotify_update_child_dentry_flags+0x2c0/0x2c0
+[ 798.008846] ? ext4_buffered_write_iter+0x2c0/0x2c0
+[ 798.008851] __kernel_write+0x3a1/0xa70
+[ 798.008855] ? __x64_sys_preadv2+0x160/0x160
+[ 798.008860] ? security_file_permission+0x4a/0xa0
+[ 798.008865] kernel_write+0xbb/0x360
+[ 798.008869] ksmbd_vfs_write+0x27e/0xb90 [ksmbd]
+[ 798.008881] ? ksmbd_vfs_read+0x830/0x830 [ksmbd]
+[ 798.008892] ? _raw_read_unlock+0x2a/0x50
+[ 798.008896] smb2_write+0xb45/0x14e0 [ksmbd]
+[ 798.008909] ? __kasan_check_write+0x14/0x20
+[ 798.008912] ? _raw_spin_lock_bh+0xd0/0xe0
+[ 798.008916] ? smb2_read+0x15e0/0x15e0 [ksmbd]
+[ 798.008927] ? memcpy+0x4e/0x60
+[ 798.008931] ? _raw_spin_unlock+0x19/0x30
+[ 798.008934] ? ksmbd_smb2_check_message+0x16af/0x2350 [ksmbd]
+[ 798.008946] ? _raw_spin_lock_bh+0xe0/0xe0
+[ 798.008950] handle_ksmbd_work+0x30e/0x1020 [ksmbd]
+[ 798.008962] process_one_work+0x778/0x11c0
+[ 798.008966] ? _raw_spin_lock_irq+0x8e/0xe0
+[ 798.008970] worker_thread+0x544/0x1180
+[ 798.008973] ? __cpuidle_text_end+0x4/0x4
+[ 798.008977] kthread+0x282/0x320
+[ 798.008982] ? process_one_work+0x11c0/0x11c0
+[ 798.008985] ? kthread_complete_and_exit+0x30/0x30
+[ 798.008989] ret_from_fork+0x1f/0x30
+[ 798.008995] </TASK>
+
+Fixes: e2f34481b24d ("cifsd: add server-side procedures for SMB3")
+Cc: stable@vger.kernel.org
+Reported-by: zdi-disclosures@trendmicro.com # ZDI-CAN-17817
+Signed-off-by: Hyunchul Lee <hyc.lee@gmail.com>
+Acked-by: Namjae Jeon <linkinjeon@kernel.org>
+Signed-off-by: Steve French <stfrench@microsoft.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/ksmbd/smb2misc.c | 7 +++++--
+ fs/ksmbd/smb2pdu.c | 8 +++-----
+ 2 files changed, 8 insertions(+), 7 deletions(-)
+
+diff --git a/fs/ksmbd/smb2misc.c b/fs/ksmbd/smb2misc.c
+index 03bcd7ce0c75..6e25ace36568 100644
+--- a/fs/ksmbd/smb2misc.c
++++ b/fs/ksmbd/smb2misc.c
+@@ -131,8 +131,11 @@ static int smb2_get_data_area_len(unsigned int *off, unsigned int *len,
+ *len = le16_to_cpu(((struct smb2_read_req *)hdr)->ReadChannelInfoLength);
+ break;
+ case SMB2_WRITE:
+- if (((struct smb2_write_req *)hdr)->DataOffset) {
+- *off = le16_to_cpu(((struct smb2_write_req *)hdr)->DataOffset);
++ if (((struct smb2_write_req *)hdr)->DataOffset ||
++ ((struct smb2_write_req *)hdr)->Length) {
++ *off = max_t(unsigned int,
++ le16_to_cpu(((struct smb2_write_req *)hdr)->DataOffset),
++ offsetof(struct smb2_write_req, Buffer));
+ *len = le32_to_cpu(((struct smb2_write_req *)hdr)->Length);
+ break;
+ }
+diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c
+index 6c8dd718b5db..85a9ed7156ea 100644
+--- a/fs/ksmbd/smb2pdu.c
++++ b/fs/ksmbd/smb2pdu.c
+@@ -6505,14 +6505,12 @@ int smb2_write(struct ksmbd_work *work)
+ writethrough = true;
+
+ if (is_rdma_channel == false) {
+- if ((u64)le16_to_cpu(req->DataOffset) + length >
+- get_rfc1002_len(work->request_buf)) {
+- pr_err("invalid write data offset %u, smb_len %u\n",
+- le16_to_cpu(req->DataOffset),
+- get_rfc1002_len(work->request_buf));
++ if (le16_to_cpu(req->DataOffset) <
++ offsetof(struct smb2_write_req, Buffer)) {
+ err = -EINVAL;
+ goto out;
+ }
++
+ data_buf = (char *)(((char *)&req->hdr.ProtocolId) +
+ le16_to_cpu(req->DataOffset));
+
+--
+2.35.1
+
--- /dev/null
+From a2389c2ae23605f7b53ab49541ee17285fdf563e Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Sat, 30 Apr 2022 08:30:25 +0900
+Subject: ksmbd: smbd: change prototypes of RDMA read/write related functions
+
+From: Hyunchul Lee <hyc.lee@gmail.com>
+
+[ Upstream commit 1807abcf8778bcbbf584fe54da9ccbe9029c49bb ]
+
+Change the prototypes of the RDMA read/write
+operations to accept a pointer to, and the length of,
+an array of buffer descriptors.
+
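+The new signatures pass the raw channel-info descriptors straight through to
+the transport (a condensed view, taken from the hunks below):
+
+	int ksmbd_conn_rdma_read(struct ksmbd_conn *conn,
+				 void *buf, unsigned int buflen,
+				 struct smb2_buffer_desc_v1 *desc,
+				 unsigned int desc_len);
+	int ksmbd_conn_rdma_write(struct ksmbd_conn *conn,
+				  void *buf, unsigned int buflen,
+				  struct smb2_buffer_desc_v1 *desc,
+				  unsigned int desc_len);
+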
+Signed-off-by: Hyunchul Lee <hyc.lee@gmail.com>
+Acked-by: Namjae Jeon <linkinjeon@kernel.org>
+Signed-off-by: Steve French <stfrench@microsoft.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/ksmbd/connection.c | 20 ++++++++++----------
+ fs/ksmbd/connection.h | 27 ++++++++++++++++-----------
+ fs/ksmbd/smb2pdu.c | 23 ++++++++---------------
+ fs/ksmbd/transport_rdma.c | 30 +++++++++++++++++-------------
+ 4 files changed, 51 insertions(+), 49 deletions(-)
+
+diff --git a/fs/ksmbd/connection.c b/fs/ksmbd/connection.c
+index bc6050b67256..e8f476c5f189 100644
+--- a/fs/ksmbd/connection.c
++++ b/fs/ksmbd/connection.c
+@@ -205,31 +205,31 @@ int ksmbd_conn_write(struct ksmbd_work *work)
+ return 0;
+ }
+
+-int ksmbd_conn_rdma_read(struct ksmbd_conn *conn, void *buf,
+- unsigned int buflen, u32 remote_key, u64 remote_offset,
+- u32 remote_len)
++int ksmbd_conn_rdma_read(struct ksmbd_conn *conn,
++ void *buf, unsigned int buflen,
++ struct smb2_buffer_desc_v1 *desc,
++ unsigned int desc_len)
+ {
+ int ret = -EINVAL;
+
+ if (conn->transport->ops->rdma_read)
+ ret = conn->transport->ops->rdma_read(conn->transport,
+ buf, buflen,
+- remote_key, remote_offset,
+- remote_len);
++ desc, desc_len);
+ return ret;
+ }
+
+-int ksmbd_conn_rdma_write(struct ksmbd_conn *conn, void *buf,
+- unsigned int buflen, u32 remote_key,
+- u64 remote_offset, u32 remote_len)
++int ksmbd_conn_rdma_write(struct ksmbd_conn *conn,
++ void *buf, unsigned int buflen,
++ struct smb2_buffer_desc_v1 *desc,
++ unsigned int desc_len)
+ {
+ int ret = -EINVAL;
+
+ if (conn->transport->ops->rdma_write)
+ ret = conn->transport->ops->rdma_write(conn->transport,
+ buf, buflen,
+- remote_key, remote_offset,
+- remote_len);
++ desc, desc_len);
+ return ret;
+ }
+
+diff --git a/fs/ksmbd/connection.h b/fs/ksmbd/connection.h
+index 7a59aacb5daa..98c1cbe45ec9 100644
+--- a/fs/ksmbd/connection.h
++++ b/fs/ksmbd/connection.h
+@@ -122,11 +122,14 @@ struct ksmbd_transport_ops {
+ int (*writev)(struct ksmbd_transport *t, struct kvec *iovs, int niov,
+ int size, bool need_invalidate_rkey,
+ unsigned int remote_key);
+- int (*rdma_read)(struct ksmbd_transport *t, void *buf, unsigned int len,
+- u32 remote_key, u64 remote_offset, u32 remote_len);
+- int (*rdma_write)(struct ksmbd_transport *t, void *buf,
+- unsigned int len, u32 remote_key, u64 remote_offset,
+- u32 remote_len);
++ int (*rdma_read)(struct ksmbd_transport *t,
++ void *buf, unsigned int len,
++ struct smb2_buffer_desc_v1 *desc,
++ unsigned int desc_len);
++ int (*rdma_write)(struct ksmbd_transport *t,
++ void *buf, unsigned int len,
++ struct smb2_buffer_desc_v1 *desc,
++ unsigned int desc_len);
+ };
+
+ struct ksmbd_transport {
+@@ -148,12 +151,14 @@ struct ksmbd_conn *ksmbd_conn_alloc(void);
+ void ksmbd_conn_free(struct ksmbd_conn *conn);
+ bool ksmbd_conn_lookup_dialect(struct ksmbd_conn *c);
+ int ksmbd_conn_write(struct ksmbd_work *work);
+-int ksmbd_conn_rdma_read(struct ksmbd_conn *conn, void *buf,
+- unsigned int buflen, u32 remote_key, u64 remote_offset,
+- u32 remote_len);
+-int ksmbd_conn_rdma_write(struct ksmbd_conn *conn, void *buf,
+- unsigned int buflen, u32 remote_key, u64 remote_offset,
+- u32 remote_len);
++int ksmbd_conn_rdma_read(struct ksmbd_conn *conn,
++ void *buf, unsigned int buflen,
++ struct smb2_buffer_desc_v1 *desc,
++ unsigned int desc_len);
++int ksmbd_conn_rdma_write(struct ksmbd_conn *conn,
++ void *buf, unsigned int buflen,
++ struct smb2_buffer_desc_v1 *desc,
++ unsigned int desc_len);
+ void ksmbd_conn_enqueue_request(struct ksmbd_work *work);
+ int ksmbd_conn_try_dequeue_request(struct ksmbd_work *work);
+ void ksmbd_conn_init_server_callbacks(struct ksmbd_conn_ops *ops);
+diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c
+index 5df87fe18905..8f86b8d6765f 100644
+--- a/fs/ksmbd/smb2pdu.c
++++ b/fs/ksmbd/smb2pdu.c
+@@ -6132,7 +6132,6 @@ static noinline int smb2_read_pipe(struct ksmbd_work *work)
+ static int smb2_set_remote_key_for_rdma(struct ksmbd_work *work,
+ struct smb2_buffer_desc_v1 *desc,
+ __le32 Channel,
+- __le16 ChannelInfoOffset,
+ __le16 ChannelInfoLength)
+ {
+ unsigned int i, ch_count;
+@@ -6158,7 +6157,8 @@ static int smb2_set_remote_key_for_rdma(struct ksmbd_work *work,
+
+ work->need_invalidate_rkey =
+ (Channel == SMB2_CHANNEL_RDMA_V1_INVALIDATE);
+- work->remote_key = le32_to_cpu(desc->token);
++ if (Channel == SMB2_CHANNEL_RDMA_V1_INVALIDATE)
++ work->remote_key = le32_to_cpu(desc->token);
+ return 0;
+ }
+
+@@ -6166,14 +6166,12 @@ static ssize_t smb2_read_rdma_channel(struct ksmbd_work *work,
+ struct smb2_read_req *req, void *data_buf,
+ size_t length)
+ {
+- struct smb2_buffer_desc_v1 *desc =
+- (struct smb2_buffer_desc_v1 *)&req->Buffer[0];
+ int err;
+
+ err = ksmbd_conn_rdma_write(work->conn, data_buf, length,
+- le32_to_cpu(desc->token),
+- le64_to_cpu(desc->offset),
+- le32_to_cpu(desc->length));
++ (struct smb2_buffer_desc_v1 *)
++ ((char *)req + le16_to_cpu(req->ReadChannelInfoOffset)),
++ le16_to_cpu(req->ReadChannelInfoLength));
+ if (err)
+ return err;
+
+@@ -6217,7 +6215,6 @@ int smb2_read(struct ksmbd_work *work)
+ (struct smb2_buffer_desc_v1 *)
+ ((char *)req + ch_offset),
+ req->Channel,
+- req->ReadChannelInfoOffset,
+ req->ReadChannelInfoLength);
+ if (err)
+ goto out;
+@@ -6395,21 +6392,18 @@ static ssize_t smb2_write_rdma_channel(struct ksmbd_work *work,
+ struct ksmbd_file *fp,
+ loff_t offset, size_t length, bool sync)
+ {
+- struct smb2_buffer_desc_v1 *desc;
+ char *data_buf;
+ int ret;
+ ssize_t nbytes;
+
+- desc = (struct smb2_buffer_desc_v1 *)&req->Buffer[0];
+-
+ data_buf = kvmalloc(length, GFP_KERNEL | __GFP_ZERO);
+ if (!data_buf)
+ return -ENOMEM;
+
+ ret = ksmbd_conn_rdma_read(work->conn, data_buf, length,
+- le32_to_cpu(desc->token),
+- le64_to_cpu(desc->offset),
+- le32_to_cpu(desc->length));
++ (struct smb2_buffer_desc_v1 *)
++ ((char *)req + le16_to_cpu(req->WriteChannelInfoOffset)),
++ le16_to_cpu(req->WriteChannelInfoLength));
+ if (ret < 0) {
+ kvfree(data_buf);
+ return ret;
+@@ -6461,7 +6455,6 @@ int smb2_write(struct ksmbd_work *work)
+ (struct smb2_buffer_desc_v1 *)
+ ((char *)req + ch_offset),
+ req->Channel,
+- req->WriteChannelInfoOffset,
+ req->WriteChannelInfoLength);
+ if (err)
+ goto out;
+diff --git a/fs/ksmbd/transport_rdma.c b/fs/ksmbd/transport_rdma.c
+index 3f5d13571694..479d279ee146 100644
+--- a/fs/ksmbd/transport_rdma.c
++++ b/fs/ksmbd/transport_rdma.c
+@@ -1352,14 +1352,18 @@ static void write_done(struct ib_cq *cq, struct ib_wc *wc)
+ read_write_done(cq, wc, DMA_TO_DEVICE);
+ }
+
+-static int smb_direct_rdma_xmit(struct smb_direct_transport *t, void *buf,
+- int buf_len, u32 remote_key, u64 remote_offset,
+- u32 remote_len, bool is_read)
++static int smb_direct_rdma_xmit(struct smb_direct_transport *t,
++ void *buf, int buf_len,
++ struct smb2_buffer_desc_v1 *desc,
++ unsigned int desc_len,
++ bool is_read)
+ {
+ struct smb_direct_rdma_rw_msg *msg;
+ int ret;
+ DECLARE_COMPLETION_ONSTACK(completion);
+ struct ib_send_wr *first_wr = NULL;
++ u32 remote_key = le32_to_cpu(desc[0].token);
++ u64 remote_offset = le64_to_cpu(desc[0].offset);
+
+ ret = wait_for_credits(t, &t->wait_rw_avail_ops, &t->rw_avail_ops);
+ if (ret < 0)
+@@ -1424,22 +1428,22 @@ static int smb_direct_rdma_xmit(struct smb_direct_transport *t, void *buf,
+ return ret;
+ }
+
+-static int smb_direct_rdma_write(struct ksmbd_transport *t, void *buf,
+- unsigned int buflen, u32 remote_key,
+- u64 remote_offset, u32 remote_len)
++static int smb_direct_rdma_write(struct ksmbd_transport *t,
++ void *buf, unsigned int buflen,
++ struct smb2_buffer_desc_v1 *desc,
++ unsigned int desc_len)
+ {
+ return smb_direct_rdma_xmit(smb_trans_direct_transfort(t), buf, buflen,
+- remote_key, remote_offset,
+- remote_len, false);
++ desc, desc_len, false);
+ }
+
+-static int smb_direct_rdma_read(struct ksmbd_transport *t, void *buf,
+- unsigned int buflen, u32 remote_key,
+- u64 remote_offset, u32 remote_len)
++static int smb_direct_rdma_read(struct ksmbd_transport *t,
++ void *buf, unsigned int buflen,
++ struct smb2_buffer_desc_v1 *desc,
++ unsigned int desc_len)
+ {
+ return smb_direct_rdma_xmit(smb_trans_direct_transfort(t), buf, buflen,
+- remote_key, remote_offset,
+- remote_len, true);
++ desc, desc_len, true);
+ }
+
+ static void smb_direct_disconnect(struct ksmbd_transport *t)
+--
+2.35.1
+
--- /dev/null
+From 874e8676953ec14919db995ac2610534d933d174 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Sat, 30 Apr 2022 08:30:26 +0900
+Subject: ksmbd: smbd: introduce read/write credits for RDMA read/write
+
+From: Hyunchul Lee <hyc.lee@gmail.com>
+
+[ Upstream commit ddbdc861e37c168cf2fb8a7b7477f5d18b4daf76 ]
+
+An SMB2_READ/SMB2_WRITE request has to be granted a number of rw
+credits equal to the number of pages the request wants to transfer
+divided by the maximum number of pages which can be registered with
+one MR to read and write a file.
+Also allocate enough RDMA resources for the maximum
+number of rw credits allowed by ksmbd.
+
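+A condensed view of the credit accounting added below (names taken from the
+hunks that follow):
+
+	static int calc_rw_credits(struct smb_direct_transport *t,
+				   char *buf, unsigned int len)
+	{
+		/* One credit covers the pages one MR can register. */
+		return DIV_ROUND_UP(get_buf_page_count(buf, len),
+				    t->pages_per_rw_credit);
+	}
+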
+Signed-off-by: Hyunchul Lee <hyc.lee@gmail.com>
+Acked-by: Namjae Jeon <linkinjeon@kernel.org>
+Signed-off-by: Steve French <stfrench@microsoft.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/ksmbd/transport_rdma.c | 120 ++++++++++++++++++++++----------------
+ 1 file changed, 71 insertions(+), 49 deletions(-)
+
+diff --git a/fs/ksmbd/transport_rdma.c b/fs/ksmbd/transport_rdma.c
+index 479d279ee146..b44a5e584bac 100644
+--- a/fs/ksmbd/transport_rdma.c
++++ b/fs/ksmbd/transport_rdma.c
+@@ -80,9 +80,7 @@ static int smb_direct_max_fragmented_recv_size = 1024 * 1024;
+ /* The maximum single-message size which can be received */
+ static int smb_direct_max_receive_size = 8192;
+
+-static int smb_direct_max_read_write_size = 524224;
+-
+-static int smb_direct_max_outstanding_rw_ops = 8;
++static int smb_direct_max_read_write_size = 8 * 1024 * 1024;
+
+ static LIST_HEAD(smb_direct_device_list);
+ static DEFINE_RWLOCK(smb_direct_device_lock);
+@@ -147,10 +145,12 @@ struct smb_direct_transport {
+ atomic_t send_credits;
+ spinlock_t lock_new_recv_credits;
+ int new_recv_credits;
+- atomic_t rw_avail_ops;
++ int max_rw_credits;
++ int pages_per_rw_credit;
++ atomic_t rw_credits;
+
+ wait_queue_head_t wait_send_credits;
+- wait_queue_head_t wait_rw_avail_ops;
++ wait_queue_head_t wait_rw_credits;
+
+ mempool_t *sendmsg_mempool;
+ struct kmem_cache *sendmsg_cache;
+@@ -377,7 +377,7 @@ static struct smb_direct_transport *alloc_transport(struct rdma_cm_id *cm_id)
+ t->reassembly_queue_length = 0;
+ init_waitqueue_head(&t->wait_reassembly_queue);
+ init_waitqueue_head(&t->wait_send_credits);
+- init_waitqueue_head(&t->wait_rw_avail_ops);
++ init_waitqueue_head(&t->wait_rw_credits);
+
+ spin_lock_init(&t->receive_credit_lock);
+ spin_lock_init(&t->recvmsg_queue_lock);
+@@ -984,18 +984,19 @@ static int smb_direct_flush_send_list(struct smb_direct_transport *t,
+ }
+
+ static int wait_for_credits(struct smb_direct_transport *t,
+- wait_queue_head_t *waitq, atomic_t *credits)
++ wait_queue_head_t *waitq, atomic_t *total_credits,
++ int needed)
+ {
+ int ret;
+
+ do {
+- if (atomic_dec_return(credits) >= 0)
++ if (atomic_sub_return(needed, total_credits) >= 0)
+ return 0;
+
+- atomic_inc(credits);
++ atomic_add(needed, total_credits);
+ ret = wait_event_interruptible(*waitq,
+- atomic_read(credits) > 0 ||
+- t->status != SMB_DIRECT_CS_CONNECTED);
++ atomic_read(total_credits) >= needed ||
++ t->status != SMB_DIRECT_CS_CONNECTED);
+
+ if (t->status != SMB_DIRECT_CS_CONNECTED)
+ return -ENOTCONN;
+@@ -1016,7 +1017,19 @@ static int wait_for_send_credits(struct smb_direct_transport *t,
+ return ret;
+ }
+
+- return wait_for_credits(t, &t->wait_send_credits, &t->send_credits);
++ return wait_for_credits(t, &t->wait_send_credits, &t->send_credits, 1);
++}
++
++static int wait_for_rw_credits(struct smb_direct_transport *t, int credits)
++{
++ return wait_for_credits(t, &t->wait_rw_credits, &t->rw_credits, credits);
++}
++
++static int calc_rw_credits(struct smb_direct_transport *t,
++ char *buf, unsigned int len)
++{
++ return DIV_ROUND_UP(get_buf_page_count(buf, len),
++ t->pages_per_rw_credit);
+ }
+
+ static int smb_direct_create_header(struct smb_direct_transport *t,
+@@ -1332,8 +1345,8 @@ static void read_write_done(struct ib_cq *cq, struct ib_wc *wc,
+ smb_direct_disconnect_rdma_connection(t);
+ }
+
+- if (atomic_inc_return(&t->rw_avail_ops) > 0)
+- wake_up(&t->wait_rw_avail_ops);
++ if (atomic_inc_return(&t->rw_credits) > 0)
++ wake_up(&t->wait_rw_credits);
+
+ rdma_rw_ctx_destroy(&msg->rw_ctx, t->qp, t->qp->port,
+ msg->sg_list, msg->sgt.nents, dir);
+@@ -1364,8 +1377,10 @@ static int smb_direct_rdma_xmit(struct smb_direct_transport *t,
+ struct ib_send_wr *first_wr = NULL;
+ u32 remote_key = le32_to_cpu(desc[0].token);
+ u64 remote_offset = le64_to_cpu(desc[0].offset);
++ int credits_needed;
+
+- ret = wait_for_credits(t, &t->wait_rw_avail_ops, &t->rw_avail_ops);
++ credits_needed = calc_rw_credits(t, buf, buf_len);
++ ret = wait_for_rw_credits(t, credits_needed);
+ if (ret < 0)
+ return ret;
+
+@@ -1373,7 +1388,7 @@ static int smb_direct_rdma_xmit(struct smb_direct_transport *t,
+ msg = kmalloc(offsetof(struct smb_direct_rdma_rw_msg, sg_list) +
+ sizeof(struct scatterlist) * SG_CHUNK_SIZE, GFP_KERNEL);
+ if (!msg) {
+- atomic_inc(&t->rw_avail_ops);
++ atomic_add(credits_needed, &t->rw_credits);
+ return -ENOMEM;
+ }
+
+@@ -1382,7 +1397,7 @@ static int smb_direct_rdma_xmit(struct smb_direct_transport *t,
+ get_buf_page_count(buf, buf_len),
+ msg->sg_list, SG_CHUNK_SIZE);
+ if (ret) {
+- atomic_inc(&t->rw_avail_ops);
++ atomic_add(credits_needed, &t->rw_credits);
+ kfree(msg);
+ return -ENOMEM;
+ }
+@@ -1418,7 +1433,7 @@ static int smb_direct_rdma_xmit(struct smb_direct_transport *t,
+ return 0;
+
+ err:
+- atomic_inc(&t->rw_avail_ops);
++ atomic_add(credits_needed, &t->rw_credits);
+ if (first_wr)
+ rdma_rw_ctx_destroy(&msg->rw_ctx, t->qp, t->qp->port,
+ msg->sg_list, msg->sgt.nents,
+@@ -1643,11 +1658,19 @@ static int smb_direct_prepare_negotiation(struct smb_direct_transport *t)
+ return ret;
+ }
+
++static unsigned int smb_direct_get_max_fr_pages(struct smb_direct_transport *t)
++{
++ return min_t(unsigned int,
++ t->cm_id->device->attrs.max_fast_reg_page_list_len,
++ 256);
++}
++
+ static int smb_direct_init_params(struct smb_direct_transport *t,
+ struct ib_qp_cap *cap)
+ {
+ struct ib_device *device = t->cm_id->device;
+- int max_send_sges, max_pages, max_rw_wrs, max_send_wrs;
++ int max_send_sges, max_rw_wrs, max_send_wrs;
++ unsigned int max_sge_per_wr, wrs_per_credit;
+
+ /* need 2 more sge. because a SMB_DIRECT header will be mapped,
+ * and maybe a send buffer could be not page aligned.
+@@ -1659,25 +1682,31 @@ static int smb_direct_init_params(struct smb_direct_transport *t,
+ return -EINVAL;
+ }
+
+- /*
+- * allow smb_direct_max_outstanding_rw_ops of in-flight RDMA
+- * read/writes. HCA guarantees at least max_send_sge of sges for
+- * a RDMA read/write work request, and if memory registration is used,
+- * we need reg_mr, local_inv wrs for each read/write.
++ /* Calculate the number of work requests for RDMA R/W.
++ * The maximum number of pages which can be registered
++ * with one Memory region can be transferred with one
++ * R/W credit. And at least 4 work requests for each credit
++ * are needed for MR registration, RDMA R/W, local & remote
++ * MR invalidation.
+ */
+ t->max_rdma_rw_size = smb_direct_max_read_write_size;
+- max_pages = DIV_ROUND_UP(t->max_rdma_rw_size, PAGE_SIZE) + 1;
+- max_rw_wrs = DIV_ROUND_UP(max_pages, SMB_DIRECT_MAX_SEND_SGES);
+- max_rw_wrs += rdma_rw_mr_factor(device, t->cm_id->port_num,
+- max_pages) * 2;
+- max_rw_wrs *= smb_direct_max_outstanding_rw_ops;
++ t->pages_per_rw_credit = smb_direct_get_max_fr_pages(t);
++ t->max_rw_credits = DIV_ROUND_UP(t->max_rdma_rw_size,
++ (t->pages_per_rw_credit - 1) *
++ PAGE_SIZE);
++
++ max_sge_per_wr = min_t(unsigned int, device->attrs.max_send_sge,
++ device->attrs.max_sge_rd);
++ wrs_per_credit = max_t(unsigned int, 4,
++ DIV_ROUND_UP(t->pages_per_rw_credit,
++ max_sge_per_wr) + 1);
++ max_rw_wrs = t->max_rw_credits * wrs_per_credit;
+
+ max_send_wrs = smb_direct_send_credit_target + max_rw_wrs;
+ if (max_send_wrs > device->attrs.max_cqe ||
+ max_send_wrs > device->attrs.max_qp_wr) {
+- pr_err("consider lowering send_credit_target = %d, or max_outstanding_rw_ops = %d\n",
+- smb_direct_send_credit_target,
+- smb_direct_max_outstanding_rw_ops);
++ pr_err("consider lowering send_credit_target = %d\n",
++ smb_direct_send_credit_target);
+ pr_err("Possible CQE overrun, device reporting max_cqe %d max_qp_wr %d\n",
+ device->attrs.max_cqe, device->attrs.max_qp_wr);
+ return -EINVAL;
+@@ -1712,7 +1741,7 @@ static int smb_direct_init_params(struct smb_direct_transport *t,
+
+ t->send_credit_target = smb_direct_send_credit_target;
+ atomic_set(&t->send_credits, 0);
+- atomic_set(&t->rw_avail_ops, smb_direct_max_outstanding_rw_ops);
++ atomic_set(&t->rw_credits, t->max_rw_credits);
+
+ t->max_send_size = smb_direct_max_send_size;
+ t->max_recv_size = smb_direct_max_receive_size;
+@@ -1720,12 +1749,10 @@ static int smb_direct_init_params(struct smb_direct_transport *t,
+
+ cap->max_send_wr = max_send_wrs;
+ cap->max_recv_wr = t->recv_credit_max;
+- cap->max_send_sge = SMB_DIRECT_MAX_SEND_SGES;
++ cap->max_send_sge = max_sge_per_wr;
+ cap->max_recv_sge = SMB_DIRECT_MAX_RECV_SGES;
+ cap->max_inline_data = 0;
+- cap->max_rdma_ctxs =
+- rdma_rw_mr_factor(device, t->cm_id->port_num, max_pages) *
+- smb_direct_max_outstanding_rw_ops;
++ cap->max_rdma_ctxs = t->max_rw_credits;
+ return 0;
+ }
+
+@@ -1818,7 +1845,8 @@ static int smb_direct_create_qpair(struct smb_direct_transport *t,
+ }
+
+ t->send_cq = ib_alloc_cq(t->cm_id->device, t,
+- t->send_credit_target, 0, IB_POLL_WORKQUEUE);
++ smb_direct_send_credit_target + cap->max_rdma_ctxs,
++ 0, IB_POLL_WORKQUEUE);
+ if (IS_ERR(t->send_cq)) {
+ pr_err("Can't create RDMA send CQ\n");
+ ret = PTR_ERR(t->send_cq);
+@@ -1827,8 +1855,7 @@ static int smb_direct_create_qpair(struct smb_direct_transport *t,
+ }
+
+ t->recv_cq = ib_alloc_cq(t->cm_id->device, t,
+- cap->max_send_wr + cap->max_rdma_ctxs,
+- 0, IB_POLL_WORKQUEUE);
++ t->recv_credit_max, 0, IB_POLL_WORKQUEUE);
+ if (IS_ERR(t->recv_cq)) {
+ pr_err("Can't create RDMA recv CQ\n");
+ ret = PTR_ERR(t->recv_cq);
+@@ -1857,17 +1884,12 @@ static int smb_direct_create_qpair(struct smb_direct_transport *t,
+
+ pages_per_rw = DIV_ROUND_UP(t->max_rdma_rw_size, PAGE_SIZE) + 1;
+ if (pages_per_rw > t->cm_id->device->attrs.max_sgl_rd) {
+- int pages_per_mr, mr_count;
+-
+- pages_per_mr = min_t(int, pages_per_rw,
+- t->cm_id->device->attrs.max_fast_reg_page_list_len);
+- mr_count = DIV_ROUND_UP(pages_per_rw, pages_per_mr) *
+- atomic_read(&t->rw_avail_ops);
+- ret = ib_mr_pool_init(t->qp, &t->qp->rdma_mrs, mr_count,
+- IB_MR_TYPE_MEM_REG, pages_per_mr, 0);
++ ret = ib_mr_pool_init(t->qp, &t->qp->rdma_mrs,
++ t->max_rw_credits, IB_MR_TYPE_MEM_REG,
++ t->pages_per_rw_credit, 0);
+ if (ret) {
+ pr_err("failed to init mr pool count %d pages %d\n",
+- mr_count, pages_per_mr);
++ t->max_rw_credits, t->pages_per_rw_credit);
+ goto err;
+ }
+ }
+--
+2.35.1
+
--- /dev/null
+From e58f6941d18cf59076c318ceaa24694c385f008b Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 4 May 2022 15:40:10 +0200
+Subject: ksmbd: validate length in smb2_write()
+
+From: Marios Makassikis <mmakassikis@freebox.fr>
+
+[ Upstream commit 158a66b245739e15858de42c0ba60fcf3de9b8e6 ]
+
+The SMB2 Write packet contains data that is to be written
+to a file or to a pipe. Depending on the client, there may
+be padding between the header and the data field.
+Currently, the length is validated only in the case where
+padding is present.
+
+Since the DataOffset field always points to the beginning
+of the data, there is no need to have a special case for
+padding. By removing this, the length is validated in both
+cases.
+
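+For illustration, a minimal userspace sketch of the unified bounds test
+described above (names are illustrative, not the ksmbd symbols): because
+DataOffset is always relative to the start of the SMB2 header, a single
+offset-plus-length comparison against the RFC1002 length covers both the
+padded and unpadded layouts.
+
+  #include <stdint.h>
+  #include <stdbool.h>
+
+  /* Widen before adding so DataOffset + Length cannot wrap around. */
+  static bool write_range_ok(uint16_t data_offset, uint32_t length,
+                             uint32_t smb_len /* RFC1002 length */)
+  {
+          return (uint64_t)data_offset + length <= smb_len;
+  }
+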
+Signed-off-by: Marios Makassikis <mmakassikis@freebox.fr>
+Acked-by: Namjae Jeon <linkinjeon@kernel.org>
+Signed-off-by: Steve French <stfrench@microsoft.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/ksmbd/smb2pdu.c | 49 ++++++++++++++++++----------------------------
+ 1 file changed, 19 insertions(+), 30 deletions(-)
+
+diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c
+index 0a76aa7fe5f9..5df87fe18905 100644
+--- a/fs/ksmbd/smb2pdu.c
++++ b/fs/ksmbd/smb2pdu.c
+@@ -6344,23 +6344,18 @@ static noinline int smb2_write_pipe(struct ksmbd_work *work)
+ length = le32_to_cpu(req->Length);
+ id = req->VolatileFileId;
+
+- if (le16_to_cpu(req->DataOffset) ==
+- offsetof(struct smb2_write_req, Buffer)) {
+- data_buf = (char *)&req->Buffer[0];
+- } else {
+- if ((u64)le16_to_cpu(req->DataOffset) + length >
+- get_rfc1002_len(work->request_buf)) {
+- pr_err("invalid write data offset %u, smb_len %u\n",
+- le16_to_cpu(req->DataOffset),
+- get_rfc1002_len(work->request_buf));
+- err = -EINVAL;
+- goto out;
+- }
+-
+- data_buf = (char *)(((char *)&req->hdr.ProtocolId) +
+- le16_to_cpu(req->DataOffset));
++ if ((u64)le16_to_cpu(req->DataOffset) + length >
++ get_rfc1002_len(work->request_buf)) {
++ pr_err("invalid write data offset %u, smb_len %u\n",
++ le16_to_cpu(req->DataOffset),
++ get_rfc1002_len(work->request_buf));
++ err = -EINVAL;
++ goto out;
+ }
+
++ data_buf = (char *)(((char *)&req->hdr.ProtocolId) +
++ le16_to_cpu(req->DataOffset));
++
+ rpc_resp = ksmbd_rpc_write(work->sess, id, data_buf, length);
+ if (rpc_resp) {
+ if (rpc_resp->flags == KSMBD_RPC_ENOTIMPLEMENTED) {
+@@ -6505,22 +6500,16 @@ int smb2_write(struct ksmbd_work *work)
+
+ if (req->Channel != SMB2_CHANNEL_RDMA_V1 &&
+ req->Channel != SMB2_CHANNEL_RDMA_V1_INVALIDATE) {
+- if (le16_to_cpu(req->DataOffset) ==
+- offsetof(struct smb2_write_req, Buffer)) {
+- data_buf = (char *)&req->Buffer[0];
+- } else {
+- if ((u64)le16_to_cpu(req->DataOffset) + length >
+- get_rfc1002_len(work->request_buf)) {
+- pr_err("invalid write data offset %u, smb_len %u\n",
+- le16_to_cpu(req->DataOffset),
+- get_rfc1002_len(work->request_buf));
+- err = -EINVAL;
+- goto out;
+- }
+-
+- data_buf = (char *)(((char *)&req->hdr.ProtocolId) +
+- le16_to_cpu(req->DataOffset));
++ if ((u64)le16_to_cpu(req->DataOffset) + length >
++ get_rfc1002_len(work->request_buf)) {
++ pr_err("invalid write data offset %u, smb_len %u\n",
++ le16_to_cpu(req->DataOffset),
++ get_rfc1002_len(work->request_buf));
++ err = -EINVAL;
++ goto out;
+ }
++ data_buf = (char *)(((char *)&req->hdr.ProtocolId) +
++ le16_to_cpu(req->DataOffset));
+
+ ksmbd_debug(SMB, "flags %u\n", le32_to_cpu(req->Flags));
+ if (le32_to_cpu(req->Flags) & SMB2_WRITEFLAG_WRITE_THROUGH)
+--
+2.35.1
+
--- /dev/null
+From 0c595ec21faf719ae7503cd05c3534eb55ebe586 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 22 Jul 2022 22:44:08 +0000
+Subject: KVM: nVMX: Attempt to load PERF_GLOBAL_CTRL on nVMX xfer iff it
+ exists
+
+From: Sean Christopherson <seanjc@google.com>
+
+[ Upstream commit 4496a6f9b45e8cd83343ad86a3984d614e22cf54 ]
+
+Attempt to load PERF_GLOBAL_CTRL during nested VM-Enter/VM-Exit if and
+only if the MSR exists (according to the guest vCPU model). KVM has very
+misguided handling of VM_{ENTRY,EXIT}_LOAD_IA32_PERF_GLOBAL_CTRL and
+attempts to force the nVMX MSR settings to match the vPMU model, i.e. to
+hide/expose the control based on whether or not the MSR exists from the
+guest's perspective.
+
+KVM's modifications fail to handle the scenario where the vPMU is hidden
+from the guest _after_ being exposed to the guest, e.g. by userspace
+doing multiple KVM_SET_CPUID2 calls, which is allowed if done before any
+KVM_RUN. nested_vmx_pmu_refresh() is called if and only if there's a
+recognized vPMU, i.e. KVM will leave the bits in the allow state and then
+ultimately reject the MSR load and WARN.
+
+KVM should not force the VMX MSRs in the first place. KVM taking control
+of the MSRs was a misguided attempt at mimicking what commit 5f76f6f5ff96
+("KVM: nVMX: Do not expose MPX VMX controls when guest MPX disabled",
+2018-10-01) did for MPX. However, the MPX commit was a workaround for
+another KVM bug and not something that should be imitated (and it should
+never have been done in the first place).
+
+In other words, KVM's ABI _should_ be that userspace has full control
+over the MSRs, at which point triggering the WARN that loading the MSR
+must not fail is trivial.
+
+The intent of the WARN is still valid; KVM has consistency checks to
+ensure that vmcs12->{guest,host}_ia32_perf_global_ctrl is valid. The
+problem is that '0' must be considered a valid value at all times, and so
+the simple/obvious solution is to just not actually load the MSR when it
+does not exist. It is userspace's responsibility to provide a sane vCPU
+model, i.e. KVM is well within its ABI and Intel's VMX architecture to
+skip the loads if the MSR does not exist.
+
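+A condensed sketch of the resulting entry/exit decision (simplified, not
+the KVM source; the msr_exists flag stands in for the per-vCPU PMU check):
+the load is attempted only when the control bit is set and the guest vCPU
+model actually has the MSR, so a WARN on failure remains meaningful.
+
+  #include <stdbool.h>
+  #include <stdint.h>
+
+  static bool should_load_perf_global_ctrl(uint64_t vm_controls,
+                                           uint64_t load_bit,
+                                           bool msr_exists)
+  {
+          /* Skip the load entirely for vCPU models without the MSR. */
+          return (vm_controls & load_bit) && msr_exists;
+  }
+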
+Fixes: 03a8871add95 ("KVM: nVMX: Expose load IA32_PERF_GLOBAL_CTRL VM-{Entry,Exit} control")
+Cc: stable@vger.kernel.org
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Message-Id: <20220722224409.1336532-5-seanjc@google.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/kvm/vmx/nested.c | 4 +++-
+ 1 file changed, 3 insertions(+), 1 deletion(-)
+
+diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
+index aa287302f991..5c62e552082a 100644
+--- a/arch/x86/kvm/vmx/nested.c
++++ b/arch/x86/kvm/vmx/nested.c
+@@ -2621,6 +2621,7 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
+ vcpu->arch.walk_mmu->inject_page_fault = vmx_inject_page_fault_nested;
+
+ if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) &&
++ intel_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu)) &&
+ WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL,
+ vmcs12->guest_ia32_perf_global_ctrl))) {
+ *entry_failure_code = ENTRY_FAIL_DEFAULT;
+@@ -4346,7 +4347,8 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
+ vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat);
+ vcpu->arch.pat = vmcs12->host_ia32_pat;
+ }
+- if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
++ if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) &&
++ intel_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu)))
+ WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL,
+ vmcs12->host_ia32_perf_global_ctrl));
+
+--
+2.35.1
+
--- /dev/null
+From afc6d9998f6e8d08603017905a5383988aa91bd0 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Sat, 21 May 2022 08:15:11 +0000
+Subject: KVM: set_msr_mce: Permit guests to ignore single-bit ECC errors
+
+From: Lev Kujawski <lkujaw@member.fsf.org>
+
+[ Upstream commit 0471a7bd1bca2a47a5f378f2222c5cf39ce94152 ]
+
+Certain guest operating systems (e.g., UNIXWARE) clear bit 0 of
+MC1_CTL to ignore single-bit ECC data errors. Single-bit ECC data
+errors are always correctable and thus are safe to ignore because they
+are informational in nature rather than signaling a loss of data
+integrity.
+
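+As a rough illustration (plain C, not the KVM source), the acceptance rule
+for an IA32_MCi_CTL write after this change is: all zeros, or all ones once
+the two tolerated clear bits (bit 10 for the K8 GART quirk, bit 0 for
+single-bit ECC masking) are filled back in.
+
+  #include <stdint.h>
+  #include <stdbool.h>
+
+  static bool mci_ctl_write_ok(uint64_t data)
+  {
+          return data == 0 ||
+                 (data | (1ULL << 10) | 1ULL) == ~(uint64_t)0;
+  }
+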
+Prior to this patch, these guests would crash upon writing MC1_CTL,
+with resultant error messages like the following:
+
+error: kvm run failed Operation not permitted
+EAX=fffffffe EBX=fffffffe ECX=00000404 EDX=ffffffff
+ESI=ffffffff EDI=00000001 EBP=fffdaba4 ESP=fffdab20
+EIP=c01333a5 EFL=00000246 [---Z-P-] CPL=0 II=0 A20=1 SMM=0 HLT=0
+ES =0108 00000000 ffffffff 00c09300 DPL=0 DS [-WA]
+CS =0100 00000000 ffffffff 00c09b00 DPL=0 CS32 [-RA]
+SS =0108 00000000 ffffffff 00c09300 DPL=0 DS [-WA]
+DS =0108 00000000 ffffffff 00c09300 DPL=0 DS [-WA]
+FS =0000 00000000 ffffffff 00c00000
+GS =0000 00000000 ffffffff 00c00000
+LDT=0118 c1026390 00000047 00008200 DPL=0 LDT
+TR =0110 ffff5af0 00000067 00008b00 DPL=0 TSS32-busy
+GDT= ffff5020 000002cf
+IDT= ffff52f0 000007ff
+CR0=8001003b CR2=00000000 CR3=0100a000 CR4=00000230
+DR0=00000000 DR1=00000000 DR2=00000000 DR3=00000000
+DR6=ffff0ff0 DR7=00000400
+EFER=0000000000000000
+Code=08 89 01 89 51 04 c3 8b 4c 24 08 8b 01 8b 51 04 8b 4c 24 04 <0f>
+30 c3 f7 05 a4 6d ff ff 10 00 00 00 74 03 0f 31 c3 33 c0 33 d2 c3 8d
+74 26 00 0f 31 c3
+
+Signed-off-by: Lev Kujawski <lkujaw@member.fsf.org>
+Message-Id: <20220521081511.187388-1-lkujaw@member.fsf.org>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/kvm/x86.c | 7 +++++--
+ 1 file changed, 5 insertions(+), 2 deletions(-)
+
+diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
+index 767a61e29f51..2316c978b598 100644
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -3226,10 +3226,13 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+ /* only 0 or all 1s can be written to IA32_MCi_CTL
+ * some Linux kernels though clear bit 10 in bank 4 to
+ * workaround a BIOS/GART TBL issue on AMD K8s, ignore
+- * this to avoid an uncatched #GP in the guest
++ * this to avoid an uncatched #GP in the guest.
++ *
++ * UNIXWARE clears bit 0 of MC1_CTL to ignore
++ * correctable, single-bit ECC data errors.
+ */
+ if ((offset & 0x3) == 0 &&
+- data != 0 && (data | (1 << 10)) != ~(u64)0)
++ data != 0 && (data | (1 << 10) | 1) != ~(u64)0)
+ return -1;
+
+ /* MCi_STATUS */
+--
+2.35.1
+
--- /dev/null
+From d297e223ab3f71968097240e39d44fb5cc478e26 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 22 Jul 2022 22:44:07 +0000
+Subject: KVM: VMX: Add helper to check if the guest PMU has PERF_GLOBAL_CTRL
+
+From: Sean Christopherson <seanjc@google.com>
+
+[ Upstream commit b663f0b5f3d665c261256d1f76e98f077c6e56af ]
+
+Add a helper to check if the guest PMU has PERF_GLOBAL_CTRL, which is
+unintuitive _and_ diverges from Intel's architecturally defined behavior.
+Even worse, KVM currently implements the check using two different (but
+equivalent) checks, _and_ there has been at least one attempt to add a
+_third_ flavor.
+
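+For reference, a sketch of the helper being introduced (structure name is
+illustrative): it encodes the KVM-specific convention that PERF_GLOBAL_CTRL
+is emulated only for "Architectural Performance Monitoring Version 2" and
+later, even though the SDM ties the MSR to any non-zero PMU version.
+
+  #include <stdbool.h>
+
+  struct kvm_pmu_sketch { int version; };
+
+  static bool pmu_has_perf_global_ctrl(const struct kvm_pmu_sketch *pmu)
+  {
+          /* KVM convention: the MSR exists for the guest only from v2 on. */
+          return pmu->version > 1;
+  }
+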
+Cc: stable@vger.kernel.org
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Message-Id: <20220722224409.1336532-4-seanjc@google.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/kvm/vmx/pmu_intel.c | 4 ++--
+ arch/x86/kvm/vmx/vmx.h | 12 ++++++++++++
+ 2 files changed, 14 insertions(+), 2 deletions(-)
+
+diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
+index 2cbd5f183ab5..8bd154f8c966 100644
+--- a/arch/x86/kvm/vmx/pmu_intel.c
++++ b/arch/x86/kvm/vmx/pmu_intel.c
+@@ -98,7 +98,7 @@ static bool intel_pmc_is_enabled(struct kvm_pmc *pmc)
+ {
+ struct kvm_pmu *pmu = pmc_to_pmu(pmc);
+
+- if (pmu->version < 2)
++ if (!intel_pmu_has_perf_global_ctrl(pmu))
+ return true;
+
+ return test_bit(pmc->idx, (unsigned long *)&pmu->global_ctrl);
+@@ -215,7 +215,7 @@ static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
+ case MSR_CORE_PERF_GLOBAL_STATUS:
+ case MSR_CORE_PERF_GLOBAL_CTRL:
+ case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
+- ret = pmu->version > 1;
++ return intel_pmu_has_perf_global_ctrl(pmu);
+ break;
+ default:
+ ret = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0) ||
+diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
+index 1e7f9453894b..93aa1f3ea01e 100644
+--- a/arch/x86/kvm/vmx/vmx.h
++++ b/arch/x86/kvm/vmx/vmx.h
+@@ -92,6 +92,18 @@ union vmx_exit_reason {
+ u32 full;
+ };
+
++static inline bool intel_pmu_has_perf_global_ctrl(struct kvm_pmu *pmu)
++{
++ /*
++ * Architecturally, Intel's SDM states that IA32_PERF_GLOBAL_CTRL is
++ * supported if "CPUID.0AH: EAX[7:0] > 0", i.e. if the PMU version is
++ * greater than zero. However, KVM only exposes and emulates the MSR
++ * to/for the guest if the guest PMU supports at least "Architectural
++ * Performance Monitoring Version 2".
++ */
++ return pmu->version > 1;
++}
++
+ #define vcpu_to_lbr_desc(vcpu) (&to_vmx(vcpu)->lbr_desc)
+ #define vcpu_to_lbr_records(vcpu) (&to_vmx(vcpu)->lbr_desc.records)
+
+--
+2.35.1
+
--- /dev/null
+From cbb6518aa4bc3a15da904a5d81ff02604a6fcbe8 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 22 Jul 2022 22:44:06 +0000
+Subject: KVM: VMX: Mark all PERF_GLOBAL_(OVF)_CTRL bits reserved if there's no
+ vPMU
+
+From: Sean Christopherson <seanjc@google.com>
+
+[ Upstream commit 93255bf92939d948bc86d81c6bb70bb0fecc5db1 ]
+
+Mark all MSR_CORE_PERF_GLOBAL_CTRL and MSR_CORE_PERF_GLOBAL_OVF_CTRL bits
+as reserved if there is no guest vPMU. The nVMX VM-Entry consistency
+checks do not check for a valid vPMU prior to consuming the masks via
+kvm_valid_perf_global_ctrl(), i.e. may incorrectly allow a non-zero mask
+to be loaded via VM-Enter or VM-Exit (well, attempted to be loaded, the
+actual MSR load will be rejected by intel_is_valid_msr()).
+
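+Why an all-ones mask is the right "no vPMU" default can be seen from a
+sketch of the consistency check (simplified; the real check lives in
+kvm_valid_perf_global_ctrl()): a value passes only if it sets no reserved
+bits, so with every bit reserved only zero is accepted.
+
+  #include <stdint.h>
+  #include <stdbool.h>
+
+  static bool perf_global_ctrl_valid(uint64_t reserved_mask, uint64_t data)
+  {
+          return (data & reserved_mask) == 0;
+  }
+  /* With reserved_mask == ~0ULL (no vPMU), only data == 0 passes. */
+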
+Fixes: f5132b01386b ("KVM: Expose a version 2 architectural PMU to a guests")
+Cc: stable@vger.kernel.org
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Message-Id: <20220722224409.1336532-3-seanjc@google.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/kvm/vmx/pmu_intel.c | 2 ++
+ 1 file changed, 2 insertions(+)
+
+diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
+index 040d598622e3..cd2d0454f8b0 100644
+--- a/arch/x86/kvm/vmx/pmu_intel.c
++++ b/arch/x86/kvm/vmx/pmu_intel.c
+@@ -488,6 +488,8 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
+ pmu->version = 0;
+ pmu->reserved_bits = 0xffffffff00200000ull;
+ pmu->raw_event_mask = X86_RAW_EVENT_MASK;
++ pmu->global_ctrl_mask = ~0ull;
++ pmu->global_ovf_ctrl_mask = ~0ull;
+ pmu->fixed_ctr_ctrl_mask = ~0ull;
+
+ entry = kvm_find_cpuid_entry(vcpu, 0xa, 0);
+--
+2.35.1
+
--- /dev/null
+From 6174683901bf4385f235ccf8923ece845be35a32 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 9 May 2022 18:22:02 +0800
+Subject: KVM: x86/pmu: Ignore pmu->global_ctrl check if vPMU doesn't support
+ global_ctrl
+
+From: Like Xu <likexu@tencent.com>
+
+[ Upstream commit 98defd2e17803263f49548fea930cfc974d505aa ]
+
+MSR_CORE_PERF_GLOBAL_CTRL is introduced as part of Architecture PMU V2,
+as indicated by Intel SDM 19.2.2 and the intel_is_valid_msr() function.
+
+So in the absence of global_ctrl support, all PMCs are enabled as AMD does.
+
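+A minimal sketch of the resulting enable check (illustrative, not the KVM
+code): without PERF_GLOBAL_CTRL there is no global gate, so every counter
+is treated as enabled, matching AMD's behavior.
+
+  #include <stdint.h>
+  #include <stdbool.h>
+
+  static bool pmc_enabled(int pmu_version, uint64_t global_ctrl,
+                          unsigned int idx)
+  {
+          if (pmu_version < 2)            /* no global control register */
+                  return true;
+          return (global_ctrl >> idx) & 1;
+  }
+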
+Signed-off-by: Like Xu <likexu@tencent.com>
+Message-Id: <20220509102204.62389-1-likexu@tencent.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/kvm/vmx/pmu_intel.c | 3 +++
+ 1 file changed, 3 insertions(+)
+
+diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
+index cd2d0454f8b0..2cbd5f183ab5 100644
+--- a/arch/x86/kvm/vmx/pmu_intel.c
++++ b/arch/x86/kvm/vmx/pmu_intel.c
+@@ -98,6 +98,9 @@ static bool intel_pmc_is_enabled(struct kvm_pmc *pmc)
+ {
+ struct kvm_pmu *pmu = pmc_to_pmu(pmc);
+
++ if (pmu->version < 2)
++ return true;
++
+ return test_bit(pmc->idx, (unsigned long *)&pmu->global_ctrl);
+ }
+
+--
+2.35.1
+
--- /dev/null
+From 1687eb7807a86765c388c89e08f961c30ccc30a4 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 11 Apr 2022 18:19:34 +0800
+Subject: KVM: x86/pmu: Introduce the ctrl_mask value for fixed counter
+
+From: Like Xu <like.xu@linux.intel.com>
+
+[ Upstream commit 2c985527dd8d283e786ad7a67e532ef7f6f00fac ]
+
+The mask value of the fixed counter control register should be dynamically
+adjusted to the number of fixed counters. This patch introduces a
+variable that holds the reserved bits of the fixed counter control
+register. This is a generic code refactoring.
+
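+The following self-contained snippet shows how such a reserved-bit mask can
+be derived (an illustration of the idea, not the KVM code): each fixed
+counter owns a 4-bit control field, of which bits 0, 1 and 3 (0xb) are
+writable here, so those bits are carved out of an all-ones mask for every
+counter the vCPU reports.
+
+  #include <stdint.h>
+  #include <stdio.h>
+
+  static uint64_t fixed_ctr_ctrl_reserved_mask(unsigned int nr_fixed_counters)
+  {
+          uint64_t mask = ~0ULL;
+          unsigned int i;
+
+          for (i = 0; i < nr_fixed_counters; i++)
+                  mask &= ~(0xbULL << (i * 4));
+          return mask;
+  }
+
+  int main(void)
+  {
+          /* Three fixed counters: prints 0xfffffffffffff444, the old literal. */
+          printf("%#llx\n", (unsigned long long)fixed_ctr_ctrl_reserved_mask(3));
+          return 0;
+  }
+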
+Co-developed-by: Luwei Kang <luwei.kang@intel.com>
+Signed-off-by: Luwei Kang <luwei.kang@intel.com>
+Signed-off-by: Like Xu <like.xu@linux.intel.com>
+Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Message-Id: <20220411101946.20262-6-likexu@tencent.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/include/asm/kvm_host.h | 1 +
+ arch/x86/kvm/vmx/pmu_intel.c | 6 +++++-
+ 2 files changed, 6 insertions(+), 1 deletion(-)
+
+diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
+index 57550a427789..35c7a1fce8ea 100644
+--- a/arch/x86/include/asm/kvm_host.h
++++ b/arch/x86/include/asm/kvm_host.h
+@@ -508,6 +508,7 @@ struct kvm_pmu {
+ unsigned nr_arch_fixed_counters;
+ unsigned available_event_types;
+ u64 fixed_ctr_ctrl;
++ u64 fixed_ctr_ctrl_mask;
+ u64 global_ctrl;
+ u64 global_status;
+ u64 counter_bitmask[2];
+diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
+index b82b6709d7a8..040d598622e3 100644
+--- a/arch/x86/kvm/vmx/pmu_intel.c
++++ b/arch/x86/kvm/vmx/pmu_intel.c
+@@ -395,7 +395,7 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+ case MSR_CORE_PERF_FIXED_CTR_CTRL:
+ if (pmu->fixed_ctr_ctrl == data)
+ return 0;
+- if (!(data & 0xfffffffffffff444ull)) {
++ if (!(data & pmu->fixed_ctr_ctrl_mask)) {
+ reprogram_fixed_counters(pmu, data);
+ return 0;
+ }
+@@ -479,6 +479,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
+ struct kvm_cpuid_entry2 *entry;
+ union cpuid10_eax eax;
+ union cpuid10_edx edx;
++ int i;
+
+ pmu->nr_arch_gp_counters = 0;
+ pmu->nr_arch_fixed_counters = 0;
+@@ -487,6 +488,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
+ pmu->version = 0;
+ pmu->reserved_bits = 0xffffffff00200000ull;
+ pmu->raw_event_mask = X86_RAW_EVENT_MASK;
++ pmu->fixed_ctr_ctrl_mask = ~0ull;
+
+ entry = kvm_find_cpuid_entry(vcpu, 0xa, 0);
+ if (!entry || !vcpu->kvm->arch.enable_pmu)
+@@ -522,6 +524,8 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
+ setup_fixed_pmc_eventsel(pmu);
+ }
+
++ for (i = 0; i < pmu->nr_arch_fixed_counters; i++)
++ pmu->fixed_ctr_ctrl_mask &= ~(0xbull << (i * 4));
+ pmu->global_ctrl = ((1ull << pmu->nr_arch_gp_counters) - 1) |
+ (((1ull << pmu->nr_arch_fixed_counters) - 1) << INTEL_PMC_IDX_FIXED);
+ pmu->global_ctrl_mask = ~pmu->global_ctrl;
+--
+2.35.1
+
--- /dev/null
+From e9cd8ca56097b6239965d4f48f6472c096fa12cf Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 12 May 2022 22:27:14 +0000
+Subject: KVM: x86: Signal #GP, not -EPERM, on bad WRMSR(MCi_CTL/STATUS)
+
+From: Sean Christopherson <seanjc@google.com>
+
+[ Upstream commit 2368048bf5c2ec4b604ac3431564071e89a0bc71 ]
+
+Return '1', not '-1', when handling an illegal WRMSR to a MCi_CTL or
+MCi_STATUS MSR. The behavior of "all zeros" or "all ones" for CTL MSRs
+is architectural, as is the "only zeros" behavior for STATUS MSRs. I.e.
+the intent is to inject a #GP, not exit to userspace due to an unhandled
+emulation case. Returning '-1' gets interpreted as -EPERM up the stack
+and effectively kills the guest.
+
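+For clarity, a sketch of the return-value convention this relies on
+(illustrative names, not KVM symbols): a positive return from the MSR
+handler is turned into a #GP injected into the guest, while a negative
+errno propagates as an internal error and effectively kills the guest.
+
+  enum wrmsr_outcome { WRMSR_OK, WRMSR_INJECT_GP, WRMSR_EXIT_TO_USERSPACE };
+
+  static enum wrmsr_outcome classify_set_msr_result(int r)
+  {
+          if (r == 0)
+                  return WRMSR_OK;
+          if (r > 0)
+                  return WRMSR_INJECT_GP;         /* e.g. the 'return 1' used below */
+          return WRMSR_EXIT_TO_USERSPACE;         /* negative errno, guest dies */
+  }
+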
+Fixes: 890ca9aefa78 ("KVM: Add MCE support")
+Fixes: 9ffd986c6e4e ("KVM: X86: #GP when guest attempts to write MCi_STATUS register w/o 0")
+Cc: stable@vger.kernel.org
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Reviewed-by: Jim Mattson <jmattson@google.com>
+Link: https://lore.kernel.org/r/20220512222716.4112548-2-seanjc@google.com
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/kvm/x86.c | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
+index 2316c978b598..0d6cea0d33a9 100644
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -3233,13 +3233,13 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+ */
+ if ((offset & 0x3) == 0 &&
+ data != 0 && (data | (1 << 10) | 1) != ~(u64)0)
+- return -1;
++ return 1;
+
+ /* MCi_STATUS */
+ if (!msr_info->host_initiated &&
+ (offset & 0x3) == 1 && data != 0) {
+ if (!can_set_mci_status(vcpu))
+- return -1;
++ return 1;
+ }
+
+ vcpu->arch.mce_banks[offset] = data;
+--
+2.35.1
+
--- /dev/null
+From 9219faa1ac767c0882f58d034a20086d6c06042e Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 10 May 2022 17:46:39 +0800
+Subject: locking/csd_lock: Change csdlock_debug from early_param to __setup
+
+From: Chen Zhongjin <chenzhongjin@huawei.com>
+
+[ Upstream commit 9c9b26b0df270d4f9246e483a44686fca951a29c ]
+
+The csdlock_debug kernel-boot parameter is parsed by the
+early_param() function csdlock_debug(). If set, csdlock_debug()
+invokes static_branch_enable() to enable the csd_lock_wait feature, which
+triggers a panic on arm64 for kernels built with CONFIG_SPARSEMEM=y and
+CONFIG_SPARSEMEM_VMEMMAP=n.
+
+With CONFIG_SPARSEMEM_VMEMMAP=n, __nr_to_section is called in
+static_key_enable() and returns NULL, resulting in a NULL dereference
+because mem_section is initialized only later in sparse_init().
+
+This is also a problem for powerpc because early_param() functions
+are invoked earlier than jump_label_init(), also resulting in
+static_key_enable() failures. These failures cause the warning "static
+key 'xxx' used before call to jump_label_init()".
+
+Thus, early_param() is too early for the csd_lock_wait feature to run
+static_branch_enable(), so change it to __setup() to fix these failures.
+
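+A hedged sketch of the __setup() form (argument parsing simplified; the
+actual handler may differ): __setup() handlers run late enough that
+static_branch_enable() is safe, and they return 1 to mark the option as
+consumed.
+
+  static int __init csdlock_debug_setup(char *str)
+  {
+          int val = 0;
+
+          get_option(&str, &val);                 /* assumes a simple 0/1 argument */
+          if (val)
+                  static_branch_enable(&csdlock_debug_enabled);
+
+          return 1;                               /* __setup: option handled */
+  }
+  __setup("csdlock_debug=", csdlock_debug_setup);
+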
+Fixes: 8d0968cc6b8f ("locking/csd_lock: Add boot parameter for controlling CSD lock debugging")
+Cc: stable@vger.kernel.org
+Reported-by: Chen jingwen <chenjingwen6@huawei.com>
+Signed-off-by: Chen Zhongjin <chenzhongjin@huawei.com>
+Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ kernel/smp.c | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+diff --git a/kernel/smp.c b/kernel/smp.c
+index 65a630f62363..381eb15cd28f 100644
+--- a/kernel/smp.c
++++ b/kernel/smp.c
+@@ -174,9 +174,9 @@ static int __init csdlock_debug(char *str)
+ if (val)
+ static_branch_enable(&csdlock_debug_enabled);
+
+- return 0;
++ return 1;
+ }
+-early_param("csdlock_debug", csdlock_debug);
++__setup("csdlock_debug=", csdlock_debug);
+
+ static DEFINE_PER_CPU(call_single_data_t *, cur_csd);
+ static DEFINE_PER_CPU(smp_call_func_t, cur_csd_func);
+--
+2.35.1
+
--- /dev/null
+From da88b2e18326479f78988cfb3e313dbb2286e447 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 14 Jul 2022 14:37:46 +0800
+Subject: mm/damon/reclaim: fix potential memory leak in damon_reclaim_init()
+
+From: Jianglei Nie <niejianglei2021@163.com>
+
+[ Upstream commit 188043c7f4f2bd662f2a55957d684fffa543e600 ]
+
+damon_reclaim_init() allocates a memory chunk for ctx with
+damon_new_ctx(). When damon_select_ops() fails, ctx is not released,
+which will lead to a memory leak.
+
+When damon_select_ops() fails, release the ctx with damon_destroy_ctx()
+to fix the memory leak.
+
+Link: https://lkml.kernel.org/r/20220714063746.2343549-1-niejianglei2021@163.com
+Fixes: 4d69c3457821 ("mm/damon/reclaim: use damon_select_ops() instead of damon_{v,p}a_set_operations()")
+Signed-off-by: Jianglei Nie <niejianglei2021@163.com>
+Reviewed-by: SeongJae Park <sj@kernel.org>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ mm/damon/reclaim.c | 4 +++-
+ 1 file changed, 3 insertions(+), 1 deletion(-)
+
+diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c
+index e34c4d0c4d93..11982685508e 100644
+--- a/mm/damon/reclaim.c
++++ b/mm/damon/reclaim.c
+@@ -384,8 +384,10 @@ static int __init damon_reclaim_init(void)
+ if (!ctx)
+ return -ENOMEM;
+
+- if (damon_select_ops(ctx, DAMON_OPS_PADDR))
++ if (damon_select_ops(ctx, DAMON_OPS_PADDR)) {
++ damon_destroy_ctx(ctx);
+ return -EINVAL;
++ }
+
+ ctx->callback.after_aggregation = damon_reclaim_after_aggregation;
+
+--
+2.35.1
+
--- /dev/null
+From 54577663faf8efe35cef8b782a6a2be7dbe01e35 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Sun, 10 Jul 2022 09:14:02 -0500
+Subject: net/9p: Initialize the iounit field during fid creation
+
+From: Tyler Hicks <tyhicks@linux.microsoft.com>
+
+[ Upstream commit aa7aeee169480e98cf41d83c01290a37e569be6d ]
+
+Ensure that the fid's iounit field is set to zero when a new fid is
+created. Certain 9P operations, such as OPEN and CREATE, allow the
+server to reply with an iounit size which the client code assigns to the
+p9_fid struct shortly after the fid is created by p9_fid_create(). On
+the other hand, an XATTRWALK operation doesn't allow for the server to
+specify an iounit value. The iounit field of the newly allocated p9_fid
+struct remained uninitialized in that case. Depending on allocation
+patterns, the iounit value could have been something reasonable that was
+carried over from previously freed fids or, in the worst case, could
+have been arbitrary values from non-fid related usages of the memory
+location.
+
+The bug was detected in the Windows Subsystem for Linux 2 (WSL2) kernel
+after the uninitialized iounit field resulted in the typical sequence of
+two getxattr(2) syscalls, one to get the size of an xattr and another
+after allocating a sufficiently sized buffer to fit the xattr value, to
+hit an unexpected ERANGE error in the second call to getxattr(2). An
+uninitialized iounit field would sometimes force rsize to be smaller
+than the xattr value size in p9_client_read_once() and the 9P server in
+WSL refused to chunk up the READ on the attr_fid and, instead, returned
+ERANGE to the client. The virtfs server in QEMU seems happy to chunk up
+the READ and this problem goes undetected there.
+
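+The userspace pattern that exposed the bug looks roughly like this (an
+illustration only; error handling trimmed): the second getxattr(2) call is
+the one that unexpectedly returned ERANGE on WSL2.
+
+  #include <sys/xattr.h>
+  #include <sys/types.h>
+  #include <stdlib.h>
+
+  static ssize_t read_xattr(const char *path, const char *name, char **out)
+  {
+          ssize_t sz = getxattr(path, name, NULL, 0);     /* query the size */
+          if (sz < 0)
+                  return sz;
+
+          *out = malloc(sz);
+          if (!*out)
+                  return -1;
+
+          return getxattr(path, name, *out, sz);          /* fetch the value */
+  }
+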
+Link: https://lkml.kernel.org/r/20220710141402.803295-1-tyhicks@linux.microsoft.com
+Fixes: ebf46264a004 ("fs/9p: Add support user. xattr")
+Cc: stable@vger.kernel.org
+Signed-off-by: Tyler Hicks <tyhicks@linux.microsoft.com>
+Reviewed-by: Christian Schoenebeck <linux_oss@crudebyte.com>
+Signed-off-by: Dominique Martinet <asmadeus@codewreck.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/9p/client.c | 5 +----
+ 1 file changed, 1 insertion(+), 4 deletions(-)
+
+diff --git a/net/9p/client.c b/net/9p/client.c
+index a36a40137caa..87cde948f628 100644
+--- a/net/9p/client.c
++++ b/net/9p/client.c
+@@ -886,16 +886,13 @@ static struct p9_fid *p9_fid_create(struct p9_client *clnt)
+ struct p9_fid *fid;
+
+ p9_debug(P9_DEBUG_FID, "clnt %p\n", clnt);
+- fid = kmalloc(sizeof(*fid), GFP_KERNEL);
++ fid = kzalloc(sizeof(*fid), GFP_KERNEL);
+ if (!fid)
+ return NULL;
+
+- memset(&fid->qid, 0, sizeof(fid->qid));
+ fid->mode = -1;
+ fid->uid = current_fsuid();
+ fid->clnt = clnt;
+- fid->rdir = NULL;
+- fid->fid = 0;
+ refcount_set(&fid->count, 1);
+
+ idr_preload(GFP_KERNEL);
+--
+2.35.1
+
--- /dev/null
+From dbe842dddf2be13dbdb8de5eb0b9ee702f315d95 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 9 May 2022 18:14:41 +0000
+Subject: PCI/AER: Iterate over error counters instead of error strings
+
+From: Mohamed Khalfella <mkhalfella@purestorage.com>
+
+[ Upstream commit 5e6ae050955b566484f3cc6a66e3925eae87a0ed ]
+
+Previously we iterated over AER stat *names*, e.g.,
+aer_correctable_error_string[32], but the actual stat *counters* may not be
+that large, e.g., pdev->aer_stats->dev_cor_errs[16], which means that we
+printed junk in the sysfs stats files.
+
+Iterate over the stat counter arrays instead of the names to avoid this
+junk.
+
+Also, added a build time check to make sure all
+counters have entries in strings array.
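+A self-contained illustration of the indexing bug class (generic C, not the
+AER code itself): bounding the loop by the larger names array reads past the
+end of the counter array, so the counters must set the bound.
+
+  #include <stdio.h>
+  #include <stdint.h>
+
+  #define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))
+
+  static const char *names[32] = { [0] = "RxErr", [6] = "BadTLP" };
+  static uint64_t counters[16];                   /* fewer counters than names */
+
+  int main(void)
+  {
+          size_t i;
+
+          /* Bound by the counters, not the names, to avoid printing junk. */
+          for (i = 0; i < ARRAY_SIZE(counters); i++)
+                  if (names[i])
+                          printf("%s %llu\n", names[i],
+                                 (unsigned long long)counters[i]);
+          return 0;
+  }
+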
+
+Fixes: 0678e3109a3c ("PCI/AER: Simplify __aer_print_error()")
+Link: https://lore.kernel.org/r/20220509181441.31884-1-mkhalfella@purestorage.com
+Reported-by: Meeta Saggi <msaggi@purestorage.com>
+Signed-off-by: Mohamed Khalfella <mkhalfella@purestorage.com>
+Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
+Reviewed-by: Meeta Saggi <msaggi@purestorage.com>
+Reviewed-by: Eric Badger <ebadger@purestorage.com>
+Cc: stable@vger.kernel.org
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/pci/pcie/aer.c | 7 ++++++-
+ 1 file changed, 6 insertions(+), 1 deletion(-)
+
+diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
+index 7952e5efd6cf..a1e38ca93cd9 100644
+--- a/drivers/pci/pcie/aer.c
++++ b/drivers/pci/pcie/aer.c
+@@ -538,7 +538,7 @@ static const char *aer_agent_string[] = {
+ u64 *stats = pdev->aer_stats->stats_array; \
+ size_t len = 0; \
+ \
+- for (i = 0; i < ARRAY_SIZE(strings_array); i++) { \
++ for (i = 0; i < ARRAY_SIZE(pdev->aer_stats->stats_array); i++) {\
+ if (strings_array[i]) \
+ len += sysfs_emit_at(buf, len, "%s %llu\n", \
+ strings_array[i], \
+@@ -1347,6 +1347,11 @@ static int aer_probe(struct pcie_device *dev)
+ struct device *device = &dev->device;
+ struct pci_dev *port = dev->port;
+
++ BUILD_BUG_ON(ARRAY_SIZE(aer_correctable_error_string) <
++ AER_MAX_TYPEOF_COR_ERRS);
++ BUILD_BUG_ON(ARRAY_SIZE(aer_uncorrectable_error_string) <
++ AER_MAX_TYPEOF_UNCOR_ERRS);
++
+ /* Limit to Root Ports or Root Complex Event Collectors */
+ if ((pci_pcie_type(port) != PCI_EXP_TYPE_RC_EC) &&
+ (pci_pcie_type(port) != PCI_EXP_TYPE_ROOT_PORT))
+--
+2.35.1
+
--- /dev/null
+From 458d79349bdb8364d0035a65afb591e5eff068d8 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 23 Jun 2022 17:50:03 +0200
+Subject: PCI: qcom: Power on PHY before IPQ8074 DBI register accesses
+
+From: Robert Marko <robimarko@gmail.com>
+
+[ Upstream commit a0e43bb9973b06ce5c666f0901e104e2037c1b34 ]
+
+Currently the Gen2 port in IPQ8074 will cause the system to hang as it
+accesses DBI registers in qcom_pcie_init_2_3_3(), and those are only
+accessible after phy_power_on().
+
+Move the DBI read/writes to a new qcom_pcie_post_init_2_3_3(), which is
+executed after phy_power_on().
+
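+A hedged sketch of the host-init ordering this relies on (condensed;
+structure and field names are approximations of the driver, not verbatim):
+the PHY is powered on between the .init and .post_init hooks, so DBI
+accesses are safe only in the latter.
+
+  static int qcom_host_init_order_sketch(struct qcom_pcie *pcie,
+                                         const struct qcom_pcie_ops *ops)
+  {
+          int ret;
+
+          ret = ops->init(pcie);                  /* clocks and resets; no DBI */
+          if (ret)
+                  return ret;
+
+          ret = phy_power_on(pcie->phy);          /* DBI space becomes accessible */
+          if (ret)
+                  return ret;
+
+          return ops->post_init ? ops->post_init(pcie) : 0;  /* DBI writes here */
+  }
+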
+Link: https://lore.kernel.org/r/20220623155004.688090-1-robimarko@gmail.com
+Fixes: a0fd361db8e5 ("PCI: dwc: Move "dbi", "dbi2", and "addr_space" resource setup into common code")
+Signed-off-by: Robert Marko <robimarko@gmail.com>
+Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
+Reviewed-by: Dmitry Baryshkov <dmitry.baryshkov@linaro.org>
+Cc: stable@vger.kernel.org # v5.11+
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/pci/controller/dwc/pcie-qcom.c | 48 +++++++++++++++-----------
+ 1 file changed, 28 insertions(+), 20 deletions(-)
+
+diff --git a/drivers/pci/controller/dwc/pcie-qcom.c b/drivers/pci/controller/dwc/pcie-qcom.c
+index ab04818f6ed9..340542aab8a5 100644
+--- a/drivers/pci/controller/dwc/pcie-qcom.c
++++ b/drivers/pci/controller/dwc/pcie-qcom.c
+@@ -1036,9 +1036,7 @@ static int qcom_pcie_init_2_3_3(struct qcom_pcie *pcie)
+ struct qcom_pcie_resources_2_3_3 *res = &pcie->res.v2_3_3;
+ struct dw_pcie *pci = pcie->pci;
+ struct device *dev = pci->dev;
+- u16 offset = dw_pcie_find_capability(pci, PCI_CAP_ID_EXP);
+ int i, ret;
+- u32 val;
+
+ for (i = 0; i < ARRAY_SIZE(res->rst); i++) {
+ ret = reset_control_assert(res->rst[i]);
+@@ -1095,6 +1093,33 @@ static int qcom_pcie_init_2_3_3(struct qcom_pcie *pcie)
+ goto err_clk_aux;
+ }
+
++ return 0;
++
++err_clk_aux:
++ clk_disable_unprepare(res->ahb_clk);
++err_clk_ahb:
++ clk_disable_unprepare(res->axi_s_clk);
++err_clk_axi_s:
++ clk_disable_unprepare(res->axi_m_clk);
++err_clk_axi_m:
++ clk_disable_unprepare(res->iface);
++err_clk_iface:
++ /*
++ * Not checking for failure, will anyway return
++ * the original failure in 'ret'.
++ */
++ for (i = 0; i < ARRAY_SIZE(res->rst); i++)
++ reset_control_assert(res->rst[i]);
++
++ return ret;
++}
++
++static int qcom_pcie_post_init_2_3_3(struct qcom_pcie *pcie)
++{
++ struct dw_pcie *pci = pcie->pci;
++ u16 offset = dw_pcie_find_capability(pci, PCI_CAP_ID_EXP);
++ u32 val;
++
+ writel(SLV_ADDR_SPACE_SZ,
+ pcie->parf + PCIE20_v3_PARF_SLV_ADDR_SPACE_SIZE);
+
+@@ -1122,24 +1147,6 @@ static int qcom_pcie_init_2_3_3(struct qcom_pcie *pcie)
+ PCI_EXP_DEVCTL2);
+
+ return 0;
+-
+-err_clk_aux:
+- clk_disable_unprepare(res->ahb_clk);
+-err_clk_ahb:
+- clk_disable_unprepare(res->axi_s_clk);
+-err_clk_axi_s:
+- clk_disable_unprepare(res->axi_m_clk);
+-err_clk_axi_m:
+- clk_disable_unprepare(res->iface);
+-err_clk_iface:
+- /*
+- * Not checking for failure, will anyway return
+- * the original failure in 'ret'.
+- */
+- for (i = 0; i < ARRAY_SIZE(res->rst); i++)
+- reset_control_assert(res->rst[i]);
+-
+- return ret;
+ }
+
+ static int qcom_pcie_get_resources_2_7_0(struct qcom_pcie *pcie)
+@@ -1465,6 +1472,7 @@ static const struct qcom_pcie_ops ops_2_4_0 = {
+ static const struct qcom_pcie_ops ops_2_3_3 = {
+ .get_resources = qcom_pcie_get_resources_2_3_3,
+ .init = qcom_pcie_init_2_3_3,
++ .post_init = qcom_pcie_post_init_2_3_3,
+ .deinit = qcom_pcie_deinit_2_3_3,
+ .ltssm_enable = qcom_pcie_2_3_2_ltssm_enable,
+ };
+--
+2.35.1
+
--- /dev/null
+From 6df1064850462c2a7cac009c6e72ad3fcd7d0fa6 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 28 Jul 2022 00:32:18 +1000
+Subject: powerpc/powernv/kvm: Use darn for H_RANDOM on Power9
+
+From: Jason A. Donenfeld <Jason@zx2c4.com>
+
+[ Upstream commit 7ef3d06f1bc4a5e62273726f3dc2bd258ae1c71f ]
+
+The existing logic in KVM to support guests calling H_RANDOM only works
+on Power8, because it looks for an RNG in the device tree, but on Power9
+we just use darn.
+
+In addition the existing code needs to work in real mode, so we have the
+special cased powernv_get_random_real_mode() to deal with that.
+
+Instead just have KVM call ppc_md.get_random_seed(), and do the real
+mode check inside of there, that way we use whatever RNG is available,
+including darn on Power9.
+
+Fixes: e928e9cb3601 ("KVM: PPC: Book3S HV: Add fast real-mode H_RANDOM implementation.")
+Cc: stable@vger.kernel.org # v4.1+
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+Tested-by: Sachin Sant <sachinp@linux.ibm.com>
+[mpe: Rebase on previous commit, update change log appropriately]
+Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
+Link: https://lore.kernel.org/r/20220727143219.2684192-2-mpe@ellerman.id.au
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/powerpc/include/asm/archrandom.h | 5 ----
+ arch/powerpc/kvm/book3s_hv_builtin.c | 7 +++---
+ arch/powerpc/platforms/powernv/rng.c | 36 ++++++---------------------
+ 3 files changed, 12 insertions(+), 36 deletions(-)
+
+diff --git a/arch/powerpc/include/asm/archrandom.h b/arch/powerpc/include/asm/archrandom.h
+index 9a53e29680f4..258174304904 100644
+--- a/arch/powerpc/include/asm/archrandom.h
++++ b/arch/powerpc/include/asm/archrandom.h
+@@ -38,12 +38,7 @@ static inline bool __must_check arch_get_random_seed_int(unsigned int *v)
+ #endif /* CONFIG_ARCH_RANDOM */
+
+ #ifdef CONFIG_PPC_POWERNV
+-int powernv_hwrng_present(void);
+ int powernv_get_random_long(unsigned long *v);
+-int powernv_get_random_real_mode(unsigned long *v);
+-#else
+-static inline int powernv_hwrng_present(void) { return 0; }
+-static inline int powernv_get_random_real_mode(unsigned long *v) { return 0; }
+ #endif
+
+ #endif /* _ASM_POWERPC_ARCHRANDOM_H */
+diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c
+index 7e52d0beee77..5e4251b76e75 100644
+--- a/arch/powerpc/kvm/book3s_hv_builtin.c
++++ b/arch/powerpc/kvm/book3s_hv_builtin.c
+@@ -19,7 +19,7 @@
+ #include <asm/interrupt.h>
+ #include <asm/kvm_ppc.h>
+ #include <asm/kvm_book3s.h>
+-#include <asm/archrandom.h>
++#include <asm/machdep.h>
+ #include <asm/xics.h>
+ #include <asm/xive.h>
+ #include <asm/dbell.h>
+@@ -176,13 +176,14 @@ EXPORT_SYMBOL_GPL(kvmppc_hcall_impl_hv_realmode);
+
+ int kvmppc_hwrng_present(void)
+ {
+- return powernv_hwrng_present();
++ return ppc_md.get_random_seed != NULL;
+ }
+ EXPORT_SYMBOL_GPL(kvmppc_hwrng_present);
+
+ long kvmppc_rm_h_random(struct kvm_vcpu *vcpu)
+ {
+- if (powernv_get_random_real_mode(&vcpu->arch.regs.gpr[4]))
++ if (ppc_md.get_random_seed &&
++ ppc_md.get_random_seed(&vcpu->arch.regs.gpr[4]))
+ return H_SUCCESS;
+
+ return H_HARDWARE;
+diff --git a/arch/powerpc/platforms/powernv/rng.c b/arch/powerpc/platforms/powernv/rng.c
+index 2287c9cd0cd5..d19305292e1e 100644
+--- a/arch/powerpc/platforms/powernv/rng.c
++++ b/arch/powerpc/platforms/powernv/rng.c
+@@ -29,15 +29,6 @@ struct powernv_rng {
+
+ static DEFINE_PER_CPU(struct powernv_rng *, powernv_rng);
+
+-int powernv_hwrng_present(void)
+-{
+- struct powernv_rng *rng;
+-
+- rng = get_cpu_var(powernv_rng);
+- put_cpu_var(rng);
+- return rng != NULL;
+-}
+-
+ static unsigned long rng_whiten(struct powernv_rng *rng, unsigned long val)
+ {
+ unsigned long parity;
+@@ -58,19 +49,6 @@ static unsigned long rng_whiten(struct powernv_rng *rng, unsigned long val)
+ return val;
+ }
+
+-int powernv_get_random_real_mode(unsigned long *v)
+-{
+- struct powernv_rng *rng;
+-
+- rng = raw_cpu_read(powernv_rng);
+- if (!rng)
+- return 0;
+-
+- *v = rng_whiten(rng, __raw_rm_readq(rng->regs_real));
+-
+- return 1;
+-}
+-
+ static int powernv_get_random_darn(unsigned long *v)
+ {
+ unsigned long val;
+@@ -107,12 +85,14 @@ int powernv_get_random_long(unsigned long *v)
+ {
+ struct powernv_rng *rng;
+
+- rng = get_cpu_var(powernv_rng);
+-
+- *v = rng_whiten(rng, in_be64(rng->regs));
+-
+- put_cpu_var(rng);
+-
++ if (mfmsr() & MSR_DR) {
++ rng = get_cpu_var(powernv_rng);
++ *v = rng_whiten(rng, in_be64(rng->regs));
++ put_cpu_var(rng);
++ } else {
++ rng = raw_cpu_read(powernv_rng);
++ *v = rng_whiten(rng, __raw_rm_readq(rng->regs_real));
++ }
+ return 1;
+ }
+ EXPORT_SYMBOL_GPL(powernv_get_random_long);
+--
+2.35.1
+
--- /dev/null
+From acaca81b30149dc942304fa4d9460deb6ab485ba Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 26 Jul 2022 18:57:59 +0200
+Subject: s390/unwind: fix fgraph return address recovery
+
+From: Sumanth Korikkar <sumanthk@linux.ibm.com>
+
+[ Upstream commit ded466e1806686794b403ebf031133bbaca76bb2 ]
+
+When HAVE_FUNCTION_GRAPH_RET_ADDR_PTR is defined, the return
+address to the fgraph caller is recovered by tagging it along with the
+stack pointer of the ftrace stack. This makes the stack unwinding more
+reliable.
+
+When the fgraph return address is modified to return_to_handler,
+ftrace_graph_ret_addr tries to restore it to the original
+value using the tagged stack pointer.
+
+Fix this by passing the tagged sp to ftrace_graph_ret_addr().
+
+Fixes: d81675b60d09 ("s390/unwind: recover kretprobe modified return address in stacktrace")
+Cc: <stable@vger.kernel.org> # 5.18
+Reviewed-by: Vasily Gorbik <gor@linux.ibm.com>
+Signed-off-by: Sumanth Korikkar <sumanthk@linux.ibm.com>
+Signed-off-by: Alexander Gordeev <agordeev@linux.ibm.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/s390/include/asm/unwind.h | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/arch/s390/include/asm/unwind.h b/arch/s390/include/asm/unwind.h
+index 0bf06f1682d8..02462e7100c1 100644
+--- a/arch/s390/include/asm/unwind.h
++++ b/arch/s390/include/asm/unwind.h
+@@ -47,7 +47,7 @@ struct unwind_state {
+ static inline unsigned long unwind_recover_ret_addr(struct unwind_state *state,
+ unsigned long ip)
+ {
+- ip = ftrace_graph_ret_addr(state->task, &state->graph_idx, ip, NULL);
++ ip = ftrace_graph_ret_addr(state->task, &state->graph_idx, ip, (void *)state->sp);
+ if (is_kretprobe_trampoline(ip))
+ ip = kretprobe_find_ret_addr(state->task, (void *)state->sp, &state->kr_cur);
+ return ip;
+--
+2.35.1
+
--- /dev/null
+From 983ed3d61efd61bb2643bd8b27af58c45e0dfcce Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 12 Jul 2022 22:20:40 -0700
+Subject: scsi: qla2xxx: edif: Fix dropped IKE message
+
+From: Quinn Tran <qutran@marvell.com>
+
+[ Upstream commit c019cd656e717349ff22d0c41d6fbfc773f48c52 ]
+
+This patch fixes IKE message being dropped due to error in processing Purex
+IOCB and Continuation IOCBs.
+
+Link: https://lore.kernel.org/r/20220713052045.10683-6-njavali@marvell.com
+Fixes: fac2807946c1 ("scsi: qla2xxx: edif: Add extraction of auth_els from the wire")
+Cc: stable@vger.kernel.org
+Reviewed-by: Himanshu Madhani <himanshu.madhani@oracle.com>
+Signed-off-by: Quinn Tran <qutran@marvell.com>
+Signed-off-by: Nilesh Javali <njavali@marvell.com>
+Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/scsi/qla2xxx/qla_isr.c | 54 +++++++++++++++-------------------
+ 1 file changed, 24 insertions(+), 30 deletions(-)
+
+diff --git a/drivers/scsi/qla2xxx/qla_isr.c b/drivers/scsi/qla2xxx/qla_isr.c
+index c509bb23af40..125b208cf118 100644
+--- a/drivers/scsi/qla2xxx/qla_isr.c
++++ b/drivers/scsi/qla2xxx/qla_isr.c
+@@ -3711,12 +3711,11 @@ void qla24xx_nvme_ls4_iocb(struct scsi_qla_host *vha,
+ * Return: 0 all iocbs has arrived, xx- all iocbs have not arrived.
+ */
+ static int qla_chk_cont_iocb_avail(struct scsi_qla_host *vha,
+- struct rsp_que *rsp, response_t *pkt)
++ struct rsp_que *rsp, response_t *pkt, u32 rsp_q_in)
+ {
+- int start_pkt_ring_index, end_pkt_ring_index, n_ring_index;
+- response_t *end_pkt;
++ int start_pkt_ring_index;
++ u32 iocb_cnt = 0;
+ int rc = 0;
+- u32 rsp_q_in;
+
+ if (pkt->entry_count == 1)
+ return rc;
+@@ -3727,34 +3726,18 @@ static int qla_chk_cont_iocb_avail(struct scsi_qla_host *vha,
+ else
+ start_pkt_ring_index = rsp->ring_index - 1;
+
+- if ((start_pkt_ring_index + pkt->entry_count) >= rsp->length)
+- end_pkt_ring_index = start_pkt_ring_index + pkt->entry_count -
+- rsp->length - 1;
++ if (rsp_q_in < start_pkt_ring_index)
++ /* q in ptr is wrapped */
++ iocb_cnt = rsp->length - start_pkt_ring_index + rsp_q_in;
+ else
+- end_pkt_ring_index = start_pkt_ring_index + pkt->entry_count - 1;
++ iocb_cnt = rsp_q_in - start_pkt_ring_index;
+
+- end_pkt = rsp->ring + end_pkt_ring_index;
+-
+- /* next pkt = end_pkt + 1 */
+- n_ring_index = end_pkt_ring_index + 1;
+- if (n_ring_index >= rsp->length)
+- n_ring_index = 0;
+-
+- rsp_q_in = rsp->qpair->use_shadow_reg ? *rsp->in_ptr :
+- rd_reg_dword(rsp->rsp_q_in);
+-
+- /* rsp_q_in is either wrapped or pointing beyond endpkt */
+- if ((rsp_q_in < start_pkt_ring_index && rsp_q_in < n_ring_index) ||
+- rsp_q_in >= n_ring_index)
+- /* all IOCBs arrived. */
+- rc = 0;
+- else
++ if (iocb_cnt < pkt->entry_count)
+ rc = -EIO;
+
+- ql_dbg(ql_dbg_init + ql_dbg_verbose, vha, 0x5091,
+- "%s - ring %p pkt %p end pkt %p entry count %#x rsp_q_in %d rc %d\n",
+- __func__, rsp->ring, pkt, end_pkt, pkt->entry_count,
+- rsp_q_in, rc);
++ ql_dbg(ql_dbg_init, vha, 0x5091,
++ "%s - ring %p pkt %p entry count %d iocb_cnt %d rsp_q_in %d rc %d\n",
++ __func__, rsp->ring, pkt, pkt->entry_count, iocb_cnt, rsp_q_in, rc);
+
+ return rc;
+ }
+@@ -3771,7 +3754,7 @@ void qla24xx_process_response_queue(struct scsi_qla_host *vha,
+ struct qla_hw_data *ha = vha->hw;
+ struct purex_entry_24xx *purex_entry;
+ struct purex_item *pure_item;
+- u16 rsp_in = 0;
++ u16 rsp_in = 0, cur_ring_index;
+ int follow_inptr, is_shadow_hba;
+
+ if (!ha->flags.fw_started)
+@@ -3802,6 +3785,7 @@ void qla24xx_process_response_queue(struct scsi_qla_host *vha,
+ (!follow_inptr &&
+ rsp->ring_ptr->signature != RESPONSE_PROCESSED)) {
+ pkt = (struct sts_entry_24xx *)rsp->ring_ptr;
++ cur_ring_index = rsp->ring_index;
+
+ rsp->ring_index++;
+ if (rsp->ring_index == rsp->length) {
+@@ -3922,7 +3906,17 @@ void qla24xx_process_response_queue(struct scsi_qla_host *vha,
+ break;
+
+ case ELS_AUTH_ELS:
+- if (qla_chk_cont_iocb_avail(vha, rsp, (response_t *)pkt)) {
++ if (qla_chk_cont_iocb_avail(vha, rsp, (response_t *)pkt, rsp_in)) {
++ /*
++ * ring_ptr and ring_index were
++ * pre-incremented above. Reset them
++ * back to current. Wait for next
++ * interrupt with all IOCBs to arrive
++ * and re-process.
++ */
++ rsp->ring_ptr = (response_t *)pkt;
++ rsp->ring_index = cur_ring_index;
++
+ ql_dbg(ql_dbg_init, vha, 0x5091,
+ "Defer processing ELS opcode %#x...\n",
+ purex_entry->els_frame_payload[3]);
+--
+2.35.1
+
--- /dev/null
+From ff24e11b6f46bae7af4d61b2021903bdc747c5ec Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 15 Jun 2022 22:35:02 -0700
+Subject: scsi: qla2xxx: Fix crash due to stale SRB access around I/O timeouts
+
+From: Arun Easi <aeasi@marvell.com>
+
+[ Upstream commit c39587bc0abaf16593f7abcdf8aeec3c038c7d52 ]
+
+Ensure the SRB is returned during I/O timeout error escalation. If that is not
+possible, fail the escalation path.
+
+Following crash stack was seen:
+
+BUG: unable to handle kernel paging request at 0000002f56aa90f8
+IP: qla_chk_edif_rx_sa_delete_pending+0x14/0x30 [qla2xxx]
+Call Trace:
+ ? qla2x00_status_entry+0x19f/0x1c50 [qla2xxx]
+ ? qla2x00_start_sp+0x116/0x1170 [qla2xxx]
+ ? dma_pool_alloc+0x1d6/0x210
+ ? mempool_alloc+0x54/0x130
+ ? qla24xx_process_response_queue+0x548/0x12b0 [qla2xxx]
+ ? qla_do_work+0x2d/0x40 [qla2xxx]
+ ? process_one_work+0x14c/0x390
+
+Link: https://lore.kernel.org/r/20220616053508.27186-6-njavali@marvell.com
+Fixes: d74595278f4a ("scsi: qla2xxx: Add multiple queue pair functionality.")
+Cc: stable@vger.kernel.org
+Signed-off-by: Arun Easi <aeasi@marvell.com>
+Signed-off-by: Nilesh Javali <njavali@marvell.com>
+Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/scsi/qla2xxx/qla_os.c | 43 +++++++++++++++++++++++++----------
+ 1 file changed, 31 insertions(+), 12 deletions(-)
+
+diff --git a/drivers/scsi/qla2xxx/qla_os.c b/drivers/scsi/qla2xxx/qla_os.c
+index daa9a3c3f7b5..f9ad0847782d 100644
+--- a/drivers/scsi/qla2xxx/qla_os.c
++++ b/drivers/scsi/qla2xxx/qla_os.c
+@@ -1342,21 +1342,20 @@ qla2xxx_eh_abort(struct scsi_cmnd *cmd)
+ /*
+ * Returns: QLA_SUCCESS or QLA_FUNCTION_FAILED.
+ */
+-int
+-qla2x00_eh_wait_for_pending_commands(scsi_qla_host_t *vha, unsigned int t,
+- uint64_t l, enum nexus_wait_type type)
++static int
++__qla2x00_eh_wait_for_pending_commands(struct qla_qpair *qpair, unsigned int t,
++ uint64_t l, enum nexus_wait_type type)
+ {
+ int cnt, match, status;
+ unsigned long flags;
+- struct qla_hw_data *ha = vha->hw;
+- struct req_que *req;
++ scsi_qla_host_t *vha = qpair->vha;
++ struct req_que *req = qpair->req;
+ srb_t *sp;
+ struct scsi_cmnd *cmd;
+
+ status = QLA_SUCCESS;
+
+- spin_lock_irqsave(&ha->hardware_lock, flags);
+- req = vha->req;
++ spin_lock_irqsave(qpair->qp_lock_ptr, flags);
+ for (cnt = 1; status == QLA_SUCCESS &&
+ cnt < req->num_outstanding_cmds; cnt++) {
+ sp = req->outstanding_cmds[cnt];
+@@ -1383,12 +1382,32 @@ qla2x00_eh_wait_for_pending_commands(scsi_qla_host_t *vha, unsigned int t,
+ if (!match)
+ continue;
+
+- spin_unlock_irqrestore(&ha->hardware_lock, flags);
++ spin_unlock_irqrestore(qpair->qp_lock_ptr, flags);
+ status = qla2x00_eh_wait_on_command(cmd);
+- spin_lock_irqsave(&ha->hardware_lock, flags);
++ spin_lock_irqsave(qpair->qp_lock_ptr, flags);
+ }
+- spin_unlock_irqrestore(&ha->hardware_lock, flags);
++ spin_unlock_irqrestore(qpair->qp_lock_ptr, flags);
++
++ return status;
++}
++
++int
++qla2x00_eh_wait_for_pending_commands(scsi_qla_host_t *vha, unsigned int t,
++ uint64_t l, enum nexus_wait_type type)
++{
++ struct qla_qpair *qpair;
++ struct qla_hw_data *ha = vha->hw;
++ int i, status = QLA_SUCCESS;
+
++ status = __qla2x00_eh_wait_for_pending_commands(ha->base_qpair, t, l,
++ type);
++ for (i = 0; status == QLA_SUCCESS && i < ha->max_qpairs; i++) {
++ qpair = ha->queue_pair_map[i];
++ if (!qpair)
++ continue;
++ status = __qla2x00_eh_wait_for_pending_commands(qpair, t, l,
++ type);
++ }
+ return status;
+ }
+
+@@ -1425,7 +1444,7 @@ qla2xxx_eh_device_reset(struct scsi_cmnd *cmd)
+ return err;
+
+ if (fcport->deleted)
+- return SUCCESS;
++ return FAILED;
+
+ ql_log(ql_log_info, vha, 0x8009,
+ "DEVICE RESET ISSUED nexus=%ld:%d:%llu cmd=%p.\n", vha->host_no,
+@@ -1493,7 +1512,7 @@ qla2xxx_eh_target_reset(struct scsi_cmnd *cmd)
+ return err;
+
+ if (fcport->deleted)
+- return SUCCESS;
++ return FAILED;
+
+ ql_log(ql_log_info, vha, 0x8009,
+ "TARGET RESET ISSUED nexus=%ld:%d cmd=%p.\n", vha->host_no,
+--
+2.35.1
+
--- /dev/null
+From ada2019561e89a831747aab73d489754049451a5 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 12 Jul 2022 22:20:42 -0700
+Subject: scsi: qla2xxx: Fix discovery issues in FC-AL topology
+
+From: Arun Easi <aeasi@marvell.com>
+
+[ Upstream commit 47ccb113cead905bdc236571bf8ac6fed90321b3 ]
+
+A direct attach tape device, when swapped with another, was not
+discovered. Fix this by looking at the loop map and reinitializing the link
+if there are devices present.
+
+Link: https://lore.kernel.org/linux-scsi/baef87c3-5dad-3b47-44c1-6914bfc90108@cybernetics.com/
+Link: https://lore.kernel.org/r/20220713052045.10683-8-njavali@marvell.com
+Cc: stable@vger.kernel.org
+Reported-by: Tony Battersby <tonyb@cybernetics.com>
+Tested-by: Tony Battersby <tonyb@cybernetics.com>
+Reviewed-by: Himanshu Madhani <himanshu.madhani@oracle.com>
+Signed-off-by: Arun Easi <aeasi@marvell.com>
+Signed-off-by: Nilesh Javali <njavali@marvell.com>
+Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/scsi/qla2xxx/qla_gbl.h | 3 ++-
+ drivers/scsi/qla2xxx/qla_init.c | 29 +++++++++++++++++++++++++++++
+ drivers/scsi/qla2xxx/qla_mbx.c | 5 ++++-
+ 3 files changed, 35 insertions(+), 2 deletions(-)
+
+diff --git a/drivers/scsi/qla2xxx/qla_gbl.h b/drivers/scsi/qla2xxx/qla_gbl.h
+index 20ae0ef7d078..331b33200f50 100644
+--- a/drivers/scsi/qla2xxx/qla_gbl.h
++++ b/drivers/scsi/qla2xxx/qla_gbl.h
+@@ -436,7 +436,8 @@ extern int
+ qla2x00_get_resource_cnts(scsi_qla_host_t *);
+
+ extern int
+-qla2x00_get_fcal_position_map(scsi_qla_host_t *ha, char *pos_map);
++qla2x00_get_fcal_position_map(scsi_qla_host_t *ha, char *pos_map,
++ u8 *num_entries);
+
+ extern int
+ qla2x00_get_link_status(scsi_qla_host_t *, uint16_t, struct link_statistics *,
+diff --git a/drivers/scsi/qla2xxx/qla_init.c b/drivers/scsi/qla2xxx/qla_init.c
+index 01c7eda51d5a..51503a316b10 100644
+--- a/drivers/scsi/qla2xxx/qla_init.c
++++ b/drivers/scsi/qla2xxx/qla_init.c
+@@ -5516,6 +5516,22 @@ static int qla2x00_configure_n2n_loop(scsi_qla_host_t *vha)
+ return QLA_FUNCTION_FAILED;
+ }
+
++static void
++qla_reinitialize_link(scsi_qla_host_t *vha)
++{
++ int rval;
++
++ atomic_set(&vha->loop_state, LOOP_DOWN);
++ atomic_set(&vha->loop_down_timer, LOOP_DOWN_TIME);
++ rval = qla2x00_full_login_lip(vha);
++ if (rval == QLA_SUCCESS) {
++ ql_dbg(ql_dbg_disc, vha, 0xd050, "Link reinitialized\n");
++ } else {
++ ql_dbg(ql_dbg_disc, vha, 0xd051,
++ "Link reinitialization failed (%d)\n", rval);
++ }
++}
++
+ /*
+ * qla2x00_configure_local_loop
+ * Updates Fibre Channel Device Database with local loop devices.
+@@ -5567,6 +5583,19 @@ qla2x00_configure_local_loop(scsi_qla_host_t *vha)
+ spin_unlock_irqrestore(&vha->work_lock, flags);
+
+ if (vha->scan.scan_retry < MAX_SCAN_RETRIES) {
++ u8 loop_map_entries = 0;
++ int rc;
++
++ rc = qla2x00_get_fcal_position_map(vha, NULL,
++ &loop_map_entries);
++ if (rc == QLA_SUCCESS && loop_map_entries > 1) {
++ /*
++ * There are devices that are still not logged
++ * in. Reinitialize to give them a chance.
++ */
++ qla_reinitialize_link(vha);
++ return QLA_FUNCTION_FAILED;
++ }
+ set_bit(LOCAL_LOOP_UPDATE, &vha->dpc_flags);
+ set_bit(LOOP_RESYNC_NEEDED, &vha->dpc_flags);
+ }
+diff --git a/drivers/scsi/qla2xxx/qla_mbx.c b/drivers/scsi/qla2xxx/qla_mbx.c
+index bcade1deb798..86d8c455c07a 100644
+--- a/drivers/scsi/qla2xxx/qla_mbx.c
++++ b/drivers/scsi/qla2xxx/qla_mbx.c
+@@ -3068,7 +3068,8 @@ qla2x00_get_resource_cnts(scsi_qla_host_t *vha)
+ * Kernel context.
+ */
+ int
+-qla2x00_get_fcal_position_map(scsi_qla_host_t *vha, char *pos_map)
++qla2x00_get_fcal_position_map(scsi_qla_host_t *vha, char *pos_map,
++ u8 *num_entries)
+ {
+ int rval;
+ mbx_cmd_t mc;
+@@ -3108,6 +3109,8 @@ qla2x00_get_fcal_position_map(scsi_qla_host_t *vha, char *pos_map)
+
+ if (pos_map)
+ memcpy(pos_map, pmap, FCAL_MAP_SIZE);
++ if (num_entries)
++ *num_entries = pmap[0];
+ }
+ dma_pool_free(ha->s_dma_pool, pmap, pmap_dma);
+
+--
+2.35.1
+
--- /dev/null
+From ed1aa089d6371962ed13e3fe349ef0d02f660393 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 15 Jun 2022 22:35:07 -0700
+Subject: scsi: qla2xxx: Fix erroneous mailbox timeout after PCI error
+ injection
+
+From: Quinn Tran <qutran@marvell.com>
+
+[ Upstream commit f260694e6463b63ae550aad25ddefe94cb1904da ]
+
+Clear the wait-for-mailbox-interrupt flag to prevent a stale mailbox:
+
+Feb 22 05:22:56 ltcden4-lp7 kernel: qla2xxx [0135:90:00.1]-500a:4: LOOP UP detected (16 Gbps).
+Feb 22 05:22:59 ltcden4-lp7 kernel: qla2xxx [0135:90:00.1]-d04c:4: MBX Command timeout for cmd 69, ...
+
+To fix the issue, the driver needs to clear the MBX_INTR_WAIT flag when
+purging the mailbox. When the stale mailbox completion does arrive, it will
+be dropped.
+
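+The pattern is easy to reproduce in user space. Below is a minimal pthread
+analogue (illustrative only, not driver code; build with -pthread): the
+waiter clears its "interrupt wait" flag under the lock when it times out,
+and the completer only signals while that flag is set, so a completion
+arriving after the timeout is simply dropped.
+
+  #include <pthread.h>
+  #include <stdbool.h>
+  #include <stdio.h>
+  #include <time.h>
+
+  static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
+  static pthread_cond_t comp = PTHREAD_COND_INITIALIZER;
+  static bool intr_wait;     /* analogue of MBX_INTR_WAIT */
+  static bool completed;
+
+  static void *late_completion(void *arg)
+  {
+      struct timespec two = { .tv_sec = 2 };
+
+      nanosleep(&two, NULL);             /* arrives after the 1s timeout */
+      pthread_mutex_lock(&lock);
+      if (intr_wait) {                   /* only signal while a waiter exists */
+          completed = true;
+          pthread_cond_signal(&comp);
+      }                                  /* otherwise the stale completion is dropped */
+      pthread_mutex_unlock(&lock);
+      return NULL;
+  }
+
+  int main(void)
+  {
+      pthread_t t;
+      struct timespec deadline;
+
+      intr_wait = true;
+      pthread_create(&t, NULL, late_completion, NULL);
+
+      clock_gettime(CLOCK_REALTIME, &deadline);
+      deadline.tv_sec += 1;              /* 1s timeout, completion needs 2s */
+
+      pthread_mutex_lock(&lock);
+      while (!completed &&
+             pthread_cond_timedwait(&comp, &lock, &deadline) == 0)
+          ;
+      if (!completed)
+          intr_wait = false;             /* timed out: refuse the stale completion */
+      pthread_mutex_unlock(&lock);
+
+      pthread_join(t, NULL);
+      printf("stale completion was %s\n", completed ? "taken" : "dropped");
+      return 0;
+  }
+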
+Link: https://lore.kernel.org/r/20220616053508.27186-11-njavali@marvell.com
+Fixes: b6faaaf796d7 ("scsi: qla2xxx: Serialize mailbox request")
+Cc: Naresh Bannoth <nbannoth@in.ibm.com>
+Cc: Kyle Mahlkuch <Kyle.Mahlkuch@ibm.com>
+Cc: stable@vger.kernel.org
+Reported-by: Naresh Bannoth <nbannoth@in.ibm.com>
+Tested-by: Naresh Bannoth <nbannoth@in.ibm.com>
+Signed-off-by: Quinn Tran <qutran@marvell.com>
+Signed-off-by: Nilesh Javali <njavali@marvell.com>
+Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/scsi/qla2xxx/qla_mbx.c | 12 ++++++------
+ 1 file changed, 6 insertions(+), 6 deletions(-)
+
+diff --git a/drivers/scsi/qla2xxx/qla_mbx.c b/drivers/scsi/qla2xxx/qla_mbx.c
+index 15d8866046b3..bcade1deb798 100644
+--- a/drivers/scsi/qla2xxx/qla_mbx.c
++++ b/drivers/scsi/qla2xxx/qla_mbx.c
+@@ -276,6 +276,12 @@ qla2x00_mailbox_command(scsi_qla_host_t *vha, mbx_cmd_t *mcp)
+ atomic_inc(&ha->num_pend_mbx_stage3);
+ if (!wait_for_completion_timeout(&ha->mbx_intr_comp,
+ mcp->tov * HZ)) {
++ ql_dbg(ql_dbg_mbx, vha, 0x117a,
++ "cmd=%x Timeout.\n", command);
++ spin_lock_irqsave(&ha->hardware_lock, flags);
++ clear_bit(MBX_INTR_WAIT, &ha->mbx_cmd_flags);
++ spin_unlock_irqrestore(&ha->hardware_lock, flags);
++
+ if (chip_reset != ha->chip_reset) {
+ eeh_delay = ha->flags.eeh_busy ? 1 : 0;
+
+@@ -288,12 +294,6 @@ qla2x00_mailbox_command(scsi_qla_host_t *vha, mbx_cmd_t *mcp)
+ rval = QLA_ABORTED;
+ goto premature_exit;
+ }
+- ql_dbg(ql_dbg_mbx, vha, 0x117a,
+- "cmd=%x Timeout.\n", command);
+- spin_lock_irqsave(&ha->hardware_lock, flags);
+- clear_bit(MBX_INTR_WAIT, &ha->mbx_cmd_flags);
+- spin_unlock_irqrestore(&ha->hardware_lock, flags);
+-
+ } else if (ha->flags.purge_mbox ||
+ chip_reset != ha->chip_reset) {
+ eeh_delay = ha->flags.eeh_busy ? 1 : 0;
+--
+2.35.1
+
--- /dev/null
+From 1537088b80ff6a934403b341ee3ffb445867f3d3 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 15 Jun 2022 22:34:58 -0700
+Subject: scsi: qla2xxx: Fix excessive I/O error messages by default
+
+From: Arun Easi <aeasi@marvell.com>
+
+[ Upstream commit bff4873c709085e09d0ffae0c25b8e65256e3205 ]
+
+Disable printing I/O error messages by default. The messages will be
+printed only when logging is enabled.
+
+Link: https://lore.kernel.org/r/20220616053508.27186-2-njavali@marvell.com
+Fixes: 8e2d81c6b5be ("scsi: qla2xxx: Fix excessive messages during device logout")
+Cc: stable@vger.kernel.org
+Signed-off-by: Arun Easi <aeasi@marvell.com>
+Signed-off-by: Nilesh Javali <njavali@marvell.com>
+Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/scsi/qla2xxx/qla_isr.c | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+diff --git a/drivers/scsi/qla2xxx/qla_isr.c b/drivers/scsi/qla2xxx/qla_isr.c
+index ad55eace66aa..5e99f559230f 100644
+--- a/drivers/scsi/qla2xxx/qla_isr.c
++++ b/drivers/scsi/qla2xxx/qla_isr.c
+@@ -2637,7 +2637,7 @@ static void qla24xx_nvme_iocb_entry(scsi_qla_host_t *vha, struct req_que *req,
+ }
+
+ if (unlikely(logit))
+- ql_log(ql_dbg_io, fcport->vha, 0x5060,
++ ql_dbg(ql_dbg_io, fcport->vha, 0x5060,
+ "NVME-%s ERR Handling - hdl=%x status(%x) tr_len:%x resid=%x ox_id=%x\n",
+ sp->name, sp->handle, comp_status,
+ fd->transferred_length, le32_to_cpu(sts->residual_len),
+@@ -3495,7 +3495,7 @@ qla2x00_status_entry(scsi_qla_host_t *vha, struct rsp_que *rsp, void *pkt)
+
+ out:
+ if (logit)
+- ql_log(ql_dbg_io, fcport->vha, 0x3022,
++ ql_dbg(ql_dbg_io, fcport->vha, 0x3022,
+ "FCP command status: 0x%x-0x%x (0x%x) nexus=%ld:%d:%llu portid=%02x%02x%02x oxid=0x%x cdb=%10phN len=0x%x rsp_info=0x%x resid=0x%x fw_resid=0x%x sp=%p cp=%p.\n",
+ comp_status, scsi_status, res, vha->host_no,
+ cp->device->id, cp->device->lun, fcport->d_id.b.domain,
+--
+2.35.1
+
--- /dev/null
+From 10606f5e0ba0d8d8210cc69be72aa4035377a56d Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 12 Jul 2022 22:20:41 -0700
+Subject: scsi: qla2xxx: Fix imbalance vha->vref_count
+
+From: Quinn Tran <qutran@marvell.com>
+
+[ Upstream commit 63fa7f2644b4b48e1913af33092c044bf48e9321 ]
+
+vref_count took an extra decrement in the task management path. Add an
+extra ref count to compensate for the imbalance.
+
+Link: https://lore.kernel.org/r/20220713052045.10683-7-njavali@marvell.com
+Cc: stable@vger.kernel.org
+Reviewed-by: Himanshu Madhani <himanshu.madhani@oracle.com>
+Signed-off-by: Quinn Tran <qutran@marvell.com>
+Signed-off-by: Nilesh Javali <njavali@marvell.com>
+Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/scsi/qla2xxx/qla_init.c | 4 ++++
+ 1 file changed, 4 insertions(+)
+
+diff --git a/drivers/scsi/qla2xxx/qla_init.c b/drivers/scsi/qla2xxx/qla_init.c
+index 3df66934fb1e..01c7eda51d5a 100644
+--- a/drivers/scsi/qla2xxx/qla_init.c
++++ b/drivers/scsi/qla2xxx/qla_init.c
+@@ -168,6 +168,7 @@ int qla24xx_async_abort_cmd(srb_t *cmd_sp, bool wait)
+ struct srb_iocb *abt_iocb;
+ srb_t *sp;
+ int rval = QLA_FUNCTION_FAILED;
++ uint8_t bail;
+
+ /* ref: INIT for ABTS command */
+ sp = qla2xxx_get_qpair_sp(cmd_sp->vha, cmd_sp->qpair, cmd_sp->fcport,
+@@ -175,6 +176,7 @@ int qla24xx_async_abort_cmd(srb_t *cmd_sp, bool wait)
+ if (!sp)
+ return QLA_MEMORY_ALLOC_FAILED;
+
++ QLA_VHA_MARK_BUSY(vha, bail);
+ abt_iocb = &sp->u.iocb_cmd;
+ sp->type = SRB_ABT_CMD;
+ sp->name = "abort";
+@@ -2018,12 +2020,14 @@ qla2x00_async_tm_cmd(fc_port_t *fcport, uint32_t flags, uint32_t lun,
+ struct srb_iocb *tm_iocb;
+ srb_t *sp;
+ int rval = QLA_FUNCTION_FAILED;
++ uint8_t bail;
+
+ /* ref: INIT */
+ sp = qla2x00_get_sp(vha, fcport, GFP_KERNEL);
+ if (!sp)
+ goto done;
+
++ QLA_VHA_MARK_BUSY(vha, bail);
+ sp->type = SRB_TM_CMD;
+ sp->name = "tmf";
+ qla2x00_init_async_sp(sp, qla2x00_get_async_timeout(vha),
+--
+2.35.1
+
--- /dev/null
+From eea51e30df139a84ea79448c0a19833387a8bb12 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 15 Jun 2022 22:35:03 -0700
+Subject: scsi: qla2xxx: Fix losing FCP-2 targets during port perturbation
+ tests
+
+From: Arun Easi <aeasi@marvell.com>
+
+[ Upstream commit 58d1c124cd79ea686b512043c5bd515590b2ed95 ]
+
+When a mix of FCP-2 (tape) and non-FCP-2 targets is present, the FCP-2
+target state was incorrectly transitioned when both of the targets were
+gone. Fix this by ignoring the state transition for FCP-2 targets.
+
+Link: https://lore.kernel.org/r/20220616053508.27186-7-njavali@marvell.com
+Fixes: 44c57f205876 ("scsi: qla2xxx: Changes to support FCP2 Target")
+Cc: stable@vger.kernel.org
+Signed-off-by: Arun Easi <aeasi@marvell.com>
+Signed-off-by: Nilesh Javali <njavali@marvell.com>
+Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/scsi/qla2xxx/qla_gs.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/drivers/scsi/qla2xxx/qla_gs.c b/drivers/scsi/qla2xxx/qla_gs.c
+index c914b5df9c12..7ca734337000 100644
+--- a/drivers/scsi/qla2xxx/qla_gs.c
++++ b/drivers/scsi/qla2xxx/qla_gs.c
+@@ -3629,7 +3629,7 @@ void qla24xx_async_gnnft_done(scsi_qla_host_t *vha, srb_t *sp)
+ do_delete) {
+ if (fcport->loop_id != FC_NO_LOOP_ID) {
+ if (fcport->flags & FCF_FCP2_DEVICE)
+- fcport->logout_on_delete = 0;
++ continue;
+
+ ql_log(ql_log_warn, vha, 0x20f0,
+ "%s %d %8phC post del sess\n",
+--
+2.35.1
+
--- /dev/null
+From cd632680dceb7e22c58ec69f997ffaaf7c4c0dac Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 15 Jun 2022 22:35:06 -0700
+Subject: scsi: qla2xxx: Fix losing FCP-2 targets on long port disable with
+ I/Os
+
+From: Arun Easi <aeasi@marvell.com>
+
+[ Upstream commit 2416ccd3815ba1613e10a6da0a24ef21acfe5633 ]
+
+FCP-2 devices were not coming back online once they were lost, their login
+retries were exhausted, and they then came back up. Fix this by accepting
+RSCN when the device is not online.
+
+Link: https://lore.kernel.org/r/20220616053508.27186-10-njavali@marvell.com
+Fixes: 44c57f205876 ("scsi: qla2xxx: Changes to support FCP2 Target")
+Cc: stable@vger.kernel.org
+Signed-off-by: Arun Easi <aeasi@marvell.com>
+Signed-off-by: Nilesh Javali <njavali@marvell.com>
+Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/scsi/qla2xxx/qla_init.c | 12 ++++++++----
+ 1 file changed, 8 insertions(+), 4 deletions(-)
+
+diff --git a/drivers/scsi/qla2xxx/qla_init.c b/drivers/scsi/qla2xxx/qla_init.c
+index a0d26e2e0ce1..3df66934fb1e 100644
+--- a/drivers/scsi/qla2xxx/qla_init.c
++++ b/drivers/scsi/qla2xxx/qla_init.c
+@@ -1832,7 +1832,8 @@ void qla2x00_handle_rscn(scsi_qla_host_t *vha, struct event_arg *ea)
+ case RSCN_PORT_ADDR:
+ fcport = qla2x00_find_fcport_by_nportid(vha, &ea->id, 1);
+ if (fcport) {
+- if (fcport->flags & FCF_FCP2_DEVICE) {
++ if (fcport->flags & FCF_FCP2_DEVICE &&
++ atomic_read(&fcport->state) == FCS_ONLINE) {
+ ql_dbg(ql_dbg_disc, vha, 0x2115,
+ "Delaying session delete for FCP2 portid=%06x %8phC ",
+ fcport->d_id.b24, fcport->port_name);
+@@ -1864,7 +1865,8 @@ void qla2x00_handle_rscn(scsi_qla_host_t *vha, struct event_arg *ea)
+ break;
+ case RSCN_AREA_ADDR:
+ list_for_each_entry(fcport, &vha->vp_fcports, list) {
+- if (fcport->flags & FCF_FCP2_DEVICE)
++ if (fcport->flags & FCF_FCP2_DEVICE &&
++ atomic_read(&fcport->state) == FCS_ONLINE)
+ continue;
+
+ if ((ea->id.b24 & 0xffff00) == (fcport->d_id.b24 & 0xffff00)) {
+@@ -1875,7 +1877,8 @@ void qla2x00_handle_rscn(scsi_qla_host_t *vha, struct event_arg *ea)
+ break;
+ case RSCN_DOM_ADDR:
+ list_for_each_entry(fcport, &vha->vp_fcports, list) {
+- if (fcport->flags & FCF_FCP2_DEVICE)
++ if (fcport->flags & FCF_FCP2_DEVICE &&
++ atomic_read(&fcport->state) == FCS_ONLINE)
+ continue;
+
+ if ((ea->id.b24 & 0xff0000) == (fcport->d_id.b24 & 0xff0000)) {
+@@ -1887,7 +1890,8 @@ void qla2x00_handle_rscn(scsi_qla_host_t *vha, struct event_arg *ea)
+ case RSCN_FAB_ADDR:
+ default:
+ list_for_each_entry(fcport, &vha->vp_fcports, list) {
+- if (fcport->flags & FCF_FCP2_DEVICE)
++ if (fcport->flags & FCF_FCP2_DEVICE &&
++ atomic_read(&fcport->state) == FCS_ONLINE)
+ continue;
+
+ fcport->scan_needed = 1;
+--
+2.35.1
+
--- /dev/null
+From a9228406b04cbe27522f81b77535befe4c5e7924 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 15 Jun 2022 22:35:04 -0700
+Subject: scsi: qla2xxx: Fix losing target when it reappears during delete
+
+From: Arun Easi <aeasi@marvell.com>
+
+[ Upstream commit 118b0c863c8f5629cc5271fc24d72d926e0715d9 ]
+
+An FC target disappeared during port perturbation tests due to a race that
+tramples the target state. Fix the issue by adding state checks before
+proceeding.
+
+Link: https://lore.kernel.org/r/20220616053508.27186-8-njavali@marvell.com
+Fixes: 44c57f205876 ("scsi: qla2xxx: Changes to support FCP2 Target")
+Cc: stable@vger.kernel.org
+Signed-off-by: Arun Easi <aeasi@marvell.com>
+Signed-off-by: Nilesh Javali <njavali@marvell.com>
+Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/scsi/qla2xxx/qla_attr.c | 24 +++++++++++++++++-------
+ 1 file changed, 17 insertions(+), 7 deletions(-)
+
+diff --git a/drivers/scsi/qla2xxx/qla_attr.c b/drivers/scsi/qla2xxx/qla_attr.c
+index 3b3e4234f37a..412ad888bdc1 100644
+--- a/drivers/scsi/qla2xxx/qla_attr.c
++++ b/drivers/scsi/qla2xxx/qla_attr.c
+@@ -2716,17 +2716,24 @@ qla2x00_dev_loss_tmo_callbk(struct fc_rport *rport)
+ if (!fcport)
+ return;
+
+- /* Now that the rport has been deleted, set the fcport state to
+- FCS_DEVICE_DEAD */
+- qla2x00_set_fcport_state(fcport, FCS_DEVICE_DEAD);
++
++ /*
++ * Now that the rport has been deleted, set the fcport state to
++ * FCS_DEVICE_DEAD, if the fcport is still lost.
++ */
++ if (fcport->scan_state != QLA_FCPORT_FOUND)
++ qla2x00_set_fcport_state(fcport, FCS_DEVICE_DEAD);
+
+ /*
+ * Transport has effectively 'deleted' the rport, clear
+ * all local references.
+ */
+ spin_lock_irqsave(host->host_lock, flags);
+- fcport->rport = fcport->drport = NULL;
+- *((fc_port_t **)rport->dd_data) = NULL;
++ /* Confirm port has not reappeared before clearing pointers. */
++ if (rport->port_state != FC_PORTSTATE_ONLINE) {
++ fcport->rport = fcport->drport = NULL;
++ *((fc_port_t **)rport->dd_data) = NULL;
++ }
+ spin_unlock_irqrestore(host->host_lock, flags);
+
+ if (test_bit(ABORT_ISP_ACTIVE, &fcport->vha->dpc_flags))
+@@ -2759,9 +2766,12 @@ qla2x00_terminate_rport_io(struct fc_rport *rport)
+ /*
+ * At this point all fcport's software-states are cleared. Perform any
+ * final cleanup of firmware resources (PCBs and XCBs).
++ *
++ * Attempt to cleanup only lost devices.
+ */
+ if (fcport->loop_id != FC_NO_LOOP_ID) {
+- if (IS_FWI2_CAPABLE(fcport->vha->hw)) {
++ if (IS_FWI2_CAPABLE(fcport->vha->hw) &&
++ fcport->scan_state != QLA_FCPORT_FOUND) {
+ if (fcport->loop_id != FC_NO_LOOP_ID)
+ fcport->logout_on_delete = 1;
+
+@@ -2771,7 +2781,7 @@ qla2x00_terminate_rport_io(struct fc_rport *rport)
+ __LINE__);
+ qlt_schedule_sess_for_deletion(fcport);
+ }
+- } else {
++ } else if (!IS_FWI2_CAPABLE(fcport->vha->hw)) {
+ qla2x00_port_logout(fcport->vha, fcport);
+ }
+ }
+--
+2.35.1
+
--- /dev/null
+From ac1c86d5f6f0826e7897d9395a1c6abba08a0f8d Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 12 Jul 2022 22:20:39 -0700
+Subject: scsi: qla2xxx: Fix response queue handler reading stale packets
+
+From: Arun Easi <aeasi@marvell.com>
+
+[ Upstream commit b1f707146923335849fb70237eec27d4d1ae7d62 ]
+
+On some platforms, the current logic of relying on finding a new packet
+solely based on its signature pattern can lead to the driver reading stale
+packets. Though this is a bug in those platforms, reduce such exposure by
+limiting packet reads to entries up to the IN pointer.
+
+Two module parameters are introduced:
+
+ ql2xrspq_follow_inptr:
+
+ When set, on newer adapters that have queue pointer shadowing, look for
+ response packets only up to the response queue IN pointer.
+
+ When reset, response packets are read based on a signature pattern
+ logic (old way).
+
+ ql2xrspq_follow_inptr_legacy:
+
+ Like ql2xrspq_follow_inptr, but for those adapters where there is no
+ queue pointer shadowing.
+
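+A rough standalone model of the new loop condition (illustrative only;
+types, sizes and the signature value are hypothetical, and the real driver
+reads the IN pointer either from the shadow area or from the rsp_q_in
+register):
+
+  #include <stdint.h>
+  #include <stdio.h>
+
+  #define RING_SIZE          8
+  #define RESPONSE_PROCESSED 0xDEAD    /* hypothetical "already handled" marker */
+
+  struct entry { uint16_t signature; };
+
+  static unsigned int consume(struct entry *ring, unsigned int out,
+                              unsigned int in, int follow_inptr)
+  {
+      while ((follow_inptr && out != in &&
+              ring[out].signature != RESPONSE_PROCESSED) ||
+             (!follow_inptr &&
+              ring[out].signature != RESPONSE_PROCESSED)) {
+          printf("handling entry %u (sig=0x%x)\n", out, ring[out].signature);
+          ring[out].signature = RESPONSE_PROCESSED;
+          out = (out + 1) % RING_SIZE;
+      }
+      return out;
+  }
+
+  int main(void)
+  {
+      struct entry ring[RING_SIZE] = {
+          { 0x1 }, { 0x2 }, { 0x3 },           /* new work, producer IN == 3 */
+          { 0x4 },                             /* stale leftover past IN     */
+          { RESPONSE_PROCESSED }, { RESPONSE_PROCESSED },
+          { RESPONSE_PROCESSED }, { RESPONSE_PROCESSED },
+      };
+
+      /* With follow_inptr set, the stale entry at index 3 is never read. */
+      consume(ring, 0, 3, 1);
+      return 0;
+  }
+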
+Link: https://lore.kernel.org/r/20220713052045.10683-5-njavali@marvell.com
+Cc: stable@vger.kernel.org
+Reviewed-by: Himanshu Madhani <himanshu.madhani@oracle.com>
+Signed-off-by: Arun Easi <aeasi@marvell.com>
+Signed-off-by: Nilesh Javali <njavali@marvell.com>
+Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/scsi/qla2xxx/qla_gbl.h | 2 ++
+ drivers/scsi/qla2xxx/qla_isr.c | 24 +++++++++++++++++++++++-
+ drivers/scsi/qla2xxx/qla_os.c | 10 ++++++++++
+ 3 files changed, 35 insertions(+), 1 deletion(-)
+
+diff --git a/drivers/scsi/qla2xxx/qla_gbl.h b/drivers/scsi/qla2xxx/qla_gbl.h
+index 84b44454c231..20ae0ef7d078 100644
+--- a/drivers/scsi/qla2xxx/qla_gbl.h
++++ b/drivers/scsi/qla2xxx/qla_gbl.h
+@@ -193,6 +193,8 @@ extern int ql2xsecenable;
+ extern int ql2xenforce_iocb_limit;
+ extern int ql2xabts_wait_nvme;
+ extern u32 ql2xnvme_queues;
++extern int ql2xrspq_follow_inptr;
++extern int ql2xrspq_follow_inptr_legacy;
+
+ extern int qla2x00_loop_reset(scsi_qla_host_t *);
+ extern void qla2x00_abort_all_cmds(scsi_qla_host_t *, int);
+diff --git a/drivers/scsi/qla2xxx/qla_isr.c b/drivers/scsi/qla2xxx/qla_isr.c
+index de348628aa53..c509bb23af40 100644
+--- a/drivers/scsi/qla2xxx/qla_isr.c
++++ b/drivers/scsi/qla2xxx/qla_isr.c
+@@ -3771,6 +3771,8 @@ void qla24xx_process_response_queue(struct scsi_qla_host *vha,
+ struct qla_hw_data *ha = vha->hw;
+ struct purex_entry_24xx *purex_entry;
+ struct purex_item *pure_item;
++ u16 rsp_in = 0;
++ int follow_inptr, is_shadow_hba;
+
+ if (!ha->flags.fw_started)
+ return;
+@@ -3780,7 +3782,25 @@ void qla24xx_process_response_queue(struct scsi_qla_host *vha,
+ qla_cpu_update(rsp->qpair, smp_processor_id());
+ }
+
+- while (rsp->ring_ptr->signature != RESPONSE_PROCESSED) {
++#define __update_rsp_in(_update, _is_shadow_hba, _rsp, _rsp_in) \
++ do { \
++ if (_update) { \
++ _rsp_in = _is_shadow_hba ? *(_rsp)->in_ptr : \
++ rd_reg_dword_relaxed((_rsp)->rsp_q_in); \
++ } \
++ } while (0)
++
++ is_shadow_hba = IS_SHADOW_REG_CAPABLE(ha);
++ follow_inptr = is_shadow_hba ? ql2xrspq_follow_inptr :
++ ql2xrspq_follow_inptr_legacy;
++
++ __update_rsp_in(follow_inptr, is_shadow_hba, rsp, rsp_in);
++
++ while ((likely(follow_inptr &&
++ rsp->ring_index != rsp_in &&
++ rsp->ring_ptr->signature != RESPONSE_PROCESSED)) ||
++ (!follow_inptr &&
++ rsp->ring_ptr->signature != RESPONSE_PROCESSED)) {
+ pkt = (struct sts_entry_24xx *)rsp->ring_ptr;
+
+ rsp->ring_index++;
+@@ -3893,6 +3913,8 @@ void qla24xx_process_response_queue(struct scsi_qla_host *vha,
+ }
+ pure_item = qla27xx_copy_fpin_pkt(vha,
+ (void **)&pkt, &rsp);
++ __update_rsp_in(follow_inptr, is_shadow_hba,
++ rsp, rsp_in);
+ if (!pure_item)
+ break;
+ qla24xx_queue_purex_item(vha, pure_item,
+diff --git a/drivers/scsi/qla2xxx/qla_os.c b/drivers/scsi/qla2xxx/qla_os.c
+index f9ad0847782d..3bbfce3ccf2e 100644
+--- a/drivers/scsi/qla2xxx/qla_os.c
++++ b/drivers/scsi/qla2xxx/qla_os.c
+@@ -338,6 +338,16 @@ module_param(ql2xdelay_before_pci_error_handling, uint, 0644);
+ MODULE_PARM_DESC(ql2xdelay_before_pci_error_handling,
+ "Number of seconds delayed before qla begin PCI error self-handling (default: 5).\n");
+
++int ql2xrspq_follow_inptr = 1;
++module_param(ql2xrspq_follow_inptr, int, 0644);
++MODULE_PARM_DESC(ql2xrspq_follow_inptr,
++ "Follow RSP IN pointer for RSP updates for HBAs 27xx and newer (default: 1).");
++
++int ql2xrspq_follow_inptr_legacy = 1;
++module_param(ql2xrspq_follow_inptr_legacy, int, 0644);
++MODULE_PARM_DESC(ql2xrspq_follow_inptr_legacy,
++ "Follow RSP IN pointer for RSP updates for HBAs older than 27XX. (default: 1).");
++
+ static void qla2x00_clear_drv_active(struct qla_hw_data *);
+ static void qla2x00_free_device(scsi_qla_host_t *);
+ static int qla2xxx_map_queues(struct Scsi_Host *shost);
+--
+2.35.1
+
--- /dev/null
+From 6467d2dbfbfe7cc5abe98d997e428cbfb58354d9 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 15 Jun 2022 22:35:01 -0700
+Subject: scsi: qla2xxx: Turn off multi-queue for 8G adapters
+
+From: Quinn Tran <qutran@marvell.com>
+
+[ Upstream commit 5304673bdb1635e27555bd636fd5d6956f1cd552 ]
+
+For 8G adapters, multi-queue was enabled accidentally. Make sure
+multi-queue is not enabled.
+
+Link: https://lore.kernel.org/r/20220616053508.27186-5-njavali@marvell.com
+Cc: stable@vger.kernel.org
+Signed-off-by: Quinn Tran <qutran@marvell.com>
+Signed-off-by: Nilesh Javali <njavali@marvell.com>
+Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/scsi/qla2xxx/qla_def.h | 4 ++--
+ drivers/scsi/qla2xxx/qla_isr.c | 16 ++++++----------
+ 2 files changed, 8 insertions(+), 12 deletions(-)
+
+diff --git a/drivers/scsi/qla2xxx/qla_def.h b/drivers/scsi/qla2xxx/qla_def.h
+index 4cbaea4b993e..01cdd5f8723c 100644
+--- a/drivers/scsi/qla2xxx/qla_def.h
++++ b/drivers/scsi/qla2xxx/qla_def.h
+@@ -4268,8 +4268,8 @@ struct qla_hw_data {
+ #define IS_OEM_001(ha) ((ha)->device_type & DT_OEM_001)
+ #define HAS_EXTENDED_IDS(ha) ((ha)->device_type & DT_EXTENDED_IDS)
+ #define IS_CT6_SUPPORTED(ha) ((ha)->device_type & DT_CT6_SUPPORTED)
+-#define IS_MQUE_CAPABLE(ha) ((ha)->mqenable || IS_QLA83XX(ha) || \
+- IS_QLA27XX(ha) || IS_QLA28XX(ha))
++#define IS_MQUE_CAPABLE(ha) (IS_QLA83XX(ha) || IS_QLA27XX(ha) || \
++ IS_QLA28XX(ha))
+ #define IS_BIDI_CAPABLE(ha) \
+ (IS_QLA25XX(ha) || IS_QLA2031(ha) || IS_QLA27XX(ha) || IS_QLA28XX(ha))
+ /* Bit 21 of fw_attributes decides the MCTP capabilities */
+diff --git a/drivers/scsi/qla2xxx/qla_isr.c b/drivers/scsi/qla2xxx/qla_isr.c
+index 5e99f559230f..de348628aa53 100644
+--- a/drivers/scsi/qla2xxx/qla_isr.c
++++ b/drivers/scsi/qla2xxx/qla_isr.c
+@@ -4419,16 +4419,12 @@ qla24xx_enable_msix(struct qla_hw_data *ha, struct rsp_que *rsp)
+ }
+
+ /* Enable MSI-X vector for response queue update for queue 0 */
+- if (IS_QLA83XX(ha) || IS_QLA27XX(ha) || IS_QLA28XX(ha)) {
+- if (ha->msixbase && ha->mqiobase &&
+- (ha->max_rsp_queues > 1 || ha->max_req_queues > 1 ||
+- ql2xmqsupport))
+- ha->mqenable = 1;
+- } else
+- if (ha->mqiobase &&
+- (ha->max_rsp_queues > 1 || ha->max_req_queues > 1 ||
+- ql2xmqsupport))
+- ha->mqenable = 1;
++ if (IS_MQUE_CAPABLE(ha) &&
++ (ha->msixbase && ha->mqiobase && ha->max_qpairs))
++ ha->mqenable = 1;
++ else
++ ha->mqenable = 0;
++
+ ql_dbg(ql_dbg_multiq, vha, 0xc005,
+ "mqiobase=%p, max_rsp_queues=%d, max_req_queues=%d.\n",
+ ha->mqiobase, ha->max_rsp_queues, ha->max_req_queues);
+--
+2.35.1
+
--- /dev/null
+From fc0719b2de5e78eb7fd2b53cd1e1f8a06e81653f Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 12 Jul 2022 22:20:44 -0700
+Subject: scsi: qla2xxx: Update manufacturer details
+
+From: Bikash Hazarika <bhazarika@marvell.com>
+
+[ Upstream commit 1ccad27716ecad1fd58c35e579bedb81fa5e1ad5 ]
+
+Update manufacturer details to indicate Marvell Semiconductors.
+
+Link: https://lore.kernel.org/r/20220713052045.10683-10-njavali@marvell.com
+Cc: stable@vger.kernel.org
+Reviewed-by: Himanshu Madhani <himanshu.madhani@oracle.com>
+Signed-off-by: Bikash Hazarika <bhazarika@marvell.com>
+Signed-off-by: Nilesh Javali <njavali@marvell.com>
+Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/scsi/qla2xxx/qla_def.h | 2 +-
+ drivers/scsi/qla2xxx/qla_gs.c | 2 +-
+ 2 files changed, 2 insertions(+), 2 deletions(-)
+
+diff --git a/drivers/scsi/qla2xxx/qla_def.h b/drivers/scsi/qla2xxx/qla_def.h
+index 01cdd5f8723c..17b8a4e86ffb 100644
+--- a/drivers/scsi/qla2xxx/qla_def.h
++++ b/drivers/scsi/qla2xxx/qla_def.h
+@@ -78,7 +78,7 @@ typedef union {
+ #include "qla_nvme.h"
+ #define QLA2XXX_DRIVER_NAME "qla2xxx"
+ #define QLA2XXX_APIDEV "ql2xapidev"
+-#define QLA2XXX_MANUFACTURER "QLogic Corporation"
++#define QLA2XXX_MANUFACTURER "Marvell Semiconductor, Inc."
+
+ /*
+ * We have MAILBOX_REGISTER_COUNT sized arrays in a few places,
+diff --git a/drivers/scsi/qla2xxx/qla_gs.c b/drivers/scsi/qla2xxx/qla_gs.c
+index 7ca734337000..64ab070b8716 100644
+--- a/drivers/scsi/qla2xxx/qla_gs.c
++++ b/drivers/scsi/qla2xxx/qla_gs.c
+@@ -1616,7 +1616,7 @@ qla2x00_hba_attributes(scsi_qla_host_t *vha, void *entries,
+ eiter->type = cpu_to_be16(FDMI_HBA_MANUFACTURER);
+ alen = scnprintf(
+ eiter->a.manufacturer, sizeof(eiter->a.manufacturer),
+- "%s", "QLogic Corporation");
++ "%s", QLA2XXX_MANUFACTURER);
+ alen += FDMI_ATTR_ALIGNMENT(alen);
+ alen += FDMI_ATTR_TYPELEN(eiter);
+ eiter->len = cpu_to_be16(alen);
+--
+2.35.1
+
--- /dev/null
+From 7d35e2215d13472f85fd7000bc0f76847bc4d08e Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 15 Jun 2022 22:35:00 -0700
+Subject: scsi: qla2xxx: Wind down adapter after PCIe error
+
+From: Quinn Tran <qutran@marvell.com>
+
+[ Upstream commit d3117c83ba316b3200d9f2fe900f2b9a5525a25c ]
+
+Put the adapter into a wind-down state if the OS does not make any attempt
+to recover the adapter after a PCIe error.
+
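+A compact standalone model of the staged wind-down added below
+(illustrative only; the driver keys off jiffies, ha->flags.eeh_busy and the
+new ql2xdelay_before_pci_error_handling module parameter, and runs this
+from its periodic timer):
+
+  #include <stdio.h>
+
+  enum eeh_flush_state { EEH_FLUSH_NONE, EEH_FLUSH_RDY, EEH_FLUSH_DONE };
+
+  static enum eeh_flush_state state = EEH_FLUSH_NONE;
+  static unsigned long eeh_stamp;                 /* seconds, analogue of eeh_jif */
+  static const unsigned long delay_before_handling = 5;    /* module parameter */
+
+  static void wind_down_tick(unsigned long now)
+  {
+      if (state == EEH_FLUSH_NONE &&
+          now >= eeh_stamp + delay_before_handling) {
+          printf("%lus: reset chip, disable interrupts\n", now);
+          state = EEH_FLUSH_RDY;
+          eeh_stamp = now;                        /* restart the 5s window */
+      } else if (state == EEH_FLUSH_RDY && now >= eeh_stamp + 5) {
+          printf("%lus: clear bus mastering, abort outstanding I/O\n", now);
+          state = EEH_FLUSH_DONE;
+      }
+  }
+
+  int main(void)
+  {
+      /* Error detected at t=0; the OS never starts PCI error recovery. */
+      for (unsigned long t = 0; t <= 12; t++)
+          wind_down_tick(t);
+      return 0;
+  }
+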
+Link: https://lore.kernel.org/r/20220616053508.27186-4-njavali@marvell.com
+Cc: stable@vger.kernel.org
+Signed-off-by: Quinn Tran <qutran@marvell.com>
+Signed-off-by: Nilesh Javali <njavali@marvell.com>
+Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/scsi/qla2xxx/qla_bsg.c | 10 ++++++-
+ drivers/scsi/qla2xxx/qla_def.h | 4 +++
+ drivers/scsi/qla2xxx/qla_init.c | 20 ++++++++++++++
+ drivers/scsi/qla2xxx/qla_os.c | 48 +++++++++++++++++++++++++++++++++
+ 4 files changed, 81 insertions(+), 1 deletion(-)
+
+diff --git a/drivers/scsi/qla2xxx/qla_bsg.c b/drivers/scsi/qla2xxx/qla_bsg.c
+index c2f00f076f79..726af9e40572 100644
+--- a/drivers/scsi/qla2xxx/qla_bsg.c
++++ b/drivers/scsi/qla2xxx/qla_bsg.c
+@@ -2975,6 +2975,13 @@ qla24xx_bsg_timeout(struct bsg_job *bsg_job)
+
+ ql_log(ql_log_info, vha, 0x708b, "%s CMD timeout. bsg ptr %p.\n",
+ __func__, bsg_job);
++
++ if (qla2x00_isp_reg_stat(ha)) {
++ ql_log(ql_log_info, vha, 0x9007,
++ "PCI/Register disconnect.\n");
++ qla_pci_set_eeh_busy(vha);
++ }
++
+ /* find the bsg job from the active list of commands */
+ spin_lock_irqsave(&ha->hardware_lock, flags);
+ for (que = 0; que < ha->max_req_queues; que++) {
+@@ -2992,7 +2999,8 @@ qla24xx_bsg_timeout(struct bsg_job *bsg_job)
+ sp->u.bsg_job == bsg_job) {
+ req->outstanding_cmds[cnt] = NULL;
+ spin_unlock_irqrestore(&ha->hardware_lock, flags);
+- if (ha->isp_ops->abort_command(sp)) {
++
++ if (!ha->flags.eeh_busy && ha->isp_ops->abort_command(sp)) {
+ ql_log(ql_log_warn, vha, 0x7089,
+ "mbx abort_command failed.\n");
+ bsg_reply->result = -EIO;
+diff --git a/drivers/scsi/qla2xxx/qla_def.h b/drivers/scsi/qla2xxx/qla_def.h
+index 4062d46f33a6..4cbaea4b993e 100644
+--- a/drivers/scsi/qla2xxx/qla_def.h
++++ b/drivers/scsi/qla2xxx/qla_def.h
+@@ -4048,6 +4048,9 @@ struct qla_hw_data {
+ uint32_t n2n_fw_acc_sec:1;
+ uint32_t plogi_template_valid:1;
+ uint32_t port_isolated:1;
++ uint32_t eeh_flush:2;
++#define EEH_FLUSH_RDY 1
++#define EEH_FLUSH_DONE 2
+ } flags;
+
+ uint16_t max_exchg;
+@@ -4082,6 +4085,7 @@ struct qla_hw_data {
+ uint32_t rsp_que_len;
+ uint32_t req_que_off;
+ uint32_t rsp_que_off;
++ unsigned long eeh_jif;
+
+ /* Multi queue data structs */
+ device_reg_t *mqiobase;
+diff --git a/drivers/scsi/qla2xxx/qla_init.c b/drivers/scsi/qla2xxx/qla_init.c
+index 7bd10b4ed9ed..a0d26e2e0ce1 100644
+--- a/drivers/scsi/qla2xxx/qla_init.c
++++ b/drivers/scsi/qla2xxx/qla_init.c
+@@ -47,6 +47,7 @@ qla2x00_sp_timeout(struct timer_list *t)
+ {
+ srb_t *sp = from_timer(sp, t, u.iocb_cmd.timer);
+ struct srb_iocb *iocb;
++ scsi_qla_host_t *vha = sp->vha;
+
+ WARN_ON(irqs_disabled());
+ iocb = &sp->u.iocb_cmd;
+@@ -54,6 +55,12 @@ qla2x00_sp_timeout(struct timer_list *t)
+
+ /* ref: TMR */
+ kref_put(&sp->cmd_kref, qla2x00_sp_release);
++
++ if (vha && qla2x00_isp_reg_stat(vha->hw)) {
++ ql_log(ql_log_info, vha, 0x9008,
++ "PCI/Register disconnect.\n");
++ qla_pci_set_eeh_busy(vha);
++ }
+ }
+
+ void qla2x00_sp_free(srb_t *sp)
+@@ -9669,6 +9676,12 @@ int qla2xxx_disable_port(struct Scsi_Host *host)
+
+ vha->hw->flags.port_isolated = 1;
+
++ if (qla2x00_isp_reg_stat(vha->hw)) {
++ ql_log(ql_log_info, vha, 0x9006,
++ "PCI/Register disconnect, exiting.\n");
++ qla_pci_set_eeh_busy(vha);
++ return FAILED;
++ }
+ if (qla2x00_chip_is_down(vha))
+ return 0;
+
+@@ -9684,6 +9697,13 @@ int qla2xxx_enable_port(struct Scsi_Host *host)
+ {
+ scsi_qla_host_t *vha = shost_priv(host);
+
++ if (qla2x00_isp_reg_stat(vha->hw)) {
++ ql_log(ql_log_info, vha, 0x9001,
++ "PCI/Register disconnect, exiting.\n");
++ qla_pci_set_eeh_busy(vha);
++ return FAILED;
++ }
++
+ vha->hw->flags.port_isolated = 0;
+ /* Set the flag to 1, so that isp_abort can proceed */
+ vha->flags.online = 1;
+diff --git a/drivers/scsi/qla2xxx/qla_os.c b/drivers/scsi/qla2xxx/qla_os.c
+index 3c68dad00d04..daa9a3c3f7b5 100644
+--- a/drivers/scsi/qla2xxx/qla_os.c
++++ b/drivers/scsi/qla2xxx/qla_os.c
+@@ -333,6 +333,11 @@ MODULE_PARM_DESC(ql2xabts_wait_nvme,
+ "To wait for ABTS response on I/O timeouts for NVMe. (default: 1)");
+
+
++u32 ql2xdelay_before_pci_error_handling = 5;
++module_param(ql2xdelay_before_pci_error_handling, uint, 0644);
++MODULE_PARM_DESC(ql2xdelay_before_pci_error_handling,
++ "Number of seconds delayed before qla begin PCI error self-handling (default: 5).\n");
++
+ static void qla2x00_clear_drv_active(struct qla_hw_data *);
+ static void qla2x00_free_device(scsi_qla_host_t *);
+ static int qla2xxx_map_queues(struct Scsi_Host *shost);
+@@ -7239,6 +7244,44 @@ static void qla_heart_beat(struct scsi_qla_host *vha, u16 dpc_started)
+ }
+ }
+
++static void qla_wind_down_chip(scsi_qla_host_t *vha)
++{
++ struct qla_hw_data *ha = vha->hw;
++
++ if (!ha->flags.eeh_busy)
++ return;
++ if (ha->pci_error_state)
++ /* system is trying to recover */
++ return;
++
++ /*
++ * Current system is not handling PCIE error. At this point, this is
++ * best effort to wind down the adapter.
++ */
++ if (time_after_eq(jiffies, ha->eeh_jif + ql2xdelay_before_pci_error_handling * HZ) &&
++ !ha->flags.eeh_flush) {
++ ql_log(ql_log_info, vha, 0x9009,
++ "PCI Error detected, attempting to reset hardware.\n");
++
++ ha->isp_ops->reset_chip(vha);
++ ha->isp_ops->disable_intrs(ha);
++
++ ha->flags.eeh_flush = EEH_FLUSH_RDY;
++ ha->eeh_jif = jiffies;
++
++ } else if (ha->flags.eeh_flush == EEH_FLUSH_RDY &&
++ time_after_eq(jiffies, ha->eeh_jif + 5 * HZ)) {
++ pci_clear_master(ha->pdev);
++
++ /* flush all command */
++ qla2x00_abort_isp_cleanup(vha);
++ ha->flags.eeh_flush = EEH_FLUSH_DONE;
++
++ ql_log(ql_log_info, vha, 0x900a,
++ "PCI Error handling complete, all IOs aborted.\n");
++ }
++}
++
+ /**************************************************************************
+ * qla2x00_timer
+ *
+@@ -7262,6 +7305,8 @@ qla2x00_timer(struct timer_list *t)
+ fc_port_t *fcport = NULL;
+
+ if (ha->flags.eeh_busy) {
++ qla_wind_down_chip(vha);
++
+ ql_dbg(ql_dbg_timer, vha, 0x6000,
+ "EEH = %d, restarting timer.\n",
+ ha->flags.eeh_busy);
+@@ -7842,6 +7887,9 @@ void qla_pci_set_eeh_busy(struct scsi_qla_host *vha)
+
+ spin_lock_irqsave(&base_vha->work_lock, flags);
+ if (!ha->flags.eeh_busy) {
++ ha->eeh_jif = jiffies;
++ ha->flags.eeh_flush = 0;
++
+ ha->flags.eeh_busy = 1;
+ do_cleanup = true;
+ }
+--
+2.35.1
+
--- /dev/null
+From 6690821f686011456d2aa183db7777be5132b6b0 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 18 Apr 2022 16:27:33 +0100
+Subject: serial: 8250: Add proper clock handling for OxSemi PCIe devices
+
+From: Maciej W. Rozycki <macro@orcam.me.uk>
+
+[ Upstream commit 366f6c955d4d1a5125ffcd6875ead26a3c7a2a1c ]
+
+Oxford Semiconductor PCIe (Tornado) 950 serial port devices are driven
+by a fixed 62.5MHz clock input derived from the 100MHz PCI Express clock.
+
+We currently drive the device using its default oversampling rate of 16
+and the clock prescaler disabled, consequently yielding the baud base of
+3906250. This base is inadequate for some of the high-speed baud rates
+such as 460800bps, for which the closest rate possible can be obtained
+by dividing the baud base by 8, yielding the baud rate of 488281.25bps,
+which is off by 5.9638%. This is enough for data communication to break
+with the remote end talking actual 460800bps, where missed stop bits
+have been observed.
+
+We can do better however, by taking advantage of a reduced oversampling
+rate, which can be set to any integer value from 4 to 16 inclusive by
+programming the TCR register, and by using the clock prescaler, which
+can be set to any value from 1 to 63.875 in increments of 0.125 in the
+CPR/CPR2 register pair. The prescaler has to be explicitly enabled
+though by setting bit 7 in the MCR or otherwise it is bypassed (in the
+enhanced mode that we enable) as if the value of 1 was used.
+
+Make use of these features then as follows:
+
+- Set the baud base to 15625000, reflecting the minimum oversampling
+ rate of 4 with the clock prescaler and divisor both set to 1.
+
+- Override the `set_mctrl' and set the MCR shadow there so as to have
+ MCR[7] always set and have the 8250 core propagate these settings.
+
+- Override the `get_divisor' handler and determine a good combination of
+ parameters by using a lookup table with predetermined value pairs of
+ the oversampling rate and the clock prescaler and finding a pair that
+ divides the input clock such that the quotient, when rounded to the
+ nearest integer, deviates the least from the exact result. Calculate
+ the clock divisor accordingly.
+
+ Scale the resulting oversampling rate (only by powers of two) if
+ possible so as to maximise it, reducing the divisor accordingly, and
+ avoid a divisor overflow for very low baud rates by scaling the
+ oversampling rate and/or the prescaler even if that causes some
+ accuracy loss.
+
+ Also handle the historic spd_cust feature so as to allow one to set
+ all the three parameters manually to arbitrary values, by keeping the
+ low 16 bits for the divisor and then putting TCR in bits 19:16 and
+ CPR/CPR2 in bits 28:20, sanitising the bit pattern supplied such as
+ to clamp CPR/CPR2 values between 0.000 and 0.875 inclusive to 33.875.
+ This preserves compatibility with any existing setups, that is where
+ requesting a custom divisor that only has any bits set among the low
+ 16 the oversampling rate of 16 and the clock prescaler of 33.875 will
+ be used as with the original 8250.
+
+ Finally abuse the `frac' argument to store the determined bit patterns
+ for the TCR, CPR and CPR2 registers.
+
+- Override the `set_divisor' handler so as to set the TCR, CPR and CPR2
+ registers from the `frac' value supplied. Set the divisor as usual.
+
+With the baud base set to 15625000 and the unsigned 16-bit UART_DIV_MAX
+limitation imposed by `serial8250_get_baud_rate' standard baud rates
+below 300bps become unavailable in the regular way, e.g. the rate of
+200bps requires the baud base to be divided by 78125 and that is beyond
+the unsigned 16-bit range. The historic spd_cust feature can still be
+used to obtain such rates if so required.
+
+See Documentation/tty/device_drivers/oxsemi-tornado.rst for more details.
+
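+As a quick standalone sanity check of the scheme above (assuming only the
+fixed 62.5 MHz input clock), one row of the table in the documentation file
+added below, tcr=7, cpr=3.875, div=5 for a requested 460800bps, works out
+to the quoted 460829.49bps, about 0.0064% high:
+
+  #include <stdio.h>
+
+  int main(void)
+  {
+      const double clk = 62500000.0;      /* 62.5 MHz input clock           */
+      const double cpr = 3.875;           /* clock prescaler, steps of 1/8  */
+      const double divisor = 5.0;         /* 16-bit clock divisor           */
+      const double tcr = 7.0;             /* oversampling rate, 4..16       */
+      const double req = 460800.0;
+      double actual = clk / cpr / divisor / tcr;
+
+      printf("actual %.2f bps, deviation %+.4f%%\n",
+             actual, (actual - req) / req * 100.0);
+      return 0;
+  }
+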
+Signed-off-by: Maciej W. Rozycki <macro@orcam.me.uk>
+Link: https://lore.kernel.org/r/alpine.DEB.2.21.2204181519450.9383@angie.orcam.me.uk
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ .../tty/device_drivers/oxsemi-tornado.rst | 129 +++++++
+ drivers/tty/serial/8250/8250_pci.c | 339 ++++++++++++++----
+ 2 files changed, 400 insertions(+), 68 deletions(-)
+ create mode 100644 Documentation/tty/device_drivers/oxsemi-tornado.rst
+
+diff --git a/Documentation/tty/device_drivers/oxsemi-tornado.rst b/Documentation/tty/device_drivers/oxsemi-tornado.rst
+new file mode 100644
+index 000000000000..0180d8bb0881
+--- /dev/null
++++ b/Documentation/tty/device_drivers/oxsemi-tornado.rst
+@@ -0,0 +1,129 @@
++.. SPDX-License-Identifier: GPL-2.0
++
++====================================================================
++Notes on Oxford Semiconductor PCIe (Tornado) 950 serial port devices
++====================================================================
++
++Oxford Semiconductor PCIe (Tornado) 950 serial port devices are driven
++by a fixed 62.5MHz clock input derived from the 100MHz PCI Express clock.
++
++The baud rate produced by the baud generator is obtained from this input
++frequency by dividing it by the clock prescaler, which can be set to any
++value from 1 to 63.875 in increments of 0.125, and then the usual 16-bit
++divisor is used as with the original 8250, to divide the frequency by a
++value from 1 to 65535. Finally a programmable oversampling rate is used
++that can take any value from 4 to 16 to divide the frequency further and
++determine the actual baud rate used. Baud rates from 15625000bps down
++to 0.933bps can be obtained this way.
++
++By default the oversampling rate is set to 16 and the clock prescaler is
++set to 33.875, meaning that the frequency to be used as the reference
++for the usual 16-bit divisor is 115313.653, which is close enough to the
++frequency of 115200 used by the original 8250 for the same values to be
++used for the divisor to obtain the requested baud rates by software that
++is unaware of the extra clock controls available.
++
++The oversampling rate is programmed with the TCR register and the clock
++prescaler is programmed with the CPR/CPR2 register pair[1][2][3][4].
++To switch away from the default value of 33.875 for the prescaler the
++enhanced mode has to be explicitly enabled though, by setting bit 4
++of the EFR. In that mode setting bit 7 in the MCR enables the prescaler
++or otherwise it is bypassed as if the value of 1 was used. Additionally
++writing any value to CPR clears CPR2 for compatibility with old software
++written for older conventional PCI Oxford Semiconductor devices that do
++not have the extra prescaler's 9th bit in CPR2, so the CPR/CPR2 register
++pair has to be programmed in the right order.
++
++By using these parameters rates from 15625000bps down to 1bps can be
++obtained, with either exact or highly-accurate actual bit rates for
++standard and many non-standard rates.
++
++Here are the figures for the standard and some non-standard baud rates
++(including those quoted in Oxford Semiconductor documentation), giving
++the requested rate (r), the actual rate yielded (a) and its deviation
++from the requested rate (d), and the values of the oversampling rate
++(tcr), the clock prescaler (cpr) and the divisor (div) produced by the
++new `get_divisor' handler:
++
++r: 15625000, a: 15625000.00, d: 0.0000%, tcr: 4, cpr: 1.000, div: 1
++r: 12500000, a: 12500000.00, d: 0.0000%, tcr: 5, cpr: 1.000, div: 1
++r: 10416666, a: 10416666.67, d: 0.0000%, tcr: 6, cpr: 1.000, div: 1
++r: 8928571, a: 8928571.43, d: 0.0000%, tcr: 7, cpr: 1.000, div: 1
++r: 7812500, a: 7812500.00, d: 0.0000%, tcr: 8, cpr: 1.000, div: 1
++r: 4000000, a: 4000000.00, d: 0.0000%, tcr: 5, cpr: 3.125, div: 1
++r: 3686400, a: 3676470.59, d: -0.2694%, tcr: 8, cpr: 2.125, div: 1
++r: 3500000, a: 3496503.50, d: -0.0999%, tcr: 13, cpr: 1.375, div: 1
++r: 3000000, a: 2976190.48, d: -0.7937%, tcr: 14, cpr: 1.500, div: 1
++r: 2500000, a: 2500000.00, d: 0.0000%, tcr: 10, cpr: 2.500, div: 1
++r: 2000000, a: 2000000.00, d: 0.0000%, tcr: 10, cpr: 3.125, div: 1
++r: 1843200, a: 1838235.29, d: -0.2694%, tcr: 16, cpr: 2.125, div: 1
++r: 1500000, a: 1492537.31, d: -0.4975%, tcr: 5, cpr: 8.375, div: 1
++r: 1152000, a: 1152073.73, d: 0.0064%, tcr: 14, cpr: 3.875, div: 1
++r: 921600, a: 919117.65, d: -0.2694%, tcr: 16, cpr: 2.125, div: 2
++r: 576000, a: 576036.87, d: 0.0064%, tcr: 14, cpr: 3.875, div: 2
++r: 460800, a: 460829.49, d: 0.0064%, tcr: 7, cpr: 3.875, div: 5
++r: 230400, a: 230414.75, d: 0.0064%, tcr: 14, cpr: 3.875, div: 5
++r: 115200, a: 115207.37, d: 0.0064%, tcr: 14, cpr: 1.250, div: 31
++r: 57600, a: 57603.69, d: 0.0064%, tcr: 8, cpr: 3.875, div: 35
++r: 38400, a: 38402.46, d: 0.0064%, tcr: 14, cpr: 3.875, div: 30
++r: 19200, a: 19201.23, d: 0.0064%, tcr: 8, cpr: 3.875, div: 105
++r: 9600, a: 9600.06, d: 0.0006%, tcr: 9, cpr: 1.125, div: 643
++r: 4800, a: 4799.98, d: -0.0004%, tcr: 7, cpr: 2.875, div: 647
++r: 2400, a: 2400.02, d: 0.0008%, tcr: 9, cpr: 2.250, div: 1286
++r: 1200, a: 1200.00, d: 0.0000%, tcr: 14, cpr: 2.875, div: 1294
++r: 300, a: 300.00, d: 0.0000%, tcr: 11, cpr: 2.625, div: 7215
++r: 200, a: 200.00, d: 0.0000%, tcr: 16, cpr: 1.250, div: 15625
++r: 150, a: 150.00, d: 0.0000%, tcr: 13, cpr: 2.250, div: 14245
++r: 134, a: 134.00, d: 0.0000%, tcr: 11, cpr: 2.625, div: 16153
++r: 110, a: 110.00, d: 0.0000%, tcr: 12, cpr: 1.000, div: 47348
++r: 75, a: 75.00, d: 0.0000%, tcr: 4, cpr: 5.875, div: 35461
++r: 50, a: 50.00, d: 0.0000%, tcr: 16, cpr: 1.250, div: 62500
++r: 25, a: 25.00, d: 0.0000%, tcr: 16, cpr: 2.500, div: 62500
++r: 4, a: 4.00, d: 0.0000%, tcr: 16, cpr: 20.000, div: 48828
++r: 2, a: 2.00, d: 0.0000%, tcr: 16, cpr: 40.000, div: 48828
++r: 1, a: 1.00, d: 0.0000%, tcr: 16, cpr: 63.875, div: 61154
++
++With the baud base set to 15625000 and the unsigned 16-bit UART_DIV_MAX
++limitation imposed by `serial8250_get_baud_rate' standard baud rates
++below 300bps become unavailable in the regular way, e.g. the rate of
++200bps requires the baud base to be divided by 78125 and that is beyond
++the unsigned 16-bit range. The historic spd_cust feature can still be
++used by encoding the values for, the prescaler, the oversampling rate
++and the clock divisor (DLM/DLL) as follows to obtain such rates if so
++required:
++
++ 31 29 28 20 19 16 15 0
+++-----+-----------------+-------+-------------------------------+
++|0 0 0| CPR2:CPR | TCR | DLM:DLL |
+++-----+-----------------+-------+-------------------------------+
++
++Use a value such encoded for the `custom_divisor' field along with the
++ASYNC_SPD_CUST flag set in the `flags' field in `struct serial_struct'
++passed with the TIOCSSERIAL ioctl(2), such as with the setserial(8)
++utility and its `divisor' and `spd_cust' parameters, and the select
++the baud rate of 38400bps. Note that the value of 0 in TCR sets the
++oversampling rate to 16 and prescaler values below 1 in CPR2/CPR are
++clamped by the driver to 1.
++
++For example the value of 0x1f4004e2 will set CPR2/CPR, TCR and DLM/DLL
++respectively to 0x1f4, 0x0 and 0x04e2, choosing the prescaler value,
++the oversampling rate and the clock divisor of 62.500, 16 and 1250
++respectively. These parameters will set the baud rate for the serial
++port to 62500000 / 62.500 / 1250 / 16 = 50bps.
++
++References:
++
++[1] "OXPCIe200 PCI Express Multi-Port Bridge", Oxford Semiconductor,
++ Inc., DS-0045, 10 Nov 2008, Section "950 Mode", pp. 64-65
++
++[2] "OXPCIe952 PCI Express Bridge to Dual Serial & Parallel Port",
++ Oxford Semiconductor, Inc., DS-0046, Mar 06 08, Section "950 Mode",
++ p. 20
++
++[3] "OXPCIe954 PCI Express Bridge to Quad Serial Port", Oxford
++ Semiconductor, Inc., DS-0047, Feb 08, Section "950 Mode", p. 20
++
++[4] "OXPCIe958 PCI Express Bridge to Octal Serial Port", Oxford
++ Semiconductor, Inc., DS-0048, Feb 08, Section "950 Mode", p. 20
++
++Maciej W. Rozycki <macro@orcam.me.uk>
+diff --git a/drivers/tty/serial/8250/8250_pci.c b/drivers/tty/serial/8250/8250_pci.c
+index 4b0e84e01e55..818ed6cd3132 100644
+--- a/drivers/tty/serial/8250/8250_pci.c
++++ b/drivers/tty/serial/8250/8250_pci.c
+@@ -11,6 +11,7 @@
+ #include <linux/pci.h>
+ #include <linux/string.h>
+ #include <linux/kernel.h>
++#include <linux/math.h>
+ #include <linux/slab.h>
+ #include <linux/delay.h>
+ #include <linux/tty.h>
+@@ -1044,6 +1045,208 @@ static int pci_oxsemi_tornado_init(struct pci_dev *dev)
+ return number_uarts;
+ }
+
++/* Tornado-specific constants for the TCR and CPR registers; see below. */
++#define OXSEMI_TORNADO_TCR_MASK 0xf
++#define OXSEMI_TORNADO_CPR_MASK 0x1ff
++#define OXSEMI_TORNADO_CPR_MIN 0x008
++#define OXSEMI_TORNADO_CPR_DEF 0x10f
++
++/*
++ * Determine the oversampling rate, the clock prescaler, and the clock
++ * divisor for the requested baud rate. The clock rate is 62.5 MHz,
++ * which is four times the baud base, and the prescaler increments in
++ * steps of 1/8. Therefore to make calculations on integers we need
++ * to use a scaled clock rate, which is the baud base multiplied by 32
++ * (or our assumed UART clock rate multiplied by 2).
++ *
++ * The allowed oversampling rates are from 4 up to 16 inclusive (values
++ * from 0 to 3 inclusive map to 16). Likewise the clock prescaler allows
++ * values between 1.000 and 63.875 inclusive (operation for values from
++ * 0.000 to 0.875 has not been specified). The clock divisor is the usual
++ * unsigned 16-bit integer.
++ *
++ * For the most accurate baud rate we use a table of predetermined
++ * oversampling rates and clock prescalers that records all possible
++ * products of the two parameters in the range from 4 up to 255 inclusive,
++ * and additionally 335 for the 1500000bps rate, with the prescaler scaled
++ * by 8. The table is sorted by the decreasing value of the oversampling
++ * rate and ties are resolved by sorting by the decreasing value of the
++ * product. This way preference is given to higher oversampling rates.
++ *
++ * We iterate over the table and choose the product of an oversampling
++ * rate and a clock prescaler that gives the lowest integer division
++ * result deviation, or if an exact integer divider is found we stop
++ * looking for it right away. We do some fixup if the resulting clock
++ * divisor required would be out of its unsigned 16-bit integer range.
++ *
++ * Finally we abuse the supposed fractional part returned to encode the
++ * 4-bit value of the oversampling rate and the 9-bit value of the clock
++ * prescaler which will end up in the TCR and CPR/CPR2 registers.
++ */
++static unsigned int pci_oxsemi_tornado_get_divisor(struct uart_port *port,
++ unsigned int baud,
++ unsigned int *frac)
++{
++ static u8 p[][2] = {
++ { 16, 14, }, { 16, 13, }, { 16, 12, }, { 16, 11, },
++ { 16, 10, }, { 16, 9, }, { 16, 8, }, { 15, 17, },
++ { 15, 16, }, { 15, 15, }, { 15, 14, }, { 15, 13, },
++ { 15, 12, }, { 15, 11, }, { 15, 10, }, { 15, 9, },
++ { 15, 8, }, { 14, 18, }, { 14, 17, }, { 14, 14, },
++ { 14, 13, }, { 14, 12, }, { 14, 11, }, { 14, 10, },
++ { 14, 9, }, { 14, 8, }, { 13, 19, }, { 13, 18, },
++ { 13, 17, }, { 13, 13, }, { 13, 12, }, { 13, 11, },
++ { 13, 10, }, { 13, 9, }, { 13, 8, }, { 12, 19, },
++ { 12, 18, }, { 12, 17, }, { 12, 11, }, { 12, 9, },
++ { 12, 8, }, { 11, 23, }, { 11, 22, }, { 11, 21, },
++ { 11, 20, }, { 11, 19, }, { 11, 18, }, { 11, 17, },
++ { 11, 11, }, { 11, 10, }, { 11, 9, }, { 11, 8, },
++ { 10, 25, }, { 10, 23, }, { 10, 20, }, { 10, 19, },
++ { 10, 17, }, { 10, 10, }, { 10, 9, }, { 10, 8, },
++ { 9, 27, }, { 9, 23, }, { 9, 21, }, { 9, 19, },
++ { 9, 18, }, { 9, 17, }, { 9, 9, }, { 9, 8, },
++ { 8, 31, }, { 8, 29, }, { 8, 23, }, { 8, 19, },
++ { 8, 17, }, { 8, 8, }, { 7, 35, }, { 7, 31, },
++ { 7, 29, }, { 7, 25, }, { 7, 23, }, { 7, 21, },
++ { 7, 19, }, { 7, 17, }, { 7, 15, }, { 7, 14, },
++ { 7, 13, }, { 7, 12, }, { 7, 11, }, { 7, 10, },
++ { 7, 9, }, { 7, 8, }, { 6, 41, }, { 6, 37, },
++ { 6, 31, }, { 6, 29, }, { 6, 23, }, { 6, 19, },
++ { 6, 17, }, { 6, 13, }, { 6, 11, }, { 6, 10, },
++ { 6, 9, }, { 6, 8, }, { 5, 67, }, { 5, 47, },
++ { 5, 43, }, { 5, 41, }, { 5, 37, }, { 5, 31, },
++ { 5, 29, }, { 5, 25, }, { 5, 23, }, { 5, 19, },
++ { 5, 17, }, { 5, 15, }, { 5, 13, }, { 5, 11, },
++ { 5, 10, }, { 5, 9, }, { 5, 8, }, { 4, 61, },
++ { 4, 59, }, { 4, 53, }, { 4, 47, }, { 4, 43, },
++ { 4, 41, }, { 4, 37, }, { 4, 31, }, { 4, 29, },
++ { 4, 23, }, { 4, 19, }, { 4, 17, }, { 4, 13, },
++ { 4, 9, }, { 4, 8, },
++ };
++ /* Scale the quotient for comparison to get the fractional part. */
++ const unsigned int quot_scale = 65536;
++ unsigned int sclk = port->uartclk * 2;
++ unsigned int sdiv = DIV_ROUND_CLOSEST(sclk, baud);
++ unsigned int best_squot;
++ unsigned int squot;
++ unsigned int quot;
++ u16 cpr;
++ u8 tcr;
++ int i;
++
++ /* Old custom speed handling. */
++ if (baud == 38400 && (port->flags & UPF_SPD_MASK) == UPF_SPD_CUST) {
++ unsigned int cust_div = port->custom_divisor;
++
++ quot = cust_div & UART_DIV_MAX;
++ tcr = (cust_div >> 16) & OXSEMI_TORNADO_TCR_MASK;
++ cpr = (cust_div >> 20) & OXSEMI_TORNADO_CPR_MASK;
++ if (cpr < OXSEMI_TORNADO_CPR_MIN)
++ cpr = OXSEMI_TORNADO_CPR_DEF;
++ } else {
++ best_squot = quot_scale;
++ for (i = 0; i < ARRAY_SIZE(p); i++) {
++ unsigned int spre;
++ unsigned int srem;
++ u8 cp;
++ u8 tc;
++
++ tc = p[i][0];
++ cp = p[i][1];
++ spre = tc * cp;
++
++ srem = sdiv % spre;
++ if (srem > spre / 2)
++ srem = spre - srem;
++ squot = DIV_ROUND_CLOSEST(srem * quot_scale, spre);
++
++ if (srem == 0) {
++ tcr = tc;
++ cpr = cp;
++ quot = sdiv / spre;
++ break;
++ } else if (squot < best_squot) {
++ best_squot = squot;
++ tcr = tc;
++ cpr = cp;
++ quot = DIV_ROUND_CLOSEST(sdiv, spre);
++ }
++ }
++ while (tcr <= (OXSEMI_TORNADO_TCR_MASK + 1) >> 1 &&
++ quot % 2 == 0) {
++ quot >>= 1;
++ tcr <<= 1;
++ }
++ while (quot > UART_DIV_MAX) {
++ if (tcr <= (OXSEMI_TORNADO_TCR_MASK + 1) >> 1) {
++ quot >>= 1;
++ tcr <<= 1;
++ } else if (cpr <= OXSEMI_TORNADO_CPR_MASK >> 1) {
++ quot >>= 1;
++ cpr <<= 1;
++ } else {
++ quot = quot * cpr / OXSEMI_TORNADO_CPR_MASK;
++ cpr = OXSEMI_TORNADO_CPR_MASK;
++ }
++ }
++ }
++
++ *frac = (cpr << 8) | (tcr & OXSEMI_TORNADO_TCR_MASK);
++ return quot;
++}
++
++/*
++ * Set the oversampling rate in the transmitter clock cycle register (TCR),
++ * the clock prescaler in the clock prescaler register (CPR and CPR2), and
++ * the clock divisor in the divisor latch (DLL and DLM). Note that for
++ * backwards compatibility any write to CPR clears CPR2 and therefore CPR
++ * has to be written first, followed by CPR2, which occupies the location
++ * of CKS used with earlier UART designs.
++ */
++static void pci_oxsemi_tornado_set_divisor(struct uart_port *port,
++ unsigned int baud,
++ unsigned int quot,
++ unsigned int quot_frac)
++{
++ struct uart_8250_port *up = up_to_u8250p(port);
++ u8 cpr2 = quot_frac >> 16;
++ u8 cpr = quot_frac >> 8;
++ u8 tcr = quot_frac;
++
++ serial_icr_write(up, UART_TCR, tcr);
++ serial_icr_write(up, UART_CPR, cpr);
++ serial_icr_write(up, UART_CKS, cpr2);
++ serial8250_do_set_divisor(port, baud, quot, 0);
++}
++
++/*
++ * For Tornado devices we force MCR[7] set for the Divide-by-M N/8 baud rate
++ * generator prescaler (CPR and CPR2). Otherwise no prescaler would be used.
++ */
++static void pci_oxsemi_tornado_set_mctrl(struct uart_port *port,
++ unsigned int mctrl)
++{
++ struct uart_8250_port *up = up_to_u8250p(port);
++
++ up->mcr |= UART_MCR_CLKSEL;
++ serial8250_do_set_mctrl(port, mctrl);
++}
++
++static int pci_oxsemi_tornado_setup(struct serial_private *priv,
++ const struct pciserial_board *board,
++ struct uart_8250_port *up, int idx)
++{
++ struct pci_dev *dev = priv->dev;
++
++ if (pci_oxsemi_tornado_p(dev)) {
++ up->port.get_divisor = pci_oxsemi_tornado_get_divisor;
++ up->port.set_divisor = pci_oxsemi_tornado_set_divisor;
++ up->port.set_mctrl = pci_oxsemi_tornado_set_mctrl;
++ }
++
++ return pci_default_setup(priv, board, up, idx);
++}
++
+ static int pci_asix_setup(struct serial_private *priv,
+ const struct pciserial_board *board,
+ struct uart_8250_port *port, int idx)
+@@ -2245,7 +2448,7 @@ static struct pci_serial_quirk pci_serial_quirks[] = {
+ .subvendor = PCI_ANY_ID,
+ .subdevice = PCI_ANY_ID,
+ .init = pci_oxsemi_tornado_init,
+- .setup = pci_default_setup,
++ .setup = pci_oxsemi_tornado_setup,
+ },
+ {
+ .vendor = PCI_VENDOR_ID_MAINPINE,
+@@ -2253,7 +2456,7 @@ static struct pci_serial_quirk pci_serial_quirks[] = {
+ .subvendor = PCI_ANY_ID,
+ .subdevice = PCI_ANY_ID,
+ .init = pci_oxsemi_tornado_init,
+- .setup = pci_default_setup,
++ .setup = pci_oxsemi_tornado_setup,
+ },
+ {
+ .vendor = PCI_VENDOR_ID_DIGI,
+@@ -2261,7 +2464,7 @@ static struct pci_serial_quirk pci_serial_quirks[] = {
+ .subvendor = PCI_SUBVENDOR_ID_IBM,
+ .subdevice = PCI_ANY_ID,
+ .init = pci_oxsemi_tornado_init,
+- .setup = pci_default_setup,
++ .setup = pci_oxsemi_tornado_setup,
+ },
+ {
+ .vendor = PCI_VENDOR_ID_INTEL,
+@@ -2578,7 +2781,7 @@ enum pci_board_num_t {
+ pbn_b0_2_1843200,
+ pbn_b0_4_1843200,
+
+- pbn_b0_1_3906250,
++ pbn_b0_1_15625000,
+
+ pbn_b0_bt_1_115200,
+ pbn_b0_bt_2_115200,
+@@ -2657,10 +2860,10 @@ enum pci_board_num_t {
+ pbn_panacom4,
+ pbn_plx_romulus,
+ pbn_oxsemi,
+- pbn_oxsemi_1_3906250,
+- pbn_oxsemi_2_3906250,
+- pbn_oxsemi_4_3906250,
+- pbn_oxsemi_8_3906250,
++ pbn_oxsemi_1_15625000,
++ pbn_oxsemi_2_15625000,
++ pbn_oxsemi_4_15625000,
++ pbn_oxsemi_8_15625000,
+ pbn_intel_i960,
+ pbn_sgi_ioc3,
+ pbn_computone_4,
+@@ -2803,10 +3006,10 @@ static struct pciserial_board pci_boards[] = {
+ .uart_offset = 8,
+ },
+
+- [pbn_b0_1_3906250] = {
++ [pbn_b0_1_15625000] = {
+ .flags = FL_BASE0,
+ .num_ports = 1,
+- .base_baud = 3906250,
++ .base_baud = 15625000,
+ .uart_offset = 8,
+ },
+
+@@ -3187,31 +3390,31 @@ static struct pciserial_board pci_boards[] = {
+ .base_baud = 115200,
+ .uart_offset = 8,
+ },
+- [pbn_oxsemi_1_3906250] = {
++ [pbn_oxsemi_1_15625000] = {
+ .flags = FL_BASE0,
+ .num_ports = 1,
+- .base_baud = 3906250,
++ .base_baud = 15625000,
+ .uart_offset = 0x200,
+ .first_offset = 0x1000,
+ },
+- [pbn_oxsemi_2_3906250] = {
++ [pbn_oxsemi_2_15625000] = {
+ .flags = FL_BASE0,
+ .num_ports = 2,
+- .base_baud = 3906250,
++ .base_baud = 15625000,
+ .uart_offset = 0x200,
+ .first_offset = 0x1000,
+ },
+- [pbn_oxsemi_4_3906250] = {
++ [pbn_oxsemi_4_15625000] = {
+ .flags = FL_BASE0,
+ .num_ports = 4,
+- .base_baud = 3906250,
++ .base_baud = 15625000,
+ .uart_offset = 0x200,
+ .first_offset = 0x1000,
+ },
+- [pbn_oxsemi_8_3906250] = {
++ [pbn_oxsemi_8_15625000] = {
+ .flags = FL_BASE0,
+ .num_ports = 8,
+- .base_baud = 3906250,
++ .base_baud = 15625000,
+ .uart_offset = 0x200,
+ .first_offset = 0x1000,
+ },
+@@ -4192,165 +4395,165 @@ static const struct pci_device_id serial_pci_tbl[] = {
+ */
+ { PCI_VENDOR_ID_OXSEMI, 0xc101, /* OXPCIe952 1 Legacy UART */
+ PCI_ANY_ID, PCI_ANY_ID, 0, 0,
+- pbn_b0_1_3906250 },
++ pbn_b0_1_15625000 },
+ { PCI_VENDOR_ID_OXSEMI, 0xc105, /* OXPCIe952 1 Legacy UART */
+ PCI_ANY_ID, PCI_ANY_ID, 0, 0,
+- pbn_b0_1_3906250 },
++ pbn_b0_1_15625000 },
+ { PCI_VENDOR_ID_OXSEMI, 0xc11b, /* OXPCIe952 1 Native UART */
+ PCI_ANY_ID, PCI_ANY_ID, 0, 0,
+- pbn_oxsemi_1_3906250 },
++ pbn_oxsemi_1_15625000 },
+ { PCI_VENDOR_ID_OXSEMI, 0xc11f, /* OXPCIe952 1 Native UART */
+ PCI_ANY_ID, PCI_ANY_ID, 0, 0,
+- pbn_oxsemi_1_3906250 },
++ pbn_oxsemi_1_15625000 },
+ { PCI_VENDOR_ID_OXSEMI, 0xc120, /* OXPCIe952 1 Legacy UART */
+ PCI_ANY_ID, PCI_ANY_ID, 0, 0,
+- pbn_b0_1_3906250 },
++ pbn_b0_1_15625000 },
+ { PCI_VENDOR_ID_OXSEMI, 0xc124, /* OXPCIe952 1 Legacy UART */
+ PCI_ANY_ID, PCI_ANY_ID, 0, 0,
+- pbn_b0_1_3906250 },
++ pbn_b0_1_15625000 },
+ { PCI_VENDOR_ID_OXSEMI, 0xc138, /* OXPCIe952 1 Native UART */
+ PCI_ANY_ID, PCI_ANY_ID, 0, 0,
+- pbn_oxsemi_1_3906250 },
++ pbn_oxsemi_1_15625000 },
+ { PCI_VENDOR_ID_OXSEMI, 0xc13d, /* OXPCIe952 1 Native UART */
+ PCI_ANY_ID, PCI_ANY_ID, 0, 0,
+- pbn_oxsemi_1_3906250 },
++ pbn_oxsemi_1_15625000 },
+ { PCI_VENDOR_ID_OXSEMI, 0xc140, /* OXPCIe952 1 Legacy UART */
+ PCI_ANY_ID, PCI_ANY_ID, 0, 0,
+- pbn_b0_1_3906250 },
++ pbn_b0_1_15625000 },
+ { PCI_VENDOR_ID_OXSEMI, 0xc141, /* OXPCIe952 1 Legacy UART */
+ PCI_ANY_ID, PCI_ANY_ID, 0, 0,
+- pbn_b0_1_3906250 },
++ pbn_b0_1_15625000 },
+ { PCI_VENDOR_ID_OXSEMI, 0xc144, /* OXPCIe952 1 Legacy UART */
+ PCI_ANY_ID, PCI_ANY_ID, 0, 0,
+- pbn_b0_1_3906250 },
++ pbn_b0_1_15625000 },
+ { PCI_VENDOR_ID_OXSEMI, 0xc145, /* OXPCIe952 1 Legacy UART */
+ PCI_ANY_ID, PCI_ANY_ID, 0, 0,
+- pbn_b0_1_3906250 },
++ pbn_b0_1_15625000 },
+ { PCI_VENDOR_ID_OXSEMI, 0xc158, /* OXPCIe952 2 Native UART */
+ PCI_ANY_ID, PCI_ANY_ID, 0, 0,
+- pbn_oxsemi_2_3906250 },
++ pbn_oxsemi_2_15625000 },
+ { PCI_VENDOR_ID_OXSEMI, 0xc15d, /* OXPCIe952 2 Native UART */
+ PCI_ANY_ID, PCI_ANY_ID, 0, 0,
+- pbn_oxsemi_2_3906250 },
++ pbn_oxsemi_2_15625000 },
+ { PCI_VENDOR_ID_OXSEMI, 0xc208, /* OXPCIe954 4 Native UART */
+ PCI_ANY_ID, PCI_ANY_ID, 0, 0,
+- pbn_oxsemi_4_3906250 },
++ pbn_oxsemi_4_15625000 },
+ { PCI_VENDOR_ID_OXSEMI, 0xc20d, /* OXPCIe954 4 Native UART */
+ PCI_ANY_ID, PCI_ANY_ID, 0, 0,
+- pbn_oxsemi_4_3906250 },
++ pbn_oxsemi_4_15625000 },
+ { PCI_VENDOR_ID_OXSEMI, 0xc308, /* OXPCIe958 8 Native UART */
+ PCI_ANY_ID, PCI_ANY_ID, 0, 0,
+- pbn_oxsemi_8_3906250 },
++ pbn_oxsemi_8_15625000 },
+ { PCI_VENDOR_ID_OXSEMI, 0xc30d, /* OXPCIe958 8 Native UART */
+ PCI_ANY_ID, PCI_ANY_ID, 0, 0,
+- pbn_oxsemi_8_3906250 },
++ pbn_oxsemi_8_15625000 },
+ { PCI_VENDOR_ID_OXSEMI, 0xc40b, /* OXPCIe200 1 Native UART */
+ PCI_ANY_ID, PCI_ANY_ID, 0, 0,
+- pbn_oxsemi_1_3906250 },
++ pbn_oxsemi_1_15625000 },
+ { PCI_VENDOR_ID_OXSEMI, 0xc40f, /* OXPCIe200 1 Native UART */
+ PCI_ANY_ID, PCI_ANY_ID, 0, 0,
+- pbn_oxsemi_1_3906250 },
++ pbn_oxsemi_1_15625000 },
+ { PCI_VENDOR_ID_OXSEMI, 0xc41b, /* OXPCIe200 1 Native UART */
+ PCI_ANY_ID, PCI_ANY_ID, 0, 0,
+- pbn_oxsemi_1_3906250 },
++ pbn_oxsemi_1_15625000 },
+ { PCI_VENDOR_ID_OXSEMI, 0xc41f, /* OXPCIe200 1 Native UART */
+ PCI_ANY_ID, PCI_ANY_ID, 0, 0,
+- pbn_oxsemi_1_3906250 },
++ pbn_oxsemi_1_15625000 },
+ { PCI_VENDOR_ID_OXSEMI, 0xc42b, /* OXPCIe200 1 Native UART */
+ PCI_ANY_ID, PCI_ANY_ID, 0, 0,
+- pbn_oxsemi_1_3906250 },
++ pbn_oxsemi_1_15625000 },
+ { PCI_VENDOR_ID_OXSEMI, 0xc42f, /* OXPCIe200 1 Native UART */
+ PCI_ANY_ID, PCI_ANY_ID, 0, 0,
+- pbn_oxsemi_1_3906250 },
++ pbn_oxsemi_1_15625000 },
+ { PCI_VENDOR_ID_OXSEMI, 0xc43b, /* OXPCIe200 1 Native UART */
+ PCI_ANY_ID, PCI_ANY_ID, 0, 0,
+- pbn_oxsemi_1_3906250 },
++ pbn_oxsemi_1_15625000 },
+ { PCI_VENDOR_ID_OXSEMI, 0xc43f, /* OXPCIe200 1 Native UART */
+ PCI_ANY_ID, PCI_ANY_ID, 0, 0,
+- pbn_oxsemi_1_3906250 },
++ pbn_oxsemi_1_15625000 },
+ { PCI_VENDOR_ID_OXSEMI, 0xc44b, /* OXPCIe200 1 Native UART */
+ PCI_ANY_ID, PCI_ANY_ID, 0, 0,
+- pbn_oxsemi_1_3906250 },
++ pbn_oxsemi_1_15625000 },
+ { PCI_VENDOR_ID_OXSEMI, 0xc44f, /* OXPCIe200 1 Native UART */
+ PCI_ANY_ID, PCI_ANY_ID, 0, 0,
+- pbn_oxsemi_1_3906250 },
++ pbn_oxsemi_1_15625000 },
+ { PCI_VENDOR_ID_OXSEMI, 0xc45b, /* OXPCIe200 1 Native UART */
+ PCI_ANY_ID, PCI_ANY_ID, 0, 0,
+- pbn_oxsemi_1_3906250 },
++ pbn_oxsemi_1_15625000 },
+ { PCI_VENDOR_ID_OXSEMI, 0xc45f, /* OXPCIe200 1 Native UART */
+ PCI_ANY_ID, PCI_ANY_ID, 0, 0,
+- pbn_oxsemi_1_3906250 },
++ pbn_oxsemi_1_15625000 },
+ { PCI_VENDOR_ID_OXSEMI, 0xc46b, /* OXPCIe200 1 Native UART */
+ PCI_ANY_ID, PCI_ANY_ID, 0, 0,
+- pbn_oxsemi_1_3906250 },
++ pbn_oxsemi_1_15625000 },
+ { PCI_VENDOR_ID_OXSEMI, 0xc46f, /* OXPCIe200 1 Native UART */
+ PCI_ANY_ID, PCI_ANY_ID, 0, 0,
+- pbn_oxsemi_1_3906250 },
++ pbn_oxsemi_1_15625000 },
+ { PCI_VENDOR_ID_OXSEMI, 0xc47b, /* OXPCIe200 1 Native UART */
+ PCI_ANY_ID, PCI_ANY_ID, 0, 0,
+- pbn_oxsemi_1_3906250 },
++ pbn_oxsemi_1_15625000 },
+ { PCI_VENDOR_ID_OXSEMI, 0xc47f, /* OXPCIe200 1 Native UART */
+ PCI_ANY_ID, PCI_ANY_ID, 0, 0,
+- pbn_oxsemi_1_3906250 },
++ pbn_oxsemi_1_15625000 },
+ { PCI_VENDOR_ID_OXSEMI, 0xc48b, /* OXPCIe200 1 Native UART */
+ PCI_ANY_ID, PCI_ANY_ID, 0, 0,
+- pbn_oxsemi_1_3906250 },
++ pbn_oxsemi_1_15625000 },
+ { PCI_VENDOR_ID_OXSEMI, 0xc48f, /* OXPCIe200 1 Native UART */
+ PCI_ANY_ID, PCI_ANY_ID, 0, 0,
+- pbn_oxsemi_1_3906250 },
++ pbn_oxsemi_1_15625000 },
+ { PCI_VENDOR_ID_OXSEMI, 0xc49b, /* OXPCIe200 1 Native UART */
+ PCI_ANY_ID, PCI_ANY_ID, 0, 0,
+- pbn_oxsemi_1_3906250 },
++ pbn_oxsemi_1_15625000 },
+ { PCI_VENDOR_ID_OXSEMI, 0xc49f, /* OXPCIe200 1 Native UART */
+ PCI_ANY_ID, PCI_ANY_ID, 0, 0,
+- pbn_oxsemi_1_3906250 },
++ pbn_oxsemi_1_15625000 },
+ { PCI_VENDOR_ID_OXSEMI, 0xc4ab, /* OXPCIe200 1 Native UART */
+ PCI_ANY_ID, PCI_ANY_ID, 0, 0,
+- pbn_oxsemi_1_3906250 },
++ pbn_oxsemi_1_15625000 },
+ { PCI_VENDOR_ID_OXSEMI, 0xc4af, /* OXPCIe200 1 Native UART */
+ PCI_ANY_ID, PCI_ANY_ID, 0, 0,
+- pbn_oxsemi_1_3906250 },
++ pbn_oxsemi_1_15625000 },
+ { PCI_VENDOR_ID_OXSEMI, 0xc4bb, /* OXPCIe200 1 Native UART */
+ PCI_ANY_ID, PCI_ANY_ID, 0, 0,
+- pbn_oxsemi_1_3906250 },
++ pbn_oxsemi_1_15625000 },
+ { PCI_VENDOR_ID_OXSEMI, 0xc4bf, /* OXPCIe200 1 Native UART */
+ PCI_ANY_ID, PCI_ANY_ID, 0, 0,
+- pbn_oxsemi_1_3906250 },
++ pbn_oxsemi_1_15625000 },
+ { PCI_VENDOR_ID_OXSEMI, 0xc4cb, /* OXPCIe200 1 Native UART */
+ PCI_ANY_ID, PCI_ANY_ID, 0, 0,
+- pbn_oxsemi_1_3906250 },
++ pbn_oxsemi_1_15625000 },
+ { PCI_VENDOR_ID_OXSEMI, 0xc4cf, /* OXPCIe200 1 Native UART */
+ PCI_ANY_ID, PCI_ANY_ID, 0, 0,
+- pbn_oxsemi_1_3906250 },
++ pbn_oxsemi_1_15625000 },
+ /*
+ * Mainpine Inc. IQ Express "Rev3" utilizing OxSemi Tornado
+ */
+ { PCI_VENDOR_ID_MAINPINE, 0x4000, /* IQ Express 1 Port V.34 Super-G3 Fax */
+ PCI_VENDOR_ID_MAINPINE, 0x4001, 0, 0,
+- pbn_oxsemi_1_3906250 },
++ pbn_oxsemi_1_15625000 },
+ { PCI_VENDOR_ID_MAINPINE, 0x4000, /* IQ Express 2 Port V.34 Super-G3 Fax */
+ PCI_VENDOR_ID_MAINPINE, 0x4002, 0, 0,
+- pbn_oxsemi_2_3906250 },
++ pbn_oxsemi_2_15625000 },
+ { PCI_VENDOR_ID_MAINPINE, 0x4000, /* IQ Express 4 Port V.34 Super-G3 Fax */
+ PCI_VENDOR_ID_MAINPINE, 0x4004, 0, 0,
+- pbn_oxsemi_4_3906250 },
++ pbn_oxsemi_4_15625000 },
+ { PCI_VENDOR_ID_MAINPINE, 0x4000, /* IQ Express 8 Port V.34 Super-G3 Fax */
+ PCI_VENDOR_ID_MAINPINE, 0x4008, 0, 0,
+- pbn_oxsemi_8_3906250 },
++ pbn_oxsemi_8_15625000 },
+
+ /*
+ * Digi/IBM PCIe 2-port Async EIA-232 Adapter utilizing OxSemi Tornado
+ */
+ { PCI_VENDOR_ID_DIGI, PCIE_DEVICE_ID_NEO_2_OX_IBM,
+ PCI_SUBVENDOR_ID_IBM, PCI_ANY_ID, 0, 0,
+- pbn_oxsemi_2_3906250 },
++ pbn_oxsemi_2_15625000 },
+ /*
+ * EndRun Technologies. PCI express device range.
+ * EndRun PTP/1588 has 2 Native UARTs utilizing OxSemi 952.
+ */
+ { PCI_VENDOR_ID_ENDRUN, PCI_DEVICE_ID_ENDRUN_1588,
+ PCI_ANY_ID, PCI_ANY_ID, 0, 0,
+- pbn_oxsemi_2_3906250 },
++ pbn_oxsemi_2_15625000 },
+
+ /*
+ * SBS Technologies, Inc. P-Octal and PMC-OCTPRO cards,
+--
+2.35.1
+
--- /dev/null
+From cfb171de62f71cfb83a28429ef85c60e8be57a08 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 18 Apr 2022 16:27:22 +0100
+Subject: serial: 8250: Fold EndRun device support into OxSemi Tornado code
+
+From: Maciej W. Rozycki <macro@orcam.me.uk>
+
+[ Upstream commit 1f32c65bad24b9787d3e52843de375430e3df822 ]
+
+The EndRun PTP/1588 dual serial port device is based on the Oxford
+Semiconductor OXPCIe952 UART device with the PCI vendor:device ID set
+for EndRun Technologies and uses the same sequence to determine the
+number of ports available. Despite that, we have duplicate code
+specific to the EndRun device.
+
+Remove the redundant code, then, and factor out the OxSemi Tornado device
+detection.
+
+Signed-off-by: Maciej W. Rozycki <macro@orcam.me.uk>
+Reviewed-by: Andy Shevchenko <andy.shevchenko@gmail.com>
+Link: https://lore.kernel.org/r/alpine.DEB.2.21.2204181516220.9383@angie.orcam.me.uk
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/tty/serial/8250/8250_pci.c | 76 ++++++++++--------------------
+ 1 file changed, 25 insertions(+), 51 deletions(-)
+
+diff --git a/drivers/tty/serial/8250/8250_pci.c b/drivers/tty/serial/8250/8250_pci.c
+index a293e9f107d0..4b0e84e01e55 100644
+--- a/drivers/tty/serial/8250/8250_pci.c
++++ b/drivers/tty/serial/8250/8250_pci.c
+@@ -994,41 +994,29 @@ static void pci_ite887x_exit(struct pci_dev *dev)
+ }
+
+ /*
+- * EndRun Technologies.
+- * Determine the number of ports available on the device.
++ * Oxford Semiconductor Inc.
++ * Check if an OxSemi device is part of the Tornado range of devices.
+ */
+ #define PCI_VENDOR_ID_ENDRUN 0x7401
+ #define PCI_DEVICE_ID_ENDRUN_1588 0xe100
+
+-static int pci_endrun_init(struct pci_dev *dev)
++static bool pci_oxsemi_tornado_p(struct pci_dev *dev)
+ {
+- u8 __iomem *p;
+- unsigned long deviceID;
+- unsigned int number_uarts = 0;
++ /* OxSemi Tornado devices are all 0xCxxx */
++ if (dev->vendor == PCI_VENDOR_ID_OXSEMI &&
++ (dev->device & 0xf000) != 0xc000)
++ return false;
+
+- /* EndRun device is all 0xexxx */
++ /* EndRun devices are all 0xExxx */
+ if (dev->vendor == PCI_VENDOR_ID_ENDRUN &&
+- (dev->device & 0xf000) != 0xe000)
+- return 0;
+-
+- p = pci_iomap(dev, 0, 5);
+- if (p == NULL)
+- return -ENOMEM;
++ (dev->device & 0xf000) != 0xe000)
++ return false;
+
+- deviceID = ioread32(p);
+- /* EndRun device */
+- if (deviceID == 0x07000200) {
+- number_uarts = ioread8(p + 4);
+- pci_dbg(dev, "%d ports detected on EndRun PCI Express device\n", number_uarts);
+- }
+- pci_iounmap(dev, p);
+- return number_uarts;
++ return true;
+ }
+
+ /*
+- * Oxford Semiconductor Inc.
+- * Check that device is part of the Tornado range of devices, then determine
+- * the number of ports available on the device.
++ * Determine the number of ports available on a Tornado device.
+ */
+ static int pci_oxsemi_tornado_init(struct pci_dev *dev)
+ {
+@@ -1036,9 +1024,7 @@ static int pci_oxsemi_tornado_init(struct pci_dev *dev)
+ unsigned long deviceID;
+ unsigned int number_uarts = 0;
+
+- /* OxSemi Tornado devices are all 0xCxxx */
+- if (dev->vendor == PCI_VENDOR_ID_OXSEMI &&
+- (dev->device & 0xF000) != 0xC000)
++ if (!pci_oxsemi_tornado_p(dev))
+ return 0;
+
+ p = pci_iomap(dev, 0, 5);
+@@ -1049,7 +1035,10 @@ static int pci_oxsemi_tornado_init(struct pci_dev *dev)
+ /* Tornado device */
+ if (deviceID == 0x07000200) {
+ number_uarts = ioread8(p + 4);
+- pci_dbg(dev, "%d ports detected on Oxford PCI Express device\n", number_uarts);
++ pci_dbg(dev, "%d ports detected on %s PCI Express device\n",
++ number_uarts,
++ dev->vendor == PCI_VENDOR_ID_ENDRUN ?
++ "EndRun" : "Oxford");
+ }
+ pci_iounmap(dev, p);
+ return number_uarts;
+@@ -2244,7 +2233,7 @@ static struct pci_serial_quirk pci_serial_quirks[] = {
+ .device = PCI_ANY_ID,
+ .subvendor = PCI_ANY_ID,
+ .subdevice = PCI_ANY_ID,
+- .init = pci_endrun_init,
++ .init = pci_oxsemi_tornado_init,
+ .setup = pci_default_setup,
+ },
+ /*
+@@ -2667,7 +2656,6 @@ enum pci_board_num_t {
+ pbn_panacom2,
+ pbn_panacom4,
+ pbn_plx_romulus,
+- pbn_endrun_2_3906250,
+ pbn_oxsemi,
+ pbn_oxsemi_1_3906250,
+ pbn_oxsemi_2_3906250,
+@@ -3189,20 +3177,6 @@ static struct pciserial_board pci_boards[] = {
+ .first_offset = 0x03,
+ },
+
+- /*
+- * EndRun Technologies
+- * Uses the size of PCI Base region 0 to
+- * signal now many ports are available
+- * 2 port 952 Uart support
+- */
+- [pbn_endrun_2_3906250] = {
+- .flags = FL_BASE0,
+- .num_ports = 2,
+- .base_baud = 3906250,
+- .uart_offset = 0x200,
+- .first_offset = 0x1000,
+- },
+-
+ /*
+ * This board uses the size of PCI Base region 0 to
+ * signal now many ports are available
+@@ -4109,13 +4083,6 @@ static const struct pci_device_id serial_pci_tbl[] = {
+ { PCI_VENDOR_ID_PLX, PCI_DEVICE_ID_PLX_ROMULUS,
+ 0x10b5, 0x106a, 0, 0,
+ pbn_plx_romulus },
+- /*
+- * EndRun Technologies. PCI express device range.
+- * EndRun PTP/1588 has 2 Native UARTs.
+- */
+- { PCI_VENDOR_ID_ENDRUN, PCI_DEVICE_ID_ENDRUN_1588,
+- PCI_ANY_ID, PCI_ANY_ID, 0, 0,
+- pbn_endrun_2_3906250 },
+ /*
+ * Quatech cards. These actually have configurable clocks but for
+ * now we just use the default.
+@@ -4377,6 +4344,13 @@ static const struct pci_device_id serial_pci_tbl[] = {
+ { PCI_VENDOR_ID_DIGI, PCIE_DEVICE_ID_NEO_2_OX_IBM,
+ PCI_SUBVENDOR_ID_IBM, PCI_ANY_ID, 0, 0,
+ pbn_oxsemi_2_3906250 },
++ /*
++ * EndRun Technologies. PCI express device range.
++ * EndRun PTP/1588 has 2 Native UARTs utilizing OxSemi 952.
++ */
++ { PCI_VENDOR_ID_ENDRUN, PCI_DEVICE_ID_ENDRUN_1588,
++ PCI_ANY_ID, PCI_ANY_ID, 0, 0,
++ pbn_oxsemi_2_3906250 },
+
+ /*
+ * SBS Technologies, Inc. P-Octal and PMC-OCTPRO cards,
+--
+2.35.1
+
__follow_mount_rcu-verify-that-mount_lock-remains-unchanged.patch
spmi-trace-fix-stack-out-of-bound-access-in-spmi-tracing-functions.patch
drivers-base-fix-userspace-break-from-using-bin_attributes-for-cpumap-and-cpulist.patch
+drm-mediatek-keep-dsi-as-lp00-before-dcs-cmds-transf.patch
+crypto-blake2s-remove-shash-module.patch
+drm-dp-mst-read-the-extended-dpcd-capabilities-durin.patch
+scsi-qla2xxx-fix-excessive-i-o-error-messages-by-def.patch
+scsi-qla2xxx-wind-down-adapter-after-pcie-error.patch-27996
+scsi-qla2xxx-turn-off-multi-queue-for-8g-adapters.patch-18430
+scsi-qla2xxx-fix-crash-due-to-stale-srb-access-aroun.patch
+scsi-qla2xxx-fix-losing-fcp-2-targets-during-port-pe.patch
+scsi-qla2xxx-fix-losing-target-when-it-reappears-dur.patch
+scsi-qla2xxx-fix-losing-fcp-2-targets-on-long-port-d.patch
+scsi-qla2xxx-fix-erroneous-mailbox-timeout-after-pci.patch
+drm-vc4-drv-adopt-the-dma-configuration-from-the-hvs.patch
+usbnet-smsc95xx-don-t-clear-read-only-phy-interrupt.patch
+usbnet-smsc95xx-avoid-link-settings-race-on-interrup.patch
+usbnet-smsc95xx-forward-phy-interrupts-to-phy-driver.patch
+usbnet-smsc95xx-fix-deadlock-on-runtime-resume.patch
+firmware-arm_scpi-ensure-scpi_info-is-not-assigned-i.patch
+__follow_mount_rcu-verify-that-mount_lock-remains-un.patch
+intel_th-pci-add-meteor-lake-p-support.patch
+intel_th-pci-add-raptor-lake-s-pch-support.patch
+intel_th-pci-add-raptor-lake-s-cpu-support.patch
+kvm-set_msr_mce-permit-guests-to-ignore-single-bit-e.patch
+kvm-x86-signal-gp-not-eperm-on-bad-wrmsr-mci_ctl-sta.patch
+iommu-vt-d-avoid-invalid-memory-access-via-node_onli.patch
+pci-aer-iterate-over-error-counters-instead-of-error.patch
+pci-qcom-power-on-phy-before-ipq8074-dbi-register-ac.patch
+serial-8250-fold-endrun-device-support-into-oxsemi-t.patch
+serial-8250-add-proper-clock-handling-for-oxsemi-pci.patch
+tty-8250-add-support-for-brainboxes-px-cards.patch
+dm-writecache-set-a-default-max_writeback_jobs.patch
+x86-olpc-fix-logical-not-is-only-applied-to-the-left.patch
+drivers-base-fix-userspace-break-from-using-bin_attr.patch
+kexec_file-drop-weak-attribute-from-functions.patch
+kexec-clean-up-arch_kexec_kernel_verify_sig.patch
+kexec-keys-s390-make-use-of-built-in-and-secondary-k.patch
+tracing-events-add-__vstring-and-__assign_vstr-helpe.patch
+dm-thin-fix-use-after-free-crash-in-dm_sm_register_t.patch
+net-9p-initialize-the-iounit-field-during-fid-creati.patch
+timekeeping-contribute-wall-clock-to-rng-on-time-cha.patch
+scsi-qla2xxx-fix-response-queue-handler-reading-stal.patch
+scsi-qla2xxx-edif-fix-dropped-ike-message.patch
+scsi-qla2xxx-fix-imbalance-vha-vref_count.patch-27970
+scsi-qla2xxx-fix-discovery-issues-in-fc-al-topology.patch-4818
+scsi-qla2xxx-update-manufacturer-details.patch
+locking-csd_lock-change-csdlock_debug-from-early_par.patch
+block-serialize-all-debugfs-operations-using-q-debug.patch
+block-don-t-allow-the-same-type-rq_qos-add-more-than.patch
+spmi-trace-fix-stack-out-of-bound-access-in-spmi-tra.patch
+btrfs-tree-log-make-the-return-value-for-log-syncing.patch
+btrfs-ensure-pages-are-unlocked-on-cow_file_range-fa.patch
+btrfs-fix-error-handling-of-fallback-uncompress-writ.patch
+btrfs-reset-block-group-chunk-force-if-we-have-to-wa.patch
+btrfs-properly-flag-filesystem-with-btrfs_feature_in.patch
+block-add-a-bdev_max_zone_append_sectors-helper.patch
+block-add-bdev_max_segments-helper.patch
+btrfs-zoned-revive-max_zone_append_bytes.patch
+btrfs-replace-btrfs_max_extent_size-with-fs_info-max.patch
+btrfs-let-can_allocate_chunk-return-error.patch
+btrfs-zoned-finish-least-available-block-group-on-da.patch
+btrfs-zoned-disable-metadata-overcommit-for-zoned.patch
+btrfs-make-the-bg_reclaim_threshold-per-space-info.patch
+btrfs-zoned-introduce-btrfs_zoned_bg_is_full.patch
+btrfs-store-chunk-size-in-space-info-struct.patch
+btrfs-zoned-introduce-space_info-active_total_bytes.patch
+btrfs-zoned-activate-metadata-block-group-on-flush_s.patch
+btrfs-zoned-activate-necessary-block-group.patch
+btrfs-zoned-write-out-partially-allocated-region.patch
+btrfs-zoned-wait-until-zone-is-finished-when-allocat.patch
+intel_idle-add-alderlake-support.patch
+intel_idle-make-spr-c1-and-c1e-be-independent.patch
+acpi-cppc-do-not-prevent-cppc-from-working-in-the-fu.patch
+powerpc-powernv-kvm-use-darn-for-h_random-on-power9.patch
+s390-unwind-fix-fgraph-return-address-recovery.patch
+kvm-x86-pmu-introduce-the-ctrl_mask-value-for-fixed-.patch
+kvm-vmx-mark-all-perf_global_-ovf-_ctrl-bits-reserve.patch
+kvm-x86-pmu-ignore-pmu-global_ctrl-check-if-vpmu-doe.patch
+kvm-vmx-add-helper-to-check-if-the-guest-pmu-has-per.patch
+kvm-nvmx-attempt-to-load-perf_global_ctrl-on-nvmx-xf.patch
+dm-raid-fix-address-sanitizer-warning-in-raid_status.patch
+dm-raid-fix-address-sanitizer-warning-in-raid_resume.patch
+mm-damon-reclaim-fix-potential-memory-leak-in-damon_.patch
+hugetlb_cgroup-fix-wrong-hugetlb-cgroup-numa-stat.patch
+batman-adv-tracing-use-the-new-__vstring-helper.patch
+ftrace-x86-add-back-ftrace_expected-assignment.patch-2936
+tracing-use-a-struct-alignof-to-determine-trace-even.patch
+ksmbd-validate-length-in-smb2_write.patch
+ksmbd-smbd-change-prototypes-of-rdma-read-write-rela.patch
+ksmbd-smbd-introduce-read-write-credits-for-rdma-rea.patch
+ksmbd-add-smbd-max-io-size-parameter.patch
+ksmbd-fix-wrong-smbd-max-read-write-size-check.patch
+ksmbd-prevent-out-of-bound-read-for-smb2_write.patch
+input-gscps2-check-return-value-of-ioremap-in-gscps2.patch
+x86-kprobes-update-kcb-status-flag-after-singlestepp.patch
+ext4-update-s_overhead_clusters-in-the-superblock-du.patch
+ext4-fix-extent-status-tree-race-in-writeback-error-.patch
+ext4-add-ext4_inode_has_xattr_space-macro-in-xattr.h.patch
+ext4-fix-use-after-free-in-ext4_xattr_set_entry.patch
+ext4-correct-max_inline_xattr_value_size-computing.patch
+ext4-correct-the-misjudgment-in-ext4_iget_extra_inod.patch
+ext4-fix-warning-in-ext4_iomap_begin-as-race-between.patch
+ext4-check-if-directory-block-is-within-i_size.patch
+ext4-make-sure-ext4_append-always-allocates-new-bloc.patch
+ext4-remove-ea-inode-entry-from-mbcache-on-inode-evi.patch
+ext4-use-kmemdup-to-replace-kmalloc-memcpy.patch
+ext4-unindent-codeblock-in-ext4_xattr_block_set.patch
+ext4-fix-race-when-reusing-xattr-blocks.patch
+keys-asymmetric-enforce-sm2-signature-use-pkey-algo.patch
+tpm-eventlog-fix-section-mismatch-for-debug_section_.patch
+tpm-add-check-for-failure-mode-for-tpm2-modules.patch
--- /dev/null
+From 950c726de64542319fc82b60b84e385645aff774 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 27 Jun 2022 16:55:12 -0700
+Subject: spmi: trace: fix stack-out-of-bound access in SPMI tracing functions
+
+From: David Collins <quic_collinsd@quicinc.com>
+
+[ Upstream commit 2af28b241eea816e6f7668d1954f15894b45d7e3 ]
+
+trace_spmi_write_begin() and trace_spmi_read_end() both call
+memcpy() with a length of "len + 1". This leads to one extra
+byte being read beyond the end of the specified buffer. Fix
+this out-of-bound memory access by using a length of "len"
+instead.
+
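+For illustration, the faulty pattern reduces to the following sketch
+(hypothetical buffer names, not the driver code): with a buffer of "len"
+bytes, copying "len + 1" bytes reads one byte past its end:
+
+	u8 status[1];	/* one-byte stack buffer, i.e. len == 1 */
+	u8 copy[2];
+
+	memcpy(copy, status, sizeof(status) + 1);	/* old: reads 1 byte past 'status' */
+	memcpy(copy, status, sizeof(status));		/* fixed: copies exactly len bytes */
+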
+Here is a KASAN log showing the issue:
+
+BUG: KASAN: stack-out-of-bounds in trace_event_raw_event_spmi_read_end+0x1d0/0x234
+Read of size 2 at addr ffffffc0265b7540 by task thermal@2.0-ser/1314
+...
+Call trace:
+ dump_backtrace+0x0/0x3e8
+ show_stack+0x2c/0x3c
+ dump_stack_lvl+0xdc/0x11c
+ print_address_description+0x74/0x384
+ kasan_report+0x188/0x268
+ kasan_check_range+0x270/0x2b0
+ memcpy+0x90/0xe8
+ trace_event_raw_event_spmi_read_end+0x1d0/0x234
+ spmi_read_cmd+0x294/0x3ac
+ spmi_ext_register_readl+0x84/0x9c
+ regmap_spmi_ext_read+0x144/0x1b0 [regmap_spmi]
+ _regmap_raw_read+0x40c/0x754
+ regmap_raw_read+0x3a0/0x514
+ regmap_bulk_read+0x418/0x494
+ adc5_gen3_poll_wait_hs+0xe8/0x1e0 [qcom_spmi_adc5_gen3]
+ ...
+ __arm64_sys_read+0x4c/0x60
+ invoke_syscall+0x80/0x218
+ el0_svc_common+0xec/0x1c8
+ ...
+
+addr ffffffc0265b7540 is located in stack of task thermal@2.0-ser/1314 at offset 32 in frame:
+ adc5_gen3_poll_wait_hs+0x0/0x1e0 [qcom_spmi_adc5_gen3]
+
+this frame has 1 object:
+ [32, 33) 'status'
+
+Memory state around the buggy address:
+ ffffffc0265b7400: 00 00 00 00 00 00 00 00 00 00 00 00 f1 f1 f1 f1
+ ffffffc0265b7480: 04 f3 f3 f3 00 00 00 00 00 00 00 00 00 00 00 00
+>ffffffc0265b7500: 00 00 00 00 f1 f1 f1 f1 01 f3 f3 f3 00 00 00 00
+ ^
+ ffffffc0265b7580: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
+ ffffffc0265b7600: f1 f1 f1 f1 01 f2 07 f2 f2 f2 01 f3 00 00 00 00
+==================================================================
+
+Fixes: a9fce374815d ("spmi: add command tracepoints for SPMI")
+Cc: stable@vger.kernel.org
+Reviewed-by: Stephen Boyd <sboyd@kernel.org>
+Acked-by: Steven Rostedt (Google) <rostedt@goodmis.org>
+Signed-off-by: David Collins <quic_collinsd@quicinc.com>
+Link: https://lore.kernel.org/r/20220627235512.2272783-1-quic_collinsd@quicinc.com
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ include/trace/events/spmi.h | 12 ++++++------
+ 1 file changed, 6 insertions(+), 6 deletions(-)
+
+diff --git a/include/trace/events/spmi.h b/include/trace/events/spmi.h
+index 8b60efe18ba6..a6819fd85cdf 100644
+--- a/include/trace/events/spmi.h
++++ b/include/trace/events/spmi.h
+@@ -21,15 +21,15 @@ TRACE_EVENT(spmi_write_begin,
+ __field ( u8, sid )
+ __field ( u16, addr )
+ __field ( u8, len )
+- __dynamic_array ( u8, buf, len + 1 )
++ __dynamic_array ( u8, buf, len )
+ ),
+
+ TP_fast_assign(
+ __entry->opcode = opcode;
+ __entry->sid = sid;
+ __entry->addr = addr;
+- __entry->len = len + 1;
+- memcpy(__get_dynamic_array(buf), buf, len + 1);
++ __entry->len = len;
++ memcpy(__get_dynamic_array(buf), buf, len);
+ ),
+
+ TP_printk("opc=%d sid=%02d addr=0x%04x len=%d buf=0x[%*phD]",
+@@ -92,7 +92,7 @@ TRACE_EVENT(spmi_read_end,
+ __field ( u16, addr )
+ __field ( int, ret )
+ __field ( u8, len )
+- __dynamic_array ( u8, buf, len + 1 )
++ __dynamic_array ( u8, buf, len )
+ ),
+
+ TP_fast_assign(
+@@ -100,8 +100,8 @@ TRACE_EVENT(spmi_read_end,
+ __entry->sid = sid;
+ __entry->addr = addr;
+ __entry->ret = ret;
+- __entry->len = len + 1;
+- memcpy(__get_dynamic_array(buf), buf, len + 1);
++ __entry->len = len;
++ memcpy(__get_dynamic_array(buf), buf, len);
+ ),
+
+ TP_printk("opc=%d sid=%02d addr=0x%04x ret=%d len=%02d buf=0x[%*phD]",
+--
+2.35.1
+
--- /dev/null
+From 9b10f29c1c139d6a3b1b563093243b9621df2322 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Sun, 17 Jul 2022 23:53:34 +0200
+Subject: timekeeping: contribute wall clock to rng on time change
+
+From: Jason A. Donenfeld <Jason@zx2c4.com>
+
+[ Upstream commit b8ac29b40183a6038919768b5d189c9bd91ce9b4 ]
+
+The rng's random_init() function contributes the real time to the rng at
+boot time, so that events can at least start in relation to something
+particular in the real world. But this clock might not yet be set that
+point in boot, so nothing is contributed. In addition, the relation
+between minor clock changes from, say, NTP, and the cycle counter is
+potentially useful entropic data.
+
+This commit addresses this by mixing in a time stamp on calls to
+settimeofday and adjtimex. No entropy is credited in doing so, so it
+doesn't make initialization faster, but it is still useful input to
+have.
+
+Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
+Cc: stable@vger.kernel.org
+Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
+Reviewed-by: Eric Biggers <ebiggers@google.com>
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ kernel/time/timekeeping.c | 7 ++++++-
+ 1 file changed, 6 insertions(+), 1 deletion(-)
+
+diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
+index 871c912860ed..d6a0ff68df41 100644
+--- a/kernel/time/timekeeping.c
++++ b/kernel/time/timekeeping.c
+@@ -23,6 +23,7 @@
+ #include <linux/pvclock_gtod.h>
+ #include <linux/compiler.h>
+ #include <linux/audit.h>
++#include <linux/random.h>
+
+ #include "tick-internal.h"
+ #include "ntp_internal.h"
+@@ -1326,8 +1327,10 @@ int do_settimeofday64(const struct timespec64 *ts)
+ /* Signal hrtimers about time change */
+ clock_was_set(CLOCK_SET_WALL);
+
+- if (!ret)
++ if (!ret) {
+ audit_tk_injoffset(ts_delta);
++ add_device_randomness(ts, sizeof(*ts));
++ }
+
+ return ret;
+ }
+@@ -2413,6 +2416,7 @@ int do_adjtimex(struct __kernel_timex *txc)
+ ret = timekeeping_validate_timex(txc);
+ if (ret)
+ return ret;
++ add_device_randomness(txc, sizeof(*txc));
+
+ if (txc->modes & ADJ_SETOFFSET) {
+ struct timespec64 delta;
+@@ -2430,6 +2434,7 @@ int do_adjtimex(struct __kernel_timex *txc)
+ audit_ntp_init(&ad);
+
+ ktime_get_real_ts64(&ts);
++ add_device_randomness(&ts, sizeof(ts));
+
+ raw_spin_lock_irqsave(&timekeeper_lock, flags);
+ write_seqcount_begin(&tk_core.seq);
+--
+2.35.1
+
--- /dev/null
+From 9c1985649dac8ce39d9305a8087aa7e7a44b6a11 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 1 Aug 2022 15:57:03 +0200
+Subject: tpm: Add check for Failure mode for TPM2 modules
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Mårten Lindahl <marten.lindahl@axis.com>
+
+[ Upstream commit 863ed94c589fcd1984f4e3080f069d30508044bb ]
+
+In commit 0aa698787aa2 ("tpm: Add Upgrade/Reduced mode support for
+TPM2 modules") it was said that:
+
+"If the TPM is in Failure mode, it will successfully respond to both
+tpm2_do_selftest() and tpm2_startup() calls. Although, will fail to
+answer to tpm2_get_cc_attrs_tbl(). Use this fact to conclude that TPM
+is in Failure mode."
+
+But a check was never added in the commit when calling
+tpm2_get_cc_attrs_tbl() to conclude that the TPM is in Failure mode.
+This commit corrects this by adding a check.
+
+Fixes: 0aa698787aa2 ("tpm: Add Upgrade/Reduced mode support for TPM2 modules")
+Cc: stable@vger.kernel.org # v5.17+
+Signed-off-by: Mårten Lindahl <marten.lindahl@axis.com>
+Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org>
+Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/char/tpm/tpm2-cmd.c | 6 ++++++
+ 1 file changed, 6 insertions(+)
+
+diff --git a/drivers/char/tpm/tpm2-cmd.c b/drivers/char/tpm/tpm2-cmd.c
+index 04a3e23a4afc..4419593d9531 100644
+--- a/drivers/char/tpm/tpm2-cmd.c
++++ b/drivers/char/tpm/tpm2-cmd.c
+@@ -752,6 +752,12 @@ int tpm2_auto_startup(struct tpm_chip *chip)
+ }
+
+ rc = tpm2_get_cc_attrs_tbl(chip);
++ if (rc == TPM2_RC_FAILURE || (rc < 0 && rc != -ENOMEM)) {
++ dev_info(&chip->dev,
++ "TPM in field failure mode, requires firmware upgrade\n");
++ chip->flags |= TPM_CHIP_FLAG_FIRMWARE_UPGRADE;
++ rc = 0;
++ }
+
+ out:
+ if (rc == TPM2_RC_UPGRADE) {
+--
+2.35.1
+
--- /dev/null
+From 965d4271f4ecbc74ebfd59eee61471d44d2e4694 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 11 Jul 2022 09:17:38 +0800
+Subject: tpm: eventlog: Fix section mismatch for DEBUG_SECTION_MISMATCH
+
+From: Huacai Chen <chenhuacai@loongson.cn>
+
+[ Upstream commit bed4593645366ad7362a3aa7bc0d100d8d8236a8 ]
+
+If DEBUG_SECTION_MISMATCH is enabled, __calc_tpm2_event_size() will not be
+inlined; this causes a section mismatch like this:
+
+WARNING: modpost: vmlinux.o(.text.unlikely+0xe30c): Section mismatch in reference from the variable L0 to the function .init.text:early_ioremap()
+The function L0() references
+the function __init early_memremap().
+This is often because L0 lacks a __init
+annotation or the annotation of early_ioremap is wrong.
+
+Fix it by using __always_inline instead of inline for the called-once
+function __calc_tpm2_event_size().
+
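+As a minimal sketch of the idea (the wrapper name below is hypothetical,
+shown only for illustration): a helper that references __init code must be
+forced inline into its __init caller so that no out-of-line copy is left
+holding a reference into .init.text:
+
+	static __always_inline void *map_log(unsigned long phys, unsigned long sz)
+	{
+		return early_memremap(phys, sz);	/* early_memremap() lives in .init.text */
+	}
+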
+Fixes: 44038bc514a2 ("tpm: Abstract crypto agile event size calculations")
+Cc: stable@vger.kernel.org # v5.3
+Reported-by: WANG Xuerui <git@xen0n.name>
+Signed-off-by: Huacai Chen <chenhuacai@loongson.cn>
+Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ include/linux/tpm_eventlog.h | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/include/linux/tpm_eventlog.h b/include/linux/tpm_eventlog.h
+index 739ba9a03ec1..20c0ff54b7a0 100644
+--- a/include/linux/tpm_eventlog.h
++++ b/include/linux/tpm_eventlog.h
+@@ -157,7 +157,7 @@ struct tcg_algorithm_info {
+ * Return: size of the event on success, 0 on failure
+ */
+
+-static inline int __calc_tpm2_event_size(struct tcg_pcr_event2_head *event,
++static __always_inline int __calc_tpm2_event_size(struct tcg_pcr_event2_head *event,
+ struct tcg_pcr_event *event_header,
+ bool do_mapping)
+ {
+--
+2.35.1
+
--- /dev/null
+From 3a68f50bc6f54e4ef743e38ffb6064be1ef5e481 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 5 Jul 2022 18:44:54 -0400
+Subject: tracing/events: Add __vstring() and __assign_vstr() helper macros
+
+From: Steven Rostedt (Google) <rostedt@goodmis.org>
+
+[ Upstream commit 0563231f93c6d1f582b168a47753b345c1e20d81 ]
+
+There are several places that open code the following logic:
+
+ TP_STRUCT__entry(__dynamic_array(char, msg, MSG_MAX)),
+ TP_fast_assign(vsnprintf(__get_str(msg), MSG_MAX, vaf->fmt, *vaf->va);)
+
+To load a string created from a va_list (variable argument list).
+
+The main issue with this approach is the "MSG_MAX" usage in the
+__dynamic_array() portion. That actually just reserves the MSG_MAX in the
+event, and even wastes space because there's dynamic meta data also saved
+in the event to denote the offset and size of the dynamic array. It would
+have been better to just use a static __array() field.
+
+Instead, create __vstring() and __assign_vstr() that work like __string
+and __assign_str() but instead of taking a destination string to copy,
+take a format string and a va_list pointer and fill in the values.
+
+It uses the helper:
+
+ #define __trace_event_vstr_len(fmt, va) \
+ ({ \
+ va_list __ap; \
+ int __ret; \
+ \
+ va_copy(__ap, *(va)); \
+ __ret = vsnprintf(NULL, 0, fmt, __ap) + 1; \
+ va_end(__ap); \
+ \
+ min(__ret, TRACE_EVENT_STR_MAX); \
+ })
+
+To figure out the length to store the string. It may be slightly slower as
+it needs to run the vsnprintf() twice, but it now saves space on the ring
+buffer.
+
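+A tracepoint converted to the new helpers would then look roughly like this
+(a hypothetical event, shown only to illustrate the macros):
+
+	TRACE_EVENT(dev_msg,
+		TP_PROTO(const struct device *dev, struct va_format *vaf),
+		TP_ARGS(dev, vaf),
+		TP_STRUCT__entry(
+			__string(name, dev_name(dev))
+			__vstring(msg, vaf->fmt, vaf->va)
+		),
+		TP_fast_assign(
+			__assign_str(name, dev_name(dev));
+			__assign_vstr(msg, vaf->fmt, vaf->va);
+		),
+		TP_printk("%s: %s", __get_str(name), __get_str(msg))
+	);
+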
+Link: https://lkml.kernel.org/r/20220705224749.053570613@goodmis.org
+
+Cc: Dennis Dalessandro <dennis.dalessandro@cornelisnetworks.com>
+Cc: Ingo Molnar <mingo@kernel.org>
+Cc: Andrew Morton <akpm@linux-foundation.org>
+Cc: Jason Gunthorpe <jgg@ziepe.ca>
+Cc: Leon Romanovsky <leon@kernel.org>
+Cc: Kalle Valo <kvalo@kernel.org>
+Cc: "David S. Miller" <davem@davemloft.net>
+Cc: Eric Dumazet <edumazet@google.com>
+Cc: Jakub Kicinski <kuba@kernel.org>
+Cc: Paolo Abeni <pabeni@redhat.com>
+Cc: Arend van Spriel <aspriel@gmail.com>
+Cc: Franky Lin <franky.lin@broadcom.com>
+Cc: Hante Meuleman <hante.meuleman@broadcom.com>
+Cc: Gregory Greenman <gregory.greenman@intel.com>
+Cc: Peter Chen <peter.chen@kernel.org>
+Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+Cc: Mathias Nyman <mathias.nyman@intel.com>
+Cc: Chunfeng Yun <chunfeng.yun@mediatek.com>
+Cc: Bin Liu <b-liu@ti.com>
+Cc: Marek Lindner <mareklindner@neomailbox.ch>
+Cc: Simon Wunderlich <sw@simonwunderlich.de>
+Cc: Antonio Quartulli <a@unstable.cc>
+Cc: Sven Eckelmann <sven@narfation.org>
+Cc: Johannes Berg <johannes@sipsolutions.net>
+Cc: Jim Cromie <jim.cromie@gmail.com>
+Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ include/linux/trace_events.h | 18 ++++++++++++++++++
+ include/trace/stages/stage1_struct_define.h | 3 +++
+ include/trace/stages/stage2_data_offsets.h | 3 +++
+ include/trace/stages/stage4_event_fields.h | 3 +++
+ include/trace/stages/stage5_get_offsets.h | 4 ++++
+ include/trace/stages/stage6_event_callback.h | 7 +++++++
+ 6 files changed, 38 insertions(+)
+
+diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
+index e6e95a9f07a5..b18759a673c6 100644
+--- a/include/linux/trace_events.h
++++ b/include/linux/trace_events.h
+@@ -916,6 +916,24 @@ perf_trace_buf_submit(void *raw_data, int size, int rctx, u16 type,
+
+ #endif
+
++#define TRACE_EVENT_STR_MAX 512
++
++/*
++ * gcc warns that you can not use a va_list in an inlined
++ * function. But lets me make it into a macro :-/
++ */
++#define __trace_event_vstr_len(fmt, va) \
++({ \
++ va_list __ap; \
++ int __ret; \
++ \
++ va_copy(__ap, *(va)); \
++ __ret = vsnprintf(NULL, 0, fmt, __ap) + 1; \
++ va_end(__ap); \
++ \
++ min(__ret, TRACE_EVENT_STR_MAX); \
++})
++
+ #endif /* _LINUX_TRACE_EVENT_H */
+
+ /*
+diff --git a/include/trace/stages/stage1_struct_define.h b/include/trace/stages/stage1_struct_define.h
+index a16783419687..1b7bab60434c 100644
+--- a/include/trace/stages/stage1_struct_define.h
++++ b/include/trace/stages/stage1_struct_define.h
+@@ -26,6 +26,9 @@
+ #undef __string_len
+ #define __string_len(item, src, len) __dynamic_array(char, item, -1)
+
++#undef __vstring
++#define __vstring(item, fmt, ap) __dynamic_array(char, item, -1)
++
+ #undef __bitmask
+ #define __bitmask(item, nr_bits) __dynamic_array(char, item, -1)
+
+diff --git a/include/trace/stages/stage2_data_offsets.h b/include/trace/stages/stage2_data_offsets.h
+index 42fd1e8813ec..1b7a8f764fdd 100644
+--- a/include/trace/stages/stage2_data_offsets.h
++++ b/include/trace/stages/stage2_data_offsets.h
+@@ -32,6 +32,9 @@
+ #undef __string_len
+ #define __string_len(item, src, len) __dynamic_array(char, item, -1)
+
++#undef __vstring
++#define __vstring(item, fmt, ap) __dynamic_array(char, item, -1)
++
+ #undef __bitmask
+ #define __bitmask(item, nr_bits) __dynamic_array(unsigned long, item, -1)
+
+diff --git a/include/trace/stages/stage4_event_fields.h b/include/trace/stages/stage4_event_fields.h
+index e80cdc397a43..c3790ec7a453 100644
+--- a/include/trace/stages/stage4_event_fields.h
++++ b/include/trace/stages/stage4_event_fields.h
+@@ -38,6 +38,9 @@
+ #undef __string_len
+ #define __string_len(item, src, len) __dynamic_array(char, item, -1)
+
++#undef __vstring
++#define __vstring(item, fmt, ap) __dynamic_array(char, item, -1)
++
+ #undef __bitmask
+ #define __bitmask(item, nr_bits) __dynamic_array(unsigned long, item, -1)
+
+diff --git a/include/trace/stages/stage5_get_offsets.h b/include/trace/stages/stage5_get_offsets.h
+index 7ee5931300e6..fba4c24ed9e6 100644
+--- a/include/trace/stages/stage5_get_offsets.h
++++ b/include/trace/stages/stage5_get_offsets.h
+@@ -39,6 +39,10 @@
+ #undef __string_len
+ #define __string_len(item, src, len) __dynamic_array(char, item, (len) + 1)
+
++#undef __vstring
++#define __vstring(item, fmt, ap) __dynamic_array(char, item, \
++ __trace_event_vstr_len(fmt, ap))
++
+ #undef __rel_dynamic_array
+ #define __rel_dynamic_array(type, item, len) \
+ __item_length = (len) * sizeof(type); \
+diff --git a/include/trace/stages/stage6_event_callback.h b/include/trace/stages/stage6_event_callback.h
+index e1724f73594b..0f51f6b3ab70 100644
+--- a/include/trace/stages/stage6_event_callback.h
++++ b/include/trace/stages/stage6_event_callback.h
+@@ -24,6 +24,9 @@
+ #undef __string_len
+ #define __string_len(item, src, len) __dynamic_array(char, item, -1)
+
++#undef __vstring
++#define __vstring(item, fmt, ap) __dynamic_array(char, item, -1)
++
+ #undef __assign_str
+ #define __assign_str(dst, src) \
+ strcpy(__get_str(dst), (src) ? (const char *)(src) : "(null)");
+@@ -35,6 +38,10 @@
+ __get_str(dst)[len] = '\0'; \
+ } while(0)
+
++#undef __assign_vstr
++#define __assign_vstr(dst, fmt, va) \
++ vsnprintf(__get_str(dst), TRACE_EVENT_STR_MAX, fmt, *(va))
++
+ #undef __bitmask
+ #define __bitmask(item, nr_bits) __dynamic_array(unsigned long, item, -1)
+
+--
+2.35.1
+
--- /dev/null
+From 1551cb64c9964c8e7535b88210b5993eb066ab5c Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Sun, 31 Jul 2022 01:59:28 -0400
+Subject: tracing: Use a struct alignof to determine trace event field
+ alignment
+
+From: Steven Rostedt (Google) <rostedt@goodmis.org>
+
+[ Upstream commit 4c3d2f9388d36eb28640a220a6f908328442d873 ]
+
+alignof() gives an alignment of types as they would be as standalone
+variables. But alignment in structures might be different, and when
+building the fields of events, the alignment must be the actual
+alignment otherwise the field offsets may not match what they actually
+are.
+
+This caused trace-cmd to crash, as libtraceevent did not check if the
+field offset was bigger than the event. The write_msr and read_msr
+events on 32 bit had their fields incorrect, because it had a u64 field
+between two ints. alignof(u64) would give 8, but the u64 field was at a
+4 byte alignment.
+
+Define a macro as:
+
+ ALIGN_STRUCTFIELD(type) ((int)(offsetof(struct {char a; type b;}, b)))
+
+which gives the actual alignment of types in a structure.
+
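+For example, on a 32-bit x86 build (a sketch; the numbers assume the i386
+ABI behaviour described above, where a u64 is only 4-byte aligned inside
+structures):
+
+	struct sample { int a; u64 b; int c; };
+
+	__alignof__(u64);		/* 8 */
+	ALIGN_STRUCTFIELD(u64);		/* 4 - the alignment the layout really uses */
+	offsetof(struct sample, b);	/* 4, not 8 */
+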
+Link: https://lkml.kernel.org/r/20220731015928.7ab3a154@rorschach.local.home
+
+Cc: Ingo Molnar <mingo@kernel.org>
+Cc: Andrew Morton <akpm@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Cc: Masami Hiramatsu <mhiramat@kernel.org>
+Cc: stable@vger.kernel.org
+Fixes: 04ae87a52074e ("ftrace: Rework event_create_dir()")
+Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ include/trace/stages/stage4_event_fields.h | 8 +++++---
+ 1 file changed, 5 insertions(+), 3 deletions(-)
+
+diff --git a/include/trace/stages/stage4_event_fields.h b/include/trace/stages/stage4_event_fields.h
+index c3790ec7a453..80d34f396555 100644
+--- a/include/trace/stages/stage4_event_fields.h
++++ b/include/trace/stages/stage4_event_fields.h
+@@ -2,16 +2,18 @@
+
+ /* Stage 4 definitions for creating trace events */
+
++#define ALIGN_STRUCTFIELD(type) ((int)(offsetof(struct {char a; type b;}, b)))
++
+ #undef __field_ext
+ #define __field_ext(_type, _item, _filter_type) { \
+ .type = #_type, .name = #_item, \
+- .size = sizeof(_type), .align = __alignof__(_type), \
++ .size = sizeof(_type), .align = ALIGN_STRUCTFIELD(_type), \
+ .is_signed = is_signed_type(_type), .filter_type = _filter_type },
+
+ #undef __field_struct_ext
+ #define __field_struct_ext(_type, _item, _filter_type) { \
+ .type = #_type, .name = #_item, \
+- .size = sizeof(_type), .align = __alignof__(_type), \
++ .size = sizeof(_type), .align = ALIGN_STRUCTFIELD(_type), \
+ 0, .filter_type = _filter_type },
+
+ #undef __field
+@@ -23,7 +25,7 @@
+ #undef __array
+ #define __array(_type, _item, _len) { \
+ .type = #_type"["__stringify(_len)"]", .name = #_item, \
+- .size = sizeof(_type[_len]), .align = __alignof__(_type), \
++ .size = sizeof(_type[_len]), .align = ALIGN_STRUCTFIELD(_type), \
+ .is_signed = is_signed_type(_type), .filter_type = FILTER_OTHER },
+
+ #undef __dynamic_array
+--
+2.35.1
+
--- /dev/null
+From 16376f07f879be2fc73315678d2b0862b01399b0 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 11 Jul 2022 16:35:10 +0100
+Subject: tty: 8250: Add support for Brainboxes PX cards.
+
+From: Cameron Williams <cang1@live.co.uk>
+
+[ Upstream commit ef5a03a26c87a760bc3d86b5af7b773e82f8b1b7 ]
+
+Add support for some of the Brainboxes PCIe (PX) range of
+serial cards, including the PX-101, PX-235/PX-246,
+PX-203/PX-257, PX-260/PX-701, PX-310, PX-313,
+PX-320/PX-324/PX-376/PX-387, PX-335/PX-346, PX-368, PX-420,
+PX-803 and PX-846.
+
+Signed-off-by: Cameron Williams <cang1@live.co.uk>
+Cc: stable <stable@kernel.org>
+Link: https://lore.kernel.org/r/AM5PR0202MB2564669252BDC59BF55A6E87C4879@AM5PR0202MB2564.eurprd02.prod.outlook.com
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/tty/serial/8250/8250_pci.c | 109 +++++++++++++++++++++++++++++
+ 1 file changed, 109 insertions(+)
+
+diff --git a/drivers/tty/serial/8250/8250_pci.c b/drivers/tty/serial/8250/8250_pci.c
+index 818ed6cd3132..aeac20f7cbb2 100644
+--- a/drivers/tty/serial/8250/8250_pci.c
++++ b/drivers/tty/serial/8250/8250_pci.c
+@@ -5063,6 +5063,115 @@ static const struct pci_device_id serial_pci_tbl[] = {
+ PCI_ANY_ID, PCI_ANY_ID,
+ 0, 0,
+ pbn_b2_4_115200 },
++ /*
++ * Brainboxes PX-101
++ */
++ { PCI_VENDOR_ID_INTASHIELD, 0x4005,
++ PCI_ANY_ID, PCI_ANY_ID,
++ 0, 0,
++ pbn_b0_2_115200 },
++ { PCI_VENDOR_ID_INTASHIELD, 0x4019,
++ PCI_ANY_ID, PCI_ANY_ID,
++ 0, 0,
++ pbn_oxsemi_2_15625000 },
++ /*
++ * Brainboxes PX-235/246
++ */
++ { PCI_VENDOR_ID_INTASHIELD, 0x4004,
++ PCI_ANY_ID, PCI_ANY_ID,
++ 0, 0,
++ pbn_b0_1_115200 },
++ { PCI_VENDOR_ID_INTASHIELD, 0x4016,
++ PCI_ANY_ID, PCI_ANY_ID,
++ 0, 0,
++ pbn_oxsemi_1_15625000 },
++ /*
++ * Brainboxes PX-203/PX-257
++ */
++ { PCI_VENDOR_ID_INTASHIELD, 0x4006,
++ PCI_ANY_ID, PCI_ANY_ID,
++ 0, 0,
++ pbn_b0_2_115200 },
++ { PCI_VENDOR_ID_INTASHIELD, 0x4015,
++ PCI_ANY_ID, PCI_ANY_ID,
++ 0, 0,
++ pbn_oxsemi_4_15625000 },
++ /*
++ * Brainboxes PX-260/PX-701
++ */
++ { PCI_VENDOR_ID_INTASHIELD, 0x400A,
++ PCI_ANY_ID, PCI_ANY_ID,
++ 0, 0,
++ pbn_oxsemi_4_15625000 },
++ /*
++ * Brainboxes PX-310
++ */
++ { PCI_VENDOR_ID_INTASHIELD, 0x400E,
++ PCI_ANY_ID, PCI_ANY_ID,
++ 0, 0,
++ pbn_oxsemi_2_15625000 },
++ /*
++ * Brainboxes PX-313
++ */
++ { PCI_VENDOR_ID_INTASHIELD, 0x400C,
++ PCI_ANY_ID, PCI_ANY_ID,
++ 0, 0,
++ pbn_oxsemi_2_15625000 },
++ /*
++ * Brainboxes PX-320/324/PX-376/PX-387
++ */
++ { PCI_VENDOR_ID_INTASHIELD, 0x400B,
++ PCI_ANY_ID, PCI_ANY_ID,
++ 0, 0,
++ pbn_oxsemi_1_15625000 },
++ /*
++ * Brainboxes PX-335/346
++ */
++ { PCI_VENDOR_ID_INTASHIELD, 0x400F,
++ PCI_ANY_ID, PCI_ANY_ID,
++ 0, 0,
++ pbn_oxsemi_4_15625000 },
++ /*
++ * Brainboxes PX-368
++ */
++ { PCI_VENDOR_ID_INTASHIELD, 0x4010,
++ PCI_ANY_ID, PCI_ANY_ID,
++ 0, 0,
++ pbn_oxsemi_4_15625000 },
++ /*
++ * Brainboxes PX-420
++ */
++ { PCI_VENDOR_ID_INTASHIELD, 0x4000,
++ PCI_ANY_ID, PCI_ANY_ID,
++ 0, 0,
++ pbn_b0_4_115200 },
++ { PCI_VENDOR_ID_INTASHIELD, 0x4011,
++ PCI_ANY_ID, PCI_ANY_ID,
++ 0, 0,
++ pbn_oxsemi_4_15625000 },
++ /*
++ * Brainboxes PX-803
++ */
++ { PCI_VENDOR_ID_INTASHIELD, 0x4009,
++ PCI_ANY_ID, PCI_ANY_ID,
++ 0, 0,
++ pbn_b0_1_115200 },
++ { PCI_VENDOR_ID_INTASHIELD, 0x401E,
++ PCI_ANY_ID, PCI_ANY_ID,
++ 0, 0,
++ pbn_oxsemi_1_15625000 },
++ /*
++ * Brainboxes PX-846
++ */
++ { PCI_VENDOR_ID_INTASHIELD, 0x4008,
++ PCI_ANY_ID, PCI_ANY_ID,
++ 0, 0,
++ pbn_b0_1_115200 },
++ { PCI_VENDOR_ID_INTASHIELD, 0x4017,
++ PCI_ANY_ID, PCI_ANY_ID,
++ 0, 0,
++ pbn_oxsemi_1_15625000 },
++
+ /*
+ * Perle PCI-RAS cards
+ */
+--
+2.35.1
+
--- /dev/null
+From a7b73d5774f2b3d6656d48dbee4b2f995629a2c2 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 12 May 2022 10:42:04 +0200
+Subject: usbnet: smsc95xx: Avoid link settings race on interrupt reception
+
+From: Lukas Wunner <lukas@wunner.de>
+
+[ Upstream commit 8960f878e39fadc03d74292a6731f1e914cf2019 ]
+
+When a PHY interrupt is signaled, the SMSC LAN95xx driver updates the
+MAC full duplex mode and PHY flow control registers based on cached data
+in struct phy_device:
+
+ smsc95xx_status() # raises EVENT_LINK_RESET
+ usbnet_deferred_kevent()
+ smsc95xx_link_reset() # uses cached data in phydev
+
+Simultaneously, phylib polls link status once per second and updates
+that cached data:
+
+ phy_state_machine()
+ phy_check_link_status()
+ phy_read_status()
+ lan87xx_read_status()
+ genphy_read_status() # updates cached data in phydev
+
+If smsc95xx_link_reset() wins the race against genphy_read_status(),
+the registers may be updated based on stale data.
+
+E.g. if the link was previously down, phydev->duplex is set to
+DUPLEX_UNKNOWN and that's what smsc95xx_link_reset() will use, even
+though genphy_read_status() may update it to DUPLEX_FULL afterwards.
+
+PHY interrupts are currently only enabled on suspend to trigger wakeup,
+so the impact of the race is limited, but we're about to enable them
+perpetually.
+
+Avoid the race by delaying execution of smsc95xx_link_reset() until
+phy_state_machine() has done its job and calls back via
+smsc95xx_handle_link_change().
+
+Signaling EVENT_LINK_RESET on wakeup is not necessary because phylib
+picks up link status changes through polling. So drop the declaration
+of a ->link_reset() callback.
+
+Note that the semicolon on a line by itself added in smsc95xx_status()
+is a placeholder for a function call which will be added in a subsequent
+commit. That function call will actually handle the INT_ENP_PHY_INT_
+interrupt.
+
+Tested-by: Oleksij Rempel <o.rempel@pengutronix.de> # LAN9514/9512/9500
+Tested-by: Ferry Toth <fntoth@gmail.com> # LAN9514
+Signed-off-by: Lukas Wunner <lukas@wunner.de>
+Reviewed-by: Andrew Lunn <andrew@lunn.ch>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/usb/smsc95xx.c | 16 +++++++++-------
+ 1 file changed, 9 insertions(+), 7 deletions(-)
+
+diff --git a/drivers/net/usb/smsc95xx.c b/drivers/net/usb/smsc95xx.c
+index 2cb44d65bbc3..f5a208948d22 100644
+--- a/drivers/net/usb/smsc95xx.c
++++ b/drivers/net/usb/smsc95xx.c
+@@ -566,7 +566,7 @@ static int smsc95xx_phy_update_flowcontrol(struct usbnet *dev)
+ return smsc95xx_write_reg(dev, AFC_CFG, afc_cfg);
+ }
+
+-static int smsc95xx_link_reset(struct usbnet *dev)
++static void smsc95xx_mac_update_fullduplex(struct usbnet *dev)
+ {
+ struct smsc95xx_priv *pdata = dev->driver_priv;
+ unsigned long flags;
+@@ -583,14 +583,16 @@ static int smsc95xx_link_reset(struct usbnet *dev)
+ spin_unlock_irqrestore(&pdata->mac_cr_lock, flags);
+
+ ret = smsc95xx_write_reg(dev, MAC_CR, pdata->mac_cr);
+- if (ret < 0)
+- return ret;
++ if (ret < 0) {
++ if (ret != -ENODEV)
++ netdev_warn(dev->net,
++ "Error updating MAC full duplex mode\n");
++ return;
++ }
+
+ ret = smsc95xx_phy_update_flowcontrol(dev);
+ if (ret < 0)
+ netdev_warn(dev->net, "Error updating PHY flow control\n");
+-
+- return ret;
+ }
+
+ static void smsc95xx_status(struct usbnet *dev, struct urb *urb)
+@@ -607,7 +609,7 @@ static void smsc95xx_status(struct usbnet *dev, struct urb *urb)
+ netif_dbg(dev, link, dev->net, "intdata: 0x%08X\n", intdata);
+
+ if (intdata & INT_ENP_PHY_INT_)
+- usbnet_defer_kevent(dev, EVENT_LINK_RESET);
++ ;
+ else
+ netdev_warn(dev->net, "unexpected interrupt, intdata=0x%08X\n",
+ intdata);
+@@ -1088,6 +1090,7 @@ static void smsc95xx_handle_link_change(struct net_device *net)
+ struct usbnet *dev = netdev_priv(net);
+
+ phy_print_status(net->phydev);
++ smsc95xx_mac_update_fullduplex(dev);
+ usbnet_defer_kevent(dev, EVENT_LINK_CHANGE);
+ }
+
+@@ -1993,7 +1996,6 @@ static const struct driver_info smsc95xx_info = {
+ .description = "smsc95xx USB 2.0 Ethernet",
+ .bind = smsc95xx_bind,
+ .unbind = smsc95xx_unbind,
+- .link_reset = smsc95xx_link_reset,
+ .reset = smsc95xx_reset,
+ .check_connect = smsc95xx_start_phy,
+ .stop = smsc95xx_stop,
+--
+2.35.1
+
--- /dev/null
+From 4a8d053acd3acd06d45b12cd8902178fdaf18acc Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 12 May 2022 10:42:02 +0200
+Subject: usbnet: smsc95xx: Don't clear read-only PHY interrupt
+
+From: Lukas Wunner <lukas@wunner.de>
+
+[ Upstream commit 3108871f19221372b251f7da1ac38736928b5b3a ]
+
+Upon receiving data from the Interrupt Endpoint, the SMSC LAN95xx driver
+attempts to clear the signaled interrupts by writing "all ones" to the
+Interrupt Status Register.
+
+However the driver only ever enables a single type of interrupt, namely
+the PHY Interrupt. And according to page 119 of the LAN950x datasheet,
+its bit in the Interrupt Status Register is read-only. There's no other
+way to clear it than in a separate PHY register:
+
+https://www.microchip.com/content/dam/mchp/documents/UNG/ProductDocuments/DataSheets/LAN950x-Data-Sheet-DS00001875D.pdf
+
+Consequently, writing "all ones" to the Interrupt Status Register is
+pointless and can be dropped.
+
+Tested-by: Oleksij Rempel <o.rempel@pengutronix.de> # LAN9514/9512/9500
+Tested-by: Ferry Toth <fntoth@gmail.com> # LAN9514
+Signed-off-by: Lukas Wunner <lukas@wunner.de>
+Reviewed-by: Andrew Lunn <andrew@lunn.ch>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/usb/smsc95xx.c | 4 ----
+ 1 file changed, 4 deletions(-)
+
+diff --git a/drivers/net/usb/smsc95xx.c b/drivers/net/usb/smsc95xx.c
+index edf0492ad489..2cb44d65bbc3 100644
+--- a/drivers/net/usb/smsc95xx.c
++++ b/drivers/net/usb/smsc95xx.c
+@@ -572,10 +572,6 @@ static int smsc95xx_link_reset(struct usbnet *dev)
+ unsigned long flags;
+ int ret;
+
+- ret = smsc95xx_write_reg(dev, INT_STS, INT_STS_CLEAR_ALL_);
+- if (ret < 0)
+- return ret;
+-
+ spin_lock_irqsave(&pdata->mac_cr_lock, flags);
+ if (pdata->phydev->duplex != DUPLEX_FULL) {
+ pdata->mac_cr &= ~MAC_CR_FDPX_;
+--
+2.35.1
+
--- /dev/null
+From df274cfed28127dbf7d1d06f05318fa92faec8b6 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 1 Jul 2022 22:47:51 +0200
+Subject: usbnet: smsc95xx: Fix deadlock on runtime resume
+
+From: Lukas Wunner <lukas@wunner.de>
+
+[ Upstream commit 7b960c967f2aa01ab8f45c5a0bd78e754cffdeee ]
+
+Commit 05b35e7eb9a1 ("smsc95xx: add phylib support") amended
+smsc95xx_resume() to call phy_init_hw(). That function waits for the
+device to runtime resume even though it is placed in the runtime resume
+path, causing a deadlock.
+
+The problem is that phy_init_hw() calls down to smsc95xx_mdiobus_read(),
+which never uses the _nopm variant of usbnet_read_cmd().
+
+Commit b4df480f68ae ("usbnet: smsc95xx: add reset_resume function with
+reset operation") causes a similar deadlock on resume if the device was
+already runtime suspended when entering system sleep:
+
+That's because the commit introduced smsc95xx_reset_resume(), which
+calls down to smsc95xx_reset(), which neglects to use _nopm accessors.
+
+Fix by auto-detecting whether a device access is performed by the
+suspend/resume task_struct and using the _nopm variant if so. This works
+because the PM core guarantees that suspend/resume callbacks are run in
+task context.
+
+Stacktrace for posterity:
+
+ INFO: task kworker/2:1:49 blocked for more than 122 seconds.
+ Workqueue: usb_hub_wq hub_event
+ schedule
+ rpm_resume
+ __pm_runtime_resume
+ usb_autopm_get_interface
+ usbnet_read_cmd
+ __smsc95xx_read_reg
+ __smsc95xx_phy_wait_not_busy
+ __smsc95xx_mdio_read
+ smsc95xx_mdiobus_read
+ __mdiobus_read
+ mdiobus_read
+ smsc_phy_reset
+ phy_init_hw
+ smsc95xx_resume
+ usb_resume_interface
+ usb_resume_both
+ usb_runtime_resume
+ __rpm_callback
+ rpm_callback
+ rpm_resume
+ __pm_runtime_resume
+ usb_autoresume_device
+ hub_event
+ process_one_work
+
+Fixes: b4df480f68ae ("usbnet: smsc95xx: add reset_resume function with reset operation")
+Signed-off-by: Lukas Wunner <lukas@wunner.de>
+Cc: stable@vger.kernel.org # v3.16+
+Cc: Andre Edich <andre.edich@microchip.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/usb/smsc95xx.c | 26 ++++++++++++++++++++------
+ 1 file changed, 20 insertions(+), 6 deletions(-)
+
+diff --git a/drivers/net/usb/smsc95xx.c b/drivers/net/usb/smsc95xx.c
+index 358b170cc8fb..515363d74078 100644
+--- a/drivers/net/usb/smsc95xx.c
++++ b/drivers/net/usb/smsc95xx.c
+@@ -71,6 +71,7 @@ struct smsc95xx_priv {
+ struct fwnode_handle *irqfwnode;
+ struct mii_bus *mdiobus;
+ struct phy_device *phydev;
++ struct task_struct *pm_task;
+ };
+
+ static bool turbo_mode = true;
+@@ -80,13 +81,14 @@ MODULE_PARM_DESC(turbo_mode, "Enable multiple frames per Rx transaction");
+ static int __must_check __smsc95xx_read_reg(struct usbnet *dev, u32 index,
+ u32 *data, int in_pm)
+ {
++ struct smsc95xx_priv *pdata = dev->driver_priv;
+ u32 buf;
+ int ret;
+ int (*fn)(struct usbnet *, u8, u8, u16, u16, void *, u16);
+
+ BUG_ON(!dev);
+
+- if (!in_pm)
++ if (current != pdata->pm_task)
+ fn = usbnet_read_cmd;
+ else
+ fn = usbnet_read_cmd_nopm;
+@@ -110,13 +112,14 @@ static int __must_check __smsc95xx_read_reg(struct usbnet *dev, u32 index,
+ static int __must_check __smsc95xx_write_reg(struct usbnet *dev, u32 index,
+ u32 data, int in_pm)
+ {
++ struct smsc95xx_priv *pdata = dev->driver_priv;
+ u32 buf;
+ int ret;
+ int (*fn)(struct usbnet *, u8, u8, u16, u16, const void *, u16);
+
+ BUG_ON(!dev);
+
+- if (!in_pm)
++ if (current != pdata->pm_task)
+ fn = usbnet_write_cmd;
+ else
+ fn = usbnet_write_cmd_nopm;
+@@ -1508,9 +1511,12 @@ static int smsc95xx_suspend(struct usb_interface *intf, pm_message_t message)
+ u32 val, link_up;
+ int ret;
+
++ pdata->pm_task = current;
++
+ ret = usbnet_suspend(intf, message);
+ if (ret < 0) {
+ netdev_warn(dev->net, "usbnet_suspend error\n");
++ pdata->pm_task = NULL;
+ return ret;
+ }
+
+@@ -1750,6 +1756,7 @@ static int smsc95xx_suspend(struct usb_interface *intf, pm_message_t message)
+ if (ret && PMSG_IS_AUTO(message))
+ usbnet_resume(intf);
+
++ pdata->pm_task = NULL;
+ return ret;
+ }
+
+@@ -1770,29 +1777,31 @@ static int smsc95xx_resume(struct usb_interface *intf)
+ /* do this first to ensure it's cleared even in error case */
+ pdata->suspend_flags = 0;
+
++ pdata->pm_task = current;
++
+ if (suspend_flags & SUSPEND_ALLMODES) {
+ /* clear wake-up sources */
+ ret = smsc95xx_read_reg_nopm(dev, WUCSR, &val);
+ if (ret < 0)
+- return ret;
++ goto done;
+
+ val &= ~(WUCSR_WAKE_EN_ | WUCSR_MPEN_);
+
+ ret = smsc95xx_write_reg_nopm(dev, WUCSR, val);
+ if (ret < 0)
+- return ret;
++ goto done;
+
+ /* clear wake-up status */
+ ret = smsc95xx_read_reg_nopm(dev, PM_CTRL, &val);
+ if (ret < 0)
+- return ret;
++ goto done;
+
+ val &= ~PM_CTL_WOL_EN_;
+ val |= PM_CTL_WUPS_;
+
+ ret = smsc95xx_write_reg_nopm(dev, PM_CTRL, val);
+ if (ret < 0)
+- return ret;
++ goto done;
+ }
+
+ phy_init_hw(pdata->phydev);
+@@ -1801,15 +1810,20 @@ static int smsc95xx_resume(struct usb_interface *intf)
+ if (ret < 0)
+ netdev_warn(dev->net, "usbnet_resume error\n");
+
++done:
++ pdata->pm_task = NULL;
+ return ret;
+ }
+
+ static int smsc95xx_reset_resume(struct usb_interface *intf)
+ {
+ struct usbnet *dev = usb_get_intfdata(intf);
++ struct smsc95xx_priv *pdata = dev->driver_priv;
+ int ret;
+
++ pdata->pm_task = current;
+ ret = smsc95xx_reset(dev);
++ pdata->pm_task = NULL;
+ if (ret < 0)
+ return ret;
+
+--
+2.35.1
+
--- /dev/null
+From 0a0b027602d0b6c0b8dcdfc10ac37e8ec0d648d4 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 12 May 2022 10:42:05 +0200
+Subject: usbnet: smsc95xx: Forward PHY interrupts to PHY driver to avoid
+ polling
+
+From: Lukas Wunner <lukas@wunner.de>
+
+[ Upstream commit 1ce8b37241ed291af56f7a49bbdbf20c08728e88 ]
+
+Link status of SMSC LAN95xx chips is polled once per second, even though
+they're capable of signaling PHY interrupts through the MAC layer.
+
+Forward those interrupts to the PHY driver to avoid polling. Benefits
+are reduced bus traffic, reduced CPU overhead and quicker interface
+bringup.
+
+Polling was introduced in 2016 by commit d69d16949346 ("usbnet:
+smsc95xx: fix link detection for disabled autonegotiation").
+Back then, the LAN95xx driver neglected to enable the ENERGYON interrupt,
+hence couldn't detect link-up events when auto-negotiation was disabled.
+The proper solution would have been to enable the ENERGYON interrupt
+instead of polling.
+
+Since then, PHY handling was moved from the LAN95xx driver to the SMSC
+PHY driver with commit 05b35e7eb9a1 ("smsc95xx: add phylib support").
+That PHY driver is capable of link detection with auto-negotiation
+disabled because it enables the ENERGYON interrupt.
+
+Note that signaling interrupts through the MAC layer not only works with
+the integrated PHY, but also with an external PHY, provided its
+interrupt pin is attached to LAN95xx's nPHY_INT pin.
+
+In the unlikely event that the interrupt pin of an external PHY is
+attached to a GPIO of the SoC (or not connected at all), the driver can
+be amended to retrieve the irq from the PHY's of_node.
+
+To forward PHY interrupts to phylib, it is not sufficient to call
+phy_mac_interrupt(). Instead, the PHY's interrupt handler needs to run
+so that PHY interrupts are cleared. That's because according to page
+119 of the LAN950x datasheet, "The source of this interrupt is a level.
+The interrupt persists until it is cleared in the PHY."
+
+https://www.microchip.com/content/dam/mchp/documents/UNG/ProductDocuments/DataSheets/LAN950x-Data-Sheet-DS00001875D.pdf
+
+Therefore, create an IRQ domain with a single IRQ for the PHY. In the
+future, the IRQ domain may be extended to support the 11 GPIOs on the
+LAN95xx.
+
+Normally the PHY interrupt should be masked until the PHY driver has
+cleared it. However masking requires a (sleeping) USB transaction and
+interrupts are received in (non-sleepable) softirq context. I decided
+not to mask the interrupt at all (by using the dummy_irq_chip's noop
+->irq_mask() callback): The USB interrupt endpoint is polled in 1 msec
+intervals and normally that's sufficient to wake the PHY driver's IRQ
+thread and have it clear the interrupt. If it does take longer, worst
+thing that can happen is the IRQ thread is woken again. No big deal.
+
+Because PHY interrupts are now perpetually enabled, there's no need to
+selectively enable them on suspend. So remove all invocations of
+smsc95xx_enable_phy_wakeup_interrupts().
+
+In smsc95xx_resume(), move the call of phy_init_hw() before
+usbnet_resume() (which restarts the status URB) to ensure that the PHY
+is fully initialized when an interrupt is handled.
+
+Tested-by: Oleksij Rempel <o.rempel@pengutronix.de> # LAN9514/9512/9500
+Tested-by: Ferry Toth <fntoth@gmail.com> # LAN9514
+Signed-off-by: Lukas Wunner <lukas@wunner.de>
+Reviewed-by: Andrew Lunn <andrew@lunn.ch> # from a PHY perspective
+Cc: Andre Edich <andre.edich@microchip.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/usb/smsc95xx.c | 113 ++++++++++++++++++++-----------------
+ 1 file changed, 61 insertions(+), 52 deletions(-)
+
+diff --git a/drivers/net/usb/smsc95xx.c b/drivers/net/usb/smsc95xx.c
+index f5a208948d22..358b170cc8fb 100644
+--- a/drivers/net/usb/smsc95xx.c
++++ b/drivers/net/usb/smsc95xx.c
+@@ -18,6 +18,8 @@
+ #include <linux/usb/usbnet.h>
+ #include <linux/slab.h>
+ #include <linux/of_net.h>
++#include <linux/irq.h>
++#include <linux/irqdomain.h>
+ #include <linux/mdio.h>
+ #include <linux/phy.h>
+ #include <net/selftests.h>
+@@ -53,6 +55,9 @@
+ #define SUSPEND_ALLMODES (SUSPEND_SUSPEND0 | SUSPEND_SUSPEND1 | \
+ SUSPEND_SUSPEND2 | SUSPEND_SUSPEND3)
+
++#define SMSC95XX_NR_IRQS (1) /* raise to 12 for GPIOs */
++#define PHY_HWIRQ (SMSC95XX_NR_IRQS - 1)
++
+ struct smsc95xx_priv {
+ u32 mac_cr;
+ u32 hash_hi;
+@@ -61,6 +66,9 @@ struct smsc95xx_priv {
+ spinlock_t mac_cr_lock;
+ u8 features;
+ u8 suspend_flags;
++ struct irq_chip irqchip;
++ struct irq_domain *irqdomain;
++ struct fwnode_handle *irqfwnode;
+ struct mii_bus *mdiobus;
+ struct phy_device *phydev;
+ };
+@@ -597,6 +605,8 @@ static void smsc95xx_mac_update_fullduplex(struct usbnet *dev)
+
+ static void smsc95xx_status(struct usbnet *dev, struct urb *urb)
+ {
++ struct smsc95xx_priv *pdata = dev->driver_priv;
++ unsigned long flags;
+ u32 intdata;
+
+ if (urb->actual_length != 4) {
+@@ -608,11 +618,15 @@ static void smsc95xx_status(struct usbnet *dev, struct urb *urb)
+ intdata = get_unaligned_le32(urb->transfer_buffer);
+ netif_dbg(dev, link, dev->net, "intdata: 0x%08X\n", intdata);
+
++ local_irq_save(flags);
++
+ if (intdata & INT_ENP_PHY_INT_)
+- ;
++ generic_handle_domain_irq(pdata->irqdomain, PHY_HWIRQ);
+ else
+ netdev_warn(dev->net, "unexpected interrupt, intdata=0x%08X\n",
+ intdata);
++
++ local_irq_restore(flags);
+ }
+
+ /* Enable or disable Tx & Rx checksum offload engines */
+@@ -1098,8 +1112,9 @@ static int smsc95xx_bind(struct usbnet *dev, struct usb_interface *intf)
+ {
+ struct smsc95xx_priv *pdata;
+ bool is_internal_phy;
++ char usb_path[64];
++ int ret, phy_irq;
+ u32 val;
+- int ret;
+
+ printk(KERN_INFO SMSC_CHIPNAME " v" SMSC_DRIVER_VERSION "\n");
+
+@@ -1139,10 +1154,38 @@ static int smsc95xx_bind(struct usbnet *dev, struct usb_interface *intf)
+ if (ret)
+ goto free_pdata;
+
++ /* create irq domain for use by PHY driver and GPIO consumers */
++ usb_make_path(dev->udev, usb_path, sizeof(usb_path));
++ pdata->irqfwnode = irq_domain_alloc_named_fwnode(usb_path);
++ if (!pdata->irqfwnode) {
++ ret = -ENOMEM;
++ goto free_pdata;
++ }
++
++ pdata->irqdomain = irq_domain_create_linear(pdata->irqfwnode,
++ SMSC95XX_NR_IRQS,
++ &irq_domain_simple_ops,
++ pdata);
++ if (!pdata->irqdomain) {
++ ret = -ENOMEM;
++ goto free_irqfwnode;
++ }
++
++ phy_irq = irq_create_mapping(pdata->irqdomain, PHY_HWIRQ);
++ if (!phy_irq) {
++ ret = -ENOENT;
++ goto remove_irqdomain;
++ }
++
++ pdata->irqchip = dummy_irq_chip;
++ pdata->irqchip.name = SMSC_CHIPNAME;
++ irq_set_chip_and_handler_name(phy_irq, &pdata->irqchip,
++ handle_simple_irq, "phy");
++
+ pdata->mdiobus = mdiobus_alloc();
+ if (!pdata->mdiobus) {
+ ret = -ENOMEM;
+- goto free_pdata;
++ goto dispose_irq;
+ }
+
+ ret = smsc95xx_read_reg(dev, HW_CFG, &val);
+@@ -1175,6 +1218,7 @@ static int smsc95xx_bind(struct usbnet *dev, struct usb_interface *intf)
+ goto unregister_mdio;
+ }
+
++ pdata->phydev->irq = phy_irq;
+ pdata->phydev->is_internal = is_internal_phy;
+
+ /* detect device revision as different features may be available */
+@@ -1217,6 +1261,15 @@ static int smsc95xx_bind(struct usbnet *dev, struct usb_interface *intf)
+ free_mdio:
+ mdiobus_free(pdata->mdiobus);
+
++dispose_irq:
++ irq_dispose_mapping(phy_irq);
++
++remove_irqdomain:
++ irq_domain_remove(pdata->irqdomain);
++
++free_irqfwnode:
++ irq_domain_free_fwnode(pdata->irqfwnode);
++
+ free_pdata:
+ kfree(pdata);
+ return ret;
+@@ -1229,6 +1282,9 @@ static void smsc95xx_unbind(struct usbnet *dev, struct usb_interface *intf)
+ phy_disconnect(dev->net->phydev);
+ mdiobus_unregister(pdata->mdiobus);
+ mdiobus_free(pdata->mdiobus);
++ irq_dispose_mapping(irq_find_mapping(pdata->irqdomain, PHY_HWIRQ));
++ irq_domain_remove(pdata->irqdomain);
++ irq_domain_free_fwnode(pdata->irqfwnode);
+ netif_dbg(dev, ifdown, dev->net, "free pdata\n");
+ kfree(pdata);
+ }
+@@ -1253,29 +1309,6 @@ static u32 smsc_crc(const u8 *buffer, size_t len, int filter)
+ return crc << ((filter % 2) * 16);
+ }
+
+-static int smsc95xx_enable_phy_wakeup_interrupts(struct usbnet *dev, u16 mask)
+-{
+- int ret;
+-
+- netdev_dbg(dev->net, "enabling PHY wakeup interrupts\n");
+-
+- /* read to clear */
+- ret = smsc95xx_mdio_read_nopm(dev, PHY_INT_SRC);
+- if (ret < 0)
+- return ret;
+-
+- /* enable interrupt source */
+- ret = smsc95xx_mdio_read_nopm(dev, PHY_INT_MASK);
+- if (ret < 0)
+- return ret;
+-
+- ret |= mask;
+-
+- smsc95xx_mdio_write_nopm(dev, PHY_INT_MASK, ret);
+-
+- return 0;
+-}
+-
+ static int smsc95xx_link_ok_nopm(struct usbnet *dev)
+ {
+ int ret;
+@@ -1442,7 +1475,6 @@ static int smsc95xx_enter_suspend3(struct usbnet *dev)
+ static int smsc95xx_autosuspend(struct usbnet *dev, u32 link_up)
+ {
+ struct smsc95xx_priv *pdata = dev->driver_priv;
+- int ret;
+
+ if (!netif_running(dev->net)) {
+ /* interface is ifconfig down so fully power down hw */
+@@ -1461,27 +1493,10 @@ static int smsc95xx_autosuspend(struct usbnet *dev, u32 link_up)
+ }
+
+ netdev_dbg(dev->net, "autosuspend entering SUSPEND1\n");
+-
+- /* enable PHY wakeup events for if cable is attached */
+- ret = smsc95xx_enable_phy_wakeup_interrupts(dev,
+- PHY_INT_MASK_ANEG_COMP_);
+- if (ret < 0) {
+- netdev_warn(dev->net, "error enabling PHY wakeup ints\n");
+- return ret;
+- }
+-
+ netdev_info(dev->net, "entering SUSPEND1 mode\n");
+ return smsc95xx_enter_suspend1(dev);
+ }
+
+- /* enable PHY wakeup events so we remote wakeup if cable is pulled */
+- ret = smsc95xx_enable_phy_wakeup_interrupts(dev,
+- PHY_INT_MASK_LINK_DOWN_);
+- if (ret < 0) {
+- netdev_warn(dev->net, "error enabling PHY wakeup ints\n");
+- return ret;
+- }
+-
+ netdev_dbg(dev->net, "autosuspend entering SUSPEND3\n");
+ return smsc95xx_enter_suspend3(dev);
+ }
+@@ -1547,13 +1562,6 @@ static int smsc95xx_suspend(struct usb_interface *intf, pm_message_t message)
+ }
+
+ if (pdata->wolopts & WAKE_PHY) {
+- ret = smsc95xx_enable_phy_wakeup_interrupts(dev,
+- (PHY_INT_MASK_ANEG_COMP_ | PHY_INT_MASK_LINK_DOWN_));
+- if (ret < 0) {
+- netdev_warn(dev->net, "error enabling PHY wakeup ints\n");
+- goto done;
+- }
+-
+ /* if link is down then configure EDPD and enter SUSPEND1,
+ * otherwise enter SUSPEND0 below
+ */
+@@ -1787,11 +1795,12 @@ static int smsc95xx_resume(struct usb_interface *intf)
+ return ret;
+ }
+
++ phy_init_hw(pdata->phydev);
++
+ ret = usbnet_resume(intf);
+ if (ret < 0)
+ netdev_warn(dev->net, "usbnet_resume error\n");
+
+- phy_init_hw(pdata->phydev);
+ return ret;
+ }
+
+--
+2.35.1
+
--- /dev/null
+From 2eb8d6ab6b01411d529d883d9a23396b251b4c91 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 2 Aug 2022 15:04:16 +0900
+Subject: x86/kprobes: Update kcb status flag after singlestepping
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Masami Hiramatsu (Google) <mhiramat@kernel.org>
+
+[ Upstream commit dec8784c9088b131a1523f582c2194cfc8107dc0 ]
+
+Fix kprobes to update the kcb (kprobes control block) status flag to
+KPROBE_HIT_SSDONE even if kp->post_handler is not set.
+
+This bug can cause a kernel panic if another INT3 user runs right
+after kprobes, because kprobe_int3_handler() then misinterprets that
+INT3 as kprobes' own single-stepping INT3.
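+
+For illustration, the misclassification happens roughly along these
+lines (a simplified sketch of the decision in kprobe_int3_handler(),
+not the actual code):
+
+    /*
+     * No kprobe is registered at this address, so check whether this
+     * is kprobes' own single-stepping INT3.  A stale KPROBE_HIT_SS
+     * left behind by the previous probe sends an unrelated INT3 down
+     * this path, operating on state that has already been reset.
+     */
+    if (!get_kprobe(addr) && kprobe_is_ss(kcb)) {
+            kprobe_post_process(kprobe_running(), regs, kcb);
+            return 1;
+    }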
+
+Fixes: 6256e668b7af ("x86/kprobes: Use int3 instead of debug trap for single-step")
+Reported-by: Daniel Müller <deso@posteo.net>
+Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Tested-by: Daniel Müller <deso@posteo.net>
+Cc: stable@vger.kernel.org
+Link: https://lore.kernel.org/all/20220727210136.jjgc3lpqeq42yr3m@muellerd-fedora-PC2BDTX9
+Link: https://lore.kernel.org/r/165942025658.342061.12452378391879093249.stgit@devnote2
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/kernel/kprobes/core.c | 18 +++++++++++-------
+ 1 file changed, 11 insertions(+), 7 deletions(-)
+
+diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
+index 7c4ab8870da4..74167dc5f55e 100644
+--- a/arch/x86/kernel/kprobes/core.c
++++ b/arch/x86/kernel/kprobes/core.c
+@@ -814,16 +814,20 @@ set_current_kprobe(struct kprobe *p, struct pt_regs *regs,
+ static void kprobe_post_process(struct kprobe *cur, struct pt_regs *regs,
+ struct kprobe_ctlblk *kcb)
+ {
+- if ((kcb->kprobe_status != KPROBE_REENTER) && cur->post_handler) {
+- kcb->kprobe_status = KPROBE_HIT_SSDONE;
+- cur->post_handler(cur, regs, 0);
+- }
+-
+ /* Restore back the original saved kprobes variables and continue. */
+- if (kcb->kprobe_status == KPROBE_REENTER)
++ if (kcb->kprobe_status == KPROBE_REENTER) {
++ /* This will restore both kcb and current_kprobe */
+ restore_previous_kprobe(kcb);
+- else
++ } else {
++ /*
++ * Always update the kcb status because
++ * reset_curent_kprobe() doesn't update kcb.
++ */
++ kcb->kprobe_status = KPROBE_HIT_SSDONE;
++ if (cur->post_handler)
++ cur->post_handler(cur, regs, 0);
+ reset_current_kprobe();
++ }
+ }
+ NOKPROBE_SYMBOL(kprobe_post_process);
+
+--
+2.35.1
+
--- /dev/null
+From b63e4693e67826d5eb07e8254daed08277df7133 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 15 Jul 2022 17:15:36 +0200
+Subject: x86/olpc: fix 'logical not is only applied to the left hand side'
+
+From: Alexander Lobakin <alexandr.lobakin@intel.com>
+
+[ Upstream commit 3a2ba42cbd0b669ce3837ba400905f93dd06c79f ]
+
+The bitops compile-time optimization series revealed one more
+problem in olpc-xo1-sci.c:send_ebook_state(), which resulted in GCC
+warnings:
+
+arch/x86/platform/olpc/olpc-xo1-sci.c: In function 'send_ebook_state':
+arch/x86/platform/olpc/olpc-xo1-sci.c:83:63: warning: logical not is only applied to the left hand side of comparison [-Wlogical-not-parentheses]
+ 83 | if (!!test_bit(SW_TABLET_MODE, ebook_switch_idev->sw) == state)
+ | ^~
+arch/x86/platform/olpc/olpc-xo1-sci.c:83:13: note: add parentheses around left hand side expression to silence this warning
+
+Although this code works as intended, the redundant double negation
+of a boolean value, together with a comparison against a `char` that
+is never explicitly converted to bool, makes compilers suspect an
+unintentional logic error here. Turn the expression around and apply
+the double negation to the char instead to silence the warnings.
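+
+To see why compilers are wary, here is the same pattern as a
+stand-alone C example (illustrative only; in the driver the char is
+effectively boolean, so both forms behave the same there):
+
+    #include <stdio.h>
+
+    int main(void)
+    {
+            char state = 2;   /* any non-zero value means "switch set" */
+            int bit = 1;      /* a bit test typically yields 0 or 1 */
+
+            /* old shape: '!' binds only to the left-hand side,
+             * and 1 == 2 is false although both sides are "true" */
+            printf("%d\n", !!bit == state);   /* prints 0 */
+
+            /* new shape: normalize the char on the right instead */
+            printf("%d\n", bit == !!state);   /* prints 1 */
+
+            return 0;
+    }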
+
+Fixes: d2aa37411b8e ("x86/olpc/xo1/sci: Produce wakeup events for buttons and switches")
+Cc: stable@vger.kernel.org # 3.5+
+Reported-by: Guenter Roeck <linux@roeck-us.net>
+Reported-by: kernel test robot <lkp@intel.com>
+Reviewed-and-tested-by: Guenter Roeck <linux@roeck-us.net>
+Signed-off-by: Alexander Lobakin <alexandr.lobakin@intel.com>
+Signed-off-by: Yury Norov <yury.norov@gmail.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/platform/olpc/olpc-xo1-sci.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/arch/x86/platform/olpc/olpc-xo1-sci.c b/arch/x86/platform/olpc/olpc-xo1-sci.c
+index f03a6883dcc6..89f25af4b3c3 100644
+--- a/arch/x86/platform/olpc/olpc-xo1-sci.c
++++ b/arch/x86/platform/olpc/olpc-xo1-sci.c
+@@ -80,7 +80,7 @@ static void send_ebook_state(void)
+ return;
+ }
+
+- if (!!test_bit(SW_TABLET_MODE, ebook_switch_idev->sw) == state)
++ if (test_bit(SW_TABLET_MODE, ebook_switch_idev->sw) == !!state)
+ return; /* Nothing new to report. */
+
+ input_report_switch(ebook_switch_idev, SW_TABLET_MODE, state);
+--
+2.35.1
+