From: Sasha Levin Date: Mon, 15 Aug 2022 05:56:00 +0000 (-0400) Subject: Fixes for 5.18 X-Git-Tag: v5.15.61~99 X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=3aedefd3f3565e566ffa402436535e847d65049b;p=thirdparty%2Fkernel%2Fstable-queue.git Fixes for 5.18 Signed-off-by: Sasha Levin --- diff --git a/queue-5.18/__follow_mount_rcu-verify-that-mount_lock-remains-un.patch b/queue-5.18/__follow_mount_rcu-verify-that-mount_lock-remains-un.patch new file mode 100644 index 00000000000..c561a7f535b --- /dev/null +++ b/queue-5.18/__follow_mount_rcu-verify-that-mount_lock-remains-un.patch @@ -0,0 +1,51 @@ +From 4be2ce739fb3c1ad0fbc2337b07b33a326009677 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 4 Jul 2022 17:26:29 -0400 +Subject: __follow_mount_rcu(): verify that mount_lock remains unchanged + +From: Al Viro + +[ Upstream commit 20aac6c60981f5bfacd66661d090d907bf1482f0 ] + +Validate mount_lock seqcount as soon as we cross into mount in RCU +mode. Sure, ->mnt_root is pinned and will remain so until we +do rcu_read_unlock() anyway, and we will eventually fail to unlazy if +the mount_lock had been touched, but we might run into a hard error +(e.g. -ENOENT) before trying to unlazy. And it's possible to end +up with RCU pathwalk racing with rename() and umount() in a way +that would fail with -ENOENT while non-RCU pathwalk would've +succeeded with any timings. + +Once upon a time we hadn't needed that, but analysis had been subtle, +brittle and went out of window as soon as RENAME_EXCHANGE had been +added. + +It's narrow, hard to hit and won't get you anything other than +stray -ENOENT that could be arranged in much easier way with the +same priveleges, but it's a bug all the same. + +Cc: stable@kernel.org +X-sky-is-falling: unlikely +Fixes: da1ce0670c14 "vfs: add cross-rename" +Signed-off-by: Al Viro +Signed-off-by: Sasha Levin +--- + fs/namei.c | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/fs/namei.c b/fs/namei.c +index 740a40802780..2fa412c5a082 100644 +--- a/fs/namei.c ++++ b/fs/namei.c +@@ -1511,6 +1511,8 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path, + * becoming unpinned. + */ + flags = dentry->d_flags; ++ if (read_seqretry(&mount_lock, nd->m_seq)) ++ return false; + continue; + } + if (read_seqretry(&mount_lock, nd->m_seq)) +-- +2.35.1 + diff --git a/queue-5.18/acpi-cppc-do-not-prevent-cppc-from-working-in-the-fu.patch b/queue-5.18/acpi-cppc-do-not-prevent-cppc-from-working-in-the-fu.patch new file mode 100644 index 00000000000..1d1f000424b --- /dev/null +++ b/queue-5.18/acpi-cppc-do-not-prevent-cppc-from-working-in-the-fu.patch @@ -0,0 +1,131 @@ +From 5f735daa405bd9da9301cbe524cf0d5239a6082d Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 21 Jul 2022 19:41:10 +0200 +Subject: ACPI: CPPC: Do not prevent CPPC from working in the future + +From: Rafael J. Wysocki + +[ Upstream commit 4f4179fcf420873002035cf1941d844c9e0e7cb3 ] + +There is a problem with the current revision checks in +is_cppc_supported() that they essentially prevent the CPPC support +from working if a new _CPC package format revision being a proper +superset of the v3 and only causing _CPC to return a package with more +entries (while retaining the types and meaning of the entries defined by +the v3) is introduced in the future and used by the platform firmware. 
+ +In that case, as long as the number of entries in the _CPC return +package is at least CPPC_V3_NUM_ENT, it should be perfectly fine to +use the v3 support code and disregard the additional package entries +added by the new package format revision. + +For this reason, drop is_cppc_supported() altogether, put the revision +checks directly into acpi_cppc_processor_probe() so they are easier to +follow and rework them to take the case mentioned above into account. + +Fixes: 4773e77cdc9b ("ACPI / CPPC: Add support for CPPC v3") +Cc: 4.18+ # 4.18+ +Signed-off-by: Rafael J. Wysocki +Signed-off-by: Sasha Levin +--- + drivers/acpi/cppc_acpi.c | 54 ++++++++++++++++++---------------------- + include/acpi/cppc_acpi.h | 2 +- + 2 files changed, 25 insertions(+), 31 deletions(-) + +diff --git a/drivers/acpi/cppc_acpi.c b/drivers/acpi/cppc_acpi.c +index b8e26b6b5523..35d894674eba 100644 +--- a/drivers/acpi/cppc_acpi.c ++++ b/drivers/acpi/cppc_acpi.c +@@ -600,33 +600,6 @@ static int pcc_data_alloc(int pcc_ss_id) + return 0; + } + +-/* Check if CPPC revision + num_ent combination is supported */ +-static bool is_cppc_supported(int revision, int num_ent) +-{ +- int expected_num_ent; +- +- switch (revision) { +- case CPPC_V2_REV: +- expected_num_ent = CPPC_V2_NUM_ENT; +- break; +- case CPPC_V3_REV: +- expected_num_ent = CPPC_V3_NUM_ENT; +- break; +- default: +- pr_debug("Firmware exports unsupported CPPC revision: %d\n", +- revision); +- return false; +- } +- +- if (expected_num_ent != num_ent) { +- pr_debug("Firmware exports %d entries. Expected: %d for CPPC rev:%d\n", +- num_ent, expected_num_ent, revision); +- return false; +- } +- +- return true; +-} +- + /* + * An example CPC table looks like the following. + * +@@ -715,7 +688,6 @@ int acpi_cppc_processor_probe(struct acpi_processor *pr) + cpc_obj->type, pr->id); + goto out_free; + } +- cpc_ptr->num_entries = num_ent; + + /* Second entry should be revision. */ + cpc_obj = &out_obj->package.elements[1]; +@@ -726,10 +698,32 @@ int acpi_cppc_processor_probe(struct acpi_processor *pr) + cpc_obj->type, pr->id); + goto out_free; + } +- cpc_ptr->version = cpc_rev; + +- if (!is_cppc_supported(cpc_rev, num_ent)) ++ if (cpc_rev < CPPC_V2_REV) { ++ pr_debug("Unsupported _CPC Revision (%d) for CPU:%d\n", cpc_rev, ++ pr->id); ++ goto out_free; ++ } ++ ++ /* ++ * Disregard _CPC if the number of entries in the return pachage is not ++ * as expected, but support future revisions being proper supersets of ++ * the v3 and only causing more entries to be returned by _CPC. 
++ */ ++ if ((cpc_rev == CPPC_V2_REV && num_ent != CPPC_V2_NUM_ENT) || ++ (cpc_rev == CPPC_V3_REV && num_ent != CPPC_V3_NUM_ENT) || ++ (cpc_rev > CPPC_V3_REV && num_ent <= CPPC_V3_NUM_ENT)) { ++ pr_debug("Unexpected number of _CPC return package entries (%d) for CPU:%d\n", ++ num_ent, pr->id); + goto out_free; ++ } ++ if (cpc_rev > CPPC_V3_REV) { ++ num_ent = CPPC_V3_NUM_ENT; ++ cpc_rev = CPPC_V3_REV; ++ } ++ ++ cpc_ptr->num_entries = num_ent; ++ cpc_ptr->version = cpc_rev; + + /* Iterate through remaining entries in _CPC */ + for (i = 2; i < num_ent; i++) { +diff --git a/include/acpi/cppc_acpi.h b/include/acpi/cppc_acpi.h +index 181907349b49..a76f8c6b732d 100644 +--- a/include/acpi/cppc_acpi.h ++++ b/include/acpi/cppc_acpi.h +@@ -17,7 +17,7 @@ + #include + #include + +-/* Support CPPCv2 and CPPCv3 */ ++/* CPPCv2 and CPPCv3 support */ + #define CPPC_V2_REV 2 + #define CPPC_V3_REV 3 + #define CPPC_V2_NUM_ENT 21 +-- +2.35.1 + diff --git a/queue-5.18/batman-adv-tracing-use-the-new-__vstring-helper.patch b/queue-5.18/batman-adv-tracing-use-the-new-__vstring-helper.patch new file mode 100644 index 00000000000..9cc6be78217 --- /dev/null +++ b/queue-5.18/batman-adv-tracing-use-the-new-__vstring-helper.patch @@ -0,0 +1,69 @@ +From 6e6c54f719b05010a877a1d77f3b5cab7c585471 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Sun, 24 Jul 2022 19:16:50 -0400 +Subject: batman-adv: tracing: Use the new __vstring() helper + +From: Steven Rostedt (Google) + +[ Upstream commit 9abc291812d784bd4a26c01af4ebdbf9f2dbf0bb ] + +Instead of open coding a __dynamic_array() with a fixed length (which +defeats the purpose of the dynamic array in the first place). Use the new +__vstring() helper that will use a va_list and only write enough of the +string into the ring buffer that is needed. + +Link: https://lkml.kernel.org/r/20220724191650.236b1355@rorschach.local.home + +Cc: Marek Lindner +Cc: Ingo Molnar +Cc: Andrew Morton +Cc: Simon Wunderlich +Cc: Antonio Quartulli +Cc: "David S. 
Miller" +Cc: Eric Dumazet +Cc: Jakub Kicinski +Cc: Paolo Abeni +Cc: b.a.t.m.a.n@lists.open-mesh.org +Cc: netdev@vger.kernel.org +Acked-by: Sven Eckelmann +Signed-off-by: Steven Rostedt (Google) +Signed-off-by: Sasha Levin +--- + net/batman-adv/trace.h | 9 ++------- + 1 file changed, 2 insertions(+), 7 deletions(-) + +diff --git a/net/batman-adv/trace.h b/net/batman-adv/trace.h +index d673ebdd0426..31c8f922651d 100644 +--- a/net/batman-adv/trace.h ++++ b/net/batman-adv/trace.h +@@ -28,8 +28,6 @@ + + #endif /* CONFIG_BATMAN_ADV_TRACING */ + +-#define BATADV_MAX_MSG_LEN 256 +- + TRACE_EVENT(batadv_dbg, + + TP_PROTO(struct batadv_priv *bat_priv, +@@ -40,16 +38,13 @@ TRACE_EVENT(batadv_dbg, + TP_STRUCT__entry( + __string(device, bat_priv->soft_iface->name) + __string(driver, KBUILD_MODNAME) +- __dynamic_array(char, msg, BATADV_MAX_MSG_LEN) ++ __vstring(msg, vaf->fmt, vaf->va) + ), + + TP_fast_assign( + __assign_str(device, bat_priv->soft_iface->name); + __assign_str(driver, KBUILD_MODNAME); +- WARN_ON_ONCE(vsnprintf(__get_dynamic_array(msg), +- BATADV_MAX_MSG_LEN, +- vaf->fmt, +- *vaf->va) >= BATADV_MAX_MSG_LEN); ++ __assign_vstr(msg, vaf->fmt, vaf->va); + ), + + TP_printk( +-- +2.35.1 + diff --git a/queue-5.18/block-add-a-bdev_max_zone_append_sectors-helper.patch b/queue-5.18/block-add-a-bdev_max_zone_append_sectors-helper.patch new file mode 100644 index 00000000000..3a8ba25e558 --- /dev/null +++ b/queue-5.18/block-add-a-bdev_max_zone_append_sectors-helper.patch @@ -0,0 +1,80 @@ +From dd6495e668dadb91e946e5f118bc7f72cc8bed13 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 15 Apr 2022 06:52:46 +0200 +Subject: block: add a bdev_max_zone_append_sectors helper + +From: Christoph Hellwig + +[ Upstream commit 2aba0d19f4d8c8929b4b3b94a9cfde2aa20e6ee2 ] + +Add a helper to check the max supported sectors for zone append based on +the block_device instead of having to poke into the block layer internal +request_queue. + +Signed-off-by: Christoph Hellwig +Acked-by: Damien Le Moal +Reviewed-by: Martin K. 
Petersen +Reviewed-by: Johannes Thumshirn +Reviewed-by: Chaitanya Kulkarni +Link: https://lore.kernel.org/r/20220415045258.199825-16-hch@lst.de +Signed-off-by: Jens Axboe +Signed-off-by: Sasha Levin +--- + drivers/nvme/target/zns.c | 3 +-- + fs/zonefs/super.c | 3 +-- + include/linux/blkdev.h | 6 ++++++ + 3 files changed, 8 insertions(+), 4 deletions(-) + +diff --git a/drivers/nvme/target/zns.c b/drivers/nvme/target/zns.c +index e34718b09550..82b61acf7a72 100644 +--- a/drivers/nvme/target/zns.c ++++ b/drivers/nvme/target/zns.c +@@ -34,8 +34,7 @@ static int validate_conv_zones_cb(struct blk_zone *z, + + bool nvmet_bdev_zns_enable(struct nvmet_ns *ns) + { +- struct request_queue *q = ns->bdev->bd_disk->queue; +- u8 zasl = nvmet_zasl(queue_max_zone_append_sectors(q)); ++ u8 zasl = nvmet_zasl(bdev_max_zone_append_sectors(ns->bdev)); + struct gendisk *bd_disk = ns->bdev->bd_disk; + int ret; + +diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c +index 15a4c7c07a3b..b68798a572fc 100644 +--- a/fs/zonefs/super.c ++++ b/fs/zonefs/super.c +@@ -723,13 +723,12 @@ static ssize_t zonefs_file_dio_append(struct kiocb *iocb, struct iov_iter *from) + struct inode *inode = file_inode(iocb->ki_filp); + struct zonefs_inode_info *zi = ZONEFS_I(inode); + struct block_device *bdev = inode->i_sb->s_bdev; +- unsigned int max; ++ unsigned int max = bdev_max_zone_append_sectors(bdev); + struct bio *bio; + ssize_t size; + int nr_pages; + ssize_t ret; + +- max = queue_max_zone_append_sectors(bdev_get_queue(bdev)); + max = ALIGN_DOWN(max << SECTOR_SHIFT, inode->i_sb->s_blocksize); + iov_iter_truncate(from, max); + +diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h +index cc6b24a5098f..34f2b88dfd6e 100644 +--- a/include/linux/blkdev.h ++++ b/include/linux/blkdev.h +@@ -1190,6 +1190,12 @@ static inline unsigned int queue_max_zone_append_sectors(const struct request_qu + return min(l->max_zone_append_sectors, l->max_sectors); + } + ++static inline unsigned int ++bdev_max_zone_append_sectors(struct block_device *bdev) ++{ ++ return queue_max_zone_append_sectors(bdev_get_queue(bdev)); ++} ++ + static inline unsigned queue_logical_block_size(const struct request_queue *q) + { + int retval = 512; +-- +2.35.1 + diff --git a/queue-5.18/block-add-bdev_max_segments-helper.patch b/queue-5.18/block-add-bdev_max_segments-helper.patch new file mode 100644 index 00000000000..420dd711ca9 --- /dev/null +++ b/queue-5.18/block-add-bdev_max_segments-helper.patch @@ -0,0 +1,40 @@ +From 065935c8b7fbf75e3eb0c7a9d9f88ff921b1c9a4 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Sat, 9 Jul 2022 08:18:38 +0900 +Subject: block: add bdev_max_segments() helper + +From: Naohiro Aota + +[ Upstream commit 65ea1b66482f415d51cd46515b02477257330339 ] + +Add bdev_max_segments() like other queue parameters. 
+ +Reviewed-by: Johannes Thumshirn +Reviewed-by: Jens Axboe +Reviewed-by: Christoph Hellwig +Signed-off-by: Naohiro Aota +Signed-off-by: David Sterba +Signed-off-by: Sasha Levin +--- + include/linux/blkdev.h | 5 +++++ + 1 file changed, 5 insertions(+) + +diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h +index 34f2b88dfd6e..7927480b9cf7 100644 +--- a/include/linux/blkdev.h ++++ b/include/linux/blkdev.h +@@ -1196,6 +1196,11 @@ bdev_max_zone_append_sectors(struct block_device *bdev) + return queue_max_zone_append_sectors(bdev_get_queue(bdev)); + } + ++static inline unsigned int bdev_max_segments(struct block_device *bdev) ++{ ++ return queue_max_segments(bdev_get_queue(bdev)); ++} ++ + static inline unsigned queue_logical_block_size(const struct request_queue *q) + { + int retval = 512; +-- +2.35.1 + diff --git a/queue-5.18/block-don-t-allow-the-same-type-rq_qos-add-more-than.patch b/queue-5.18/block-don-t-allow-the-same-type-rq_qos-add-more-than.patch new file mode 100644 index 00000000000..b578a245adf --- /dev/null +++ b/queue-5.18/block-don-t-allow-the-same-type-rq_qos-add-more-than.patch @@ -0,0 +1,199 @@ +From 9d6969d9e591d57389ab123fdc51c860fd939781 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 20 Jul 2022 17:36:16 +0800 +Subject: block: don't allow the same type rq_qos add more than once + +From: Jinke Han + +[ Upstream commit 14a6e2eb7df5c7897c15b109cba29ab0c4a791b6 ] + +In our test of iocost, we encountered some list add/del corruptions of +inner_walk list in ioc_timer_fn. + +The reason can be described as follows: + +cpu 0 cpu 1 +ioc_qos_write ioc_qos_write + +ioc = q_to_ioc(queue); +if (!ioc) { + ioc = kzalloc(); + ioc = q_to_ioc(queue); + if (!ioc) { + ioc = kzalloc(); + ... + rq_qos_add(q, rqos); + } + ... + rq_qos_add(q, rqos); + ... +} + +When the io.cost.qos file is written by two cpus concurrently, rq_qos may +be added to one disk twice. In that case, there will be two iocs enabled +and running on one disk. They own different iocgs on their active list. In +the ioc_timer_fn function, because of the iocgs from two iocs have the +same root iocg, the root iocg's walk_list may be overwritten by each other +and this leads to list add/del corruptions in building or destroying the +inner_walk list. + +And so far, the blk-rq-qos framework works in case that one instance for +one type rq_qos per queue by default. This patch make this explicit and +also fix the crash above. + +Signed-off-by: Jinke Han +Reviewed-by: Muchun Song +Acked-by: Tejun Heo +Cc: +Link: https://lore.kernel.org/r/20220720093616.70584-1-hanjinke.666@bytedance.com +Signed-off-by: Jens Axboe +Signed-off-by: Sasha Levin +--- + block/blk-iocost.c | 20 +++++++++++++------- + block/blk-iolatency.c | 18 +++++++++++------- + block/blk-rq-qos.h | 11 ++++++++++- + block/blk-wbt.c | 12 +++++++++++- + 4 files changed, 45 insertions(+), 16 deletions(-) + +diff --git a/block/blk-iocost.c b/block/blk-iocost.c +index 16705fbd0699..a19f2db4eeb2 100644 +--- a/block/blk-iocost.c ++++ b/block/blk-iocost.c +@@ -2893,15 +2893,21 @@ static int blk_iocost_init(struct request_queue *q) + * called before policy activation completion, can't assume that the + * target bio has an iocg associated and need to test for NULL iocg. 
+ */ +- rq_qos_add(q, rqos); ++ ret = rq_qos_add(q, rqos); ++ if (ret) ++ goto err_free_ioc; ++ + ret = blkcg_activate_policy(q, &blkcg_policy_iocost); +- if (ret) { +- rq_qos_del(q, rqos); +- free_percpu(ioc->pcpu_stat); +- kfree(ioc); +- return ret; +- } ++ if (ret) ++ goto err_del_qos; + return 0; ++ ++err_del_qos: ++ rq_qos_del(q, rqos); ++err_free_ioc: ++ free_percpu(ioc->pcpu_stat); ++ kfree(ioc); ++ return ret; + } + + static struct blkcg_policy_data *ioc_cpd_alloc(gfp_t gfp) +diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c +index 9568bf8dfe82..7845dca5fcfd 100644 +--- a/block/blk-iolatency.c ++++ b/block/blk-iolatency.c +@@ -773,19 +773,23 @@ int blk_iolatency_init(struct request_queue *q) + rqos->ops = &blkcg_iolatency_ops; + rqos->q = q; + +- rq_qos_add(q, rqos); +- ++ ret = rq_qos_add(q, rqos); ++ if (ret) ++ goto err_free; + ret = blkcg_activate_policy(q, &blkcg_policy_iolatency); +- if (ret) { +- rq_qos_del(q, rqos); +- kfree(blkiolat); +- return ret; +- } ++ if (ret) ++ goto err_qos_del; + + timer_setup(&blkiolat->timer, blkiolatency_timer_fn, 0); + INIT_WORK(&blkiolat->enable_work, blkiolatency_enable_work_fn); + + return 0; ++ ++err_qos_del: ++ rq_qos_del(q, rqos); ++err_free: ++ kfree(blkiolat); ++ return ret; + } + + static void iolatency_set_min_lat_nsec(struct blkcg_gq *blkg, u64 val) +diff --git a/block/blk-rq-qos.h b/block/blk-rq-qos.h +index 0e46052b018a..08b856570ad1 100644 +--- a/block/blk-rq-qos.h ++++ b/block/blk-rq-qos.h +@@ -86,7 +86,7 @@ static inline void rq_wait_init(struct rq_wait *rq_wait) + init_waitqueue_head(&rq_wait->wait); + } + +-static inline void rq_qos_add(struct request_queue *q, struct rq_qos *rqos) ++static inline int rq_qos_add(struct request_queue *q, struct rq_qos *rqos) + { + /* + * No IO can be in-flight when adding rqos, so freeze queue, which +@@ -98,6 +98,8 @@ static inline void rq_qos_add(struct request_queue *q, struct rq_qos *rqos) + blk_mq_freeze_queue(q); + + spin_lock_irq(&q->queue_lock); ++ if (rq_qos_id(q, rqos->id)) ++ goto ebusy; + rqos->next = q->rq_qos; + q->rq_qos = rqos; + spin_unlock_irq(&q->queue_lock); +@@ -109,6 +111,13 @@ static inline void rq_qos_add(struct request_queue *q, struct rq_qos *rqos) + blk_mq_debugfs_register_rqos(rqos); + mutex_unlock(&q->debugfs_mutex); + } ++ ++ return 0; ++ebusy: ++ spin_unlock_irq(&q->queue_lock); ++ blk_mq_unfreeze_queue(q); ++ return -EBUSY; ++ + } + + static inline void rq_qos_del(struct request_queue *q, struct rq_qos *rqos) +diff --git a/block/blk-wbt.c b/block/blk-wbt.c +index 0c119be0e813..ae6ea0b54579 100644 +--- a/block/blk-wbt.c ++++ b/block/blk-wbt.c +@@ -820,6 +820,7 @@ int wbt_init(struct request_queue *q) + { + struct rq_wb *rwb; + int i; ++ int ret; + + rwb = kzalloc(sizeof(*rwb), GFP_KERNEL); + if (!rwb) +@@ -846,7 +847,10 @@ int wbt_init(struct request_queue *q) + /* + * Assign rwb and add the stats callback. 
+ */ +- rq_qos_add(q, &rwb->rqos); ++ ret = rq_qos_add(q, &rwb->rqos); ++ if (ret) ++ goto err_free; ++ + blk_stat_add_callback(q, rwb->cb); + + rwb->min_lat_nsec = wbt_default_latency_nsec(q); +@@ -855,4 +859,10 @@ int wbt_init(struct request_queue *q) + wbt_set_write_cache(q, test_bit(QUEUE_FLAG_WC, &q->queue_flags)); + + return 0; ++ ++err_free: ++ blk_stat_free_callback(rwb->cb); ++ kfree(rwb); ++ return ret; ++ + } +-- +2.35.1 + diff --git a/queue-5.18/block-serialize-all-debugfs-operations-using-q-debug.patch b/queue-5.18/block-serialize-all-debugfs-operations-using-q-debug.patch new file mode 100644 index 00000000000..49641b45e25 --- /dev/null +++ b/queue-5.18/block-serialize-all-debugfs-operations-using-q-debug.patch @@ -0,0 +1,334 @@ +From de8ba1b6410a2cffe26c1f98058fd39d39dc0e62 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 14 Jun 2022 09:48:25 +0200 +Subject: block: serialize all debugfs operations using q->debugfs_mutex + +From: Christoph Hellwig + +[ Upstream commit 5cf9c91ba927119fc6606b938b1895bb2459d3bc ] + +Various places like I/O schedulers or the QOS infrastructure try to +register debugfs files on demans, which can race with creating and +removing the main queue debugfs directory. Use the existing +debugfs_mutex to serialize all debugfs operations that rely on +q->debugfs_dir or the directories hanging off it. + +To make the teardown code a little simpler declare all debugfs dentry +pointers and not just the main one uncoditionally in blkdev.h. + +Move debugfs_mutex next to the dentries that it protects and document +what it is used for. + +Signed-off-by: Christoph Hellwig +Link: https://lore.kernel.org/r/20220614074827.458955-3-hch@lst.de +Signed-off-by: Jens Axboe +Signed-off-by: Sasha Levin +--- + block/blk-mq-debugfs.c | 25 ++++++++++++++++++++----- + block/blk-mq-debugfs.h | 5 ----- + block/blk-mq-sched.c | 11 +++++++++++ + block/blk-rq-qos.c | 2 ++ + block/blk-rq-qos.h | 7 ++++++- + block/blk-sysfs.c | 20 +++++++++----------- + include/linux/blkdev.h | 8 ++++---- + kernel/trace/blktrace.c | 3 --- + 8 files changed, 52 insertions(+), 29 deletions(-) + +diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c +index 34bee263936c..d491b6eb0ab9 100644 +--- a/block/blk-mq-debugfs.c ++++ b/block/blk-mq-debugfs.c +@@ -713,11 +713,6 @@ void blk_mq_debugfs_register(struct request_queue *q) + } + } + +-void blk_mq_debugfs_unregister(struct request_queue *q) +-{ +- q->sched_debugfs_dir = NULL; +-} +- + static void blk_mq_debugfs_register_ctx(struct blk_mq_hw_ctx *hctx, + struct blk_mq_ctx *ctx) + { +@@ -751,6 +746,8 @@ void blk_mq_debugfs_register_hctx(struct request_queue *q, + + void blk_mq_debugfs_unregister_hctx(struct blk_mq_hw_ctx *hctx) + { ++ if (!hctx->queue->debugfs_dir) ++ return; + debugfs_remove_recursive(hctx->debugfs_dir); + hctx->sched_debugfs_dir = NULL; + hctx->debugfs_dir = NULL; +@@ -778,6 +775,8 @@ void blk_mq_debugfs_register_sched(struct request_queue *q) + { + struct elevator_type *e = q->elevator->type; + ++ lockdep_assert_held(&q->debugfs_mutex); ++ + /* + * If the parent directory has not been created yet, return, we will be + * called again later on and the directory/files will be created then. 
+@@ -795,6 +794,8 @@ void blk_mq_debugfs_register_sched(struct request_queue *q) + + void blk_mq_debugfs_unregister_sched(struct request_queue *q) + { ++ lockdep_assert_held(&q->debugfs_mutex); ++ + debugfs_remove_recursive(q->sched_debugfs_dir); + q->sched_debugfs_dir = NULL; + } +@@ -816,6 +817,10 @@ static const char *rq_qos_id_to_name(enum rq_qos_id id) + + void blk_mq_debugfs_unregister_rqos(struct rq_qos *rqos) + { ++ lockdep_assert_held(&rqos->q->debugfs_mutex); ++ ++ if (!rqos->q->debugfs_dir) ++ return; + debugfs_remove_recursive(rqos->debugfs_dir); + rqos->debugfs_dir = NULL; + } +@@ -825,6 +830,8 @@ void blk_mq_debugfs_register_rqos(struct rq_qos *rqos) + struct request_queue *q = rqos->q; + const char *dir_name = rq_qos_id_to_name(rqos->id); + ++ lockdep_assert_held(&q->debugfs_mutex); ++ + if (rqos->debugfs_dir || !rqos->ops->debugfs_attrs) + return; + +@@ -840,6 +847,8 @@ void blk_mq_debugfs_register_rqos(struct rq_qos *rqos) + + void blk_mq_debugfs_unregister_queue_rqos(struct request_queue *q) + { ++ lockdep_assert_held(&q->debugfs_mutex); ++ + debugfs_remove_recursive(q->rqos_debugfs_dir); + q->rqos_debugfs_dir = NULL; + } +@@ -849,6 +858,8 @@ void blk_mq_debugfs_register_sched_hctx(struct request_queue *q, + { + struct elevator_type *e = q->elevator->type; + ++ lockdep_assert_held(&q->debugfs_mutex); ++ + /* + * If the parent debugfs directory has not been created yet, return; + * We will be called again later on with appropriate parent debugfs +@@ -868,6 +879,10 @@ void blk_mq_debugfs_register_sched_hctx(struct request_queue *q, + + void blk_mq_debugfs_unregister_sched_hctx(struct blk_mq_hw_ctx *hctx) + { ++ lockdep_assert_held(&hctx->queue->debugfs_mutex); ++ ++ if (!hctx->queue->debugfs_dir) ++ return; + debugfs_remove_recursive(hctx->sched_debugfs_dir); + hctx->sched_debugfs_dir = NULL; + } +diff --git a/block/blk-mq-debugfs.h b/block/blk-mq-debugfs.h +index 69918f4170d6..771d45832878 100644 +--- a/block/blk-mq-debugfs.h ++++ b/block/blk-mq-debugfs.h +@@ -21,7 +21,6 @@ int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq); + int blk_mq_debugfs_rq_show(struct seq_file *m, void *v); + + void blk_mq_debugfs_register(struct request_queue *q); +-void blk_mq_debugfs_unregister(struct request_queue *q); + void blk_mq_debugfs_register_hctx(struct request_queue *q, + struct blk_mq_hw_ctx *hctx); + void blk_mq_debugfs_unregister_hctx(struct blk_mq_hw_ctx *hctx); +@@ -42,10 +41,6 @@ static inline void blk_mq_debugfs_register(struct request_queue *q) + { + } + +-static inline void blk_mq_debugfs_unregister(struct request_queue *q) +-{ +-} +- + static inline void blk_mq_debugfs_register_hctx(struct request_queue *q, + struct blk_mq_hw_ctx *hctx) + { +diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c +index 9e56a69422b6..e84bec39fd3a 100644 +--- a/block/blk-mq-sched.c ++++ b/block/blk-mq-sched.c +@@ -593,7 +593,9 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e) + if (ret) + goto err_free_map_and_rqs; + ++ mutex_lock(&q->debugfs_mutex); + blk_mq_debugfs_register_sched(q); ++ mutex_unlock(&q->debugfs_mutex); + + queue_for_each_hw_ctx(q, hctx, i) { + if (e->ops.init_hctx) { +@@ -606,7 +608,9 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e) + return ret; + } + } ++ mutex_lock(&q->debugfs_mutex); + blk_mq_debugfs_register_sched_hctx(q, hctx); ++ mutex_unlock(&q->debugfs_mutex); + } + + return 0; +@@ -647,14 +651,21 @@ void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e) + unsigned int flags = 
0; + + queue_for_each_hw_ctx(q, hctx, i) { ++ mutex_lock(&q->debugfs_mutex); + blk_mq_debugfs_unregister_sched_hctx(hctx); ++ mutex_unlock(&q->debugfs_mutex); ++ + if (e->type->ops.exit_hctx && hctx->sched_data) { + e->type->ops.exit_hctx(hctx, i); + hctx->sched_data = NULL; + } + flags = hctx->flags; + } ++ ++ mutex_lock(&q->debugfs_mutex); + blk_mq_debugfs_unregister_sched(q); ++ mutex_unlock(&q->debugfs_mutex); ++ + if (e->type->ops.exit_sched) + e->type->ops.exit_sched(e); + blk_mq_sched_tags_teardown(q, flags); +diff --git a/block/blk-rq-qos.c b/block/blk-rq-qos.c +index e83af7bc7591..249a6f05dd3b 100644 +--- a/block/blk-rq-qos.c ++++ b/block/blk-rq-qos.c +@@ -294,7 +294,9 @@ void rq_qos_wait(struct rq_wait *rqw, void *private_data, + + void rq_qos_exit(struct request_queue *q) + { ++ mutex_lock(&q->debugfs_mutex); + blk_mq_debugfs_unregister_queue_rqos(q); ++ mutex_unlock(&q->debugfs_mutex); + + while (q->rq_qos) { + struct rq_qos *rqos = q->rq_qos; +diff --git a/block/blk-rq-qos.h b/block/blk-rq-qos.h +index 68267007da1c..0e46052b018a 100644 +--- a/block/blk-rq-qos.h ++++ b/block/blk-rq-qos.h +@@ -104,8 +104,11 @@ static inline void rq_qos_add(struct request_queue *q, struct rq_qos *rqos) + + blk_mq_unfreeze_queue(q); + +- if (rqos->ops->debugfs_attrs) ++ if (rqos->ops->debugfs_attrs) { ++ mutex_lock(&q->debugfs_mutex); + blk_mq_debugfs_register_rqos(rqos); ++ mutex_unlock(&q->debugfs_mutex); ++ } + } + + static inline void rq_qos_del(struct request_queue *q, struct rq_qos *rqos) +@@ -129,7 +132,9 @@ static inline void rq_qos_del(struct request_queue *q, struct rq_qos *rqos) + + blk_mq_unfreeze_queue(q); + ++ mutex_lock(&q->debugfs_mutex); + blk_mq_debugfs_unregister_rqos(rqos); ++ mutex_unlock(&q->debugfs_mutex); + } + + typedef bool (acquire_inflight_cb_t)(struct rq_wait *rqw, void *private_data); +diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c +index 88bd41d4cb59..6e4801b217a7 100644 +--- a/block/blk-sysfs.c ++++ b/block/blk-sysfs.c +@@ -779,14 +779,13 @@ static void blk_release_queue(struct kobject *kobj) + if (queue_is_mq(q)) + blk_mq_release(q); + +- blk_trace_shutdown(q); + mutex_lock(&q->debugfs_mutex); ++ blk_trace_shutdown(q); + debugfs_remove_recursive(q->debugfs_dir); ++ q->debugfs_dir = NULL; ++ q->sched_debugfs_dir = NULL; + mutex_unlock(&q->debugfs_mutex); + +- if (queue_is_mq(q)) +- blk_mq_debugfs_unregister(q); +- + bioset_exit(&q->bio_split); + + if (blk_queue_has_srcu(q)) +@@ -836,17 +835,16 @@ int blk_register_queue(struct gendisk *disk) + goto unlock; + } + ++ if (queue_is_mq(q)) ++ __blk_mq_register_dev(dev, q); ++ mutex_lock(&q->sysfs_lock); ++ + mutex_lock(&q->debugfs_mutex); + q->debugfs_dir = debugfs_create_dir(kobject_name(q->kobj.parent), + blk_debugfs_root); +- mutex_unlock(&q->debugfs_mutex); +- +- if (queue_is_mq(q)) { +- __blk_mq_register_dev(dev, q); ++ if (queue_is_mq(q)) + blk_mq_debugfs_register(q); +- } +- +- mutex_lock(&q->sysfs_lock); ++ mutex_unlock(&q->debugfs_mutex); + + ret = disk_register_independent_access_ranges(disk, NULL); + if (ret) +diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h +index 108e3d114bfc..cc6b24a5098f 100644 +--- a/include/linux/blkdev.h ++++ b/include/linux/blkdev.h +@@ -466,7 +466,6 @@ struct request_queue { + #endif /* CONFIG_BLK_DEV_ZONED */ + + int node; +- struct mutex debugfs_mutex; + #ifdef CONFIG_BLK_DEV_IO_TRACE + struct blk_trace __rcu *blk_trace; + #endif +@@ -510,11 +509,12 @@ struct request_queue { + struct bio_set bio_split; + + struct dentry *debugfs_dir; +- +-#ifdef CONFIG_BLK_DEBUG_FS + 
struct dentry *sched_debugfs_dir; + struct dentry *rqos_debugfs_dir; +-#endif ++ /* ++ * Serializes all debugfs metadata operations using the above dentries. ++ */ ++ struct mutex debugfs_mutex; + + bool mq_sysfs_init_done; + +diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c +index f22219495541..f0500b5cfefe 100644 +--- a/kernel/trace/blktrace.c ++++ b/kernel/trace/blktrace.c +@@ -770,14 +770,11 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) + **/ + void blk_trace_shutdown(struct request_queue *q) + { +- mutex_lock(&q->debugfs_mutex); + if (rcu_dereference_protected(q->blk_trace, + lockdep_is_held(&q->debugfs_mutex))) { + __blk_trace_startstop(q, 0); + __blk_trace_remove(q); + } +- +- mutex_unlock(&q->debugfs_mutex); + } + + #ifdef CONFIG_BLK_CGROUP +-- +2.35.1 + diff --git a/queue-5.18/btrfs-ensure-pages-are-unlocked-on-cow_file_range-fa.patch b/queue-5.18/btrfs-ensure-pages-are-unlocked-on-cow_file_range-fa.patch new file mode 100644 index 00000000000..314b37a7e26 --- /dev/null +++ b/queue-5.18/btrfs-ensure-pages-are-unlocked-on-cow_file_range-fa.patch @@ -0,0 +1,196 @@ +From e082d91f540e4ca12f8ed8aad09d7fc71a7d45de Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 21 Jun 2022 15:40:59 +0900 +Subject: btrfs: ensure pages are unlocked on cow_file_range() failure + +From: Naohiro Aota + +[ Upstream commit 9ce7466f372d83054c7494f6b3e4b9abaf3f0355 ] + +There is a hung_task report on zoned btrfs like below. + +https://github.com/naota/linux/issues/59 + + [726.328648] INFO: task rocksdb:high0:11085 blocked for more than 241 seconds. + [726.329839] Not tainted 5.16.0-rc1+ #1 + [726.330484] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. + [726.331603] task:rocksdb:high0 state:D stack: 0 pid:11085 ppid: 11082 flags:0x00000000 + [726.331608] Call Trace: + [726.331611] + [726.331614] __schedule+0x2e5/0x9d0 + [726.331622] schedule+0x58/0xd0 + [726.331626] io_schedule+0x3f/0x70 + [726.331629] __folio_lock+0x125/0x200 + [726.331634] ? find_get_entries+0x1bc/0x240 + [726.331638] ? filemap_invalidate_unlock_two+0x40/0x40 + [726.331642] truncate_inode_pages_range+0x5b2/0x770 + [726.331649] truncate_inode_pages_final+0x44/0x50 + [726.331653] btrfs_evict_inode+0x67/0x480 + [726.331658] evict+0xd0/0x180 + [726.331661] iput+0x13f/0x200 + [726.331664] do_unlinkat+0x1c0/0x2b0 + [726.331668] __x64_sys_unlink+0x23/0x30 + [726.331670] do_syscall_64+0x3b/0xc0 + [726.331674] entry_SYSCALL_64_after_hwframe+0x44/0xae + [726.331677] RIP: 0033:0x7fb9490a171b + [726.331681] RSP: 002b:00007fb943ffac68 EFLAGS: 00000246 ORIG_RAX: 0000000000000057 + [726.331684] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007fb9490a171b + [726.331686] RDX: 00007fb943ffb040 RSI: 000055a6bbe6ec20 RDI: 00007fb94400d300 + [726.331687] RBP: 00007fb943ffad00 R08: 0000000000000000 R09: 0000000000000000 + [726.331688] R10: 0000000000000031 R11: 0000000000000246 R12: 00007fb943ffb000 + [726.331690] R13: 00007fb943ffb040 R14: 0000000000000000 R15: 00007fb943ffd260 + [726.331693] + +While we debug the issue, we found running fstests generic/551 on 5GB +non-zoned null_blk device in the emulated zoned mode also had a +similar hung issue. + +Also, we can reproduce the same symptom with an error injected +cow_file_range() setup. + +The hang occurs when cow_file_range() fails in the middle of +allocation. cow_file_range() called from do_allocation_zoned() can +split the give region ([start, end]) for allocation depending on +current block group usages. 
When btrfs can allocate bytes for one part +of the split regions but fails for the other region (e.g. because of +-ENOSPC), we return the error leaving the pages in the succeeded regions +locked. Technically, this occurs only when @unlock == 0. Otherwise, we +unlock the pages in an allocated region after creating an ordered +extent. + +Considering the callers of cow_file_range(unlock=0) won't write out +the pages, we can unlock the pages on error exit from +cow_file_range(). So, we can ensure all the pages except @locked_page +are unlocked on error case. + +In summary, cow_file_range now behaves like this: + +- page_started == 1 (return value) + - All the pages are unlocked. IO is started. +- unlock == 1 + - All the pages except @locked_page are unlocked in any case +- unlock == 0 + - On success, all the pages are locked for writing out them + - On failure, all the pages except @locked_page are unlocked + +Fixes: 42c011000963 ("btrfs: zoned: introduce dedicated data write path for zoned filesystems") +CC: stable@vger.kernel.org # 5.12+ +Reviewed-by: Filipe Manana +Signed-off-by: Naohiro Aota +Signed-off-by: David Sterba +Signed-off-by: Sasha Levin +--- + fs/btrfs/inode.c | 72 ++++++++++++++++++++++++++++++++++++++++++------ + 1 file changed, 64 insertions(+), 8 deletions(-) + +diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c +index 5d15e374d032..54afa9e538c5 100644 +--- a/fs/btrfs/inode.c ++++ b/fs/btrfs/inode.c +@@ -1097,6 +1097,28 @@ static u64 get_extent_allocation_hint(struct btrfs_inode *inode, u64 start, + * *page_started is set to one if we unlock locked_page and do everything + * required to start IO on it. It may be clean and already done with + * IO when we return. ++ * ++ * When unlock == 1, we unlock the pages in successfully allocated regions. ++ * When unlock == 0, we leave them locked for writing them out. ++ * ++ * However, we unlock all the pages except @locked_page in case of failure. ++ * ++ * In summary, page locking state will be as follow: ++ * ++ * - page_started == 1 (return value) ++ * - All the pages are unlocked. IO is started. ++ * - Note that this can happen only on success ++ * - unlock == 1 ++ * - All the pages except @locked_page are unlocked in any case ++ * - unlock == 0 ++ * - On success, all the pages are locked for writing out them ++ * - On failure, all the pages except @locked_page are unlocked ++ * ++ * When a failure happens in the second or later iteration of the ++ * while-loop, the ordered extents created in previous iterations are kept ++ * intact. So, the caller must clean them up by calling ++ * btrfs_cleanup_ordered_extents(). See btrfs_run_delalloc_range() for ++ * example. + */ + static noinline int cow_file_range(struct btrfs_inode *inode, + struct page *locked_page, +@@ -1106,6 +1128,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode, + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; + u64 alloc_hint = 0; ++ u64 orig_start = start; + u64 num_bytes; + unsigned long ram_size; + u64 cur_alloc_size = 0; +@@ -1293,18 +1316,44 @@ static noinline int cow_file_range(struct btrfs_inode *inode, + btrfs_dec_block_group_reservations(fs_info, ins.objectid); + btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1); + out_unlock: ++ /* ++ * Now, we have three regions to clean up: ++ * ++ * |-------(1)----|---(2)---|-------------(3)----------| ++ * `- orig_start `- start `- start + cur_alloc_size `- end ++ * ++ * We process each region below. 
++ */ ++ + clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | + EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV; + page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK; ++ + /* +- * If we reserved an extent for our delalloc range (or a subrange) and +- * failed to create the respective ordered extent, then it means that +- * when we reserved the extent we decremented the extent's size from +- * the data space_info's bytes_may_use counter and incremented the +- * space_info's bytes_reserved counter by the same amount. We must make +- * sure extent_clear_unlock_delalloc() does not try to decrement again +- * the data space_info's bytes_may_use counter, therefore we do not pass +- * it the flag EXTENT_CLEAR_DATA_RESV. ++ * For the range (1). We have already instantiated the ordered extents ++ * for this region. They are cleaned up by ++ * btrfs_cleanup_ordered_extents() in e.g, ++ * btrfs_run_delalloc_range(). EXTENT_LOCKED | EXTENT_DELALLOC are ++ * already cleared in the above loop. And, EXTENT_DELALLOC_NEW | ++ * EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV are handled by the cleanup ++ * function. ++ * ++ * However, in case of unlock == 0, we still need to unlock the pages ++ * (except @locked_page) to ensure all the pages are unlocked. ++ */ ++ if (!unlock && orig_start < start) ++ extent_clear_unlock_delalloc(inode, orig_start, start - 1, ++ locked_page, 0, page_ops); ++ ++ /* ++ * For the range (2). If we reserved an extent for our delalloc range ++ * (or a subrange) and failed to create the respective ordered extent, ++ * then it means that when we reserved the extent we decremented the ++ * extent's size from the data space_info's bytes_may_use counter and ++ * incremented the space_info's bytes_reserved counter by the same ++ * amount. We must make sure extent_clear_unlock_delalloc() does not try ++ * to decrement again the data space_info's bytes_may_use counter, ++ * therefore we do not pass it the flag EXTENT_CLEAR_DATA_RESV. + */ + if (extent_reserved) { + extent_clear_unlock_delalloc(inode, start, +@@ -1316,6 +1365,13 @@ static noinline int cow_file_range(struct btrfs_inode *inode, + if (start >= end) + goto out; + } ++ ++ /* ++ * For the range (3). We never touched the region. In addition to the ++ * clear_bits above, we add EXTENT_CLEAR_DATA_RESV to release the data ++ * space_info's bytes_may_use counter, reserved in ++ * btrfs_check_data_free_space(). ++ */ + extent_clear_unlock_delalloc(inode, start, end, locked_page, + clear_bits | EXTENT_CLEAR_DATA_RESV, + page_ops); +-- +2.35.1 + diff --git a/queue-5.18/btrfs-fix-error-handling-of-fallback-uncompress-writ.patch b/queue-5.18/btrfs-fix-error-handling-of-fallback-uncompress-writ.patch new file mode 100644 index 00000000000..02c4c2e462f --- /dev/null +++ b/queue-5.18/btrfs-fix-error-handling-of-fallback-uncompress-writ.patch @@ -0,0 +1,72 @@ +From db0a5d9ef124f104269150d76fb2bcbc29e5293a Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 21 Jun 2022 15:41:01 +0900 +Subject: btrfs: fix error handling of fallback uncompress write + +From: Naohiro Aota + +[ Upstream commit 71aa147b4d9d81fa65afa6016f50d7818b64a54f ] + +When cow_file_range() fails in the middle of the allocation loop, it +unlocks the pages but leaves the ordered extents intact. Thus, we need +to call btrfs_cleanup_ordered_extents() to finish the created ordered +extents. + +Also, we need to call end_extent_writepage() if locked_page is available +because btrfs_cleanup_ordered_extents() never processes the region on +the locked_page. 
+ +Furthermore, we need to set the mapping as error if locked_page is +unavailable before unlocking the pages, so that the errno is properly +propagated to the user space. + +CC: stable@vger.kernel.org # 5.18+ +Reviewed-by: Filipe Manana +Signed-off-by: Naohiro Aota +Signed-off-by: David Sterba +Signed-off-by: Sasha Levin +--- + fs/btrfs/inode.c | 17 +++++++++++++++-- + 1 file changed, 15 insertions(+), 2 deletions(-) + +diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c +index 54afa9e538c5..1e404476fe6a 100644 +--- a/fs/btrfs/inode.c ++++ b/fs/btrfs/inode.c +@@ -891,8 +891,18 @@ static int submit_uncompressed_range(struct btrfs_inode *inode, + goto out; + } + if (ret < 0) { +- if (locked_page) ++ btrfs_cleanup_ordered_extents(inode, locked_page, start, end - start + 1); ++ if (locked_page) { ++ const u64 page_start = page_offset(locked_page); ++ const u64 page_end = page_start + PAGE_SIZE - 1; ++ ++ btrfs_page_set_error(inode->root->fs_info, locked_page, ++ page_start, PAGE_SIZE); ++ set_page_writeback(locked_page); ++ end_page_writeback(locked_page); ++ end_extent_writepage(locked_page, ret, page_start, page_end); + unlock_page(locked_page); ++ } + goto out; + } + +@@ -1341,9 +1351,12 @@ static noinline int cow_file_range(struct btrfs_inode *inode, + * However, in case of unlock == 0, we still need to unlock the pages + * (except @locked_page) to ensure all the pages are unlocked. + */ +- if (!unlock && orig_start < start) ++ if (!unlock && orig_start < start) { ++ if (!locked_page) ++ mapping_set_error(inode->vfs_inode.i_mapping, ret); + extent_clear_unlock_delalloc(inode, orig_start, start - 1, + locked_page, 0, page_ops); ++ } + + /* + * For the range (2). If we reserved an extent for our delalloc range +-- +2.35.1 + diff --git a/queue-5.18/btrfs-let-can_allocate_chunk-return-error.patch b/queue-5.18/btrfs-let-can_allocate_chunk-return-error.patch new file mode 100644 index 00000000000..4e6d94d8fbd --- /dev/null +++ b/queue-5.18/btrfs-let-can_allocate_chunk-return-error.patch @@ -0,0 +1,66 @@ +From d95e54fdfd9bad3d0327ac42599359dc8c90ef75 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Sat, 9 Jul 2022 08:18:43 +0900 +Subject: btrfs: let can_allocate_chunk return error + +From: Naohiro Aota + +[ Upstream commit bb9950d3df7169a673c594d38fb74e241ed4fb2a ] + +For the later patch, convert the return type from bool to int and return +errors. No functional changes. 
+ +Reviewed-by: Johannes Thumshirn +Signed-off-by: Naohiro Aota +Signed-off-by: David Sterba +Signed-off-by: Sasha Levin +--- + fs/btrfs/extent-tree.c | 15 ++++++++------- + 1 file changed, 8 insertions(+), 7 deletions(-) + +diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c +index f45ecd939a2c..8bdcbc0c6d60 100644 +--- a/fs/btrfs/extent-tree.c ++++ b/fs/btrfs/extent-tree.c +@@ -3985,12 +3985,12 @@ static void found_extent(struct find_free_extent_ctl *ffe_ctl, + } + } + +-static bool can_allocate_chunk(struct btrfs_fs_info *fs_info, +- struct find_free_extent_ctl *ffe_ctl) ++static int can_allocate_chunk(struct btrfs_fs_info *fs_info, ++ struct find_free_extent_ctl *ffe_ctl) + { + switch (ffe_ctl->policy) { + case BTRFS_EXTENT_ALLOC_CLUSTERED: +- return true; ++ return 0; + case BTRFS_EXTENT_ALLOC_ZONED: + /* + * If we have enough free space left in an already +@@ -4000,8 +4000,8 @@ static bool can_allocate_chunk(struct btrfs_fs_info *fs_info, + */ + if (ffe_ctl->max_extent_size >= ffe_ctl->min_alloc_size && + !btrfs_can_activate_zone(fs_info->fs_devices, ffe_ctl->flags)) +- return false; +- return true; ++ return -ENOSPC; ++ return 0; + default: + BUG(); + } +@@ -4083,8 +4083,9 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, + int exist = 0; + + /*Check if allocation policy allows to create a new chunk */ +- if (!can_allocate_chunk(fs_info, ffe_ctl)) +- return -ENOSPC; ++ ret = can_allocate_chunk(fs_info, ffe_ctl); ++ if (ret) ++ return ret; + + trans = current->journal_info; + if (trans) +-- +2.35.1 + diff --git a/queue-5.18/btrfs-make-the-bg_reclaim_threshold-per-space-info.patch b/queue-5.18/btrfs-make-the-bg_reclaim_threshold-per-space-info.patch new file mode 100644 index 00000000000..de2ee52b6a4 --- /dev/null +++ b/queue-5.18/btrfs-make-the-bg_reclaim_threshold-per-space-info.patch @@ -0,0 +1,176 @@ +From e4ae7bab98014a3d07e59efe09a829cb15cb518f Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 29 Mar 2022 01:56:06 -0700 +Subject: btrfs: make the bg_reclaim_threshold per-space info + +From: Josef Bacik + +[ Upstream commit bb5a098d9791f184899499531ff4411089e2a5e0 ] + +For non-zoned file systems it's useful to have the auto reclaim feature, +however there are different use cases for non-zoned, for example we may +not want to reclaim metadata chunks ever, only data chunks. Move this +sysfs flag to per-space_info. This won't affect current users because +this tunable only ever did anything for zoned, and that is currently +hidden behind BTRFS_CONFIG_DEBUG. 
+ +Tested-by: Pankaj Raghav +Signed-off-by: Josef Bacik +[ jth restore global bg_reclaim_threshold ] +Signed-off-by: Johannes Thumshirn +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Sasha Levin +--- + fs/btrfs/free-space-cache.c | 7 +++++-- + fs/btrfs/space-info.c | 9 +++++++++ + fs/btrfs/space-info.h | 6 ++++++ + fs/btrfs/sysfs.c | 37 +++++++++++++++++++++++++++++++++++++ + fs/btrfs/zoned.h | 6 +----- + 5 files changed, 58 insertions(+), 7 deletions(-) + +diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c +index 01a408db5683..ef84bc5030cd 100644 +--- a/fs/btrfs/free-space-cache.c ++++ b/fs/btrfs/free-space-cache.c +@@ -2630,16 +2630,19 @@ int __btrfs_add_free_space(struct btrfs_block_group *block_group, + static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group, + u64 bytenr, u64 size, bool used) + { +- struct btrfs_fs_info *fs_info = block_group->fs_info; ++ struct btrfs_space_info *sinfo = block_group->space_info; + struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; + u64 offset = bytenr - block_group->start; + u64 to_free, to_unusable; +- const int bg_reclaim_threshold = READ_ONCE(fs_info->bg_reclaim_threshold); ++ int bg_reclaim_threshold = 0; + bool initial = (size == block_group->length); + u64 reclaimable_unusable; + + WARN_ON(!initial && offset + size > block_group->zone_capacity); + ++ if (!initial) ++ bg_reclaim_threshold = READ_ONCE(sinfo->bg_reclaim_threshold); ++ + spin_lock(&ctl->tree_lock); + if (!used) + to_free = size; +diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c +index 56a7c99fc03e..85608acb9557 100644 +--- a/fs/btrfs/space-info.c ++++ b/fs/btrfs/space-info.c +@@ -181,6 +181,12 @@ void btrfs_clear_space_info_full(struct btrfs_fs_info *info) + found->full = 0; + } + ++/* ++ * Block groups with more than this value (percents) of unusable space will be ++ * scheduled for background reclaim. ++ */ ++#define BTRFS_DEFAULT_ZONED_RECLAIM_THRESH (75) ++ + static int create_space_info(struct btrfs_fs_info *info, u64 flags) + { + +@@ -203,6 +209,9 @@ static int create_space_info(struct btrfs_fs_info *info, u64 flags) + INIT_LIST_HEAD(&space_info->priority_tickets); + space_info->clamp = 1; + ++ if (btrfs_is_zoned(info)) ++ space_info->bg_reclaim_threshold = BTRFS_DEFAULT_ZONED_RECLAIM_THRESH; ++ + ret = btrfs_sysfs_add_space_info_type(info, space_info); + if (ret) + return ret; +diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h +index d841fed73492..a803e29bd781 100644 +--- a/fs/btrfs/space-info.h ++++ b/fs/btrfs/space-info.h +@@ -24,6 +24,12 @@ struct btrfs_space_info { + the space info if we had an ENOSPC in the + allocator. */ + ++ /* ++ * Once a block group drops below this threshold (percents) we'll ++ * schedule it for reclaim. ++ */ ++ int bg_reclaim_threshold; ++ + int clamp; /* Used to scale our threshold for preemptive + flushing. The value is >> clamp, so turns + out to be a 2^clamp divisor. 
*/ +diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c +index ba78ca5aabbb..43845cae0c74 100644 +--- a/fs/btrfs/sysfs.c ++++ b/fs/btrfs/sysfs.c +@@ -722,6 +722,42 @@ SPACE_INFO_ATTR(bytes_zone_unusable); + SPACE_INFO_ATTR(disk_used); + SPACE_INFO_ATTR(disk_total); + ++static ssize_t btrfs_sinfo_bg_reclaim_threshold_show(struct kobject *kobj, ++ struct kobj_attribute *a, ++ char *buf) ++{ ++ struct btrfs_space_info *space_info = to_space_info(kobj); ++ ssize_t ret; ++ ++ ret = sysfs_emit(buf, "%d\n", READ_ONCE(space_info->bg_reclaim_threshold)); ++ ++ return ret; ++} ++ ++static ssize_t btrfs_sinfo_bg_reclaim_threshold_store(struct kobject *kobj, ++ struct kobj_attribute *a, ++ const char *buf, size_t len) ++{ ++ struct btrfs_space_info *space_info = to_space_info(kobj); ++ int thresh; ++ int ret; ++ ++ ret = kstrtoint(buf, 10, &thresh); ++ if (ret) ++ return ret; ++ ++ if (thresh != 0 && (thresh <= 50 || thresh > 100)) ++ return -EINVAL; ++ ++ WRITE_ONCE(space_info->bg_reclaim_threshold, thresh); ++ ++ return len; ++} ++ ++BTRFS_ATTR_RW(space_info, bg_reclaim_threshold, ++ btrfs_sinfo_bg_reclaim_threshold_show, ++ btrfs_sinfo_bg_reclaim_threshold_store); ++ + /* + * Allocation information about block group types. + * +@@ -738,6 +774,7 @@ static struct attribute *space_info_attrs[] = { + BTRFS_ATTR_PTR(space_info, bytes_zone_unusable), + BTRFS_ATTR_PTR(space_info, disk_used), + BTRFS_ATTR_PTR(space_info, disk_total), ++ BTRFS_ATTR_PTR(space_info, bg_reclaim_threshold), + NULL, + }; + ATTRIBUTE_GROUPS(space_info); +diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h +index c424417e19bb..199b69670fa2 100644 +--- a/fs/btrfs/zoned.h ++++ b/fs/btrfs/zoned.h +@@ -10,11 +10,7 @@ + #include "block-group.h" + #include "btrfs_inode.h" + +-/* +- * Block groups with more than this value (percents) of unusable space will be +- * scheduled for background reclaim. +- */ +-#define BTRFS_DEFAULT_RECLAIM_THRESH 75 ++#define BTRFS_DEFAULT_RECLAIM_THRESH (75) + + struct btrfs_zoned_device_info { + /* +-- +2.35.1 + diff --git a/queue-5.18/btrfs-properly-flag-filesystem-with-btrfs_feature_in.patch b/queue-5.18/btrfs-properly-flag-filesystem-with-btrfs_feature_in.patch new file mode 100644 index 00000000000..795088e5545 --- /dev/null +++ b/queue-5.18/btrfs-properly-flag-filesystem-with-btrfs_feature_in.patch @@ -0,0 +1,72 @@ +From 2317054fdc824202a5af22cdb2b8b2f0fcc792d6 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 23 Jun 2022 10:55:47 +0300 +Subject: btrfs: properly flag filesystem with + BTRFS_FEATURE_INCOMPAT_BIG_METADATA + +From: Nikolay Borisov + +[ Upstream commit e26b04c4c91925dba57324db177a24e18e2d0013 ] + +Commit 6f93e834fa7c seemingly inadvertently moved the code responsible +for flagging the filesystem as having BIG_METADATA to a place where +setting the flag was essentially lost. This means that +filesystems created with kernels containing this bug (starting with 5.15) +can potentially be mounted by older (pre-3.4) kernels. In reality +chances for this happening are low because there are other incompat +flags introduced in the mean time. Still the correct behavior is to set +INCOMPAT_BIG_METADATA flag and persist this in the superblock. 
+ +Fixes: 6f93e834fa7c ("btrfs: fix upper limit for max_inline for page size 64K") +CC: stable@vger.kernel.org # 5.4+ +Reviewed-by: Qu Wenruo +Signed-off-by: Nikolay Borisov +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Sasha Levin +--- + fs/btrfs/disk-io.c | 21 +++++++++++---------- + 1 file changed, 11 insertions(+), 10 deletions(-) + +diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c +index f45470798022..34cd57d799e4 100644 +--- a/fs/btrfs/disk-io.c ++++ b/fs/btrfs/disk-io.c +@@ -3577,16 +3577,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device + */ + fs_info->compress_type = BTRFS_COMPRESS_ZLIB; + +- /* +- * Flag our filesystem as having big metadata blocks if they are bigger +- * than the page size. +- */ +- if (btrfs_super_nodesize(disk_super) > PAGE_SIZE) { +- if (!(features & BTRFS_FEATURE_INCOMPAT_BIG_METADATA)) +- btrfs_info(fs_info, +- "flagging fs with big metadata feature"); +- features |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA; +- } + + /* Set up fs_info before parsing mount options */ + nodesize = btrfs_super_nodesize(disk_super); +@@ -3627,6 +3617,17 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device + if (features & BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA) + btrfs_info(fs_info, "has skinny extents"); + ++ /* ++ * Flag our filesystem as having big metadata blocks if they are bigger ++ * than the page size. ++ */ ++ if (btrfs_super_nodesize(disk_super) > PAGE_SIZE) { ++ if (!(features & BTRFS_FEATURE_INCOMPAT_BIG_METADATA)) ++ btrfs_info(fs_info, ++ "flagging fs with big metadata feature"); ++ features |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA; ++ } ++ + /* + * mixed block groups end up with duplicate but slightly offset + * extent buffers for the same range. It leads to corruptions +-- +2.35.1 + diff --git a/queue-5.18/btrfs-replace-btrfs_max_extent_size-with-fs_info-max.patch b/queue-5.18/btrfs-replace-btrfs_max_extent_size-with-fs_info-max.patch new file mode 100644 index 00000000000..1e010423059 --- /dev/null +++ b/queue-5.18/btrfs-replace-btrfs_max_extent_size-with-fs_info-max.patch @@ -0,0 +1,222 @@ +From dcb75d1d2ed081e90e672a228bd75205ce484c3e Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Sat, 9 Jul 2022 08:18:40 +0900 +Subject: btrfs: replace BTRFS_MAX_EXTENT_SIZE with fs_info->max_extent_size + +From: Naohiro Aota + +[ Upstream commit f7b12a62f008a3041f42f2426983e59a6a0a3c59 ] + +On zoned filesystem, data write out is limited by max_zone_append_size, +and a large ordered extent is split according the size of a bio. OTOH, +the number of extents to be written is calculated using +BTRFS_MAX_EXTENT_SIZE, and that estimated number is used to reserve the +metadata bytes to update and/or create the metadata items. + +The metadata reservation is done at e.g, btrfs_buffered_write() and then +released according to the estimation changes. Thus, if the number of extent +increases massively, the reserved metadata can run out. + +The increase of the number of extents easily occurs on zoned filesystem +if BTRFS_MAX_EXTENT_SIZE > max_zone_append_size. And, it causes the +following warning on a small RAM environment with disabling metadata +over-commit (in the following patch). 
+ +[75721.498492] ------------[ cut here ]------------ +[75721.505624] BTRFS: block rsv 1 returned -28 +[75721.512230] WARNING: CPU: 24 PID: 2327559 at fs/btrfs/block-rsv.c:537 btrfs_use_block_rsv+0x560/0x760 [btrfs] +[75721.581854] CPU: 24 PID: 2327559 Comm: kworker/u64:10 Kdump: loaded Tainted: G W 5.18.0-rc2-BTRFS-ZNS+ #109 +[75721.597200] Hardware name: Supermicro Super Server/H12SSL-NT, BIOS 2.0 02/22/2021 +[75721.607310] Workqueue: btrfs-endio-write btrfs_work_helper [btrfs] +[75721.616209] RIP: 0010:btrfs_use_block_rsv+0x560/0x760 [btrfs] +[75721.646649] RSP: 0018:ffffc9000fbdf3e0 EFLAGS: 00010286 +[75721.654126] RAX: 0000000000000000 RBX: 0000000000004000 RCX: 0000000000000000 +[75721.663524] RDX: 0000000000000004 RSI: 0000000000000008 RDI: fffff52001f7be6e +[75721.672921] RBP: ffffc9000fbdf420 R08: 0000000000000001 R09: ffff889f8d1fc6c7 +[75721.682493] R10: ffffed13f1a3f8d8 R11: 0000000000000001 R12: ffff88980a3c0e28 +[75721.692284] R13: ffff889b66590000 R14: ffff88980a3c0e40 R15: ffff88980a3c0e8a +[75721.701878] FS: 0000000000000000(0000) GS:ffff889f8d000000(0000) knlGS:0000000000000000 +[75721.712601] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 +[75721.720726] CR2: 000055d12e05c018 CR3: 0000800193594000 CR4: 0000000000350ee0 +[75721.730499] Call Trace: +[75721.735166] +[75721.739886] btrfs_alloc_tree_block+0x1e1/0x1100 [btrfs] +[75721.747545] ? btrfs_alloc_logged_file_extent+0x550/0x550 [btrfs] +[75721.756145] ? btrfs_get_32+0xea/0x2d0 [btrfs] +[75721.762852] ? btrfs_get_32+0xea/0x2d0 [btrfs] +[75721.769520] ? push_leaf_left+0x420/0x620 [btrfs] +[75721.776431] ? memcpy+0x4e/0x60 +[75721.781931] split_leaf+0x433/0x12d0 [btrfs] +[75721.788392] ? btrfs_get_token_32+0x580/0x580 [btrfs] +[75721.795636] ? push_for_double_split.isra.0+0x420/0x420 [btrfs] +[75721.803759] ? leaf_space_used+0x15d/0x1a0 [btrfs] +[75721.811156] btrfs_search_slot+0x1bc3/0x2790 [btrfs] +[75721.818300] ? lock_downgrade+0x7c0/0x7c0 +[75721.824411] ? free_extent_buffer.part.0+0x107/0x200 [btrfs] +[75721.832456] ? split_leaf+0x12d0/0x12d0 [btrfs] +[75721.839149] ? free_extent_buffer.part.0+0x14f/0x200 [btrfs] +[75721.846945] ? free_extent_buffer+0x13/0x20 [btrfs] +[75721.853960] ? btrfs_release_path+0x4b/0x190 [btrfs] +[75721.861429] btrfs_csum_file_blocks+0x85c/0x1500 [btrfs] +[75721.869313] ? rcu_read_lock_sched_held+0x16/0x80 +[75721.876085] ? lock_release+0x552/0xf80 +[75721.881957] ? btrfs_del_csums+0x8c0/0x8c0 [btrfs] +[75721.888886] ? __kasan_check_write+0x14/0x20 +[75721.895152] ? do_raw_read_unlock+0x44/0x80 +[75721.901323] ? _raw_write_lock_irq+0x60/0x80 +[75721.907983] ? btrfs_global_root+0xb9/0xe0 [btrfs] +[75721.915166] ? btrfs_csum_root+0x12b/0x180 [btrfs] +[75721.921918] ? btrfs_get_global_root+0x820/0x820 [btrfs] +[75721.929166] ? _raw_write_unlock+0x23/0x40 +[75721.935116] ? unpin_extent_cache+0x1e3/0x390 [btrfs] +[75721.942041] btrfs_finish_ordered_io.isra.0+0xa0c/0x1dc0 [btrfs] +[75721.949906] ? try_to_wake_up+0x30/0x14a0 +[75721.955700] ? btrfs_unlink_subvol+0xda0/0xda0 [btrfs] +[75721.962661] ? rcu_read_lock_sched_held+0x16/0x80 +[75721.969111] ? lock_acquire+0x41b/0x4c0 +[75721.974982] finish_ordered_fn+0x15/0x20 [btrfs] +[75721.981639] btrfs_work_helper+0x1af/0xa80 [btrfs] +[75721.988184] ? _raw_spin_unlock_irq+0x28/0x50 +[75721.994643] process_one_work+0x815/0x1460 +[75722.000444] ? pwq_dec_nr_in_flight+0x250/0x250 +[75722.006643] ? do_raw_spin_trylock+0xbb/0x190 +[75722.013086] worker_thread+0x59a/0xeb0 +[75722.018511] kthread+0x2ac/0x360 +[75722.023428] ? 
process_one_work+0x1460/0x1460 +[75722.029431] ? kthread_complete_and_exit+0x30/0x30 +[75722.036044] ret_from_fork+0x22/0x30 +[75722.041255] +[75722.045047] irq event stamp: 0 +[75722.049703] hardirqs last enabled at (0): [<0000000000000000>] 0x0 +[75722.057610] hardirqs last disabled at (0): [] copy_process+0x1c1a/0x66b0 +[75722.067533] softirqs last enabled at (0): [] copy_process+0x1c59/0x66b0 +[75722.077423] softirqs last disabled at (0): [<0000000000000000>] 0x0 +[75722.085335] ---[ end trace 0000000000000000 ]--- + +To fix the estimation, we need to introduce fs_info->max_extent_size to +replace BTRFS_MAX_EXTENT_SIZE, which allow setting the different size for +regular vs zoned filesystem. + +Set fs_info->max_extent_size to BTRFS_MAX_EXTENT_SIZE by default. On zoned +filesystem, it is set to fs_info->max_zone_append_size. + +CC: stable@vger.kernel.org # 5.12+ +Fixes: d8e3fb106f39 ("btrfs: zoned: use ZONE_APPEND write for zoned mode") +Reviewed-by: Johannes Thumshirn +Signed-off-by: Naohiro Aota +Signed-off-by: David Sterba +Signed-off-by: Sasha Levin +--- + fs/btrfs/ctree.h | 6 ++++++ + fs/btrfs/disk-io.c | 2 ++ + fs/btrfs/extent_io.c | 4 +++- + fs/btrfs/inode.c | 6 ++++-- + fs/btrfs/zoned.c | 5 ++++- + 5 files changed, 19 insertions(+), 4 deletions(-) + +diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h +index 1c377bcfe787..97f5a3d320ff 100644 +--- a/fs/btrfs/ctree.h ++++ b/fs/btrfs/ctree.h +@@ -1032,6 +1032,12 @@ struct btrfs_fs_info { + u32 csums_per_leaf; + u32 stripesize; + ++ /* ++ * Maximum size of an extent. BTRFS_MAX_EXTENT_SIZE on regular ++ * filesystem, on zoned it depends on the device constraints. ++ */ ++ u64 max_extent_size; ++ + /* Block groups and devices containing active swapfiles. */ + spinlock_t swapfile_pins_lock; + struct rb_root swapfile_pins; +diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c +index 34cd57d799e4..bf5c6ac67e87 100644 +--- a/fs/btrfs/disk-io.c ++++ b/fs/btrfs/disk-io.c +@@ -3246,6 +3246,8 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) + fs_info->sectorsize_bits = ilog2(4096); + fs_info->stripesize = 4096; + ++ fs_info->max_extent_size = BTRFS_MAX_EXTENT_SIZE; ++ + spin_lock_init(&fs_info->swapfile_pins_lock); + fs_info->swapfile_pins = RB_ROOT; + +diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c +index 68ddd90685d9..bfc7d5b31156 100644 +--- a/fs/btrfs/extent_io.c ++++ b/fs/btrfs/extent_io.c +@@ -1992,10 +1992,12 @@ noinline_for_stack bool find_lock_delalloc_range(struct inode *inode, + struct page *locked_page, u64 *start, + u64 *end) + { ++ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; + const u64 orig_start = *start; + const u64 orig_end = *end; +- u64 max_bytes = BTRFS_MAX_EXTENT_SIZE; ++ /* The sanity tests may not set a valid fs_info. */ ++ u64 max_bytes = fs_info ? 
fs_info->max_extent_size : BTRFS_MAX_EXTENT_SIZE; + u64 delalloc_start; + u64 delalloc_end; + bool found; +diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c +index 1e404476fe6a..c50288d90c66 100644 +--- a/fs/btrfs/inode.c ++++ b/fs/btrfs/inode.c +@@ -2102,6 +2102,7 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page + void btrfs_split_delalloc_extent(struct inode *inode, + struct extent_state *orig, u64 split) + { ++ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + u64 size; + + /* not delalloc, ignore it */ +@@ -2109,7 +2110,7 @@ void btrfs_split_delalloc_extent(struct inode *inode, + return; + + size = orig->end - orig->start + 1; +- if (size > BTRFS_MAX_EXTENT_SIZE) { ++ if (size > fs_info->max_extent_size) { + u32 num_extents; + u64 new_size; + +@@ -2138,6 +2139,7 @@ void btrfs_split_delalloc_extent(struct inode *inode, + void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new, + struct extent_state *other) + { ++ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + u64 new_size, old_size; + u32 num_extents; + +@@ -2151,7 +2153,7 @@ void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new, + new_size = other->end - new->start + 1; + + /* we're not bigger than the max, unreserve the space and go */ +- if (new_size <= BTRFS_MAX_EXTENT_SIZE) { ++ if (new_size <= fs_info->max_extent_size) { + spin_lock(&BTRFS_I(inode)->lock); + btrfs_mod_outstanding_extents(BTRFS_I(inode), -1); + spin_unlock(&BTRFS_I(inode)->lock); +diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c +index 1d5b9308f5ef..a0bf2c20fa61 100644 +--- a/fs/btrfs/zoned.c ++++ b/fs/btrfs/zoned.c +@@ -731,8 +731,11 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) + } + + fs_info->zone_size = zone_size; +- fs_info->max_zone_append_size = max_zone_append_size; ++ fs_info->max_zone_append_size = ALIGN_DOWN(max_zone_append_size, ++ fs_info->sectorsize); + fs_info->fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_ZONED; ++ if (fs_info->max_zone_append_size < fs_info->max_extent_size) ++ fs_info->max_extent_size = fs_info->max_zone_append_size; + + /* + * Check mount options here, because we might change fs_info->zoned +-- +2.35.1 + diff --git a/queue-5.18/btrfs-reset-block-group-chunk-force-if-we-have-to-wa.patch b/queue-5.18/btrfs-reset-block-group-chunk-force-if-we-have-to-wa.patch new file mode 100644 index 00000000000..ab43457423b --- /dev/null +++ b/queue-5.18/btrfs-reset-block-group-chunk-force-if-we-have-to-wa.patch @@ -0,0 +1,42 @@ +From 7b8c917f29d18606d01d4f3ae4aab8f10342c9f7 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 13 Jun 2022 18:31:17 -0400 +Subject: btrfs: reset block group chunk force if we have to wait + +From: Josef Bacik + +[ Upstream commit 1314ca78b2c35d3e7d0f097268a2ee6dc0d369ef ] + +If you try to force a chunk allocation, but you race with another chunk +allocation, you will end up waiting on the chunk allocation that just +occurred and then allocate another chunk. If you have many threads all +doing this at once you can way over-allocate chunks. + +Fix this by resetting force to NO_FORCE, that way if we think we need to +allocate we can, otherwise we don't force another chunk allocation if +one is already happening. 
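+
+A simplified view of the flow being fixed (locking and the retry loop are
+omitted here; the actual change is the single added line in the
+btrfs_chunk_alloc() hunk below):
+
+	if (space_info->chunk_alloc) {
+		/*
+		 * Another allocation is already in flight: wait for it, and
+		 * drop the force level so the retry does not pile a second
+		 * chunk on top of the one being created right now.
+		 */
+		wait_for_alloc = true;
+		force = CHUNK_ALLOC_NO_FORCE;
+	}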
+ +Reviewed-by: Filipe Manana +CC: stable@vger.kernel.org # 5.4+ +Signed-off-by: Josef Bacik +Signed-off-by: David Sterba +Signed-off-by: Sasha Levin +--- + fs/btrfs/block-group.c | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c +index 667b7025d503..1deca5164c23 100644 +--- a/fs/btrfs/block-group.c ++++ b/fs/btrfs/block-group.c +@@ -3724,6 +3724,7 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags, + * attempt. + */ + wait_for_alloc = true; ++ force = CHUNK_ALLOC_NO_FORCE; + spin_unlock(&space_info->lock); + mutex_lock(&fs_info->chunk_mutex); + mutex_unlock(&fs_info->chunk_mutex); +-- +2.35.1 + diff --git a/queue-5.18/btrfs-store-chunk-size-in-space-info-struct.patch b/queue-5.18/btrfs-store-chunk-size-in-space-info-struct.patch new file mode 100644 index 00000000000..0767c579892 --- /dev/null +++ b/queue-5.18/btrfs-store-chunk-size-in-space-info-struct.patch @@ -0,0 +1,141 @@ +From 43c91308d645899ddb58951c23e4338cb43cb48c Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 8 Feb 2022 11:31:20 -0800 +Subject: btrfs: store chunk size in space-info struct + +From: Stefan Roesch + +[ Upstream commit f6fca3917b4d99d8c13901738afec35f570a3c2f ] + +The chunk size is stored in the btrfs_space_info structure. It is +initialized at the start and is then used. + +A new API is added to update the current chunk size. This API is used +to be able to expose the chunk_size as a sysfs setting. + +Signed-off-by: Stefan Roesch +Reviewed-by: David Sterba +[ rename and merge helpers, switch atomic type to u64, style fixes ] +Signed-off-by: David Sterba +Signed-off-by: Sasha Levin +--- + fs/btrfs/space-info.c | 32 ++++++++++++++++++++++++++++++++ + fs/btrfs/space-info.h | 4 ++++ + fs/btrfs/volumes.c | 28 +++++++++------------------- + 3 files changed, 45 insertions(+), 19 deletions(-) + +diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c +index 85608acb9557..98a84b523be6 100644 +--- a/fs/btrfs/space-info.c ++++ b/fs/btrfs/space-info.c +@@ -187,6 +187,37 @@ void btrfs_clear_space_info_full(struct btrfs_fs_info *info) + */ + #define BTRFS_DEFAULT_ZONED_RECLAIM_THRESH (75) + ++/* ++ * Calculate chunk size depending on volume type (regular or zoned). ++ */ ++static u64 calc_chunk_size(const struct btrfs_fs_info *fs_info, u64 flags) ++{ ++ if (btrfs_is_zoned(fs_info)) ++ return fs_info->zone_size; ++ ++ ASSERT(flags & BTRFS_BLOCK_GROUP_TYPE_MASK); ++ ++ if (flags & BTRFS_BLOCK_GROUP_DATA) ++ return SZ_1G; ++ else if (flags & BTRFS_BLOCK_GROUP_SYSTEM) ++ return SZ_32M; ++ ++ /* Handle BTRFS_BLOCK_GROUP_METADATA */ ++ if (fs_info->fs_devices->total_rw_bytes > 50ULL * SZ_1G) ++ return SZ_1G; ++ ++ return SZ_256M; ++} ++ ++/* ++ * Update default chunk size. 
++ */ ++void btrfs_update_space_info_chunk_size(struct btrfs_space_info *space_info, ++ u64 chunk_size) ++{ ++ WRITE_ONCE(space_info->chunk_size, chunk_size); ++} ++ + static int create_space_info(struct btrfs_fs_info *info, u64 flags) + { + +@@ -208,6 +239,7 @@ static int create_space_info(struct btrfs_fs_info *info, u64 flags) + INIT_LIST_HEAD(&space_info->tickets); + INIT_LIST_HEAD(&space_info->priority_tickets); + space_info->clamp = 1; ++ btrfs_update_space_info_chunk_size(space_info, calc_chunk_size(info, flags)); + + if (btrfs_is_zoned(info)) + space_info->bg_reclaim_threshold = BTRFS_DEFAULT_ZONED_RECLAIM_THRESH; +diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h +index a803e29bd781..137206b8049f 100644 +--- a/fs/btrfs/space-info.h ++++ b/fs/btrfs/space-info.h +@@ -23,6 +23,8 @@ struct btrfs_space_info { + u64 max_extent_size; /* This will hold the maximum extent size of + the space info if we had an ENOSPC in the + allocator. */ ++ /* Chunk size in bytes */ ++ u64 chunk_size; + + /* + * Once a block group drops below this threshold (percents) we'll +@@ -121,6 +123,8 @@ void btrfs_update_space_info(struct btrfs_fs_info *info, u64 flags, + u64 total_bytes, u64 bytes_used, + u64 bytes_readonly, u64 bytes_zone_unusable, + struct btrfs_space_info **space_info); ++void btrfs_update_space_info_chunk_size(struct btrfs_space_info *space_info, ++ u64 chunk_size); + struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info, + u64 flags); + u64 __pure btrfs_space_info_used(struct btrfs_space_info *s_info, +diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c +index 659575526e9f..4bc97e7d8e46 100644 +--- a/fs/btrfs/volumes.c ++++ b/fs/btrfs/volumes.c +@@ -5091,26 +5091,16 @@ static void init_alloc_chunk_ctl_policy_regular( + struct btrfs_fs_devices *fs_devices, + struct alloc_chunk_ctl *ctl) + { +- u64 type = ctl->type; ++ struct btrfs_space_info *space_info; + +- if (type & BTRFS_BLOCK_GROUP_DATA) { +- ctl->max_stripe_size = SZ_1G; +- ctl->max_chunk_size = BTRFS_MAX_DATA_CHUNK_SIZE; +- } else if (type & BTRFS_BLOCK_GROUP_METADATA) { +- /* For larger filesystems, use larger metadata chunks */ +- if (fs_devices->total_rw_bytes > 50ULL * SZ_1G) +- ctl->max_stripe_size = SZ_1G; +- else +- ctl->max_stripe_size = SZ_256M; +- ctl->max_chunk_size = ctl->max_stripe_size; +- } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { +- ctl->max_stripe_size = SZ_32M; +- ctl->max_chunk_size = 2 * ctl->max_stripe_size; +- ctl->devs_max = min_t(int, ctl->devs_max, +- BTRFS_MAX_DEVS_SYS_CHUNK); +- } else { +- BUG(); +- } ++ space_info = btrfs_find_space_info(fs_devices->fs_info, ctl->type); ++ ASSERT(space_info); ++ ++ ctl->max_chunk_size = READ_ONCE(space_info->chunk_size); ++ ctl->max_stripe_size = ctl->max_chunk_size; ++ ++ if (ctl->type & BTRFS_BLOCK_GROUP_SYSTEM) ++ ctl->devs_max = min_t(int, ctl->devs_max, BTRFS_MAX_DEVS_SYS_CHUNK); + + /* We don't want a chunk larger than 10% of writable space */ + ctl->max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1), +-- +2.35.1 + diff --git a/queue-5.18/btrfs-tree-log-make-the-return-value-for-log-syncing.patch b/queue-5.18/btrfs-tree-log-make-the-return-value-for-log-syncing.patch new file mode 100644 index 00000000000..3dba67e719b --- /dev/null +++ b/queue-5.18/btrfs-tree-log-make-the-return-value-for-log-syncing.patch @@ -0,0 +1,142 @@ +From f844f5c9f2598302a23d546bfe15914ee71b9c29 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 13 Jun 2022 15:09:48 -0400 +Subject: btrfs: tree-log: make the return value for log syncing 
consistent + +From: Josef Bacik + +[ Upstream commit f31f09f6be1c6c1a673e0566e258281a7bbaaa51 ] + +Currently we will return 1 or -EAGAIN if we decide we need to commit +the transaction rather than sync the log. In practice this doesn't +really matter, we interpret any !0 and !BTRFS_NO_LOG_SYNC as needing to +commit the transaction. However this makes it hard to figure out what +the correct thing to do is. + +Fix this up by defining BTRFS_LOG_FORCE_COMMIT and using this in all the +places where we want to force the transaction to be committed. + +CC: stable@vger.kernel.org # 5.15+ +Reviewed-by: Filipe Manana +Signed-off-by: Josef Bacik +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Sasha Levin +--- + fs/btrfs/file.c | 2 +- + fs/btrfs/tree-log.c | 18 +++++++++--------- + fs/btrfs/tree-log.h | 3 +++ + 3 files changed, 13 insertions(+), 10 deletions(-) + +diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c +index 153920acd226..2d24f2dcc0ea 100644 +--- a/fs/btrfs/file.c ++++ b/fs/btrfs/file.c +@@ -2344,7 +2344,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) + btrfs_release_log_ctx_extents(&ctx); + if (ret < 0) { + /* Fallthrough and commit/free transaction. */ +- ret = 1; ++ ret = BTRFS_LOG_FORCE_COMMIT; + } + + /* we've logged all the items and now have a consistent +diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c +index e65633686378..08917069a125 100644 +--- a/fs/btrfs/tree-log.c ++++ b/fs/btrfs/tree-log.c +@@ -171,7 +171,7 @@ static int start_log_trans(struct btrfs_trans_handle *trans, + int index = (root->log_transid + 1) % 2; + + if (btrfs_need_log_full_commit(trans)) { +- ret = -EAGAIN; ++ ret = BTRFS_LOG_FORCE_COMMIT; + goto out; + } + +@@ -194,7 +194,7 @@ static int start_log_trans(struct btrfs_trans_handle *trans, + * writing. 
+ */ + if (zoned && !created) { +- ret = -EAGAIN; ++ ret = BTRFS_LOG_FORCE_COMMIT; + goto out; + } + +@@ -3122,7 +3122,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, + + /* bail out if we need to do a full commit */ + if (btrfs_need_log_full_commit(trans)) { +- ret = -EAGAIN; ++ ret = BTRFS_LOG_FORCE_COMMIT; + mutex_unlock(&root->log_mutex); + goto out; + } +@@ -3223,7 +3223,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, + } + btrfs_wait_tree_log_extents(log, mark); + mutex_unlock(&log_root_tree->log_mutex); +- ret = -EAGAIN; ++ ret = BTRFS_LOG_FORCE_COMMIT; + goto out; + } + +@@ -3262,7 +3262,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, + blk_finish_plug(&plug); + btrfs_wait_tree_log_extents(log, mark); + mutex_unlock(&log_root_tree->log_mutex); +- ret = -EAGAIN; ++ ret = BTRFS_LOG_FORCE_COMMIT; + goto out_wake_log_root; + } + +@@ -5849,7 +5849,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, + inode_only == LOG_INODE_ALL && + inode->last_unlink_trans >= trans->transid) { + btrfs_set_log_full_commit(trans); +- ret = 1; ++ ret = BTRFS_LOG_FORCE_COMMIT; + goto out_unlock; + } + +@@ -6563,12 +6563,12 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, + bool log_dentries = false; + + if (btrfs_test_opt(fs_info, NOTREELOG)) { +- ret = 1; ++ ret = BTRFS_LOG_FORCE_COMMIT; + goto end_no_trans; + } + + if (btrfs_root_refs(&root->root_item) == 0) { +- ret = 1; ++ ret = BTRFS_LOG_FORCE_COMMIT; + goto end_no_trans; + } + +@@ -6666,7 +6666,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, + end_trans: + if (ret < 0) { + btrfs_set_log_full_commit(trans); +- ret = 1; ++ ret = BTRFS_LOG_FORCE_COMMIT; + } + + if (ret) +diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h +index 1620f8170629..57ab5f3b8dc7 100644 +--- a/fs/btrfs/tree-log.h ++++ b/fs/btrfs/tree-log.h +@@ -12,6 +12,9 @@ + /* return value for btrfs_log_dentry_safe that means we don't need to log it at all */ + #define BTRFS_NO_LOG_SYNC 256 + ++/* We can't use the tree log for whatever reason, force a transaction commit */ ++#define BTRFS_LOG_FORCE_COMMIT (1) ++ + struct btrfs_log_ctx { + int log_ret; + int log_transid; +-- +2.35.1 + diff --git a/queue-5.18/btrfs-zoned-activate-metadata-block-group-on-flush_s.patch b/queue-5.18/btrfs-zoned-activate-metadata-block-group-on-flush_s.patch new file mode 100644 index 00000000000..5446c35a7a6 --- /dev/null +++ b/queue-5.18/btrfs-zoned-activate-metadata-block-group-on-flush_s.patch @@ -0,0 +1,180 @@ +From d866ac5585bc1afadfcc6bb5649813cd1f0f82ab Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Sat, 9 Jul 2022 08:18:47 +0900 +Subject: btrfs: zoned: activate metadata block group on flush_space + +From: Naohiro Aota + +[ Upstream commit b0931513913633044ed6e3800334c28433c007b0 ] + +For metadata space on zoned filesystem, reaching ALLOC_CHUNK{,_FORCE} +means we don't have enough space left in the active_total_bytes. Before +allocating a new chunk, we can try to activate an existing block group +in this case. + +Also, allocating a chunk is not enough to grant a ticket for metadata +space on zoned filesystem we need to activate the block group to +increase the active_total_bytes. + +btrfs_zoned_activate_one_bg() implements the activation feature. It will +activate a block group by (maybe) finishing a block group. It will give up +activating a block group if it cannot finish any block group. 
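+
+A rough caller-side sketch of how the return value is meant to be consumed
+(hypothetical caller; the real call sites are the ones added to
+flush_space() in the hunk below):
+
+	ret = btrfs_zoned_activate_one_bg(fs_info, space_info, false);
+	if (ret < 0)
+		return ret;	/* hard error while (maybe) finishing a zone */
+	if (ret == 1)
+		return 0;	/* an existing block group was activated, done */
+	/* ret == 0: nothing could be activated, go allocate a new chunk */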
+ +CC: stable@vger.kernel.org # 5.16+ +Fixes: afba2bc036b0 ("btrfs: zoned: implement active zone tracking") +Signed-off-by: Naohiro Aota +Signed-off-by: David Sterba +Signed-off-by: Sasha Levin +--- + fs/btrfs/space-info.c | 30 ++++++++++++++++++++++++ + fs/btrfs/zoned.c | 53 +++++++++++++++++++++++++++++++++++++++++++ + fs/btrfs/zoned.h | 10 ++++++++ + 3 files changed, 93 insertions(+) + +diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c +index 4867199cf983..104cbc901c0e 100644 +--- a/fs/btrfs/space-info.c ++++ b/fs/btrfs/space-info.c +@@ -9,6 +9,7 @@ + #include "ordered-data.h" + #include "transaction.h" + #include "block-group.h" ++#include "zoned.h" + + /* + * HOW DOES SPACE RESERVATION WORK +@@ -724,6 +725,18 @@ static void flush_space(struct btrfs_fs_info *fs_info, + break; + case ALLOC_CHUNK: + case ALLOC_CHUNK_FORCE: ++ /* ++ * For metadata space on zoned filesystem, reaching here means we ++ * don't have enough space left in active_total_bytes. Try to ++ * activate a block group first, because we may have inactive ++ * block group already allocated. ++ */ ++ ret = btrfs_zoned_activate_one_bg(fs_info, space_info, false); ++ if (ret < 0) ++ break; ++ else if (ret == 1) ++ break; ++ + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); +@@ -734,6 +747,23 @@ static void flush_space(struct btrfs_fs_info *fs_info, + (state == ALLOC_CHUNK) ? CHUNK_ALLOC_NO_FORCE : + CHUNK_ALLOC_FORCE); + btrfs_end_transaction(trans); ++ ++ /* ++ * For metadata space on zoned filesystem, allocating a new chunk ++ * is not enough. We still need to activate the block * group. ++ * Active the newly allocated block group by (maybe) finishing ++ * a block group. ++ */ ++ if (ret == 1) { ++ ret = btrfs_zoned_activate_one_bg(fs_info, space_info, true); ++ /* ++ * Revert to the original ret regardless we could finish ++ * one block group or not. ++ */ ++ if (ret >= 0) ++ ret = 1; ++ } ++ + if (ret > 0 || ret == -ENOSPC) + ret = 0; + break; +diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c +index 2ffc6d50d20d..0c2d81b0e3d3 100644 +--- a/fs/btrfs/zoned.c ++++ b/fs/btrfs/zoned.c +@@ -2222,3 +2222,56 @@ int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info) + + return ret < 0 ? 
ret : 1; + } ++ ++int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info, ++ struct btrfs_space_info *space_info, ++ bool do_finish) ++{ ++ struct btrfs_block_group *bg; ++ int index; ++ ++ if (!btrfs_is_zoned(fs_info) || (space_info->flags & BTRFS_BLOCK_GROUP_DATA)) ++ return 0; ++ ++ /* No more block groups to activate */ ++ if (space_info->active_total_bytes == space_info->total_bytes) ++ return 0; ++ ++ for (;;) { ++ int ret; ++ bool need_finish = false; ++ ++ down_read(&space_info->groups_sem); ++ for (index = 0; index < BTRFS_NR_RAID_TYPES; index++) { ++ list_for_each_entry(bg, &space_info->block_groups[index], ++ list) { ++ if (!spin_trylock(&bg->lock)) ++ continue; ++ if (btrfs_zoned_bg_is_full(bg) || bg->zone_is_active) { ++ spin_unlock(&bg->lock); ++ continue; ++ } ++ spin_unlock(&bg->lock); ++ ++ if (btrfs_zone_activate(bg)) { ++ up_read(&space_info->groups_sem); ++ return 1; ++ } ++ ++ need_finish = true; ++ } ++ } ++ up_read(&space_info->groups_sem); ++ ++ if (!do_finish || !need_finish) ++ break; ++ ++ ret = btrfs_zone_finish_one_bg(fs_info); ++ if (ret == 0) ++ break; ++ if (ret < 0) ++ return ret; ++ } ++ ++ return 0; ++} +diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h +index 0740458894ac..1cac32266276 100644 +--- a/fs/btrfs/zoned.h ++++ b/fs/btrfs/zoned.h +@@ -80,6 +80,8 @@ void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info); + void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info, u64 logical, + u64 length); + int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info); ++int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info, ++ struct btrfs_space_info *space_info, bool do_finish); + #else /* CONFIG_BLK_DEV_ZONED */ + static inline int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos, + struct blk_zone *zone) +@@ -250,6 +252,14 @@ static inline int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info) + return 1; + } + ++static inline int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info, ++ struct btrfs_space_info *space_info, ++ bool do_finish) ++{ ++ /* Consider all the block groups are active */ ++ return 0; ++} ++ + #endif + + static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos) +-- +2.35.1 + diff --git a/queue-5.18/btrfs-zoned-activate-necessary-block-group.patch b/queue-5.18/btrfs-zoned-activate-necessary-block-group.patch new file mode 100644 index 00000000000..e2166bcf6a8 --- /dev/null +++ b/queue-5.18/btrfs-zoned-activate-necessary-block-group.patch @@ -0,0 +1,60 @@ +From eae937137d623d9fd942a7525307563e157bc79a Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Sat, 9 Jul 2022 08:18:48 +0900 +Subject: btrfs: zoned: activate necessary block group + +From: Naohiro Aota + +[ Upstream commit b6a98021e4019c562a23ad151a7e40adfa9f91e5 ] + +There are two places where allocating a chunk is not enough. These two +places are trying to ensure the space by allocating a chunk. To meet the +condition for active_total_bytes, we also need to activate a block group +there. 
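+
+The resulting allocate-then-activate pattern, roughly (condensed from the
+btrfs_inc_block_group_ro() hunk below; the second call site in
+reserve_chunk_space() follows the same shape):
+
+	ret = btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
+	if (ret < 0)
+		goto out;
+	/* on zoned, the new chunk only grants tickets once its zone is active */
+	ret = btrfs_zoned_activate_one_bg(fs_info, cache->space_info, true);
+	if (ret < 0)
+		goto out;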
+ +CC: stable@vger.kernel.org # 5.16+ +Fixes: afba2bc036b0 ("btrfs: zoned: implement active zone tracking") +Signed-off-by: Naohiro Aota +Signed-off-by: David Sterba +Signed-off-by: Sasha Levin +--- + fs/btrfs/block-group.c | 16 ++++++++++++++++ + 1 file changed, 16 insertions(+) + +diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c +index 88f59a2e4113..0c7fe3142d7c 100644 +--- a/fs/btrfs/block-group.c ++++ b/fs/btrfs/block-group.c +@@ -2659,6 +2659,14 @@ int btrfs_inc_block_group_ro(struct btrfs_block_group *cache, + ret = btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE); + if (ret < 0) + goto out; ++ /* ++ * We have allocated a new chunk. We also need to activate that chunk to ++ * grant metadata tickets for zoned filesystem. ++ */ ++ ret = btrfs_zoned_activate_one_bg(fs_info, cache->space_info, true); ++ if (ret < 0) ++ goto out; ++ + ret = inc_block_group_ro(cache, 0); + if (ret == -ETXTBSY) + goto unlock_out; +@@ -3853,6 +3861,14 @@ static void reserve_chunk_space(struct btrfs_trans_handle *trans, + if (IS_ERR(bg)) { + ret = PTR_ERR(bg); + } else { ++ /* ++ * We have a new chunk. We also need to activate it for ++ * zoned filesystem. ++ */ ++ ret = btrfs_zoned_activate_one_bg(fs_info, info, true); ++ if (ret < 0) ++ return; ++ + /* + * If we fail to add the chunk item here, we end up + * trying again at phase 2 of chunk allocation, at +-- +2.35.1 + diff --git a/queue-5.18/btrfs-zoned-disable-metadata-overcommit-for-zoned.patch b/queue-5.18/btrfs-zoned-disable-metadata-overcommit-for-zoned.patch new file mode 100644 index 00000000000..472b3b8d2aa --- /dev/null +++ b/queue-5.18/btrfs-zoned-disable-metadata-overcommit-for-zoned.patch @@ -0,0 +1,46 @@ +From 28f8ac17eaf7d79fae9a491d13a968e609237c54 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Sat, 9 Jul 2022 08:18:46 +0900 +Subject: btrfs: zoned: disable metadata overcommit for zoned + +From: Naohiro Aota + +[ Upstream commit 79417d040f4f77b19c701bccc23013b9cdac358d ] + +The metadata overcommit makes the space reservation flexible but it is also +harmful to active zone tracking. Since we cannot finish a block group from +the metadata allocation context, we might not activate a new block group +and might not be able to actually write out the overcommit reservations. + +So, disable metadata overcommit for zoned filesystems. We will ensure +the reservations are under active_total_bytes in the following patches. 
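+
+In effect, the overcommit test collapses as follows for zoned metadata
+(condensed from the btrfs_can_overcommit() hunk below):
+
+	if (btrfs_is_zoned(fs_info) && (space_info->flags & BTRFS_BLOCK_GROUP_METADATA))
+		avail = 0;	/* no overcommit on zoned metadata */
+	else
+		avail = calc_available_free_space(fs_info, space_info, flush);
+
+	/* with avail == 0 this is simply: used + bytes < total_bytes */
+	if (used + bytes < space_info->total_bytes + avail)
+		return 1;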
+ +CC: stable@vger.kernel.org # 5.16+ +Fixes: afba2bc036b0 ("btrfs: zoned: implement active zone tracking") +Signed-off-by: Naohiro Aota +Reviewed-by: Johannes Thumshirn +Signed-off-by: David Sterba +Signed-off-by: Sasha Levin +--- + fs/btrfs/space-info.c | 5 ++++- + 1 file changed, 4 insertions(+), 1 deletion(-) + +diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c +index b87931a458eb..56a7c99fc03e 100644 +--- a/fs/btrfs/space-info.c ++++ b/fs/btrfs/space-info.c +@@ -340,7 +340,10 @@ int btrfs_can_overcommit(struct btrfs_fs_info *fs_info, + return 0; + + used = btrfs_space_info_used(space_info, true); +- avail = calc_available_free_space(fs_info, space_info, flush); ++ if (btrfs_is_zoned(fs_info) && (space_info->flags & BTRFS_BLOCK_GROUP_METADATA)) ++ avail = 0; ++ else ++ avail = calc_available_free_space(fs_info, space_info, flush); + + if (used + bytes < space_info->total_bytes + avail) + return 1; +-- +2.35.1 + diff --git a/queue-5.18/btrfs-zoned-finish-least-available-block-group-on-da.patch b/queue-5.18/btrfs-zoned-finish-least-available-block-group-on-da.patch new file mode 100644 index 00000000000..1e379e7d093 --- /dev/null +++ b/queue-5.18/btrfs-zoned-finish-least-available-block-group-on-da.patch @@ -0,0 +1,188 @@ +From fe36ff205f09ab9fb8e3d7405b25826217ca9aec Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Sat, 9 Jul 2022 08:18:44 +0900 +Subject: btrfs: zoned: finish least available block group on data bg + allocation + +From: Naohiro Aota + +[ Upstream commit 393f646e34c18b85d0f41272bfcbd475ae3a0d34 ] + +When we run out of active zones and no sufficient space is left in any +block groups, we need to finish one block group to make room to activate a +new block group. + +However, we cannot do this for metadata block groups because we can cause a +deadlock by waiting for a running transaction commit. So, do that only for +a data block group. + +Furthermore, the block group to be finished has two requirements. First, +the block group must not have reserved bytes left. Having reserved bytes +means we have an allocated region but did not yet send bios for it. If that +region is allocated by the thread calling btrfs_zone_finish(), it results +in a deadlock. + +Second, the block group to be finished must not be a SYSTEM block +group. Finishing a SYSTEM block group easily breaks further chunk +allocation by nullifying the SYSTEM free space. + +In a certain case, we cannot find any zone finish candidate or +btrfs_zone_finish() may fail. In that case, we fall back to split the +allocation bytes and fill the last spaces left in the block groups. 
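+
+The candidate selection, roughly (refcounting and locking omitted; the
+complete version is the new btrfs_zone_finish_one_bg() in the zoned.c hunk
+below):
+
+	list_for_each_entry(bg, &fs_info->zone_active_bgs, active_bg_list) {
+		/* skip per the two requirements above */
+		if (bg->reserved || (bg->flags & BTRFS_BLOCK_GROUP_SYSTEM))
+			continue;
+		avail = bg->zone_capacity - bg->alloc_offset;
+		if (avail < min_avail) {	/* track the group with least room left */
+			min_bg = bg;
+			min_avail = avail;
+		}
+	}
+	/* finish min_bg if one was found, otherwise fall back to splitting the write */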
+ +CC: stable@vger.kernel.org # 5.16+ +Fixes: afba2bc036b0 ("btrfs: zoned: implement active zone tracking") +Reviewed-by: Johannes Thumshirn +Signed-off-by: Naohiro Aota +Signed-off-by: David Sterba +Signed-off-by: Sasha Levin +--- + fs/btrfs/extent-tree.c | 50 +++++++++++++++++++++++++++++++++--------- + fs/btrfs/zoned.c | 40 +++++++++++++++++++++++++++++++++ + fs/btrfs/zoned.h | 7 ++++++ + 3 files changed, 87 insertions(+), 10 deletions(-) + +diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c +index 8bdcbc0c6d60..bdebd77f31b4 100644 +--- a/fs/btrfs/extent-tree.c ++++ b/fs/btrfs/extent-tree.c +@@ -3985,6 +3985,45 @@ static void found_extent(struct find_free_extent_ctl *ffe_ctl, + } + } + ++static int can_allocate_chunk_zoned(struct btrfs_fs_info *fs_info, ++ struct find_free_extent_ctl *ffe_ctl) ++{ ++ /* If we can activate new zone, just allocate a chunk and use it */ ++ if (btrfs_can_activate_zone(fs_info->fs_devices, ffe_ctl->flags)) ++ return 0; ++ ++ /* ++ * We already reached the max active zones. Try to finish one block ++ * group to make a room for a new block group. This is only possible ++ * for a data block group because btrfs_zone_finish() may need to wait ++ * for a running transaction which can cause a deadlock for metadata ++ * allocation. ++ */ ++ if (ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA) { ++ int ret = btrfs_zone_finish_one_bg(fs_info); ++ ++ if (ret == 1) ++ return 0; ++ else if (ret < 0) ++ return ret; ++ } ++ ++ /* ++ * If we have enough free space left in an already active block group ++ * and we can't activate any other zone now, do not allow allocating a ++ * new chunk and let find_free_extent() retry with a smaller size. ++ */ ++ if (ffe_ctl->max_extent_size >= ffe_ctl->min_alloc_size) ++ return -ENOSPC; ++ ++ /* ++ * We cannot activate a new block group and no enough space left in any ++ * block groups. So, allocating a new block group may not help. But, ++ * there is nothing to do anyway, so let's go with it. ++ */ ++ return 0; ++} ++ + static int can_allocate_chunk(struct btrfs_fs_info *fs_info, + struct find_free_extent_ctl *ffe_ctl) + { +@@ -3992,16 +4031,7 @@ static int can_allocate_chunk(struct btrfs_fs_info *fs_info, + case BTRFS_EXTENT_ALLOC_CLUSTERED: + return 0; + case BTRFS_EXTENT_ALLOC_ZONED: +- /* +- * If we have enough free space left in an already +- * active block group and we can't activate any other +- * zone now, do not allow allocating a new chunk and +- * let find_free_extent() retry with a smaller size. 
+- */ +- if (ffe_ctl->max_extent_size >= ffe_ctl->min_alloc_size && +- !btrfs_can_activate_zone(fs_info->fs_devices, ffe_ctl->flags)) +- return -ENOSPC; +- return 0; ++ return can_allocate_chunk_zoned(fs_info, ffe_ctl); + default: + BUG(); + } +diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c +index a0bf2c20fa61..0a6a3d6f5af7 100644 +--- a/fs/btrfs/zoned.c ++++ b/fs/btrfs/zoned.c +@@ -2176,3 +2176,43 @@ void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info, u64 logica + spin_unlock(&block_group->lock); + btrfs_put_block_group(block_group); + } ++ ++int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info) ++{ ++ struct btrfs_block_group *block_group; ++ struct btrfs_block_group *min_bg = NULL; ++ u64 min_avail = U64_MAX; ++ int ret; ++ ++ spin_lock(&fs_info->zone_active_bgs_lock); ++ list_for_each_entry(block_group, &fs_info->zone_active_bgs, ++ active_bg_list) { ++ u64 avail; ++ ++ spin_lock(&block_group->lock); ++ if (block_group->reserved || ++ (block_group->flags & BTRFS_BLOCK_GROUP_SYSTEM)) { ++ spin_unlock(&block_group->lock); ++ continue; ++ } ++ ++ avail = block_group->zone_capacity - block_group->alloc_offset; ++ if (min_avail > avail) { ++ if (min_bg) ++ btrfs_put_block_group(min_bg); ++ min_bg = block_group; ++ min_avail = avail; ++ btrfs_get_block_group(min_bg); ++ } ++ spin_unlock(&block_group->lock); ++ } ++ spin_unlock(&fs_info->zone_active_bgs_lock); ++ ++ if (!min_bg) ++ return 0; ++ ++ ret = btrfs_zone_finish(min_bg); ++ btrfs_put_block_group(min_bg); ++ ++ return ret < 0 ? ret : 1; ++} +diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h +index 2d6da8f4b55a..c424417e19bb 100644 +--- a/fs/btrfs/zoned.h ++++ b/fs/btrfs/zoned.h +@@ -83,6 +83,7 @@ void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg); + void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info); + void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info, u64 logical, + u64 length); ++int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info); + #else /* CONFIG_BLK_DEV_ZONED */ + static inline int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos, + struct blk_zone *zone) +@@ -247,6 +248,12 @@ static inline void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info) { } + + static inline void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info, + u64 logical, u64 length) { } ++ ++static inline int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info) ++{ ++ return 1; ++} ++ + #endif + + static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos) +-- +2.35.1 + diff --git a/queue-5.18/btrfs-zoned-introduce-btrfs_zoned_bg_is_full.patch b/queue-5.18/btrfs-zoned-introduce-btrfs_zoned_bg_is_full.patch new file mode 100644 index 00000000000..e8cd2141b72 --- /dev/null +++ b/queue-5.18/btrfs-zoned-introduce-btrfs_zoned_bg_is_full.patch @@ -0,0 +1,68 @@ +From 3672acc7c01efcf0c957c7949f4ff6dcaffde3f1 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 3 May 2022 17:48:50 -0700 +Subject: btrfs: zoned: introduce btrfs_zoned_bg_is_full + +From: Naohiro Aota + +[ Upstream commit 1bfd476754a2d63f899ef9c3e253b17766b8fb73 ] + +Introduce a wrapper to check if all the space in a block group is +allocated or not. 
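+
+Typical use of the new wrapper, replacing the open-coded comparison (as in
+the do_allocation_zoned() hunk below):
+
+	/* Check RO and no space case before trying to activate it */
+	if (block_group->ro || btrfs_zoned_bg_is_full(block_group))
+		ret = 1;	/* nothing left to allocate from this block group */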
+ +Reviewed-by: Johannes Thumshirn +Signed-off-by: Naohiro Aota +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Sasha Levin +--- + fs/btrfs/extent-tree.c | 3 +-- + fs/btrfs/zoned.c | 2 +- + fs/btrfs/zoned.h | 6 ++++++ + 3 files changed, 8 insertions(+), 3 deletions(-) + +diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c +index bdebd77f31b4..56185541e188 100644 +--- a/fs/btrfs/extent-tree.c ++++ b/fs/btrfs/extent-tree.c +@@ -3803,8 +3803,7 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group, + + /* Check RO and no space case before trying to activate it */ + spin_lock(&block_group->lock); +- if (block_group->ro || +- block_group->alloc_offset == block_group->zone_capacity) { ++ if (block_group->ro || btrfs_zoned_bg_is_full(block_group)) { + ret = 1; + /* + * May need to clear fs_info->{treelog,data_reloc}_bg. +diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c +index 0a6a3d6f5af7..170681797283 100644 +--- a/fs/btrfs/zoned.c ++++ b/fs/btrfs/zoned.c +@@ -1859,7 +1859,7 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group) + } + + /* No space left */ +- if (block_group->alloc_offset == block_group->zone_capacity) { ++ if (btrfs_zoned_bg_is_full(block_group)) { + ret = false; + goto out_unlock; + } +diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h +index 199b69670fa2..0740458894ac 100644 +--- a/fs/btrfs/zoned.h ++++ b/fs/btrfs/zoned.h +@@ -384,4 +384,10 @@ static inline void btrfs_zoned_data_reloc_unlock(struct btrfs_inode *inode) + mutex_unlock(&root->fs_info->zoned_data_reloc_io_lock); + } + ++static inline bool btrfs_zoned_bg_is_full(const struct btrfs_block_group *bg) ++{ ++ ASSERT(btrfs_is_zoned(bg->fs_info)); ++ return (bg->alloc_offset == bg->zone_capacity); ++} ++ + #endif +-- +2.35.1 + diff --git a/queue-5.18/btrfs-zoned-introduce-space_info-active_total_bytes.patch b/queue-5.18/btrfs-zoned-introduce-space_info-active_total_bytes.patch new file mode 100644 index 00000000000..d89dee78bf7 --- /dev/null +++ b/queue-5.18/btrfs-zoned-introduce-space_info-active_total_bytes.patch @@ -0,0 +1,257 @@ +From 24698349d4bc8dc722b1db3ab3979252e0b38fe3 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Sat, 9 Jul 2022 08:18:45 +0900 +Subject: btrfs: zoned: introduce space_info->active_total_bytes + +From: Naohiro Aota + +[ Upstream commit 6a921de589926a350634e6e279f43fa5b9dbf5ba ] + +The active_total_bytes, like the total_bytes, accounts for the total bytes +of active block groups in the space_info. + +With an introduction of active_total_bytes, we can check if the reserved +bytes can be written to the block groups without activating a new block +group. The check is necessary for metadata allocation on zoned +filesystem. We cannot finish a block group, which may require waiting +for the current transaction, from the metadata allocation context. +Instead, we need to ensure the ongoing allocation (reserved bytes) fits +in active block groups. 
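+
+The check this enables, roughly (see the new writable_total_bytes() helper
+in the space-info.c hunks below):
+
+	/* zoned metadata may only count active block groups as writable */
+	if (!btrfs_is_zoned(fs_info) || (space_info->flags & BTRFS_BLOCK_GROUP_DATA))
+		total = space_info->total_bytes;
+	else
+		total = space_info->active_total_bytes;
+
+	if (used + bytes < total + avail)
+		return 1;	/* the reservation fits without activating a new group */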
+ +Signed-off-by: Naohiro Aota +Signed-off-by: David Sterba +Signed-off-by: Sasha Levin +--- + fs/btrfs/block-group.c | 12 +++++++++--- + fs/btrfs/space-info.c | 41 ++++++++++++++++++++++++++++++++--------- + fs/btrfs/space-info.h | 4 +++- + fs/btrfs/zoned.c | 6 ++++++ + 4 files changed, 50 insertions(+), 13 deletions(-) + +diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c +index 1deca5164c23..88f59a2e4113 100644 +--- a/fs/btrfs/block-group.c ++++ b/fs/btrfs/block-group.c +@@ -1033,8 +1033,13 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, + < block_group->zone_unusable); + WARN_ON(block_group->space_info->disk_total + < block_group->length * factor); ++ WARN_ON(block_group->zone_is_active && ++ block_group->space_info->active_total_bytes ++ < block_group->length); + } + block_group->space_info->total_bytes -= block_group->length; ++ if (block_group->zone_is_active) ++ block_group->space_info->active_total_bytes -= block_group->length; + block_group->space_info->bytes_readonly -= + (block_group->length - block_group->zone_unusable); + block_group->space_info->bytes_zone_unusable -= +@@ -2102,7 +2107,8 @@ static int read_one_block_group(struct btrfs_fs_info *info, + trace_btrfs_add_block_group(info, cache, 0); + btrfs_update_space_info(info, cache->flags, cache->length, + cache->used, cache->bytes_super, +- cache->zone_unusable, &space_info); ++ cache->zone_unusable, cache->zone_is_active, ++ &space_info); + + cache->space_info = space_info; + +@@ -2172,7 +2178,7 @@ static int fill_dummy_bgs(struct btrfs_fs_info *fs_info) + } + + btrfs_update_space_info(fs_info, bg->flags, em->len, em->len, +- 0, 0, &space_info); ++ 0, 0, false, &space_info); + bg->space_info = space_info; + link_block_group(bg); + +@@ -2553,7 +2559,7 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran + trace_btrfs_add_block_group(fs_info, cache, 1); + btrfs_update_space_info(fs_info, cache->flags, size, bytes_used, + cache->bytes_super, cache->zone_unusable, +- &cache->space_info); ++ cache->zone_is_active, &cache->space_info); + btrfs_update_global_block_rsv(fs_info); + + link_block_group(cache); +diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c +index 98a84b523be6..4867199cf983 100644 +--- a/fs/btrfs/space-info.c ++++ b/fs/btrfs/space-info.c +@@ -295,7 +295,7 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info) + void btrfs_update_space_info(struct btrfs_fs_info *info, u64 flags, + u64 total_bytes, u64 bytes_used, + u64 bytes_readonly, u64 bytes_zone_unusable, +- struct btrfs_space_info **space_info) ++ bool active, struct btrfs_space_info **space_info) + { + struct btrfs_space_info *found; + int factor; +@@ -306,6 +306,8 @@ void btrfs_update_space_info(struct btrfs_fs_info *info, u64 flags, + ASSERT(found); + spin_lock(&found->lock); + found->total_bytes += total_bytes; ++ if (active) ++ found->active_total_bytes += total_bytes; + found->disk_total += total_bytes * factor; + found->bytes_used += bytes_used; + found->disk_used += bytes_used * factor; +@@ -369,6 +371,22 @@ static u64 calc_available_free_space(struct btrfs_fs_info *fs_info, + return avail; + } + ++static inline u64 writable_total_bytes(struct btrfs_fs_info *fs_info, ++ struct btrfs_space_info *space_info) ++{ ++ /* ++ * On regular filesystem, all total_bytes are always writable. On zoned ++ * filesystem, there may be a limitation imposed by max_active_zones. ++ * For metadata allocation, we cannot finish an existing active block ++ * group to avoid a deadlock. 
Thus, we need to consider only the active ++ * groups to be writable for metadata space. ++ */ ++ if (!btrfs_is_zoned(fs_info) || (space_info->flags & BTRFS_BLOCK_GROUP_DATA)) ++ return space_info->total_bytes; ++ ++ return space_info->active_total_bytes; ++} ++ + int btrfs_can_overcommit(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, u64 bytes, + enum btrfs_reserve_flush_enum flush) +@@ -386,7 +404,7 @@ int btrfs_can_overcommit(struct btrfs_fs_info *fs_info, + else + avail = calc_available_free_space(fs_info, space_info, flush); + +- if (used + bytes < space_info->total_bytes + avail) ++ if (used + bytes < writable_total_bytes(fs_info, space_info) + avail) + return 1; + return 0; + } +@@ -422,7 +440,7 @@ void btrfs_try_granting_tickets(struct btrfs_fs_info *fs_info, + ticket = list_first_entry(head, struct reserve_ticket, list); + + /* Check and see if our ticket can be satisfied now. */ +- if ((used + ticket->bytes <= space_info->total_bytes) || ++ if ((used + ticket->bytes <= writable_total_bytes(fs_info, space_info)) || + btrfs_can_overcommit(fs_info, space_info, ticket->bytes, + flush)) { + btrfs_space_info_update_bytes_may_use(fs_info, +@@ -753,6 +771,7 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info, + { + u64 used; + u64 avail; ++ u64 total; + u64 to_reclaim = space_info->reclaim_size; + + lockdep_assert_held(&space_info->lock); +@@ -767,8 +786,9 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info, + * space. If that's the case add in our overage so we make sure to put + * appropriate pressure on the flushing state machine. + */ +- if (space_info->total_bytes + avail < used) +- to_reclaim += used - (space_info->total_bytes + avail); ++ total = writable_total_bytes(fs_info, space_info); ++ if (total + avail < used) ++ to_reclaim += used - (total + avail); + + return to_reclaim; + } +@@ -778,9 +798,12 @@ static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info, + { + u64 global_rsv_size = fs_info->global_block_rsv.reserved; + u64 ordered, delalloc; +- u64 thresh = div_factor_fine(space_info->total_bytes, 90); ++ u64 total = writable_total_bytes(fs_info, space_info); ++ u64 thresh; + u64 used; + ++ thresh = div_factor_fine(total, 90); ++ + lockdep_assert_held(&space_info->lock); + + /* If we're just plain full then async reclaim just slows us down. */ +@@ -842,8 +865,8 @@ static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info, + BTRFS_RESERVE_FLUSH_ALL); + used = space_info->bytes_used + space_info->bytes_reserved + + space_info->bytes_readonly + global_rsv_size; +- if (used < space_info->total_bytes) +- thresh += space_info->total_bytes - used; ++ if (used < total) ++ thresh += total - used; + thresh >>= space_info->clamp; + + used = space_info->bytes_pinned; +@@ -1560,7 +1583,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, + * can_overcommit() to ensure we can overcommit to continue. 
+ */ + if (!pending_tickets && +- ((used + orig_bytes <= space_info->total_bytes) || ++ ((used + orig_bytes <= writable_total_bytes(fs_info, space_info)) || + btrfs_can_overcommit(fs_info, space_info, orig_bytes, flush))) { + btrfs_space_info_update_bytes_may_use(fs_info, space_info, + orig_bytes); +diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h +index 137206b8049f..b8cee27df213 100644 +--- a/fs/btrfs/space-info.h ++++ b/fs/btrfs/space-info.h +@@ -17,6 +17,8 @@ struct btrfs_space_info { + u64 bytes_may_use; /* number of bytes that may be used for + delalloc/allocations */ + u64 bytes_readonly; /* total bytes that are read only */ ++ /* Total bytes in the space, but only accounts active block groups. */ ++ u64 active_total_bytes; + u64 bytes_zone_unusable; /* total bytes that are unusable until + resetting the device zone */ + +@@ -122,7 +124,7 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info); + void btrfs_update_space_info(struct btrfs_fs_info *info, u64 flags, + u64 total_bytes, u64 bytes_used, + u64 bytes_readonly, u64 bytes_zone_unusable, +- struct btrfs_space_info **space_info); ++ bool active, struct btrfs_space_info **space_info); + void btrfs_update_space_info_chunk_size(struct btrfs_space_info *space_info, + u64 chunk_size); + struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info, +diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c +index 170681797283..2ffc6d50d20d 100644 +--- a/fs/btrfs/zoned.c ++++ b/fs/btrfs/zoned.c +@@ -1841,6 +1841,7 @@ struct btrfs_device *btrfs_zoned_get_device(struct btrfs_fs_info *fs_info, + bool btrfs_zone_activate(struct btrfs_block_group *block_group) + { + struct btrfs_fs_info *fs_info = block_group->fs_info; ++ struct btrfs_space_info *space_info = block_group->space_info; + struct map_lookup *map; + struct btrfs_device *device; + u64 physical; +@@ -1852,6 +1853,7 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group) + + map = block_group->physical_map; + ++ spin_lock(&space_info->lock); + spin_lock(&block_group->lock); + if (block_group->zone_is_active) { + ret = true; +@@ -1880,7 +1882,10 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group) + + /* Successfully activated all the zones */ + block_group->zone_is_active = 1; ++ space_info->active_total_bytes += block_group->length; + spin_unlock(&block_group->lock); ++ btrfs_try_granting_tickets(fs_info, space_info); ++ spin_unlock(&space_info->lock); + + /* For the active block group list */ + btrfs_get_block_group(block_group); +@@ -1893,6 +1898,7 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group) + + out_unlock: + spin_unlock(&block_group->lock); ++ spin_unlock(&space_info->lock); + return ret; + } + +-- +2.35.1 + diff --git a/queue-5.18/btrfs-zoned-revive-max_zone_append_bytes.patch b/queue-5.18/btrfs-zoned-revive-max_zone_append_bytes.patch new file mode 100644 index 00000000000..641ef5e54ef --- /dev/null +++ b/queue-5.18/btrfs-zoned-revive-max_zone_append_bytes.patch @@ -0,0 +1,108 @@ +From 33903083ab3fd9bee96c3d8ade9c045e2b61060a Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Sat, 9 Jul 2022 08:18:39 +0900 +Subject: btrfs: zoned: revive max_zone_append_bytes + +From: Naohiro Aota + +[ Upstream commit c2ae7b772ef4e86c5ddf3fd47bf59045ae96a414 ] + +This patch is basically a revert of commit 5a80d1c6a270 ("btrfs: zoned: +remove max_zone_append_size logic"), but without unnecessary ASSERT and +check. 
The max_zone_append_size will be used as a hint to estimate the +number of extents to cover delalloc/writeback region in the later commits. + +The size of a ZONE APPEND bio is also limited by queue_max_segments(), so +this commit considers it to calculate max_zone_append_size. Technically, a +bio can be larger than queue_max_segments() * PAGE_SIZE if the pages are +contiguous. But, it is safe to consider "queue_max_segments() * PAGE_SIZE" +as an upper limit of an extent size to calculate the number of extents +needed to write data. + +Reviewed-by: Johannes Thumshirn +Signed-off-by: Naohiro Aota +Signed-off-by: David Sterba +Signed-off-by: Sasha Levin +--- + fs/btrfs/ctree.h | 2 ++ + fs/btrfs/zoned.c | 17 +++++++++++++++++ + fs/btrfs/zoned.h | 1 + + 3 files changed, 20 insertions(+) + +diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h +index 077c95e9baa5..1c377bcfe787 100644 +--- a/fs/btrfs/ctree.h ++++ b/fs/btrfs/ctree.h +@@ -1050,6 +1050,8 @@ struct btrfs_fs_info { + u64 zoned; + }; + ++ /* Max size to emit ZONE_APPEND write command */ ++ u64 max_zone_append_size; + struct mutex zoned_meta_io_lock; + spinlock_t treelog_bg_lock; + u64 treelog_bg; +diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c +index 84b6d39509bd..1d5b9308f5ef 100644 +--- a/fs/btrfs/zoned.c ++++ b/fs/btrfs/zoned.c +@@ -407,6 +407,16 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) + nr_sectors = bdev_nr_sectors(bdev); + zone_info->zone_size_shift = ilog2(zone_info->zone_size); + zone_info->nr_zones = nr_sectors >> ilog2(zone_sectors); ++ /* ++ * We limit max_zone_append_size also by max_segments * ++ * PAGE_SIZE. Technically, we can have multiple pages per segment. But, ++ * since btrfs adds the pages one by one to a bio, and btrfs cannot ++ * increase the metadata reservation even if it increases the number of ++ * extents, it is safe to stick with the limit. 
++ */ ++ zone_info->max_zone_append_size = ++ min_t(u64, (u64)bdev_max_zone_append_sectors(bdev) << SECTOR_SHIFT, ++ (u64)bdev_max_segments(bdev) << PAGE_SHIFT); + if (!IS_ALIGNED(nr_sectors, zone_sectors)) + zone_info->nr_zones++; + +@@ -632,6 +642,7 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) + u64 zoned_devices = 0; + u64 nr_devices = 0; + u64 zone_size = 0; ++ u64 max_zone_append_size = 0; + const bool incompat_zoned = btrfs_fs_incompat(fs_info, ZONED); + int ret = 0; + +@@ -666,6 +677,11 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) + ret = -EINVAL; + goto out; + } ++ if (!max_zone_append_size || ++ (zone_info->max_zone_append_size && ++ zone_info->max_zone_append_size < max_zone_append_size)) ++ max_zone_append_size = ++ zone_info->max_zone_append_size; + } + nr_devices++; + } +@@ -715,6 +731,7 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) + } + + fs_info->zone_size = zone_size; ++ fs_info->max_zone_append_size = max_zone_append_size; + fs_info->fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_ZONED; + + /* +diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h +index cf6320feef46..2d6da8f4b55a 100644 +--- a/fs/btrfs/zoned.h ++++ b/fs/btrfs/zoned.h +@@ -23,6 +23,7 @@ struct btrfs_zoned_device_info { + */ + u64 zone_size; + u8 zone_size_shift; ++ u64 max_zone_append_size; + u32 nr_zones; + unsigned int max_active_zones; + atomic_t active_zones_left; +-- +2.35.1 + diff --git a/queue-5.18/btrfs-zoned-wait-until-zone-is-finished-when-allocat.patch b/queue-5.18/btrfs-zoned-wait-until-zone-is-finished-when-allocat.patch new file mode 100644 index 00000000000..454c96acf20 --- /dev/null +++ b/queue-5.18/btrfs-zoned-wait-until-zone-is-finished-when-allocat.patch @@ -0,0 +1,114 @@ +From bfdbe30121dfbc58093980adb193543c02b438ce Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Sat, 9 Jul 2022 08:18:50 +0900 +Subject: btrfs: zoned: wait until zone is finished when allocation didn't + progress + +From: Naohiro Aota + +[ Upstream commit 2ce543f478433a0eec0f72090d7e814f1d53d456 ] + +When the allocated position doesn't progress, we cannot submit IOs to +finish a block group, but there should be ongoing IOs that will finish a +block group. So, in that case, we wait for a zone to be finished and retry +the allocation after that. + +Introduce a new flag BTRFS_FS_NEED_ZONE_FINISH for fs_info->flags to +indicate we need a zone finish to have proceeded. The flag is set when the +allocator detected it cannot activate a new block group. And, it is cleared +once a zone is finished. + +CC: stable@vger.kernel.org # 5.16+ +Fixes: afba2bc036b0 ("btrfs: zoned: implement active zone tracking") +Signed-off-by: Naohiro Aota +Signed-off-by: David Sterba +Signed-off-by: Sasha Levin +--- + fs/btrfs/ctree.h | 5 +++++ + fs/btrfs/disk-io.c | 1 + + fs/btrfs/inode.c | 9 +++++++-- + fs/btrfs/zoned.c | 6 ++++++ + 4 files changed, 19 insertions(+), 2 deletions(-) + +diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h +index 97f5a3d320ff..76fbe4cf2a28 100644 +--- a/fs/btrfs/ctree.h ++++ b/fs/btrfs/ctree.h +@@ -635,6 +635,9 @@ enum { + /* Indicate we have half completed snapshot deletions pending. */ + BTRFS_FS_UNFINISHED_DROPS, + ++ /* Indicate we have to finish a zone to do next allocation. 
*/ ++ BTRFS_FS_NEED_ZONE_FINISH, ++ + #if BITS_PER_LONG == 32 + /* Indicate if we have error/warn message printed on 32bit systems */ + BTRFS_FS_32BIT_ERROR, +@@ -1074,6 +1077,8 @@ struct btrfs_fs_info { + + spinlock_t zone_active_bgs_lock; + struct list_head zone_active_bgs; ++ /* Waiters when BTRFS_FS_NEED_ZONE_FINISH is set */ ++ wait_queue_head_t zone_finish_wait; + + #ifdef CONFIG_BTRFS_FS_REF_VERIFY + spinlock_t ref_verify_lock; +diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c +index bf5c6ac67e87..59fa7bf3a2e5 100644 +--- a/fs/btrfs/disk-io.c ++++ b/fs/btrfs/disk-io.c +@@ -3239,6 +3239,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) + init_waitqueue_head(&fs_info->transaction_blocked_wait); + init_waitqueue_head(&fs_info->async_submit_wait); + init_waitqueue_head(&fs_info->delayed_iputs_wait); ++ init_waitqueue_head(&fs_info->zone_finish_wait); + + /* Usable values until the real ones are cached from the superblock */ + fs_info->nodesize = 4096; +diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c +index 9753fc47e488..64d310ecbb84 100644 +--- a/fs/btrfs/inode.c ++++ b/fs/btrfs/inode.c +@@ -1606,8 +1606,13 @@ static noinline int run_delalloc_zoned(struct btrfs_inode *inode, + if (ret == 0) + done_offset = end; + +- if (done_offset == start) +- return -ENOSPC; ++ if (done_offset == start) { ++ struct btrfs_fs_info *info = inode->root->fs_info; ++ ++ wait_var_event(&info->zone_finish_wait, ++ !test_bit(BTRFS_FS_NEED_ZONE_FINISH, &info->flags)); ++ continue; ++ } + + if (!locked_page_done) { + __set_page_dirty_nobuffers(locked_page); +diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c +index 0c2d81b0e3d3..45e29b8c705c 100644 +--- a/fs/btrfs/zoned.c ++++ b/fs/btrfs/zoned.c +@@ -1993,6 +1993,9 @@ int btrfs_zone_finish(struct btrfs_block_group *block_group) + /* For active_bg_list */ + btrfs_put_block_group(block_group); + ++ clear_bit(BTRFS_FS_NEED_ZONE_FINISH, &fs_info->flags); ++ wake_up_all(&fs_info->zone_finish_wait); ++ + return 0; + } + +@@ -2021,6 +2024,9 @@ bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags) + } + mutex_unlock(&fs_info->chunk_mutex); + ++ if (!ret) ++ set_bit(BTRFS_FS_NEED_ZONE_FINISH, &fs_info->flags); ++ + return ret; + } + +-- +2.35.1 + diff --git a/queue-5.18/btrfs-zoned-write-out-partially-allocated-region.patch b/queue-5.18/btrfs-zoned-write-out-partially-allocated-region.patch new file mode 100644 index 00000000000..30718fdaa61 --- /dev/null +++ b/queue-5.18/btrfs-zoned-write-out-partially-allocated-region.patch @@ -0,0 +1,186 @@ +From e3d143c47bd2fd49137af62c599571f2ab7620d6 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Sat, 9 Jul 2022 08:18:49 +0900 +Subject: btrfs: zoned: write out partially allocated region + +From: Naohiro Aota + +[ Upstream commit 898793d992c23dac6126a6a94ad893eae1a2c9df ] + +cow_file_range() works in an all-or-nothing way: if it fails to allocate an +extent for a part of the given region, it gives up all the region including +the successfully allocated parts. On cow_file_range(), run_delalloc_zoned() +writes data for the region only when it successfully allocate all the +region. + +This all-or-nothing allocation and write-out are problematic when available +space in all the block groups are get tight with the active zone +restriction. btrfs_reserve_extent() try hard to utilize the left space in +the active block groups and gives up finally and fails with +-ENOSPC. 
However, if we send IOs for the successfully allocated region, we +can finish a zone and can continue on the rest of the allocation on a newly +allocated block group. + +This patch implements the partial write-out for run_delalloc_zoned(). With +this patch applied, cow_file_range() returns -EAGAIN to tell the caller to +do something to progress the further allocation, and tells the successfully +allocated region with done_offset. Furthermore, the zoned extent allocator +returns -EAGAIN to tell cow_file_range() going back to the caller side. + +Actually, we still need to wait for an IO to complete to continue the +allocation. The next patch implements that part. + +CC: stable@vger.kernel.org # 5.16+ +Fixes: afba2bc036b0 ("btrfs: zoned: implement active zone tracking") +Signed-off-by: Naohiro Aota +Signed-off-by: David Sterba +Signed-off-by: Sasha Levin +--- + fs/btrfs/extent-tree.c | 10 +++++++ + fs/btrfs/inode.c | 63 ++++++++++++++++++++++++++++++++---------- + 2 files changed, 59 insertions(+), 14 deletions(-) + +diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c +index 56185541e188..eee68a6f2be7 100644 +--- a/fs/btrfs/extent-tree.c ++++ b/fs/btrfs/extent-tree.c +@@ -4015,6 +4015,16 @@ static int can_allocate_chunk_zoned(struct btrfs_fs_info *fs_info, + if (ffe_ctl->max_extent_size >= ffe_ctl->min_alloc_size) + return -ENOSPC; + ++ /* ++ * Even min_alloc_size is not left in any block groups. Since we cannot ++ * activate a new block group, allocating it may not help. Let's tell a ++ * caller to try again and hope it progress something by writing some ++ * parts of the region. That is only possible for data block groups, ++ * where a part of the region can be written. ++ */ ++ if (ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA) ++ return -EAGAIN; ++ + /* + * We cannot activate a new block group and no enough space left in any + * block groups. So, allocating a new block group may not help. But, +diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c +index c50288d90c66..9753fc47e488 100644 +--- a/fs/btrfs/inode.c ++++ b/fs/btrfs/inode.c +@@ -92,7 +92,8 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent); + static noinline int cow_file_range(struct btrfs_inode *inode, + struct page *locked_page, + u64 start, u64 end, int *page_started, +- unsigned long *nr_written, int unlock); ++ unsigned long *nr_written, int unlock, ++ u64 *done_offset); + static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start, + u64 len, u64 orig_start, u64 block_start, + u64 block_len, u64 orig_block_len, +@@ -884,7 +885,7 @@ static int submit_uncompressed_range(struct btrfs_inode *inode, + * can directly submit them without interruption. 
+ */ + ret = cow_file_range(inode, locked_page, start, end, &page_started, +- &nr_written, 0); ++ &nr_written, 0, NULL); + /* Inline extent inserted, page gets unlocked and everything is done */ + if (page_started) { + ret = 0; +@@ -1133,7 +1134,8 @@ static u64 get_extent_allocation_hint(struct btrfs_inode *inode, u64 start, + static noinline int cow_file_range(struct btrfs_inode *inode, + struct page *locked_page, + u64 start, u64 end, int *page_started, +- unsigned long *nr_written, int unlock) ++ unsigned long *nr_written, int unlock, ++ u64 *done_offset) + { + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; +@@ -1326,6 +1328,21 @@ static noinline int cow_file_range(struct btrfs_inode *inode, + btrfs_dec_block_group_reservations(fs_info, ins.objectid); + btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1); + out_unlock: ++ /* ++ * If done_offset is non-NULL and ret == -EAGAIN, we expect the ++ * caller to write out the successfully allocated region and retry. ++ */ ++ if (done_offset && ret == -EAGAIN) { ++ if (orig_start < start) ++ *done_offset = start - 1; ++ else ++ *done_offset = start; ++ return ret; ++ } else if (ret == -EAGAIN) { ++ /* Convert to -ENOSPC since the caller cannot retry. */ ++ ret = -ENOSPC; ++ } ++ + /* + * Now, we have three regions to clean up: + * +@@ -1571,19 +1588,37 @@ static noinline int run_delalloc_zoned(struct btrfs_inode *inode, + u64 end, int *page_started, + unsigned long *nr_written) + { ++ u64 done_offset = end; + int ret; ++ bool locked_page_done = false; + +- ret = cow_file_range(inode, locked_page, start, end, page_started, +- nr_written, 0); +- if (ret) +- return ret; ++ while (start <= end) { ++ ret = cow_file_range(inode, locked_page, start, end, page_started, ++ nr_written, 0, &done_offset); ++ if (ret && ret != -EAGAIN) ++ return ret; + +- if (*page_started) +- return 0; ++ if (*page_started) { ++ ASSERT(ret == 0); ++ return 0; ++ } ++ ++ if (ret == 0) ++ done_offset = end; ++ ++ if (done_offset == start) ++ return -ENOSPC; ++ ++ if (!locked_page_done) { ++ __set_page_dirty_nobuffers(locked_page); ++ account_page_redirty(locked_page); ++ } ++ locked_page_done = true; ++ extent_write_locked_range(&inode->vfs_inode, start, done_offset); ++ ++ start = done_offset + 1; ++ } + +- __set_page_dirty_nobuffers(locked_page); +- account_page_redirty(locked_page); +- extent_write_locked_range(&inode->vfs_inode, start, end); + *page_started = 1; + + return 0; +@@ -1675,7 +1710,7 @@ static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page, + } + + return cow_file_range(inode, locked_page, start, end, page_started, +- nr_written, 1); ++ nr_written, 1, NULL); + } + + /* +@@ -2086,7 +2121,7 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page + page_started, nr_written); + else + ret = cow_file_range(inode, locked_page, start, end, +- page_started, nr_written, 1); ++ page_started, nr_written, 1, NULL); + } else { + set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &inode->runtime_flags); + ret = cow_file_range_async(inode, wbc, locked_page, start, end, +-- +2.35.1 + diff --git a/queue-5.18/crypto-blake2s-remove-shash-module.patch b/queue-5.18/crypto-blake2s-remove-shash-module.patch new file mode 100644 index 00000000000..8f4b10aadd4 --- /dev/null +++ b/queue-5.18/crypto-blake2s-remove-shash-module.patch @@ -0,0 +1,957 @@ +From 808c1dca59fc32bd267c25d387bbc56a55144ccb Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Sat, 28 May 2022 21:44:07 +0200 +Subject: 
crypto: blake2s - remove shash module + +From: Jason A. Donenfeld + +[ Upstream commit 2d16803c562ecc644803d42ba98a8e0aef9c014e ] + +BLAKE2s has no currently known use as an shash. Just remove all of this +unnecessary plumbing. Removing this shash was something we talked about +back when we were making BLAKE2s a built-in, but I simply never got +around to doing it. So this completes that project. + +Importantly, this fixs a bug in which the lib code depends on +crypto_simd_disabled_for_test, causing linker errors. + +Also add more alignment tests to the selftests and compare SIMD and +non-SIMD compression functions, to make up for what we lose from +testmgr.c. + +Reported-by: gaochao +Cc: Eric Biggers +Cc: Ard Biesheuvel +Cc: stable@vger.kernel.org +Fixes: 6048fdcc5f26 ("lib/crypto: blake2s: include as built-in") +Signed-off-by: Jason A. Donenfeld +Signed-off-by: Herbert Xu +Signed-off-by: Sasha Levin +--- + arch/arm/crypto/Kconfig | 2 +- + arch/arm/crypto/Makefile | 4 +- + arch/arm/crypto/blake2s-shash.c | 75 ----------- + arch/x86/crypto/Makefile | 4 +- + arch/x86/crypto/blake2s-glue.c | 3 +- + arch/x86/crypto/blake2s-shash.c | 77 ----------- + crypto/Kconfig | 20 +-- + crypto/Makefile | 1 - + crypto/blake2s_generic.c | 75 ----------- + crypto/tcrypt.c | 12 -- + crypto/testmgr.c | 24 ---- + crypto/testmgr.h | 217 ------------------------------ + include/crypto/internal/blake2s.h | 108 --------------- + lib/crypto/blake2s-selftest.c | 41 ++++++ + lib/crypto/blake2s.c | 37 ++++- + 15 files changed, 76 insertions(+), 624 deletions(-) + delete mode 100644 arch/arm/crypto/blake2s-shash.c + delete mode 100644 arch/x86/crypto/blake2s-shash.c + delete mode 100644 crypto/blake2s_generic.c + +diff --git a/arch/arm/crypto/Kconfig b/arch/arm/crypto/Kconfig +index e4dba5461cb3..149a5bd6b88c 100644 +--- a/arch/arm/crypto/Kconfig ++++ b/arch/arm/crypto/Kconfig +@@ -63,7 +63,7 @@ config CRYPTO_SHA512_ARM + using optimized ARM assembler and NEON, when available. + + config CRYPTO_BLAKE2S_ARM +- tristate "BLAKE2s digest algorithm (ARM)" ++ bool "BLAKE2s digest algorithm (ARM)" + select CRYPTO_ARCH_HAVE_LIB_BLAKE2S + help + BLAKE2s digest algorithm optimized with ARM scalar instructions. 
This +diff --git a/arch/arm/crypto/Makefile b/arch/arm/crypto/Makefile +index 0274f81cc8ea..971e74546fb1 100644 +--- a/arch/arm/crypto/Makefile ++++ b/arch/arm/crypto/Makefile +@@ -9,8 +9,7 @@ obj-$(CONFIG_CRYPTO_SHA1_ARM) += sha1-arm.o + obj-$(CONFIG_CRYPTO_SHA1_ARM_NEON) += sha1-arm-neon.o + obj-$(CONFIG_CRYPTO_SHA256_ARM) += sha256-arm.o + obj-$(CONFIG_CRYPTO_SHA512_ARM) += sha512-arm.o +-obj-$(CONFIG_CRYPTO_BLAKE2S_ARM) += blake2s-arm.o +-obj-$(if $(CONFIG_CRYPTO_BLAKE2S_ARM),y) += libblake2s-arm.o ++obj-$(CONFIG_CRYPTO_BLAKE2S_ARM) += libblake2s-arm.o + obj-$(CONFIG_CRYPTO_BLAKE2B_NEON) += blake2b-neon.o + obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha-neon.o + obj-$(CONFIG_CRYPTO_POLY1305_ARM) += poly1305-arm.o +@@ -32,7 +31,6 @@ sha256-arm-neon-$(CONFIG_KERNEL_MODE_NEON) := sha256_neon_glue.o + sha256-arm-y := sha256-core.o sha256_glue.o $(sha256-arm-neon-y) + sha512-arm-neon-$(CONFIG_KERNEL_MODE_NEON) := sha512-neon-glue.o + sha512-arm-y := sha512-core.o sha512-glue.o $(sha512-arm-neon-y) +-blake2s-arm-y := blake2s-shash.o + libblake2s-arm-y:= blake2s-core.o blake2s-glue.o + blake2b-neon-y := blake2b-neon-core.o blake2b-neon-glue.o + sha1-arm-ce-y := sha1-ce-core.o sha1-ce-glue.o +diff --git a/arch/arm/crypto/blake2s-shash.c b/arch/arm/crypto/blake2s-shash.c +deleted file mode 100644 +index 763c73beea2d..000000000000 +--- a/arch/arm/crypto/blake2s-shash.c ++++ /dev/null +@@ -1,75 +0,0 @@ +-// SPDX-License-Identifier: GPL-2.0-or-later +-/* +- * BLAKE2s digest algorithm, ARM scalar implementation +- * +- * Copyright 2020 Google LLC +- */ +- +-#include +-#include +- +-#include +- +-static int crypto_blake2s_update_arm(struct shash_desc *desc, +- const u8 *in, unsigned int inlen) +-{ +- return crypto_blake2s_update(desc, in, inlen, false); +-} +- +-static int crypto_blake2s_final_arm(struct shash_desc *desc, u8 *out) +-{ +- return crypto_blake2s_final(desc, out, false); +-} +- +-#define BLAKE2S_ALG(name, driver_name, digest_size) \ +- { \ +- .base.cra_name = name, \ +- .base.cra_driver_name = driver_name, \ +- .base.cra_priority = 200, \ +- .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY, \ +- .base.cra_blocksize = BLAKE2S_BLOCK_SIZE, \ +- .base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx), \ +- .base.cra_module = THIS_MODULE, \ +- .digestsize = digest_size, \ +- .setkey = crypto_blake2s_setkey, \ +- .init = crypto_blake2s_init, \ +- .update = crypto_blake2s_update_arm, \ +- .final = crypto_blake2s_final_arm, \ +- .descsize = sizeof(struct blake2s_state), \ +- } +- +-static struct shash_alg blake2s_arm_algs[] = { +- BLAKE2S_ALG("blake2s-128", "blake2s-128-arm", BLAKE2S_128_HASH_SIZE), +- BLAKE2S_ALG("blake2s-160", "blake2s-160-arm", BLAKE2S_160_HASH_SIZE), +- BLAKE2S_ALG("blake2s-224", "blake2s-224-arm", BLAKE2S_224_HASH_SIZE), +- BLAKE2S_ALG("blake2s-256", "blake2s-256-arm", BLAKE2S_256_HASH_SIZE), +-}; +- +-static int __init blake2s_arm_mod_init(void) +-{ +- return IS_REACHABLE(CONFIG_CRYPTO_HASH) ? 
+- crypto_register_shashes(blake2s_arm_algs, +- ARRAY_SIZE(blake2s_arm_algs)) : 0; +-} +- +-static void __exit blake2s_arm_mod_exit(void) +-{ +- if (IS_REACHABLE(CONFIG_CRYPTO_HASH)) +- crypto_unregister_shashes(blake2s_arm_algs, +- ARRAY_SIZE(blake2s_arm_algs)); +-} +- +-module_init(blake2s_arm_mod_init); +-module_exit(blake2s_arm_mod_exit); +- +-MODULE_DESCRIPTION("BLAKE2s digest algorithm, ARM scalar implementation"); +-MODULE_LICENSE("GPL"); +-MODULE_AUTHOR("Eric Biggers "); +-MODULE_ALIAS_CRYPTO("blake2s-128"); +-MODULE_ALIAS_CRYPTO("blake2s-128-arm"); +-MODULE_ALIAS_CRYPTO("blake2s-160"); +-MODULE_ALIAS_CRYPTO("blake2s-160-arm"); +-MODULE_ALIAS_CRYPTO("blake2s-224"); +-MODULE_ALIAS_CRYPTO("blake2s-224-arm"); +-MODULE_ALIAS_CRYPTO("blake2s-256"); +-MODULE_ALIAS_CRYPTO("blake2s-256-arm"); +diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile +index 2831685adf6f..8ed4597fdf6a 100644 +--- a/arch/x86/crypto/Makefile ++++ b/arch/x86/crypto/Makefile +@@ -61,9 +61,7 @@ sha256-ssse3-$(CONFIG_AS_SHA256_NI) += sha256_ni_asm.o + obj-$(CONFIG_CRYPTO_SHA512_SSSE3) += sha512-ssse3.o + sha512-ssse3-y := sha512-ssse3-asm.o sha512-avx-asm.o sha512-avx2-asm.o sha512_ssse3_glue.o + +-obj-$(CONFIG_CRYPTO_BLAKE2S_X86) += blake2s-x86_64.o +-blake2s-x86_64-y := blake2s-shash.o +-obj-$(if $(CONFIG_CRYPTO_BLAKE2S_X86),y) += libblake2s-x86_64.o ++obj-$(CONFIG_CRYPTO_BLAKE2S_X86) += libblake2s-x86_64.o + libblake2s-x86_64-y := blake2s-core.o blake2s-glue.o + + obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o +diff --git a/arch/x86/crypto/blake2s-glue.c b/arch/x86/crypto/blake2s-glue.c +index 69853c13e8fb..aaba21230528 100644 +--- a/arch/x86/crypto/blake2s-glue.c ++++ b/arch/x86/crypto/blake2s-glue.c +@@ -4,7 +4,6 @@ + */ + + #include +-#include + + #include + #include +@@ -33,7 +32,7 @@ void blake2s_compress(struct blake2s_state *state, const u8 *block, + /* SIMD disables preemption, so relax after processing each page. */ + BUILD_BUG_ON(SZ_4K / BLAKE2S_BLOCK_SIZE < 8); + +- if (!static_branch_likely(&blake2s_use_ssse3) || !crypto_simd_usable()) { ++ if (!static_branch_likely(&blake2s_use_ssse3) || !may_use_simd()) { + blake2s_compress_generic(state, block, nblocks, inc); + return; + } +diff --git a/arch/x86/crypto/blake2s-shash.c b/arch/x86/crypto/blake2s-shash.c +deleted file mode 100644 +index 59ae28abe35c..000000000000 +--- a/arch/x86/crypto/blake2s-shash.c ++++ /dev/null +@@ -1,77 +0,0 @@ +-// SPDX-License-Identifier: GPL-2.0 OR MIT +-/* +- * Copyright (C) 2015-2019 Jason A. Donenfeld . All Rights Reserved. 
+- */ +- +-#include +-#include +-#include +- +-#include +-#include +-#include +-#include +- +-#include +-#include +- +-static int crypto_blake2s_update_x86(struct shash_desc *desc, +- const u8 *in, unsigned int inlen) +-{ +- return crypto_blake2s_update(desc, in, inlen, false); +-} +- +-static int crypto_blake2s_final_x86(struct shash_desc *desc, u8 *out) +-{ +- return crypto_blake2s_final(desc, out, false); +-} +- +-#define BLAKE2S_ALG(name, driver_name, digest_size) \ +- { \ +- .base.cra_name = name, \ +- .base.cra_driver_name = driver_name, \ +- .base.cra_priority = 200, \ +- .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY, \ +- .base.cra_blocksize = BLAKE2S_BLOCK_SIZE, \ +- .base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx), \ +- .base.cra_module = THIS_MODULE, \ +- .digestsize = digest_size, \ +- .setkey = crypto_blake2s_setkey, \ +- .init = crypto_blake2s_init, \ +- .update = crypto_blake2s_update_x86, \ +- .final = crypto_blake2s_final_x86, \ +- .descsize = sizeof(struct blake2s_state), \ +- } +- +-static struct shash_alg blake2s_algs[] = { +- BLAKE2S_ALG("blake2s-128", "blake2s-128-x86", BLAKE2S_128_HASH_SIZE), +- BLAKE2S_ALG("blake2s-160", "blake2s-160-x86", BLAKE2S_160_HASH_SIZE), +- BLAKE2S_ALG("blake2s-224", "blake2s-224-x86", BLAKE2S_224_HASH_SIZE), +- BLAKE2S_ALG("blake2s-256", "blake2s-256-x86", BLAKE2S_256_HASH_SIZE), +-}; +- +-static int __init blake2s_mod_init(void) +-{ +- if (IS_REACHABLE(CONFIG_CRYPTO_HASH) && boot_cpu_has(X86_FEATURE_SSSE3)) +- return crypto_register_shashes(blake2s_algs, ARRAY_SIZE(blake2s_algs)); +- return 0; +-} +- +-static void __exit blake2s_mod_exit(void) +-{ +- if (IS_REACHABLE(CONFIG_CRYPTO_HASH) && boot_cpu_has(X86_FEATURE_SSSE3)) +- crypto_unregister_shashes(blake2s_algs, ARRAY_SIZE(blake2s_algs)); +-} +- +-module_init(blake2s_mod_init); +-module_exit(blake2s_mod_exit); +- +-MODULE_ALIAS_CRYPTO("blake2s-128"); +-MODULE_ALIAS_CRYPTO("blake2s-128-x86"); +-MODULE_ALIAS_CRYPTO("blake2s-160"); +-MODULE_ALIAS_CRYPTO("blake2s-160-x86"); +-MODULE_ALIAS_CRYPTO("blake2s-224"); +-MODULE_ALIAS_CRYPTO("blake2s-224-x86"); +-MODULE_ALIAS_CRYPTO("blake2s-256"); +-MODULE_ALIAS_CRYPTO("blake2s-256-x86"); +-MODULE_LICENSE("GPL v2"); +diff --git a/crypto/Kconfig b/crypto/Kconfig +index b4e00a7a046b..38601a072b99 100644 +--- a/crypto/Kconfig ++++ b/crypto/Kconfig +@@ -692,26 +692,8 @@ config CRYPTO_BLAKE2B + + See https://blake2.net for further information. + +-config CRYPTO_BLAKE2S +- tristate "BLAKE2s digest algorithm" +- select CRYPTO_LIB_BLAKE2S_GENERIC +- select CRYPTO_HASH +- help +- Implementation of cryptographic hash function BLAKE2s +- optimized for 8-32bit platforms and can produce digests of any size +- between 1 to 32. The keyed hash is also implemented. +- +- This module provides the following algorithms: +- +- - blake2s-128 +- - blake2s-160 +- - blake2s-224 +- - blake2s-256 +- +- See https://blake2.net for further information. 
+- + config CRYPTO_BLAKE2S_X86 +- tristate "BLAKE2s digest algorithm (x86 accelerated version)" ++ bool "BLAKE2s digest algorithm (x86 accelerated version)" + depends on X86 && 64BIT + select CRYPTO_LIB_BLAKE2S_GENERIC + select CRYPTO_ARCH_HAVE_LIB_BLAKE2S +diff --git a/crypto/Makefile b/crypto/Makefile +index a40e6d5fb2c8..dbfa53567c92 100644 +--- a/crypto/Makefile ++++ b/crypto/Makefile +@@ -83,7 +83,6 @@ obj-$(CONFIG_CRYPTO_STREEBOG) += streebog_generic.o + obj-$(CONFIG_CRYPTO_WP512) += wp512.o + CFLAGS_wp512.o := $(call cc-option,-fno-schedule-insns) # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=79149 + obj-$(CONFIG_CRYPTO_BLAKE2B) += blake2b_generic.o +-obj-$(CONFIG_CRYPTO_BLAKE2S) += blake2s_generic.o + obj-$(CONFIG_CRYPTO_GF128MUL) += gf128mul.o + obj-$(CONFIG_CRYPTO_ECB) += ecb.o + obj-$(CONFIG_CRYPTO_CBC) += cbc.o +diff --git a/crypto/blake2s_generic.c b/crypto/blake2s_generic.c +deleted file mode 100644 +index 5f96a21f8788..000000000000 +--- a/crypto/blake2s_generic.c ++++ /dev/null +@@ -1,75 +0,0 @@ +-// SPDX-License-Identifier: GPL-2.0 OR MIT +-/* +- * shash interface to the generic implementation of BLAKE2s +- * +- * Copyright (C) 2015-2019 Jason A. Donenfeld . All Rights Reserved. +- */ +- +-#include +-#include +- +-#include +-#include +-#include +- +-static int crypto_blake2s_update_generic(struct shash_desc *desc, +- const u8 *in, unsigned int inlen) +-{ +- return crypto_blake2s_update(desc, in, inlen, true); +-} +- +-static int crypto_blake2s_final_generic(struct shash_desc *desc, u8 *out) +-{ +- return crypto_blake2s_final(desc, out, true); +-} +- +-#define BLAKE2S_ALG(name, driver_name, digest_size) \ +- { \ +- .base.cra_name = name, \ +- .base.cra_driver_name = driver_name, \ +- .base.cra_priority = 100, \ +- .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY, \ +- .base.cra_blocksize = BLAKE2S_BLOCK_SIZE, \ +- .base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx), \ +- .base.cra_module = THIS_MODULE, \ +- .digestsize = digest_size, \ +- .setkey = crypto_blake2s_setkey, \ +- .init = crypto_blake2s_init, \ +- .update = crypto_blake2s_update_generic, \ +- .final = crypto_blake2s_final_generic, \ +- .descsize = sizeof(struct blake2s_state), \ +- } +- +-static struct shash_alg blake2s_algs[] = { +- BLAKE2S_ALG("blake2s-128", "blake2s-128-generic", +- BLAKE2S_128_HASH_SIZE), +- BLAKE2S_ALG("blake2s-160", "blake2s-160-generic", +- BLAKE2S_160_HASH_SIZE), +- BLAKE2S_ALG("blake2s-224", "blake2s-224-generic", +- BLAKE2S_224_HASH_SIZE), +- BLAKE2S_ALG("blake2s-256", "blake2s-256-generic", +- BLAKE2S_256_HASH_SIZE), +-}; +- +-static int __init blake2s_mod_init(void) +-{ +- return crypto_register_shashes(blake2s_algs, ARRAY_SIZE(blake2s_algs)); +-} +- +-static void __exit blake2s_mod_exit(void) +-{ +- crypto_unregister_shashes(blake2s_algs, ARRAY_SIZE(blake2s_algs)); +-} +- +-subsys_initcall(blake2s_mod_init); +-module_exit(blake2s_mod_exit); +- +-MODULE_ALIAS_CRYPTO("blake2s-128"); +-MODULE_ALIAS_CRYPTO("blake2s-128-generic"); +-MODULE_ALIAS_CRYPTO("blake2s-160"); +-MODULE_ALIAS_CRYPTO("blake2s-160-generic"); +-MODULE_ALIAS_CRYPTO("blake2s-224"); +-MODULE_ALIAS_CRYPTO("blake2s-224-generic"); +-MODULE_ALIAS_CRYPTO("blake2s-256"); +-MODULE_ALIAS_CRYPTO("blake2s-256-generic"); +-MODULE_LICENSE("GPL v2"); +diff --git a/crypto/tcrypt.c b/crypto/tcrypt.c +index 2bacf8384f59..66b7ca1ccb23 100644 +--- a/crypto/tcrypt.c ++++ b/crypto/tcrypt.c +@@ -1669,10 +1669,6 @@ static int do_test(const char *alg, u32 type, u32 mask, int m, u32 num_mb) + ret += tcrypt_test("rmd160"); + break; + +- case 41: +- ret 
+= tcrypt_test("blake2s-256"); +- break; +- + case 42: + ret += tcrypt_test("blake2b-512"); + break; +@@ -2240,10 +2236,6 @@ static int do_test(const char *alg, u32 type, u32 mask, int m, u32 num_mb) + test_hash_speed("rmd160", sec, generic_hash_speed_template); + if (mode > 300 && mode < 400) break; + fallthrough; +- case 316: +- test_hash_speed("blake2s-256", sec, generic_hash_speed_template); +- if (mode > 300 && mode < 400) break; +- fallthrough; + case 317: + test_hash_speed("blake2b-512", sec, generic_hash_speed_template); + if (mode > 300 && mode < 400) break; +@@ -2352,10 +2344,6 @@ static int do_test(const char *alg, u32 type, u32 mask, int m, u32 num_mb) + test_ahash_speed("rmd160", sec, generic_hash_speed_template); + if (mode > 400 && mode < 500) break; + fallthrough; +- case 416: +- test_ahash_speed("blake2s-256", sec, generic_hash_speed_template); +- if (mode > 400 && mode < 500) break; +- fallthrough; + case 417: + test_ahash_speed("blake2b-512", sec, generic_hash_speed_template); + if (mode > 400 && mode < 500) break; +diff --git a/crypto/testmgr.c b/crypto/testmgr.c +index 4948201065cc..56facdb63843 100644 +--- a/crypto/testmgr.c ++++ b/crypto/testmgr.c +@@ -4324,30 +4324,6 @@ static const struct alg_test_desc alg_test_descs[] = { + .suite = { + .hash = __VECS(blake2b_512_tv_template) + } +- }, { +- .alg = "blake2s-128", +- .test = alg_test_hash, +- .suite = { +- .hash = __VECS(blakes2s_128_tv_template) +- } +- }, { +- .alg = "blake2s-160", +- .test = alg_test_hash, +- .suite = { +- .hash = __VECS(blakes2s_160_tv_template) +- } +- }, { +- .alg = "blake2s-224", +- .test = alg_test_hash, +- .suite = { +- .hash = __VECS(blakes2s_224_tv_template) +- } +- }, { +- .alg = "blake2s-256", +- .test = alg_test_hash, +- .suite = { +- .hash = __VECS(blakes2s_256_tv_template) +- } + }, { + .alg = "cbc(aes)", + .test = alg_test_skcipher, +diff --git a/crypto/testmgr.h b/crypto/testmgr.h +index 4d7449fc6a65..c29658337d96 100644 +--- a/crypto/testmgr.h ++++ b/crypto/testmgr.h +@@ -34034,221 +34034,4 @@ static const struct hash_testvec blake2b_512_tv_template[] = {{ + 0xae, 0x15, 0x81, 0x15, 0xd0, 0x88, 0xa0, 0x3c, }, + }}; + +-static const struct hash_testvec blakes2s_128_tv_template[] = {{ +- .digest = (u8[]){ 0x64, 0x55, 0x0d, 0x6f, 0xfe, 0x2c, 0x0a, 0x01, +- 0xa1, 0x4a, 0xba, 0x1e, 0xad, 0xe0, 0x20, 0x0c, }, +-}, { +- .plaintext = blake2_ordered_sequence, +- .psize = 64, +- .digest = (u8[]){ 0xdc, 0x66, 0xca, 0x8f, 0x03, 0x86, 0x58, 0x01, +- 0xb0, 0xff, 0xe0, 0x6e, 0xd8, 0xa1, 0xa9, 0x0e, }, +-}, { +- .ksize = 16, +- .key = blake2_ordered_sequence, +- .plaintext = blake2_ordered_sequence, +- .psize = 1, +- .digest = (u8[]){ 0x88, 0x1e, 0x42, 0xe7, 0xbb, 0x35, 0x80, 0x82, +- 0x63, 0x7c, 0x0a, 0x0f, 0xd7, 0xec, 0x6c, 0x2f, }, +-}, { +- .ksize = 32, +- .key = blake2_ordered_sequence, +- .plaintext = blake2_ordered_sequence, +- .psize = 7, +- .digest = (u8[]){ 0xcf, 0x9e, 0x07, 0x2a, 0xd5, 0x22, 0xf2, 0xcd, +- 0xa2, 0xd8, 0x25, 0x21, 0x80, 0x86, 0x73, 0x1c, }, +-}, { +- .ksize = 1, +- .key = "B", +- .plaintext = blake2_ordered_sequence, +- .psize = 15, +- .digest = (u8[]){ 0xf6, 0x33, 0x5a, 0x2c, 0x22, 0xa0, 0x64, 0xb2, +- 0xb6, 0x3f, 0xeb, 0xbc, 0xd1, 0xc3, 0xe5, 0xb2, }, +-}, { +- .ksize = 16, +- .key = blake2_ordered_sequence, +- .plaintext = blake2_ordered_sequence, +- .psize = 247, +- .digest = (u8[]){ 0x72, 0x66, 0x49, 0x60, 0xf9, 0x4a, 0xea, 0xbe, +- 0x1f, 0xf4, 0x60, 0xce, 0xb7, 0x81, 0xcb, 0x09, }, +-}, { +- .ksize = 32, +- .key = blake2_ordered_sequence, +- .plaintext = 
blake2_ordered_sequence, +- .psize = 256, +- .digest = (u8[]){ 0xd5, 0xa4, 0x0e, 0xc3, 0x16, 0xc7, 0x51, 0xa6, +- 0x3c, 0xd0, 0xd9, 0x11, 0x57, 0xfa, 0x1e, 0xbb, }, +-}}; +- +-static const struct hash_testvec blakes2s_160_tv_template[] = {{ +- .plaintext = blake2_ordered_sequence, +- .psize = 7, +- .digest = (u8[]){ 0xb4, 0xf2, 0x03, 0x49, 0x37, 0xed, 0xb1, 0x3e, +- 0x5b, 0x2a, 0xca, 0x64, 0x82, 0x74, 0xf6, 0x62, +- 0xe3, 0xf2, 0x84, 0xff, }, +-}, { +- .plaintext = blake2_ordered_sequence, +- .psize = 256, +- .digest = (u8[]){ 0xaa, 0x56, 0x9b, 0xdc, 0x98, 0x17, 0x75, 0xf2, +- 0xb3, 0x68, 0x83, 0xb7, 0x9b, 0x8d, 0x48, 0xb1, +- 0x9b, 0x2d, 0x35, 0x05, }, +-}, { +- .ksize = 1, +- .key = "B", +- .digest = (u8[]){ 0x50, 0x16, 0xe7, 0x0c, 0x01, 0xd0, 0xd3, 0xc3, +- 0xf4, 0x3e, 0xb1, 0x6e, 0x97, 0xa9, 0x4e, 0xd1, +- 0x79, 0x65, 0x32, 0x93, }, +-}, { +- .ksize = 32, +- .key = blake2_ordered_sequence, +- .plaintext = blake2_ordered_sequence, +- .psize = 1, +- .digest = (u8[]){ 0x1c, 0x2b, 0xcd, 0x9a, 0x68, 0xca, 0x8c, 0x71, +- 0x90, 0x29, 0x6c, 0x54, 0xfa, 0x56, 0x4a, 0xef, +- 0xa2, 0x3a, 0x56, 0x9c, }, +-}, { +- .ksize = 16, +- .key = blake2_ordered_sequence, +- .plaintext = blake2_ordered_sequence, +- .psize = 15, +- .digest = (u8[]){ 0x36, 0xc3, 0x5f, 0x9a, 0xdc, 0x7e, 0xbf, 0x19, +- 0x68, 0xaa, 0xca, 0xd8, 0x81, 0xbf, 0x09, 0x34, +- 0x83, 0x39, 0x0f, 0x30, }, +-}, { +- .ksize = 1, +- .key = "B", +- .plaintext = blake2_ordered_sequence, +- .psize = 64, +- .digest = (u8[]){ 0x86, 0x80, 0x78, 0xa4, 0x14, 0xec, 0x03, 0xe5, +- 0xb6, 0x9a, 0x52, 0x0e, 0x42, 0xee, 0x39, 0x9d, +- 0xac, 0xa6, 0x81, 0x63, }, +-}, { +- .ksize = 32, +- .key = blake2_ordered_sequence, +- .plaintext = blake2_ordered_sequence, +- .psize = 247, +- .digest = (u8[]){ 0x2d, 0xd8, 0xd2, 0x53, 0x66, 0xfa, 0xa9, 0x01, +- 0x1c, 0x9c, 0xaf, 0xa3, 0xe2, 0x9d, 0x9b, 0x10, +- 0x0a, 0xf6, 0x73, 0xe8, }, +-}}; +- +-static const struct hash_testvec blakes2s_224_tv_template[] = {{ +- .plaintext = blake2_ordered_sequence, +- .psize = 1, +- .digest = (u8[]){ 0x61, 0xb9, 0x4e, 0xc9, 0x46, 0x22, 0xa3, 0x91, +- 0xd2, 0xae, 0x42, 0xe6, 0x45, 0x6c, 0x90, 0x12, +- 0xd5, 0x80, 0x07, 0x97, 0xb8, 0x86, 0x5a, 0xfc, +- 0x48, 0x21, 0x97, 0xbb, }, +-}, { +- .plaintext = blake2_ordered_sequence, +- .psize = 247, +- .digest = (u8[]){ 0x9e, 0xda, 0xc7, 0x20, 0x2c, 0xd8, 0x48, 0x2e, +- 0x31, 0x94, 0xab, 0x46, 0x6d, 0x94, 0xd8, 0xb4, +- 0x69, 0xcd, 0xae, 0x19, 0x6d, 0x9e, 0x41, 0xcc, +- 0x2b, 0xa4, 0xd5, 0xf6, }, +-}, { +- .ksize = 16, +- .key = blake2_ordered_sequence, +- .digest = (u8[]){ 0x32, 0xc0, 0xac, 0xf4, 0x3b, 0xd3, 0x07, 0x9f, +- 0xbe, 0xfb, 0xfa, 0x4d, 0x6b, 0x4e, 0x56, 0xb3, +- 0xaa, 0xd3, 0x27, 0xf6, 0x14, 0xbf, 0xb9, 0x32, +- 0xa7, 0x19, 0xfc, 0xb8, }, +-}, { +- .ksize = 1, +- .key = "B", +- .plaintext = blake2_ordered_sequence, +- .psize = 7, +- .digest = (u8[]){ 0x73, 0xad, 0x5e, 0x6d, 0xb9, 0x02, 0x8e, 0x76, +- 0xf2, 0x66, 0x42, 0x4b, 0x4c, 0xfa, 0x1f, 0xe6, +- 0x2e, 0x56, 0x40, 0xe5, 0xa2, 0xb0, 0x3c, 0xe8, +- 0x7b, 0x45, 0xfe, 0x05, }, +-}, { +- .ksize = 32, +- .key = blake2_ordered_sequence, +- .plaintext = blake2_ordered_sequence, +- .psize = 15, +- .digest = (u8[]){ 0x16, 0x60, 0xfb, 0x92, 0x54, 0xb3, 0x6e, 0x36, +- 0x81, 0xf4, 0x16, 0x41, 0xc3, 0x3d, 0xd3, 0x43, +- 0x84, 0xed, 0x10, 0x6f, 0x65, 0x80, 0x7a, 0x3e, +- 0x25, 0xab, 0xc5, 0x02, }, +-}, { +- .ksize = 16, +- .key = blake2_ordered_sequence, +- .plaintext = blake2_ordered_sequence, +- .psize = 64, +- .digest = (u8[]){ 0xca, 0xaa, 0x39, 0x67, 0x9c, 0xf7, 0x6b, 0xc7, +- 0xb6, 0x82, 
0xca, 0x0e, 0x65, 0x36, 0x5b, 0x7c, +- 0x24, 0x00, 0xfa, 0x5f, 0xda, 0x06, 0x91, 0x93, +- 0x6a, 0x31, 0x83, 0xb5, }, +-}, { +- .ksize = 1, +- .key = "B", +- .plaintext = blake2_ordered_sequence, +- .psize = 256, +- .digest = (u8[]){ 0x90, 0x02, 0x26, 0xb5, 0x06, 0x9c, 0x36, 0x86, +- 0x94, 0x91, 0x90, 0x1e, 0x7d, 0x2a, 0x71, 0xb2, +- 0x48, 0xb5, 0xe8, 0x16, 0xfd, 0x64, 0x33, 0x45, +- 0xb3, 0xd7, 0xec, 0xcc, }, +-}}; +- +-static const struct hash_testvec blakes2s_256_tv_template[] = {{ +- .plaintext = blake2_ordered_sequence, +- .psize = 15, +- .digest = (u8[]){ 0xd9, 0x7c, 0x82, 0x8d, 0x81, 0x82, 0xa7, 0x21, +- 0x80, 0xa0, 0x6a, 0x78, 0x26, 0x83, 0x30, 0x67, +- 0x3f, 0x7c, 0x4e, 0x06, 0x35, 0x94, 0x7c, 0x04, +- 0xc0, 0x23, 0x23, 0xfd, 0x45, 0xc0, 0xa5, 0x2d, }, +-}, { +- .ksize = 32, +- .key = blake2_ordered_sequence, +- .digest = (u8[]){ 0x48, 0xa8, 0x99, 0x7d, 0xa4, 0x07, 0x87, 0x6b, +- 0x3d, 0x79, 0xc0, 0xd9, 0x23, 0x25, 0xad, 0x3b, +- 0x89, 0xcb, 0xb7, 0x54, 0xd8, 0x6a, 0xb7, 0x1a, +- 0xee, 0x04, 0x7a, 0xd3, 0x45, 0xfd, 0x2c, 0x49, }, +-}, { +- .ksize = 1, +- .key = "B", +- .plaintext = blake2_ordered_sequence, +- .psize = 1, +- .digest = (u8[]){ 0x22, 0x27, 0xae, 0xaa, 0x6e, 0x81, 0x56, 0x03, +- 0xa7, 0xe3, 0xa1, 0x18, 0xa5, 0x9a, 0x2c, 0x18, +- 0xf4, 0x63, 0xbc, 0x16, 0x70, 0xf1, 0xe7, 0x4b, +- 0x00, 0x6d, 0x66, 0x16, 0xae, 0x9e, 0x74, 0x4e, }, +-}, { +- .ksize = 16, +- .key = blake2_ordered_sequence, +- .plaintext = blake2_ordered_sequence, +- .psize = 7, +- .digest = (u8[]){ 0x58, 0x5d, 0xa8, 0x60, 0x1c, 0xa4, 0xd8, 0x03, +- 0x86, 0x86, 0x84, 0x64, 0xd7, 0xa0, 0x8e, 0x15, +- 0x2f, 0x05, 0xa2, 0x1b, 0xbc, 0xef, 0x7a, 0x34, +- 0xb3, 0xc5, 0xbc, 0x4b, 0xf0, 0x32, 0xeb, 0x12, }, +-}, { +- .ksize = 32, +- .key = blake2_ordered_sequence, +- .plaintext = blake2_ordered_sequence, +- .psize = 64, +- .digest = (u8[]){ 0x89, 0x75, 0xb0, 0x57, 0x7f, 0xd3, 0x55, 0x66, +- 0xd7, 0x50, 0xb3, 0x62, 0xb0, 0x89, 0x7a, 0x26, +- 0xc3, 0x99, 0x13, 0x6d, 0xf0, 0x7b, 0xab, 0xab, +- 0xbd, 0xe6, 0x20, 0x3f, 0xf2, 0x95, 0x4e, 0xd4, }, +-}, { +- .ksize = 1, +- .key = "B", +- .plaintext = blake2_ordered_sequence, +- .psize = 247, +- .digest = (u8[]){ 0x2e, 0x74, 0x1c, 0x1d, 0x03, 0xf4, 0x9d, 0x84, +- 0x6f, 0xfc, 0x86, 0x32, 0x92, 0x49, 0x7e, 0x66, +- 0xd7, 0xc3, 0x10, 0x88, 0xfe, 0x28, 0xb3, 0xe0, +- 0xbf, 0x50, 0x75, 0xad, 0x8e, 0xa4, 0xe6, 0xb2, }, +-}, { +- .ksize = 16, +- .key = blake2_ordered_sequence, +- .plaintext = blake2_ordered_sequence, +- .psize = 256, +- .digest = (u8[]){ 0xb9, 0xd2, 0x81, 0x0e, 0x3a, 0xb1, 0x62, 0x9b, +- 0xad, 0x44, 0x05, 0xf4, 0x92, 0x2e, 0x99, 0xc1, +- 0x4a, 0x47, 0xbb, 0x5b, 0x6f, 0xb2, 0x96, 0xed, +- 0xd5, 0x06, 0xb5, 0x3a, 0x7c, 0x7a, 0x65, 0x1d, }, +-}}; +- + #endif /* _CRYPTO_TESTMGR_H */ +diff --git a/include/crypto/internal/blake2s.h b/include/crypto/internal/blake2s.h +index 52363eee2b20..506d56530ca9 100644 +--- a/include/crypto/internal/blake2s.h ++++ b/include/crypto/internal/blake2s.h +@@ -8,7 +8,6 @@ + #define _CRYPTO_INTERNAL_BLAKE2S_H + + #include +-#include + #include + + void blake2s_compress_generic(struct blake2s_state *state, const u8 *block, +@@ -19,111 +18,4 @@ void blake2s_compress(struct blake2s_state *state, const u8 *block, + + bool blake2s_selftest(void); + +-static inline void blake2s_set_lastblock(struct blake2s_state *state) +-{ +- state->f[0] = -1; +-} +- +-/* Helper functions for BLAKE2s shared by the library and shash APIs */ +- +-static __always_inline void +-__blake2s_update(struct blake2s_state *state, const u8 *in, size_t inlen, +- bool 
force_generic) +-{ +- const size_t fill = BLAKE2S_BLOCK_SIZE - state->buflen; +- +- if (unlikely(!inlen)) +- return; +- if (inlen > fill) { +- memcpy(state->buf + state->buflen, in, fill); +- if (force_generic) +- blake2s_compress_generic(state, state->buf, 1, +- BLAKE2S_BLOCK_SIZE); +- else +- blake2s_compress(state, state->buf, 1, +- BLAKE2S_BLOCK_SIZE); +- state->buflen = 0; +- in += fill; +- inlen -= fill; +- } +- if (inlen > BLAKE2S_BLOCK_SIZE) { +- const size_t nblocks = DIV_ROUND_UP(inlen, BLAKE2S_BLOCK_SIZE); +- /* Hash one less (full) block than strictly possible */ +- if (force_generic) +- blake2s_compress_generic(state, in, nblocks - 1, +- BLAKE2S_BLOCK_SIZE); +- else +- blake2s_compress(state, in, nblocks - 1, +- BLAKE2S_BLOCK_SIZE); +- in += BLAKE2S_BLOCK_SIZE * (nblocks - 1); +- inlen -= BLAKE2S_BLOCK_SIZE * (nblocks - 1); +- } +- memcpy(state->buf + state->buflen, in, inlen); +- state->buflen += inlen; +-} +- +-static __always_inline void +-__blake2s_final(struct blake2s_state *state, u8 *out, bool force_generic) +-{ +- blake2s_set_lastblock(state); +- memset(state->buf + state->buflen, 0, +- BLAKE2S_BLOCK_SIZE - state->buflen); /* Padding */ +- if (force_generic) +- blake2s_compress_generic(state, state->buf, 1, state->buflen); +- else +- blake2s_compress(state, state->buf, 1, state->buflen); +- cpu_to_le32_array(state->h, ARRAY_SIZE(state->h)); +- memcpy(out, state->h, state->outlen); +-} +- +-/* Helper functions for shash implementations of BLAKE2s */ +- +-struct blake2s_tfm_ctx { +- u8 key[BLAKE2S_KEY_SIZE]; +- unsigned int keylen; +-}; +- +-static inline int crypto_blake2s_setkey(struct crypto_shash *tfm, +- const u8 *key, unsigned int keylen) +-{ +- struct blake2s_tfm_ctx *tctx = crypto_shash_ctx(tfm); +- +- if (keylen == 0 || keylen > BLAKE2S_KEY_SIZE) +- return -EINVAL; +- +- memcpy(tctx->key, key, keylen); +- tctx->keylen = keylen; +- +- return 0; +-} +- +-static inline int crypto_blake2s_init(struct shash_desc *desc) +-{ +- const struct blake2s_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm); +- struct blake2s_state *state = shash_desc_ctx(desc); +- unsigned int outlen = crypto_shash_digestsize(desc->tfm); +- +- __blake2s_init(state, outlen, tctx->key, tctx->keylen); +- return 0; +-} +- +-static inline int crypto_blake2s_update(struct shash_desc *desc, +- const u8 *in, unsigned int inlen, +- bool force_generic) +-{ +- struct blake2s_state *state = shash_desc_ctx(desc); +- +- __blake2s_update(state, in, inlen, force_generic); +- return 0; +-} +- +-static inline int crypto_blake2s_final(struct shash_desc *desc, u8 *out, +- bool force_generic) +-{ +- struct blake2s_state *state = shash_desc_ctx(desc); +- +- __blake2s_final(state, out, force_generic); +- return 0; +-} +- + #endif /* _CRYPTO_INTERNAL_BLAKE2S_H */ +diff --git a/lib/crypto/blake2s-selftest.c b/lib/crypto/blake2s-selftest.c +index 409e4b728770..66f505220f43 100644 +--- a/lib/crypto/blake2s-selftest.c ++++ b/lib/crypto/blake2s-selftest.c +@@ -4,6 +4,8 @@ + */ + + #include ++#include ++#include + #include + + /* +@@ -587,5 +589,44 @@ bool __init blake2s_selftest(void) + } + } + ++ for (i = 0; i < 32; ++i) { ++ enum { TEST_ALIGNMENT = 16 }; ++ u8 unaligned_block[BLAKE2S_BLOCK_SIZE + TEST_ALIGNMENT - 1] ++ __aligned(TEST_ALIGNMENT); ++ u8 blocks[BLAKE2S_BLOCK_SIZE * 3]; ++ struct blake2s_state state1, state2; ++ ++ get_random_bytes(blocks, sizeof(blocks)); ++ get_random_bytes(&state, sizeof(state)); ++ ++#if defined(CONFIG_CRYPTO_LIB_BLAKE2S_GENERIC) && \ ++ defined(CONFIG_CRYPTO_ARCH_HAVE_LIB_BLAKE2S) ++ 
memcpy(&state1, &state, sizeof(state1)); ++ memcpy(&state2, &state, sizeof(state2)); ++ blake2s_compress(&state1, blocks, 3, BLAKE2S_BLOCK_SIZE); ++ blake2s_compress_generic(&state2, blocks, 3, BLAKE2S_BLOCK_SIZE); ++ if (memcmp(&state1, &state2, sizeof(state1))) { ++ pr_err("blake2s random compress self-test %d: FAIL\n", ++ i + 1); ++ success = false; ++ } ++#endif ++ ++ memcpy(&state1, &state, sizeof(state1)); ++ blake2s_compress(&state1, blocks, 1, BLAKE2S_BLOCK_SIZE); ++ for (l = 1; l < TEST_ALIGNMENT; ++l) { ++ memcpy(unaligned_block + l, blocks, ++ BLAKE2S_BLOCK_SIZE); ++ memcpy(&state2, &state, sizeof(state2)); ++ blake2s_compress(&state2, unaligned_block + l, 1, ++ BLAKE2S_BLOCK_SIZE); ++ if (memcmp(&state1, &state2, sizeof(state1))) { ++ pr_err("blake2s random compress align %d self-test %d: FAIL\n", ++ l, i + 1); ++ success = false; ++ } ++ } ++ } ++ + return success; + } +diff --git a/lib/crypto/blake2s.c b/lib/crypto/blake2s.c +index c71c09621c09..98e688c6d891 100644 +--- a/lib/crypto/blake2s.c ++++ b/lib/crypto/blake2s.c +@@ -16,16 +16,44 @@ + #include + #include + ++static inline void blake2s_set_lastblock(struct blake2s_state *state) ++{ ++ state->f[0] = -1; ++} ++ + void blake2s_update(struct blake2s_state *state, const u8 *in, size_t inlen) + { +- __blake2s_update(state, in, inlen, false); ++ const size_t fill = BLAKE2S_BLOCK_SIZE - state->buflen; ++ ++ if (unlikely(!inlen)) ++ return; ++ if (inlen > fill) { ++ memcpy(state->buf + state->buflen, in, fill); ++ blake2s_compress(state, state->buf, 1, BLAKE2S_BLOCK_SIZE); ++ state->buflen = 0; ++ in += fill; ++ inlen -= fill; ++ } ++ if (inlen > BLAKE2S_BLOCK_SIZE) { ++ const size_t nblocks = DIV_ROUND_UP(inlen, BLAKE2S_BLOCK_SIZE); ++ blake2s_compress(state, in, nblocks - 1, BLAKE2S_BLOCK_SIZE); ++ in += BLAKE2S_BLOCK_SIZE * (nblocks - 1); ++ inlen -= BLAKE2S_BLOCK_SIZE * (nblocks - 1); ++ } ++ memcpy(state->buf + state->buflen, in, inlen); ++ state->buflen += inlen; + } + EXPORT_SYMBOL(blake2s_update); + + void blake2s_final(struct blake2s_state *state, u8 *out) + { + WARN_ON(IS_ENABLED(DEBUG) && !out); +- __blake2s_final(state, out, false); ++ blake2s_set_lastblock(state); ++ memset(state->buf + state->buflen, 0, ++ BLAKE2S_BLOCK_SIZE - state->buflen); /* Padding */ ++ blake2s_compress(state, state->buf, 1, state->buflen); ++ cpu_to_le32_array(state->h, ARRAY_SIZE(state->h)); ++ memcpy(out, state->h, state->outlen); + memzero_explicit(state, sizeof(*state)); + } + EXPORT_SYMBOL(blake2s_final); +@@ -38,12 +66,7 @@ static int __init blake2s_mod_init(void) + return 0; + } + +-static void __exit blake2s_mod_exit(void) +-{ +-} +- + module_init(blake2s_mod_init); +-module_exit(blake2s_mod_exit); + MODULE_LICENSE("GPL v2"); + MODULE_DESCRIPTION("BLAKE2s hash function"); + MODULE_AUTHOR("Jason A. Donenfeld "); +-- +2.35.1 + diff --git a/queue-5.18/dm-raid-fix-address-sanitizer-warning-in-raid_resume.patch b/queue-5.18/dm-raid-fix-address-sanitizer-warning-in-raid_resume.patch new file mode 100644 index 00000000000..1482b50b107 --- /dev/null +++ b/queue-5.18/dm-raid-fix-address-sanitizer-warning-in-raid_resume.patch @@ -0,0 +1,38 @@ +From 737fa72cb41c624004f0ab3e93689a3a9cfa1b17 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Sun, 24 Jul 2022 14:33:52 -0400 +Subject: dm raid: fix address sanitizer warning in raid_resume + +From: Mikulas Patocka + +[ Upstream commit 7dad24db59d2d2803576f2e3645728866a056dab ] + +There is a KASAN warning in raid_resume when running the lvm test +lvconvert-raid.sh. 
The reason for the warning is that mddev->raid_disks +is greater than rs->raid_disks, so the loop touches one entry beyond +the allocated length. + +Cc: stable@vger.kernel.org +Signed-off-by: Mikulas Patocka +Signed-off-by: Mike Snitzer +Signed-off-by: Sasha Levin +--- + drivers/md/dm-raid.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c +index 92e6b731f9d6..a55d6f6f294b 100644 +--- a/drivers/md/dm-raid.c ++++ b/drivers/md/dm-raid.c +@@ -3824,7 +3824,7 @@ static void attempt_restore_of_faulty_devices(struct raid_set *rs) + + memset(cleared_failed_devices, 0, sizeof(cleared_failed_devices)); + +- for (i = 0; i < mddev->raid_disks; i++) { ++ for (i = 0; i < rs->raid_disks; i++) { + r = &rs->dev[i].rdev; + /* HM FIXME: enhance journal device recovery processing */ + if (test_bit(Journal, &r->flags)) +-- +2.35.1 + diff --git a/queue-5.18/dm-raid-fix-address-sanitizer-warning-in-raid_status.patch b/queue-5.18/dm-raid-fix-address-sanitizer-warning-in-raid_status.patch new file mode 100644 index 00000000000..a6a9ac2b3c9 --- /dev/null +++ b/queue-5.18/dm-raid-fix-address-sanitizer-warning-in-raid_status.patch @@ -0,0 +1,68 @@ +From 78121652de18db6e02f29395b05f1f73c1cf3fd2 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Sun, 24 Jul 2022 14:31:35 -0400 +Subject: dm raid: fix address sanitizer warning in raid_status + +From: Mikulas Patocka + +[ Upstream commit 1fbeea217d8f297fe0e0956a1516d14ba97d0396 ] + +There is this warning when using a kernel with the address sanitizer +and running this testsuite: +https://gitlab.com/cki-project/kernel-tests/-/tree/main/storage/swraid/scsi_raid + +================================================================== +BUG: KASAN: slab-out-of-bounds in raid_status+0x1747/0x2820 [dm_raid] +Read of size 4 at addr ffff888079d2c7e8 by task lvcreate/13319 +CPU: 0 PID: 13319 Comm: lvcreate Not tainted 5.18.0-0.rc3. #1 +Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011 +Call Trace: + + dump_stack_lvl+0x6a/0x9c + print_address_description.constprop.0+0x1f/0x1e0 + print_report.cold+0x55/0x244 + kasan_report+0xc9/0x100 + raid_status+0x1747/0x2820 [dm_raid] + dm_ima_measure_on_table_load+0x4b8/0xca0 [dm_mod] + table_load+0x35c/0x630 [dm_mod] + ctl_ioctl+0x411/0x630 [dm_mod] + dm_ctl_ioctl+0xa/0x10 [dm_mod] + __x64_sys_ioctl+0x12a/0x1a0 + do_syscall_64+0x5b/0x80 + +The warning is caused by reading conf->max_nr_stripes in raid_status. The +code in raid_status reads mddev->private, casts it to struct r5conf and +reads the entry max_nr_stripes. + +However, if we have different raid type than 4/5/6, mddev->private +doesn't point to struct r5conf; it may point to struct r0conf, struct +r1conf, struct r10conf or struct mpconf. If we cast a pointer to one +of these structs to struct r5conf, we will be reading invalid memory +and KASAN warns about it. + +Fix this bug by reading struct r5conf only if raid type is 4, 5 or 6. 
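+
+As an editorial illustration only (it mirrors the one-line change in the hunk
+below rather than adding anything new): the level-aware pattern gates the cast
+behind rs_is_raid456(), so mddev->private is never dereferenced as a
+struct r5conf for raid0/1/10 sets:
+
+	/* Only raid4/5/6 carry a struct r5conf in mddev->private. */
+	struct r5conf *conf = rs_is_raid456(rs) ? mddev->private : NULL;
+	int max_nr_stripes = conf ? conf->max_nr_stripes : 0;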
+ +Cc: stable@vger.kernel.org +Signed-off-by: Mikulas Patocka +Signed-off-by: Mike Snitzer +Signed-off-by: Sasha Levin +--- + drivers/md/dm-raid.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c +index e362a7471512..92e6b731f9d6 100644 +--- a/drivers/md/dm-raid.c ++++ b/drivers/md/dm-raid.c +@@ -3514,7 +3514,7 @@ static void raid_status(struct dm_target *ti, status_type_t type, + { + struct raid_set *rs = ti->private; + struct mddev *mddev = &rs->md; +- struct r5conf *conf = mddev->private; ++ struct r5conf *conf = rs_is_raid456(rs) ? mddev->private : NULL; + int i, max_nr_stripes = conf ? conf->max_nr_stripes : 0; + unsigned long recovery; + unsigned int raid_param_cnt = 1; /* at least 1 for chunksize */ +-- +2.35.1 + diff --git a/queue-5.18/dm-thin-fix-use-after-free-crash-in-dm_sm_register_t.patch b/queue-5.18/dm-thin-fix-use-after-free-crash-in-dm_sm_register_t.patch new file mode 100644 index 00000000000..79fc929fd68 --- /dev/null +++ b/queue-5.18/dm-thin-fix-use-after-free-crash-in-dm_sm_register_t.patch @@ -0,0 +1,96 @@ +From 185d3911adcea01b2082c14635a2a8071134384b Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 14 Jul 2022 19:28:25 +0800 +Subject: dm thin: fix use-after-free crash in + dm_sm_register_threshold_callback + +From: Luo Meng + +[ Upstream commit 3534e5a5ed2997ca1b00f44a0378a075bd05e8a3 ] + +Fault inject on pool metadata device reports: + BUG: KASAN: use-after-free in dm_pool_register_metadata_threshold+0x40/0x80 + Read of size 8 at addr ffff8881b9d50068 by task dmsetup/950 + + CPU: 7 PID: 950 Comm: dmsetup Tainted: G W 5.19.0-rc6 #1 + Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-1.fc33 04/01/2014 + Call Trace: + + dump_stack_lvl+0x34/0x44 + print_address_description.constprop.0.cold+0xeb/0x3f4 + kasan_report.cold+0xe6/0x147 + dm_pool_register_metadata_threshold+0x40/0x80 + pool_ctr+0xa0a/0x1150 + dm_table_add_target+0x2c8/0x640 + table_load+0x1fd/0x430 + ctl_ioctl+0x2c4/0x5a0 + dm_ctl_ioctl+0xa/0x10 + __x64_sys_ioctl+0xb3/0xd0 + do_syscall_64+0x35/0x80 + entry_SYSCALL_64_after_hwframe+0x46/0xb0 + +This can be easily reproduced using: + echo offline > /sys/block/sda/device/state + dd if=/dev/zero of=/dev/mapper/thin bs=4k count=10 + dmsetup load pool --table "0 20971520 thin-pool /dev/sda /dev/sdb 128 0 0" + +If a metadata commit fails, the transaction will be aborted and the +metadata space maps will be destroyed. If a DM table reload then +happens for this failed thin-pool, a use-after-free will occur in +dm_sm_register_threshold_callback (called from +dm_pool_register_metadata_threshold). + +Fix this by in dm_pool_register_metadata_threshold() by returning the +-EINVAL error if the thin-pool is in fail mode. Also fail pool_ctr() +with a new error message: "Error registering metadata threshold". 
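+
+A minimal sketch of the guarded registration, restating the first hunk below
+for readability (pmd->fail_io and the locking helpers are the existing
+dm-thin-metadata ones):
+
+	int r = -EINVAL;
+
+	pmd_write_lock_in_core(pmd);
+	if (!pmd->fail_io)
+		r = dm_sm_register_threshold_callback(pmd->metadata_sm,
+						      threshold, fn, context);
+	pmd_write_unlock(pmd);
+
+	return r;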
+ +Fixes: ac8c3f3df65e4 ("dm thin: generate event when metadata threshold passed") +Cc: stable@vger.kernel.org +Reported-by: Hulk Robot +Signed-off-by: Luo Meng +Signed-off-by: Mike Snitzer +Signed-off-by: Sasha Levin +--- + drivers/md/dm-thin-metadata.c | 7 +++++-- + drivers/md/dm-thin.c | 4 +++- + 2 files changed, 8 insertions(+), 3 deletions(-) + +diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c +index 2db7030aba00..a27395c8621f 100644 +--- a/drivers/md/dm-thin-metadata.c ++++ b/drivers/md/dm-thin-metadata.c +@@ -2045,10 +2045,13 @@ int dm_pool_register_metadata_threshold(struct dm_pool_metadata *pmd, + dm_sm_threshold_fn fn, + void *context) + { +- int r; ++ int r = -EINVAL; + + pmd_write_lock_in_core(pmd); +- r = dm_sm_register_threshold_callback(pmd->metadata_sm, threshold, fn, context); ++ if (!pmd->fail_io) { ++ r = dm_sm_register_threshold_callback(pmd->metadata_sm, ++ threshold, fn, context); ++ } + pmd_write_unlock(pmd); + + return r; +diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c +index 4d25d0e27031..53ac6ae870ac 100644 +--- a/drivers/md/dm-thin.c ++++ b/drivers/md/dm-thin.c +@@ -3382,8 +3382,10 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv) + calc_metadata_threshold(pt), + metadata_low_callback, + pool); +- if (r) ++ if (r) { ++ ti->error = "Error registering metadata threshold"; + goto out_flags_changed; ++ } + + dm_pool_register_pre_commit_callback(pool->pmd, + metadata_pre_commit_callback, pool); +-- +2.35.1 + diff --git a/queue-5.18/dm-writecache-set-a-default-max_writeback_jobs.patch b/queue-5.18/dm-writecache-set-a-default-max_writeback_jobs.patch new file mode 100644 index 00000000000..99eadb5dc6c --- /dev/null +++ b/queue-5.18/dm-writecache-set-a-default-max_writeback_jobs.patch @@ -0,0 +1,41 @@ +From 36bc440738ef1c764c40ac40f0a1d699ee8d1d3f Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 13 Jul 2022 07:09:04 -0400 +Subject: dm writecache: set a default MAX_WRITEBACK_JOBS + +From: Mikulas Patocka + +[ Upstream commit ca7dc242e358e46d963b32f9d9dd829785a9e957 ] + +dm-writecache has the capability to limit the number of writeback jobs +in progress. However, this feature was off by default. As such there +were some out-of-memory crashes observed when lowering the low +watermark while the cache is full. + +This commit enables writeback limit by default. It is set to 256MiB or +1/16 of total system memory, whichever is smaller. 
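+
+Rough worked example (editorial, assuming 4 KiB pages): the new default is
+min(0x10000000 / PAGE_SIZE, totalram_pages() / 16). 256 MiB / 4 KiB gives
+65536 writeback jobs; on a 2 GiB machine totalram_pages() / 16 is 32768, so
+the memory-based limit wins, while on an 8 GiB machine that term is 131072
+and the 256 MiB cap applies instead.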
+ +Cc: stable@vger.kernel.org +Signed-off-by: Mikulas Patocka +Signed-off-by: Mike Snitzer +Signed-off-by: Sasha Levin +--- + drivers/md/dm-writecache.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c +index e5acb393f70b..27557b852c94 100644 +--- a/drivers/md/dm-writecache.c ++++ b/drivers/md/dm-writecache.c +@@ -22,7 +22,7 @@ + + #define HIGH_WATERMARK 50 + #define LOW_WATERMARK 45 +-#define MAX_WRITEBACK_JOBS 0 ++#define MAX_WRITEBACK_JOBS min(0x10000000 / PAGE_SIZE, totalram_pages() / 16) + #define ENDIO_LATENCY 16 + #define WRITEBACK_LATENCY 64 + #define AUTOCOMMIT_BLOCKS_SSD 65536 +-- +2.35.1 + diff --git a/queue-5.18/drivers-base-fix-userspace-break-from-using-bin_attr.patch b/queue-5.18/drivers-base-fix-userspace-break-from-using-bin_attr.patch new file mode 100644 index 00000000000..8278cc9e8c2 --- /dev/null +++ b/queue-5.18/drivers-base-fix-userspace-break-from-using-bin_attr.patch @@ -0,0 +1,188 @@ +From cb546eb4a3db40281eb2d2b70ec132d20dba301e Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 15 Jul 2022 09:49:24 -0400 +Subject: drivers/base: fix userspace break from using bin_attributes for + cpumap and cpulist + +From: Phil Auld + +[ Upstream commit 7ee951acd31a88f941fd6535fbdee3a1567f1d63 ] + +Using bin_attributes with a 0 size causes fstat and friends to return that +0 size. This breaks userspace code that retrieves the size before reading +the file. Rather than reverting 75bd50fa841 ("drivers/base/node.c: use +bin_attribute to break the size limitation of cpumap ABI") let's put in a +size value at compile time. + +For cpulist the maximum size is on the order of + NR_CPUS * (ceil(log10(NR_CPUS)) + 1)/2 + +which for 8192 is 20480 (8192 * 5)/2. In order to get near that you'd need +a system with every other CPU on one node. For example: (0,2,4,8, ... ). +To simplify the math and support larger NR_CPUS in the future we are using +(NR_CPUS * 7)/2. We also set it to a min of PAGE_SIZE to retain the older +behavior for smaller NR_CPUS. + +The cpumap file the size works out to be NR_CPUS/4 + NR_CPUS/32 - 1 +(or NR_CPUS * 9/32 - 1) including the ","s. + +Add a set of macros for these values to cpumask.h so they can be used in +multiple places. Apply these to the handful of such files in +drivers/base/topology.c as well as node.c. + +As an example, on an 80 cpu 4-node system (NR_CPUS == 8192): + +before: + +-r--r--r--. 1 root root 0 Jul 12 14:08 system/node/node0/cpulist +-r--r--r--. 1 root root 0 Jul 11 17:25 system/node/node0/cpumap + +after: + +-r--r--r--. 1 root root 28672 Jul 13 11:32 system/node/node0/cpulist +-r--r--r--. 1 root root 4096 Jul 13 11:31 system/node/node0/cpumap + +CONFIG_NR_CPUS = 16384 +-r--r--r--. 1 root root 57344 Jul 13 14:03 system/node/node0/cpulist +-r--r--r--. 1 root root 4607 Jul 13 14:02 system/node/node0/cpumap + +The actual number of cpus doesn't matter for the reported size since they +are based on NR_CPUS. + +Fixes: 75bd50fa841d ("drivers/base/node.c: use bin_attribute to break the size limitation of cpumap ABI") +Fixes: bb9ec13d156e ("topology: use bin_attribute to break the size limitation of cpumap ABI") +Cc: Greg Kroah-Hartman +Cc: "Rafael J. 
Wysocki" +Cc: Yury Norov +Cc: stable@vger.kernel.org +Acked-by: Yury Norov (for include/linux/cpumask.h) +Signed-off-by: Phil Auld +Link: https://lore.kernel.org/r/20220715134924.3466194-1-pauld@redhat.com +Signed-off-by: Greg Kroah-Hartman +Signed-off-by: Sasha Levin +--- + drivers/base/node.c | 4 ++-- + drivers/base/topology.c | 32 ++++++++++++++++---------------- + include/linux/cpumask.h | 18 ++++++++++++++++++ + 3 files changed, 36 insertions(+), 18 deletions(-) + +diff --git a/drivers/base/node.c b/drivers/base/node.c +index 0ac6376ef7a1..eb0f43784c2b 100644 +--- a/drivers/base/node.c ++++ b/drivers/base/node.c +@@ -45,7 +45,7 @@ static inline ssize_t cpumap_read(struct file *file, struct kobject *kobj, + return n; + } + +-static BIN_ATTR_RO(cpumap, 0); ++static BIN_ATTR_RO(cpumap, CPUMAP_FILE_MAX_BYTES); + + static inline ssize_t cpulist_read(struct file *file, struct kobject *kobj, + struct bin_attribute *attr, char *buf, +@@ -66,7 +66,7 @@ static inline ssize_t cpulist_read(struct file *file, struct kobject *kobj, + return n; + } + +-static BIN_ATTR_RO(cpulist, 0); ++static BIN_ATTR_RO(cpulist, CPULIST_FILE_MAX_BYTES); + + /** + * struct node_access_nodes - Access class device to hold user visible +diff --git a/drivers/base/topology.c b/drivers/base/topology.c +index ac6ad9ab67f9..89f98be5c5b9 100644 +--- a/drivers/base/topology.c ++++ b/drivers/base/topology.c +@@ -62,47 +62,47 @@ define_id_show_func(ppin, "0x%llx"); + static DEVICE_ATTR_ADMIN_RO(ppin); + + define_siblings_read_func(thread_siblings, sibling_cpumask); +-static BIN_ATTR_RO(thread_siblings, 0); +-static BIN_ATTR_RO(thread_siblings_list, 0); ++static BIN_ATTR_RO(thread_siblings, CPUMAP_FILE_MAX_BYTES); ++static BIN_ATTR_RO(thread_siblings_list, CPULIST_FILE_MAX_BYTES); + + define_siblings_read_func(core_cpus, sibling_cpumask); +-static BIN_ATTR_RO(core_cpus, 0); +-static BIN_ATTR_RO(core_cpus_list, 0); ++static BIN_ATTR_RO(core_cpus, CPUMAP_FILE_MAX_BYTES); ++static BIN_ATTR_RO(core_cpus_list, CPULIST_FILE_MAX_BYTES); + + define_siblings_read_func(core_siblings, core_cpumask); +-static BIN_ATTR_RO(core_siblings, 0); +-static BIN_ATTR_RO(core_siblings_list, 0); ++static BIN_ATTR_RO(core_siblings, CPUMAP_FILE_MAX_BYTES); ++static BIN_ATTR_RO(core_siblings_list, CPULIST_FILE_MAX_BYTES); + + #ifdef TOPOLOGY_CLUSTER_SYSFS + define_siblings_read_func(cluster_cpus, cluster_cpumask); +-static BIN_ATTR_RO(cluster_cpus, 0); +-static BIN_ATTR_RO(cluster_cpus_list, 0); ++static BIN_ATTR_RO(cluster_cpus, CPUMAP_FILE_MAX_BYTES); ++static BIN_ATTR_RO(cluster_cpus_list, CPULIST_FILE_MAX_BYTES); + #endif + + #ifdef TOPOLOGY_DIE_SYSFS + define_siblings_read_func(die_cpus, die_cpumask); +-static BIN_ATTR_RO(die_cpus, 0); +-static BIN_ATTR_RO(die_cpus_list, 0); ++static BIN_ATTR_RO(die_cpus, CPUMAP_FILE_MAX_BYTES); ++static BIN_ATTR_RO(die_cpus_list, CPULIST_FILE_MAX_BYTES); + #endif + + define_siblings_read_func(package_cpus, core_cpumask); +-static BIN_ATTR_RO(package_cpus, 0); +-static BIN_ATTR_RO(package_cpus_list, 0); ++static BIN_ATTR_RO(package_cpus, CPUMAP_FILE_MAX_BYTES); ++static BIN_ATTR_RO(package_cpus_list, CPULIST_FILE_MAX_BYTES); + + #ifdef TOPOLOGY_BOOK_SYSFS + define_id_show_func(book_id, "%d"); + static DEVICE_ATTR_RO(book_id); + define_siblings_read_func(book_siblings, book_cpumask); +-static BIN_ATTR_RO(book_siblings, 0); +-static BIN_ATTR_RO(book_siblings_list, 0); ++static BIN_ATTR_RO(book_siblings, CPUMAP_FILE_MAX_BYTES); ++static BIN_ATTR_RO(book_siblings_list, CPULIST_FILE_MAX_BYTES); + #endif + + #ifdef 
TOPOLOGY_DRAWER_SYSFS + define_id_show_func(drawer_id, "%d"); + static DEVICE_ATTR_RO(drawer_id); + define_siblings_read_func(drawer_siblings, drawer_cpumask); +-static BIN_ATTR_RO(drawer_siblings, 0); +-static BIN_ATTR_RO(drawer_siblings_list, 0); ++static BIN_ATTR_RO(drawer_siblings, CPUMAP_FILE_MAX_BYTES); ++static BIN_ATTR_RO(drawer_siblings_list, CPULIST_FILE_MAX_BYTES); + #endif + + static struct bin_attribute *bin_attrs[] = { +diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h +index fe29ac7cc469..4592d0845941 100644 +--- a/include/linux/cpumask.h ++++ b/include/linux/cpumask.h +@@ -1071,4 +1071,22 @@ cpumap_print_list_to_buf(char *buf, const struct cpumask *mask, + [0] = 1UL \ + } } + ++/* ++ * Provide a valid theoretical max size for cpumap and cpulist sysfs files ++ * to avoid breaking userspace which may allocate a buffer based on the size ++ * reported by e.g. fstat. ++ * ++ * for cpumap NR_CPUS * 9/32 - 1 should be an exact length. ++ * ++ * For cpulist 7 is (ceil(log10(NR_CPUS)) + 1) allowing for NR_CPUS to be up ++ * to 2 orders of magnitude larger than 8192. And then we divide by 2 to ++ * cover a worst-case of every other cpu being on one of two nodes for a ++ * very large NR_CPUS. ++ * ++ * Use PAGE_SIZE as a minimum for smaller configurations. ++ */ ++#define CPUMAP_FILE_MAX_BYTES ((((NR_CPUS * 9)/32 - 1) > PAGE_SIZE) \ ++ ? (NR_CPUS * 9)/32 - 1 : PAGE_SIZE) ++#define CPULIST_FILE_MAX_BYTES (((NR_CPUS * 7)/2 > PAGE_SIZE) ? (NR_CPUS * 7)/2 : PAGE_SIZE) ++ + #endif /* __LINUX_CPUMASK_H */ +-- +2.35.1 + diff --git a/queue-5.18/drm-dp-mst-read-the-extended-dpcd-capabilities-durin.patch b/queue-5.18/drm-dp-mst-read-the-extended-dpcd-capabilities-durin.patch new file mode 100644 index 00000000000..59fc59a1ef0 --- /dev/null +++ b/queue-5.18/drm-dp-mst-read-the-extended-dpcd-capabilities-durin.patch @@ -0,0 +1,57 @@ +From c73a45d0946910af4186cf5c6152e0792eaaabd2 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 14 Jun 2022 12:45:37 +0300 +Subject: drm/dp/mst: Read the extended DPCD capabilities during system resume + +From: Imre Deak + +[ Upstream commit 7a710a8bc909313951eb9252d8419924c771d7c2 ] + +The WD22TB4 Thunderbolt dock at least will revert its DP_MAX_LINK_RATE +from HBR3 to HBR2 after system suspend/resume if the DP_DP13_DPCD_REV +registers are not read subsequently also as required. + +Fix this by reading DP_DP13_DPCD_REV registers as well, matching what is +done during connector detection. While at it also fix up the same call +in drm_dp_mst_dump_topology(). 
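+
+A condensed sketch of the resume-path change (it mirrors the first hunk below;
+drm_dp_read_dpcd_caps() is the existing helper that also fetches the
+DP_DP13_DPCD_REV extended capability registers):
+
+	if (drm_dp_read_dpcd_caps(mgr->aux, mgr->dpcd) < 0) {
+		drm_dbg_kms(mgr->dev, "dpcd read failed - undocked during suspend?\n");
+		goto out_fail;
+	}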
+ +Cc: Lyude Paul +Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/5292 +Signed-off-by: Imre Deak +Reviewed-by: Jani Nikula +Cc: # v5.14+ +Reviewed-by: Lyude Paul +Link: https://patchwork.freedesktop.org/patch/msgid/20220614094537.885472-1-imre.deak@intel.com +Signed-off-by: Sasha Levin +--- + drivers/gpu/drm/dp/drm_dp_mst_topology.c | 7 ++----- + 1 file changed, 2 insertions(+), 5 deletions(-) + +diff --git a/drivers/gpu/drm/dp/drm_dp_mst_topology.c b/drivers/gpu/drm/dp/drm_dp_mst_topology.c +index 7a7cc44686f9..96869875390f 100644 +--- a/drivers/gpu/drm/dp/drm_dp_mst_topology.c ++++ b/drivers/gpu/drm/dp/drm_dp_mst_topology.c +@@ -3861,9 +3861,7 @@ int drm_dp_mst_topology_mgr_resume(struct drm_dp_mst_topology_mgr *mgr, + if (!mgr->mst_primary) + goto out_fail; + +- ret = drm_dp_dpcd_read(mgr->aux, DP_DPCD_REV, mgr->dpcd, +- DP_RECEIVER_CAP_SIZE); +- if (ret != DP_RECEIVER_CAP_SIZE) { ++ if (drm_dp_read_dpcd_caps(mgr->aux, mgr->dpcd) < 0) { + drm_dbg_kms(mgr->dev, "dpcd read failed - undocked during suspend?\n"); + goto out_fail; + } +@@ -4912,8 +4910,7 @@ void drm_dp_mst_dump_topology(struct seq_file *m, + u8 buf[DP_PAYLOAD_TABLE_SIZE]; + int ret; + +- ret = drm_dp_dpcd_read(mgr->aux, DP_DPCD_REV, buf, DP_RECEIVER_CAP_SIZE); +- if (ret) { ++ if (drm_dp_read_dpcd_caps(mgr->aux, buf) < 0) { + seq_printf(m, "dpcd read failed\n"); + goto out; + } +-- +2.35.1 + diff --git a/queue-5.18/drm-mediatek-keep-dsi-as-lp00-before-dcs-cmds-transf.patch b/queue-5.18/drm-mediatek-keep-dsi-as-lp00-before-dcs-cmds-transf.patch new file mode 100644 index 00000000000..1adb3c8ff0a --- /dev/null +++ b/queue-5.18/drm-mediatek-keep-dsi-as-lp00-before-dcs-cmds-transf.patch @@ -0,0 +1,116 @@ +From 72b19d4c277cf659673dc5b4b02f03f6ea4a746e Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 20 May 2022 10:00:06 +0800 +Subject: drm/mediatek: Keep dsi as LP00 before dcs cmds transfer + +From: Jitao Shi + +[ Upstream commit 39e8d062b03c3dc257d880d82bd55cdd9e185a3b ] + +To comply with the panel sequence, hold the mipi signal to LP00 before +the dcs cmds transmission, and pull the mipi signal high from LP00 to +LP11 until the start of the dcs cmds transmission. + +The normal panel timing is : +(1) pp1800 DC pull up +(2) avdd & avee AC pull high +(3) lcm_reset pull high -> pull low -> pull high +(4) Pull MIPI signal high (LP11) -> initial code -> send video data + (HS mode) + +The power-off sequence is reversed. +If dsi is not in cmd mode, then dsi will pull the mipi signal high in +the mtk_output_dsi_enable function. The delay in lane_ready func is +the reaction time of dsi_rx after pulling up the mipi signal. 
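+
+For readability, a condensed sketch of the new gating helper (taken from the
+hunk below; the msleep(20) is the dsi_rx reaction time mentioned above, and
+the early return is an editorial rewrite of the same logic):
+
+	static void mtk_dsi_lane_ready(struct mtk_dsi *dsi)
+	{
+		if (dsi->lanes_ready)
+			return;
+		dsi->lanes_ready = true;
+		mtk_dsi_rxtx_control(dsi);
+		usleep_range(30, 100);
+		mtk_dsi_reset_dphy(dsi);
+		mtk_dsi_clk_ulp_mode_leave(dsi);
+		mtk_dsi_lane0_ulp_mode_leave(dsi);
+		mtk_dsi_clk_hs_mode(dsi, 0);
+		msleep(20);	/* let dsi_rx settle after LP00 -> LP11 */
+	}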
+ +Fixes: 2dd8075d2185 ("drm/mediatek: mtk_dsi: Use the drm_panel_bridge API") + +Link: https://patchwork.kernel.org/project/linux-mediatek/patch/1653012007-11854-4-git-send-email-xinlei.lee@mediatek.com/ +Cc: # 5.10.x: 7f6335c6a258: drm/mediatek: Modify dsi funcs to atomic operations +Cc: # 5.10.x: cde7e2e35c28: drm/mediatek: Separate poweron/poweroff from enable/disable and define new funcs +Cc: # 5.10.x +Signed-off-by: Jitao Shi +Signed-off-by: Xinlei Lee +Reviewed-by: AngeloGioacchino Del Regno +Reviewed-by: Rex-BC Chen +Signed-off-by: Chun-Kuang Hu +Signed-off-by: Sasha Levin +--- + drivers/gpu/drm/mediatek/mtk_dsi.c | 28 +++++++++++++++++++++------- + 1 file changed, 21 insertions(+), 7 deletions(-) + +diff --git a/drivers/gpu/drm/mediatek/mtk_dsi.c b/drivers/gpu/drm/mediatek/mtk_dsi.c +index f0f523bdafb8..e0a2d5ea40af 100644 +--- a/drivers/gpu/drm/mediatek/mtk_dsi.c ++++ b/drivers/gpu/drm/mediatek/mtk_dsi.c +@@ -203,6 +203,7 @@ struct mtk_dsi { + struct mtk_phy_timing phy_timing; + int refcount; + bool enabled; ++ bool lanes_ready; + u32 irq_data; + wait_queue_head_t irq_wait_queue; + const struct mtk_dsi_driver_data *driver_data; +@@ -649,18 +650,11 @@ static int mtk_dsi_poweron(struct mtk_dsi *dsi) + mtk_dsi_reset_engine(dsi); + mtk_dsi_phy_timconfig(dsi); + +- mtk_dsi_rxtx_control(dsi); +- usleep_range(30, 100); +- mtk_dsi_reset_dphy(dsi); + mtk_dsi_ps_control_vact(dsi); + mtk_dsi_set_vm_cmd(dsi); + mtk_dsi_config_vdo_timing(dsi); + mtk_dsi_set_interrupt_enable(dsi); + +- mtk_dsi_clk_ulp_mode_leave(dsi); +- mtk_dsi_lane0_ulp_mode_leave(dsi); +- mtk_dsi_clk_hs_mode(dsi, 0); +- + return 0; + err_disable_engine_clk: + clk_disable_unprepare(dsi->engine_clk); +@@ -691,6 +685,23 @@ static void mtk_dsi_poweroff(struct mtk_dsi *dsi) + clk_disable_unprepare(dsi->digital_clk); + + phy_power_off(dsi->phy); ++ ++ dsi->lanes_ready = false; ++} ++ ++static void mtk_dsi_lane_ready(struct mtk_dsi *dsi) ++{ ++ if (!dsi->lanes_ready) { ++ dsi->lanes_ready = true; ++ mtk_dsi_rxtx_control(dsi); ++ usleep_range(30, 100); ++ mtk_dsi_reset_dphy(dsi); ++ mtk_dsi_clk_ulp_mode_leave(dsi); ++ mtk_dsi_lane0_ulp_mode_leave(dsi); ++ mtk_dsi_clk_hs_mode(dsi, 0); ++ msleep(20); ++ /* The reaction time after pulling up the mipi signal for dsi_rx */ ++ } + } + + static void mtk_output_dsi_enable(struct mtk_dsi *dsi) +@@ -698,6 +709,7 @@ static void mtk_output_dsi_enable(struct mtk_dsi *dsi) + if (dsi->enabled) + return; + ++ mtk_dsi_lane_ready(dsi); + mtk_dsi_set_mode(dsi); + mtk_dsi_clk_hs_mode(dsi, 1); + +@@ -1007,6 +1019,8 @@ static ssize_t mtk_dsi_host_transfer(struct mipi_dsi_host *host, + if (MTK_DSI_HOST_IS_READ(msg->type)) + irq_flag |= LPRX_RD_RDY_INT_FLAG; + ++ mtk_dsi_lane_ready(dsi); ++ + ret = mtk_dsi_host_send_cmd(dsi, msg, irq_flag); + if (ret) + goto restore_dsi_mode; +-- +2.35.1 + diff --git a/queue-5.18/drm-vc4-drv-adopt-the-dma-configuration-from-the-hvs.patch b/queue-5.18/drm-vc4-drv-adopt-the-dma-configuration-from-the-hvs.patch new file mode 100644 index 00000000000..186edb8bc1f --- /dev/null +++ b/queue-5.18/drm-vc4-drv-adopt-the-dma-configuration-from-the-hvs.patch @@ -0,0 +1,68 @@ +From eb33b7125fa369b35e05a9779b1ee772cfe928cb Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 13 Jun 2022 16:47:28 +0200 +Subject: drm/vc4: drv: Adopt the dma configuration from the HVS or V3D + component + +From: Dave Stevenson + +[ Upstream commit da8e393e23efb60eba8959856c7df88f9859f6eb ] + +vc4_drv isn't necessarily under the /soc node in DT as it is a +virtual device, but it is the one that does 
the allocations. +The DMA addresses are consumed by primarily the HVS or V3D, and +those require VideoCore cache alias address mapping, and so will be +under /soc. + +During probe find the a suitable device node for HVS or V3D, +and adopt the DMA configuration of that node. + +Cc: +Signed-off-by: Dave Stevenson +Link: https://lore.kernel.org/r/20220613144800.326124-2-maxime@cerno.tech +Signed-off-by: Maxime Ripard +Signed-off-by: Sasha Levin +--- + drivers/gpu/drm/vc4/vc4_drv.c | 19 +++++++++++++++++++ + 1 file changed, 19 insertions(+) + +diff --git a/drivers/gpu/drm/vc4/vc4_drv.c b/drivers/gpu/drm/vc4/vc4_drv.c +index 162bc18e7497..14a7d529144d 100644 +--- a/drivers/gpu/drm/vc4/vc4_drv.c ++++ b/drivers/gpu/drm/vc4/vc4_drv.c +@@ -209,6 +209,15 @@ static void vc4_match_add_drivers(struct device *dev, + } + } + ++const struct of_device_id vc4_dma_range_matches[] = { ++ { .compatible = "brcm,bcm2711-hvs" }, ++ { .compatible = "brcm,bcm2835-hvs" }, ++ { .compatible = "brcm,bcm2835-v3d" }, ++ { .compatible = "brcm,cygnus-v3d" }, ++ { .compatible = "brcm,vc4-v3d" }, ++ {} ++}; ++ + static int vc4_drm_bind(struct device *dev) + { + struct platform_device *pdev = to_platform_device(dev); +@@ -227,6 +236,16 @@ static int vc4_drm_bind(struct device *dev) + vc4_drm_driver.driver_features &= ~DRIVER_RENDER; + of_node_put(node); + ++ node = of_find_matching_node_and_match(NULL, vc4_dma_range_matches, ++ NULL); ++ if (node) { ++ ret = of_dma_configure(dev, node, true); ++ of_node_put(node); ++ ++ if (ret) ++ return ret; ++ } ++ + vc4 = devm_drm_dev_alloc(dev, &vc4_drm_driver, struct vc4_dev, base); + if (IS_ERR(vc4)) + return PTR_ERR(vc4); +-- +2.35.1 + diff --git a/queue-5.18/ext4-add-ext4_inode_has_xattr_space-macro-in-xattr.h.patch b/queue-5.18/ext4-add-ext4_inode_has_xattr_space-macro-in-xattr.h.patch new file mode 100644 index 00000000000..e6f9e3641fc --- /dev/null +++ b/queue-5.18/ext4-add-ext4_inode_has_xattr_space-macro-in-xattr.h.patch @@ -0,0 +1,50 @@ +From cc80213fb3767c2c9e67a78719c06cb0b2f32c15 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 16 Jun 2022 10:13:55 +0800 +Subject: ext4: add EXT4_INODE_HAS_XATTR_SPACE macro in xattr.h + +From: Baokun Li + +[ Upstream commit 179b14152dcb6a24c3415200603aebca70ff13af ] + +When adding an xattr to an inode, we must ensure that the inode_size is +not less than EXT4_GOOD_OLD_INODE_SIZE + extra_isize + pad. Otherwise, +the end position may be greater than the start position, resulting in UAF. + +Signed-off-by: Baokun Li +Reviewed-by: Jan Kara +Reviewed-by: Ritesh Harjani (IBM) +Link: https://lore.kernel.org/r/20220616021358.2504451-2-libaokun1@huawei.com +Signed-off-by: Theodore Ts'o +Signed-off-by: Sasha Levin +--- + fs/ext4/xattr.h | 13 +++++++++++++ + 1 file changed, 13 insertions(+) + +diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h +index 77efb9a627ad..f885f362add4 100644 +--- a/fs/ext4/xattr.h ++++ b/fs/ext4/xattr.h +@@ -95,6 +95,19 @@ struct ext4_xattr_entry { + + #define EXT4_ZERO_XATTR_VALUE ((void *)-1) + ++/* ++ * If we want to add an xattr to the inode, we should make sure that ++ * i_extra_isize is not 0 and that the inode size is not less than ++ * EXT4_GOOD_OLD_INODE_SIZE + extra_isize + pad. 
++ * EXT4_GOOD_OLD_INODE_SIZE extra_isize header entry pad data ++ * |--------------------------|------------|------|---------|---|-------| ++ */ ++#define EXT4_INODE_HAS_XATTR_SPACE(inode) \ ++ ((EXT4_I(inode)->i_extra_isize != 0) && \ ++ (EXT4_GOOD_OLD_INODE_SIZE + EXT4_I(inode)->i_extra_isize + \ ++ sizeof(struct ext4_xattr_ibody_header) + EXT4_XATTR_PAD <= \ ++ EXT4_INODE_SIZE((inode)->i_sb))) ++ + struct ext4_xattr_info { + const char *name; + const void *value; +-- +2.35.1 + diff --git a/queue-5.18/ext4-check-if-directory-block-is-within-i_size.patch b/queue-5.18/ext4-check-if-directory-block-is-within-i_size.patch new file mode 100644 index 00000000000..429ff0ef26c --- /dev/null +++ b/queue-5.18/ext4-check-if-directory-block-is-within-i_size.patch @@ -0,0 +1,56 @@ +From 6770bf434d6a397d0ac1762555133b5cf6e7a3e8 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 4 Jul 2022 16:27:20 +0200 +Subject: ext4: check if directory block is within i_size + +From: Lukas Czerner + +[ Upstream commit 65f8ea4cd57dbd46ea13b41dc8bac03176b04233 ] + +Currently ext4 directory handling code implicitly assumes that the +directory blocks are always within the i_size. In fact ext4_append() +will attempt to allocate next directory block based solely on i_size and +the i_size is then appropriately increased after a successful +allocation. + +However, for this to work it requires i_size to be correct. If, for any +reason, the directory inode i_size is corrupted in a way that the +directory tree refers to a valid directory block past i_size, we could +end up corrupting parts of the directory tree structure by overwriting +already used directory blocks when modifying the directory. + +Fix it by catching the corruption early in __ext4_read_dirblock(). + +Addresses Red-Hat-Bugzilla: #2070205 +CVE: CVE-2022-1184 +Signed-off-by: Lukas Czerner +Cc: stable@vger.kernel.org +Reviewed-by: Andreas Dilger +Link: https://lore.kernel.org/r/20220704142721.157985-1-lczerner@redhat.com +Signed-off-by: Theodore Ts'o +Signed-off-by: Sasha Levin +--- + fs/ext4/namei.c | 7 +++++++ + 1 file changed, 7 insertions(+) + +diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c +index 4f0420b1ff3e..2bc3e4b27204 100644 +--- a/fs/ext4/namei.c ++++ b/fs/ext4/namei.c +@@ -110,6 +110,13 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode, + struct ext4_dir_entry *dirent; + int is_dx_block = 0; + ++ if (block >= inode->i_size) { ++ ext4_error_inode(inode, func, line, block, ++ "Attempting to read directory block (%u) that is past i_size (%llu)", ++ block, inode->i_size); ++ return ERR_PTR(-EFSCORRUPTED); ++ } ++ + if (ext4_simulate_fail(inode->i_sb, EXT4_SIM_DIRBLOCK_EIO)) + bh = ERR_PTR(-EIO); + else +-- +2.35.1 + diff --git a/queue-5.18/ext4-correct-max_inline_xattr_value_size-computing.patch b/queue-5.18/ext4-correct-max_inline_xattr_value_size-computing.patch new file mode 100644 index 00000000000..67a7e46f81c --- /dev/null +++ b/queue-5.18/ext4-correct-max_inline_xattr_value_size-computing.patch @@ -0,0 +1,41 @@ +From e4206366d1ea91bfb593c5f1c297f97d95cc09b7 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 16 Jun 2022 10:13:57 +0800 +Subject: ext4: correct max_inline_xattr_value_size computing + +From: Baokun Li + +[ Upstream commit c9fd167d57133c5b748d16913c4eabc55e531c73 ] + +If the ext4 inode does not have xattr space, 0 is returned in the +get_max_inline_xattr_value_size function. Otherwise, the function returns +a negative value when the inode does not contain EXT4_STATE_XATTR. 
+ +Cc: stable@kernel.org +Signed-off-by: Baokun Li +Reviewed-by: Ritesh Harjani (IBM) +Reviewed-by: Jan Kara +Link: https://lore.kernel.org/r/20220616021358.2504451-4-libaokun1@huawei.com +Signed-off-by: Theodore Ts'o +Signed-off-by: Sasha Levin +--- + fs/ext4/inline.c | 3 +++ + 1 file changed, 3 insertions(+) + +diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c +index e9ef5cf30969..84fcd06a8e8a 100644 +--- a/fs/ext4/inline.c ++++ b/fs/ext4/inline.c +@@ -35,6 +35,9 @@ static int get_max_inline_xattr_value_size(struct inode *inode, + struct ext4_inode *raw_inode; + int free, min_offs; + ++ if (!EXT4_INODE_HAS_XATTR_SPACE(inode)) ++ return 0; ++ + min_offs = EXT4_SB(inode->i_sb)->s_inode_size - + EXT4_GOOD_OLD_INODE_SIZE - + EXT4_I(inode)->i_extra_isize - +-- +2.35.1 + diff --git a/queue-5.18/ext4-correct-the-misjudgment-in-ext4_iget_extra_inod.patch b/queue-5.18/ext4-correct-the-misjudgment-in-ext4_iget_extra_inod.patch new file mode 100644 index 00000000000..45ae7da10e2 --- /dev/null +++ b/queue-5.18/ext4-correct-the-misjudgment-in-ext4_iget_extra_inod.patch @@ -0,0 +1,40 @@ +From bb87c08d8958bcedb747138758192ad8471a7f14 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 16 Jun 2022 10:13:58 +0800 +Subject: ext4: correct the misjudgment in ext4_iget_extra_inode + +From: Baokun Li + +[ Upstream commit fd7e672ea98b95b9d4c9dae316639f03c16a749d ] + +Use the EXT4_INODE_HAS_XATTR_SPACE macro to more accurately +determine whether the inode have xattr space. + +Cc: stable@kernel.org +Signed-off-by: Baokun Li +Reviewed-by: Ritesh Harjani (IBM) +Reviewed-by: Jan Kara +Link: https://lore.kernel.org/r/20220616021358.2504451-5-libaokun1@huawei.com +Signed-off-by: Theodore Ts'o +Signed-off-by: Sasha Levin +--- + fs/ext4/inode.c | 3 +-- + 1 file changed, 1 insertion(+), 2 deletions(-) + +diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c +index 826e2deb10f8..e478cac3b8f2 100644 +--- a/fs/ext4/inode.c ++++ b/fs/ext4/inode.c +@@ -4681,8 +4681,7 @@ static inline int ext4_iget_extra_inode(struct inode *inode, + __le32 *magic = (void *)raw_inode + + EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize; + +- if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize + sizeof(__le32) <= +- EXT4_INODE_SIZE(inode->i_sb) && ++ if (EXT4_INODE_HAS_XATTR_SPACE(inode) && + *magic == cpu_to_le32(EXT4_XATTR_MAGIC)) { + ext4_set_inode_state(inode, EXT4_STATE_XATTR); + return ext4_find_inline_data_nolock(inode); +-- +2.35.1 + diff --git a/queue-5.18/ext4-fix-extent-status-tree-race-in-writeback-error-.patch b/queue-5.18/ext4-fix-extent-status-tree-race-in-writeback-error-.patch new file mode 100644 index 00000000000..b190977b804 --- /dev/null +++ b/queue-5.18/ext4-fix-extent-status-tree-race-in-writeback-error-.patch @@ -0,0 +1,57 @@ +From dc4a02f5902dad0bd2ac7063ad9cc82a29f579f2 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 15 Jun 2022 12:05:30 -0400 +Subject: ext4: fix extent status tree race in writeback error recovery path + +From: Eric Whitney + +[ Upstream commit 7f0d8e1d607c1a4fa9a27362a108921d82230874 ] + +A race can occur in the unlikely event ext4 is unable to allocate a +physical cluster for a delayed allocation in a bigalloc file system +during writeback. Failure to allocate a cluster forces error recovery +that includes a call to mpage_release_unused_pages(). That function +removes any corresponding delayed allocated blocks from the extent +status tree. 
If a new delayed write is in progress on the same cluster +simultaneously, resulting in the addition of an new extent containing +one or more blocks in that cluster to the extent status tree, delayed +block accounting can be thrown off if that delayed write then encounters +a similar cluster allocation failure during future writeback. + +Write lock the i_data_sem in mpage_release_unused_pages() to fix this +problem. Ext4's block/cluster accounting code for bigalloc relies on +i_data_sem for mutual exclusion, as is found in the delayed write path, +and the locking in mpage_release_unused_pages() is missing. + +Cc: stable@kernel.org +Reported-by: Ye Bin +Signed-off-by: Eric Whitney +Link: https://lore.kernel.org/r/20220615160530.1928801-1-enwlinux@gmail.com +Signed-off-by: Theodore Ts'o +Signed-off-by: Sasha Levin +--- + fs/ext4/inode.c | 7 +++++++ + 1 file changed, 7 insertions(+) + +diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c +index beed9e32571c..826e2deb10f8 100644 +--- a/fs/ext4/inode.c ++++ b/fs/ext4/inode.c +@@ -1559,7 +1559,14 @@ static void mpage_release_unused_pages(struct mpage_da_data *mpd, + ext4_lblk_t start, last; + start = index << (PAGE_SHIFT - inode->i_blkbits); + last = end << (PAGE_SHIFT - inode->i_blkbits); ++ ++ /* ++ * avoid racing with extent status tree scans made by ++ * ext4_insert_delayed_block() ++ */ ++ down_write(&EXT4_I(inode)->i_data_sem); + ext4_es_remove_extent(inode, start, last - start + 1); ++ up_write(&EXT4_I(inode)->i_data_sem); + } + + pagevec_init(&pvec); +-- +2.35.1 + diff --git a/queue-5.18/ext4-fix-race-when-reusing-xattr-blocks.patch b/queue-5.18/ext4-fix-race-when-reusing-xattr-blocks.patch new file mode 100644 index 00000000000..e1120bd877a --- /dev/null +++ b/queue-5.18/ext4-fix-race-when-reusing-xattr-blocks.patch @@ -0,0 +1,179 @@ +From 7e96f9358fc2312891b448cce6dd758e684e2b80 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 12 Jul 2022 12:54:24 +0200 +Subject: ext4: fix race when reusing xattr blocks + +From: Jan Kara + +[ Upstream commit 65f8b80053a1b2fd602daa6814e62d6fa90e5e9b ] + +When ext4_xattr_block_set() decides to remove xattr block the following +race can happen: + +CPU1 CPU2 +ext4_xattr_block_set() ext4_xattr_release_block() + new_bh = ext4_xattr_block_cache_find() + + lock_buffer(bh); + ref = le32_to_cpu(BHDR(bh)->h_refcount); + if (ref == 1) { + ... + mb_cache_entry_delete(); + unlock_buffer(bh); + ext4_free_blocks(); + ... + ext4_forget(..., bh, ...); + jbd2_journal_revoke(..., bh); + + ext4_journal_get_write_access(..., new_bh, ...) + do_get_write_access() + jbd2_journal_cancel_revoke(..., new_bh); + +Later the code in ext4_xattr_block_set() finds out the block got freed +and cancels reusal of the block but the revoke stays canceled and so in +case of block reuse and journal replay the filesystem can get corrupted. +If the race works out slightly differently, we can also hit assertions +in the jbd2 code. + +Fix the problem by making sure that once matching mbcache entry is +found, code dropping the last xattr block reference (or trying to modify +xattr block in place) waits until the mbcache entry reference is +dropped. This way code trying to reuse xattr block is protected from +someone trying to drop the last reference to xattr block. 
+ +Reported-and-tested-by: Ritesh Harjani +CC: stable@vger.kernel.org +Fixes: 82939d7999df ("ext4: convert to mbcache2") +Signed-off-by: Jan Kara +Link: https://lore.kernel.org/r/20220712105436.32204-5-jack@suse.cz +Signed-off-by: Theodore Ts'o +Signed-off-by: Sasha Levin +--- + fs/ext4/xattr.c | 67 +++++++++++++++++++++++++++++++++---------------- + 1 file changed, 45 insertions(+), 22 deletions(-) + +diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c +index a25942a74929..533216e80fa2 100644 +--- a/fs/ext4/xattr.c ++++ b/fs/ext4/xattr.c +@@ -439,9 +439,16 @@ static int ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino, + /* Remove entry from mbcache when EA inode is getting evicted */ + void ext4_evict_ea_inode(struct inode *inode) + { +- if (EA_INODE_CACHE(inode)) +- mb_cache_entry_delete(EA_INODE_CACHE(inode), +- ext4_xattr_inode_get_hash(inode), inode->i_ino); ++ struct mb_cache_entry *oe; ++ ++ if (!EA_INODE_CACHE(inode)) ++ return; ++ /* Wait for entry to get unused so that we can remove it */ ++ while ((oe = mb_cache_entry_delete_or_get(EA_INODE_CACHE(inode), ++ ext4_xattr_inode_get_hash(inode), inode->i_ino))) { ++ mb_cache_entry_wait_unused(oe); ++ mb_cache_entry_put(EA_INODE_CACHE(inode), oe); ++ } + } + + static int +@@ -1229,6 +1236,7 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode, + if (error) + goto out; + ++retry_ref: + lock_buffer(bh); + hash = le32_to_cpu(BHDR(bh)->h_hash); + ref = le32_to_cpu(BHDR(bh)->h_refcount); +@@ -1238,9 +1246,18 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode, + * This must happen under buffer lock for + * ext4_xattr_block_set() to reliably detect freed block + */ +- if (ea_block_cache) +- mb_cache_entry_delete(ea_block_cache, hash, +- bh->b_blocknr); ++ if (ea_block_cache) { ++ struct mb_cache_entry *oe; ++ ++ oe = mb_cache_entry_delete_or_get(ea_block_cache, hash, ++ bh->b_blocknr); ++ if (oe) { ++ unlock_buffer(bh); ++ mb_cache_entry_wait_unused(oe); ++ mb_cache_entry_put(ea_block_cache, oe); ++ goto retry_ref; ++ } ++ } + get_bh(bh); + unlock_buffer(bh); + +@@ -1867,9 +1884,20 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, + * ext4_xattr_block_set() to reliably detect modified + * block + */ +- if (ea_block_cache) +- mb_cache_entry_delete(ea_block_cache, hash, +- bs->bh->b_blocknr); ++ if (ea_block_cache) { ++ struct mb_cache_entry *oe; ++ ++ oe = mb_cache_entry_delete_or_get(ea_block_cache, ++ hash, bs->bh->b_blocknr); ++ if (oe) { ++ /* ++ * Xattr block is getting reused. Leave ++ * it alone. ++ */ ++ mb_cache_entry_put(ea_block_cache, oe); ++ goto clone_block; ++ } ++ } + ea_bdebug(bs->bh, "modifying in-place"); + error = ext4_xattr_set_entry(i, s, handle, inode, + true /* is_block */); +@@ -1885,6 +1913,7 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, + goto cleanup; + goto inserted; + } ++clone_block: + unlock_buffer(bs->bh); + ea_bdebug(bs->bh, "cloning"); + s->base = kmemdup(BHDR(bs->bh), bs->bh->b_size, GFP_NOFS); +@@ -1990,18 +2019,13 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, + lock_buffer(new_bh); + /* + * We have to be careful about races with +- * freeing, rehashing or adding references to +- * xattr block. Once we hold buffer lock xattr +- * block's state is stable so we can check +- * whether the block got freed / rehashed or +- * not. Since we unhash mbcache entry under +- * buffer lock when freeing / rehashing xattr +- * block, checking whether entry is still +- * hashed is reliable. Same rules hold for +- * e_reusable handling. 
++ * adding references to xattr block. Once we ++ * hold buffer lock xattr block's state is ++ * stable so we can check the additional ++ * reference fits. + */ +- if (hlist_bl_unhashed(&ce->e_hash_list) || +- !ce->e_reusable) { ++ ref = le32_to_cpu(BHDR(new_bh)->h_refcount) + 1; ++ if (ref > EXT4_XATTR_REFCOUNT_MAX) { + /* + * Undo everything and check mbcache + * again. +@@ -2016,9 +2040,8 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, + new_bh = NULL; + goto inserted; + } +- ref = le32_to_cpu(BHDR(new_bh)->h_refcount) + 1; + BHDR(new_bh)->h_refcount = cpu_to_le32(ref); +- if (ref >= EXT4_XATTR_REFCOUNT_MAX) ++ if (ref == EXT4_XATTR_REFCOUNT_MAX) + ce->e_reusable = 0; + ea_bdebug(new_bh, "reusing; refcount now=%d", + ref); +-- +2.35.1 + diff --git a/queue-5.18/ext4-fix-use-after-free-in-ext4_xattr_set_entry.patch b/queue-5.18/ext4-fix-use-after-free-in-ext4_xattr_set_entry.patch new file mode 100644 index 00000000000..77b18e692c4 --- /dev/null +++ b/queue-5.18/ext4-fix-use-after-free-in-ext4_xattr_set_entry.patch @@ -0,0 +1,128 @@ +From a366d09886e9e7ed9fa6ffb4207af76a69c861ce Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 16 Jun 2022 10:13:56 +0800 +Subject: ext4: fix use-after-free in ext4_xattr_set_entry + +From: Baokun Li + +[ Upstream commit 67d7d8ad99beccd9fe92d585b87f1760dc9018e3 ] + +Hulk Robot reported a issue: +================================================================== +BUG: KASAN: use-after-free in ext4_xattr_set_entry+0x18ab/0x3500 +Write of size 4105 at addr ffff8881675ef5f4 by task syz-executor.0/7092 + +CPU: 1 PID: 7092 Comm: syz-executor.0 Not tainted 4.19.90-dirty #17 +Call Trace: +[...] + memcpy+0x34/0x50 mm/kasan/kasan.c:303 + ext4_xattr_set_entry+0x18ab/0x3500 fs/ext4/xattr.c:1747 + ext4_xattr_ibody_inline_set+0x86/0x2a0 fs/ext4/xattr.c:2205 + ext4_xattr_set_handle+0x940/0x1300 fs/ext4/xattr.c:2386 + ext4_xattr_set+0x1da/0x300 fs/ext4/xattr.c:2498 + __vfs_setxattr+0x112/0x170 fs/xattr.c:149 + __vfs_setxattr_noperm+0x11b/0x2a0 fs/xattr.c:180 + __vfs_setxattr_locked+0x17b/0x250 fs/xattr.c:238 + vfs_setxattr+0xed/0x270 fs/xattr.c:255 + setxattr+0x235/0x330 fs/xattr.c:520 + path_setxattr+0x176/0x190 fs/xattr.c:539 + __do_sys_lsetxattr fs/xattr.c:561 [inline] + __se_sys_lsetxattr fs/xattr.c:557 [inline] + __x64_sys_lsetxattr+0xc2/0x160 fs/xattr.c:557 + do_syscall_64+0xdf/0x530 arch/x86/entry/common.c:298 + entry_SYSCALL_64_after_hwframe+0x44/0xa9 +RIP: 0033:0x459fe9 +RSP: 002b:00007fa5e54b4c08 EFLAGS: 00000246 ORIG_RAX: 00000000000000bd +RAX: ffffffffffffffda RBX: 000000000051bf60 RCX: 0000000000459fe9 +RDX: 00000000200003c0 RSI: 0000000020000180 RDI: 0000000020000140 +RBP: 000000000051bf60 R08: 0000000000000001 R09: 0000000000000000 +R10: 0000000000001009 R11: 0000000000000246 R12: 0000000000000000 +R13: 00007ffc73c93fc0 R14: 000000000051bf60 R15: 00007fa5e54b4d80 +[...] 
+================================================================== + +Above issue may happen as follows: +------------------------------------- +ext4_xattr_set + ext4_xattr_set_handle + ext4_xattr_ibody_find + >> s->end < s->base + >> no EXT4_STATE_XATTR + >> xattr_check_inode is not executed + ext4_xattr_ibody_set + ext4_xattr_set_entry + >> size_t min_offs = s->end - s->base + >> UAF in memcpy + +we can easily reproduce this problem with the following commands: + mkfs.ext4 -F /dev/sda + mount -o debug_want_extra_isize=128 /dev/sda /mnt + touch /mnt/file + setfattr -n user.cat -v `seq -s z 4096|tr -d '[:digit:]'` /mnt/file + +In ext4_xattr_ibody_find, we have the following assignment logic: + header = IHDR(inode, raw_inode) + = raw_inode + EXT4_GOOD_OLD_INODE_SIZE + i_extra_isize + is->s.base = IFIRST(header) + = header + sizeof(struct ext4_xattr_ibody_header) + is->s.end = raw_inode + s_inode_size + +In ext4_xattr_set_entry + min_offs = s->end - s->base + = s_inode_size - EXT4_GOOD_OLD_INODE_SIZE - i_extra_isize - + sizeof(struct ext4_xattr_ibody_header) + last = s->first + free = min_offs - ((void *)last - s->base) - sizeof(__u32) + = s_inode_size - EXT4_GOOD_OLD_INODE_SIZE - i_extra_isize - + sizeof(struct ext4_xattr_ibody_header) - sizeof(__u32) + +In the calculation formula, all values except s_inode_size and +i_extra_size are fixed values. When i_extra_size is the maximum value +s_inode_size - EXT4_GOOD_OLD_INODE_SIZE, min_offs is -4 and free is -8. +The value overflows. As a result, the preceding issue is triggered when +memcpy is executed. + +Therefore, when finding xattr or setting xattr, check whether +there is space for storing xattr in the inode to resolve this issue. + +Cc: stable@kernel.org +Reported-by: Hulk Robot +Signed-off-by: Baokun Li +Reviewed-by: Ritesh Harjani (IBM) +Reviewed-by: Jan Kara +Link: https://lore.kernel.org/r/20220616021358.2504451-3-libaokun1@huawei.com +Signed-off-by: Theodore Ts'o +Signed-off-by: Sasha Levin +--- + fs/ext4/xattr.c | 6 ++++-- + 1 file changed, 4 insertions(+), 2 deletions(-) + +diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c +index 042325349098..c3c3194f3ee1 100644 +--- a/fs/ext4/xattr.c ++++ b/fs/ext4/xattr.c +@@ -2176,8 +2176,9 @@ int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i, + struct ext4_inode *raw_inode; + int error; + +- if (EXT4_I(inode)->i_extra_isize == 0) ++ if (!EXT4_INODE_HAS_XATTR_SPACE(inode)) + return 0; ++ + raw_inode = ext4_raw_inode(&is->iloc); + header = IHDR(inode, raw_inode); + is->s.base = is->s.first = IFIRST(header); +@@ -2205,8 +2206,9 @@ int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode, + struct ext4_xattr_search *s = &is->s; + int error; + +- if (EXT4_I(inode)->i_extra_isize == 0) ++ if (!EXT4_INODE_HAS_XATTR_SPACE(inode)) + return -ENOSPC; ++ + error = ext4_xattr_set_entry(i, s, handle, inode, false /* is_block */); + if (error) + return error; +-- +2.35.1 + diff --git a/queue-5.18/ext4-fix-warning-in-ext4_iomap_begin-as-race-between.patch b/queue-5.18/ext4-fix-warning-in-ext4_iomap_begin-as-race-between.patch new file mode 100644 index 00000000000..da45295d4b9 --- /dev/null +++ b/queue-5.18/ext4-fix-warning-in-ext4_iomap_begin-as-race-between.patch @@ -0,0 +1,103 @@ +From 122a6fffdeebf14f28ad593406efaa4e52613ea6 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 17 Jun 2022 09:39:35 +0800 +Subject: ext4: fix warning in ext4_iomap_begin as race between bmap and write + +From: Ye Bin + +[ Upstream commit 51ae846cff568c8c29921b1b28eb2dfbcd4ac12d ] + +We got issue 
as follows: +------------[ cut here ]------------ +WARNING: CPU: 3 PID: 9310 at fs/ext4/inode.c:3441 ext4_iomap_begin+0x182/0x5d0 +RIP: 0010:ext4_iomap_begin+0x182/0x5d0 +RSP: 0018:ffff88812460fa08 EFLAGS: 00010293 +RAX: ffff88811f168000 RBX: 0000000000000000 RCX: ffffffff97793c12 +RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000003 +RBP: ffff88812c669160 R08: ffff88811f168000 R09: ffffed10258cd20f +R10: ffff88812c669077 R11: ffffed10258cd20e R12: 0000000000000001 +R13: 00000000000000a4 R14: 000000000000000c R15: ffff88812c6691ee +FS: 00007fd0d6ff3740(0000) GS:ffff8883af180000(0000) knlGS:0000000000000000 +CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 +CR2: 00007fd0d6dda290 CR3: 0000000104a62000 CR4: 00000000000006e0 +DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 +DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 +Call Trace: + iomap_apply+0x119/0x570 + iomap_bmap+0x124/0x150 + ext4_bmap+0x14f/0x250 + bmap+0x55/0x80 + do_vfs_ioctl+0x952/0xbd0 + __x64_sys_ioctl+0xc6/0x170 + do_syscall_64+0x33/0x40 + entry_SYSCALL_64_after_hwframe+0x44/0xa9 + +Above issue may happen as follows: + bmap write +bmap + ext4_bmap + iomap_bmap + ext4_iomap_begin + ext4_file_write_iter + ext4_buffered_write_iter + generic_perform_write + ext4_da_write_begin + ext4_da_write_inline_data_begin + ext4_prepare_inline_data + ext4_create_inline_data + ext4_set_inode_flag(inode, + EXT4_INODE_INLINE_DATA); + if (WARN_ON_ONCE(ext4_has_inline_data(inode))) ->trigger bug_on + +To solved above issue hold inode lock in ext4_bamp. + +Signed-off-by: Ye Bin +Link: https://lore.kernel.org/r/20220617013935.397596-1-yebin10@huawei.com +Signed-off-by: Theodore Ts'o +Cc: stable@kernel.org +Signed-off-by: Sasha Levin +--- + fs/ext4/inode.c | 12 +++++++++--- + 1 file changed, 9 insertions(+), 3 deletions(-) + +diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c +index e478cac3b8f2..9ef6f41a5250 100644 +--- a/fs/ext4/inode.c ++++ b/fs/ext4/inode.c +@@ -3137,13 +3137,15 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block) + { + struct inode *inode = mapping->host; + journal_t *journal; ++ sector_t ret = 0; + int err; + ++ inode_lock_shared(inode); + /* + * We can get here for an inline file via the FIBMAP ioctl + */ + if (ext4_has_inline_data(inode)) +- return 0; ++ goto out; + + if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) && + test_opt(inode->i_sb, DELALLOC)) { +@@ -3182,10 +3184,14 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block) + jbd2_journal_unlock_updates(journal); + + if (err) +- return 0; ++ goto out; + } + +- return iomap_bmap(mapping, block, &ext4_iomap_ops); ++ ret = iomap_bmap(mapping, block, &ext4_iomap_ops); ++ ++out: ++ inode_unlock_shared(inode); ++ return ret; + } + + static int ext4_readpage(struct file *file, struct page *page) +-- +2.35.1 + diff --git a/queue-5.18/ext4-make-sure-ext4_append-always-allocates-new-bloc.patch b/queue-5.18/ext4-make-sure-ext4_append-always-allocates-new-bloc.patch new file mode 100644 index 00000000000..0cd8a40cd15 --- /dev/null +++ b/queue-5.18/ext4-make-sure-ext4_append-always-allocates-new-bloc.patch @@ -0,0 +1,63 @@ +From 103d5a38ce71f77d0a0ab8577c7e93d52421b67c Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 4 Jul 2022 16:27:21 +0200 +Subject: ext4: make sure ext4_append() always allocates new block + +From: Lukas Czerner + +[ Upstream commit b8a04fe77ef1360fbf73c80fddbdfeaa9407ed1b ] + +ext4_append() must always allocate a new block, otherwise we run the +risk of overwriting 
existing directory block corrupting the directory +tree in the process resulting in all manner of problems later on. + +Add a sanity check to see if the logical block is already allocated and +error out if it is. + +Cc: stable@kernel.org +Signed-off-by: Lukas Czerner +Reviewed-by: Andreas Dilger +Link: https://lore.kernel.org/r/20220704142721.157985-2-lczerner@redhat.com +Signed-off-by: Theodore Ts'o +Signed-off-by: Sasha Levin +--- + fs/ext4/namei.c | 16 ++++++++++++++++ + 1 file changed, 16 insertions(+) + +diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c +index 2bc3e4b27204..13b6265848c2 100644 +--- a/fs/ext4/namei.c ++++ b/fs/ext4/namei.c +@@ -54,6 +54,7 @@ static struct buffer_head *ext4_append(handle_t *handle, + struct inode *inode, + ext4_lblk_t *block) + { ++ struct ext4_map_blocks map; + struct buffer_head *bh; + int err; + +@@ -63,6 +64,21 @@ static struct buffer_head *ext4_append(handle_t *handle, + return ERR_PTR(-ENOSPC); + + *block = inode->i_size >> inode->i_sb->s_blocksize_bits; ++ map.m_lblk = *block; ++ map.m_len = 1; ++ ++ /* ++ * We're appending new directory block. Make sure the block is not ++ * allocated yet, otherwise we will end up corrupting the ++ * directory. ++ */ ++ err = ext4_map_blocks(NULL, inode, &map, 0); ++ if (err < 0) ++ return ERR_PTR(err); ++ if (err) { ++ EXT4_ERROR_INODE(inode, "Logical block already allocated"); ++ return ERR_PTR(-EFSCORRUPTED); ++ } + + bh = ext4_bread(handle, inode, *block, EXT4_GET_BLOCKS_CREATE); + if (IS_ERR(bh)) +-- +2.35.1 + diff --git a/queue-5.18/ext4-remove-ea-inode-entry-from-mbcache-on-inode-evi.patch b/queue-5.18/ext4-remove-ea-inode-entry-from-mbcache-on-inode-evi.patch new file mode 100644 index 00000000000..dfd57e50127 --- /dev/null +++ b/queue-5.18/ext4-remove-ea-inode-entry-from-mbcache-on-inode-evi.patch @@ -0,0 +1,116 @@ +From 61a993fc39a5ec6129c04ea49d65c2173d1071a4 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 12 Jul 2022 12:54:22 +0200 +Subject: ext4: remove EA inode entry from mbcache on inode eviction + +From: Jan Kara + +[ Upstream commit 6bc0d63dad7f9f54d381925ee855b402f652fa39 ] + +Currently we remove EA inode from mbcache as soon as its xattr refcount +drops to zero. However there can be pending attempts to reuse the inode +and thus refcount handling code has to handle the situation when +refcount increases from zero anyway. So save some work and just keep EA +inode in mbcache until it is getting evicted. At that moment we are sure +following iget() of EA inode will fail anyway (or wait for eviction to +finish and load things from the disk again) and so removing mbcache +entry at that moment is fine and simplifies the code a bit. 
+ +CC: stable@vger.kernel.org +Fixes: 82939d7999df ("ext4: convert to mbcache2") +Signed-off-by: Jan Kara +Link: https://lore.kernel.org/r/20220712105436.32204-3-jack@suse.cz +Signed-off-by: Theodore Ts'o +Signed-off-by: Sasha Levin +--- + fs/ext4/inode.c | 2 ++ + fs/ext4/xattr.c | 24 ++++++++---------------- + fs/ext4/xattr.h | 1 + + 3 files changed, 11 insertions(+), 16 deletions(-) + +diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c +index 9ef6f41a5250..e94ec798dce1 100644 +--- a/fs/ext4/inode.c ++++ b/fs/ext4/inode.c +@@ -178,6 +178,8 @@ void ext4_evict_inode(struct inode *inode) + + trace_ext4_evict_inode(inode); + ++ if (EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL) ++ ext4_evict_ea_inode(inode); + if (inode->i_nlink) { + /* + * When journalling data dirty buffers are tracked only in the +diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c +index c3c3194f3ee1..b57fd07fbdba 100644 +--- a/fs/ext4/xattr.c ++++ b/fs/ext4/xattr.c +@@ -436,6 +436,14 @@ static int ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino, + return err; + } + ++/* Remove entry from mbcache when EA inode is getting evicted */ ++void ext4_evict_ea_inode(struct inode *inode) ++{ ++ if (EA_INODE_CACHE(inode)) ++ mb_cache_entry_delete(EA_INODE_CACHE(inode), ++ ext4_xattr_inode_get_hash(inode), inode->i_ino); ++} ++ + static int + ext4_xattr_inode_verify_hashes(struct inode *ea_inode, + struct ext4_xattr_entry *entry, void *buffer, +@@ -976,10 +984,8 @@ int __ext4_xattr_set_credits(struct super_block *sb, struct inode *inode, + static int ext4_xattr_inode_update_ref(handle_t *handle, struct inode *ea_inode, + int ref_change) + { +- struct mb_cache *ea_inode_cache = EA_INODE_CACHE(ea_inode); + struct ext4_iloc iloc; + s64 ref_count; +- u32 hash; + int ret; + + inode_lock(ea_inode); +@@ -1002,14 +1008,6 @@ static int ext4_xattr_inode_update_ref(handle_t *handle, struct inode *ea_inode, + + set_nlink(ea_inode, 1); + ext4_orphan_del(handle, ea_inode); +- +- if (ea_inode_cache) { +- hash = ext4_xattr_inode_get_hash(ea_inode); +- mb_cache_entry_create(ea_inode_cache, +- GFP_NOFS, hash, +- ea_inode->i_ino, +- true /* reusable */); +- } + } + } else { + WARN_ONCE(ref_count < 0, "EA inode %lu ref_count=%lld", +@@ -1022,12 +1020,6 @@ static int ext4_xattr_inode_update_ref(handle_t *handle, struct inode *ea_inode, + + clear_nlink(ea_inode); + ext4_orphan_add(handle, ea_inode); +- +- if (ea_inode_cache) { +- hash = ext4_xattr_inode_get_hash(ea_inode); +- mb_cache_entry_delete(ea_inode_cache, hash, +- ea_inode->i_ino); +- } + } + } + +diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h +index f885f362add4..e5e36bd11f05 100644 +--- a/fs/ext4/xattr.h ++++ b/fs/ext4/xattr.h +@@ -191,6 +191,7 @@ extern void ext4_xattr_inode_array_free(struct ext4_xattr_inode_array *array); + + extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, + struct ext4_inode *raw_inode, handle_t *handle); ++extern void ext4_evict_ea_inode(struct inode *inode); + + extern const struct xattr_handler *ext4_xattr_handlers[]; + +-- +2.35.1 + diff --git a/queue-5.18/ext4-unindent-codeblock-in-ext4_xattr_block_set.patch b/queue-5.18/ext4-unindent-codeblock-in-ext4_xattr_block_set.patch new file mode 100644 index 00000000000..f1293e37b99 --- /dev/null +++ b/queue-5.18/ext4-unindent-codeblock-in-ext4_xattr_block_set.patch @@ -0,0 +1,125 @@ +From 08077db477ddbef197b91281d9078043da9adb08 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 12 Jul 2022 12:54:23 +0200 +Subject: ext4: unindent codeblock in ext4_xattr_block_set() + +From: Jan Kara + 
+[ Upstream commit fd48e9acdf26d0cbd80051de07d4a735d05d29b2 ] + +Remove unnecessary else (and thus indentation level) from a code block +in ext4_xattr_block_set(). It will also make following code changes +easier. No functional changes. + +CC: stable@vger.kernel.org +Fixes: 82939d7999df ("ext4: convert to mbcache2") +Signed-off-by: Jan Kara +Link: https://lore.kernel.org/r/20220712105436.32204-4-jack@suse.cz +Signed-off-by: Theodore Ts'o +Signed-off-by: Sasha Levin +--- + fs/ext4/xattr.c | 77 ++++++++++++++++++++++++------------------------- + 1 file changed, 38 insertions(+), 39 deletions(-) + +diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c +index d92d50de5a01..a25942a74929 100644 +--- a/fs/ext4/xattr.c ++++ b/fs/ext4/xattr.c +@@ -1850,6 +1850,8 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, + #define header(x) ((struct ext4_xattr_header *)(x)) + + if (s->base) { ++ int offset = (char *)s->here - bs->bh->b_data; ++ + BUFFER_TRACE(bs->bh, "get_write_access"); + error = ext4_journal_get_write_access(handle, sb, bs->bh, + EXT4_JTR_NONE); +@@ -1882,49 +1884,46 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, + if (error) + goto cleanup; + goto inserted; +- } else { +- int offset = (char *)s->here - bs->bh->b_data; ++ } ++ unlock_buffer(bs->bh); ++ ea_bdebug(bs->bh, "cloning"); ++ s->base = kmemdup(BHDR(bs->bh), bs->bh->b_size, GFP_NOFS); ++ error = -ENOMEM; ++ if (s->base == NULL) ++ goto cleanup; ++ s->first = ENTRY(header(s->base)+1); ++ header(s->base)->h_refcount = cpu_to_le32(1); ++ s->here = ENTRY(s->base + offset); ++ s->end = s->base + bs->bh->b_size; + +- unlock_buffer(bs->bh); +- ea_bdebug(bs->bh, "cloning"); +- s->base = kmemdup(BHDR(bs->bh), bs->bh->b_size, GFP_NOFS); +- error = -ENOMEM; +- if (s->base == NULL) ++ /* ++ * If existing entry points to an xattr inode, we need ++ * to prevent ext4_xattr_set_entry() from decrementing ++ * ref count on it because the reference belongs to the ++ * original block. In this case, make the entry look ++ * like it has an empty value. ++ */ ++ if (!s->not_found && s->here->e_value_inum) { ++ ea_ino = le32_to_cpu(s->here->e_value_inum); ++ error = ext4_xattr_inode_iget(inode, ea_ino, ++ le32_to_cpu(s->here->e_hash), ++ &tmp_inode); ++ if (error) + goto cleanup; +- s->first = ENTRY(header(s->base)+1); +- header(s->base)->h_refcount = cpu_to_le32(1); +- s->here = ENTRY(s->base + offset); +- s->end = s->base + bs->bh->b_size; + +- /* +- * If existing entry points to an xattr inode, we need +- * to prevent ext4_xattr_set_entry() from decrementing +- * ref count on it because the reference belongs to the +- * original block. In this case, make the entry look +- * like it has an empty value. +- */ +- if (!s->not_found && s->here->e_value_inum) { +- ea_ino = le32_to_cpu(s->here->e_value_inum); +- error = ext4_xattr_inode_iget(inode, ea_ino, +- le32_to_cpu(s->here->e_hash), +- &tmp_inode); +- if (error) +- goto cleanup; +- +- if (!ext4_test_inode_state(tmp_inode, +- EXT4_STATE_LUSTRE_EA_INODE)) { +- /* +- * Defer quota free call for previous +- * inode until success is guaranteed. +- */ +- old_ea_inode_quota = le32_to_cpu( +- s->here->e_value_size); +- } +- iput(tmp_inode); +- +- s->here->e_value_inum = 0; +- s->here->e_value_size = 0; ++ if (!ext4_test_inode_state(tmp_inode, ++ EXT4_STATE_LUSTRE_EA_INODE)) { ++ /* ++ * Defer quota free call for previous ++ * inode until success is guaranteed. 
++ */ ++ old_ea_inode_quota = le32_to_cpu( ++ s->here->e_value_size); + } ++ iput(tmp_inode); ++ ++ s->here->e_value_inum = 0; ++ s->here->e_value_size = 0; + } + } else { + /* Allocate a buffer where we construct the new block. */ +-- +2.35.1 + diff --git a/queue-5.18/ext4-update-s_overhead_clusters-in-the-superblock-du.patch b/queue-5.18/ext4-update-s_overhead_clusters-in-the-superblock-du.patch new file mode 100644 index 00000000000..04936321686 --- /dev/null +++ b/queue-5.18/ext4-update-s_overhead_clusters-in-the-superblock-du.patch @@ -0,0 +1,51 @@ +From 17944868e9c621e876595423dbbf4c4660b651fd Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 29 Jun 2022 00:00:25 -0400 +Subject: ext4: update s_overhead_clusters in the superblock during an on-line + resize + +From: Theodore Ts'o + +[ Upstream commit de394a86658ffe4e89e5328fd4993abfe41b7435 ] + +When doing an online resize, the on-disk superblock on-disk wasn't +updated. This means that when the file system is unmounted and +remounted, and the on-disk overhead value is non-zero, this would +result in the results of statfs(2) to be incorrect. + +This was partially fixed by Commits 10b01ee92df5 ("ext4: fix overhead +calculation to account for the reserved gdt blocks"), 85d825dbf489 +("ext4: force overhead calculation if the s_overhead_cluster makes no +sense"), and eb7054212eac ("ext4: update the cached overhead value in +the superblock"). + +However, since it was too expensive to forcibly recalculate the +overhead for bigalloc file systems at every mount, this didn't fix the +problem for bigalloc file systems. This commit should address the +problem when resizing file systems with the bigalloc feature enabled. + +Signed-off-by: Theodore Ts'o +Cc: stable@kernel.org +Reviewed-by: Andreas Dilger +Link: https://lore.kernel.org/r/20220629040026.112371-1-tytso@mit.edu +Signed-off-by: Theodore Ts'o +Signed-off-by: Sasha Levin +--- + fs/ext4/resize.c | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c +index 8b70a4701293..e5c2713aa11a 100644 +--- a/fs/ext4/resize.c ++++ b/fs/ext4/resize.c +@@ -1484,6 +1484,7 @@ static void ext4_update_super(struct super_block *sb, + * Update the fs overhead information + */ + ext4_calculate_overhead(sb); ++ es->s_overhead_clusters = cpu_to_le32(sbi->s_overhead); + + if (test_opt(sb, DEBUG)) + printk(KERN_DEBUG "EXT4-fs: added group %u:" +-- +2.35.1 + diff --git a/queue-5.18/ext4-use-kmemdup-to-replace-kmalloc-memcpy.patch b/queue-5.18/ext4-use-kmemdup-to-replace-kmalloc-memcpy.patch new file mode 100644 index 00000000000..3d0c9acbd89 --- /dev/null +++ b/queue-5.18/ext4-use-kmemdup-to-replace-kmalloc-memcpy.patch @@ -0,0 +1,40 @@ +From 7f40ec1ab39f18dae16aba7df11a741b3bc968a1 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 25 May 2022 11:01:20 +0800 +Subject: ext4: use kmemdup() to replace kmalloc + memcpy + +From: Shuqi Zhang + +[ Upstream commit 4efd9f0d120c55b08852ee5605dbb02a77089a5d ] + +Replace kmalloc + memcpy with kmemdup() + +Signed-off-by: Shuqi Zhang +Reviewed-by: Ritesh Harjani +Link: https://lore.kernel.org/r/20220525030120.803330-1-zhangshuqi3@huawei.com +Signed-off-by: Theodore Ts'o +Signed-off-by: Sasha Levin +--- + fs/ext4/xattr.c | 3 +-- + 1 file changed, 1 insertion(+), 2 deletions(-) + +diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c +index b57fd07fbdba..d92d50de5a01 100644 +--- a/fs/ext4/xattr.c ++++ b/fs/ext4/xattr.c +@@ -1887,11 +1887,10 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, + + unlock_buffer(bs->bh); + 
ea_bdebug(bs->bh, "cloning"); +- s->base = kmalloc(bs->bh->b_size, GFP_NOFS); ++ s->base = kmemdup(BHDR(bs->bh), bs->bh->b_size, GFP_NOFS); + error = -ENOMEM; + if (s->base == NULL) + goto cleanup; +- memcpy(s->base, BHDR(bs->bh), bs->bh->b_size); + s->first = ENTRY(header(s->base)+1); + header(s->base)->h_refcount = cpu_to_le32(1); + s->here = ENTRY(s->base + offset); +-- +2.35.1 + diff --git a/queue-5.18/firmware-arm_scpi-ensure-scpi_info-is-not-assigned-i.patch b/queue-5.18/firmware-arm_scpi-ensure-scpi_info-is-not-assigned-i.patch new file mode 100644 index 00000000000..fa70e82f929 --- /dev/null +++ b/queue-5.18/firmware-arm_scpi-ensure-scpi_info-is-not-assigned-i.patch @@ -0,0 +1,156 @@ +From 20eafffdcdc724c0cbbd8582b9bdedc47de0f785 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 1 Jul 2022 17:03:10 +0100 +Subject: firmware: arm_scpi: Ensure scpi_info is not assigned if the probe + fails + +From: Sudeep Holla + +[ Upstream commit 689640efc0a2c4e07e6f88affe6d42cd40cc3f85 ] + +When scpi probe fails, at any point, we need to ensure that the scpi_info +is not set and will remain NULL until the probe succeeds. If it is not +taken care, then it could result use-after-free as the value is exported +via get_scpi_ops() and could refer to a memory allocated via devm_kzalloc() +but freed when the probe fails. + +Link: https://lore.kernel.org/r/20220701160310.148344-1-sudeep.holla@arm.com +Cc: stable@vger.kernel.org # 4.19+ +Reported-by: huhai +Reviewed-by: Jackie Liu +Signed-off-by: Sudeep Holla +Signed-off-by: Sasha Levin +--- + drivers/firmware/arm_scpi.c | 61 +++++++++++++++++++++---------------- + 1 file changed, 35 insertions(+), 26 deletions(-) + +diff --git a/drivers/firmware/arm_scpi.c b/drivers/firmware/arm_scpi.c +index ddf0b9ff9e15..435d0e2658a4 100644 +--- a/drivers/firmware/arm_scpi.c ++++ b/drivers/firmware/arm_scpi.c +@@ -815,7 +815,7 @@ static int scpi_init_versions(struct scpi_drvinfo *info) + info->firmware_version = le32_to_cpu(caps.platform_version); + } + /* Ignore error if not implemented */ +- if (scpi_info->is_legacy && ret == -EOPNOTSUPP) ++ if (info->is_legacy && ret == -EOPNOTSUPP) + return 0; + + return ret; +@@ -913,13 +913,14 @@ static int scpi_probe(struct platform_device *pdev) + struct resource res; + struct device *dev = &pdev->dev; + struct device_node *np = dev->of_node; ++ struct scpi_drvinfo *scpi_drvinfo; + +- scpi_info = devm_kzalloc(dev, sizeof(*scpi_info), GFP_KERNEL); +- if (!scpi_info) ++ scpi_drvinfo = devm_kzalloc(dev, sizeof(*scpi_drvinfo), GFP_KERNEL); ++ if (!scpi_drvinfo) + return -ENOMEM; + + if (of_match_device(legacy_scpi_of_match, &pdev->dev)) +- scpi_info->is_legacy = true; ++ scpi_drvinfo->is_legacy = true; + + count = of_count_phandle_with_args(np, "mboxes", "#mbox-cells"); + if (count < 0) { +@@ -927,19 +928,19 @@ static int scpi_probe(struct platform_device *pdev) + return -ENODEV; + } + +- scpi_info->channels = devm_kcalloc(dev, count, sizeof(struct scpi_chan), +- GFP_KERNEL); +- if (!scpi_info->channels) ++ scpi_drvinfo->channels = ++ devm_kcalloc(dev, count, sizeof(struct scpi_chan), GFP_KERNEL); ++ if (!scpi_drvinfo->channels) + return -ENOMEM; + +- ret = devm_add_action(dev, scpi_free_channels, scpi_info); ++ ret = devm_add_action(dev, scpi_free_channels, scpi_drvinfo); + if (ret) + return ret; + +- for (; scpi_info->num_chans < count; scpi_info->num_chans++) { ++ for (; scpi_drvinfo->num_chans < count; scpi_drvinfo->num_chans++) { + resource_size_t size; +- int idx = scpi_info->num_chans; +- struct scpi_chan *pchan = 
scpi_info->channels + idx; ++ int idx = scpi_drvinfo->num_chans; ++ struct scpi_chan *pchan = scpi_drvinfo->channels + idx; + struct mbox_client *cl = &pchan->cl; + struct device_node *shmem = of_parse_phandle(np, "shmem", idx); + +@@ -986,45 +987,53 @@ static int scpi_probe(struct platform_device *pdev) + return ret; + } + +- scpi_info->commands = scpi_std_commands; ++ scpi_drvinfo->commands = scpi_std_commands; + +- platform_set_drvdata(pdev, scpi_info); ++ platform_set_drvdata(pdev, scpi_drvinfo); + +- if (scpi_info->is_legacy) { ++ if (scpi_drvinfo->is_legacy) { + /* Replace with legacy variants */ + scpi_ops.clk_set_val = legacy_scpi_clk_set_val; +- scpi_info->commands = scpi_legacy_commands; ++ scpi_drvinfo->commands = scpi_legacy_commands; + + /* Fill priority bitmap */ + for (idx = 0; idx < ARRAY_SIZE(legacy_hpriority_cmds); idx++) + set_bit(legacy_hpriority_cmds[idx], +- scpi_info->cmd_priority); ++ scpi_drvinfo->cmd_priority); + } + +- ret = scpi_init_versions(scpi_info); ++ scpi_info = scpi_drvinfo; ++ ++ ret = scpi_init_versions(scpi_drvinfo); + if (ret) { + dev_err(dev, "incorrect or no SCP firmware found\n"); ++ scpi_info = NULL; + return ret; + } + +- if (scpi_info->is_legacy && !scpi_info->protocol_version && +- !scpi_info->firmware_version) ++ if (scpi_drvinfo->is_legacy && !scpi_drvinfo->protocol_version && ++ !scpi_drvinfo->firmware_version) + dev_info(dev, "SCP Protocol legacy pre-1.0 firmware\n"); + else + dev_info(dev, "SCP Protocol %lu.%lu Firmware %lu.%lu.%lu version\n", + FIELD_GET(PROTO_REV_MAJOR_MASK, +- scpi_info->protocol_version), ++ scpi_drvinfo->protocol_version), + FIELD_GET(PROTO_REV_MINOR_MASK, +- scpi_info->protocol_version), ++ scpi_drvinfo->protocol_version), + FIELD_GET(FW_REV_MAJOR_MASK, +- scpi_info->firmware_version), ++ scpi_drvinfo->firmware_version), + FIELD_GET(FW_REV_MINOR_MASK, +- scpi_info->firmware_version), ++ scpi_drvinfo->firmware_version), + FIELD_GET(FW_REV_PATCH_MASK, +- scpi_info->firmware_version)); +- scpi_info->scpi_ops = &scpi_ops; ++ scpi_drvinfo->firmware_version)); ++ ++ scpi_drvinfo->scpi_ops = &scpi_ops; + +- return devm_of_platform_populate(dev); ++ ret = devm_of_platform_populate(dev); ++ if (ret) ++ scpi_info = NULL; ++ ++ return ret; + } + + static const struct of_device_id scpi_of_match[] = { +-- +2.35.1 + diff --git a/queue-5.18/ftrace-x86-add-back-ftrace_expected-assignment.patch-2936 b/queue-5.18/ftrace-x86-add-back-ftrace_expected-assignment.patch-2936 new file mode 100644 index 00000000000..5be2f7ed9d0 --- /dev/null +++ b/queue-5.18/ftrace-x86-add-back-ftrace_expected-assignment.patch-2936 @@ -0,0 +1,49 @@ +From e881657d8ff0377cfd861e664e7e198f7d0ca102 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 26 Jul 2022 10:18:51 -0400 +Subject: ftrace/x86: Add back ftrace_expected assignment + +From: Steven Rostedt (Google) + +[ Upstream commit ac6c1b2ca77e722a1e5d651f12f437f2f237e658 ] + +When a ftrace_bug happens (where ftrace fails to modify a location) it is +helpful to have what was at that location as well as what was expected to +be there. + +But with the conversion to text_poke() the variable that assigns the +expected for debugging was dropped. Unfortunately, I noticed this when I +needed it. Add it back. + +Link: https://lkml.kernel.org/r/20220726101851.069d2e70@gandalf.local.home + +Cc: "x86@kernel.org" +Cc: Peter Zijlstra +Cc: Thomas Gleixner +Cc: Ingo Molnar +Cc: Borislav Petkov +Cc: "H. 
Peter Anvin" +Cc: Andrew Morton +Cc: stable@vger.kernel.org +Fixes: 768ae4406a5c ("x86/ftrace: Use text_poke()") +Signed-off-by: Steven Rostedt (Google) +Signed-off-by: Sasha Levin +--- + arch/x86/kernel/ftrace.c | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c +index 6892ca67d9c6..b6d7ece7bf51 100644 +--- a/arch/x86/kernel/ftrace.c ++++ b/arch/x86/kernel/ftrace.c +@@ -93,6 +93,7 @@ static int ftrace_verify_code(unsigned long ip, const char *old_code) + + /* Make sure it is what we expect it to be */ + if (memcmp(cur_code, old_code, MCOUNT_INSN_SIZE) != 0) { ++ ftrace_expected = old_code; + WARN_ON(1); + return -EINVAL; + } +-- +2.35.1 + diff --git a/queue-5.18/hugetlb_cgroup-fix-wrong-hugetlb-cgroup-numa-stat.patch b/queue-5.18/hugetlb_cgroup-fix-wrong-hugetlb-cgroup-numa-stat.patch new file mode 100644 index 00000000000..8acc9d4141b --- /dev/null +++ b/queue-5.18/hugetlb_cgroup-fix-wrong-hugetlb-cgroup-numa-stat.patch @@ -0,0 +1,43 @@ +From 2cd3814758463e87405049bb89f7940da1dc7c1b Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Sat, 23 Jul 2022 15:38:04 +0800 +Subject: hugetlb_cgroup: fix wrong hugetlb cgroup numa stat + +From: Miaohe Lin + +[ Upstream commit 2727cfe4072a35ce813e3708f74c135de7da8897 ] + +We forget to set cft->private for numa stat file. As a result, numa stat +of hstates[0] is always showed for all hstates. Encode the hstates index +into cft->private to fix this issue. + +Link: https://lkml.kernel.org/r/20220723073804.53035-1-linmiaohe@huawei.com +Fixes: f47761999052 ("hugetlb: add hugetlb.*.numa_stat file") +Signed-off-by: Miaohe Lin +Acked-by: Muchun Song +Cc: Kees Cook +Cc: Mike Kravetz +Cc: Mina Almasry +Cc: Shakeel Butt +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Sasha Levin +--- + mm/hugetlb_cgroup.c | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c +index f9942841df18..c86691c431fd 100644 +--- a/mm/hugetlb_cgroup.c ++++ b/mm/hugetlb_cgroup.c +@@ -772,6 +772,7 @@ static void __init __hugetlb_cgroup_file_dfl_init(int idx) + /* Add the numa stat file */ + cft = &h->cgroup_files_dfl[6]; + snprintf(cft->name, MAX_CFTYPE_NAME, "%s.numa_stat", buf); ++ cft->private = MEMFILE_PRIVATE(idx, 0); + cft->seq_show = hugetlb_cgroup_read_numa_stat; + cft->flags = CFTYPE_NOT_ON_ROOT; + +-- +2.35.1 + diff --git a/queue-5.18/input-gscps2-check-return-value-of-ioremap-in-gscps2.patch b/queue-5.18/input-gscps2-check-return-value-of-ioremap-in-gscps2.patch new file mode 100644 index 00000000000..f07477db862 --- /dev/null +++ b/queue-5.18/input-gscps2-check-return-value-of-ioremap-in-gscps2.patch @@ -0,0 +1,40 @@ +From 6674953f1a89b966ac5c94dac089f2b80d9e9932 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 2 Aug 2022 15:20:33 +0800 +Subject: Input: gscps2 - check return value of ioremap() in gscps2_probe() + +From: Xie Shaowen + +[ Upstream commit e61b3125a4f036b3c6b87ffd656fc1ab00440ae9 ] + +The function ioremap() in gscps2_probe() can fail, so +its return value should be checked. 
+ +Fixes: 4bdc0d676a643 ("remove ioremap_nocache and devm_ioremap_nocache") +Cc: # v5.6+ +Reported-by: Hacash Robot +Signed-off-by: Xie Shaowen +Signed-off-by: Helge Deller +Signed-off-by: Sasha Levin +--- + drivers/input/serio/gscps2.c | 4 ++++ + 1 file changed, 4 insertions(+) + +diff --git a/drivers/input/serio/gscps2.c b/drivers/input/serio/gscps2.c +index a9065c6ab550..da2c67cb8642 100644 +--- a/drivers/input/serio/gscps2.c ++++ b/drivers/input/serio/gscps2.c +@@ -350,6 +350,10 @@ static int __init gscps2_probe(struct parisc_device *dev) + ps2port->port = serio; + ps2port->padev = dev; + ps2port->addr = ioremap(hpa, GSC_STATUS + 4); ++ if (!ps2port->addr) { ++ ret = -ENOMEM; ++ goto fail_nomem; ++ } + spin_lock_init(&ps2port->lock); + + gscps2_reset(ps2port); +-- +2.35.1 + diff --git a/queue-5.18/intel_idle-add-alderlake-support.patch b/queue-5.18/intel_idle-add-alderlake-support.patch new file mode 100644 index 00000000000..56730c36316 --- /dev/null +++ b/queue-5.18/intel_idle-add-alderlake-support.patch @@ -0,0 +1,213 @@ +From c13505c36b2642150596f12cc6b74f630ad22b45 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 15 Apr 2022 17:39:51 +0800 +Subject: intel_idle: Add AlderLake support + +From: Zhang Rui + +[ Upstream commit d1cf8bbfed1edc5108220342ab39e4544d55fbc3 ] + +Similar to SPR, the C1 and C1E states on ADL are mutually exclusive. +Only one of them can be enabled at a time. + +But contrast to SPR, which usually has a strong latency requirement +as a Xeon processor, C1E is preferred on ADL for better energy +efficiency. + +Add custom C-state tables for ADL with both C1 and C1E, and + + 1. Enable the "C1E promotion" bit in MSR_IA32_POWER_CTL and mark C1 + with the CPUIDLE_FLAG_UNUSABLE flag, so C1 is not available by + default. + + 2. Add support for the "preferred_cstates" module parameter, so that + users can choose to use C1 instead of C1E by booting with + "intel_idle.preferred_cstates=2". + +Separate custom C-state tables are introduced for the ADL mobile and +desktop processors, because of the exit latency differences between +these two variants, especially with respect to PC10. + +Signed-off-by: Zhang Rui +[ rjw: Changelog edits, code rearrangement ] +Signed-off-by: Rafael J. Wysocki +Signed-off-by: Sasha Levin +--- + drivers/idle/intel_idle.c | 133 ++++++++++++++++++++++++++++++++++++++ + 1 file changed, 133 insertions(+) + +diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c +index 47b68c6071be..907700d1e78e 100644 +--- a/drivers/idle/intel_idle.c ++++ b/drivers/idle/intel_idle.c +@@ -811,6 +811,106 @@ static struct cpuidle_state icx_cstates[] __initdata = { + .enter = NULL } + }; + ++/* ++ * On AlderLake C1 has to be disabled if C1E is enabled, and vice versa. ++ * C1E is enabled only if "C1E promotion" bit is set in MSR_IA32_POWER_CTL. ++ * But in this case there is effectively no C1, because C1 requests are ++ * promoted to C1E. If the "C1E promotion" bit is cleared, then both C1 ++ * and C1E requests end up with C1, so there is effectively no C1E. ++ * ++ * By default we enable C1E and disable C1 by marking it with ++ * 'CPUIDLE_FLAG_UNUSABLE'. 
++ */ ++static struct cpuidle_state adl_cstates[] __initdata = { ++ { ++ .name = "C1", ++ .desc = "MWAIT 0x00", ++ .flags = MWAIT2flg(0x00) | CPUIDLE_FLAG_UNUSABLE, ++ .exit_latency = 1, ++ .target_residency = 1, ++ .enter = &intel_idle, ++ .enter_s2idle = intel_idle_s2idle, }, ++ { ++ .name = "C1E", ++ .desc = "MWAIT 0x01", ++ .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE, ++ .exit_latency = 2, ++ .target_residency = 4, ++ .enter = &intel_idle, ++ .enter_s2idle = intel_idle_s2idle, }, ++ { ++ .name = "C6", ++ .desc = "MWAIT 0x20", ++ .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED, ++ .exit_latency = 220, ++ .target_residency = 600, ++ .enter = &intel_idle, ++ .enter_s2idle = intel_idle_s2idle, }, ++ { ++ .name = "C8", ++ .desc = "MWAIT 0x40", ++ .flags = MWAIT2flg(0x40) | CPUIDLE_FLAG_TLB_FLUSHED, ++ .exit_latency = 280, ++ .target_residency = 800, ++ .enter = &intel_idle, ++ .enter_s2idle = intel_idle_s2idle, }, ++ { ++ .name = "C10", ++ .desc = "MWAIT 0x60", ++ .flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED, ++ .exit_latency = 680, ++ .target_residency = 2000, ++ .enter = &intel_idle, ++ .enter_s2idle = intel_idle_s2idle, }, ++ { ++ .enter = NULL } ++}; ++ ++static struct cpuidle_state adl_l_cstates[] __initdata = { ++ { ++ .name = "C1", ++ .desc = "MWAIT 0x00", ++ .flags = MWAIT2flg(0x00) | CPUIDLE_FLAG_UNUSABLE, ++ .exit_latency = 1, ++ .target_residency = 1, ++ .enter = &intel_idle, ++ .enter_s2idle = intel_idle_s2idle, }, ++ { ++ .name = "C1E", ++ .desc = "MWAIT 0x01", ++ .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE, ++ .exit_latency = 2, ++ .target_residency = 4, ++ .enter = &intel_idle, ++ .enter_s2idle = intel_idle_s2idle, }, ++ { ++ .name = "C6", ++ .desc = "MWAIT 0x20", ++ .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED, ++ .exit_latency = 170, ++ .target_residency = 500, ++ .enter = &intel_idle, ++ .enter_s2idle = intel_idle_s2idle, }, ++ { ++ .name = "C8", ++ .desc = "MWAIT 0x40", ++ .flags = MWAIT2flg(0x40) | CPUIDLE_FLAG_TLB_FLUSHED, ++ .exit_latency = 200, ++ .target_residency = 600, ++ .enter = &intel_idle, ++ .enter_s2idle = intel_idle_s2idle, }, ++ { ++ .name = "C10", ++ .desc = "MWAIT 0x60", ++ .flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED, ++ .exit_latency = 230, ++ .target_residency = 700, ++ .enter = &intel_idle, ++ .enter_s2idle = intel_idle_s2idle, }, ++ { ++ .enter = NULL } ++}; ++ + /* + * On Sapphire Rapids Xeon C1 has to be disabled if C1E is enabled, and vice + * versa. 
On SPR C1E is enabled only if "C1E promotion" bit is set in +@@ -1194,6 +1294,14 @@ static const struct idle_cpu idle_cpu_icx __initconst = { + .use_acpi = true, + }; + ++static const struct idle_cpu idle_cpu_adl __initconst = { ++ .state_table = adl_cstates, ++}; ++ ++static const struct idle_cpu idle_cpu_adl_l __initconst = { ++ .state_table = adl_l_cstates, ++}; ++ + static const struct idle_cpu idle_cpu_spr __initconst = { + .state_table = spr_cstates, + .disable_promotion_to_c1e = true, +@@ -1262,6 +1370,8 @@ static const struct x86_cpu_id intel_idle_ids[] __initconst = { + X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_X, &idle_cpu_skx), + X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, &idle_cpu_icx), + X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, &idle_cpu_icx), ++ X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE, &idle_cpu_adl), ++ X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, &idle_cpu_adl_l), + X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, &idle_cpu_spr), + X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNL, &idle_cpu_knl), + X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNM, &idle_cpu_knl), +@@ -1620,6 +1730,25 @@ static void __init skx_idle_state_table_update(void) + } + } + ++/** ++ * adl_idle_state_table_update - Adjust AlderLake idle states table. ++ */ ++static void __init adl_idle_state_table_update(void) ++{ ++ /* Check if user prefers C1 over C1E. */ ++ if (preferred_states_mask & BIT(1) && !(preferred_states_mask & BIT(2))) { ++ cpuidle_state_table[0].flags &= ~CPUIDLE_FLAG_UNUSABLE; ++ cpuidle_state_table[1].flags |= CPUIDLE_FLAG_UNUSABLE; ++ ++ /* Disable C1E by clearing the "C1E promotion" bit. */ ++ c1e_promotion = C1E_PROMOTION_DISABLE; ++ return; ++ } ++ ++ /* Make sure C1E is enabled by default */ ++ c1e_promotion = C1E_PROMOTION_ENABLE; ++} ++ + /** + * spr_idle_state_table_update - Adjust Sapphire Rapids idle states table. + */ +@@ -1689,6 +1818,10 @@ static void __init intel_idle_init_cstates_icpu(struct cpuidle_driver *drv) + case INTEL_FAM6_SAPPHIRERAPIDS_X: + spr_idle_state_table_update(); + break; ++ case INTEL_FAM6_ALDERLAKE: ++ case INTEL_FAM6_ALDERLAKE_L: ++ adl_idle_state_table_update(); ++ break; + } + + for (cstate = 0; cstate < CPUIDLE_STATE_MAX; ++cstate) { +-- +2.35.1 + diff --git a/queue-5.18/intel_idle-make-spr-c1-and-c1e-be-independent.patch b/queue-5.18/intel_idle-make-spr-c1-and-c1e-be-independent.patch new file mode 100644 index 00000000000..e7d78bfedf1 --- /dev/null +++ b/queue-5.18/intel_idle-make-spr-c1-and-c1e-be-independent.patch @@ -0,0 +1,90 @@ +From 72d2cfae1dd2c795060d056fcddb70dff941d0da Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Sat, 16 Jul 2022 09:26:55 +0300 +Subject: intel_idle: make SPR C1 and C1E be independent + +From: Artem Bityutskiy + +[ Upstream commit 1548fac47a114b42063def551eb152a536ed9697 ] + +This patch partially reverts the changes made by the following commit: + +da0e58c038e6 intel_idle: add 'preferred_cstates' module argument + +As that commit describes, on early Sapphire Rapids Xeon platforms the C1 and +C1E states were mutually exclusive, so that users could only have either C1 and +C6, or C1E and C6. + +However, Intel firmware engineers managed to remove this limitation and make C1 +and C1E to be completely independent, just like on previous Xeon platforms. + +Therefore, this patch: + * Removes commentary describing the old, and now non-existing SPR C1E + limitation. + * Marks SPR C1E as available by default. + * Removes the 'preferred_cstates' parameter handling for SPR. Both C1 and + C1E will be available regardless of 'preferred_cstates' value. 
+ +We expect that all SPR systems are shipping with new firmware, which includes +the C1/C1E improvement. + +Cc: v5.18+ # v5.18+ +Signed-off-by: Artem Bityutskiy +Signed-off-by: Rafael J. Wysocki +Signed-off-by: Sasha Levin +--- + drivers/idle/intel_idle.c | 24 +----------------------- + 1 file changed, 1 insertion(+), 23 deletions(-) + +diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c +index 907700d1e78e..9515a3146dc9 100644 +--- a/drivers/idle/intel_idle.c ++++ b/drivers/idle/intel_idle.c +@@ -911,16 +911,6 @@ static struct cpuidle_state adl_l_cstates[] __initdata = { + .enter = NULL } + }; + +-/* +- * On Sapphire Rapids Xeon C1 has to be disabled if C1E is enabled, and vice +- * versa. On SPR C1E is enabled only if "C1E promotion" bit is set in +- * MSR_IA32_POWER_CTL. But in this case there effectively no C1, because C1 +- * requests are promoted to C1E. If the "C1E promotion" bit is cleared, then +- * both C1 and C1E requests end up with C1, so there is effectively no C1E. +- * +- * By default we enable C1 and disable C1E by marking it with +- * 'CPUIDLE_FLAG_UNUSABLE'. +- */ + static struct cpuidle_state spr_cstates[] __initdata = { + { + .name = "C1", +@@ -933,8 +923,7 @@ static struct cpuidle_state spr_cstates[] __initdata = { + { + .name = "C1E", + .desc = "MWAIT 0x01", +- .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE | +- CPUIDLE_FLAG_UNUSABLE, ++ .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE, + .exit_latency = 2, + .target_residency = 4, + .enter = &intel_idle, +@@ -1756,17 +1745,6 @@ static void __init spr_idle_state_table_update(void) + { + unsigned long long msr; + +- /* Check if user prefers C1E over C1. */ +- if ((preferred_states_mask & BIT(2)) && +- !(preferred_states_mask & BIT(1))) { +- /* Disable C1 and enable C1E. */ +- spr_cstates[0].flags |= CPUIDLE_FLAG_UNUSABLE; +- spr_cstates[1].flags &= ~CPUIDLE_FLAG_UNUSABLE; +- +- /* Enable C1E using the "C1E promotion" bit. */ +- c1e_promotion = C1E_PROMOTION_ENABLE; +- } +- + /* + * By default, the C6 state assumes the worst-case scenario of package + * C6. However, if PC6 is disabled, we update the numbers to match +-- +2.35.1 + diff --git a/queue-5.18/intel_th-pci-add-meteor-lake-p-support.patch b/queue-5.18/intel_th-pci-add-meteor-lake-p-support.patch new file mode 100644 index 00000000000..d5d460c667c --- /dev/null +++ b/queue-5.18/intel_th-pci-add-meteor-lake-p-support.patch @@ -0,0 +1,40 @@ +From ca9b34b3b4760f6429dbc8d09097e704d75a7bed Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 5 Jul 2022 11:26:35 +0300 +Subject: intel_th: pci: Add Meteor Lake-P support + +From: Alexander Shishkin + +[ Upstream commit 802a9a0b1d91274ef10d9fe429b4cc1e8c200aef ] + +Add support for the Trace Hub in Meteor Lake-P. 
+ +Reviewed-by: Andy Shevchenko +Cc: stable +Signed-off-by: Alexander Shishkin +Link: https://lore.kernel.org/r/20220705082637.59979-5-alexander.shishkin@linux.intel.com +Signed-off-by: Greg Kroah-Hartman +Signed-off-by: Sasha Levin +--- + drivers/hwtracing/intel_th/pci.c | 5 +++++ + 1 file changed, 5 insertions(+) + +diff --git a/drivers/hwtracing/intel_th/pci.c b/drivers/hwtracing/intel_th/pci.c +index fcd0aca75007..41a31c7f505f 100644 +--- a/drivers/hwtracing/intel_th/pci.c ++++ b/drivers/hwtracing/intel_th/pci.c +@@ -284,6 +284,11 @@ static const struct pci_device_id intel_th_pci_id_table[] = { + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x54a6), + .driver_data = (kernel_ulong_t)&intel_th_2x, + }, ++ { ++ /* Meteor Lake-P */ ++ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7e24), ++ .driver_data = (kernel_ulong_t)&intel_th_2x, ++ }, + { + /* Alder Lake CPU */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x466f), +-- +2.35.1 + diff --git a/queue-5.18/intel_th-pci-add-raptor-lake-s-cpu-support.patch b/queue-5.18/intel_th-pci-add-raptor-lake-s-cpu-support.patch new file mode 100644 index 00000000000..f12a531cdab --- /dev/null +++ b/queue-5.18/intel_th-pci-add-raptor-lake-s-cpu-support.patch @@ -0,0 +1,40 @@ +From 72f0e594ee06a5ab7c5ba3b37bb0774444d0b18e Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 5 Jul 2022 11:26:37 +0300 +Subject: intel_th: pci: Add Raptor Lake-S CPU support + +From: Alexander Shishkin + +[ Upstream commit ff46a601afc5a66a81c3945b83d0a2caeb88e8bc ] + +Add support for the Trace Hub in Raptor Lake-S CPU. + +Reviewed-by: Andy Shevchenko +Cc: stable +Signed-off-by: Alexander Shishkin +Link: https://lore.kernel.org/r/20220705082637.59979-7-alexander.shishkin@linux.intel.com +Signed-off-by: Greg Kroah-Hartman +Signed-off-by: Sasha Levin +--- + drivers/hwtracing/intel_th/pci.c | 5 +++++ + 1 file changed, 5 insertions(+) + +diff --git a/drivers/hwtracing/intel_th/pci.c b/drivers/hwtracing/intel_th/pci.c +index 5b6da26f1b63..147d338c191e 100644 +--- a/drivers/hwtracing/intel_th/pci.c ++++ b/drivers/hwtracing/intel_th/pci.c +@@ -294,6 +294,11 @@ static const struct pci_device_id intel_th_pci_id_table[] = { + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7a26), + .driver_data = (kernel_ulong_t)&intel_th_2x, + }, ++ { ++ /* Raptor Lake-S CPU */ ++ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xa76f), ++ .driver_data = (kernel_ulong_t)&intel_th_2x, ++ }, + { + /* Alder Lake CPU */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x466f), +-- +2.35.1 + diff --git a/queue-5.18/intel_th-pci-add-raptor-lake-s-pch-support.patch b/queue-5.18/intel_th-pci-add-raptor-lake-s-pch-support.patch new file mode 100644 index 00000000000..ef73b4077dc --- /dev/null +++ b/queue-5.18/intel_th-pci-add-raptor-lake-s-pch-support.patch @@ -0,0 +1,40 @@ +From 2ea65adef5be4557edb506fdaa6b3ea53411d77f Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 5 Jul 2022 11:26:36 +0300 +Subject: intel_th: pci: Add Raptor Lake-S PCH support + +From: Alexander Shishkin + +[ Upstream commit 23e2de5826e2fc4dd43e08bab3a2ea1a5338b063 ] + +Add support for the Trace Hub in Raptor Lake-S PCH. 
+ +Reviewed-by: Andy Shevchenko +Cc: stable +Signed-off-by: Alexander Shishkin +Link: https://lore.kernel.org/r/20220705082637.59979-6-alexander.shishkin@linux.intel.com +Signed-off-by: Greg Kroah-Hartman +Signed-off-by: Sasha Levin +--- + drivers/hwtracing/intel_th/pci.c | 5 +++++ + 1 file changed, 5 insertions(+) + +diff --git a/drivers/hwtracing/intel_th/pci.c b/drivers/hwtracing/intel_th/pci.c +index 41a31c7f505f..5b6da26f1b63 100644 +--- a/drivers/hwtracing/intel_th/pci.c ++++ b/drivers/hwtracing/intel_th/pci.c +@@ -289,6 +289,11 @@ static const struct pci_device_id intel_th_pci_id_table[] = { + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7e24), + .driver_data = (kernel_ulong_t)&intel_th_2x, + }, ++ { ++ /* Raptor Lake-S */ ++ PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x7a26), ++ .driver_data = (kernel_ulong_t)&intel_th_2x, ++ }, + { + /* Alder Lake CPU */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x466f), +-- +2.35.1 + diff --git a/queue-5.18/iommu-vt-d-avoid-invalid-memory-access-via-node_onli.patch b/queue-5.18/iommu-vt-d-avoid-invalid-memory-access-via-node_onli.patch new file mode 100644 index 00000000000..5ce8b7e55c6 --- /dev/null +++ b/queue-5.18/iommu-vt-d-avoid-invalid-memory-access-via-node_onli.patch @@ -0,0 +1,66 @@ +From 48a418e1d496eadf562380a9947ee715f429e6ca Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 12 Jul 2022 17:38:36 +0200 +Subject: iommu/vt-d: avoid invalid memory access via node_online(NUMA_NO_NODE) + +From: Alexander Lobakin + +[ Upstream commit b0b0b77ea611e3088e9523e60860f4f41b62b235 ] + +KASAN reports: + +[ 4.668325][ T0] BUG: KASAN: wild-memory-access in dmar_parse_one_rhsa (arch/x86/include/asm/bitops.h:214 arch/x86/include/asm/bitops.h:226 include/asm-generic/bitops/instrumented-non-atomic.h:142 include/linux/nodemask.h:415 drivers/iommu/intel/dmar.c:497) +[ 4.676149][ T0] Read of size 8 at addr 1fffffff85115558 by task swapper/0/0 +[ 4.683454][ T0] +[ 4.685638][ T0] CPU: 0 PID: 0 Comm: swapper/0 Not tainted 5.19.0-rc3-00004-g0e862838f290 #1 +[ 4.694331][ T0] Hardware name: Supermicro SYS-5018D-FN4T/X10SDV-8C-TLN4F, BIOS 1.1 03/02/2016 +[ 4.703196][ T0] Call Trace: +[ 4.706334][ T0] +[ 4.709133][ T0] ? dmar_parse_one_rhsa (arch/x86/include/asm/bitops.h:214 arch/x86/include/asm/bitops.h:226 include/asm-generic/bitops/instrumented-non-atomic.h:142 include/linux/nodemask.h:415 drivers/iommu/intel/dmar.c:497) + +after converting the type of the first argument (@nr, bit number) +of arch_test_bit() from `long` to `unsigned long`[0]. + +Under certain conditions (for example, when ACPI NUMA is disabled +via command line), pxm_to_node() can return %NUMA_NO_NODE (-1). +It is valid 'magic' number of NUMA node, but not valid bit number +to use in bitops. +node_online() eventually descends to test_bit() without checking +for the input, assuming it's on caller side (which might be good +for perf-critical tasks). There, -1 becomes %ULONG_MAX which leads +to an insane array index when calculating bit position in memory. + +For now, add an explicit check for @node being not %NUMA_NO_NODE +before calling test_bit(). The actual logics didn't change here +at all. 
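A minimal user-space sketch of the failure mode described above (illustration only, not the kernel code): NUMA_NO_NODE is -1, and once it is converted to the unsigned bit number that test_bit() expects it becomes ULONG_MAX, so the derived word index points far outside any node bitmap. Checking for NUMA_NO_NODE before calling node_online(), as the hunk further down does, sidesteps that conversion entirely.

    #include <stdio.h>

    #define NUMA_NO_NODE    (-1)
    #define BITS_PER_LONG   (8 * sizeof(unsigned long))

    int main(void)
    {
            int node = NUMA_NO_NODE;                /* what pxm_to_node() may return */
            unsigned long nr = (unsigned long)node; /* implicit conversion on the way to test_bit() */

            /* word offset a test_bit()-style helper would compute from nr */
            printf("bit number %lu -> word index %lu\n", nr, nr / BITS_PER_LONG);

            if (node != NUMA_NO_NODE)               /* the added guard */
                    printf("safe to ask node_online(%d)\n", node);
            return 0;
    }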
+ +[0] https://github.com/norov/linux/commit/0e862838f290147ea9c16db852d8d494b552d38d + +Fixes: ee34b32d8c29 ("dmar: support for parsing Remapping Hardware Static Affinity structure") +Cc: stable@vger.kernel.org # 2.6.33+ +Reported-by: kernel test robot +Signed-off-by: Alexander Lobakin +Reviewed-by: Andy Shevchenko +Reviewed-by: Lu Baolu +Signed-off-by: Yury Norov +Signed-off-by: Sasha Levin +--- + drivers/iommu/intel/dmar.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/drivers/iommu/intel/dmar.c b/drivers/iommu/intel/dmar.c +index 497c5bd95caf..2a10c9b54064 100644 +--- a/drivers/iommu/intel/dmar.c ++++ b/drivers/iommu/intel/dmar.c +@@ -495,7 +495,7 @@ static int dmar_parse_one_rhsa(struct acpi_dmar_header *header, void *arg) + if (drhd->reg_base_addr == rhsa->base_address) { + int node = pxm_to_node(rhsa->proximity_domain); + +- if (!node_online(node)) ++ if (node != NUMA_NO_NODE && !node_online(node)) + node = NUMA_NO_NODE; + drhd->iommu->node = node; + return 0; +-- +2.35.1 + diff --git a/queue-5.18/kexec-clean-up-arch_kexec_kernel_verify_sig.patch b/queue-5.18/kexec-clean-up-arch_kexec_kernel_verify_sig.patch new file mode 100644 index 00000000000..ba802ce2439 --- /dev/null +++ b/queue-5.18/kexec-clean-up-arch_kexec_kernel_verify_sig.patch @@ -0,0 +1,107 @@ +From 0d01ede0c3b718afbf2b4bdb7b91840f25cc0d0e Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 14 Jul 2022 21:40:24 +0800 +Subject: kexec: clean up arch_kexec_kernel_verify_sig + +From: Coiby Xu + +[ Upstream commit 689a71493bd2f31c024f8c0395f85a1fd4b2138e ] + +Before commit 105e10e2cf1c ("kexec_file: drop weak attribute from +functions"), there was already no arch-specific implementation +of arch_kexec_kernel_verify_sig. With weak attribute dropped by that +commit, arch_kexec_kernel_verify_sig is completely useless. So clean it +up. + +Note later patches are dependent on this patch so it should be backported +to the stable tree as well. + +Cc: stable@vger.kernel.org +Suggested-by: Eric W. 
Biederman +Reviewed-by: Michal Suchanek +Acked-by: Baoquan He +Signed-off-by: Coiby Xu +[zohar@linux.ibm.com: reworded patch description "Note"] +Link: https://lore.kernel.org/linux-integrity/20220714134027.394370-1-coxu@redhat.com/ +Signed-off-by: Mimi Zohar +Signed-off-by: Sasha Levin +--- + include/linux/kexec.h | 5 ----- + kernel/kexec_file.c | 33 +++++++++++++-------------------- + 2 files changed, 13 insertions(+), 25 deletions(-) + +diff --git a/include/linux/kexec.h b/include/linux/kexec.h +index 87c1795297b0..f3e7680befcc 100644 +--- a/include/linux/kexec.h ++++ b/include/linux/kexec.h +@@ -212,11 +212,6 @@ static inline void *arch_kexec_kernel_image_load(struct kimage *image) + } + #endif + +-#ifdef CONFIG_KEXEC_SIG +-int arch_kexec_kernel_verify_sig(struct kimage *image, void *buf, +- unsigned long buf_len); +-#endif +- + extern int kexec_add_buffer(struct kexec_buf *kbuf); + int kexec_locate_mem_hole(struct kexec_buf *kbuf); + +diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c +index 925953dfef05..ad005cd184a4 100644 +--- a/kernel/kexec_file.c ++++ b/kernel/kexec_file.c +@@ -81,24 +81,6 @@ int kexec_image_post_load_cleanup_default(struct kimage *image) + return image->fops->cleanup(image->image_loader_data); + } + +-#ifdef CONFIG_KEXEC_SIG +-static int kexec_image_verify_sig_default(struct kimage *image, void *buf, +- unsigned long buf_len) +-{ +- if (!image->fops || !image->fops->verify_sig) { +- pr_debug("kernel loader does not support signature verification.\n"); +- return -EKEYREJECTED; +- } +- +- return image->fops->verify_sig(buf, buf_len); +-} +- +-int arch_kexec_kernel_verify_sig(struct kimage *image, void *buf, unsigned long buf_len) +-{ +- return kexec_image_verify_sig_default(image, buf, buf_len); +-} +-#endif +- + /* + * Free up memory used by kernel, initrd, and command line. This is temporary + * memory allocation which is not needed any more after these buffers have +@@ -141,13 +123,24 @@ void kimage_file_post_load_cleanup(struct kimage *image) + } + + #ifdef CONFIG_KEXEC_SIG ++static int kexec_image_verify_sig(struct kimage *image, void *buf, ++ unsigned long buf_len) ++{ ++ if (!image->fops || !image->fops->verify_sig) { ++ pr_debug("kernel loader does not support signature verification.\n"); ++ return -EKEYREJECTED; ++ } ++ ++ return image->fops->verify_sig(buf, buf_len); ++} ++ + static int + kimage_validate_signature(struct kimage *image) + { + int ret; + +- ret = arch_kexec_kernel_verify_sig(image, image->kernel_buf, +- image->kernel_buf_len); ++ ret = kexec_image_verify_sig(image, image->kernel_buf, ++ image->kernel_buf_len); + if (ret) { + + if (sig_enforce) { +-- +2.35.1 + diff --git a/queue-5.18/kexec-keys-s390-make-use-of-built-in-and-secondary-k.patch b/queue-5.18/kexec-keys-s390-make-use-of-built-in-and-secondary-k.patch new file mode 100644 index 00000000000..0648467785d --- /dev/null +++ b/queue-5.18/kexec-keys-s390-make-use-of-built-in-and-secondary-k.patch @@ -0,0 +1,72 @@ +From 2f10017ddc47f38b293b183149fca92892bcfec0 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 14 Jul 2022 21:40:27 +0800 +Subject: kexec, KEYS, s390: Make use of built-in and secondary keyring for + signature verification + +From: Michal Suchanek + +[ Upstream commit 0828c4a39be57768b8788e8cbd0d84683ea757e5 ] + +commit e23a8020ce4e ("s390/kexec_file: Signature verification prototype") +adds support for KEXEC_SIG verification with keys from platform keyring +but the built-in keys and secondary keyring are not used. 
+ +Add support for the built-in keys and secondary keyring as x86 does. + +Fixes: e23a8020ce4e ("s390/kexec_file: Signature verification prototype") +Cc: stable@vger.kernel.org +Cc: Philipp Rudo +Cc: kexec@lists.infradead.org +Cc: keyrings@vger.kernel.org +Cc: linux-security-module@vger.kernel.org +Signed-off-by: Michal Suchanek +Reviewed-by: "Lee, Chun-Yi" +Acked-by: Baoquan He +Signed-off-by: Coiby Xu +Acked-by: Heiko Carstens +Signed-off-by: Mimi Zohar +Signed-off-by: Sasha Levin +--- + arch/s390/kernel/machine_kexec_file.c | 18 +++++++++++++----- + 1 file changed, 13 insertions(+), 5 deletions(-) + +diff --git a/arch/s390/kernel/machine_kexec_file.c b/arch/s390/kernel/machine_kexec_file.c +index 8f43575a4dd3..fc6d5f58debe 100644 +--- a/arch/s390/kernel/machine_kexec_file.c ++++ b/arch/s390/kernel/machine_kexec_file.c +@@ -31,6 +31,7 @@ int s390_verify_sig(const char *kernel, unsigned long kernel_len) + const unsigned long marker_len = sizeof(MODULE_SIG_STRING) - 1; + struct module_signature *ms; + unsigned long sig_len; ++ int ret; + + /* Skip signature verification when not secure IPLed. */ + if (!ipl_secure_flag) +@@ -65,11 +66,18 @@ int s390_verify_sig(const char *kernel, unsigned long kernel_len) + return -EBADMSG; + } + +- return verify_pkcs7_signature(kernel, kernel_len, +- kernel + kernel_len, sig_len, +- VERIFY_USE_PLATFORM_KEYRING, +- VERIFYING_MODULE_SIGNATURE, +- NULL, NULL); ++ ret = verify_pkcs7_signature(kernel, kernel_len, ++ kernel + kernel_len, sig_len, ++ VERIFY_USE_SECONDARY_KEYRING, ++ VERIFYING_MODULE_SIGNATURE, ++ NULL, NULL); ++ if (ret == -ENOKEY && IS_ENABLED(CONFIG_INTEGRITY_PLATFORM_KEYRING)) ++ ret = verify_pkcs7_signature(kernel, kernel_len, ++ kernel + kernel_len, sig_len, ++ VERIFY_USE_PLATFORM_KEYRING, ++ VERIFYING_MODULE_SIGNATURE, ++ NULL, NULL); ++ return ret; + } + #endif /* CONFIG_KEXEC_SIG */ + +-- +2.35.1 + diff --git a/queue-5.18/kexec_file-drop-weak-attribute-from-functions.patch b/queue-5.18/kexec_file-drop-weak-attribute-from-functions.patch new file mode 100644 index 00000000000..5c047249657 --- /dev/null +++ b/queue-5.18/kexec_file-drop-weak-attribute-from-functions.patch @@ -0,0 +1,261 @@ +From a0103a12b495a7c1698fb02cfa8077d5018aedce Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 1 Jul 2022 13:04:04 +0530 +Subject: kexec_file: drop weak attribute from functions + +From: Naveen N. Rao + +[ Upstream commit 65d9a9a60fd71be964effb2e94747a6acb6e7015 ] + +As requested +(http://lkml.kernel.org/r/87ee0q7b92.fsf@email.froward.int.ebiederm.org), +this series converts weak functions in kexec to use the #ifdef approach. + +Quoting the 3e35142ef99fe ("kexec_file: drop weak attribute from +arch_kexec_apply_relocations[_add]") changelog: + +: Since commit d1bcae833b32f1 ("ELF: Don't generate unused section symbols") +: [1], binutils (v2.36+) started dropping section symbols that it thought +: were unused. This isn't an issue in general, but with kexec_file.c, gcc +: is placing kexec_arch_apply_relocations[_add] into a separate +: .text.unlikely section and the section symbol ".text.unlikely" is being +: dropped. Due to this, recordmcount is unable to find a non-weak symbol in +: .text.unlikely to generate a relocation record against. 
+ +This patch (of 2); + +Drop __weak attribute from functions in kexec_file.c: +- arch_kexec_kernel_image_probe() +- arch_kimage_file_post_load_cleanup() +- arch_kexec_kernel_image_load() +- arch_kexec_locate_mem_hole() +- arch_kexec_kernel_verify_sig() + +arch_kexec_kernel_image_load() calls into kexec_image_load_default(), so +drop the static attribute for the latter. + +arch_kexec_kernel_verify_sig() is not overridden by any architecture, so +drop the __weak attribute. + +Link: https://lkml.kernel.org/r/cover.1656659357.git.naveen.n.rao@linux.vnet.ibm.com +Link: https://lkml.kernel.org/r/2cd7ca1fe4d6bb6ca38e3283c717878388ed6788.1656659357.git.naveen.n.rao@linux.vnet.ibm.com +Signed-off-by: Naveen N. Rao +Suggested-by: Eric Biederman +Signed-off-by: Andrew Morton +Signed-off-by: Mimi Zohar +Signed-off-by: Sasha Levin +--- + arch/arm64/include/asm/kexec.h | 4 ++- + arch/powerpc/include/asm/kexec.h | 9 +++++++ + arch/s390/include/asm/kexec.h | 3 +++ + arch/x86/include/asm/kexec.h | 6 +++++ + include/linux/kexec.h | 44 +++++++++++++++++++++++++++----- + kernel/kexec_file.c | 35 ++----------------------- + 6 files changed, 61 insertions(+), 40 deletions(-) + +diff --git a/arch/arm64/include/asm/kexec.h b/arch/arm64/include/asm/kexec.h +index 9839bfc163d7..78d272b26ebd 100644 +--- a/arch/arm64/include/asm/kexec.h ++++ b/arch/arm64/include/asm/kexec.h +@@ -115,7 +115,9 @@ extern const struct kexec_file_ops kexec_image_ops; + + struct kimage; + +-extern int arch_kimage_file_post_load_cleanup(struct kimage *image); ++int arch_kimage_file_post_load_cleanup(struct kimage *image); ++#define arch_kimage_file_post_load_cleanup arch_kimage_file_post_load_cleanup ++ + extern int load_other_segments(struct kimage *image, + unsigned long kernel_load_addr, unsigned long kernel_size, + char *initrd, unsigned long initrd_len, +diff --git a/arch/powerpc/include/asm/kexec.h b/arch/powerpc/include/asm/kexec.h +index 2aefe14e1442..1e5e9b6ec78d 100644 +--- a/arch/powerpc/include/asm/kexec.h ++++ b/arch/powerpc/include/asm/kexec.h +@@ -120,6 +120,15 @@ int setup_purgatory(struct kimage *image, const void *slave_code, + #ifdef CONFIG_PPC64 + struct kexec_buf; + ++int arch_kexec_kernel_image_probe(struct kimage *image, void *buf, unsigned long buf_len); ++#define arch_kexec_kernel_image_probe arch_kexec_kernel_image_probe ++ ++int arch_kimage_file_post_load_cleanup(struct kimage *image); ++#define arch_kimage_file_post_load_cleanup arch_kimage_file_post_load_cleanup ++ ++int arch_kexec_locate_mem_hole(struct kexec_buf *kbuf); ++#define arch_kexec_locate_mem_hole arch_kexec_locate_mem_hole ++ + int load_crashdump_segments_ppc64(struct kimage *image, + struct kexec_buf *kbuf); + int setup_purgatory_ppc64(struct kimage *image, const void *slave_code, +diff --git a/arch/s390/include/asm/kexec.h b/arch/s390/include/asm/kexec.h +index 63098df81c9f..d13bd221cd37 100644 +--- a/arch/s390/include/asm/kexec.h ++++ b/arch/s390/include/asm/kexec.h +@@ -92,5 +92,8 @@ int arch_kexec_apply_relocations_add(struct purgatory_info *pi, + const Elf_Shdr *relsec, + const Elf_Shdr *symtab); + #define arch_kexec_apply_relocations_add arch_kexec_apply_relocations_add ++ ++int arch_kimage_file_post_load_cleanup(struct kimage *image); ++#define arch_kimage_file_post_load_cleanup arch_kimage_file_post_load_cleanup + #endif + #endif /*_S390_KEXEC_H */ +diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h +index 6ad8d946cd3e..5ec359c1b50c 100644 +--- a/arch/x86/include/asm/kexec.h ++++ b/arch/x86/include/asm/kexec.h +@@ 
-193,6 +193,12 @@ int arch_kexec_apply_relocations_add(struct purgatory_info *pi, + const Elf_Shdr *relsec, + const Elf_Shdr *symtab); + #define arch_kexec_apply_relocations_add arch_kexec_apply_relocations_add ++ ++void *arch_kexec_kernel_image_load(struct kimage *image); ++#define arch_kexec_kernel_image_load arch_kexec_kernel_image_load ++ ++int arch_kimage_file_post_load_cleanup(struct kimage *image); ++#define arch_kimage_file_post_load_cleanup arch_kimage_file_post_load_cleanup + #endif + #endif + +diff --git a/include/linux/kexec.h b/include/linux/kexec.h +index 8d573baaab29..87c1795297b0 100644 +--- a/include/linux/kexec.h ++++ b/include/linux/kexec.h +@@ -188,21 +188,53 @@ int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name, + void *buf, unsigned int size, + bool get_value); + void *kexec_purgatory_get_symbol_addr(struct kimage *image, const char *name); ++void *kexec_image_load_default(struct kimage *image); ++ ++#ifndef arch_kexec_kernel_image_probe ++static inline int ++arch_kexec_kernel_image_probe(struct kimage *image, void *buf, unsigned long buf_len) ++{ ++ return kexec_image_probe_default(image, buf, buf_len); ++} ++#endif ++ ++#ifndef arch_kimage_file_post_load_cleanup ++static inline int arch_kimage_file_post_load_cleanup(struct kimage *image) ++{ ++ return kexec_image_post_load_cleanup_default(image); ++} ++#endif ++ ++#ifndef arch_kexec_kernel_image_load ++static inline void *arch_kexec_kernel_image_load(struct kimage *image) ++{ ++ return kexec_image_load_default(image); ++} ++#endif + +-/* Architectures may override the below functions */ +-int arch_kexec_kernel_image_probe(struct kimage *image, void *buf, +- unsigned long buf_len); +-void *arch_kexec_kernel_image_load(struct kimage *image); +-int arch_kimage_file_post_load_cleanup(struct kimage *image); + #ifdef CONFIG_KEXEC_SIG + int arch_kexec_kernel_verify_sig(struct kimage *image, void *buf, + unsigned long buf_len); + #endif +-int arch_kexec_locate_mem_hole(struct kexec_buf *kbuf); + + extern int kexec_add_buffer(struct kexec_buf *kbuf); + int kexec_locate_mem_hole(struct kexec_buf *kbuf); + ++#ifndef arch_kexec_locate_mem_hole ++/** ++ * arch_kexec_locate_mem_hole - Find free memory to place the segments. ++ * @kbuf: Parameters for the memory search. ++ * ++ * On success, kbuf->mem will have the start address of the memory region found. ++ * ++ * Return: 0 on success, negative errno on error. 
++ */ ++static inline int arch_kexec_locate_mem_hole(struct kexec_buf *kbuf) ++{ ++ return kexec_locate_mem_hole(kbuf); ++} ++#endif ++ + /* Alignment required for elf header segment */ + #define ELF_CORE_HEADER_ALIGN 4096 + +diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c +index bb0fb63f563c..925953dfef05 100644 +--- a/kernel/kexec_file.c ++++ b/kernel/kexec_file.c +@@ -62,14 +62,7 @@ int kexec_image_probe_default(struct kimage *image, void *buf, + return ret; + } + +-/* Architectures can provide this probe function */ +-int __weak arch_kexec_kernel_image_probe(struct kimage *image, void *buf, +- unsigned long buf_len) +-{ +- return kexec_image_probe_default(image, buf, buf_len); +-} +- +-static void *kexec_image_load_default(struct kimage *image) ++void *kexec_image_load_default(struct kimage *image) + { + if (!image->fops || !image->fops->load) + return ERR_PTR(-ENOEXEC); +@@ -80,11 +73,6 @@ static void *kexec_image_load_default(struct kimage *image) + image->cmdline_buf_len); + } + +-void * __weak arch_kexec_kernel_image_load(struct kimage *image) +-{ +- return kexec_image_load_default(image); +-} +- + int kexec_image_post_load_cleanup_default(struct kimage *image) + { + if (!image->fops || !image->fops->cleanup) +@@ -93,11 +81,6 @@ int kexec_image_post_load_cleanup_default(struct kimage *image) + return image->fops->cleanup(image->image_loader_data); + } + +-int __weak arch_kimage_file_post_load_cleanup(struct kimage *image) +-{ +- return kexec_image_post_load_cleanup_default(image); +-} +- + #ifdef CONFIG_KEXEC_SIG + static int kexec_image_verify_sig_default(struct kimage *image, void *buf, + unsigned long buf_len) +@@ -110,8 +93,7 @@ static int kexec_image_verify_sig_default(struct kimage *image, void *buf, + return image->fops->verify_sig(buf, buf_len); + } + +-int __weak arch_kexec_kernel_verify_sig(struct kimage *image, void *buf, +- unsigned long buf_len) ++int arch_kexec_kernel_verify_sig(struct kimage *image, void *buf, unsigned long buf_len) + { + return kexec_image_verify_sig_default(image, buf, buf_len); + } +@@ -621,19 +603,6 @@ int kexec_locate_mem_hole(struct kexec_buf *kbuf) + return ret == 1 ? 0 : -EADDRNOTAVAIL; + } + +-/** +- * arch_kexec_locate_mem_hole - Find free memory to place the segments. +- * @kbuf: Parameters for the memory search. +- * +- * On success, kbuf->mem will have the start address of the memory region found. +- * +- * Return: 0 on success, negative errno on error. +- */ +-int __weak arch_kexec_locate_mem_hole(struct kexec_buf *kbuf) +-{ +- return kexec_locate_mem_hole(kbuf); +-} +- + /** + * kexec_add_buffer - place a buffer in a kexec segment + * @kbuf: Buffer contents and memory parameters. +-- +2.35.1 + diff --git a/queue-5.18/keys-asymmetric-enforce-sm2-signature-use-pkey-algo.patch b/queue-5.18/keys-asymmetric-enforce-sm2-signature-use-pkey-algo.patch new file mode 100644 index 00000000000..fc0adb2e9ee --- /dev/null +++ b/queue-5.18/keys-asymmetric-enforce-sm2-signature-use-pkey-algo.patch @@ -0,0 +1,60 @@ +From 7537225de80672a829ff983a94b62557441b5bf3 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 28 Jun 2022 11:37:20 +0800 +Subject: KEYS: asymmetric: enforce SM2 signature use pkey algo + +From: Tianjia Zhang + +[ Upstream commit 0815291a8fd66cdcf7db1445d4d99b0d16065829 ] + +The signature verification of SM2 needs to add the Za value and +recalculate sig->digest, which requires the detection of the pkey_algo +in public_key_verify_signature(). 
As Eric Biggers said, the pkey_algo +field in sig is attacker-controlled and should be use pkey->pkey_algo +instead of sig->pkey_algo, and secondly, if sig->pkey_algo is NULL, it +will also cause signature verification failure. + +The software_key_determine_akcipher() already forces the algorithms +are matched, so the SM3 algorithm is enforced in the SM2 signature, +although this has been checked, we still avoid using any algorithm +information in the signature as input. + +Fixes: 215525639631 ("X.509: support OSCCA SM2-with-SM3 certificate verification") +Reported-by: Eric Biggers +Cc: stable@vger.kernel.org # v5.10+ +Signed-off-by: Tianjia Zhang +Reviewed-by: Jarkko Sakkinen +Signed-off-by: Jarkko Sakkinen +Signed-off-by: Sasha Levin +--- + crypto/asymmetric_keys/public_key.c | 7 +++++-- + 1 file changed, 5 insertions(+), 2 deletions(-) + +diff --git a/crypto/asymmetric_keys/public_key.c b/crypto/asymmetric_keys/public_key.c +index 7c9e6be35c30..2f8352e88860 100644 +--- a/crypto/asymmetric_keys/public_key.c ++++ b/crypto/asymmetric_keys/public_key.c +@@ -304,6 +304,10 @@ static int cert_sig_digest_update(const struct public_key_signature *sig, + + BUG_ON(!sig->data); + ++ /* SM2 signatures always use the SM3 hash algorithm */ ++ if (!sig->hash_algo || strcmp(sig->hash_algo, "sm3") != 0) ++ return -EINVAL; ++ + ret = sm2_compute_z_digest(tfm_pkey, SM2_DEFAULT_USERID, + SM2_DEFAULT_USERID_LEN, dgst); + if (ret) +@@ -414,8 +418,7 @@ int public_key_verify_signature(const struct public_key *pkey, + if (ret) + goto error_free_key; + +- if (sig->pkey_algo && strcmp(sig->pkey_algo, "sm2") == 0 && +- sig->data_size) { ++ if (strcmp(pkey->pkey_algo, "sm2") == 0 && sig->data_size) { + ret = cert_sig_digest_update(sig, tfm); + if (ret) + goto error_free_key; +-- +2.35.1 + diff --git a/queue-5.18/ksmbd-add-smbd-max-io-size-parameter.patch b/queue-5.18/ksmbd-add-smbd-max-io-size-parameter.patch new file mode 100644 index 00000000000..03479588aeb --- /dev/null +++ b/queue-5.18/ksmbd-add-smbd-max-io-size-parameter.patch @@ -0,0 +1,115 @@ +From e6b41f4cc7280b1aff9e2ef689f1e8a83a605357 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 16 May 2022 16:22:43 +0900 +Subject: ksmbd: add smbd max io size parameter + +From: Namjae Jeon + +[ Upstream commit 65bb45b97b578c8eed1ffa80caec84708df49729 ] + +Add 'smbd max io size' parameter to adjust smbd-direct max read/write +size. 
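The value travels from user space in the ksmbd_startup_request netlink message (the new smbd_max_io_size field below); when the field is left at zero the existing 8 MiB default is kept, otherwise the kernel clamps it into [SMBD_MIN_IOSIZE, SMBD_MAX_IOSIZE]. A small illustration of that clamping, mirroring the constants and the clamp_val() call added below rather than quoting the driver verbatim:

    #define SMBD_DEFAULT_IOSIZE (8 * 1024 * 1024)
    #define SMBD_MIN_IOSIZE     (512 * 1024)
    #define SMBD_MAX_IOSIZE     (16 * 1024 * 1024)

    /* effective smb-direct read/write size for a requested value sz != 0 */
    static unsigned int effective_smbd_io_size(unsigned int sz)
    {
            if (sz < SMBD_MIN_IOSIZE)
                    return SMBD_MIN_IOSIZE;   /* e.g. 64 KiB is raised to 512 KiB */
            if (sz > SMBD_MAX_IOSIZE)
                    return SMBD_MAX_IOSIZE;   /* e.g. 32 MiB is capped at 16 MiB */
            return sz;
    }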
+ +Signed-off-by: Namjae Jeon +Reviewed-by: Hyunchul Lee +Signed-off-by: Steve French +Signed-off-by: Sasha Levin +--- + fs/ksmbd/ksmbd_netlink.h | 3 ++- + fs/ksmbd/transport_ipc.c | 3 +++ + fs/ksmbd/transport_rdma.c | 8 +++++++- + fs/ksmbd/transport_rdma.h | 6 ++++++ + 4 files changed, 18 insertions(+), 2 deletions(-) + +diff --git a/fs/ksmbd/ksmbd_netlink.h b/fs/ksmbd/ksmbd_netlink.h +index ebe6ca08467a..52aa0adeb951 100644 +--- a/fs/ksmbd/ksmbd_netlink.h ++++ b/fs/ksmbd/ksmbd_netlink.h +@@ -104,7 +104,8 @@ struct ksmbd_startup_request { + */ + __u32 sub_auth[3]; /* Subauth value for Security ID */ + __u32 smb2_max_credits; /* MAX credits */ +- __u32 reserved[128]; /* Reserved room */ ++ __u32 smbd_max_io_size; /* smbd read write size */ ++ __u32 reserved[127]; /* Reserved room */ + __u32 ifc_list_sz; /* interfaces list size */ + __s8 ____payload[]; + }; +diff --git a/fs/ksmbd/transport_ipc.c b/fs/ksmbd/transport_ipc.c +index 3ad6881e0f7e..7cb0eeb07c80 100644 +--- a/fs/ksmbd/transport_ipc.c ++++ b/fs/ksmbd/transport_ipc.c +@@ -26,6 +26,7 @@ + #include "mgmt/ksmbd_ida.h" + #include "connection.h" + #include "transport_tcp.h" ++#include "transport_rdma.h" + + #define IPC_WAIT_TIMEOUT (2 * HZ) + +@@ -303,6 +304,8 @@ static int ipc_server_config_on_startup(struct ksmbd_startup_request *req) + init_smb2_max_trans_size(req->smb2_max_trans); + if (req->smb2_max_credits) + init_smb2_max_credits(req->smb2_max_credits); ++ if (req->smbd_max_io_size) ++ init_smbd_max_io_size(req->smbd_max_io_size); + + ret = ksmbd_set_netbios_name(req->netbios_name); + ret |= ksmbd_set_server_string(req->server_string); +diff --git a/fs/ksmbd/transport_rdma.c b/fs/ksmbd/transport_rdma.c +index b44a5e584bac..afc66b9765e7 100644 +--- a/fs/ksmbd/transport_rdma.c ++++ b/fs/ksmbd/transport_rdma.c +@@ -80,7 +80,7 @@ static int smb_direct_max_fragmented_recv_size = 1024 * 1024; + /* The maximum single-message size which can be received */ + static int smb_direct_max_receive_size = 8192; + +-static int smb_direct_max_read_write_size = 8 * 1024 * 1024; ++static int smb_direct_max_read_write_size = SMBD_DEFAULT_IOSIZE; + + static LIST_HEAD(smb_direct_device_list); + static DEFINE_RWLOCK(smb_direct_device_lock); +@@ -214,6 +214,12 @@ struct smb_direct_rdma_rw_msg { + struct scatterlist sg_list[]; + }; + ++void init_smbd_max_io_size(unsigned int sz) ++{ ++ sz = clamp_val(sz, SMBD_MIN_IOSIZE, SMBD_MAX_IOSIZE); ++ smb_direct_max_read_write_size = sz; ++} ++ + static inline int get_buf_page_count(void *buf, int size) + { + return DIV_ROUND_UP((uintptr_t)buf + size, PAGE_SIZE) - +diff --git a/fs/ksmbd/transport_rdma.h b/fs/ksmbd/transport_rdma.h +index 5567d93a6f96..e7b4e6790fab 100644 +--- a/fs/ksmbd/transport_rdma.h ++++ b/fs/ksmbd/transport_rdma.h +@@ -7,6 +7,10 @@ + #ifndef __KSMBD_TRANSPORT_RDMA_H__ + #define __KSMBD_TRANSPORT_RDMA_H__ + ++#define SMBD_DEFAULT_IOSIZE (8 * 1024 * 1024) ++#define SMBD_MIN_IOSIZE (512 * 1024) ++#define SMBD_MAX_IOSIZE (16 * 1024 * 1024) ++ + /* SMB DIRECT negotiation request packet [MS-SMBD] 2.2.1 */ + struct smb_direct_negotiate_req { + __le16 min_version; +@@ -52,10 +56,12 @@ struct smb_direct_data_transfer { + int ksmbd_rdma_init(void); + void ksmbd_rdma_destroy(void); + bool ksmbd_rdma_capable_netdev(struct net_device *netdev); ++void init_smbd_max_io_size(unsigned int sz); + #else + static inline int ksmbd_rdma_init(void) { return 0; } + static inline int ksmbd_rdma_destroy(void) { return 0; } + static inline bool ksmbd_rdma_capable_netdev(struct net_device *netdev) { return false; } 
++static inline void init_smbd_max_io_size(unsigned int sz) { } + #endif + + #endif /* __KSMBD_TRANSPORT_RDMA_H__ */ +-- +2.35.1 + diff --git a/queue-5.18/ksmbd-fix-wrong-smbd-max-read-write-size-check.patch b/queue-5.18/ksmbd-fix-wrong-smbd-max-read-write-size-check.patch new file mode 100644 index 00000000000..b6297e1e159 --- /dev/null +++ b/queue-5.18/ksmbd-fix-wrong-smbd-max-read-write-size-check.patch @@ -0,0 +1,172 @@ +From 1149a190b27509bf700591539f0cd164592d4ce0 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 16 May 2022 16:23:28 +0900 +Subject: ksmbd: fix wrong smbd max read/write size check + +From: Namjae Jeon + +[ Upstream commit 7a84399e1ce3f5f2fbec3e7dd93459ba25badc2f ] + +smb-direct max read/write size can be different with smb2 max read/write +size. So smb2_read() can return error by wrong max read/write size check. +This patch use smb_direct_max_read_write_size for this check in +smb-direct read/write(). + +Signed-off-by: Namjae Jeon +Reviewed-by: Hyunchul Lee +Signed-off-by: Steve French +Signed-off-by: Sasha Levin +--- + fs/ksmbd/smb2pdu.c | 39 +++++++++++++++++++++++++-------------- + fs/ksmbd/transport_rdma.c | 5 +++++ + fs/ksmbd/transport_rdma.h | 2 ++ + 3 files changed, 32 insertions(+), 14 deletions(-) + +diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c +index 8f86b8d6765f..6c8dd718b5db 100644 +--- a/fs/ksmbd/smb2pdu.c ++++ b/fs/ksmbd/smb2pdu.c +@@ -6194,6 +6194,8 @@ int smb2_read(struct ksmbd_work *work) + size_t length, mincount; + ssize_t nbytes = 0, remain_bytes = 0; + int err = 0; ++ bool is_rdma_channel = false; ++ unsigned int max_read_size = conn->vals->max_read_size; + + WORK_BUFFERS(work, req, rsp); + +@@ -6205,6 +6207,11 @@ int smb2_read(struct ksmbd_work *work) + + if (req->Channel == SMB2_CHANNEL_RDMA_V1_INVALIDATE || + req->Channel == SMB2_CHANNEL_RDMA_V1) { ++ is_rdma_channel = true; ++ max_read_size = get_smbd_max_read_write_size(); ++ } ++ ++ if (is_rdma_channel == true) { + unsigned int ch_offset = le16_to_cpu(req->ReadChannelInfoOffset); + + if (ch_offset < offsetof(struct smb2_read_req, Buffer)) { +@@ -6236,9 +6243,9 @@ int smb2_read(struct ksmbd_work *work) + length = le32_to_cpu(req->Length); + mincount = le32_to_cpu(req->MinimumCount); + +- if (length > conn->vals->max_read_size) { ++ if (length > max_read_size) { + ksmbd_debug(SMB, "limiting read size to max size(%u)\n", +- conn->vals->max_read_size); ++ max_read_size); + err = -EINVAL; + goto out; + } +@@ -6270,8 +6277,7 @@ int smb2_read(struct ksmbd_work *work) + ksmbd_debug(SMB, "nbytes %zu, offset %lld mincount %zu\n", + nbytes, offset, mincount); + +- if (req->Channel == SMB2_CHANNEL_RDMA_V1_INVALIDATE || +- req->Channel == SMB2_CHANNEL_RDMA_V1) { ++ if (is_rdma_channel == true) { + /* write data to the client using rdma channel */ + remain_bytes = smb2_read_rdma_channel(work, req, + work->aux_payload_buf, +@@ -6432,8 +6438,9 @@ int smb2_write(struct ksmbd_work *work) + size_t length; + ssize_t nbytes; + char *data_buf; +- bool writethrough = false; ++ bool writethrough = false, is_rdma_channel = false; + int err = 0; ++ unsigned int max_write_size = work->conn->vals->max_write_size; + + WORK_BUFFERS(work, req, rsp); + +@@ -6442,8 +6449,17 @@ int smb2_write(struct ksmbd_work *work) + return smb2_write_pipe(work); + } + ++ offset = le64_to_cpu(req->Offset); ++ length = le32_to_cpu(req->Length); ++ + if (req->Channel == SMB2_CHANNEL_RDMA_V1 || + req->Channel == SMB2_CHANNEL_RDMA_V1_INVALIDATE) { ++ is_rdma_channel = true; ++ max_write_size = get_smbd_max_read_write_size(); ++ 
length = le32_to_cpu(req->RemainingBytes); ++ } ++ ++ if (is_rdma_channel == true) { + unsigned int ch_offset = le16_to_cpu(req->WriteChannelInfoOffset); + + if (req->Length != 0 || req->DataOffset != 0 || +@@ -6478,12 +6494,9 @@ int smb2_write(struct ksmbd_work *work) + goto out; + } + +- offset = le64_to_cpu(req->Offset); +- length = le32_to_cpu(req->Length); +- +- if (length > work->conn->vals->max_write_size) { ++ if (length > max_write_size) { + ksmbd_debug(SMB, "limiting write size to max size(%u)\n", +- work->conn->vals->max_write_size); ++ max_write_size); + err = -EINVAL; + goto out; + } +@@ -6491,8 +6504,7 @@ int smb2_write(struct ksmbd_work *work) + if (le32_to_cpu(req->Flags) & SMB2_WRITEFLAG_WRITE_THROUGH) + writethrough = true; + +- if (req->Channel != SMB2_CHANNEL_RDMA_V1 && +- req->Channel != SMB2_CHANNEL_RDMA_V1_INVALIDATE) { ++ if (is_rdma_channel == false) { + if ((u64)le16_to_cpu(req->DataOffset) + length > + get_rfc1002_len(work->request_buf)) { + pr_err("invalid write data offset %u, smb_len %u\n", +@@ -6518,8 +6530,7 @@ int smb2_write(struct ksmbd_work *work) + /* read data from the client using rdma channel, and + * write the data. + */ +- nbytes = smb2_write_rdma_channel(work, req, fp, offset, +- le32_to_cpu(req->RemainingBytes), ++ nbytes = smb2_write_rdma_channel(work, req, fp, offset, length, + writethrough); + if (nbytes < 0) { + err = (int)nbytes; +diff --git a/fs/ksmbd/transport_rdma.c b/fs/ksmbd/transport_rdma.c +index afc66b9765e7..c6af8d89b7f7 100644 +--- a/fs/ksmbd/transport_rdma.c ++++ b/fs/ksmbd/transport_rdma.c +@@ -220,6 +220,11 @@ void init_smbd_max_io_size(unsigned int sz) + smb_direct_max_read_write_size = sz; + } + ++unsigned int get_smbd_max_read_write_size(void) ++{ ++ return smb_direct_max_read_write_size; ++} ++ + static inline int get_buf_page_count(void *buf, int size) + { + return DIV_ROUND_UP((uintptr_t)buf + size, PAGE_SIZE) - +diff --git a/fs/ksmbd/transport_rdma.h b/fs/ksmbd/transport_rdma.h +index e7b4e6790fab..77aee4e5c9dc 100644 +--- a/fs/ksmbd/transport_rdma.h ++++ b/fs/ksmbd/transport_rdma.h +@@ -57,11 +57,13 @@ int ksmbd_rdma_init(void); + void ksmbd_rdma_destroy(void); + bool ksmbd_rdma_capable_netdev(struct net_device *netdev); + void init_smbd_max_io_size(unsigned int sz); ++unsigned int get_smbd_max_read_write_size(void); + #else + static inline int ksmbd_rdma_init(void) { return 0; } + static inline int ksmbd_rdma_destroy(void) { return 0; } + static inline bool ksmbd_rdma_capable_netdev(struct net_device *netdev) { return false; } + static inline void init_smbd_max_io_size(unsigned int sz) { } ++static inline unsigned int get_smbd_max_read_write_size(void) { return 0; } + #endif + + #endif /* __KSMBD_TRANSPORT_RDMA_H__ */ +-- +2.35.1 + diff --git a/queue-5.18/ksmbd-prevent-out-of-bound-read-for-smb2_write.patch b/queue-5.18/ksmbd-prevent-out-of-bound-read-for-smb2_write.patch new file mode 100644 index 00000000000..e872b663c31 --- /dev/null +++ b/queue-5.18/ksmbd-prevent-out-of-bound-read-for-smb2_write.patch @@ -0,0 +1,128 @@ +From 79288fb9f5ec9fb25c6f827a60c7233dece972c4 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 28 Jul 2022 23:41:51 +0900 +Subject: ksmbd: prevent out of bound read for SMB2_WRITE + +From: Hyunchul Lee + +[ Upstream commit ac60778b87e45576d7bfdbd6f53df902654e6f09 ] + +OOB read memory can be written to a file, +if DataOffset is 0 and Length is too large +in SMB2_WRITE request of compound request. 
+ +To prevent this, when checking the length of +the data area of SMB2_WRITE in smb2_get_data_area_len(), +let the minimum of DataOffset be the size of +SMB2 header + the size of SMB2_WRITE header. + +This bug can lead an oops looking something like: + +[ 798.008715] BUG: KASAN: slab-out-of-bounds in copy_page_from_iter_atomic+0xd3d/0x14b0 +[ 798.008724] Read of size 252 at addr ffff88800f863e90 by task kworker/0:2/2859 +... +[ 798.008754] Call Trace: +[ 798.008756] +[ 798.008759] dump_stack_lvl+0x49/0x5f +[ 798.008764] print_report.cold+0x5e/0x5cf +[ 798.008768] ? __filemap_get_folio+0x285/0x6d0 +[ 798.008774] ? copy_page_from_iter_atomic+0xd3d/0x14b0 +[ 798.008777] kasan_report+0xaa/0x120 +[ 798.008781] ? copy_page_from_iter_atomic+0xd3d/0x14b0 +[ 798.008784] kasan_check_range+0x100/0x1e0 +[ 798.008788] memcpy+0x24/0x60 +[ 798.008792] copy_page_from_iter_atomic+0xd3d/0x14b0 +[ 798.008795] ? pagecache_get_page+0x53/0x160 +[ 798.008799] ? iov_iter_get_pages_alloc+0x1590/0x1590 +[ 798.008803] ? ext4_write_begin+0xfc0/0xfc0 +[ 798.008807] ? current_time+0x72/0x210 +[ 798.008811] generic_perform_write+0x2c8/0x530 +[ 798.008816] ? filemap_fdatawrite_wbc+0x180/0x180 +[ 798.008820] ? down_write+0xb4/0x120 +[ 798.008824] ? down_write_killable+0x130/0x130 +[ 798.008829] ext4_buffered_write_iter+0x137/0x2c0 +[ 798.008833] ext4_file_write_iter+0x40b/0x1490 +[ 798.008837] ? __fsnotify_parent+0x275/0xb20 +[ 798.008842] ? __fsnotify_update_child_dentry_flags+0x2c0/0x2c0 +[ 798.008846] ? ext4_buffered_write_iter+0x2c0/0x2c0 +[ 798.008851] __kernel_write+0x3a1/0xa70 +[ 798.008855] ? __x64_sys_preadv2+0x160/0x160 +[ 798.008860] ? security_file_permission+0x4a/0xa0 +[ 798.008865] kernel_write+0xbb/0x360 +[ 798.008869] ksmbd_vfs_write+0x27e/0xb90 [ksmbd] +[ 798.008881] ? ksmbd_vfs_read+0x830/0x830 [ksmbd] +[ 798.008892] ? _raw_read_unlock+0x2a/0x50 +[ 798.008896] smb2_write+0xb45/0x14e0 [ksmbd] +[ 798.008909] ? __kasan_check_write+0x14/0x20 +[ 798.008912] ? _raw_spin_lock_bh+0xd0/0xe0 +[ 798.008916] ? smb2_read+0x15e0/0x15e0 [ksmbd] +[ 798.008927] ? memcpy+0x4e/0x60 +[ 798.008931] ? _raw_spin_unlock+0x19/0x30 +[ 798.008934] ? ksmbd_smb2_check_message+0x16af/0x2350 [ksmbd] +[ 798.008946] ? _raw_spin_lock_bh+0xe0/0xe0 +[ 798.008950] handle_ksmbd_work+0x30e/0x1020 [ksmbd] +[ 798.008962] process_one_work+0x778/0x11c0 +[ 798.008966] ? _raw_spin_lock_irq+0x8e/0xe0 +[ 798.008970] worker_thread+0x544/0x1180 +[ 798.008973] ? __cpuidle_text_end+0x4/0x4 +[ 798.008977] kthread+0x282/0x320 +[ 798.008982] ? process_one_work+0x11c0/0x11c0 +[ 798.008985] ? 
kthread_complete_and_exit+0x30/0x30 +[ 798.008989] ret_from_fork+0x1f/0x30 +[ 798.008995] + +Fixes: e2f34481b24d ("cifsd: add server-side procedures for SMB3") +Cc: stable@vger.kernel.org +Reported-by: zdi-disclosures@trendmicro.com # ZDI-CAN-17817 +Signed-off-by: Hyunchul Lee +Acked-by: Namjae Jeon +Signed-off-by: Steve French +Signed-off-by: Sasha Levin +--- + fs/ksmbd/smb2misc.c | 7 +++++-- + fs/ksmbd/smb2pdu.c | 8 +++----- + 2 files changed, 8 insertions(+), 7 deletions(-) + +diff --git a/fs/ksmbd/smb2misc.c b/fs/ksmbd/smb2misc.c +index 03bcd7ce0c75..6e25ace36568 100644 +--- a/fs/ksmbd/smb2misc.c ++++ b/fs/ksmbd/smb2misc.c +@@ -131,8 +131,11 @@ static int smb2_get_data_area_len(unsigned int *off, unsigned int *len, + *len = le16_to_cpu(((struct smb2_read_req *)hdr)->ReadChannelInfoLength); + break; + case SMB2_WRITE: +- if (((struct smb2_write_req *)hdr)->DataOffset) { +- *off = le16_to_cpu(((struct smb2_write_req *)hdr)->DataOffset); ++ if (((struct smb2_write_req *)hdr)->DataOffset || ++ ((struct smb2_write_req *)hdr)->Length) { ++ *off = max_t(unsigned int, ++ le16_to_cpu(((struct smb2_write_req *)hdr)->DataOffset), ++ offsetof(struct smb2_write_req, Buffer)); + *len = le32_to_cpu(((struct smb2_write_req *)hdr)->Length); + break; + } +diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c +index 6c8dd718b5db..85a9ed7156ea 100644 +--- a/fs/ksmbd/smb2pdu.c ++++ b/fs/ksmbd/smb2pdu.c +@@ -6505,14 +6505,12 @@ int smb2_write(struct ksmbd_work *work) + writethrough = true; + + if (is_rdma_channel == false) { +- if ((u64)le16_to_cpu(req->DataOffset) + length > +- get_rfc1002_len(work->request_buf)) { +- pr_err("invalid write data offset %u, smb_len %u\n", +- le16_to_cpu(req->DataOffset), +- get_rfc1002_len(work->request_buf)); ++ if (le16_to_cpu(req->DataOffset) < ++ offsetof(struct smb2_write_req, Buffer)) { + err = -EINVAL; + goto out; + } ++ + data_buf = (char *)(((char *)&req->hdr.ProtocolId) + + le16_to_cpu(req->DataOffset)); + +-- +2.35.1 + diff --git a/queue-5.18/ksmbd-smbd-change-prototypes-of-rdma-read-write-rela.patch b/queue-5.18/ksmbd-smbd-change-prototypes-of-rdma-read-write-rela.patch new file mode 100644 index 00000000000..587133506b3 --- /dev/null +++ b/queue-5.18/ksmbd-smbd-change-prototypes-of-rdma-read-write-rela.patch @@ -0,0 +1,258 @@ +From a2389c2ae23605f7b53ab49541ee17285fdf563e Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Sat, 30 Apr 2022 08:30:25 +0900 +Subject: ksmbd: smbd: change prototypes of RDMA read/write related functions + +From: Hyunchul Lee + +[ Upstream commit 1807abcf8778bcbbf584fe54da9ccbe9029c49bb ] + +Change the prototypes of RDMA read/write +operations to accept a pointer and length +of buffer descriptors. 
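Concretely, the smb2pdu.c hunks below now pass the raw channel-info region of the SMB2 request straight through, so the transport receives every buffer descriptor instead of a single pre-parsed (token, offset, length) triple. The call shape after this change, condensed from those hunks rather than new API:

    struct smb2_buffer_desc_v1 *desc = (struct smb2_buffer_desc_v1 *)
            ((char *)req + le16_to_cpu(req->ReadChannelInfoOffset));
    unsigned int desc_len = le16_to_cpu(req->ReadChannelInfoLength);

    err = ksmbd_conn_rdma_write(work->conn, data_buf, length, desc, desc_len);
    /* for now the transport still consumes only desc[0].token and desc[0].offset */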
+ +Signed-off-by: Hyunchul Lee +Acked-by: Namjae Jeon +Signed-off-by: Steve French +Signed-off-by: Sasha Levin +--- + fs/ksmbd/connection.c | 20 ++++++++++---------- + fs/ksmbd/connection.h | 27 ++++++++++++++++----------- + fs/ksmbd/smb2pdu.c | 23 ++++++++--------------- + fs/ksmbd/transport_rdma.c | 30 +++++++++++++++++------------- + 4 files changed, 51 insertions(+), 49 deletions(-) + +diff --git a/fs/ksmbd/connection.c b/fs/ksmbd/connection.c +index bc6050b67256..e8f476c5f189 100644 +--- a/fs/ksmbd/connection.c ++++ b/fs/ksmbd/connection.c +@@ -205,31 +205,31 @@ int ksmbd_conn_write(struct ksmbd_work *work) + return 0; + } + +-int ksmbd_conn_rdma_read(struct ksmbd_conn *conn, void *buf, +- unsigned int buflen, u32 remote_key, u64 remote_offset, +- u32 remote_len) ++int ksmbd_conn_rdma_read(struct ksmbd_conn *conn, ++ void *buf, unsigned int buflen, ++ struct smb2_buffer_desc_v1 *desc, ++ unsigned int desc_len) + { + int ret = -EINVAL; + + if (conn->transport->ops->rdma_read) + ret = conn->transport->ops->rdma_read(conn->transport, + buf, buflen, +- remote_key, remote_offset, +- remote_len); ++ desc, desc_len); + return ret; + } + +-int ksmbd_conn_rdma_write(struct ksmbd_conn *conn, void *buf, +- unsigned int buflen, u32 remote_key, +- u64 remote_offset, u32 remote_len) ++int ksmbd_conn_rdma_write(struct ksmbd_conn *conn, ++ void *buf, unsigned int buflen, ++ struct smb2_buffer_desc_v1 *desc, ++ unsigned int desc_len) + { + int ret = -EINVAL; + + if (conn->transport->ops->rdma_write) + ret = conn->transport->ops->rdma_write(conn->transport, + buf, buflen, +- remote_key, remote_offset, +- remote_len); ++ desc, desc_len); + return ret; + } + +diff --git a/fs/ksmbd/connection.h b/fs/ksmbd/connection.h +index 7a59aacb5daa..98c1cbe45ec9 100644 +--- a/fs/ksmbd/connection.h ++++ b/fs/ksmbd/connection.h +@@ -122,11 +122,14 @@ struct ksmbd_transport_ops { + int (*writev)(struct ksmbd_transport *t, struct kvec *iovs, int niov, + int size, bool need_invalidate_rkey, + unsigned int remote_key); +- int (*rdma_read)(struct ksmbd_transport *t, void *buf, unsigned int len, +- u32 remote_key, u64 remote_offset, u32 remote_len); +- int (*rdma_write)(struct ksmbd_transport *t, void *buf, +- unsigned int len, u32 remote_key, u64 remote_offset, +- u32 remote_len); ++ int (*rdma_read)(struct ksmbd_transport *t, ++ void *buf, unsigned int len, ++ struct smb2_buffer_desc_v1 *desc, ++ unsigned int desc_len); ++ int (*rdma_write)(struct ksmbd_transport *t, ++ void *buf, unsigned int len, ++ struct smb2_buffer_desc_v1 *desc, ++ unsigned int desc_len); + }; + + struct ksmbd_transport { +@@ -148,12 +151,14 @@ struct ksmbd_conn *ksmbd_conn_alloc(void); + void ksmbd_conn_free(struct ksmbd_conn *conn); + bool ksmbd_conn_lookup_dialect(struct ksmbd_conn *c); + int ksmbd_conn_write(struct ksmbd_work *work); +-int ksmbd_conn_rdma_read(struct ksmbd_conn *conn, void *buf, +- unsigned int buflen, u32 remote_key, u64 remote_offset, +- u32 remote_len); +-int ksmbd_conn_rdma_write(struct ksmbd_conn *conn, void *buf, +- unsigned int buflen, u32 remote_key, u64 remote_offset, +- u32 remote_len); ++int ksmbd_conn_rdma_read(struct ksmbd_conn *conn, ++ void *buf, unsigned int buflen, ++ struct smb2_buffer_desc_v1 *desc, ++ unsigned int desc_len); ++int ksmbd_conn_rdma_write(struct ksmbd_conn *conn, ++ void *buf, unsigned int buflen, ++ struct smb2_buffer_desc_v1 *desc, ++ unsigned int desc_len); + void ksmbd_conn_enqueue_request(struct ksmbd_work *work); + int ksmbd_conn_try_dequeue_request(struct ksmbd_work *work); + void 
ksmbd_conn_init_server_callbacks(struct ksmbd_conn_ops *ops); +diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c +index 5df87fe18905..8f86b8d6765f 100644 +--- a/fs/ksmbd/smb2pdu.c ++++ b/fs/ksmbd/smb2pdu.c +@@ -6132,7 +6132,6 @@ static noinline int smb2_read_pipe(struct ksmbd_work *work) + static int smb2_set_remote_key_for_rdma(struct ksmbd_work *work, + struct smb2_buffer_desc_v1 *desc, + __le32 Channel, +- __le16 ChannelInfoOffset, + __le16 ChannelInfoLength) + { + unsigned int i, ch_count; +@@ -6158,7 +6157,8 @@ static int smb2_set_remote_key_for_rdma(struct ksmbd_work *work, + + work->need_invalidate_rkey = + (Channel == SMB2_CHANNEL_RDMA_V1_INVALIDATE); +- work->remote_key = le32_to_cpu(desc->token); ++ if (Channel == SMB2_CHANNEL_RDMA_V1_INVALIDATE) ++ work->remote_key = le32_to_cpu(desc->token); + return 0; + } + +@@ -6166,14 +6166,12 @@ static ssize_t smb2_read_rdma_channel(struct ksmbd_work *work, + struct smb2_read_req *req, void *data_buf, + size_t length) + { +- struct smb2_buffer_desc_v1 *desc = +- (struct smb2_buffer_desc_v1 *)&req->Buffer[0]; + int err; + + err = ksmbd_conn_rdma_write(work->conn, data_buf, length, +- le32_to_cpu(desc->token), +- le64_to_cpu(desc->offset), +- le32_to_cpu(desc->length)); ++ (struct smb2_buffer_desc_v1 *) ++ ((char *)req + le16_to_cpu(req->ReadChannelInfoOffset)), ++ le16_to_cpu(req->ReadChannelInfoLength)); + if (err) + return err; + +@@ -6217,7 +6215,6 @@ int smb2_read(struct ksmbd_work *work) + (struct smb2_buffer_desc_v1 *) + ((char *)req + ch_offset), + req->Channel, +- req->ReadChannelInfoOffset, + req->ReadChannelInfoLength); + if (err) + goto out; +@@ -6395,21 +6392,18 @@ static ssize_t smb2_write_rdma_channel(struct ksmbd_work *work, + struct ksmbd_file *fp, + loff_t offset, size_t length, bool sync) + { +- struct smb2_buffer_desc_v1 *desc; + char *data_buf; + int ret; + ssize_t nbytes; + +- desc = (struct smb2_buffer_desc_v1 *)&req->Buffer[0]; +- + data_buf = kvmalloc(length, GFP_KERNEL | __GFP_ZERO); + if (!data_buf) + return -ENOMEM; + + ret = ksmbd_conn_rdma_read(work->conn, data_buf, length, +- le32_to_cpu(desc->token), +- le64_to_cpu(desc->offset), +- le32_to_cpu(desc->length)); ++ (struct smb2_buffer_desc_v1 *) ++ ((char *)req + le16_to_cpu(req->WriteChannelInfoOffset)), ++ le16_to_cpu(req->WriteChannelInfoLength)); + if (ret < 0) { + kvfree(data_buf); + return ret; +@@ -6461,7 +6455,6 @@ int smb2_write(struct ksmbd_work *work) + (struct smb2_buffer_desc_v1 *) + ((char *)req + ch_offset), + req->Channel, +- req->WriteChannelInfoOffset, + req->WriteChannelInfoLength); + if (err) + goto out; +diff --git a/fs/ksmbd/transport_rdma.c b/fs/ksmbd/transport_rdma.c +index 3f5d13571694..479d279ee146 100644 +--- a/fs/ksmbd/transport_rdma.c ++++ b/fs/ksmbd/transport_rdma.c +@@ -1352,14 +1352,18 @@ static void write_done(struct ib_cq *cq, struct ib_wc *wc) + read_write_done(cq, wc, DMA_TO_DEVICE); + } + +-static int smb_direct_rdma_xmit(struct smb_direct_transport *t, void *buf, +- int buf_len, u32 remote_key, u64 remote_offset, +- u32 remote_len, bool is_read) ++static int smb_direct_rdma_xmit(struct smb_direct_transport *t, ++ void *buf, int buf_len, ++ struct smb2_buffer_desc_v1 *desc, ++ unsigned int desc_len, ++ bool is_read) + { + struct smb_direct_rdma_rw_msg *msg; + int ret; + DECLARE_COMPLETION_ONSTACK(completion); + struct ib_send_wr *first_wr = NULL; ++ u32 remote_key = le32_to_cpu(desc[0].token); ++ u64 remote_offset = le64_to_cpu(desc[0].offset); + + ret = wait_for_credits(t, &t->wait_rw_avail_ops, &t->rw_avail_ops); + if 
(ret < 0) +@@ -1424,22 +1428,22 @@ static int smb_direct_rdma_xmit(struct smb_direct_transport *t, void *buf, + return ret; + } + +-static int smb_direct_rdma_write(struct ksmbd_transport *t, void *buf, +- unsigned int buflen, u32 remote_key, +- u64 remote_offset, u32 remote_len) ++static int smb_direct_rdma_write(struct ksmbd_transport *t, ++ void *buf, unsigned int buflen, ++ struct smb2_buffer_desc_v1 *desc, ++ unsigned int desc_len) + { + return smb_direct_rdma_xmit(smb_trans_direct_transfort(t), buf, buflen, +- remote_key, remote_offset, +- remote_len, false); ++ desc, desc_len, false); + } + +-static int smb_direct_rdma_read(struct ksmbd_transport *t, void *buf, +- unsigned int buflen, u32 remote_key, +- u64 remote_offset, u32 remote_len) ++static int smb_direct_rdma_read(struct ksmbd_transport *t, ++ void *buf, unsigned int buflen, ++ struct smb2_buffer_desc_v1 *desc, ++ unsigned int desc_len) + { + return smb_direct_rdma_xmit(smb_trans_direct_transfort(t), buf, buflen, +- remote_key, remote_offset, +- remote_len, true); ++ desc, desc_len, true); + } + + static void smb_direct_disconnect(struct ksmbd_transport *t) +-- +2.35.1 + diff --git a/queue-5.18/ksmbd-smbd-introduce-read-write-credits-for-rdma-rea.patch b/queue-5.18/ksmbd-smbd-introduce-read-write-credits-for-rdma-rea.patch new file mode 100644 index 00000000000..4c4ecc10f86 --- /dev/null +++ b/queue-5.18/ksmbd-smbd-introduce-read-write-credits-for-rdma-rea.patch @@ -0,0 +1,294 @@ +From 874e8676953ec14919db995ac2610534d933d174 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Sat, 30 Apr 2022 08:30:26 +0900 +Subject: ksmbd: smbd: introduce read/write credits for RDMA read/write + +From: Hyunchul Lee + +[ Upstream commit ddbdc861e37c168cf2fb8a7b7477f5d18b4daf76 ] + +SMB2_READ/SMB2_WRITE request has to be granted the number +of rw credits, the pages the request wants to transfer +/ the maximum pages which can be registered with one +MR to read and write a file. +And allocate enough RDMA resources for the maximum +number of rw credits allowed by ksmbd. 
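A rough worked example of the credit arithmetic introduced below, assuming 4 KiB pages and a device whose max_fast_reg_page_list_len is at least 256, so pages_per_rw_credit is capped at 256 by smb_direct_get_max_fr_pages():

    #include <stdio.h>

    #define DIV_ROUND_UP(n, d)  (((n) + (d) - 1) / (d))
    #define PAGE_SIZE           4096UL

    int main(void)
    {
            unsigned long pages_per_rw_credit = 256;        /* capped fast-reg page list length */
            unsigned long max_rdma_rw_size = 8UL << 20;     /* new 8 MiB default */

            /* total credits the transport provisions QP/MR resources for */
            unsigned long max_rw_credits =
                    DIV_ROUND_UP(max_rdma_rw_size,
                                 (pages_per_rw_credit - 1) * PAGE_SIZE);

            /* credits one unaligned 1 MiB request consumes: 256 + 1 pages */
            unsigned long req_credits = DIV_ROUND_UP(256UL + 1, pages_per_rw_credit);

            printf("max_rw_credits = %lu, 1 MiB request needs %lu credits\n",
                   max_rw_credits, req_credits);            /* prints 9 and 2 */
            return 0;
    }

So a request is granted DIV_ROUND_UP(page count, pages_per_rw_credit) credits and waits when fewer are available, matching calc_rw_credits() and wait_for_rw_credits() in the hunks below.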
+ +Signed-off-by: Hyunchul Lee +Acked-by: Namjae Jeon +Signed-off-by: Steve French +Signed-off-by: Sasha Levin +--- + fs/ksmbd/transport_rdma.c | 120 ++++++++++++++++++++++---------------- + 1 file changed, 71 insertions(+), 49 deletions(-) + +diff --git a/fs/ksmbd/transport_rdma.c b/fs/ksmbd/transport_rdma.c +index 479d279ee146..b44a5e584bac 100644 +--- a/fs/ksmbd/transport_rdma.c ++++ b/fs/ksmbd/transport_rdma.c +@@ -80,9 +80,7 @@ static int smb_direct_max_fragmented_recv_size = 1024 * 1024; + /* The maximum single-message size which can be received */ + static int smb_direct_max_receive_size = 8192; + +-static int smb_direct_max_read_write_size = 524224; +- +-static int smb_direct_max_outstanding_rw_ops = 8; ++static int smb_direct_max_read_write_size = 8 * 1024 * 1024; + + static LIST_HEAD(smb_direct_device_list); + static DEFINE_RWLOCK(smb_direct_device_lock); +@@ -147,10 +145,12 @@ struct smb_direct_transport { + atomic_t send_credits; + spinlock_t lock_new_recv_credits; + int new_recv_credits; +- atomic_t rw_avail_ops; ++ int max_rw_credits; ++ int pages_per_rw_credit; ++ atomic_t rw_credits; + + wait_queue_head_t wait_send_credits; +- wait_queue_head_t wait_rw_avail_ops; ++ wait_queue_head_t wait_rw_credits; + + mempool_t *sendmsg_mempool; + struct kmem_cache *sendmsg_cache; +@@ -377,7 +377,7 @@ static struct smb_direct_transport *alloc_transport(struct rdma_cm_id *cm_id) + t->reassembly_queue_length = 0; + init_waitqueue_head(&t->wait_reassembly_queue); + init_waitqueue_head(&t->wait_send_credits); +- init_waitqueue_head(&t->wait_rw_avail_ops); ++ init_waitqueue_head(&t->wait_rw_credits); + + spin_lock_init(&t->receive_credit_lock); + spin_lock_init(&t->recvmsg_queue_lock); +@@ -984,18 +984,19 @@ static int smb_direct_flush_send_list(struct smb_direct_transport *t, + } + + static int wait_for_credits(struct smb_direct_transport *t, +- wait_queue_head_t *waitq, atomic_t *credits) ++ wait_queue_head_t *waitq, atomic_t *total_credits, ++ int needed) + { + int ret; + + do { +- if (atomic_dec_return(credits) >= 0) ++ if (atomic_sub_return(needed, total_credits) >= 0) + return 0; + +- atomic_inc(credits); ++ atomic_add(needed, total_credits); + ret = wait_event_interruptible(*waitq, +- atomic_read(credits) > 0 || +- t->status != SMB_DIRECT_CS_CONNECTED); ++ atomic_read(total_credits) >= needed || ++ t->status != SMB_DIRECT_CS_CONNECTED); + + if (t->status != SMB_DIRECT_CS_CONNECTED) + return -ENOTCONN; +@@ -1016,7 +1017,19 @@ static int wait_for_send_credits(struct smb_direct_transport *t, + return ret; + } + +- return wait_for_credits(t, &t->wait_send_credits, &t->send_credits); ++ return wait_for_credits(t, &t->wait_send_credits, &t->send_credits, 1); ++} ++ ++static int wait_for_rw_credits(struct smb_direct_transport *t, int credits) ++{ ++ return wait_for_credits(t, &t->wait_rw_credits, &t->rw_credits, credits); ++} ++ ++static int calc_rw_credits(struct smb_direct_transport *t, ++ char *buf, unsigned int len) ++{ ++ return DIV_ROUND_UP(get_buf_page_count(buf, len), ++ t->pages_per_rw_credit); + } + + static int smb_direct_create_header(struct smb_direct_transport *t, +@@ -1332,8 +1345,8 @@ static void read_write_done(struct ib_cq *cq, struct ib_wc *wc, + smb_direct_disconnect_rdma_connection(t); + } + +- if (atomic_inc_return(&t->rw_avail_ops) > 0) +- wake_up(&t->wait_rw_avail_ops); ++ if (atomic_inc_return(&t->rw_credits) > 0) ++ wake_up(&t->wait_rw_credits); + + rdma_rw_ctx_destroy(&msg->rw_ctx, t->qp, t->qp->port, + msg->sg_list, msg->sgt.nents, dir); +@@ -1364,8 +1377,10 @@ 
static int smb_direct_rdma_xmit(struct smb_direct_transport *t, + struct ib_send_wr *first_wr = NULL; + u32 remote_key = le32_to_cpu(desc[0].token); + u64 remote_offset = le64_to_cpu(desc[0].offset); ++ int credits_needed; + +- ret = wait_for_credits(t, &t->wait_rw_avail_ops, &t->rw_avail_ops); ++ credits_needed = calc_rw_credits(t, buf, buf_len); ++ ret = wait_for_rw_credits(t, credits_needed); + if (ret < 0) + return ret; + +@@ -1373,7 +1388,7 @@ static int smb_direct_rdma_xmit(struct smb_direct_transport *t, + msg = kmalloc(offsetof(struct smb_direct_rdma_rw_msg, sg_list) + + sizeof(struct scatterlist) * SG_CHUNK_SIZE, GFP_KERNEL); + if (!msg) { +- atomic_inc(&t->rw_avail_ops); ++ atomic_add(credits_needed, &t->rw_credits); + return -ENOMEM; + } + +@@ -1382,7 +1397,7 @@ static int smb_direct_rdma_xmit(struct smb_direct_transport *t, + get_buf_page_count(buf, buf_len), + msg->sg_list, SG_CHUNK_SIZE); + if (ret) { +- atomic_inc(&t->rw_avail_ops); ++ atomic_add(credits_needed, &t->rw_credits); + kfree(msg); + return -ENOMEM; + } +@@ -1418,7 +1433,7 @@ static int smb_direct_rdma_xmit(struct smb_direct_transport *t, + return 0; + + err: +- atomic_inc(&t->rw_avail_ops); ++ atomic_add(credits_needed, &t->rw_credits); + if (first_wr) + rdma_rw_ctx_destroy(&msg->rw_ctx, t->qp, t->qp->port, + msg->sg_list, msg->sgt.nents, +@@ -1643,11 +1658,19 @@ static int smb_direct_prepare_negotiation(struct smb_direct_transport *t) + return ret; + } + ++static unsigned int smb_direct_get_max_fr_pages(struct smb_direct_transport *t) ++{ ++ return min_t(unsigned int, ++ t->cm_id->device->attrs.max_fast_reg_page_list_len, ++ 256); ++} ++ + static int smb_direct_init_params(struct smb_direct_transport *t, + struct ib_qp_cap *cap) + { + struct ib_device *device = t->cm_id->device; +- int max_send_sges, max_pages, max_rw_wrs, max_send_wrs; ++ int max_send_sges, max_rw_wrs, max_send_wrs; ++ unsigned int max_sge_per_wr, wrs_per_credit; + + /* need 2 more sge. because a SMB_DIRECT header will be mapped, + * and maybe a send buffer could be not page aligned. +@@ -1659,25 +1682,31 @@ static int smb_direct_init_params(struct smb_direct_transport *t, + return -EINVAL; + } + +- /* +- * allow smb_direct_max_outstanding_rw_ops of in-flight RDMA +- * read/writes. HCA guarantees at least max_send_sge of sges for +- * a RDMA read/write work request, and if memory registration is used, +- * we need reg_mr, local_inv wrs for each read/write. ++ /* Calculate the number of work requests for RDMA R/W. ++ * The maximum number of pages which can be registered ++ * with one Memory region can be transferred with one ++ * R/W credit. And at least 4 work requests for each credit ++ * are needed for MR registration, RDMA R/W, local & remote ++ * MR invalidation. 
+ */ + t->max_rdma_rw_size = smb_direct_max_read_write_size; +- max_pages = DIV_ROUND_UP(t->max_rdma_rw_size, PAGE_SIZE) + 1; +- max_rw_wrs = DIV_ROUND_UP(max_pages, SMB_DIRECT_MAX_SEND_SGES); +- max_rw_wrs += rdma_rw_mr_factor(device, t->cm_id->port_num, +- max_pages) * 2; +- max_rw_wrs *= smb_direct_max_outstanding_rw_ops; ++ t->pages_per_rw_credit = smb_direct_get_max_fr_pages(t); ++ t->max_rw_credits = DIV_ROUND_UP(t->max_rdma_rw_size, ++ (t->pages_per_rw_credit - 1) * ++ PAGE_SIZE); ++ ++ max_sge_per_wr = min_t(unsigned int, device->attrs.max_send_sge, ++ device->attrs.max_sge_rd); ++ wrs_per_credit = max_t(unsigned int, 4, ++ DIV_ROUND_UP(t->pages_per_rw_credit, ++ max_sge_per_wr) + 1); ++ max_rw_wrs = t->max_rw_credits * wrs_per_credit; + + max_send_wrs = smb_direct_send_credit_target + max_rw_wrs; + if (max_send_wrs > device->attrs.max_cqe || + max_send_wrs > device->attrs.max_qp_wr) { +- pr_err("consider lowering send_credit_target = %d, or max_outstanding_rw_ops = %d\n", +- smb_direct_send_credit_target, +- smb_direct_max_outstanding_rw_ops); ++ pr_err("consider lowering send_credit_target = %d\n", ++ smb_direct_send_credit_target); + pr_err("Possible CQE overrun, device reporting max_cqe %d max_qp_wr %d\n", + device->attrs.max_cqe, device->attrs.max_qp_wr); + return -EINVAL; +@@ -1712,7 +1741,7 @@ static int smb_direct_init_params(struct smb_direct_transport *t, + + t->send_credit_target = smb_direct_send_credit_target; + atomic_set(&t->send_credits, 0); +- atomic_set(&t->rw_avail_ops, smb_direct_max_outstanding_rw_ops); ++ atomic_set(&t->rw_credits, t->max_rw_credits); + + t->max_send_size = smb_direct_max_send_size; + t->max_recv_size = smb_direct_max_receive_size; +@@ -1720,12 +1749,10 @@ static int smb_direct_init_params(struct smb_direct_transport *t, + + cap->max_send_wr = max_send_wrs; + cap->max_recv_wr = t->recv_credit_max; +- cap->max_send_sge = SMB_DIRECT_MAX_SEND_SGES; ++ cap->max_send_sge = max_sge_per_wr; + cap->max_recv_sge = SMB_DIRECT_MAX_RECV_SGES; + cap->max_inline_data = 0; +- cap->max_rdma_ctxs = +- rdma_rw_mr_factor(device, t->cm_id->port_num, max_pages) * +- smb_direct_max_outstanding_rw_ops; ++ cap->max_rdma_ctxs = t->max_rw_credits; + return 0; + } + +@@ -1818,7 +1845,8 @@ static int smb_direct_create_qpair(struct smb_direct_transport *t, + } + + t->send_cq = ib_alloc_cq(t->cm_id->device, t, +- t->send_credit_target, 0, IB_POLL_WORKQUEUE); ++ smb_direct_send_credit_target + cap->max_rdma_ctxs, ++ 0, IB_POLL_WORKQUEUE); + if (IS_ERR(t->send_cq)) { + pr_err("Can't create RDMA send CQ\n"); + ret = PTR_ERR(t->send_cq); +@@ -1827,8 +1855,7 @@ static int smb_direct_create_qpair(struct smb_direct_transport *t, + } + + t->recv_cq = ib_alloc_cq(t->cm_id->device, t, +- cap->max_send_wr + cap->max_rdma_ctxs, +- 0, IB_POLL_WORKQUEUE); ++ t->recv_credit_max, 0, IB_POLL_WORKQUEUE); + if (IS_ERR(t->recv_cq)) { + pr_err("Can't create RDMA recv CQ\n"); + ret = PTR_ERR(t->recv_cq); +@@ -1857,17 +1884,12 @@ static int smb_direct_create_qpair(struct smb_direct_transport *t, + + pages_per_rw = DIV_ROUND_UP(t->max_rdma_rw_size, PAGE_SIZE) + 1; + if (pages_per_rw > t->cm_id->device->attrs.max_sgl_rd) { +- int pages_per_mr, mr_count; +- +- pages_per_mr = min_t(int, pages_per_rw, +- t->cm_id->device->attrs.max_fast_reg_page_list_len); +- mr_count = DIV_ROUND_UP(pages_per_rw, pages_per_mr) * +- atomic_read(&t->rw_avail_ops); +- ret = ib_mr_pool_init(t->qp, &t->qp->rdma_mrs, mr_count, +- IB_MR_TYPE_MEM_REG, pages_per_mr, 0); ++ ret = ib_mr_pool_init(t->qp, &t->qp->rdma_mrs, ++ 
t->max_rw_credits, IB_MR_TYPE_MEM_REG, ++ t->pages_per_rw_credit, 0); + if (ret) { + pr_err("failed to init mr pool count %d pages %d\n", +- mr_count, pages_per_mr); ++ t->max_rw_credits, t->pages_per_rw_credit); + goto err; + } + } +-- +2.35.1 + diff --git a/queue-5.18/ksmbd-validate-length-in-smb2_write.patch b/queue-5.18/ksmbd-validate-length-in-smb2_write.patch new file mode 100644 index 00000000000..d8404ccb413 --- /dev/null +++ b/queue-5.18/ksmbd-validate-length-in-smb2_write.patch @@ -0,0 +1,101 @@ +From e58f6941d18cf59076c318ceaa24694c385f008b Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 4 May 2022 15:40:10 +0200 +Subject: ksmbd: validate length in smb2_write() + +From: Marios Makassikis + +[ Upstream commit 158a66b245739e15858de42c0ba60fcf3de9b8e6 ] + +The SMB2 Write packet contains data that is to be written +to a file or to a pipe. Depending on the client, there may +be padding between the header and the data field. +Currently, the length is validated only in the case padding +is present. + +Since the DataOffset field always points to the beginning +of the data, there is no need to have a special case for +padding. By removing this, the length is validated in both +cases. + +Signed-off-by: Marios Makassikis +Acked-by: Namjae Jeon +Signed-off-by: Steve French +Signed-off-by: Sasha Levin +--- + fs/ksmbd/smb2pdu.c | 49 ++++++++++++++++++---------------------------- + 1 file changed, 19 insertions(+), 30 deletions(-) + +diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c +index 0a76aa7fe5f9..5df87fe18905 100644 +--- a/fs/ksmbd/smb2pdu.c ++++ b/fs/ksmbd/smb2pdu.c +@@ -6344,23 +6344,18 @@ static noinline int smb2_write_pipe(struct ksmbd_work *work) + length = le32_to_cpu(req->Length); + id = req->VolatileFileId; + +- if (le16_to_cpu(req->DataOffset) == +- offsetof(struct smb2_write_req, Buffer)) { +- data_buf = (char *)&req->Buffer[0]; +- } else { +- if ((u64)le16_to_cpu(req->DataOffset) + length > +- get_rfc1002_len(work->request_buf)) { +- pr_err("invalid write data offset %u, smb_len %u\n", +- le16_to_cpu(req->DataOffset), +- get_rfc1002_len(work->request_buf)); +- err = -EINVAL; +- goto out; +- } +- +- data_buf = (char *)(((char *)&req->hdr.ProtocolId) + +- le16_to_cpu(req->DataOffset)); ++ if ((u64)le16_to_cpu(req->DataOffset) + length > ++ get_rfc1002_len(work->request_buf)) { ++ pr_err("invalid write data offset %u, smb_len %u\n", ++ le16_to_cpu(req->DataOffset), ++ get_rfc1002_len(work->request_buf)); ++ err = -EINVAL; ++ goto out; + } + ++ data_buf = (char *)(((char *)&req->hdr.ProtocolId) + ++ le16_to_cpu(req->DataOffset)); ++ + rpc_resp = ksmbd_rpc_write(work->sess, id, data_buf, length); + if (rpc_resp) { + if (rpc_resp->flags == KSMBD_RPC_ENOTIMPLEMENTED) { +@@ -6505,22 +6500,16 @@ int smb2_write(struct ksmbd_work *work) + + if (req->Channel != SMB2_CHANNEL_RDMA_V1 && + req->Channel != SMB2_CHANNEL_RDMA_V1_INVALIDATE) { +- if (le16_to_cpu(req->DataOffset) == +- offsetof(struct smb2_write_req, Buffer)) { +- data_buf = (char *)&req->Buffer[0]; +- } else { +- if ((u64)le16_to_cpu(req->DataOffset) + length > +- get_rfc1002_len(work->request_buf)) { +- pr_err("invalid write data offset %u, smb_len %u\n", +- le16_to_cpu(req->DataOffset), +- get_rfc1002_len(work->request_buf)); +- err = -EINVAL; +- goto out; +- } +- +- data_buf = (char *)(((char *)&req->hdr.ProtocolId) + +- le16_to_cpu(req->DataOffset)); ++ if ((u64)le16_to_cpu(req->DataOffset) + length > ++ get_rfc1002_len(work->request_buf)) { ++ pr_err("invalid write data offset %u, smb_len %u\n", ++ 
le16_to_cpu(req->DataOffset), ++ get_rfc1002_len(work->request_buf)); ++ err = -EINVAL; ++ goto out; + } ++ data_buf = (char *)(((char *)&req->hdr.ProtocolId) + ++ le16_to_cpu(req->DataOffset)); + + ksmbd_debug(SMB, "flags %u\n", le32_to_cpu(req->Flags)); + if (le32_to_cpu(req->Flags) & SMB2_WRITEFLAG_WRITE_THROUGH) +-- +2.35.1 + diff --git a/queue-5.18/kvm-nvmx-attempt-to-load-perf_global_ctrl-on-nvmx-xf.patch b/queue-5.18/kvm-nvmx-attempt-to-load-perf_global_ctrl-on-nvmx-xf.patch new file mode 100644 index 00000000000..e437d5974d1 --- /dev/null +++ b/queue-5.18/kvm-nvmx-attempt-to-load-perf_global_ctrl-on-nvmx-xf.patch @@ -0,0 +1,78 @@ +From 0c595ec21faf719ae7503cd05c3534eb55ebe586 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 22 Jul 2022 22:44:08 +0000 +Subject: KVM: nVMX: Attempt to load PERF_GLOBAL_CTRL on nVMX xfer iff it + exists + +From: Sean Christopherson + +[ Upstream commit 4496a6f9b45e8cd83343ad86a3984d614e22cf54 ] + +Attempt to load PERF_GLOBAL_CTRL during nested VM-Enter/VM-Exit if and +only if the MSR exists (according to the guest vCPU model). KVM has very +misguided handling of VM_{ENTRY,EXIT}_LOAD_IA32_PERF_GLOBAL_CTRL and +attempts to force the nVMX MSR settings to match the vPMU model, i.e. to +hide/expose the control based on whether or not the MSR exists from the +guest's perspective. + +KVM's modifications fail to handle the scenario where the vPMU is hidden +from the guest _after_ being exposed to the guest, e.g. by userspace +doing multiple KVM_SET_CPUID2 calls, which is allowed if done before any +KVM_RUN. nested_vmx_pmu_refresh() is called if and only if there's a +recognized vPMU, i.e. KVM will leave the bits in the allow state and then +ultimately reject the MSR load and WARN. + +KVM should not force the VMX MSRs in the first place. KVM taking control +of the MSRs was a misguided attempt at mimicking what commit 5f76f6f5ff96 +("KVM: nVMX: Do not expose MPX VMX controls when guest MPX disabled", +2018-10-01) did for MPX. However, the MPX commit was a workaround for +another KVM bug and not something that should be imitated (and it should +never been done in the first place). + +In other words, KVM's ABI _should_ be that userspace has full control +over the MSRs, at which point triggering the WARN that loading the MSR +must not fail is trivial. + +The intent of the WARN is still valid; KVM has consistency checks to +ensure that vmcs12->{guest,host}_ia32_perf_global_ctrl is valid. The +problem is that '0' must be considered a valid value at all times, and so +the simple/obvious solution is to just not actually load the MSR when it +does not exist. It is userspace's responsibility to provide a sane vCPU +model, i.e. KVM is well within its ABI and Intel's VMX architecture to +skip the loads if the MSR does not exist. 
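+
+Concretely, the two hunks below gate the
+kvm_set_msr(MSR_CORE_PERF_GLOBAL_CTRL, ...) calls on nested VM-Enter
+(prepare_vmcs02()) and VM-Exit (load_vmcs12_host_state()) with the new
+intel_pmu_has_perf_global_ctrl() helper, so the MSR load requested by
+vmcs12 is skipped, rather than tripping the WARN, when the vCPU model
+has no such MSR.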
+ +Fixes: 03a8871add95 ("KVM: nVMX: Expose load IA32_PERF_GLOBAL_CTRL VM-{Entry,Exit} control") +Cc: stable@vger.kernel.org +Signed-off-by: Sean Christopherson +Message-Id: <20220722224409.1336532-5-seanjc@google.com> +Signed-off-by: Paolo Bonzini +Signed-off-by: Sasha Levin +--- + arch/x86/kvm/vmx/nested.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c +index aa287302f991..5c62e552082a 100644 +--- a/arch/x86/kvm/vmx/nested.c ++++ b/arch/x86/kvm/vmx/nested.c +@@ -2621,6 +2621,7 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, + vcpu->arch.walk_mmu->inject_page_fault = vmx_inject_page_fault_nested; + + if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) && ++ intel_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu)) && + WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, + vmcs12->guest_ia32_perf_global_ctrl))) { + *entry_failure_code = ENTRY_FAIL_DEFAULT; +@@ -4346,7 +4347,8 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, + vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat); + vcpu->arch.pat = vmcs12->host_ia32_pat; + } +- if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) ++ if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) && ++ intel_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu))) + WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, + vmcs12->host_ia32_perf_global_ctrl)); + +-- +2.35.1 + diff --git a/queue-5.18/kvm-set_msr_mce-permit-guests-to-ignore-single-bit-e.patch b/queue-5.18/kvm-set_msr_mce-permit-guests-to-ignore-single-bit-e.patch new file mode 100644 index 00000000000..055458acee1 --- /dev/null +++ b/queue-5.18/kvm-set_msr_mce-permit-guests-to-ignore-single-bit-e.patch @@ -0,0 +1,71 @@ +From afc6d9998f6e8d08603017905a5383988aa91bd0 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Sat, 21 May 2022 08:15:11 +0000 +Subject: KVM: set_msr_mce: Permit guests to ignore single-bit ECC errors + +From: Lev Kujawski + +[ Upstream commit 0471a7bd1bca2a47a5f378f2222c5cf39ce94152 ] + +Certain guest operating systems (e.g., UNIXWARE) clear bit 0 of +MC1_CTL to ignore single-bit ECC data errors. Single-bit ECC data +errors are always correctable and thus are safe to ignore because they +are informational in nature rather than signaling a loss of data +integrity. 
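+
+(In concrete terms: with the change below, a guest write of
+0xFFFFFFFFFFFFFFFE to MC1_CTL, i.e. all ones except bit 0, passes the
+"only 0 or all 1s" sanity check in set_msr_mce() in the same way the
+existing bit 10 exception for the AMD K8 GART quirk does, instead of
+being rejected.)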
+ +Prior to this patch, these guests would crash upon writing MC1_CTL, +with resultant error messages like the following: + +error: kvm run failed Operation not permitted +EAX=fffffffe EBX=fffffffe ECX=00000404 EDX=ffffffff +ESI=ffffffff EDI=00000001 EBP=fffdaba4 ESP=fffdab20 +EIP=c01333a5 EFL=00000246 [---Z-P-] CPL=0 II=0 A20=1 SMM=0 HLT=0 +ES =0108 00000000 ffffffff 00c09300 DPL=0 DS [-WA] +CS =0100 00000000 ffffffff 00c09b00 DPL=0 CS32 [-RA] +SS =0108 00000000 ffffffff 00c09300 DPL=0 DS [-WA] +DS =0108 00000000 ffffffff 00c09300 DPL=0 DS [-WA] +FS =0000 00000000 ffffffff 00c00000 +GS =0000 00000000 ffffffff 00c00000 +LDT=0118 c1026390 00000047 00008200 DPL=0 LDT +TR =0110 ffff5af0 00000067 00008b00 DPL=0 TSS32-busy +GDT= ffff5020 000002cf +IDT= ffff52f0 000007ff +CR0=8001003b CR2=00000000 CR3=0100a000 CR4=00000230 +DR0=00000000 DR1=00000000 DR2=00000000 DR3=00000000 +DR6=ffff0ff0 DR7=00000400 +EFER=0000000000000000 +Code=08 89 01 89 51 04 c3 8b 4c 24 08 8b 01 8b 51 04 8b 4c 24 04 <0f> +30 c3 f7 05 a4 6d ff ff 10 00 00 00 74 03 0f 31 c3 33 c0 33 d2 c3 8d +74 26 00 0f 31 c3 + +Signed-off-by: Lev Kujawski +Message-Id: <20220521081511.187388-1-lkujaw@member.fsf.org> +Signed-off-by: Paolo Bonzini +Signed-off-by: Sasha Levin +--- + arch/x86/kvm/x86.c | 7 +++++-- + 1 file changed, 5 insertions(+), 2 deletions(-) + +diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c +index 767a61e29f51..2316c978b598 100644 +--- a/arch/x86/kvm/x86.c ++++ b/arch/x86/kvm/x86.c +@@ -3226,10 +3226,13 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info) + /* only 0 or all 1s can be written to IA32_MCi_CTL + * some Linux kernels though clear bit 10 in bank 4 to + * workaround a BIOS/GART TBL issue on AMD K8s, ignore +- * this to avoid an uncatched #GP in the guest ++ * this to avoid an uncatched #GP in the guest. ++ * ++ * UNIXWARE clears bit 0 of MC1_CTL to ignore ++ * correctable, single-bit ECC data errors. + */ + if ((offset & 0x3) == 0 && +- data != 0 && (data | (1 << 10)) != ~(u64)0) ++ data != 0 && (data | (1 << 10) | 1) != ~(u64)0) + return -1; + + /* MCi_STATUS */ +-- +2.35.1 + diff --git a/queue-5.18/kvm-vmx-add-helper-to-check-if-the-guest-pmu-has-per.patch b/queue-5.18/kvm-vmx-add-helper-to-check-if-the-guest-pmu-has-per.patch new file mode 100644 index 00000000000..3ba173f97c5 --- /dev/null +++ b/queue-5.18/kvm-vmx-add-helper-to-check-if-the-guest-pmu-has-per.patch @@ -0,0 +1,73 @@ +From d297e223ab3f71968097240e39d44fb5cc478e26 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 22 Jul 2022 22:44:07 +0000 +Subject: KVM: VMX: Add helper to check if the guest PMU has PERF_GLOBAL_CTRL + +From: Sean Christopherson + +[ Upstream commit b663f0b5f3d665c261256d1f76e98f077c6e56af ] + +Add a helper to check of the guest PMU has PERF_GLOBAL_CTRL, which is +unintuitive _and_ diverges from Intel's architecturally defined behavior. +Even worse, KVM currently implements the check using two different (but +equivalent) checks, _and_ there has been at least one attempt to add a +_third_ flavor. 
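+
+(The two equivalent open-coded checks are visible in the diff below:
+intel_pmc_is_enabled() tests pmu->version < 2 while intel_is_valid_msr()
+tests pmu->version > 1. Both now go through the new
+intel_pmu_has_perf_global_ctrl() helper so the "version > 1" rule lives
+in one place.)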
+ +Cc: stable@vger.kernel.org +Signed-off-by: Sean Christopherson +Message-Id: <20220722224409.1336532-4-seanjc@google.com> +Signed-off-by: Paolo Bonzini +Signed-off-by: Sasha Levin +--- + arch/x86/kvm/vmx/pmu_intel.c | 4 ++-- + arch/x86/kvm/vmx/vmx.h | 12 ++++++++++++ + 2 files changed, 14 insertions(+), 2 deletions(-) + +diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c +index 2cbd5f183ab5..8bd154f8c966 100644 +--- a/arch/x86/kvm/vmx/pmu_intel.c ++++ b/arch/x86/kvm/vmx/pmu_intel.c +@@ -98,7 +98,7 @@ static bool intel_pmc_is_enabled(struct kvm_pmc *pmc) + { + struct kvm_pmu *pmu = pmc_to_pmu(pmc); + +- if (pmu->version < 2) ++ if (!intel_pmu_has_perf_global_ctrl(pmu)) + return true; + + return test_bit(pmc->idx, (unsigned long *)&pmu->global_ctrl); +@@ -215,7 +215,7 @@ static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) + case MSR_CORE_PERF_GLOBAL_STATUS: + case MSR_CORE_PERF_GLOBAL_CTRL: + case MSR_CORE_PERF_GLOBAL_OVF_CTRL: +- ret = pmu->version > 1; ++ return intel_pmu_has_perf_global_ctrl(pmu); + break; + default: + ret = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0) || +diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h +index 1e7f9453894b..93aa1f3ea01e 100644 +--- a/arch/x86/kvm/vmx/vmx.h ++++ b/arch/x86/kvm/vmx/vmx.h +@@ -92,6 +92,18 @@ union vmx_exit_reason { + u32 full; + }; + ++static inline bool intel_pmu_has_perf_global_ctrl(struct kvm_pmu *pmu) ++{ ++ /* ++ * Architecturally, Intel's SDM states that IA32_PERF_GLOBAL_CTRL is ++ * supported if "CPUID.0AH: EAX[7:0] > 0", i.e. if the PMU version is ++ * greater than zero. However, KVM only exposes and emulates the MSR ++ * to/for the guest if the guest PMU supports at least "Architectural ++ * Performance Monitoring Version 2". ++ */ ++ return pmu->version > 1; ++} ++ + #define vcpu_to_lbr_desc(vcpu) (&to_vmx(vcpu)->lbr_desc) + #define vcpu_to_lbr_records(vcpu) (&to_vmx(vcpu)->lbr_desc.records) + +-- +2.35.1 + diff --git a/queue-5.18/kvm-vmx-mark-all-perf_global_-ovf-_ctrl-bits-reserve.patch b/queue-5.18/kvm-vmx-mark-all-perf_global_-ovf-_ctrl-bits-reserve.patch new file mode 100644 index 00000000000..54ff0c6a06b --- /dev/null +++ b/queue-5.18/kvm-vmx-mark-all-perf_global_-ovf-_ctrl-bits-reserve.patch @@ -0,0 +1,43 @@ +From cbb6518aa4bc3a15da904a5d81ff02604a6fcbe8 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 22 Jul 2022 22:44:06 +0000 +Subject: KVM: VMX: Mark all PERF_GLOBAL_(OVF)_CTRL bits reserved if there's no + vPMU + +From: Sean Christopherson + +[ Upstream commit 93255bf92939d948bc86d81c6bb70bb0fecc5db1 ] + +Mark all MSR_CORE_PERF_GLOBAL_CTRL and MSR_CORE_PERF_GLOBAL_OVF_CTRL bits +as reserved if there is no guest vPMU. The nVMX VM-Entry consistency +checks do not check for a valid vPMU prior to consuming the masks via +kvm_valid_perf_global_ctrl(), i.e. may incorrectly allow a non-zero mask +to be loaded via VM-Enter or VM-Exit (well, attempted to be loaded, the +actual MSR load will be rejected by intel_is_valid_msr()). 
+ +Fixes: f5132b01386b ("KVM: Expose a version 2 architectural PMU to a guests") +Cc: stable@vger.kernel.org +Signed-off-by: Sean Christopherson +Message-Id: <20220722224409.1336532-3-seanjc@google.com> +Signed-off-by: Paolo Bonzini +Signed-off-by: Sasha Levin +--- + arch/x86/kvm/vmx/pmu_intel.c | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c +index 040d598622e3..cd2d0454f8b0 100644 +--- a/arch/x86/kvm/vmx/pmu_intel.c ++++ b/arch/x86/kvm/vmx/pmu_intel.c +@@ -488,6 +488,8 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) + pmu->version = 0; + pmu->reserved_bits = 0xffffffff00200000ull; + pmu->raw_event_mask = X86_RAW_EVENT_MASK; ++ pmu->global_ctrl_mask = ~0ull; ++ pmu->global_ovf_ctrl_mask = ~0ull; + pmu->fixed_ctr_ctrl_mask = ~0ull; + + entry = kvm_find_cpuid_entry(vcpu, 0xa, 0); +-- +2.35.1 + diff --git a/queue-5.18/kvm-x86-pmu-ignore-pmu-global_ctrl-check-if-vpmu-doe.patch b/queue-5.18/kvm-x86-pmu-ignore-pmu-global_ctrl-check-if-vpmu-doe.patch new file mode 100644 index 00000000000..802275c6153 --- /dev/null +++ b/queue-5.18/kvm-x86-pmu-ignore-pmu-global_ctrl-check-if-vpmu-doe.patch @@ -0,0 +1,40 @@ +From 6174683901bf4385f235ccf8923ece845be35a32 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 9 May 2022 18:22:02 +0800 +Subject: KVM: x86/pmu: Ignore pmu->global_ctrl check if vPMU doesn't support + global_ctrl + +From: Like Xu + +[ Upstream commit 98defd2e17803263f49548fea930cfc974d505aa ] + +MSR_CORE_PERF_GLOBAL_CTRL is introduced as part of Architecture PMU V2, +as indicated by Intel SDM 19.2.2 and the intel_is_valid_msr() function. + +So in the absence of global_ctrl support, all PMCs are enabled as AMD does. + +Signed-off-by: Like Xu +Message-Id: <20220509102204.62389-1-likexu@tencent.com> +Signed-off-by: Paolo Bonzini +Signed-off-by: Sasha Levin +--- + arch/x86/kvm/vmx/pmu_intel.c | 3 +++ + 1 file changed, 3 insertions(+) + +diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c +index cd2d0454f8b0..2cbd5f183ab5 100644 +--- a/arch/x86/kvm/vmx/pmu_intel.c ++++ b/arch/x86/kvm/vmx/pmu_intel.c +@@ -98,6 +98,9 @@ static bool intel_pmc_is_enabled(struct kvm_pmc *pmc) + { + struct kvm_pmu *pmu = pmc_to_pmu(pmc); + ++ if (pmu->version < 2) ++ return true; ++ + return test_bit(pmc->idx, (unsigned long *)&pmu->global_ctrl); + } + +-- +2.35.1 + diff --git a/queue-5.18/kvm-x86-pmu-introduce-the-ctrl_mask-value-for-fixed-.patch b/queue-5.18/kvm-x86-pmu-introduce-the-ctrl_mask-value-for-fixed-.patch new file mode 100644 index 00000000000..4b5bde5b582 --- /dev/null +++ b/queue-5.18/kvm-x86-pmu-introduce-the-ctrl_mask-value-for-fixed-.patch @@ -0,0 +1,79 @@ +From 1687eb7807a86765c388c89e08f961c30ccc30a4 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 11 Apr 2022 18:19:34 +0800 +Subject: KVM: x86/pmu: Introduce the ctrl_mask value for fixed counter + +From: Like Xu + +[ Upstream commit 2c985527dd8d283e786ad7a67e532ef7f6f00fac ] + +The mask value of fixed counter control register should be dynamic +adjusted with the number of fixed counters. This patch introduces a +variable that includes the reserved bits of fixed counter control +registers. This is a generic code refactoring. 
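+
+(Background for the constant being replaced: each fixed counter owns a
+4-bit field in MSR_CORE_PERF_FIXED_CTR_CTRL, and the loop added below
+clears the 0xb bits, i.e. enable-OS, enable-USR and PMI, of every
+implemented counter's field from fixed_ctr_ctrl_mask. With three fixed
+counters this reproduces the old hard-coded 0xfffffffffffff444ull mask;
+with any other counter count the mask now adjusts automatically.)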
+ +Co-developed-by: Luwei Kang +Signed-off-by: Luwei Kang +Signed-off-by: Like Xu +Acked-by: Peter Zijlstra (Intel) +Message-Id: <20220411101946.20262-6-likexu@tencent.com> +Signed-off-by: Paolo Bonzini +Signed-off-by: Sasha Levin +--- + arch/x86/include/asm/kvm_host.h | 1 + + arch/x86/kvm/vmx/pmu_intel.c | 6 +++++- + 2 files changed, 6 insertions(+), 1 deletion(-) + +diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h +index 57550a427789..35c7a1fce8ea 100644 +--- a/arch/x86/include/asm/kvm_host.h ++++ b/arch/x86/include/asm/kvm_host.h +@@ -508,6 +508,7 @@ struct kvm_pmu { + unsigned nr_arch_fixed_counters; + unsigned available_event_types; + u64 fixed_ctr_ctrl; ++ u64 fixed_ctr_ctrl_mask; + u64 global_ctrl; + u64 global_status; + u64 counter_bitmask[2]; +diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c +index b82b6709d7a8..040d598622e3 100644 +--- a/arch/x86/kvm/vmx/pmu_intel.c ++++ b/arch/x86/kvm/vmx/pmu_intel.c +@@ -395,7 +395,7 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) + case MSR_CORE_PERF_FIXED_CTR_CTRL: + if (pmu->fixed_ctr_ctrl == data) + return 0; +- if (!(data & 0xfffffffffffff444ull)) { ++ if (!(data & pmu->fixed_ctr_ctrl_mask)) { + reprogram_fixed_counters(pmu, data); + return 0; + } +@@ -479,6 +479,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) + struct kvm_cpuid_entry2 *entry; + union cpuid10_eax eax; + union cpuid10_edx edx; ++ int i; + + pmu->nr_arch_gp_counters = 0; + pmu->nr_arch_fixed_counters = 0; +@@ -487,6 +488,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) + pmu->version = 0; + pmu->reserved_bits = 0xffffffff00200000ull; + pmu->raw_event_mask = X86_RAW_EVENT_MASK; ++ pmu->fixed_ctr_ctrl_mask = ~0ull; + + entry = kvm_find_cpuid_entry(vcpu, 0xa, 0); + if (!entry || !vcpu->kvm->arch.enable_pmu) +@@ -522,6 +524,8 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) + setup_fixed_pmc_eventsel(pmu); + } + ++ for (i = 0; i < pmu->nr_arch_fixed_counters; i++) ++ pmu->fixed_ctr_ctrl_mask &= ~(0xbull << (i * 4)); + pmu->global_ctrl = ((1ull << pmu->nr_arch_gp_counters) - 1) | + (((1ull << pmu->nr_arch_fixed_counters) - 1) << INTEL_PMC_IDX_FIXED); + pmu->global_ctrl_mask = ~pmu->global_ctrl; +-- +2.35.1 + diff --git a/queue-5.18/kvm-x86-signal-gp-not-eperm-on-bad-wrmsr-mci_ctl-sta.patch b/queue-5.18/kvm-x86-signal-gp-not-eperm-on-bad-wrmsr-mci_ctl-sta.patch new file mode 100644 index 00000000000..b900ac56fff --- /dev/null +++ b/queue-5.18/kvm-x86-signal-gp-not-eperm-on-bad-wrmsr-mci_ctl-sta.patch @@ -0,0 +1,50 @@ +From e9cd8ca56097b6239965d4f48f6472c096fa12cf Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 12 May 2022 22:27:14 +0000 +Subject: KVM: x86: Signal #GP, not -EPERM, on bad WRMSR(MCi_CTL/STATUS) + +From: Sean Christopherson + +[ Upstream commit 2368048bf5c2ec4b604ac3431564071e89a0bc71 ] + +Return '1', not '-1', when handling an illegal WRMSR to a MCi_CTL or +MCi_STATUS MSR. The behavior of "all zeros' or "all ones" for CTL MSRs +is architectural, as is the "only zeros" behavior for STATUS MSRs. I.e. +the intent is to inject a #GP, not exit to userspace due to an unhandled +emulation case. Returning '-1' gets interpreted as -EPERM up the stack +and effecitvely kills the guest. 
+ +Fixes: 890ca9aefa78 ("KVM: Add MCE support") +Fixes: 9ffd986c6e4e ("KVM: X86: #GP when guest attempts to write MCi_STATUS register w/o 0") +Cc: stable@vger.kernel.org +Signed-off-by: Sean Christopherson +Reviewed-by: Jim Mattson +Link: https://lore.kernel.org/r/20220512222716.4112548-2-seanjc@google.com +Signed-off-by: Sasha Levin +--- + arch/x86/kvm/x86.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c +index 2316c978b598..0d6cea0d33a9 100644 +--- a/arch/x86/kvm/x86.c ++++ b/arch/x86/kvm/x86.c +@@ -3233,13 +3233,13 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info) + */ + if ((offset & 0x3) == 0 && + data != 0 && (data | (1 << 10) | 1) != ~(u64)0) +- return -1; ++ return 1; + + /* MCi_STATUS */ + if (!msr_info->host_initiated && + (offset & 0x3) == 1 && data != 0) { + if (!can_set_mci_status(vcpu)) +- return -1; ++ return 1; + } + + vcpu->arch.mce_banks[offset] = data; +-- +2.35.1 + diff --git a/queue-5.18/locking-csd_lock-change-csdlock_debug-from-early_par.patch b/queue-5.18/locking-csd_lock-change-csdlock_debug-from-early_par.patch new file mode 100644 index 00000000000..feb17387701 --- /dev/null +++ b/queue-5.18/locking-csd_lock-change-csdlock_debug-from-early_par.patch @@ -0,0 +1,56 @@ +From 9219faa1ac767c0882f58d034a20086d6c06042e Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 10 May 2022 17:46:39 +0800 +Subject: locking/csd_lock: Change csdlock_debug from early_param to __setup + +From: Chen Zhongjin + +[ Upstream commit 9c9b26b0df270d4f9246e483a44686fca951a29c ] + +The csdlock_debug kernel-boot parameter is parsed by the +early_param() function csdlock_debug(). If set, csdlock_debug() +invokes static_branch_enable() to enable csd_lock_wait feature, which +triggers a panic on arm64 for kernels built with CONFIG_SPARSEMEM=y and +CONFIG_SPARSEMEM_VMEMMAP=n. + +With CONFIG_SPARSEMEM_VMEMMAP=n, __nr_to_section is called in +static_key_enable() and returns NULL, resulting in a NULL dereference +because mem_section is initialized only later in sparse_init(). + +This is also a problem for powerpc because early_param() functions +are invoked earlier than jump_label_init(), also resulting in +static_key_enable() failures. These failures cause the warning "static +key 'xxx' used before call to jump_label_init()". + +Thus, early_param is too early for csd_lock_wait to run +static_branch_enable(), so changes it to __setup to fix these. + +Fixes: 8d0968cc6b8f ("locking/csd_lock: Add boot parameter for controlling CSD lock debugging") +Cc: stable@vger.kernel.org +Reported-by: Chen jingwen +Signed-off-by: Chen Zhongjin +Signed-off-by: Paul E. 
McKenney +Signed-off-by: Sasha Levin +--- + kernel/smp.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/kernel/smp.c b/kernel/smp.c +index 65a630f62363..381eb15cd28f 100644 +--- a/kernel/smp.c ++++ b/kernel/smp.c +@@ -174,9 +174,9 @@ static int __init csdlock_debug(char *str) + if (val) + static_branch_enable(&csdlock_debug_enabled); + +- return 0; ++ return 1; + } +-early_param("csdlock_debug", csdlock_debug); ++__setup("csdlock_debug=", csdlock_debug); + + static DEFINE_PER_CPU(call_single_data_t *, cur_csd); + static DEFINE_PER_CPU(smp_call_func_t, cur_csd_func); +-- +2.35.1 + diff --git a/queue-5.18/mm-damon-reclaim-fix-potential-memory-leak-in-damon_.patch b/queue-5.18/mm-damon-reclaim-fix-potential-memory-leak-in-damon_.patch new file mode 100644 index 00000000000..b60df35db11 --- /dev/null +++ b/queue-5.18/mm-damon-reclaim-fix-potential-memory-leak-in-damon_.patch @@ -0,0 +1,46 @@ +From da88b2e18326479f78988cfb3e313dbb2286e447 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 14 Jul 2022 14:37:46 +0800 +Subject: mm/damon/reclaim: fix potential memory leak in damon_reclaim_init() + +From: Jianglei Nie + +[ Upstream commit 188043c7f4f2bd662f2a55957d684fffa543e600 ] + +damon_reclaim_init() allocates a memory chunk for ctx with +damon_new_ctx(). When damon_select_ops() fails, ctx is not released, +which will lead to a memory leak. + +We should release the ctx with damon_destroy_ctx() when damon_select_ops() +fails to fix the memory leak. + +Link: https://lkml.kernel.org/r/20220714063746.2343549-1-niejianglei2021@163.com +Fixes: 4d69c3457821 ("mm/damon/reclaim: use damon_select_ops() instead of damon_{v,p}a_set_operations()") +Signed-off-by: Jianglei Nie +Reviewed-by: SeongJae Park +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Sasha Levin +--- + mm/damon/reclaim.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c +index e34c4d0c4d93..11982685508e 100644 +--- a/mm/damon/reclaim.c ++++ b/mm/damon/reclaim.c +@@ -384,8 +384,10 @@ static int __init damon_reclaim_init(void) + if (!ctx) + return -ENOMEM; + +- if (damon_select_ops(ctx, DAMON_OPS_PADDR)) ++ if (damon_select_ops(ctx, DAMON_OPS_PADDR)) { ++ damon_destroy_ctx(ctx); + return -EINVAL; ++ } + + ctx->callback.after_aggregation = damon_reclaim_after_aggregation; + +-- +2.35.1 + diff --git a/queue-5.18/net-9p-initialize-the-iounit-field-during-fid-creati.patch b/queue-5.18/net-9p-initialize-the-iounit-field-during-fid-creati.patch new file mode 100644 index 00000000000..d96916f3c25 --- /dev/null +++ b/queue-5.18/net-9p-initialize-the-iounit-field-during-fid-creati.patch @@ -0,0 +1,68 @@ +From 54577663faf8efe35cef8b782a6a2be7dbe01e35 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Sun, 10 Jul 2022 09:14:02 -0500 +Subject: net/9p: Initialize the iounit field during fid creation + +From: Tyler Hicks + +[ Upstream commit aa7aeee169480e98cf41d83c01290a37e569be6d ] + +Ensure that the fid's iounit field is set to zero when a new fid is +created. Certain 9P operations, such as OPEN and CREATE, allow the +server to reply with an iounit size which the client code assigns to the +p9_fid struct shortly after the fid is created by p9_fid_create(). On +the other hand, an XATTRWALK operation doesn't allow for the server to +specify an iounit value. The iounit field of the newly allocated p9_fid +struct remained uninitialized in that case. 
Depending on allocation +patterns, the iounit value could have been something reasonable that was +carried over from previously freed fids or, in the worst case, could +have been arbitrary values from non-fid related usages of the memory +location. + +The bug was detected in the Windows Subsystem for Linux 2 (WSL2) kernel +after the uninitialized iounit field resulted in the typical sequence of +two getxattr(2) syscalls, one to get the size of an xattr and another +after allocating a sufficiently sized buffer to fit the xattr value, to +hit an unexpected ERANGE error in the second call to getxattr(2). An +uninitialized iounit field would sometimes force rsize to be smaller +than the xattr value size in p9_client_read_once() and the 9P server in +WSL refused to chunk up the READ on the attr_fid and, instead, returned +ERANGE to the client. The virtfs server in QEMU seems happy to chunk up +the READ and this problem goes undetected there. + +Link: https://lkml.kernel.org/r/20220710141402.803295-1-tyhicks@linux.microsoft.com +Fixes: ebf46264a004 ("fs/9p: Add support user. xattr") +Cc: stable@vger.kernel.org +Signed-off-by: Tyler Hicks +Reviewed-by: Christian Schoenebeck +Signed-off-by: Dominique Martinet +Signed-off-by: Sasha Levin +--- + net/9p/client.c | 5 +---- + 1 file changed, 1 insertion(+), 4 deletions(-) + +diff --git a/net/9p/client.c b/net/9p/client.c +index a36a40137caa..87cde948f628 100644 +--- a/net/9p/client.c ++++ b/net/9p/client.c +@@ -886,16 +886,13 @@ static struct p9_fid *p9_fid_create(struct p9_client *clnt) + struct p9_fid *fid; + + p9_debug(P9_DEBUG_FID, "clnt %p\n", clnt); +- fid = kmalloc(sizeof(*fid), GFP_KERNEL); ++ fid = kzalloc(sizeof(*fid), GFP_KERNEL); + if (!fid) + return NULL; + +- memset(&fid->qid, 0, sizeof(fid->qid)); + fid->mode = -1; + fid->uid = current_fsuid(); + fid->clnt = clnt; +- fid->rdir = NULL; +- fid->fid = 0; + refcount_set(&fid->count, 1); + + idr_preload(GFP_KERNEL); +-- +2.35.1 + diff --git a/queue-5.18/pci-aer-iterate-over-error-counters-instead-of-error.patch b/queue-5.18/pci-aer-iterate-over-error-counters-instead-of-error.patch new file mode 100644 index 00000000000..d27157ed3c0 --- /dev/null +++ b/queue-5.18/pci-aer-iterate-over-error-counters-instead-of-error.patch @@ -0,0 +1,61 @@ +From dbe842dddf2be13dbdb8de5eb0b9ee702f315d95 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 9 May 2022 18:14:41 +0000 +Subject: PCI/AER: Iterate over error counters instead of error strings + +From: Mohamed Khalfella + +[ Upstream commit 5e6ae050955b566484f3cc6a66e3925eae87a0ed ] + +Previously we iterated over AER stat *names*, e.g., +aer_correctable_error_string[32], but the actual stat *counters* may not be +that large, e.g., pdev->aer_stats->dev_cor_errs[16], which means that we +printed junk in the sysfs stats files. + +Iterate over the stat counter arrays instead of the names to avoid this +junk. + +Also, added a build time check to make sure all +counters have entries in strings array. 
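+
+(The "junk" was the result of indexing past the end of the counter
+array: the sysfs show macro read stats[i] for every entry of the larger
+strings array, e.g. i up to 31 against a 16-entry dev_cor_errs[].
+Bounding the loop by the ARRAY_SIZE() of the stats array stops those
+reads, and the new BUILD_BUG_ON()s ensure the strings arrays are at
+least as large as the counters they label.)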
+ +Fixes: 0678e3109a3c ("PCI/AER: Simplify __aer_print_error()") +Link: https://lore.kernel.org/r/20220509181441.31884-1-mkhalfella@purestorage.com +Reported-by: Meeta Saggi +Signed-off-by: Mohamed Khalfella +Signed-off-by: Bjorn Helgaas +Reviewed-by: Meeta Saggi +Reviewed-by: Eric Badger +Cc: stable@vger.kernel.org +Signed-off-by: Sasha Levin +--- + drivers/pci/pcie/aer.c | 7 ++++++- + 1 file changed, 6 insertions(+), 1 deletion(-) + +diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c +index 7952e5efd6cf..a1e38ca93cd9 100644 +--- a/drivers/pci/pcie/aer.c ++++ b/drivers/pci/pcie/aer.c +@@ -538,7 +538,7 @@ static const char *aer_agent_string[] = { + u64 *stats = pdev->aer_stats->stats_array; \ + size_t len = 0; \ + \ +- for (i = 0; i < ARRAY_SIZE(strings_array); i++) { \ ++ for (i = 0; i < ARRAY_SIZE(pdev->aer_stats->stats_array); i++) {\ + if (strings_array[i]) \ + len += sysfs_emit_at(buf, len, "%s %llu\n", \ + strings_array[i], \ +@@ -1347,6 +1347,11 @@ static int aer_probe(struct pcie_device *dev) + struct device *device = &dev->device; + struct pci_dev *port = dev->port; + ++ BUILD_BUG_ON(ARRAY_SIZE(aer_correctable_error_string) < ++ AER_MAX_TYPEOF_COR_ERRS); ++ BUILD_BUG_ON(ARRAY_SIZE(aer_uncorrectable_error_string) < ++ AER_MAX_TYPEOF_UNCOR_ERRS); ++ + /* Limit to Root Ports or Root Complex Event Collectors */ + if ((pci_pcie_type(port) != PCI_EXP_TYPE_RC_EC) && + (pci_pcie_type(port) != PCI_EXP_TYPE_ROOT_PORT)) +-- +2.35.1 + diff --git a/queue-5.18/pci-qcom-power-on-phy-before-ipq8074-dbi-register-ac.patch b/queue-5.18/pci-qcom-power-on-phy-before-ipq8074-dbi-register-ac.patch new file mode 100644 index 00000000000..be88f137ea1 --- /dev/null +++ b/queue-5.18/pci-qcom-power-on-phy-before-ipq8074-dbi-register-ac.patch @@ -0,0 +1,111 @@ +From 458d79349bdb8364d0035a65afb591e5eff068d8 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 23 Jun 2022 17:50:03 +0200 +Subject: PCI: qcom: Power on PHY before IPQ8074 DBI register accesses + +From: Robert Marko + +[ Upstream commit a0e43bb9973b06ce5c666f0901e104e2037c1b34 ] + +Currently the Gen2 port in IPQ8074 will cause the system to hang as it +accesses DBI registers in qcom_pcie_init_2_3_3(), and those are only +accesible after phy_power_on(). + +Move the DBI read/writes to a new qcom_pcie_post_init_2_3_3(), which is +executed after phy_power_on(). 
+ +Link: https://lore.kernel.org/r/20220623155004.688090-1-robimarko@gmail.com +Fixes: a0fd361db8e5 ("PCI: dwc: Move "dbi", "dbi2", and "addr_space" resource setup into common code") +Signed-off-by: Robert Marko +Signed-off-by: Bjorn Helgaas +Reviewed-by: Dmitry Baryshkov +Cc: stable@vger.kernel.org # v5.11+ +Signed-off-by: Sasha Levin +--- + drivers/pci/controller/dwc/pcie-qcom.c | 48 +++++++++++++++----------- + 1 file changed, 28 insertions(+), 20 deletions(-) + +diff --git a/drivers/pci/controller/dwc/pcie-qcom.c b/drivers/pci/controller/dwc/pcie-qcom.c +index ab04818f6ed9..340542aab8a5 100644 +--- a/drivers/pci/controller/dwc/pcie-qcom.c ++++ b/drivers/pci/controller/dwc/pcie-qcom.c +@@ -1036,9 +1036,7 @@ static int qcom_pcie_init_2_3_3(struct qcom_pcie *pcie) + struct qcom_pcie_resources_2_3_3 *res = &pcie->res.v2_3_3; + struct dw_pcie *pci = pcie->pci; + struct device *dev = pci->dev; +- u16 offset = dw_pcie_find_capability(pci, PCI_CAP_ID_EXP); + int i, ret; +- u32 val; + + for (i = 0; i < ARRAY_SIZE(res->rst); i++) { + ret = reset_control_assert(res->rst[i]); +@@ -1095,6 +1093,33 @@ static int qcom_pcie_init_2_3_3(struct qcom_pcie *pcie) + goto err_clk_aux; + } + ++ return 0; ++ ++err_clk_aux: ++ clk_disable_unprepare(res->ahb_clk); ++err_clk_ahb: ++ clk_disable_unprepare(res->axi_s_clk); ++err_clk_axi_s: ++ clk_disable_unprepare(res->axi_m_clk); ++err_clk_axi_m: ++ clk_disable_unprepare(res->iface); ++err_clk_iface: ++ /* ++ * Not checking for failure, will anyway return ++ * the original failure in 'ret'. ++ */ ++ for (i = 0; i < ARRAY_SIZE(res->rst); i++) ++ reset_control_assert(res->rst[i]); ++ ++ return ret; ++} ++ ++static int qcom_pcie_post_init_2_3_3(struct qcom_pcie *pcie) ++{ ++ struct dw_pcie *pci = pcie->pci; ++ u16 offset = dw_pcie_find_capability(pci, PCI_CAP_ID_EXP); ++ u32 val; ++ + writel(SLV_ADDR_SPACE_SZ, + pcie->parf + PCIE20_v3_PARF_SLV_ADDR_SPACE_SIZE); + +@@ -1122,24 +1147,6 @@ static int qcom_pcie_init_2_3_3(struct qcom_pcie *pcie) + PCI_EXP_DEVCTL2); + + return 0; +- +-err_clk_aux: +- clk_disable_unprepare(res->ahb_clk); +-err_clk_ahb: +- clk_disable_unprepare(res->axi_s_clk); +-err_clk_axi_s: +- clk_disable_unprepare(res->axi_m_clk); +-err_clk_axi_m: +- clk_disable_unprepare(res->iface); +-err_clk_iface: +- /* +- * Not checking for failure, will anyway return +- * the original failure in 'ret'. +- */ +- for (i = 0; i < ARRAY_SIZE(res->rst); i++) +- reset_control_assert(res->rst[i]); +- +- return ret; + } + + static int qcom_pcie_get_resources_2_7_0(struct qcom_pcie *pcie) +@@ -1465,6 +1472,7 @@ static const struct qcom_pcie_ops ops_2_4_0 = { + static const struct qcom_pcie_ops ops_2_3_3 = { + .get_resources = qcom_pcie_get_resources_2_3_3, + .init = qcom_pcie_init_2_3_3, ++ .post_init = qcom_pcie_post_init_2_3_3, + .deinit = qcom_pcie_deinit_2_3_3, + .ltssm_enable = qcom_pcie_2_3_2_ltssm_enable, + }; +-- +2.35.1 + diff --git a/queue-5.18/powerpc-powernv-kvm-use-darn-for-h_random-on-power9.patch b/queue-5.18/powerpc-powernv-kvm-use-darn-for-h_random-on-power9.patch new file mode 100644 index 00000000000..2cea5bd7bd8 --- /dev/null +++ b/queue-5.18/powerpc-powernv-kvm-use-darn-for-h_random-on-power9.patch @@ -0,0 +1,145 @@ +From 6df1064850462c2a7cac009c6e72ad3fcd7d0fa6 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 28 Jul 2022 00:32:18 +1000 +Subject: powerpc/powernv/kvm: Use darn for H_RANDOM on Power9 + +From: Jason A. 
Donenfeld + +[ Upstream commit 7ef3d06f1bc4a5e62273726f3dc2bd258ae1c71f ] + +The existing logic in KVM to support guests calling H_RANDOM only works +on Power8, because it looks for an RNG in the device tree, but on Power9 +we just use darn. + +In addition the existing code needs to work in real mode, so we have the +special cased powernv_get_random_real_mode() to deal with that. + +Instead just have KVM call ppc_md.get_random_seed(), and do the real +mode check inside of there, that way we use whatever RNG is available, +including darn on Power9. + +Fixes: e928e9cb3601 ("KVM: PPC: Book3S HV: Add fast real-mode H_RANDOM implementation.") +Cc: stable@vger.kernel.org # v4.1+ +Signed-off-by: Jason A. Donenfeld +Tested-by: Sachin Sant +[mpe: Rebase on previous commit, update change log appropriately] +Signed-off-by: Michael Ellerman +Link: https://lore.kernel.org/r/20220727143219.2684192-2-mpe@ellerman.id.au +Signed-off-by: Sasha Levin +--- + arch/powerpc/include/asm/archrandom.h | 5 ---- + arch/powerpc/kvm/book3s_hv_builtin.c | 7 +++--- + arch/powerpc/platforms/powernv/rng.c | 36 ++++++--------------------- + 3 files changed, 12 insertions(+), 36 deletions(-) + +diff --git a/arch/powerpc/include/asm/archrandom.h b/arch/powerpc/include/asm/archrandom.h +index 9a53e29680f4..258174304904 100644 +--- a/arch/powerpc/include/asm/archrandom.h ++++ b/arch/powerpc/include/asm/archrandom.h +@@ -38,12 +38,7 @@ static inline bool __must_check arch_get_random_seed_int(unsigned int *v) + #endif /* CONFIG_ARCH_RANDOM */ + + #ifdef CONFIG_PPC_POWERNV +-int powernv_hwrng_present(void); + int powernv_get_random_long(unsigned long *v); +-int powernv_get_random_real_mode(unsigned long *v); +-#else +-static inline int powernv_hwrng_present(void) { return 0; } +-static inline int powernv_get_random_real_mode(unsigned long *v) { return 0; } + #endif + + #endif /* _ASM_POWERPC_ARCHRANDOM_H */ +diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c +index 7e52d0beee77..5e4251b76e75 100644 +--- a/arch/powerpc/kvm/book3s_hv_builtin.c ++++ b/arch/powerpc/kvm/book3s_hv_builtin.c +@@ -19,7 +19,7 @@ + #include + #include + #include +-#include ++#include + #include + #include + #include +@@ -176,13 +176,14 @@ EXPORT_SYMBOL_GPL(kvmppc_hcall_impl_hv_realmode); + + int kvmppc_hwrng_present(void) + { +- return powernv_hwrng_present(); ++ return ppc_md.get_random_seed != NULL; + } + EXPORT_SYMBOL_GPL(kvmppc_hwrng_present); + + long kvmppc_rm_h_random(struct kvm_vcpu *vcpu) + { +- if (powernv_get_random_real_mode(&vcpu->arch.regs.gpr[4])) ++ if (ppc_md.get_random_seed && ++ ppc_md.get_random_seed(&vcpu->arch.regs.gpr[4])) + return H_SUCCESS; + + return H_HARDWARE; +diff --git a/arch/powerpc/platforms/powernv/rng.c b/arch/powerpc/platforms/powernv/rng.c +index 2287c9cd0cd5..d19305292e1e 100644 +--- a/arch/powerpc/platforms/powernv/rng.c ++++ b/arch/powerpc/platforms/powernv/rng.c +@@ -29,15 +29,6 @@ struct powernv_rng { + + static DEFINE_PER_CPU(struct powernv_rng *, powernv_rng); + +-int powernv_hwrng_present(void) +-{ +- struct powernv_rng *rng; +- +- rng = get_cpu_var(powernv_rng); +- put_cpu_var(rng); +- return rng != NULL; +-} +- + static unsigned long rng_whiten(struct powernv_rng *rng, unsigned long val) + { + unsigned long parity; +@@ -58,19 +49,6 @@ static unsigned long rng_whiten(struct powernv_rng *rng, unsigned long val) + return val; + } + +-int powernv_get_random_real_mode(unsigned long *v) +-{ +- struct powernv_rng *rng; +- +- rng = raw_cpu_read(powernv_rng); +- if (!rng) +- return 
0; +- +- *v = rng_whiten(rng, __raw_rm_readq(rng->regs_real)); +- +- return 1; +-} +- + static int powernv_get_random_darn(unsigned long *v) + { + unsigned long val; +@@ -107,12 +85,14 @@ int powernv_get_random_long(unsigned long *v) + { + struct powernv_rng *rng; + +- rng = get_cpu_var(powernv_rng); +- +- *v = rng_whiten(rng, in_be64(rng->regs)); +- +- put_cpu_var(rng); +- ++ if (mfmsr() & MSR_DR) { ++ rng = get_cpu_var(powernv_rng); ++ *v = rng_whiten(rng, in_be64(rng->regs)); ++ put_cpu_var(rng); ++ } else { ++ rng = raw_cpu_read(powernv_rng); ++ *v = rng_whiten(rng, __raw_rm_readq(rng->regs_real)); ++ } + return 1; + } + EXPORT_SYMBOL_GPL(powernv_get_random_long); +-- +2.35.1 + diff --git a/queue-5.18/s390-unwind-fix-fgraph-return-address-recovery.patch b/queue-5.18/s390-unwind-fix-fgraph-return-address-recovery.patch new file mode 100644 index 00000000000..02c34a00f7b --- /dev/null +++ b/queue-5.18/s390-unwind-fix-fgraph-return-address-recovery.patch @@ -0,0 +1,46 @@ +From acaca81b30149dc942304fa4d9460deb6ab485ba Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 26 Jul 2022 18:57:59 +0200 +Subject: s390/unwind: fix fgraph return address recovery + +From: Sumanth Korikkar + +[ Upstream commit ded466e1806686794b403ebf031133bbaca76bb2 ] + +When HAVE_FUNCTION_GRAPH_RET_ADDR_PTR is defined, the return +address to the fgraph caller is recovered by tagging it along with the +stack pointer of ftrace stack. This makes the stack unwinding more +reliable. + +When the fgraph return address is modified to return_to_handler, +ftrace_graph_ret_addr tries to restore it to the original +value using tagged stack pointer. + +Fix this by passing tagged sp to ftrace_graph_ret_addr. + +Fixes: d81675b60d09 ("s390/unwind: recover kretprobe modified return address in stacktrace") +Cc: # 5.18 +Reviewed-by: Vasily Gorbik +Signed-off-by: Sumanth Korikkar +Signed-off-by: Alexander Gordeev +Signed-off-by: Sasha Levin +--- + arch/s390/include/asm/unwind.h | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/arch/s390/include/asm/unwind.h b/arch/s390/include/asm/unwind.h +index 0bf06f1682d8..02462e7100c1 100644 +--- a/arch/s390/include/asm/unwind.h ++++ b/arch/s390/include/asm/unwind.h +@@ -47,7 +47,7 @@ struct unwind_state { + static inline unsigned long unwind_recover_ret_addr(struct unwind_state *state, + unsigned long ip) + { +- ip = ftrace_graph_ret_addr(state->task, &state->graph_idx, ip, NULL); ++ ip = ftrace_graph_ret_addr(state->task, &state->graph_idx, ip, (void *)state->sp); + if (is_kretprobe_trampoline(ip)) + ip = kretprobe_find_ret_addr(state->task, (void *)state->sp, &state->kr_cur); + return ip; +-- +2.35.1 + diff --git a/queue-5.18/scsi-qla2xxx-edif-fix-dropped-ike-message.patch b/queue-5.18/scsi-qla2xxx-edif-fix-dropped-ike-message.patch new file mode 100644 index 00000000000..d421910c909 --- /dev/null +++ b/queue-5.18/scsi-qla2xxx-edif-fix-dropped-ike-message.patch @@ -0,0 +1,126 @@ +From 983ed3d61efd61bb2643bd8b27af58c45e0dfcce Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 12 Jul 2022 22:20:40 -0700 +Subject: scsi: qla2xxx: edif: Fix dropped IKE message + +From: Quinn Tran + +[ Upstream commit c019cd656e717349ff22d0c41d6fbfc773f48c52 ] + +This patch fixes IKE message being dropped due to error in processing Purex +IOCB and Continuation IOCBs. 
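+
+(Roughly what the rework below does: qla_chk_cont_iocb_avail() now
+counts how many IOCBs have actually arrived between the entry that
+opened the multi-IOCB ELS and the current response-queue in pointer.
+If fewer than pkt->entry_count have landed, the caller rewinds
+rsp->ring_ptr and rsp->ring_index and defers the whole exchange to a
+later interrupt instead of partially consuming and dropping it.)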
+ +Link: https://lore.kernel.org/r/20220713052045.10683-6-njavali@marvell.com +Fixes: fac2807946c1 ("scsi: qla2xxx: edif: Add extraction of auth_els from the wire") +Cc: stable@vger.kernel.org +Reviewed-by: Himanshu Madhani +Signed-off-by: Quinn Tran +Signed-off-by: Nilesh Javali +Signed-off-by: Martin K. Petersen +Signed-off-by: Sasha Levin +--- + drivers/scsi/qla2xxx/qla_isr.c | 54 +++++++++++++++------------------- + 1 file changed, 24 insertions(+), 30 deletions(-) + +diff --git a/drivers/scsi/qla2xxx/qla_isr.c b/drivers/scsi/qla2xxx/qla_isr.c +index c509bb23af40..125b208cf118 100644 +--- a/drivers/scsi/qla2xxx/qla_isr.c ++++ b/drivers/scsi/qla2xxx/qla_isr.c +@@ -3711,12 +3711,11 @@ void qla24xx_nvme_ls4_iocb(struct scsi_qla_host *vha, + * Return: 0 all iocbs has arrived, xx- all iocbs have not arrived. + */ + static int qla_chk_cont_iocb_avail(struct scsi_qla_host *vha, +- struct rsp_que *rsp, response_t *pkt) ++ struct rsp_que *rsp, response_t *pkt, u32 rsp_q_in) + { +- int start_pkt_ring_index, end_pkt_ring_index, n_ring_index; +- response_t *end_pkt; ++ int start_pkt_ring_index; ++ u32 iocb_cnt = 0; + int rc = 0; +- u32 rsp_q_in; + + if (pkt->entry_count == 1) + return rc; +@@ -3727,34 +3726,18 @@ static int qla_chk_cont_iocb_avail(struct scsi_qla_host *vha, + else + start_pkt_ring_index = rsp->ring_index - 1; + +- if ((start_pkt_ring_index + pkt->entry_count) >= rsp->length) +- end_pkt_ring_index = start_pkt_ring_index + pkt->entry_count - +- rsp->length - 1; ++ if (rsp_q_in < start_pkt_ring_index) ++ /* q in ptr is wrapped */ ++ iocb_cnt = rsp->length - start_pkt_ring_index + rsp_q_in; + else +- end_pkt_ring_index = start_pkt_ring_index + pkt->entry_count - 1; ++ iocb_cnt = rsp_q_in - start_pkt_ring_index; + +- end_pkt = rsp->ring + end_pkt_ring_index; +- +- /* next pkt = end_pkt + 1 */ +- n_ring_index = end_pkt_ring_index + 1; +- if (n_ring_index >= rsp->length) +- n_ring_index = 0; +- +- rsp_q_in = rsp->qpair->use_shadow_reg ? *rsp->in_ptr : +- rd_reg_dword(rsp->rsp_q_in); +- +- /* rsp_q_in is either wrapped or pointing beyond endpkt */ +- if ((rsp_q_in < start_pkt_ring_index && rsp_q_in < n_ring_index) || +- rsp_q_in >= n_ring_index) +- /* all IOCBs arrived. 
*/ +- rc = 0; +- else ++ if (iocb_cnt < pkt->entry_count) + rc = -EIO; + +- ql_dbg(ql_dbg_init + ql_dbg_verbose, vha, 0x5091, +- "%s - ring %p pkt %p end pkt %p entry count %#x rsp_q_in %d rc %d\n", +- __func__, rsp->ring, pkt, end_pkt, pkt->entry_count, +- rsp_q_in, rc); ++ ql_dbg(ql_dbg_init, vha, 0x5091, ++ "%s - ring %p pkt %p entry count %d iocb_cnt %d rsp_q_in %d rc %d\n", ++ __func__, rsp->ring, pkt, pkt->entry_count, iocb_cnt, rsp_q_in, rc); + + return rc; + } +@@ -3771,7 +3754,7 @@ void qla24xx_process_response_queue(struct scsi_qla_host *vha, + struct qla_hw_data *ha = vha->hw; + struct purex_entry_24xx *purex_entry; + struct purex_item *pure_item; +- u16 rsp_in = 0; ++ u16 rsp_in = 0, cur_ring_index; + int follow_inptr, is_shadow_hba; + + if (!ha->flags.fw_started) +@@ -3802,6 +3785,7 @@ void qla24xx_process_response_queue(struct scsi_qla_host *vha, + (!follow_inptr && + rsp->ring_ptr->signature != RESPONSE_PROCESSED)) { + pkt = (struct sts_entry_24xx *)rsp->ring_ptr; ++ cur_ring_index = rsp->ring_index; + + rsp->ring_index++; + if (rsp->ring_index == rsp->length) { +@@ -3922,7 +3906,17 @@ void qla24xx_process_response_queue(struct scsi_qla_host *vha, + break; + + case ELS_AUTH_ELS: +- if (qla_chk_cont_iocb_avail(vha, rsp, (response_t *)pkt)) { ++ if (qla_chk_cont_iocb_avail(vha, rsp, (response_t *)pkt, rsp_in)) { ++ /* ++ * ring_ptr and ring_index were ++ * pre-incremented above. Reset them ++ * back to current. Wait for next ++ * interrupt with all IOCBs to arrive ++ * and re-process. ++ */ ++ rsp->ring_ptr = (response_t *)pkt; ++ rsp->ring_index = cur_ring_index; ++ + ql_dbg(ql_dbg_init, vha, 0x5091, + "Defer processing ELS opcode %#x...\n", + purex_entry->els_frame_payload[3]); +-- +2.35.1 + diff --git a/queue-5.18/scsi-qla2xxx-fix-crash-due-to-stale-srb-access-aroun.patch b/queue-5.18/scsi-qla2xxx-fix-crash-due-to-stale-srb-access-aroun.patch new file mode 100644 index 00000000000..3238e59d025 --- /dev/null +++ b/queue-5.18/scsi-qla2xxx-fix-crash-due-to-stale-srb-access-aroun.patch @@ -0,0 +1,125 @@ +From ff24e11b6f46bae7af4d61b2021903bdc747c5ec Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 15 Jun 2022 22:35:02 -0700 +Subject: scsi: qla2xxx: Fix crash due to stale SRB access around I/O timeouts + +From: Arun Easi + +[ Upstream commit c39587bc0abaf16593f7abcdf8aeec3c038c7d52 ] + +Ensure SRB is returned during I/O timeout error escalation. If that is not +possible fail the escalation path. + +Following crash stack was seen: + +BUG: unable to handle kernel paging request at 0000002f56aa90f8 +IP: qla_chk_edif_rx_sa_delete_pending+0x14/0x30 [qla2xxx] +Call Trace: + ? qla2x00_status_entry+0x19f/0x1c50 [qla2xxx] + ? qla2x00_start_sp+0x116/0x1170 [qla2xxx] + ? dma_pool_alloc+0x1d6/0x210 + ? mempool_alloc+0x54/0x130 + ? qla24xx_process_response_queue+0x548/0x12b0 [qla2xxx] + ? qla_do_work+0x2d/0x40 [qla2xxx] + ? process_one_work+0x14c/0x390 + +Link: https://lore.kernel.org/r/20220616053508.27186-6-njavali@marvell.com +Fixes: d74595278f4a ("scsi: qla2xxx: Add multiple queue pair functionality.") +Cc: stable@vger.kernel.org +Signed-off-by: Arun Easi +Signed-off-by: Nilesh Javali +Signed-off-by: Martin K. 
Petersen +Signed-off-by: Sasha Levin +--- + drivers/scsi/qla2xxx/qla_os.c | 43 +++++++++++++++++++++++++---------- + 1 file changed, 31 insertions(+), 12 deletions(-) + +diff --git a/drivers/scsi/qla2xxx/qla_os.c b/drivers/scsi/qla2xxx/qla_os.c +index daa9a3c3f7b5..f9ad0847782d 100644 +--- a/drivers/scsi/qla2xxx/qla_os.c ++++ b/drivers/scsi/qla2xxx/qla_os.c +@@ -1342,21 +1342,20 @@ qla2xxx_eh_abort(struct scsi_cmnd *cmd) + /* + * Returns: QLA_SUCCESS or QLA_FUNCTION_FAILED. + */ +-int +-qla2x00_eh_wait_for_pending_commands(scsi_qla_host_t *vha, unsigned int t, +- uint64_t l, enum nexus_wait_type type) ++static int ++__qla2x00_eh_wait_for_pending_commands(struct qla_qpair *qpair, unsigned int t, ++ uint64_t l, enum nexus_wait_type type) + { + int cnt, match, status; + unsigned long flags; +- struct qla_hw_data *ha = vha->hw; +- struct req_que *req; ++ scsi_qla_host_t *vha = qpair->vha; ++ struct req_que *req = qpair->req; + srb_t *sp; + struct scsi_cmnd *cmd; + + status = QLA_SUCCESS; + +- spin_lock_irqsave(&ha->hardware_lock, flags); +- req = vha->req; ++ spin_lock_irqsave(qpair->qp_lock_ptr, flags); + for (cnt = 1; status == QLA_SUCCESS && + cnt < req->num_outstanding_cmds; cnt++) { + sp = req->outstanding_cmds[cnt]; +@@ -1383,12 +1382,32 @@ qla2x00_eh_wait_for_pending_commands(scsi_qla_host_t *vha, unsigned int t, + if (!match) + continue; + +- spin_unlock_irqrestore(&ha->hardware_lock, flags); ++ spin_unlock_irqrestore(qpair->qp_lock_ptr, flags); + status = qla2x00_eh_wait_on_command(cmd); +- spin_lock_irqsave(&ha->hardware_lock, flags); ++ spin_lock_irqsave(qpair->qp_lock_ptr, flags); + } +- spin_unlock_irqrestore(&ha->hardware_lock, flags); ++ spin_unlock_irqrestore(qpair->qp_lock_ptr, flags); ++ ++ return status; ++} ++ ++int ++qla2x00_eh_wait_for_pending_commands(scsi_qla_host_t *vha, unsigned int t, ++ uint64_t l, enum nexus_wait_type type) ++{ ++ struct qla_qpair *qpair; ++ struct qla_hw_data *ha = vha->hw; ++ int i, status = QLA_SUCCESS; + ++ status = __qla2x00_eh_wait_for_pending_commands(ha->base_qpair, t, l, ++ type); ++ for (i = 0; status == QLA_SUCCESS && i < ha->max_qpairs; i++) { ++ qpair = ha->queue_pair_map[i]; ++ if (!qpair) ++ continue; ++ status = __qla2x00_eh_wait_for_pending_commands(qpair, t, l, ++ type); ++ } + return status; + } + +@@ -1425,7 +1444,7 @@ qla2xxx_eh_device_reset(struct scsi_cmnd *cmd) + return err; + + if (fcport->deleted) +- return SUCCESS; ++ return FAILED; + + ql_log(ql_log_info, vha, 0x8009, + "DEVICE RESET ISSUED nexus=%ld:%d:%llu cmd=%p.\n", vha->host_no, +@@ -1493,7 +1512,7 @@ qla2xxx_eh_target_reset(struct scsi_cmnd *cmd) + return err; + + if (fcport->deleted) +- return SUCCESS; ++ return FAILED; + + ql_log(ql_log_info, vha, 0x8009, + "TARGET RESET ISSUED nexus=%ld:%d cmd=%p.\n", vha->host_no, +-- +2.35.1 + diff --git a/queue-5.18/scsi-qla2xxx-fix-discovery-issues-in-fc-al-topology.patch-4818 b/queue-5.18/scsi-qla2xxx-fix-discovery-issues-in-fc-al-topology.patch-4818 new file mode 100644 index 00000000000..323af66e2d8 --- /dev/null +++ b/queue-5.18/scsi-qla2xxx-fix-discovery-issues-in-fc-al-topology.patch-4818 @@ -0,0 +1,116 @@ +From ada2019561e89a831747aab73d489754049451a5 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 12 Jul 2022 22:20:42 -0700 +Subject: scsi: qla2xxx: Fix discovery issues in FC-AL topology + +From: Arun Easi + +[ Upstream commit 47ccb113cead905bdc236571bf8ac6fed90321b3 ] + +A direct attach tape device, when gets swapped with another, was not +discovered. 
Fix this by looking at loop map and reinitialize link if there +are devices present. + +Link: https://lore.kernel.org/linux-scsi/baef87c3-5dad-3b47-44c1-6914bfc90108@cybernetics.com/ +Link: https://lore.kernel.org/r/20220713052045.10683-8-njavali@marvell.com +Cc: stable@vger.kernel.org +Reported-by: Tony Battersby +Tested-by: Tony Battersby +Reviewed-by: Himanshu Madhani +Signed-off-by: Arun Easi +Signed-off-by: Nilesh Javali +Signed-off-by: Martin K. Petersen +Signed-off-by: Sasha Levin +--- + drivers/scsi/qla2xxx/qla_gbl.h | 3 ++- + drivers/scsi/qla2xxx/qla_init.c | 29 +++++++++++++++++++++++++++++ + drivers/scsi/qla2xxx/qla_mbx.c | 5 ++++- + 3 files changed, 35 insertions(+), 2 deletions(-) + +diff --git a/drivers/scsi/qla2xxx/qla_gbl.h b/drivers/scsi/qla2xxx/qla_gbl.h +index 20ae0ef7d078..331b33200f50 100644 +--- a/drivers/scsi/qla2xxx/qla_gbl.h ++++ b/drivers/scsi/qla2xxx/qla_gbl.h +@@ -436,7 +436,8 @@ extern int + qla2x00_get_resource_cnts(scsi_qla_host_t *); + + extern int +-qla2x00_get_fcal_position_map(scsi_qla_host_t *ha, char *pos_map); ++qla2x00_get_fcal_position_map(scsi_qla_host_t *ha, char *pos_map, ++ u8 *num_entries); + + extern int + qla2x00_get_link_status(scsi_qla_host_t *, uint16_t, struct link_statistics *, +diff --git a/drivers/scsi/qla2xxx/qla_init.c b/drivers/scsi/qla2xxx/qla_init.c +index 01c7eda51d5a..51503a316b10 100644 +--- a/drivers/scsi/qla2xxx/qla_init.c ++++ b/drivers/scsi/qla2xxx/qla_init.c +@@ -5516,6 +5516,22 @@ static int qla2x00_configure_n2n_loop(scsi_qla_host_t *vha) + return QLA_FUNCTION_FAILED; + } + ++static void ++qla_reinitialize_link(scsi_qla_host_t *vha) ++{ ++ int rval; ++ ++ atomic_set(&vha->loop_state, LOOP_DOWN); ++ atomic_set(&vha->loop_down_timer, LOOP_DOWN_TIME); ++ rval = qla2x00_full_login_lip(vha); ++ if (rval == QLA_SUCCESS) { ++ ql_dbg(ql_dbg_disc, vha, 0xd050, "Link reinitialized\n"); ++ } else { ++ ql_dbg(ql_dbg_disc, vha, 0xd051, ++ "Link reinitialization failed (%d)\n", rval); ++ } ++} ++ + /* + * qla2x00_configure_local_loop + * Updates Fibre Channel Device Database with local loop devices. +@@ -5567,6 +5583,19 @@ qla2x00_configure_local_loop(scsi_qla_host_t *vha) + spin_unlock_irqrestore(&vha->work_lock, flags); + + if (vha->scan.scan_retry < MAX_SCAN_RETRIES) { ++ u8 loop_map_entries = 0; ++ int rc; ++ ++ rc = qla2x00_get_fcal_position_map(vha, NULL, ++ &loop_map_entries); ++ if (rc == QLA_SUCCESS && loop_map_entries > 1) { ++ /* ++ * There are devices that are still not logged ++ * in. Reinitialize to give them a chance. ++ */ ++ qla_reinitialize_link(vha); ++ return QLA_FUNCTION_FAILED; ++ } + set_bit(LOCAL_LOOP_UPDATE, &vha->dpc_flags); + set_bit(LOOP_RESYNC_NEEDED, &vha->dpc_flags); + } +diff --git a/drivers/scsi/qla2xxx/qla_mbx.c b/drivers/scsi/qla2xxx/qla_mbx.c +index bcade1deb798..86d8c455c07a 100644 +--- a/drivers/scsi/qla2xxx/qla_mbx.c ++++ b/drivers/scsi/qla2xxx/qla_mbx.c +@@ -3068,7 +3068,8 @@ qla2x00_get_resource_cnts(scsi_qla_host_t *vha) + * Kernel context. 
+ */ + int +-qla2x00_get_fcal_position_map(scsi_qla_host_t *vha, char *pos_map) ++qla2x00_get_fcal_position_map(scsi_qla_host_t *vha, char *pos_map, ++ u8 *num_entries) + { + int rval; + mbx_cmd_t mc; +@@ -3108,6 +3109,8 @@ qla2x00_get_fcal_position_map(scsi_qla_host_t *vha, char *pos_map) + + if (pos_map) + memcpy(pos_map, pmap, FCAL_MAP_SIZE); ++ if (num_entries) ++ *num_entries = pmap[0]; + } + dma_pool_free(ha->s_dma_pool, pmap, pmap_dma); + +-- +2.35.1 + diff --git a/queue-5.18/scsi-qla2xxx-fix-erroneous-mailbox-timeout-after-pci.patch b/queue-5.18/scsi-qla2xxx-fix-erroneous-mailbox-timeout-after-pci.patch new file mode 100644 index 00000000000..21005b32a42 --- /dev/null +++ b/queue-5.18/scsi-qla2xxx-fix-erroneous-mailbox-timeout-after-pci.patch @@ -0,0 +1,67 @@ +From ed1aa089d6371962ed13e3fe349ef0d02f660393 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 15 Jun 2022 22:35:07 -0700 +Subject: scsi: qla2xxx: Fix erroneous mailbox timeout after PCI error + injection + +From: Quinn Tran + +[ Upstream commit f260694e6463b63ae550aad25ddefe94cb1904da ] + +Clear wait for mailbox interrupt flag to prevent stale mailbox: + +Feb 22 05:22:56 ltcden4-lp7 kernel: qla2xxx [0135:90:00.1]-500a:4: LOOP UP detected (16 Gbps). +Feb 22 05:22:59 ltcden4-lp7 kernel: qla2xxx [0135:90:00.1]-d04c:4: MBX Command timeout for cmd 69, ... + +To fix the issue, driver needs to clear the MBX_INTR_WAIT flag on purging +the mailbox. When the stale mailbox completion does arrive, it will be +dropped. + +Link: https://lore.kernel.org/r/20220616053508.27186-11-njavali@marvell.com +Fixes: b6faaaf796d7 ("scsi: qla2xxx: Serialize mailbox request") +Cc: Naresh Bannoth +Cc: Kyle Mahlkuch +Cc: stable@vger.kernel.org +Reported-by: Naresh Bannoth +Tested-by: Naresh Bannoth +Signed-off-by: Quinn Tran +Signed-off-by: Nilesh Javali +Signed-off-by: Martin K. Petersen +Signed-off-by: Sasha Levin +--- + drivers/scsi/qla2xxx/qla_mbx.c | 12 ++++++------ + 1 file changed, 6 insertions(+), 6 deletions(-) + +diff --git a/drivers/scsi/qla2xxx/qla_mbx.c b/drivers/scsi/qla2xxx/qla_mbx.c +index 15d8866046b3..bcade1deb798 100644 +--- a/drivers/scsi/qla2xxx/qla_mbx.c ++++ b/drivers/scsi/qla2xxx/qla_mbx.c +@@ -276,6 +276,12 @@ qla2x00_mailbox_command(scsi_qla_host_t *vha, mbx_cmd_t *mcp) + atomic_inc(&ha->num_pend_mbx_stage3); + if (!wait_for_completion_timeout(&ha->mbx_intr_comp, + mcp->tov * HZ)) { ++ ql_dbg(ql_dbg_mbx, vha, 0x117a, ++ "cmd=%x Timeout.\n", command); ++ spin_lock_irqsave(&ha->hardware_lock, flags); ++ clear_bit(MBX_INTR_WAIT, &ha->mbx_cmd_flags); ++ spin_unlock_irqrestore(&ha->hardware_lock, flags); ++ + if (chip_reset != ha->chip_reset) { + eeh_delay = ha->flags.eeh_busy ? 1 : 0; + +@@ -288,12 +294,6 @@ qla2x00_mailbox_command(scsi_qla_host_t *vha, mbx_cmd_t *mcp) + rval = QLA_ABORTED; + goto premature_exit; + } +- ql_dbg(ql_dbg_mbx, vha, 0x117a, +- "cmd=%x Timeout.\n", command); +- spin_lock_irqsave(&ha->hardware_lock, flags); +- clear_bit(MBX_INTR_WAIT, &ha->mbx_cmd_flags); +- spin_unlock_irqrestore(&ha->hardware_lock, flags); +- + } else if (ha->flags.purge_mbox || + chip_reset != ha->chip_reset) { + eeh_delay = ha->flags.eeh_busy ? 
1 : 0; +-- +2.35.1 + diff --git a/queue-5.18/scsi-qla2xxx-fix-excessive-i-o-error-messages-by-def.patch b/queue-5.18/scsi-qla2xxx-fix-excessive-i-o-error-messages-by-def.patch new file mode 100644 index 00000000000..3d45b9e051e --- /dev/null +++ b/queue-5.18/scsi-qla2xxx-fix-excessive-i-o-error-messages-by-def.patch @@ -0,0 +1,48 @@ +From 1537088b80ff6a934403b341ee3ffb445867f3d3 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 15 Jun 2022 22:34:58 -0700 +Subject: scsi: qla2xxx: Fix excessive I/O error messages by default + +From: Arun Easi + +[ Upstream commit bff4873c709085e09d0ffae0c25b8e65256e3205 ] + +Disable printing I/O error messages by default. The messages will be +printed only when logging was enabled. + +Link: https://lore.kernel.org/r/20220616053508.27186-2-njavali@marvell.com +Fixes: 8e2d81c6b5be ("scsi: qla2xxx: Fix excessive messages during device logout") +Cc: stable@vger.kernel.org +Signed-off-by: Arun Easi +Signed-off-by: Nilesh Javali +Signed-off-by: Martin K. Petersen +Signed-off-by: Sasha Levin +--- + drivers/scsi/qla2xxx/qla_isr.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/drivers/scsi/qla2xxx/qla_isr.c b/drivers/scsi/qla2xxx/qla_isr.c +index ad55eace66aa..5e99f559230f 100644 +--- a/drivers/scsi/qla2xxx/qla_isr.c ++++ b/drivers/scsi/qla2xxx/qla_isr.c +@@ -2637,7 +2637,7 @@ static void qla24xx_nvme_iocb_entry(scsi_qla_host_t *vha, struct req_que *req, + } + + if (unlikely(logit)) +- ql_log(ql_dbg_io, fcport->vha, 0x5060, ++ ql_dbg(ql_dbg_io, fcport->vha, 0x5060, + "NVME-%s ERR Handling - hdl=%x status(%x) tr_len:%x resid=%x ox_id=%x\n", + sp->name, sp->handle, comp_status, + fd->transferred_length, le32_to_cpu(sts->residual_len), +@@ -3495,7 +3495,7 @@ qla2x00_status_entry(scsi_qla_host_t *vha, struct rsp_que *rsp, void *pkt) + + out: + if (logit) +- ql_log(ql_dbg_io, fcport->vha, 0x3022, ++ ql_dbg(ql_dbg_io, fcport->vha, 0x3022, + "FCP command status: 0x%x-0x%x (0x%x) nexus=%ld:%d:%llu portid=%02x%02x%02x oxid=0x%x cdb=%10phN len=0x%x rsp_info=0x%x resid=0x%x fw_resid=0x%x sp=%p cp=%p.\n", + comp_status, scsi_status, res, vha->host_no, + cp->device->id, cp->device->lun, fcport->d_id.b.domain, +-- +2.35.1 + diff --git a/queue-5.18/scsi-qla2xxx-fix-imbalance-vha-vref_count.patch-27970 b/queue-5.18/scsi-qla2xxx-fix-imbalance-vha-vref_count.patch-27970 new file mode 100644 index 00000000000..0b1d6e1dcf8 --- /dev/null +++ b/queue-5.18/scsi-qla2xxx-fix-imbalance-vha-vref_count.patch-27970 @@ -0,0 +1,61 @@ +From 10606f5e0ba0d8d8210cc69be72aa4035377a56d Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 12 Jul 2022 22:20:41 -0700 +Subject: scsi: qla2xxx: Fix imbalance vha->vref_count + +From: Quinn Tran + +[ Upstream commit 63fa7f2644b4b48e1913af33092c044bf48e9321 ] + +vref_count took an extra decrement in the task management path. Add an +extra ref count to compensate the imbalance. + +Link: https://lore.kernel.org/r/20220713052045.10683-7-njavali@marvell.com +Cc: stable@vger.kernel.org +Reviewed-by: Himanshu Madhani +Signed-off-by: Quinn Tran +Signed-off-by: Nilesh Javali +Signed-off-by: Martin K. 
Petersen +Signed-off-by: Sasha Levin +--- + drivers/scsi/qla2xxx/qla_init.c | 4 ++++ + 1 file changed, 4 insertions(+) + +diff --git a/drivers/scsi/qla2xxx/qla_init.c b/drivers/scsi/qla2xxx/qla_init.c +index 3df66934fb1e..01c7eda51d5a 100644 +--- a/drivers/scsi/qla2xxx/qla_init.c ++++ b/drivers/scsi/qla2xxx/qla_init.c +@@ -168,6 +168,7 @@ int qla24xx_async_abort_cmd(srb_t *cmd_sp, bool wait) + struct srb_iocb *abt_iocb; + srb_t *sp; + int rval = QLA_FUNCTION_FAILED; ++ uint8_t bail; + + /* ref: INIT for ABTS command */ + sp = qla2xxx_get_qpair_sp(cmd_sp->vha, cmd_sp->qpair, cmd_sp->fcport, +@@ -175,6 +176,7 @@ int qla24xx_async_abort_cmd(srb_t *cmd_sp, bool wait) + if (!sp) + return QLA_MEMORY_ALLOC_FAILED; + ++ QLA_VHA_MARK_BUSY(vha, bail); + abt_iocb = &sp->u.iocb_cmd; + sp->type = SRB_ABT_CMD; + sp->name = "abort"; +@@ -2018,12 +2020,14 @@ qla2x00_async_tm_cmd(fc_port_t *fcport, uint32_t flags, uint32_t lun, + struct srb_iocb *tm_iocb; + srb_t *sp; + int rval = QLA_FUNCTION_FAILED; ++ uint8_t bail; + + /* ref: INIT */ + sp = qla2x00_get_sp(vha, fcport, GFP_KERNEL); + if (!sp) + goto done; + ++ QLA_VHA_MARK_BUSY(vha, bail); + sp->type = SRB_TM_CMD; + sp->name = "tmf"; + qla2x00_init_async_sp(sp, qla2x00_get_async_timeout(vha), +-- +2.35.1 + diff --git a/queue-5.18/scsi-qla2xxx-fix-losing-fcp-2-targets-during-port-pe.patch b/queue-5.18/scsi-qla2xxx-fix-losing-fcp-2-targets-during-port-pe.patch new file mode 100644 index 00000000000..1bb75978768 --- /dev/null +++ b/queue-5.18/scsi-qla2xxx-fix-losing-fcp-2-targets-during-port-pe.patch @@ -0,0 +1,41 @@ +From eea51e30df139a84ea79448c0a19833387a8bb12 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 15 Jun 2022 22:35:03 -0700 +Subject: scsi: qla2xxx: Fix losing FCP-2 targets during port perturbation + tests + +From: Arun Easi + +[ Upstream commit 58d1c124cd79ea686b512043c5bd515590b2ed95 ] + +When a mix of FCP-2 (tape) and non-FCP-2 targets are present, FCP-2 target +state was incorrectly transitioned when both of the targets were gone. Fix +this by ignoring state transition for FCP-2 targets. + +Link: https://lore.kernel.org/r/20220616053508.27186-7-njavali@marvell.com +Fixes: 44c57f205876 ("scsi: qla2xxx: Changes to support FCP2 Target") +Cc: stable@vger.kernel.org +Signed-off-by: Arun Easi +Signed-off-by: Nilesh Javali +Signed-off-by: Martin K. 
Petersen +Signed-off-by: Sasha Levin +--- + drivers/scsi/qla2xxx/qla_gs.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/drivers/scsi/qla2xxx/qla_gs.c b/drivers/scsi/qla2xxx/qla_gs.c +index c914b5df9c12..7ca734337000 100644 +--- a/drivers/scsi/qla2xxx/qla_gs.c ++++ b/drivers/scsi/qla2xxx/qla_gs.c +@@ -3629,7 +3629,7 @@ void qla24xx_async_gnnft_done(scsi_qla_host_t *vha, srb_t *sp) + do_delete) { + if (fcport->loop_id != FC_NO_LOOP_ID) { + if (fcport->flags & FCF_FCP2_DEVICE) +- fcport->logout_on_delete = 0; ++ continue; + + ql_log(ql_log_warn, vha, 0x20f0, + "%s %d %8phC post del sess\n", +-- +2.35.1 + diff --git a/queue-5.18/scsi-qla2xxx-fix-losing-fcp-2-targets-on-long-port-d.patch b/queue-5.18/scsi-qla2xxx-fix-losing-fcp-2-targets-on-long-port-d.patch new file mode 100644 index 00000000000..86c4dce1d60 --- /dev/null +++ b/queue-5.18/scsi-qla2xxx-fix-losing-fcp-2-targets-on-long-port-d.patch @@ -0,0 +1,72 @@ +From cd632680dceb7e22c58ec69f997ffaaf7c4c0dac Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 15 Jun 2022 22:35:06 -0700 +Subject: scsi: qla2xxx: Fix losing FCP-2 targets on long port disable with + I/Os + +From: Arun Easi + +[ Upstream commit 2416ccd3815ba1613e10a6da0a24ef21acfe5633 ] + +FCP-2 devices were not coming back online once they were lost, login +retries exhausted, and then came back up. Fix this by accepting RSCN when +the device is not online. + +Link: https://lore.kernel.org/r/20220616053508.27186-10-njavali@marvell.com +Fixes: 44c57f205876 ("scsi: qla2xxx: Changes to support FCP2 Target") +Cc: stable@vger.kernel.org +Signed-off-by: Arun Easi +Signed-off-by: Nilesh Javali +Signed-off-by: Martin K. Petersen +Signed-off-by: Sasha Levin +--- + drivers/scsi/qla2xxx/qla_init.c | 12 ++++++++---- + 1 file changed, 8 insertions(+), 4 deletions(-) + +diff --git a/drivers/scsi/qla2xxx/qla_init.c b/drivers/scsi/qla2xxx/qla_init.c +index a0d26e2e0ce1..3df66934fb1e 100644 +--- a/drivers/scsi/qla2xxx/qla_init.c ++++ b/drivers/scsi/qla2xxx/qla_init.c +@@ -1832,7 +1832,8 @@ void qla2x00_handle_rscn(scsi_qla_host_t *vha, struct event_arg *ea) + case RSCN_PORT_ADDR: + fcport = qla2x00_find_fcport_by_nportid(vha, &ea->id, 1); + if (fcport) { +- if (fcport->flags & FCF_FCP2_DEVICE) { ++ if (fcport->flags & FCF_FCP2_DEVICE && ++ atomic_read(&fcport->state) == FCS_ONLINE) { + ql_dbg(ql_dbg_disc, vha, 0x2115, + "Delaying session delete for FCP2 portid=%06x %8phC ", + fcport->d_id.b24, fcport->port_name); +@@ -1864,7 +1865,8 @@ void qla2x00_handle_rscn(scsi_qla_host_t *vha, struct event_arg *ea) + break; + case RSCN_AREA_ADDR: + list_for_each_entry(fcport, &vha->vp_fcports, list) { +- if (fcport->flags & FCF_FCP2_DEVICE) ++ if (fcport->flags & FCF_FCP2_DEVICE && ++ atomic_read(&fcport->state) == FCS_ONLINE) + continue; + + if ((ea->id.b24 & 0xffff00) == (fcport->d_id.b24 & 0xffff00)) { +@@ -1875,7 +1877,8 @@ void qla2x00_handle_rscn(scsi_qla_host_t *vha, struct event_arg *ea) + break; + case RSCN_DOM_ADDR: + list_for_each_entry(fcport, &vha->vp_fcports, list) { +- if (fcport->flags & FCF_FCP2_DEVICE) ++ if (fcport->flags & FCF_FCP2_DEVICE && ++ atomic_read(&fcport->state) == FCS_ONLINE) + continue; + + if ((ea->id.b24 & 0xff0000) == (fcport->d_id.b24 & 0xff0000)) { +@@ -1887,7 +1890,8 @@ void qla2x00_handle_rscn(scsi_qla_host_t *vha, struct event_arg *ea) + case RSCN_FAB_ADDR: + default: + list_for_each_entry(fcport, &vha->vp_fcports, list) { +- if (fcport->flags & FCF_FCP2_DEVICE) ++ if (fcport->flags & FCF_FCP2_DEVICE && ++ atomic_read(&fcport->state) == 
FCS_ONLINE) + continue; + + fcport->scan_needed = 1; +-- +2.35.1 + diff --git a/queue-5.18/scsi-qla2xxx-fix-losing-target-when-it-reappears-dur.patch b/queue-5.18/scsi-qla2xxx-fix-losing-target-when-it-reappears-dur.patch new file mode 100644 index 00000000000..0fe225a3dfd --- /dev/null +++ b/queue-5.18/scsi-qla2xxx-fix-losing-target-when-it-reappears-dur.patch @@ -0,0 +1,84 @@ +From a9228406b04cbe27522f81b77535befe4c5e7924 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 15 Jun 2022 22:35:04 -0700 +Subject: scsi: qla2xxx: Fix losing target when it reappears during delete + +From: Arun Easi + +[ Upstream commit 118b0c863c8f5629cc5271fc24d72d926e0715d9 ] + +FC target disappeared during port perturbation tests due to a race that +tramples target state. Fix the issue by adding state checks before +proceeding. + +Link: https://lore.kernel.org/r/20220616053508.27186-8-njavali@marvell.com +Fixes: 44c57f205876 ("scsi: qla2xxx: Changes to support FCP2 Target") +Cc: stable@vger.kernel.org +Signed-off-by: Arun Easi +Signed-off-by: Nilesh Javali +Signed-off-by: Martin K. Petersen +Signed-off-by: Sasha Levin +--- + drivers/scsi/qla2xxx/qla_attr.c | 24 +++++++++++++++++------- + 1 file changed, 17 insertions(+), 7 deletions(-) + +diff --git a/drivers/scsi/qla2xxx/qla_attr.c b/drivers/scsi/qla2xxx/qla_attr.c +index 3b3e4234f37a..412ad888bdc1 100644 +--- a/drivers/scsi/qla2xxx/qla_attr.c ++++ b/drivers/scsi/qla2xxx/qla_attr.c +@@ -2716,17 +2716,24 @@ qla2x00_dev_loss_tmo_callbk(struct fc_rport *rport) + if (!fcport) + return; + +- /* Now that the rport has been deleted, set the fcport state to +- FCS_DEVICE_DEAD */ +- qla2x00_set_fcport_state(fcport, FCS_DEVICE_DEAD); ++ ++ /* ++ * Now that the rport has been deleted, set the fcport state to ++ * FCS_DEVICE_DEAD, if the fcport is still lost. ++ */ ++ if (fcport->scan_state != QLA_FCPORT_FOUND) ++ qla2x00_set_fcport_state(fcport, FCS_DEVICE_DEAD); + + /* + * Transport has effectively 'deleted' the rport, clear + * all local references. + */ + spin_lock_irqsave(host->host_lock, flags); +- fcport->rport = fcport->drport = NULL; +- *((fc_port_t **)rport->dd_data) = NULL; ++ /* Confirm port has not reappeared before clearing pointers. */ ++ if (rport->port_state != FC_PORTSTATE_ONLINE) { ++ fcport->rport = fcport->drport = NULL; ++ *((fc_port_t **)rport->dd_data) = NULL; ++ } + spin_unlock_irqrestore(host->host_lock, flags); + + if (test_bit(ABORT_ISP_ACTIVE, &fcport->vha->dpc_flags)) +@@ -2759,9 +2766,12 @@ qla2x00_terminate_rport_io(struct fc_rport *rport) + /* + * At this point all fcport's software-states are cleared. Perform any + * final cleanup of firmware resources (PCBs and XCBs). ++ * ++ * Attempt to cleanup only lost devices. 
+ */ + if (fcport->loop_id != FC_NO_LOOP_ID) { +- if (IS_FWI2_CAPABLE(fcport->vha->hw)) { ++ if (IS_FWI2_CAPABLE(fcport->vha->hw) && ++ fcport->scan_state != QLA_FCPORT_FOUND) { + if (fcport->loop_id != FC_NO_LOOP_ID) + fcport->logout_on_delete = 1; + +@@ -2771,7 +2781,7 @@ qla2x00_terminate_rport_io(struct fc_rport *rport) + __LINE__); + qlt_schedule_sess_for_deletion(fcport); + } +- } else { ++ } else if (!IS_FWI2_CAPABLE(fcport->vha->hw)) { + qla2x00_port_logout(fcport->vha, fcport); + } + } +-- +2.35.1 + diff --git a/queue-5.18/scsi-qla2xxx-fix-response-queue-handler-reading-stal.patch b/queue-5.18/scsi-qla2xxx-fix-response-queue-handler-reading-stal.patch new file mode 100644 index 00000000000..08cf6a62ebc --- /dev/null +++ b/queue-5.18/scsi-qla2xxx-fix-response-queue-handler-reading-stal.patch @@ -0,0 +1,128 @@ +From ac1c86d5f6f0826e7897d9395a1c6abba08a0f8d Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 12 Jul 2022 22:20:39 -0700 +Subject: scsi: qla2xxx: Fix response queue handler reading stale packets + +From: Arun Easi + +[ Upstream commit b1f707146923335849fb70237eec27d4d1ae7d62 ] + +On some platforms, the current logic of relying on finding new packet +solely based on signature pattern can lead to driver reading stale +packets. Though this is a bug in those platforms, reduce such exposures by +limiting reading packets until the IN pointer. + +Two module parameters are introduced: + + ql2xrspq_follow_inptr: + + When set, on newer adapters that has queue pointer shadowing, look for + response packets only until response queue in pointer. + + When reset, response packets are read based on a signature pattern + logic (old way). + + ql2xrspq_follow_inptr_legacy: + + Like ql2xrspq_follow_inptr, but for those adapters where there is no + queue pointer shadowing. + +Link: https://lore.kernel.org/r/20220713052045.10683-5-njavali@marvell.com +Cc: stable@vger.kernel.org +Reviewed-by: Himanshu Madhani +Signed-off-by: Arun Easi +Signed-off-by: Nilesh Javali +Signed-off-by: Martin K. 
Petersen +Signed-off-by: Sasha Levin +--- + drivers/scsi/qla2xxx/qla_gbl.h | 2 ++ + drivers/scsi/qla2xxx/qla_isr.c | 24 +++++++++++++++++++++++- + drivers/scsi/qla2xxx/qla_os.c | 10 ++++++++++ + 3 files changed, 35 insertions(+), 1 deletion(-) + +diff --git a/drivers/scsi/qla2xxx/qla_gbl.h b/drivers/scsi/qla2xxx/qla_gbl.h +index 84b44454c231..20ae0ef7d078 100644 +--- a/drivers/scsi/qla2xxx/qla_gbl.h ++++ b/drivers/scsi/qla2xxx/qla_gbl.h +@@ -193,6 +193,8 @@ extern int ql2xsecenable; + extern int ql2xenforce_iocb_limit; + extern int ql2xabts_wait_nvme; + extern u32 ql2xnvme_queues; ++extern int ql2xrspq_follow_inptr; ++extern int ql2xrspq_follow_inptr_legacy; + + extern int qla2x00_loop_reset(scsi_qla_host_t *); + extern void qla2x00_abort_all_cmds(scsi_qla_host_t *, int); +diff --git a/drivers/scsi/qla2xxx/qla_isr.c b/drivers/scsi/qla2xxx/qla_isr.c +index de348628aa53..c509bb23af40 100644 +--- a/drivers/scsi/qla2xxx/qla_isr.c ++++ b/drivers/scsi/qla2xxx/qla_isr.c +@@ -3771,6 +3771,8 @@ void qla24xx_process_response_queue(struct scsi_qla_host *vha, + struct qla_hw_data *ha = vha->hw; + struct purex_entry_24xx *purex_entry; + struct purex_item *pure_item; ++ u16 rsp_in = 0; ++ int follow_inptr, is_shadow_hba; + + if (!ha->flags.fw_started) + return; +@@ -3780,7 +3782,25 @@ void qla24xx_process_response_queue(struct scsi_qla_host *vha, + qla_cpu_update(rsp->qpair, smp_processor_id()); + } + +- while (rsp->ring_ptr->signature != RESPONSE_PROCESSED) { ++#define __update_rsp_in(_update, _is_shadow_hba, _rsp, _rsp_in) \ ++ do { \ ++ if (_update) { \ ++ _rsp_in = _is_shadow_hba ? *(_rsp)->in_ptr : \ ++ rd_reg_dword_relaxed((_rsp)->rsp_q_in); \ ++ } \ ++ } while (0) ++ ++ is_shadow_hba = IS_SHADOW_REG_CAPABLE(ha); ++ follow_inptr = is_shadow_hba ? ql2xrspq_follow_inptr : ++ ql2xrspq_follow_inptr_legacy; ++ ++ __update_rsp_in(follow_inptr, is_shadow_hba, rsp, rsp_in); ++ ++ while ((likely(follow_inptr && ++ rsp->ring_index != rsp_in && ++ rsp->ring_ptr->signature != RESPONSE_PROCESSED)) || ++ (!follow_inptr && ++ rsp->ring_ptr->signature != RESPONSE_PROCESSED)) { + pkt = (struct sts_entry_24xx *)rsp->ring_ptr; + + rsp->ring_index++; +@@ -3893,6 +3913,8 @@ void qla24xx_process_response_queue(struct scsi_qla_host *vha, + } + pure_item = qla27xx_copy_fpin_pkt(vha, + (void **)&pkt, &rsp); ++ __update_rsp_in(follow_inptr, is_shadow_hba, ++ rsp, rsp_in); + if (!pure_item) + break; + qla24xx_queue_purex_item(vha, pure_item, +diff --git a/drivers/scsi/qla2xxx/qla_os.c b/drivers/scsi/qla2xxx/qla_os.c +index f9ad0847782d..3bbfce3ccf2e 100644 +--- a/drivers/scsi/qla2xxx/qla_os.c ++++ b/drivers/scsi/qla2xxx/qla_os.c +@@ -338,6 +338,16 @@ module_param(ql2xdelay_before_pci_error_handling, uint, 0644); + MODULE_PARM_DESC(ql2xdelay_before_pci_error_handling, + "Number of seconds delayed before qla begin PCI error self-handling (default: 5).\n"); + ++int ql2xrspq_follow_inptr = 1; ++module_param(ql2xrspq_follow_inptr, int, 0644); ++MODULE_PARM_DESC(ql2xrspq_follow_inptr, ++ "Follow RSP IN pointer for RSP updates for HBAs 27xx and newer (default: 1)."); ++ ++int ql2xrspq_follow_inptr_legacy = 1; ++module_param(ql2xrspq_follow_inptr_legacy, int, 0644); ++MODULE_PARM_DESC(ql2xrspq_follow_inptr_legacy, ++ "Follow RSP IN pointer for RSP updates for HBAs older than 27XX. 
(default: 1)."); ++ + static void qla2x00_clear_drv_active(struct qla_hw_data *); + static void qla2x00_free_device(scsi_qla_host_t *); + static int qla2xxx_map_queues(struct Scsi_Host *shost); +-- +2.35.1 + diff --git a/queue-5.18/scsi-qla2xxx-turn-off-multi-queue-for-8g-adapters.patch-18430 b/queue-5.18/scsi-qla2xxx-turn-off-multi-queue-for-8g-adapters.patch-18430 new file mode 100644 index 00000000000..710fef0aff4 --- /dev/null +++ b/queue-5.18/scsi-qla2xxx-turn-off-multi-queue-for-8g-adapters.patch-18430 @@ -0,0 +1,68 @@ +From 6467d2dbfbfe7cc5abe98d997e428cbfb58354d9 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 15 Jun 2022 22:35:01 -0700 +Subject: scsi: qla2xxx: Turn off multi-queue for 8G adapters + +From: Quinn Tran + +[ Upstream commit 5304673bdb1635e27555bd636fd5d6956f1cd552 ] + +For 8G adapters, multi-queue was enabled accidentally. Make sure +multi-queue is not enabled. + +Link: https://lore.kernel.org/r/20220616053508.27186-5-njavali@marvell.com +Cc: stable@vger.kernel.org +Signed-off-by: Quinn Tran +Signed-off-by: Nilesh Javali +Signed-off-by: Martin K. Petersen +Signed-off-by: Sasha Levin +--- + drivers/scsi/qla2xxx/qla_def.h | 4 ++-- + drivers/scsi/qla2xxx/qla_isr.c | 16 ++++++---------- + 2 files changed, 8 insertions(+), 12 deletions(-) + +diff --git a/drivers/scsi/qla2xxx/qla_def.h b/drivers/scsi/qla2xxx/qla_def.h +index 4cbaea4b993e..01cdd5f8723c 100644 +--- a/drivers/scsi/qla2xxx/qla_def.h ++++ b/drivers/scsi/qla2xxx/qla_def.h +@@ -4268,8 +4268,8 @@ struct qla_hw_data { + #define IS_OEM_001(ha) ((ha)->device_type & DT_OEM_001) + #define HAS_EXTENDED_IDS(ha) ((ha)->device_type & DT_EXTENDED_IDS) + #define IS_CT6_SUPPORTED(ha) ((ha)->device_type & DT_CT6_SUPPORTED) +-#define IS_MQUE_CAPABLE(ha) ((ha)->mqenable || IS_QLA83XX(ha) || \ +- IS_QLA27XX(ha) || IS_QLA28XX(ha)) ++#define IS_MQUE_CAPABLE(ha) (IS_QLA83XX(ha) || IS_QLA27XX(ha) || \ ++ IS_QLA28XX(ha)) + #define IS_BIDI_CAPABLE(ha) \ + (IS_QLA25XX(ha) || IS_QLA2031(ha) || IS_QLA27XX(ha) || IS_QLA28XX(ha)) + /* Bit 21 of fw_attributes decides the MCTP capabilities */ +diff --git a/drivers/scsi/qla2xxx/qla_isr.c b/drivers/scsi/qla2xxx/qla_isr.c +index 5e99f559230f..de348628aa53 100644 +--- a/drivers/scsi/qla2xxx/qla_isr.c ++++ b/drivers/scsi/qla2xxx/qla_isr.c +@@ -4419,16 +4419,12 @@ qla24xx_enable_msix(struct qla_hw_data *ha, struct rsp_que *rsp) + } + + /* Enable MSI-X vector for response queue update for queue 0 */ +- if (IS_QLA83XX(ha) || IS_QLA27XX(ha) || IS_QLA28XX(ha)) { +- if (ha->msixbase && ha->mqiobase && +- (ha->max_rsp_queues > 1 || ha->max_req_queues > 1 || +- ql2xmqsupport)) +- ha->mqenable = 1; +- } else +- if (ha->mqiobase && +- (ha->max_rsp_queues > 1 || ha->max_req_queues > 1 || +- ql2xmqsupport)) +- ha->mqenable = 1; ++ if (IS_MQUE_CAPABLE(ha) && ++ (ha->msixbase && ha->mqiobase && ha->max_qpairs)) ++ ha->mqenable = 1; ++ else ++ ha->mqenable = 0; ++ + ql_dbg(ql_dbg_multiq, vha, 0xc005, + "mqiobase=%p, max_rsp_queues=%d, max_req_queues=%d.\n", + ha->mqiobase, ha->max_rsp_queues, ha->max_req_queues); +-- +2.35.1 + diff --git a/queue-5.18/scsi-qla2xxx-update-manufacturer-details.patch b/queue-5.18/scsi-qla2xxx-update-manufacturer-details.patch new file mode 100644 index 00000000000..74b239db3d5 --- /dev/null +++ b/queue-5.18/scsi-qla2xxx-update-manufacturer-details.patch @@ -0,0 +1,52 @@ +From fc0719b2de5e78eb7fd2b53cd1e1f8a06e81653f Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 12 Jul 2022 22:20:44 -0700 +Subject: scsi: qla2xxx: Update manufacturer details + +From: Bikash Hazarika + 
+[ Upstream commit 1ccad27716ecad1fd58c35e579bedb81fa5e1ad5 ] + +Update manufacturer details to indicate Marvell Semiconductors. + +Link: https://lore.kernel.org/r/20220713052045.10683-10-njavali@marvell.com +Cc: stable@vger.kernel.org +Reviewed-by: Himanshu Madhani +Signed-off-by: Bikash Hazarika +Signed-off-by: Nilesh Javali +Signed-off-by: Martin K. Petersen +Signed-off-by: Sasha Levin +--- + drivers/scsi/qla2xxx/qla_def.h | 2 +- + drivers/scsi/qla2xxx/qla_gs.c | 2 +- + 2 files changed, 2 insertions(+), 2 deletions(-) + +diff --git a/drivers/scsi/qla2xxx/qla_def.h b/drivers/scsi/qla2xxx/qla_def.h +index 01cdd5f8723c..17b8a4e86ffb 100644 +--- a/drivers/scsi/qla2xxx/qla_def.h ++++ b/drivers/scsi/qla2xxx/qla_def.h +@@ -78,7 +78,7 @@ typedef union { + #include "qla_nvme.h" + #define QLA2XXX_DRIVER_NAME "qla2xxx" + #define QLA2XXX_APIDEV "ql2xapidev" +-#define QLA2XXX_MANUFACTURER "QLogic Corporation" ++#define QLA2XXX_MANUFACTURER "Marvell Semiconductor, Inc." + + /* + * We have MAILBOX_REGISTER_COUNT sized arrays in a few places, +diff --git a/drivers/scsi/qla2xxx/qla_gs.c b/drivers/scsi/qla2xxx/qla_gs.c +index 7ca734337000..64ab070b8716 100644 +--- a/drivers/scsi/qla2xxx/qla_gs.c ++++ b/drivers/scsi/qla2xxx/qla_gs.c +@@ -1616,7 +1616,7 @@ qla2x00_hba_attributes(scsi_qla_host_t *vha, void *entries, + eiter->type = cpu_to_be16(FDMI_HBA_MANUFACTURER); + alen = scnprintf( + eiter->a.manufacturer, sizeof(eiter->a.manufacturer), +- "%s", "QLogic Corporation"); ++ "%s", QLA2XXX_MANUFACTURER); + alen += FDMI_ATTR_ALIGNMENT(alen); + alen += FDMI_ATTR_TYPELEN(eiter); + eiter->len = cpu_to_be16(alen); +-- +2.35.1 + diff --git a/queue-5.18/scsi-qla2xxx-wind-down-adapter-after-pcie-error.patch-27996 b/queue-5.18/scsi-qla2xxx-wind-down-adapter-after-pcie-error.patch-27996 new file mode 100644 index 00000000000..978938a28cb --- /dev/null +++ b/queue-5.18/scsi-qla2xxx-wind-down-adapter-after-pcie-error.patch-27996 @@ -0,0 +1,210 @@ +From 7d35e2215d13472f85fd7000bc0f76847bc4d08e Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 15 Jun 2022 22:35:00 -0700 +Subject: scsi: qla2xxx: Wind down adapter after PCIe error + +From: Quinn Tran + +[ Upstream commit d3117c83ba316b3200d9f2fe900f2b9a5525a25c ] + +Put adapter into a wind down state if OS does not make any attempt to +recover the adapter after PCIe error. + +Link: https://lore.kernel.org/r/20220616053508.27186-4-njavali@marvell.com +Cc: stable@vger.kernel.org +Signed-off-by: Quinn Tran +Signed-off-by: Nilesh Javali +Signed-off-by: Martin K. Petersen +Signed-off-by: Sasha Levin +--- + drivers/scsi/qla2xxx/qla_bsg.c | 10 ++++++- + drivers/scsi/qla2xxx/qla_def.h | 4 +++ + drivers/scsi/qla2xxx/qla_init.c | 20 ++++++++++++++ + drivers/scsi/qla2xxx/qla_os.c | 48 +++++++++++++++++++++++++++++++++ + 4 files changed, 81 insertions(+), 1 deletion(-) + +diff --git a/drivers/scsi/qla2xxx/qla_bsg.c b/drivers/scsi/qla2xxx/qla_bsg.c +index c2f00f076f79..726af9e40572 100644 +--- a/drivers/scsi/qla2xxx/qla_bsg.c ++++ b/drivers/scsi/qla2xxx/qla_bsg.c +@@ -2975,6 +2975,13 @@ qla24xx_bsg_timeout(struct bsg_job *bsg_job) + + ql_log(ql_log_info, vha, 0x708b, "%s CMD timeout. 
bsg ptr %p.\n", + __func__, bsg_job); ++ ++ if (qla2x00_isp_reg_stat(ha)) { ++ ql_log(ql_log_info, vha, 0x9007, ++ "PCI/Register disconnect.\n"); ++ qla_pci_set_eeh_busy(vha); ++ } ++ + /* find the bsg job from the active list of commands */ + spin_lock_irqsave(&ha->hardware_lock, flags); + for (que = 0; que < ha->max_req_queues; que++) { +@@ -2992,7 +2999,8 @@ qla24xx_bsg_timeout(struct bsg_job *bsg_job) + sp->u.bsg_job == bsg_job) { + req->outstanding_cmds[cnt] = NULL; + spin_unlock_irqrestore(&ha->hardware_lock, flags); +- if (ha->isp_ops->abort_command(sp)) { ++ ++ if (!ha->flags.eeh_busy && ha->isp_ops->abort_command(sp)) { + ql_log(ql_log_warn, vha, 0x7089, + "mbx abort_command failed.\n"); + bsg_reply->result = -EIO; +diff --git a/drivers/scsi/qla2xxx/qla_def.h b/drivers/scsi/qla2xxx/qla_def.h +index 4062d46f33a6..4cbaea4b993e 100644 +--- a/drivers/scsi/qla2xxx/qla_def.h ++++ b/drivers/scsi/qla2xxx/qla_def.h +@@ -4048,6 +4048,9 @@ struct qla_hw_data { + uint32_t n2n_fw_acc_sec:1; + uint32_t plogi_template_valid:1; + uint32_t port_isolated:1; ++ uint32_t eeh_flush:2; ++#define EEH_FLUSH_RDY 1 ++#define EEH_FLUSH_DONE 2 + } flags; + + uint16_t max_exchg; +@@ -4082,6 +4085,7 @@ struct qla_hw_data { + uint32_t rsp_que_len; + uint32_t req_que_off; + uint32_t rsp_que_off; ++ unsigned long eeh_jif; + + /* Multi queue data structs */ + device_reg_t *mqiobase; +diff --git a/drivers/scsi/qla2xxx/qla_init.c b/drivers/scsi/qla2xxx/qla_init.c +index 7bd10b4ed9ed..a0d26e2e0ce1 100644 +--- a/drivers/scsi/qla2xxx/qla_init.c ++++ b/drivers/scsi/qla2xxx/qla_init.c +@@ -47,6 +47,7 @@ qla2x00_sp_timeout(struct timer_list *t) + { + srb_t *sp = from_timer(sp, t, u.iocb_cmd.timer); + struct srb_iocb *iocb; ++ scsi_qla_host_t *vha = sp->vha; + + WARN_ON(irqs_disabled()); + iocb = &sp->u.iocb_cmd; +@@ -54,6 +55,12 @@ qla2x00_sp_timeout(struct timer_list *t) + + /* ref: TMR */ + kref_put(&sp->cmd_kref, qla2x00_sp_release); ++ ++ if (vha && qla2x00_isp_reg_stat(vha->hw)) { ++ ql_log(ql_log_info, vha, 0x9008, ++ "PCI/Register disconnect.\n"); ++ qla_pci_set_eeh_busy(vha); ++ } + } + + void qla2x00_sp_free(srb_t *sp) +@@ -9669,6 +9676,12 @@ int qla2xxx_disable_port(struct Scsi_Host *host) + + vha->hw->flags.port_isolated = 1; + ++ if (qla2x00_isp_reg_stat(vha->hw)) { ++ ql_log(ql_log_info, vha, 0x9006, ++ "PCI/Register disconnect, exiting.\n"); ++ qla_pci_set_eeh_busy(vha); ++ return FAILED; ++ } + if (qla2x00_chip_is_down(vha)) + return 0; + +@@ -9684,6 +9697,13 @@ int qla2xxx_enable_port(struct Scsi_Host *host) + { + scsi_qla_host_t *vha = shost_priv(host); + ++ if (qla2x00_isp_reg_stat(vha->hw)) { ++ ql_log(ql_log_info, vha, 0x9001, ++ "PCI/Register disconnect, exiting.\n"); ++ qla_pci_set_eeh_busy(vha); ++ return FAILED; ++ } ++ + vha->hw->flags.port_isolated = 0; + /* Set the flag to 1, so that isp_abort can proceed */ + vha->flags.online = 1; +diff --git a/drivers/scsi/qla2xxx/qla_os.c b/drivers/scsi/qla2xxx/qla_os.c +index 3c68dad00d04..daa9a3c3f7b5 100644 +--- a/drivers/scsi/qla2xxx/qla_os.c ++++ b/drivers/scsi/qla2xxx/qla_os.c +@@ -333,6 +333,11 @@ MODULE_PARM_DESC(ql2xabts_wait_nvme, + "To wait for ABTS response on I/O timeouts for NVMe. 
(default: 1)"); + + ++u32 ql2xdelay_before_pci_error_handling = 5; ++module_param(ql2xdelay_before_pci_error_handling, uint, 0644); ++MODULE_PARM_DESC(ql2xdelay_before_pci_error_handling, ++ "Number of seconds delayed before qla begin PCI error self-handling (default: 5).\n"); ++ + static void qla2x00_clear_drv_active(struct qla_hw_data *); + static void qla2x00_free_device(scsi_qla_host_t *); + static int qla2xxx_map_queues(struct Scsi_Host *shost); +@@ -7239,6 +7244,44 @@ static void qla_heart_beat(struct scsi_qla_host *vha, u16 dpc_started) + } + } + ++static void qla_wind_down_chip(scsi_qla_host_t *vha) ++{ ++ struct qla_hw_data *ha = vha->hw; ++ ++ if (!ha->flags.eeh_busy) ++ return; ++ if (ha->pci_error_state) ++ /* system is trying to recover */ ++ return; ++ ++ /* ++ * Current system is not handling PCIE error. At this point, this is ++ * best effort to wind down the adapter. ++ */ ++ if (time_after_eq(jiffies, ha->eeh_jif + ql2xdelay_before_pci_error_handling * HZ) && ++ !ha->flags.eeh_flush) { ++ ql_log(ql_log_info, vha, 0x9009, ++ "PCI Error detected, attempting to reset hardware.\n"); ++ ++ ha->isp_ops->reset_chip(vha); ++ ha->isp_ops->disable_intrs(ha); ++ ++ ha->flags.eeh_flush = EEH_FLUSH_RDY; ++ ha->eeh_jif = jiffies; ++ ++ } else if (ha->flags.eeh_flush == EEH_FLUSH_RDY && ++ time_after_eq(jiffies, ha->eeh_jif + 5 * HZ)) { ++ pci_clear_master(ha->pdev); ++ ++ /* flush all command */ ++ qla2x00_abort_isp_cleanup(vha); ++ ha->flags.eeh_flush = EEH_FLUSH_DONE; ++ ++ ql_log(ql_log_info, vha, 0x900a, ++ "PCI Error handling complete, all IOs aborted.\n"); ++ } ++} ++ + /************************************************************************** + * qla2x00_timer + * +@@ -7262,6 +7305,8 @@ qla2x00_timer(struct timer_list *t) + fc_port_t *fcport = NULL; + + if (ha->flags.eeh_busy) { ++ qla_wind_down_chip(vha); ++ + ql_dbg(ql_dbg_timer, vha, 0x6000, + "EEH = %d, restarting timer.\n", + ha->flags.eeh_busy); +@@ -7842,6 +7887,9 @@ void qla_pci_set_eeh_busy(struct scsi_qla_host *vha) + + spin_lock_irqsave(&base_vha->work_lock, flags); + if (!ha->flags.eeh_busy) { ++ ha->eeh_jif = jiffies; ++ ha->flags.eeh_flush = 0; ++ + ha->flags.eeh_busy = 1; + do_cleanup = true; + } +-- +2.35.1 + diff --git a/queue-5.18/serial-8250-add-proper-clock-handling-for-oxsemi-pci.patch b/queue-5.18/serial-8250-add-proper-clock-handling-for-oxsemi-pci.patch new file mode 100644 index 00000000000..aa0068a3b0b --- /dev/null +++ b/queue-5.18/serial-8250-add-proper-clock-handling-for-oxsemi-pci.patch @@ -0,0 +1,764 @@ +From 6690821f686011456d2aa183db7777be5132b6b0 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 18 Apr 2022 16:27:33 +0100 +Subject: serial: 8250: Add proper clock handling for OxSemi PCIe devices + +From: Maciej W. Rozycki + +[ Upstream commit 366f6c955d4d1a5125ffcd6875ead26a3c7a2a1c ] + +Oxford Semiconductor PCIe (Tornado) 950 serial port devices are driven +by a fixed 62.5MHz clock input derived from the 100MHz PCI Express clock. + +We currently drive the device using its default oversampling rate of 16 +and the clock prescaler disabled, consequently yielding the baud base of +3906250. This base is inadequate for some of the high-speed baud rates +such as 460800bps, for which the closest rate possible can be obtained +by dividing the baud base by 8, yielding the baud rate of 488281.25bps, +which is off by 5.9638%. This is enough for data communication to break +with the remote end talking actual 460800bps, where missed stop bits +have been observed. 
+ +We can do better however, by taking advantage of a reduced oversampling +rate, which can be set to any integer value from 4 to 16 inclusive by +programming the TCR register, and by using the clock prescaler, which +can be set to any value from 1 to 63.875 in increments of 0.125 in the +CPR/CPR2 register pair. The prescaler has to be explicitly enabled +though by setting bit 7 in the MCR or otherwise it is bypassed (in the +enhanced mode that we enable) as if the value of 1 was used. + +Make use of these features then as follows: + +- Set the baud base to 15625000, reflecting the minimum oversampling + rate of 4 with the clock prescaler and divisor both set to 1. + +- Override the `set_mctrl' and set the MCR shadow there so as to have + MCR[7] always set and have the 8250 core propagate these settings. + +- Override the `get_divisor' handler and determine a good combination of + parameters by using a lookup table with predetermined value pairs of + the oversampling rate and the clock prescaler and finding a pair that + divides the input clock such that the quotient, when rounded to the + nearest integer, deviates the least from the exact result. Calculate + the clock divisor accordingly. + + Scale the resulting oversampling rate (only by powers of two) if + possible so as to maximise it, reducing the divisor accordingly, and + avoid a divisor overflow for very low baud rates by scaling the + oversampling rate and/or the prescaler even if that causes some + accuracy loss. + + Also handle the historic spd_cust feature so as to allow one to set + all the three parameters manually to arbitrary values, by keeping the + low 16 bits for the divisor and then putting TCR in bits 19:16 and + CPR/CPR2 in bits 28:20, sanitising the bit pattern supplied such as + to clamp CPR/CPR2 values between 0.000 and 0.875 inclusive to 33.875. + This preserves compatibility with any existing setups, that is where + requesting a custom divisor that only has any bits set among the low + 16 the oversampling rate of 16 and the clock prescaler of 33.875 will + be used as with the original 8250. + + Finally abuse the `frac' argument to store the determined bit patterns + for the TCR, CPR and CPR2 registers. + +- Override the `set_divisor' handler so as to set the TCR, CPR and CPR2 + registers from the `frac' value supplied. Set the divisor as usual. + +With the baud base set to 15625000 and the unsigned 16-bit UART_DIV_MAX +limitation imposed by `serial8250_get_baud_rate' standard baud rates +below 300bps become unavailable in the regular way, e.g. the rate of +200bps requires the baud base to be divided by 78125 and that is beyond +the unsigned 16-bit range. The historic spd_cust feature can still be +used to obtain such rates if so required. + +See Documentation/tty/device_drivers/oxsemi-tornado.rst for more details. + +Signed-off-by: Maciej W. Rozycki +Link: https://lore.kernel.org/r/alpine.DEB.2.21.2204181519450.9383@angie.orcam.me.uk +Signed-off-by: Greg Kroah-Hartman +Signed-off-by: Sasha Levin +--- + .../tty/device_drivers/oxsemi-tornado.rst | 129 +++++++ + drivers/tty/serial/8250/8250_pci.c | 339 ++++++++++++++---- + 2 files changed, 400 insertions(+), 68 deletions(-) + create mode 100644 Documentation/tty/device_drivers/oxsemi-tornado.rst + +diff --git a/Documentation/tty/device_drivers/oxsemi-tornado.rst b/Documentation/tty/device_drivers/oxsemi-tornado.rst +new file mode 100644 +index 000000000000..0180d8bb0881 +--- /dev/null ++++ b/Documentation/tty/device_drivers/oxsemi-tornado.rst +@@ -0,0 +1,129 @@ ++.. 
SPDX-License-Identifier: GPL-2.0 ++ ++==================================================================== ++Notes on Oxford Semiconductor PCIe (Tornado) 950 serial port devices ++==================================================================== ++ ++Oxford Semiconductor PCIe (Tornado) 950 serial port devices are driven ++by a fixed 62.5MHz clock input derived from the 100MHz PCI Express clock. ++ ++The baud rate produced by the baud generator is obtained from this input ++frequency by dividing it by the clock prescaler, which can be set to any ++value from 1 to 63.875 in increments of 0.125, and then the usual 16-bit ++divisor is used as with the original 8250, to divide the frequency by a ++value from 1 to 65535. Finally a programmable oversampling rate is used ++that can take any value from 4 to 16 to divide the frequency further and ++determine the actual baud rate used. Baud rates from 15625000bps down ++to 0.933bps can be obtained this way. ++ ++By default the oversampling rate is set to 16 and the clock prescaler is ++set to 33.875, meaning that the frequency to be used as the reference ++for the usual 16-bit divisor is 115313.653, which is close enough to the ++frequency of 115200 used by the original 8250 for the same values to be ++used for the divisor to obtain the requested baud rates by software that ++is unaware of the extra clock controls available. ++ ++The oversampling rate is programmed with the TCR register and the clock ++prescaler is programmed with the CPR/CPR2 register pair[1][2][3][4]. ++To switch away from the default value of 33.875 for the prescaler the ++the enhanced mode has to be explicitly enabled though, by setting bit 4 ++of the EFR. In that mode setting bit 7 in the MCR enables the prescaler ++or otherwise it is bypassed as if the value of 1 was used. Additionally ++writing any value to CPR clears CPR2 for compatibility with old software ++written for older conventional PCI Oxford Semiconductor devices that do ++not have the extra prescaler's 9th bit in CPR2, so the CPR/CPR2 register ++pair has to be programmed in the right order. ++ ++By using these parameters rates from 15625000bps down to 1bps can be ++obtained, with either exact or highly-accurate actual bit rates for ++standard and many non-standard rates. 
++ ++Here are the figures for the standard and some non-standard baud rates ++(including those quoted in Oxford Semiconductor documentation), giving ++the requested rate (r), the actual rate yielded (a) and its deviation ++from the requested rate (d), and the values of the oversampling rate ++(tcr), the clock prescaler (cpr) and the divisor (div) produced by the ++new `get_divisor' handler: ++ ++r: 15625000, a: 15625000.00, d: 0.0000%, tcr: 4, cpr: 1.000, div: 1 ++r: 12500000, a: 12500000.00, d: 0.0000%, tcr: 5, cpr: 1.000, div: 1 ++r: 10416666, a: 10416666.67, d: 0.0000%, tcr: 6, cpr: 1.000, div: 1 ++r: 8928571, a: 8928571.43, d: 0.0000%, tcr: 7, cpr: 1.000, div: 1 ++r: 7812500, a: 7812500.00, d: 0.0000%, tcr: 8, cpr: 1.000, div: 1 ++r: 4000000, a: 4000000.00, d: 0.0000%, tcr: 5, cpr: 3.125, div: 1 ++r: 3686400, a: 3676470.59, d: -0.2694%, tcr: 8, cpr: 2.125, div: 1 ++r: 3500000, a: 3496503.50, d: -0.0999%, tcr: 13, cpr: 1.375, div: 1 ++r: 3000000, a: 2976190.48, d: -0.7937%, tcr: 14, cpr: 1.500, div: 1 ++r: 2500000, a: 2500000.00, d: 0.0000%, tcr: 10, cpr: 2.500, div: 1 ++r: 2000000, a: 2000000.00, d: 0.0000%, tcr: 10, cpr: 3.125, div: 1 ++r: 1843200, a: 1838235.29, d: -0.2694%, tcr: 16, cpr: 2.125, div: 1 ++r: 1500000, a: 1492537.31, d: -0.4975%, tcr: 5, cpr: 8.375, div: 1 ++r: 1152000, a: 1152073.73, d: 0.0064%, tcr: 14, cpr: 3.875, div: 1 ++r: 921600, a: 919117.65, d: -0.2694%, tcr: 16, cpr: 2.125, div: 2 ++r: 576000, a: 576036.87, d: 0.0064%, tcr: 14, cpr: 3.875, div: 2 ++r: 460800, a: 460829.49, d: 0.0064%, tcr: 7, cpr: 3.875, div: 5 ++r: 230400, a: 230414.75, d: 0.0064%, tcr: 14, cpr: 3.875, div: 5 ++r: 115200, a: 115207.37, d: 0.0064%, tcr: 14, cpr: 1.250, div: 31 ++r: 57600, a: 57603.69, d: 0.0064%, tcr: 8, cpr: 3.875, div: 35 ++r: 38400, a: 38402.46, d: 0.0064%, tcr: 14, cpr: 3.875, div: 30 ++r: 19200, a: 19201.23, d: 0.0064%, tcr: 8, cpr: 3.875, div: 105 ++r: 9600, a: 9600.06, d: 0.0006%, tcr: 9, cpr: 1.125, div: 643 ++r: 4800, a: 4799.98, d: -0.0004%, tcr: 7, cpr: 2.875, div: 647 ++r: 2400, a: 2400.02, d: 0.0008%, tcr: 9, cpr: 2.250, div: 1286 ++r: 1200, a: 1200.00, d: 0.0000%, tcr: 14, cpr: 2.875, div: 1294 ++r: 300, a: 300.00, d: 0.0000%, tcr: 11, cpr: 2.625, div: 7215 ++r: 200, a: 200.00, d: 0.0000%, tcr: 16, cpr: 1.250, div: 15625 ++r: 150, a: 150.00, d: 0.0000%, tcr: 13, cpr: 2.250, div: 14245 ++r: 134, a: 134.00, d: 0.0000%, tcr: 11, cpr: 2.625, div: 16153 ++r: 110, a: 110.00, d: 0.0000%, tcr: 12, cpr: 1.000, div: 47348 ++r: 75, a: 75.00, d: 0.0000%, tcr: 4, cpr: 5.875, div: 35461 ++r: 50, a: 50.00, d: 0.0000%, tcr: 16, cpr: 1.250, div: 62500 ++r: 25, a: 25.00, d: 0.0000%, tcr: 16, cpr: 2.500, div: 62500 ++r: 4, a: 4.00, d: 0.0000%, tcr: 16, cpr: 20.000, div: 48828 ++r: 2, a: 2.00, d: 0.0000%, tcr: 16, cpr: 40.000, div: 48828 ++r: 1, a: 1.00, d: 0.0000%, tcr: 16, cpr: 63.875, div: 61154 ++ ++With the baud base set to 15625000 and the unsigned 16-bit UART_DIV_MAX ++limitation imposed by `serial8250_get_baud_rate' standard baud rates ++below 300bps become unavailable in the regular way, e.g. the rate of ++200bps requires the baud base to be divided by 78125 and that is beyond ++the unsigned 16-bit range. 
The historic spd_cust feature can still be ++used by encoding the values for, the prescaler, the oversampling rate ++and the clock divisor (DLM/DLL) as follows to obtain such rates if so ++required: ++ ++ 31 29 28 20 19 16 15 0 +++-----+-----------------+-------+-------------------------------+ ++|0 0 0| CPR2:CPR | TCR | DLM:DLL | +++-----+-----------------+-------+-------------------------------+ ++ ++Use a value such encoded for the `custom_divisor' field along with the ++ASYNC_SPD_CUST flag set in the `flags' field in `struct serial_struct' ++passed with the TIOCSSERIAL ioctl(2), such as with the setserial(8) ++utility and its `divisor' and `spd_cust' parameters, and the select ++the baud rate of 38400bps. Note that the value of 0 in TCR sets the ++oversampling rate to 16 and prescaler values below 1 in CPR2/CPR are ++clamped by the driver to 1. ++ ++For example the value of 0x1f4004e2 will set CPR2/CPR, TCR and DLM/DLL ++respectively to 0x1f4, 0x0 and 0x04e2, choosing the prescaler value, ++the oversampling rate and the clock divisor of 62.500, 16 and 1250 ++respectively. These parameters will set the baud rate for the serial ++port to 62500000 / 62.500 / 1250 / 16 = 50bps. ++ ++References: ++ ++[1] "OXPCIe200 PCI Express Multi-Port Bridge", Oxford Semiconductor, ++ Inc., DS-0045, 10 Nov 2008, Section "950 Mode", pp. 64-65 ++ ++[2] "OXPCIe952 PCI Express Bridge to Dual Serial & Parallel Port", ++ Oxford Semiconductor, Inc., DS-0046, Mar 06 08, Section "950 Mode", ++ p. 20 ++ ++[3] "OXPCIe954 PCI Express Bridge to Quad Serial Port", Oxford ++ Semiconductor, Inc., DS-0047, Feb 08, Section "950 Mode", p. 20 ++ ++[4] "OXPCIe958 PCI Express Bridge to Octal Serial Port", Oxford ++ Semiconductor, Inc., DS-0048, Feb 08, Section "950 Mode", p. 20 ++ ++Maciej W. Rozycki +diff --git a/drivers/tty/serial/8250/8250_pci.c b/drivers/tty/serial/8250/8250_pci.c +index 4b0e84e01e55..818ed6cd3132 100644 +--- a/drivers/tty/serial/8250/8250_pci.c ++++ b/drivers/tty/serial/8250/8250_pci.c +@@ -11,6 +11,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -1044,6 +1045,208 @@ static int pci_oxsemi_tornado_init(struct pci_dev *dev) + return number_uarts; + } + ++/* Tornado-specific constants for the TCR and CPR registers; see below. */ ++#define OXSEMI_TORNADO_TCR_MASK 0xf ++#define OXSEMI_TORNADO_CPR_MASK 0x1ff ++#define OXSEMI_TORNADO_CPR_MIN 0x008 ++#define OXSEMI_TORNADO_CPR_DEF 0x10f ++ ++/* ++ * Determine the oversampling rate, the clock prescaler, and the clock ++ * divisor for the requested baud rate. The clock rate is 62.5 MHz, ++ * which is four times the baud base, and the prescaler increments in ++ * steps of 1/8. Therefore to make calculations on integers we need ++ * to use a scaled clock rate, which is the baud base multiplied by 32 ++ * (or our assumed UART clock rate multiplied by 2). ++ * ++ * The allowed oversampling rates are from 4 up to 16 inclusive (values ++ * from 0 to 3 inclusive map to 16). Likewise the clock prescaler allows ++ * values between 1.000 and 63.875 inclusive (operation for values from ++ * 0.000 to 0.875 has not been specified). The clock divisor is the usual ++ * unsigned 16-bit integer. ++ * ++ * For the most accurate baud rate we use a table of predetermined ++ * oversampling rates and clock prescalers that records all possible ++ * products of the two parameters in the range from 4 up to 255 inclusive, ++ * and additionally 335 for the 1500000bps rate, with the prescaler scaled ++ * by 8. 
The table is sorted by the decreasing value of the oversampling ++ * rate and ties are resolved by sorting by the decreasing value of the ++ * product. This way preference is given to higher oversampling rates. ++ * ++ * We iterate over the table and choose the product of an oversampling ++ * rate and a clock prescaler that gives the lowest integer division ++ * result deviation, or if an exact integer divider is found we stop ++ * looking for it right away. We do some fixup if the resulting clock ++ * divisor required would be out of its unsigned 16-bit integer range. ++ * ++ * Finally we abuse the supposed fractional part returned to encode the ++ * 4-bit value of the oversampling rate and the 9-bit value of the clock ++ * prescaler which will end up in the TCR and CPR/CPR2 registers. ++ */ ++static unsigned int pci_oxsemi_tornado_get_divisor(struct uart_port *port, ++ unsigned int baud, ++ unsigned int *frac) ++{ ++ static u8 p[][2] = { ++ { 16, 14, }, { 16, 13, }, { 16, 12, }, { 16, 11, }, ++ { 16, 10, }, { 16, 9, }, { 16, 8, }, { 15, 17, }, ++ { 15, 16, }, { 15, 15, }, { 15, 14, }, { 15, 13, }, ++ { 15, 12, }, { 15, 11, }, { 15, 10, }, { 15, 9, }, ++ { 15, 8, }, { 14, 18, }, { 14, 17, }, { 14, 14, }, ++ { 14, 13, }, { 14, 12, }, { 14, 11, }, { 14, 10, }, ++ { 14, 9, }, { 14, 8, }, { 13, 19, }, { 13, 18, }, ++ { 13, 17, }, { 13, 13, }, { 13, 12, }, { 13, 11, }, ++ { 13, 10, }, { 13, 9, }, { 13, 8, }, { 12, 19, }, ++ { 12, 18, }, { 12, 17, }, { 12, 11, }, { 12, 9, }, ++ { 12, 8, }, { 11, 23, }, { 11, 22, }, { 11, 21, }, ++ { 11, 20, }, { 11, 19, }, { 11, 18, }, { 11, 17, }, ++ { 11, 11, }, { 11, 10, }, { 11, 9, }, { 11, 8, }, ++ { 10, 25, }, { 10, 23, }, { 10, 20, }, { 10, 19, }, ++ { 10, 17, }, { 10, 10, }, { 10, 9, }, { 10, 8, }, ++ { 9, 27, }, { 9, 23, }, { 9, 21, }, { 9, 19, }, ++ { 9, 18, }, { 9, 17, }, { 9, 9, }, { 9, 8, }, ++ { 8, 31, }, { 8, 29, }, { 8, 23, }, { 8, 19, }, ++ { 8, 17, }, { 8, 8, }, { 7, 35, }, { 7, 31, }, ++ { 7, 29, }, { 7, 25, }, { 7, 23, }, { 7, 21, }, ++ { 7, 19, }, { 7, 17, }, { 7, 15, }, { 7, 14, }, ++ { 7, 13, }, { 7, 12, }, { 7, 11, }, { 7, 10, }, ++ { 7, 9, }, { 7, 8, }, { 6, 41, }, { 6, 37, }, ++ { 6, 31, }, { 6, 29, }, { 6, 23, }, { 6, 19, }, ++ { 6, 17, }, { 6, 13, }, { 6, 11, }, { 6, 10, }, ++ { 6, 9, }, { 6, 8, }, { 5, 67, }, { 5, 47, }, ++ { 5, 43, }, { 5, 41, }, { 5, 37, }, { 5, 31, }, ++ { 5, 29, }, { 5, 25, }, { 5, 23, }, { 5, 19, }, ++ { 5, 17, }, { 5, 15, }, { 5, 13, }, { 5, 11, }, ++ { 5, 10, }, { 5, 9, }, { 5, 8, }, { 4, 61, }, ++ { 4, 59, }, { 4, 53, }, { 4, 47, }, { 4, 43, }, ++ { 4, 41, }, { 4, 37, }, { 4, 31, }, { 4, 29, }, ++ { 4, 23, }, { 4, 19, }, { 4, 17, }, { 4, 13, }, ++ { 4, 9, }, { 4, 8, }, ++ }; ++ /* Scale the quotient for comparison to get the fractional part. */ ++ const unsigned int quot_scale = 65536; ++ unsigned int sclk = port->uartclk * 2; ++ unsigned int sdiv = DIV_ROUND_CLOSEST(sclk, baud); ++ unsigned int best_squot; ++ unsigned int squot; ++ unsigned int quot; ++ u16 cpr; ++ u8 tcr; ++ int i; ++ ++ /* Old custom speed handling. 
*/ ++ if (baud == 38400 && (port->flags & UPF_SPD_MASK) == UPF_SPD_CUST) { ++ unsigned int cust_div = port->custom_divisor; ++ ++ quot = cust_div & UART_DIV_MAX; ++ tcr = (cust_div >> 16) & OXSEMI_TORNADO_TCR_MASK; ++ cpr = (cust_div >> 20) & OXSEMI_TORNADO_CPR_MASK; ++ if (cpr < OXSEMI_TORNADO_CPR_MIN) ++ cpr = OXSEMI_TORNADO_CPR_DEF; ++ } else { ++ best_squot = quot_scale; ++ for (i = 0; i < ARRAY_SIZE(p); i++) { ++ unsigned int spre; ++ unsigned int srem; ++ u8 cp; ++ u8 tc; ++ ++ tc = p[i][0]; ++ cp = p[i][1]; ++ spre = tc * cp; ++ ++ srem = sdiv % spre; ++ if (srem > spre / 2) ++ srem = spre - srem; ++ squot = DIV_ROUND_CLOSEST(srem * quot_scale, spre); ++ ++ if (srem == 0) { ++ tcr = tc; ++ cpr = cp; ++ quot = sdiv / spre; ++ break; ++ } else if (squot < best_squot) { ++ best_squot = squot; ++ tcr = tc; ++ cpr = cp; ++ quot = DIV_ROUND_CLOSEST(sdiv, spre); ++ } ++ } ++ while (tcr <= (OXSEMI_TORNADO_TCR_MASK + 1) >> 1 && ++ quot % 2 == 0) { ++ quot >>= 1; ++ tcr <<= 1; ++ } ++ while (quot > UART_DIV_MAX) { ++ if (tcr <= (OXSEMI_TORNADO_TCR_MASK + 1) >> 1) { ++ quot >>= 1; ++ tcr <<= 1; ++ } else if (cpr <= OXSEMI_TORNADO_CPR_MASK >> 1) { ++ quot >>= 1; ++ cpr <<= 1; ++ } else { ++ quot = quot * cpr / OXSEMI_TORNADO_CPR_MASK; ++ cpr = OXSEMI_TORNADO_CPR_MASK; ++ } ++ } ++ } ++ ++ *frac = (cpr << 8) | (tcr & OXSEMI_TORNADO_TCR_MASK); ++ return quot; ++} ++ ++/* ++ * Set the oversampling rate in the transmitter clock cycle register (TCR), ++ * the clock prescaler in the clock prescaler register (CPR and CPR2), and ++ * the clock divisor in the divisor latch (DLL and DLM). Note that for ++ * backwards compatibility any write to CPR clears CPR2 and therefore CPR ++ * has to be written first, followed by CPR2, which occupies the location ++ * of CKS used with earlier UART designs. ++ */ ++static void pci_oxsemi_tornado_set_divisor(struct uart_port *port, ++ unsigned int baud, ++ unsigned int quot, ++ unsigned int quot_frac) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ u8 cpr2 = quot_frac >> 16; ++ u8 cpr = quot_frac >> 8; ++ u8 tcr = quot_frac; ++ ++ serial_icr_write(up, UART_TCR, tcr); ++ serial_icr_write(up, UART_CPR, cpr); ++ serial_icr_write(up, UART_CKS, cpr2); ++ serial8250_do_set_divisor(port, baud, quot, 0); ++} ++ ++/* ++ * For Tornado devices we force MCR[7] set for the Divide-by-M N/8 baud rate ++ * generator prescaler (CPR and CPR2). Otherwise no prescaler would be used. 
++ */ ++static void pci_oxsemi_tornado_set_mctrl(struct uart_port *port, ++ unsigned int mctrl) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ ++ up->mcr |= UART_MCR_CLKSEL; ++ serial8250_do_set_mctrl(port, mctrl); ++} ++ ++static int pci_oxsemi_tornado_setup(struct serial_private *priv, ++ const struct pciserial_board *board, ++ struct uart_8250_port *up, int idx) ++{ ++ struct pci_dev *dev = priv->dev; ++ ++ if (pci_oxsemi_tornado_p(dev)) { ++ up->port.get_divisor = pci_oxsemi_tornado_get_divisor; ++ up->port.set_divisor = pci_oxsemi_tornado_set_divisor; ++ up->port.set_mctrl = pci_oxsemi_tornado_set_mctrl; ++ } ++ ++ return pci_default_setup(priv, board, up, idx); ++} ++ + static int pci_asix_setup(struct serial_private *priv, + const struct pciserial_board *board, + struct uart_8250_port *port, int idx) +@@ -2245,7 +2448,7 @@ static struct pci_serial_quirk pci_serial_quirks[] = { + .subvendor = PCI_ANY_ID, + .subdevice = PCI_ANY_ID, + .init = pci_oxsemi_tornado_init, +- .setup = pci_default_setup, ++ .setup = pci_oxsemi_tornado_setup, + }, + { + .vendor = PCI_VENDOR_ID_MAINPINE, +@@ -2253,7 +2456,7 @@ static struct pci_serial_quirk pci_serial_quirks[] = { + .subvendor = PCI_ANY_ID, + .subdevice = PCI_ANY_ID, + .init = pci_oxsemi_tornado_init, +- .setup = pci_default_setup, ++ .setup = pci_oxsemi_tornado_setup, + }, + { + .vendor = PCI_VENDOR_ID_DIGI, +@@ -2261,7 +2464,7 @@ static struct pci_serial_quirk pci_serial_quirks[] = { + .subvendor = PCI_SUBVENDOR_ID_IBM, + .subdevice = PCI_ANY_ID, + .init = pci_oxsemi_tornado_init, +- .setup = pci_default_setup, ++ .setup = pci_oxsemi_tornado_setup, + }, + { + .vendor = PCI_VENDOR_ID_INTEL, +@@ -2578,7 +2781,7 @@ enum pci_board_num_t { + pbn_b0_2_1843200, + pbn_b0_4_1843200, + +- pbn_b0_1_3906250, ++ pbn_b0_1_15625000, + + pbn_b0_bt_1_115200, + pbn_b0_bt_2_115200, +@@ -2657,10 +2860,10 @@ enum pci_board_num_t { + pbn_panacom4, + pbn_plx_romulus, + pbn_oxsemi, +- pbn_oxsemi_1_3906250, +- pbn_oxsemi_2_3906250, +- pbn_oxsemi_4_3906250, +- pbn_oxsemi_8_3906250, ++ pbn_oxsemi_1_15625000, ++ pbn_oxsemi_2_15625000, ++ pbn_oxsemi_4_15625000, ++ pbn_oxsemi_8_15625000, + pbn_intel_i960, + pbn_sgi_ioc3, + pbn_computone_4, +@@ -2803,10 +3006,10 @@ static struct pciserial_board pci_boards[] = { + .uart_offset = 8, + }, + +- [pbn_b0_1_3906250] = { ++ [pbn_b0_1_15625000] = { + .flags = FL_BASE0, + .num_ports = 1, +- .base_baud = 3906250, ++ .base_baud = 15625000, + .uart_offset = 8, + }, + +@@ -3187,31 +3390,31 @@ static struct pciserial_board pci_boards[] = { + .base_baud = 115200, + .uart_offset = 8, + }, +- [pbn_oxsemi_1_3906250] = { ++ [pbn_oxsemi_1_15625000] = { + .flags = FL_BASE0, + .num_ports = 1, +- .base_baud = 3906250, ++ .base_baud = 15625000, + .uart_offset = 0x200, + .first_offset = 0x1000, + }, +- [pbn_oxsemi_2_3906250] = { ++ [pbn_oxsemi_2_15625000] = { + .flags = FL_BASE0, + .num_ports = 2, +- .base_baud = 3906250, ++ .base_baud = 15625000, + .uart_offset = 0x200, + .first_offset = 0x1000, + }, +- [pbn_oxsemi_4_3906250] = { ++ [pbn_oxsemi_4_15625000] = { + .flags = FL_BASE0, + .num_ports = 4, +- .base_baud = 3906250, ++ .base_baud = 15625000, + .uart_offset = 0x200, + .first_offset = 0x1000, + }, +- [pbn_oxsemi_8_3906250] = { ++ [pbn_oxsemi_8_15625000] = { + .flags = FL_BASE0, + .num_ports = 8, +- .base_baud = 3906250, ++ .base_baud = 15625000, + .uart_offset = 0x200, + .first_offset = 0x1000, + }, +@@ -4192,165 +4395,165 @@ static const struct pci_device_id serial_pci_tbl[] = { + */ + { PCI_VENDOR_ID_OXSEMI, 0xc101, /* OXPCIe952 
1 Legacy UART */ + PCI_ANY_ID, PCI_ANY_ID, 0, 0, +- pbn_b0_1_3906250 }, ++ pbn_b0_1_15625000 }, + { PCI_VENDOR_ID_OXSEMI, 0xc105, /* OXPCIe952 1 Legacy UART */ + PCI_ANY_ID, PCI_ANY_ID, 0, 0, +- pbn_b0_1_3906250 }, ++ pbn_b0_1_15625000 }, + { PCI_VENDOR_ID_OXSEMI, 0xc11b, /* OXPCIe952 1 Native UART */ + PCI_ANY_ID, PCI_ANY_ID, 0, 0, +- pbn_oxsemi_1_3906250 }, ++ pbn_oxsemi_1_15625000 }, + { PCI_VENDOR_ID_OXSEMI, 0xc11f, /* OXPCIe952 1 Native UART */ + PCI_ANY_ID, PCI_ANY_ID, 0, 0, +- pbn_oxsemi_1_3906250 }, ++ pbn_oxsemi_1_15625000 }, + { PCI_VENDOR_ID_OXSEMI, 0xc120, /* OXPCIe952 1 Legacy UART */ + PCI_ANY_ID, PCI_ANY_ID, 0, 0, +- pbn_b0_1_3906250 }, ++ pbn_b0_1_15625000 }, + { PCI_VENDOR_ID_OXSEMI, 0xc124, /* OXPCIe952 1 Legacy UART */ + PCI_ANY_ID, PCI_ANY_ID, 0, 0, +- pbn_b0_1_3906250 }, ++ pbn_b0_1_15625000 }, + { PCI_VENDOR_ID_OXSEMI, 0xc138, /* OXPCIe952 1 Native UART */ + PCI_ANY_ID, PCI_ANY_ID, 0, 0, +- pbn_oxsemi_1_3906250 }, ++ pbn_oxsemi_1_15625000 }, + { PCI_VENDOR_ID_OXSEMI, 0xc13d, /* OXPCIe952 1 Native UART */ + PCI_ANY_ID, PCI_ANY_ID, 0, 0, +- pbn_oxsemi_1_3906250 }, ++ pbn_oxsemi_1_15625000 }, + { PCI_VENDOR_ID_OXSEMI, 0xc140, /* OXPCIe952 1 Legacy UART */ + PCI_ANY_ID, PCI_ANY_ID, 0, 0, +- pbn_b0_1_3906250 }, ++ pbn_b0_1_15625000 }, + { PCI_VENDOR_ID_OXSEMI, 0xc141, /* OXPCIe952 1 Legacy UART */ + PCI_ANY_ID, PCI_ANY_ID, 0, 0, +- pbn_b0_1_3906250 }, ++ pbn_b0_1_15625000 }, + { PCI_VENDOR_ID_OXSEMI, 0xc144, /* OXPCIe952 1 Legacy UART */ + PCI_ANY_ID, PCI_ANY_ID, 0, 0, +- pbn_b0_1_3906250 }, ++ pbn_b0_1_15625000 }, + { PCI_VENDOR_ID_OXSEMI, 0xc145, /* OXPCIe952 1 Legacy UART */ + PCI_ANY_ID, PCI_ANY_ID, 0, 0, +- pbn_b0_1_3906250 }, ++ pbn_b0_1_15625000 }, + { PCI_VENDOR_ID_OXSEMI, 0xc158, /* OXPCIe952 2 Native UART */ + PCI_ANY_ID, PCI_ANY_ID, 0, 0, +- pbn_oxsemi_2_3906250 }, ++ pbn_oxsemi_2_15625000 }, + { PCI_VENDOR_ID_OXSEMI, 0xc15d, /* OXPCIe952 2 Native UART */ + PCI_ANY_ID, PCI_ANY_ID, 0, 0, +- pbn_oxsemi_2_3906250 }, ++ pbn_oxsemi_2_15625000 }, + { PCI_VENDOR_ID_OXSEMI, 0xc208, /* OXPCIe954 4 Native UART */ + PCI_ANY_ID, PCI_ANY_ID, 0, 0, +- pbn_oxsemi_4_3906250 }, ++ pbn_oxsemi_4_15625000 }, + { PCI_VENDOR_ID_OXSEMI, 0xc20d, /* OXPCIe954 4 Native UART */ + PCI_ANY_ID, PCI_ANY_ID, 0, 0, +- pbn_oxsemi_4_3906250 }, ++ pbn_oxsemi_4_15625000 }, + { PCI_VENDOR_ID_OXSEMI, 0xc308, /* OXPCIe958 8 Native UART */ + PCI_ANY_ID, PCI_ANY_ID, 0, 0, +- pbn_oxsemi_8_3906250 }, ++ pbn_oxsemi_8_15625000 }, + { PCI_VENDOR_ID_OXSEMI, 0xc30d, /* OXPCIe958 8 Native UART */ + PCI_ANY_ID, PCI_ANY_ID, 0, 0, +- pbn_oxsemi_8_3906250 }, ++ pbn_oxsemi_8_15625000 }, + { PCI_VENDOR_ID_OXSEMI, 0xc40b, /* OXPCIe200 1 Native UART */ + PCI_ANY_ID, PCI_ANY_ID, 0, 0, +- pbn_oxsemi_1_3906250 }, ++ pbn_oxsemi_1_15625000 }, + { PCI_VENDOR_ID_OXSEMI, 0xc40f, /* OXPCIe200 1 Native UART */ + PCI_ANY_ID, PCI_ANY_ID, 0, 0, +- pbn_oxsemi_1_3906250 }, ++ pbn_oxsemi_1_15625000 }, + { PCI_VENDOR_ID_OXSEMI, 0xc41b, /* OXPCIe200 1 Native UART */ + PCI_ANY_ID, PCI_ANY_ID, 0, 0, +- pbn_oxsemi_1_3906250 }, ++ pbn_oxsemi_1_15625000 }, + { PCI_VENDOR_ID_OXSEMI, 0xc41f, /* OXPCIe200 1 Native UART */ + PCI_ANY_ID, PCI_ANY_ID, 0, 0, +- pbn_oxsemi_1_3906250 }, ++ pbn_oxsemi_1_15625000 }, + { PCI_VENDOR_ID_OXSEMI, 0xc42b, /* OXPCIe200 1 Native UART */ + PCI_ANY_ID, PCI_ANY_ID, 0, 0, +- pbn_oxsemi_1_3906250 }, ++ pbn_oxsemi_1_15625000 }, + { PCI_VENDOR_ID_OXSEMI, 0xc42f, /* OXPCIe200 1 Native UART */ + PCI_ANY_ID, PCI_ANY_ID, 0, 0, +- pbn_oxsemi_1_3906250 }, ++ pbn_oxsemi_1_15625000 }, + { PCI_VENDOR_ID_OXSEMI, 0xc43b, /* 
OXPCIe200 1 Native UART */ + PCI_ANY_ID, PCI_ANY_ID, 0, 0, +- pbn_oxsemi_1_3906250 }, ++ pbn_oxsemi_1_15625000 }, + { PCI_VENDOR_ID_OXSEMI, 0xc43f, /* OXPCIe200 1 Native UART */ + PCI_ANY_ID, PCI_ANY_ID, 0, 0, +- pbn_oxsemi_1_3906250 }, ++ pbn_oxsemi_1_15625000 }, + { PCI_VENDOR_ID_OXSEMI, 0xc44b, /* OXPCIe200 1 Native UART */ + PCI_ANY_ID, PCI_ANY_ID, 0, 0, +- pbn_oxsemi_1_3906250 }, ++ pbn_oxsemi_1_15625000 }, + { PCI_VENDOR_ID_OXSEMI, 0xc44f, /* OXPCIe200 1 Native UART */ + PCI_ANY_ID, PCI_ANY_ID, 0, 0, +- pbn_oxsemi_1_3906250 }, ++ pbn_oxsemi_1_15625000 }, + { PCI_VENDOR_ID_OXSEMI, 0xc45b, /* OXPCIe200 1 Native UART */ + PCI_ANY_ID, PCI_ANY_ID, 0, 0, +- pbn_oxsemi_1_3906250 }, ++ pbn_oxsemi_1_15625000 }, + { PCI_VENDOR_ID_OXSEMI, 0xc45f, /* OXPCIe200 1 Native UART */ + PCI_ANY_ID, PCI_ANY_ID, 0, 0, +- pbn_oxsemi_1_3906250 }, ++ pbn_oxsemi_1_15625000 }, + { PCI_VENDOR_ID_OXSEMI, 0xc46b, /* OXPCIe200 1 Native UART */ + PCI_ANY_ID, PCI_ANY_ID, 0, 0, +- pbn_oxsemi_1_3906250 }, ++ pbn_oxsemi_1_15625000 }, + { PCI_VENDOR_ID_OXSEMI, 0xc46f, /* OXPCIe200 1 Native UART */ + PCI_ANY_ID, PCI_ANY_ID, 0, 0, +- pbn_oxsemi_1_3906250 }, ++ pbn_oxsemi_1_15625000 }, + { PCI_VENDOR_ID_OXSEMI, 0xc47b, /* OXPCIe200 1 Native UART */ + PCI_ANY_ID, PCI_ANY_ID, 0, 0, +- pbn_oxsemi_1_3906250 }, ++ pbn_oxsemi_1_15625000 }, + { PCI_VENDOR_ID_OXSEMI, 0xc47f, /* OXPCIe200 1 Native UART */ + PCI_ANY_ID, PCI_ANY_ID, 0, 0, +- pbn_oxsemi_1_3906250 }, ++ pbn_oxsemi_1_15625000 }, + { PCI_VENDOR_ID_OXSEMI, 0xc48b, /* OXPCIe200 1 Native UART */ + PCI_ANY_ID, PCI_ANY_ID, 0, 0, +- pbn_oxsemi_1_3906250 }, ++ pbn_oxsemi_1_15625000 }, + { PCI_VENDOR_ID_OXSEMI, 0xc48f, /* OXPCIe200 1 Native UART */ + PCI_ANY_ID, PCI_ANY_ID, 0, 0, +- pbn_oxsemi_1_3906250 }, ++ pbn_oxsemi_1_15625000 }, + { PCI_VENDOR_ID_OXSEMI, 0xc49b, /* OXPCIe200 1 Native UART */ + PCI_ANY_ID, PCI_ANY_ID, 0, 0, +- pbn_oxsemi_1_3906250 }, ++ pbn_oxsemi_1_15625000 }, + { PCI_VENDOR_ID_OXSEMI, 0xc49f, /* OXPCIe200 1 Native UART */ + PCI_ANY_ID, PCI_ANY_ID, 0, 0, +- pbn_oxsemi_1_3906250 }, ++ pbn_oxsemi_1_15625000 }, + { PCI_VENDOR_ID_OXSEMI, 0xc4ab, /* OXPCIe200 1 Native UART */ + PCI_ANY_ID, PCI_ANY_ID, 0, 0, +- pbn_oxsemi_1_3906250 }, ++ pbn_oxsemi_1_15625000 }, + { PCI_VENDOR_ID_OXSEMI, 0xc4af, /* OXPCIe200 1 Native UART */ + PCI_ANY_ID, PCI_ANY_ID, 0, 0, +- pbn_oxsemi_1_3906250 }, ++ pbn_oxsemi_1_15625000 }, + { PCI_VENDOR_ID_OXSEMI, 0xc4bb, /* OXPCIe200 1 Native UART */ + PCI_ANY_ID, PCI_ANY_ID, 0, 0, +- pbn_oxsemi_1_3906250 }, ++ pbn_oxsemi_1_15625000 }, + { PCI_VENDOR_ID_OXSEMI, 0xc4bf, /* OXPCIe200 1 Native UART */ + PCI_ANY_ID, PCI_ANY_ID, 0, 0, +- pbn_oxsemi_1_3906250 }, ++ pbn_oxsemi_1_15625000 }, + { PCI_VENDOR_ID_OXSEMI, 0xc4cb, /* OXPCIe200 1 Native UART */ + PCI_ANY_ID, PCI_ANY_ID, 0, 0, +- pbn_oxsemi_1_3906250 }, ++ pbn_oxsemi_1_15625000 }, + { PCI_VENDOR_ID_OXSEMI, 0xc4cf, /* OXPCIe200 1 Native UART */ + PCI_ANY_ID, PCI_ANY_ID, 0, 0, +- pbn_oxsemi_1_3906250 }, ++ pbn_oxsemi_1_15625000 }, + /* + * Mainpine Inc. 
IQ Express "Rev3" utilizing OxSemi Tornado + */ + { PCI_VENDOR_ID_MAINPINE, 0x4000, /* IQ Express 1 Port V.34 Super-G3 Fax */ + PCI_VENDOR_ID_MAINPINE, 0x4001, 0, 0, +- pbn_oxsemi_1_3906250 }, ++ pbn_oxsemi_1_15625000 }, + { PCI_VENDOR_ID_MAINPINE, 0x4000, /* IQ Express 2 Port V.34 Super-G3 Fax */ + PCI_VENDOR_ID_MAINPINE, 0x4002, 0, 0, +- pbn_oxsemi_2_3906250 }, ++ pbn_oxsemi_2_15625000 }, + { PCI_VENDOR_ID_MAINPINE, 0x4000, /* IQ Express 4 Port V.34 Super-G3 Fax */ + PCI_VENDOR_ID_MAINPINE, 0x4004, 0, 0, +- pbn_oxsemi_4_3906250 }, ++ pbn_oxsemi_4_15625000 }, + { PCI_VENDOR_ID_MAINPINE, 0x4000, /* IQ Express 8 Port V.34 Super-G3 Fax */ + PCI_VENDOR_ID_MAINPINE, 0x4008, 0, 0, +- pbn_oxsemi_8_3906250 }, ++ pbn_oxsemi_8_15625000 }, + + /* + * Digi/IBM PCIe 2-port Async EIA-232 Adapter utilizing OxSemi Tornado + */ + { PCI_VENDOR_ID_DIGI, PCIE_DEVICE_ID_NEO_2_OX_IBM, + PCI_SUBVENDOR_ID_IBM, PCI_ANY_ID, 0, 0, +- pbn_oxsemi_2_3906250 }, ++ pbn_oxsemi_2_15625000 }, + /* + * EndRun Technologies. PCI express device range. + * EndRun PTP/1588 has 2 Native UARTs utilizing OxSemi 952. + */ + { PCI_VENDOR_ID_ENDRUN, PCI_DEVICE_ID_ENDRUN_1588, + PCI_ANY_ID, PCI_ANY_ID, 0, 0, +- pbn_oxsemi_2_3906250 }, ++ pbn_oxsemi_2_15625000 }, + + /* + * SBS Technologies, Inc. P-Octal and PMC-OCTPRO cards, +-- +2.35.1 + diff --git a/queue-5.18/serial-8250-fold-endrun-device-support-into-oxsemi-t.patch b/queue-5.18/serial-8250-fold-endrun-device-support-into-oxsemi-t.patch new file mode 100644 index 00000000000..8a0500504d7 --- /dev/null +++ b/queue-5.18/serial-8250-fold-endrun-device-support-into-oxsemi-t.patch @@ -0,0 +1,177 @@ +From cfb171de62f71cfb83a28429ef85c60e8be57a08 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 18 Apr 2022 16:27:22 +0100 +Subject: serial: 8250: Fold EndRun device support into OxSemi Tornado code + +From: Maciej W. Rozycki + +[ Upstream commit 1f32c65bad24b9787d3e52843de375430e3df822 ] + +The EndRun PTP/1588 dual serial port device is based on the Oxford +Semiconductor OXPCIe952 UART device with the PCI vendor:device ID set +for EndRun Technologies and uses the same sequence to determine the +number of ports available. Despite that we have duplicate code +specific to the EndRun device. + +Remove redundant code then and factor out OxSemi Tornado device +detection. + +Signed-off-by: Maciej W. Rozycki +Reviewed-by: Andy Shevchenko +Link: https://lore.kernel.org/r/alpine.DEB.2.21.2204181516220.9383@angie.orcam.me.uk +Signed-off-by: Greg Kroah-Hartman +Signed-off-by: Sasha Levin +--- + drivers/tty/serial/8250/8250_pci.c | 76 ++++++++++-------------------- + 1 file changed, 25 insertions(+), 51 deletions(-) + +diff --git a/drivers/tty/serial/8250/8250_pci.c b/drivers/tty/serial/8250/8250_pci.c +index a293e9f107d0..4b0e84e01e55 100644 +--- a/drivers/tty/serial/8250/8250_pci.c ++++ b/drivers/tty/serial/8250/8250_pci.c +@@ -994,41 +994,29 @@ static void pci_ite887x_exit(struct pci_dev *dev) + } + + /* +- * EndRun Technologies. +- * Determine the number of ports available on the device. ++ * Oxford Semiconductor Inc. ++ * Check if an OxSemi device is part of the Tornado range of devices. 
+ */ + #define PCI_VENDOR_ID_ENDRUN 0x7401 + #define PCI_DEVICE_ID_ENDRUN_1588 0xe100 + +-static int pci_endrun_init(struct pci_dev *dev) ++static bool pci_oxsemi_tornado_p(struct pci_dev *dev) + { +- u8 __iomem *p; +- unsigned long deviceID; +- unsigned int number_uarts = 0; ++ /* OxSemi Tornado devices are all 0xCxxx */ ++ if (dev->vendor == PCI_VENDOR_ID_OXSEMI && ++ (dev->device & 0xf000) != 0xc000) ++ return false; + +- /* EndRun device is all 0xexxx */ ++ /* EndRun devices are all 0xExxx */ + if (dev->vendor == PCI_VENDOR_ID_ENDRUN && +- (dev->device & 0xf000) != 0xe000) +- return 0; +- +- p = pci_iomap(dev, 0, 5); +- if (p == NULL) +- return -ENOMEM; ++ (dev->device & 0xf000) != 0xe000) ++ return false; + +- deviceID = ioread32(p); +- /* EndRun device */ +- if (deviceID == 0x07000200) { +- number_uarts = ioread8(p + 4); +- pci_dbg(dev, "%d ports detected on EndRun PCI Express device\n", number_uarts); +- } +- pci_iounmap(dev, p); +- return number_uarts; ++ return true; + } + + /* +- * Oxford Semiconductor Inc. +- * Check that device is part of the Tornado range of devices, then determine +- * the number of ports available on the device. ++ * Determine the number of ports available on a Tornado device. + */ + static int pci_oxsemi_tornado_init(struct pci_dev *dev) + { +@@ -1036,9 +1024,7 @@ static int pci_oxsemi_tornado_init(struct pci_dev *dev) + unsigned long deviceID; + unsigned int number_uarts = 0; + +- /* OxSemi Tornado devices are all 0xCxxx */ +- if (dev->vendor == PCI_VENDOR_ID_OXSEMI && +- (dev->device & 0xF000) != 0xC000) ++ if (!pci_oxsemi_tornado_p(dev)) + return 0; + + p = pci_iomap(dev, 0, 5); +@@ -1049,7 +1035,10 @@ static int pci_oxsemi_tornado_init(struct pci_dev *dev) + /* Tornado device */ + if (deviceID == 0x07000200) { + number_uarts = ioread8(p + 4); +- pci_dbg(dev, "%d ports detected on Oxford PCI Express device\n", number_uarts); ++ pci_dbg(dev, "%d ports detected on %s PCI Express device\n", ++ number_uarts, ++ dev->vendor == PCI_VENDOR_ID_ENDRUN ? ++ "EndRun" : "Oxford"); + } + pci_iounmap(dev, p); + return number_uarts; +@@ -2244,7 +2233,7 @@ static struct pci_serial_quirk pci_serial_quirks[] = { + .device = PCI_ANY_ID, + .subvendor = PCI_ANY_ID, + .subdevice = PCI_ANY_ID, +- .init = pci_endrun_init, ++ .init = pci_oxsemi_tornado_init, + .setup = pci_default_setup, + }, + /* +@@ -2667,7 +2656,6 @@ enum pci_board_num_t { + pbn_panacom2, + pbn_panacom4, + pbn_plx_romulus, +- pbn_endrun_2_3906250, + pbn_oxsemi, + pbn_oxsemi_1_3906250, + pbn_oxsemi_2_3906250, +@@ -3189,20 +3177,6 @@ static struct pciserial_board pci_boards[] = { + .first_offset = 0x03, + }, + +- /* +- * EndRun Technologies +- * Uses the size of PCI Base region 0 to +- * signal now many ports are available +- * 2 port 952 Uart support +- */ +- [pbn_endrun_2_3906250] = { +- .flags = FL_BASE0, +- .num_ports = 2, +- .base_baud = 3906250, +- .uart_offset = 0x200, +- .first_offset = 0x1000, +- }, +- + /* + * This board uses the size of PCI Base region 0 to + * signal now many ports are available +@@ -4109,13 +4083,6 @@ static const struct pci_device_id serial_pci_tbl[] = { + { PCI_VENDOR_ID_PLX, PCI_DEVICE_ID_PLX_ROMULUS, + 0x10b5, 0x106a, 0, 0, + pbn_plx_romulus }, +- /* +- * EndRun Technologies. PCI express device range. +- * EndRun PTP/1588 has 2 Native UARTs. +- */ +- { PCI_VENDOR_ID_ENDRUN, PCI_DEVICE_ID_ENDRUN_1588, +- PCI_ANY_ID, PCI_ANY_ID, 0, 0, +- pbn_endrun_2_3906250 }, + /* + * Quatech cards. These actually have configurable clocks but for + * now we just use the default. 
+@@ -4377,6 +4344,13 @@ static const struct pci_device_id serial_pci_tbl[] = { + { PCI_VENDOR_ID_DIGI, PCIE_DEVICE_ID_NEO_2_OX_IBM, + PCI_SUBVENDOR_ID_IBM, PCI_ANY_ID, 0, 0, + pbn_oxsemi_2_3906250 }, ++ /* ++ * EndRun Technologies. PCI express device range. ++ * EndRun PTP/1588 has 2 Native UARTs utilizing OxSemi 952. ++ */ ++ { PCI_VENDOR_ID_ENDRUN, PCI_DEVICE_ID_ENDRUN_1588, ++ PCI_ANY_ID, PCI_ANY_ID, 0, 0, ++ pbn_oxsemi_2_3906250 }, + + /* + * SBS Technologies, Inc. P-Octal and PMC-OCTPRO cards, +-- +2.35.1 + diff --git a/queue-5.18/series b/queue-5.18/series index a89abb32b57..1a109963018 100644 --- a/queue-5.18/series +++ b/queue-5.18/series @@ -987,3 +987,112 @@ input-gscps2-check-return-value-of-ioremap-in-gscps2_probe.patch __follow_mount_rcu-verify-that-mount_lock-remains-unchanged.patch spmi-trace-fix-stack-out-of-bound-access-in-spmi-tracing-functions.patch drivers-base-fix-userspace-break-from-using-bin_attributes-for-cpumap-and-cpulist.patch +drm-mediatek-keep-dsi-as-lp00-before-dcs-cmds-transf.patch +crypto-blake2s-remove-shash-module.patch +drm-dp-mst-read-the-extended-dpcd-capabilities-durin.patch +scsi-qla2xxx-fix-excessive-i-o-error-messages-by-def.patch +scsi-qla2xxx-wind-down-adapter-after-pcie-error.patch-27996 +scsi-qla2xxx-turn-off-multi-queue-for-8g-adapters.patch-18430 +scsi-qla2xxx-fix-crash-due-to-stale-srb-access-aroun.patch +scsi-qla2xxx-fix-losing-fcp-2-targets-during-port-pe.patch +scsi-qla2xxx-fix-losing-target-when-it-reappears-dur.patch +scsi-qla2xxx-fix-losing-fcp-2-targets-on-long-port-d.patch +scsi-qla2xxx-fix-erroneous-mailbox-timeout-after-pci.patch +drm-vc4-drv-adopt-the-dma-configuration-from-the-hvs.patch +usbnet-smsc95xx-don-t-clear-read-only-phy-interrupt.patch +usbnet-smsc95xx-avoid-link-settings-race-on-interrup.patch +usbnet-smsc95xx-forward-phy-interrupts-to-phy-driver.patch +usbnet-smsc95xx-fix-deadlock-on-runtime-resume.patch +firmware-arm_scpi-ensure-scpi_info-is-not-assigned-i.patch +__follow_mount_rcu-verify-that-mount_lock-remains-un.patch +intel_th-pci-add-meteor-lake-p-support.patch +intel_th-pci-add-raptor-lake-s-pch-support.patch +intel_th-pci-add-raptor-lake-s-cpu-support.patch +kvm-set_msr_mce-permit-guests-to-ignore-single-bit-e.patch +kvm-x86-signal-gp-not-eperm-on-bad-wrmsr-mci_ctl-sta.patch +iommu-vt-d-avoid-invalid-memory-access-via-node_onli.patch +pci-aer-iterate-over-error-counters-instead-of-error.patch +pci-qcom-power-on-phy-before-ipq8074-dbi-register-ac.patch +serial-8250-fold-endrun-device-support-into-oxsemi-t.patch +serial-8250-add-proper-clock-handling-for-oxsemi-pci.patch +tty-8250-add-support-for-brainboxes-px-cards.patch +dm-writecache-set-a-default-max_writeback_jobs.patch +x86-olpc-fix-logical-not-is-only-applied-to-the-left.patch +drivers-base-fix-userspace-break-from-using-bin_attr.patch +kexec_file-drop-weak-attribute-from-functions.patch +kexec-clean-up-arch_kexec_kernel_verify_sig.patch +kexec-keys-s390-make-use-of-built-in-and-secondary-k.patch +tracing-events-add-__vstring-and-__assign_vstr-helpe.patch +dm-thin-fix-use-after-free-crash-in-dm_sm_register_t.patch +net-9p-initialize-the-iounit-field-during-fid-creati.patch +timekeeping-contribute-wall-clock-to-rng-on-time-cha.patch +scsi-qla2xxx-fix-response-queue-handler-reading-stal.patch +scsi-qla2xxx-edif-fix-dropped-ike-message.patch +scsi-qla2xxx-fix-imbalance-vha-vref_count.patch-27970 +scsi-qla2xxx-fix-discovery-issues-in-fc-al-topology.patch-4818 +scsi-qla2xxx-update-manufacturer-details.patch 
+locking-csd_lock-change-csdlock_debug-from-early_par.patch +block-serialize-all-debugfs-operations-using-q-debug.patch +block-don-t-allow-the-same-type-rq_qos-add-more-than.patch +spmi-trace-fix-stack-out-of-bound-access-in-spmi-tra.patch +btrfs-tree-log-make-the-return-value-for-log-syncing.patch +btrfs-ensure-pages-are-unlocked-on-cow_file_range-fa.patch +btrfs-fix-error-handling-of-fallback-uncompress-writ.patch +btrfs-reset-block-group-chunk-force-if-we-have-to-wa.patch +btrfs-properly-flag-filesystem-with-btrfs_feature_in.patch +block-add-a-bdev_max_zone_append_sectors-helper.patch +block-add-bdev_max_segments-helper.patch +btrfs-zoned-revive-max_zone_append_bytes.patch +btrfs-replace-btrfs_max_extent_size-with-fs_info-max.patch +btrfs-let-can_allocate_chunk-return-error.patch +btrfs-zoned-finish-least-available-block-group-on-da.patch +btrfs-zoned-disable-metadata-overcommit-for-zoned.patch +btrfs-make-the-bg_reclaim_threshold-per-space-info.patch +btrfs-zoned-introduce-btrfs_zoned_bg_is_full.patch +btrfs-store-chunk-size-in-space-info-struct.patch +btrfs-zoned-introduce-space_info-active_total_bytes.patch +btrfs-zoned-activate-metadata-block-group-on-flush_s.patch +btrfs-zoned-activate-necessary-block-group.patch +btrfs-zoned-write-out-partially-allocated-region.patch +btrfs-zoned-wait-until-zone-is-finished-when-allocat.patch +intel_idle-add-alderlake-support.patch +intel_idle-make-spr-c1-and-c1e-be-independent.patch +acpi-cppc-do-not-prevent-cppc-from-working-in-the-fu.patch +powerpc-powernv-kvm-use-darn-for-h_random-on-power9.patch +s390-unwind-fix-fgraph-return-address-recovery.patch +kvm-x86-pmu-introduce-the-ctrl_mask-value-for-fixed-.patch +kvm-vmx-mark-all-perf_global_-ovf-_ctrl-bits-reserve.patch +kvm-x86-pmu-ignore-pmu-global_ctrl-check-if-vpmu-doe.patch +kvm-vmx-add-helper-to-check-if-the-guest-pmu-has-per.patch +kvm-nvmx-attempt-to-load-perf_global_ctrl-on-nvmx-xf.patch +dm-raid-fix-address-sanitizer-warning-in-raid_status.patch +dm-raid-fix-address-sanitizer-warning-in-raid_resume.patch +mm-damon-reclaim-fix-potential-memory-leak-in-damon_.patch +hugetlb_cgroup-fix-wrong-hugetlb-cgroup-numa-stat.patch +batman-adv-tracing-use-the-new-__vstring-helper.patch +ftrace-x86-add-back-ftrace_expected-assignment.patch-2936 +tracing-use-a-struct-alignof-to-determine-trace-even.patch +ksmbd-validate-length-in-smb2_write.patch +ksmbd-smbd-change-prototypes-of-rdma-read-write-rela.patch +ksmbd-smbd-introduce-read-write-credits-for-rdma-rea.patch +ksmbd-add-smbd-max-io-size-parameter.patch +ksmbd-fix-wrong-smbd-max-read-write-size-check.patch +ksmbd-prevent-out-of-bound-read-for-smb2_write.patch +input-gscps2-check-return-value-of-ioremap-in-gscps2.patch +x86-kprobes-update-kcb-status-flag-after-singlestepp.patch +ext4-update-s_overhead_clusters-in-the-superblock-du.patch +ext4-fix-extent-status-tree-race-in-writeback-error-.patch +ext4-add-ext4_inode_has_xattr_space-macro-in-xattr.h.patch +ext4-fix-use-after-free-in-ext4_xattr_set_entry.patch +ext4-correct-max_inline_xattr_value_size-computing.patch +ext4-correct-the-misjudgment-in-ext4_iget_extra_inod.patch +ext4-fix-warning-in-ext4_iomap_begin-as-race-between.patch +ext4-check-if-directory-block-is-within-i_size.patch +ext4-make-sure-ext4_append-always-allocates-new-bloc.patch +ext4-remove-ea-inode-entry-from-mbcache-on-inode-evi.patch +ext4-use-kmemdup-to-replace-kmalloc-memcpy.patch +ext4-unindent-codeblock-in-ext4_xattr_block_set.patch +ext4-fix-race-when-reusing-xattr-blocks.patch 
+keys-asymmetric-enforce-sm2-signature-use-pkey-algo.patch +tpm-eventlog-fix-section-mismatch-for-debug_section_.patch +tpm-add-check-for-failure-mode-for-tpm2-modules.patch diff --git a/queue-5.18/spmi-trace-fix-stack-out-of-bound-access-in-spmi-tra.patch b/queue-5.18/spmi-trace-fix-stack-out-of-bound-access-in-spmi-tra.patch new file mode 100644 index 00000000000..5d00c0e7338 --- /dev/null +++ b/queue-5.18/spmi-trace-fix-stack-out-of-bound-access-in-spmi-tra.patch @@ -0,0 +1,115 @@ +From 950c726de64542319fc82b60b84e385645aff774 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 27 Jun 2022 16:55:12 -0700 +Subject: spmi: trace: fix stack-out-of-bound access in SPMI tracing functions + +From: David Collins + +[ Upstream commit 2af28b241eea816e6f7668d1954f15894b45d7e3 ] + +trace_spmi_write_begin() and trace_spmi_read_end() both call +memcpy() with a length of "len + 1". This leads to one extra +byte being read beyond the end of the specified buffer. Fix +this out-of-bound memory access by using a length of "len" +instead. + +Here is a KASAN log showing the issue: + +BUG: KASAN: stack-out-of-bounds in trace_event_raw_event_spmi_read_end+0x1d0/0x234 +Read of size 2 at addr ffffffc0265b7540 by task thermal@2.0-ser/1314 +... +Call trace: + dump_backtrace+0x0/0x3e8 + show_stack+0x2c/0x3c + dump_stack_lvl+0xdc/0x11c + print_address_description+0x74/0x384 + kasan_report+0x188/0x268 + kasan_check_range+0x270/0x2b0 + memcpy+0x90/0xe8 + trace_event_raw_event_spmi_read_end+0x1d0/0x234 + spmi_read_cmd+0x294/0x3ac + spmi_ext_register_readl+0x84/0x9c + regmap_spmi_ext_read+0x144/0x1b0 [regmap_spmi] + _regmap_raw_read+0x40c/0x754 + regmap_raw_read+0x3a0/0x514 + regmap_bulk_read+0x418/0x494 + adc5_gen3_poll_wait_hs+0xe8/0x1e0 [qcom_spmi_adc5_gen3] + ... + __arm64_sys_read+0x4c/0x60 + invoke_syscall+0x80/0x218 + el0_svc_common+0xec/0x1c8 + ... 
+ +addr ffffffc0265b7540 is located in stack of task thermal@2.0-ser/1314 at offset 32 in frame: + adc5_gen3_poll_wait_hs+0x0/0x1e0 [qcom_spmi_adc5_gen3] + +this frame has 1 object: + [32, 33) 'status' + +Memory state around the buggy address: + ffffffc0265b7400: 00 00 00 00 00 00 00 00 00 00 00 00 f1 f1 f1 f1 + ffffffc0265b7480: 04 f3 f3 f3 00 00 00 00 00 00 00 00 00 00 00 00 +>ffffffc0265b7500: 00 00 00 00 f1 f1 f1 f1 01 f3 f3 f3 00 00 00 00 + ^ + ffffffc0265b7580: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 + ffffffc0265b7600: f1 f1 f1 f1 01 f2 07 f2 f2 f2 01 f3 00 00 00 00 +================================================================== + +Fixes: a9fce374815d ("spmi: add command tracepoints for SPMI") +Cc: stable@vger.kernel.org +Reviewed-by: Stephen Boyd +Acked-by: Steven Rostedt (Google) +Signed-off-by: David Collins +Link: https://lore.kernel.org/r/20220627235512.2272783-1-quic_collinsd@quicinc.com +Signed-off-by: Greg Kroah-Hartman +Signed-off-by: Sasha Levin +--- + include/trace/events/spmi.h | 12 ++++++------ + 1 file changed, 6 insertions(+), 6 deletions(-) + +diff --git a/include/trace/events/spmi.h b/include/trace/events/spmi.h +index 8b60efe18ba6..a6819fd85cdf 100644 +--- a/include/trace/events/spmi.h ++++ b/include/trace/events/spmi.h +@@ -21,15 +21,15 @@ TRACE_EVENT(spmi_write_begin, + __field ( u8, sid ) + __field ( u16, addr ) + __field ( u8, len ) +- __dynamic_array ( u8, buf, len + 1 ) ++ __dynamic_array ( u8, buf, len ) + ), + + TP_fast_assign( + __entry->opcode = opcode; + __entry->sid = sid; + __entry->addr = addr; +- __entry->len = len + 1; +- memcpy(__get_dynamic_array(buf), buf, len + 1); ++ __entry->len = len; ++ memcpy(__get_dynamic_array(buf), buf, len); + ), + + TP_printk("opc=%d sid=%02d addr=0x%04x len=%d buf=0x[%*phD]", +@@ -92,7 +92,7 @@ TRACE_EVENT(spmi_read_end, + __field ( u16, addr ) + __field ( int, ret ) + __field ( u8, len ) +- __dynamic_array ( u8, buf, len + 1 ) ++ __dynamic_array ( u8, buf, len ) + ), + + TP_fast_assign( +@@ -100,8 +100,8 @@ TRACE_EVENT(spmi_read_end, + __entry->sid = sid; + __entry->addr = addr; + __entry->ret = ret; +- __entry->len = len + 1; +- memcpy(__get_dynamic_array(buf), buf, len + 1); ++ __entry->len = len; ++ memcpy(__get_dynamic_array(buf), buf, len); + ), + + TP_printk("opc=%d sid=%02d addr=0x%04x ret=%d len=%02d buf=0x[%*phD]", +-- +2.35.1 + diff --git a/queue-5.18/timekeeping-contribute-wall-clock-to-rng-on-time-cha.patch b/queue-5.18/timekeeping-contribute-wall-clock-to-rng-on-time-cha.patch new file mode 100644 index 00000000000..53a6e473373 --- /dev/null +++ b/queue-5.18/timekeeping-contribute-wall-clock-to-rng-on-time-cha.patch @@ -0,0 +1,74 @@ +From 9b10f29c1c139d6a3b1b563093243b9621df2322 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Sun, 17 Jul 2022 23:53:34 +0200 +Subject: timekeeping: contribute wall clock to rng on time change + +From: Jason A. Donenfeld + +[ Upstream commit b8ac29b40183a6038919768b5d189c9bd91ce9b4 ] + +The rng's random_init() function contributes the real time to the rng at +boot time, so that events can at least start in relation to something +particular in the real world. But this clock might not yet be set that +point in boot, so nothing is contributed. In addition, the relation +between minor clock changes from, say, NTP, and the cycle counter is +potentially useful entropic data. + +This commit addresses this by mixing in a time stamp on calls to +settimeofday and adjtimex. 
No entropy is credited in doing so, so it +doesn't make initialization faster, but it is still useful input to +have. + +Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") +Cc: stable@vger.kernel.org +Reviewed-by: Thomas Gleixner +Reviewed-by: Eric Biggers +Signed-off-by: Jason A. Donenfeld +Signed-off-by: Sasha Levin +--- + kernel/time/timekeeping.c | 7 ++++++- + 1 file changed, 6 insertions(+), 1 deletion(-) + +diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c +index 871c912860ed..d6a0ff68df41 100644 +--- a/kernel/time/timekeeping.c ++++ b/kernel/time/timekeeping.c +@@ -23,6 +23,7 @@ + #include + #include + #include ++#include + + #include "tick-internal.h" + #include "ntp_internal.h" +@@ -1326,8 +1327,10 @@ int do_settimeofday64(const struct timespec64 *ts) + /* Signal hrtimers about time change */ + clock_was_set(CLOCK_SET_WALL); + +- if (!ret) ++ if (!ret) { + audit_tk_injoffset(ts_delta); ++ add_device_randomness(ts, sizeof(*ts)); ++ } + + return ret; + } +@@ -2413,6 +2416,7 @@ int do_adjtimex(struct __kernel_timex *txc) + ret = timekeeping_validate_timex(txc); + if (ret) + return ret; ++ add_device_randomness(txc, sizeof(*txc)); + + if (txc->modes & ADJ_SETOFFSET) { + struct timespec64 delta; +@@ -2430,6 +2434,7 @@ int do_adjtimex(struct __kernel_timex *txc) + audit_ntp_init(&ad); + + ktime_get_real_ts64(&ts); ++ add_device_randomness(&ts, sizeof(ts)); + + raw_spin_lock_irqsave(&timekeeper_lock, flags); + write_seqcount_begin(&tk_core.seq); +-- +2.35.1 + diff --git a/queue-5.18/tpm-add-check-for-failure-mode-for-tpm2-modules.patch b/queue-5.18/tpm-add-check-for-failure-mode-for-tpm2-modules.patch new file mode 100644 index 00000000000..80c0227ca7f --- /dev/null +++ b/queue-5.18/tpm-add-check-for-failure-mode-for-tpm2-modules.patch @@ -0,0 +1,54 @@ +From 9c1985649dac8ce39d9305a8087aa7e7a44b6a11 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 1 Aug 2022 15:57:03 +0200 +Subject: tpm: Add check for Failure mode for TPM2 modules +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Mårten Lindahl + +[ Upstream commit 863ed94c589fcd1984f4e3080f069d30508044bb ] + +In commit 0aa698787aa2 ("tpm: Add Upgrade/Reduced mode support for +TPM2 modules") it was said that: + +"If the TPM is in Failure mode, it will successfully respond to both +tpm2_do_selftest() and tpm2_startup() calls. Although, will fail to +answer to tpm2_get_cc_attrs_tbl(). Use this fact to conclude that TPM +is in Failure mode." + +But a check was never added in the commit when calling +tpm2_get_cc_attrs_tbl() to conclude that the TPM is in Failure mode. +This commit corrects this by adding a check. 
+ +Fixes: 0aa698787aa2 ("tpm: Add Upgrade/Reduced mode support for TPM2 modules") +Cc: stable@vger.kernel.org # v5.17+ +Signed-off-by: Mårten Lindahl +Reviewed-by: Jarkko Sakkinen +Signed-off-by: Jarkko Sakkinen +Signed-off-by: Sasha Levin +--- + drivers/char/tpm/tpm2-cmd.c | 6 ++++++ + 1 file changed, 6 insertions(+) + +diff --git a/drivers/char/tpm/tpm2-cmd.c b/drivers/char/tpm/tpm2-cmd.c +index 04a3e23a4afc..4419593d9531 100644 +--- a/drivers/char/tpm/tpm2-cmd.c ++++ b/drivers/char/tpm/tpm2-cmd.c +@@ -752,6 +752,12 @@ int tpm2_auto_startup(struct tpm_chip *chip) + } + + rc = tpm2_get_cc_attrs_tbl(chip); ++ if (rc == TPM2_RC_FAILURE || (rc < 0 && rc != -ENOMEM)) { ++ dev_info(&chip->dev, ++ "TPM in field failure mode, requires firmware upgrade\n"); ++ chip->flags |= TPM_CHIP_FLAG_FIRMWARE_UPGRADE; ++ rc = 0; ++ } + + out: + if (rc == TPM2_RC_UPGRADE) { +-- +2.35.1 + diff --git a/queue-5.18/tpm-eventlog-fix-section-mismatch-for-debug_section_.patch b/queue-5.18/tpm-eventlog-fix-section-mismatch-for-debug_section_.patch new file mode 100644 index 00000000000..a2f014e4e51 --- /dev/null +++ b/queue-5.18/tpm-eventlog-fix-section-mismatch-for-debug_section_.patch @@ -0,0 +1,47 @@ +From 965d4271f4ecbc74ebfd59eee61471d44d2e4694 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 11 Jul 2022 09:17:38 +0800 +Subject: tpm: eventlog: Fix section mismatch for DEBUG_SECTION_MISMATCH + +From: Huacai Chen + +[ Upstream commit bed4593645366ad7362a3aa7bc0d100d8d8236a8 ] + +If DEBUG_SECTION_MISMATCH enabled, __calc_tpm2_event_size() will not be +inlined, this cause section mismatch like this: + +WARNING: modpost: vmlinux.o(.text.unlikely+0xe30c): Section mismatch in reference from the variable L0 to the function .init.text:early_ioremap() +The function L0() references +the function __init early_memremap(). +This is often because L0 lacks a __init +annotation or the annotation of early_ioremap is wrong. + +Fix it by using __always_inline instead of inline for the called-once +function __calc_tpm2_event_size(). 
+ +Fixes: 44038bc514a2 ("tpm: Abstract crypto agile event size calculations") +Cc: stable@vger.kernel.org # v5.3 +Reported-by: WANG Xuerui +Signed-off-by: Huacai Chen +Signed-off-by: Jarkko Sakkinen +Signed-off-by: Sasha Levin +--- + include/linux/tpm_eventlog.h | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/include/linux/tpm_eventlog.h b/include/linux/tpm_eventlog.h +index 739ba9a03ec1..20c0ff54b7a0 100644 +--- a/include/linux/tpm_eventlog.h ++++ b/include/linux/tpm_eventlog.h +@@ -157,7 +157,7 @@ struct tcg_algorithm_info { + * Return: size of the event on success, 0 on failure + */ + +-static inline int __calc_tpm2_event_size(struct tcg_pcr_event2_head *event, ++static __always_inline int __calc_tpm2_event_size(struct tcg_pcr_event2_head *event, + struct tcg_pcr_event *event_header, + bool do_mapping) + { +-- +2.35.1 + diff --git a/queue-5.18/tracing-events-add-__vstring-and-__assign_vstr-helpe.patch b/queue-5.18/tracing-events-add-__vstring-and-__assign_vstr-helpe.patch new file mode 100644 index 00000000000..071699069b9 --- /dev/null +++ b/queue-5.18/tracing-events-add-__vstring-and-__assign_vstr-helpe.patch @@ -0,0 +1,196 @@ +From 3a68f50bc6f54e4ef743e38ffb6064be1ef5e481 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 5 Jul 2022 18:44:54 -0400 +Subject: tracing/events: Add __vstring() and __assign_vstr() helper macros + +From: Steven Rostedt (Google) + +[ Upstream commit 0563231f93c6d1f582b168a47753b345c1e20d81 ] + +There's several places that open code the following logic: + + TP_STRUCT__entry(__dynamic_array(char, msg, MSG_MAX)), + TP_fast_assign(vsnprintf(__get_str(msg), MSG_MAX, vaf->fmt, *vaf->va);) + +To load a string created by variable array va_list. + +The main issue with this approach is that "MSG_MAX" usage in the +__dynamic_array() portion. That actually just reserves the MSG_MAX in the +event, and even wastes space because there's dynamic meta data also saved +in the event to denote the offset and size of the dynamic array. It would +have been better to just use a static __array() field. + +Instead, create __vstring() and __assign_vstr() that work like __string +and __assign_str() but instead of taking a destination string to copy, +take a format string and a va_list pointer and fill in the values. + +It uses the helper: + + #define __trace_event_vstr_len(fmt, va) \ + ({ \ + va_list __ap; \ + int __ret; \ + \ + va_copy(__ap, *(va)); \ + __ret = vsnprintf(NULL, 0, fmt, __ap) + 1; \ + va_end(__ap); \ + \ + min(__ret, TRACE_EVENT_STR_MAX); \ + }) + +To figure out the length to store the string. It may be slightly slower as +it needs to run the vsnprintf() twice, but it now saves space on the ring +buffer. + +Link: https://lkml.kernel.org/r/20220705224749.053570613@goodmis.org + +Cc: Dennis Dalessandro +Cc: Ingo Molnar +Cc: Andrew Morton +Cc: Jason Gunthorpe +Cc: Leon Romanovsky +Cc: Kalle Valo +Cc: "David S. 
Miller" +Cc: Eric Dumazet +Cc: Jakub Kicinski +Cc: Paolo Abeni +Cc: Arend van Spriel +Cc: Franky Lin +Cc: Hante Meuleman +Cc: Gregory Greenman +Cc: Peter Chen +Cc: Greg Kroah-Hartman +Cc: Mathias Nyman +Cc: Chunfeng Yun +Cc: Bin Liu +Cc: Marek Lindner +Cc: Simon Wunderlich +Cc: Antonio Quartulli +Cc: Sven Eckelmann +Cc: Johannes Berg +Cc: Jim Cromie +Signed-off-by: Steven Rostedt (Google) +Signed-off-by: Sasha Levin +--- + include/linux/trace_events.h | 18 ++++++++++++++++++ + include/trace/stages/stage1_struct_define.h | 3 +++ + include/trace/stages/stage2_data_offsets.h | 3 +++ + include/trace/stages/stage4_event_fields.h | 3 +++ + include/trace/stages/stage5_get_offsets.h | 4 ++++ + include/trace/stages/stage6_event_callback.h | 7 +++++++ + 6 files changed, 38 insertions(+) + +diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h +index e6e95a9f07a5..b18759a673c6 100644 +--- a/include/linux/trace_events.h ++++ b/include/linux/trace_events.h +@@ -916,6 +916,24 @@ perf_trace_buf_submit(void *raw_data, int size, int rctx, u16 type, + + #endif + ++#define TRACE_EVENT_STR_MAX 512 ++ ++/* ++ * gcc warns that you can not use a va_list in an inlined ++ * function. But lets me make it into a macro :-/ ++ */ ++#define __trace_event_vstr_len(fmt, va) \ ++({ \ ++ va_list __ap; \ ++ int __ret; \ ++ \ ++ va_copy(__ap, *(va)); \ ++ __ret = vsnprintf(NULL, 0, fmt, __ap) + 1; \ ++ va_end(__ap); \ ++ \ ++ min(__ret, TRACE_EVENT_STR_MAX); \ ++}) ++ + #endif /* _LINUX_TRACE_EVENT_H */ + + /* +diff --git a/include/trace/stages/stage1_struct_define.h b/include/trace/stages/stage1_struct_define.h +index a16783419687..1b7bab60434c 100644 +--- a/include/trace/stages/stage1_struct_define.h ++++ b/include/trace/stages/stage1_struct_define.h +@@ -26,6 +26,9 @@ + #undef __string_len + #define __string_len(item, src, len) __dynamic_array(char, item, -1) + ++#undef __vstring ++#define __vstring(item, fmt, ap) __dynamic_array(char, item, -1) ++ + #undef __bitmask + #define __bitmask(item, nr_bits) __dynamic_array(char, item, -1) + +diff --git a/include/trace/stages/stage2_data_offsets.h b/include/trace/stages/stage2_data_offsets.h +index 42fd1e8813ec..1b7a8f764fdd 100644 +--- a/include/trace/stages/stage2_data_offsets.h ++++ b/include/trace/stages/stage2_data_offsets.h +@@ -32,6 +32,9 @@ + #undef __string_len + #define __string_len(item, src, len) __dynamic_array(char, item, -1) + ++#undef __vstring ++#define __vstring(item, fmt, ap) __dynamic_array(char, item, -1) ++ + #undef __bitmask + #define __bitmask(item, nr_bits) __dynamic_array(unsigned long, item, -1) + +diff --git a/include/trace/stages/stage4_event_fields.h b/include/trace/stages/stage4_event_fields.h +index e80cdc397a43..c3790ec7a453 100644 +--- a/include/trace/stages/stage4_event_fields.h ++++ b/include/trace/stages/stage4_event_fields.h +@@ -38,6 +38,9 @@ + #undef __string_len + #define __string_len(item, src, len) __dynamic_array(char, item, -1) + ++#undef __vstring ++#define __vstring(item, fmt, ap) __dynamic_array(char, item, -1) ++ + #undef __bitmask + #define __bitmask(item, nr_bits) __dynamic_array(unsigned long, item, -1) + +diff --git a/include/trace/stages/stage5_get_offsets.h b/include/trace/stages/stage5_get_offsets.h +index 7ee5931300e6..fba4c24ed9e6 100644 +--- a/include/trace/stages/stage5_get_offsets.h ++++ b/include/trace/stages/stage5_get_offsets.h +@@ -39,6 +39,10 @@ + #undef __string_len + #define __string_len(item, src, len) __dynamic_array(char, item, (len) + 1) + ++#undef __vstring ++#define __vstring(item, fmt, 
ap) __dynamic_array(char, item, \ ++ __trace_event_vstr_len(fmt, ap)) ++ + #undef __rel_dynamic_array + #define __rel_dynamic_array(type, item, len) \ + __item_length = (len) * sizeof(type); \ +diff --git a/include/trace/stages/stage6_event_callback.h b/include/trace/stages/stage6_event_callback.h +index e1724f73594b..0f51f6b3ab70 100644 +--- a/include/trace/stages/stage6_event_callback.h ++++ b/include/trace/stages/stage6_event_callback.h +@@ -24,6 +24,9 @@ + #undef __string_len + #define __string_len(item, src, len) __dynamic_array(char, item, -1) + ++#undef __vstring ++#define __vstring(item, fmt, ap) __dynamic_array(char, item, -1) ++ + #undef __assign_str + #define __assign_str(dst, src) \ + strcpy(__get_str(dst), (src) ? (const char *)(src) : "(null)"); +@@ -35,6 +38,10 @@ + __get_str(dst)[len] = '\0'; \ + } while(0) + ++#undef __assign_vstr ++#define __assign_vstr(dst, fmt, va) \ ++ vsnprintf(__get_str(dst), TRACE_EVENT_STR_MAX, fmt, *(va)) ++ + #undef __bitmask + #define __bitmask(item, nr_bits) __dynamic_array(unsigned long, item, -1) + +-- +2.35.1 + diff --git a/queue-5.18/tracing-use-a-struct-alignof-to-determine-trace-even.patch b/queue-5.18/tracing-use-a-struct-alignof-to-determine-trace-even.patch new file mode 100644 index 00000000000..1ea5bee51cb --- /dev/null +++ b/queue-5.18/tracing-use-a-struct-alignof-to-determine-trace-even.patch @@ -0,0 +1,80 @@ +From 1551cb64c9964c8e7535b88210b5993eb066ab5c Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Sun, 31 Jul 2022 01:59:28 -0400 +Subject: tracing: Use a struct alignof to determine trace event field + alignment + +From: Steven Rostedt (Google) + +[ Upstream commit 4c3d2f9388d36eb28640a220a6f908328442d873 ] + +alignof() gives an alignment of types as they would be as standalone +variables. But alignment in structures might be different, and when +building the fields of events, the alignment must be the actual +alignment otherwise the field offsets may not match what they actually +are. + +This caused trace-cmd to crash, as libtraceevent did not check if the +field offset was bigger than the event. The write_msr and read_msr +events on 32 bit had their fields incorrect, because it had a u64 field +between two ints. alignof(u64) would give 8, but the u64 field was at a +4 byte alignment. + +Define a macro as: + + ALIGN_STRUCTFIELD(type) ((int)(offsetof(struct {char a; type b;}, b))) + +which gives the actual alignment of types in a structure. 
+ +Link: https://lkml.kernel.org/r/20220731015928.7ab3a154@rorschach.local.home + +Cc: Ingo Molnar +Cc: Andrew Morton +Cc: Peter Zijlstra +Cc: Thomas Gleixner +Cc: Masami Hiramatsu +Cc: stable@vger.kernel.org +Fixes: 04ae87a52074e ("ftrace: Rework event_create_dir()") +Signed-off-by: Steven Rostedt (Google) +Signed-off-by: Sasha Levin +--- + include/trace/stages/stage4_event_fields.h | 8 +++++--- + 1 file changed, 5 insertions(+), 3 deletions(-) + +diff --git a/include/trace/stages/stage4_event_fields.h b/include/trace/stages/stage4_event_fields.h +index c3790ec7a453..80d34f396555 100644 +--- a/include/trace/stages/stage4_event_fields.h ++++ b/include/trace/stages/stage4_event_fields.h +@@ -2,16 +2,18 @@ + + /* Stage 4 definitions for creating trace events */ + ++#define ALIGN_STRUCTFIELD(type) ((int)(offsetof(struct {char a; type b;}, b))) ++ + #undef __field_ext + #define __field_ext(_type, _item, _filter_type) { \ + .type = #_type, .name = #_item, \ +- .size = sizeof(_type), .align = __alignof__(_type), \ ++ .size = sizeof(_type), .align = ALIGN_STRUCTFIELD(_type), \ + .is_signed = is_signed_type(_type), .filter_type = _filter_type }, + + #undef __field_struct_ext + #define __field_struct_ext(_type, _item, _filter_type) { \ + .type = #_type, .name = #_item, \ +- .size = sizeof(_type), .align = __alignof__(_type), \ ++ .size = sizeof(_type), .align = ALIGN_STRUCTFIELD(_type), \ + 0, .filter_type = _filter_type }, + + #undef __field +@@ -23,7 +25,7 @@ + #undef __array + #define __array(_type, _item, _len) { \ + .type = #_type"["__stringify(_len)"]", .name = #_item, \ +- .size = sizeof(_type[_len]), .align = __alignof__(_type), \ ++ .size = sizeof(_type[_len]), .align = ALIGN_STRUCTFIELD(_type), \ + .is_signed = is_signed_type(_type), .filter_type = FILTER_OTHER }, + + #undef __dynamic_array +-- +2.35.1 + diff --git a/queue-5.18/tty-8250-add-support-for-brainboxes-px-cards.patch b/queue-5.18/tty-8250-add-support-for-brainboxes-px-cards.patch new file mode 100644 index 00000000000..b7778fa7692 --- /dev/null +++ b/queue-5.18/tty-8250-add-support-for-brainboxes-px-cards.patch @@ -0,0 +1,147 @@ +From 16376f07f879be2fc73315678d2b0862b01399b0 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 11 Jul 2022 16:35:10 +0100 +Subject: tty: 8250: Add support for Brainboxes PX cards. + +From: Cameron Williams + +[ Upstream commit ef5a03a26c87a760bc3d86b5af7b773e82f8b1b7 ] + +Add support for some of the Brainboxes PCIe (PX) range of +serial cards, including the PX-101, PX-235/PX-246, +PX-203/PX-257, PX-260/PX-701, PX-310, PX-313, +PX-320/PX-324/PX-376/PX-387, PX-335/PX-346, PX-368, PX-420, +PX-803 and PX-846. 
+ +Signed-off-by: Cameron Williams +Cc: stable +Link: https://lore.kernel.org/r/AM5PR0202MB2564669252BDC59BF55A6E87C4879@AM5PR0202MB2564.eurprd02.prod.outlook.com +Signed-off-by: Greg Kroah-Hartman +Signed-off-by: Sasha Levin +--- + drivers/tty/serial/8250/8250_pci.c | 109 +++++++++++++++++++++++++++++ + 1 file changed, 109 insertions(+) + +diff --git a/drivers/tty/serial/8250/8250_pci.c b/drivers/tty/serial/8250/8250_pci.c +index 818ed6cd3132..aeac20f7cbb2 100644 +--- a/drivers/tty/serial/8250/8250_pci.c ++++ b/drivers/tty/serial/8250/8250_pci.c +@@ -5063,6 +5063,115 @@ static const struct pci_device_id serial_pci_tbl[] = { + PCI_ANY_ID, PCI_ANY_ID, + 0, 0, + pbn_b2_4_115200 }, ++ /* ++ * Brainboxes PX-101 ++ */ ++ { PCI_VENDOR_ID_INTASHIELD, 0x4005, ++ PCI_ANY_ID, PCI_ANY_ID, ++ 0, 0, ++ pbn_b0_2_115200 }, ++ { PCI_VENDOR_ID_INTASHIELD, 0x4019, ++ PCI_ANY_ID, PCI_ANY_ID, ++ 0, 0, ++ pbn_oxsemi_2_15625000 }, ++ /* ++ * Brainboxes PX-235/246 ++ */ ++ { PCI_VENDOR_ID_INTASHIELD, 0x4004, ++ PCI_ANY_ID, PCI_ANY_ID, ++ 0, 0, ++ pbn_b0_1_115200 }, ++ { PCI_VENDOR_ID_INTASHIELD, 0x4016, ++ PCI_ANY_ID, PCI_ANY_ID, ++ 0, 0, ++ pbn_oxsemi_1_15625000 }, ++ /* ++ * Brainboxes PX-203/PX-257 ++ */ ++ { PCI_VENDOR_ID_INTASHIELD, 0x4006, ++ PCI_ANY_ID, PCI_ANY_ID, ++ 0, 0, ++ pbn_b0_2_115200 }, ++ { PCI_VENDOR_ID_INTASHIELD, 0x4015, ++ PCI_ANY_ID, PCI_ANY_ID, ++ 0, 0, ++ pbn_oxsemi_4_15625000 }, ++ /* ++ * Brainboxes PX-260/PX-701 ++ */ ++ { PCI_VENDOR_ID_INTASHIELD, 0x400A, ++ PCI_ANY_ID, PCI_ANY_ID, ++ 0, 0, ++ pbn_oxsemi_4_15625000 }, ++ /* ++ * Brainboxes PX-310 ++ */ ++ { PCI_VENDOR_ID_INTASHIELD, 0x400E, ++ PCI_ANY_ID, PCI_ANY_ID, ++ 0, 0, ++ pbn_oxsemi_2_15625000 }, ++ /* ++ * Brainboxes PX-313 ++ */ ++ { PCI_VENDOR_ID_INTASHIELD, 0x400C, ++ PCI_ANY_ID, PCI_ANY_ID, ++ 0, 0, ++ pbn_oxsemi_2_15625000 }, ++ /* ++ * Brainboxes PX-320/324/PX-376/PX-387 ++ */ ++ { PCI_VENDOR_ID_INTASHIELD, 0x400B, ++ PCI_ANY_ID, PCI_ANY_ID, ++ 0, 0, ++ pbn_oxsemi_1_15625000 }, ++ /* ++ * Brainboxes PX-335/346 ++ */ ++ { PCI_VENDOR_ID_INTASHIELD, 0x400F, ++ PCI_ANY_ID, PCI_ANY_ID, ++ 0, 0, ++ pbn_oxsemi_4_15625000 }, ++ /* ++ * Brainboxes PX-368 ++ */ ++ { PCI_VENDOR_ID_INTASHIELD, 0x4010, ++ PCI_ANY_ID, PCI_ANY_ID, ++ 0, 0, ++ pbn_oxsemi_4_15625000 }, ++ /* ++ * Brainboxes PX-420 ++ */ ++ { PCI_VENDOR_ID_INTASHIELD, 0x4000, ++ PCI_ANY_ID, PCI_ANY_ID, ++ 0, 0, ++ pbn_b0_4_115200 }, ++ { PCI_VENDOR_ID_INTASHIELD, 0x4011, ++ PCI_ANY_ID, PCI_ANY_ID, ++ 0, 0, ++ pbn_oxsemi_4_15625000 }, ++ /* ++ * Brainboxes PX-803 ++ */ ++ { PCI_VENDOR_ID_INTASHIELD, 0x4009, ++ PCI_ANY_ID, PCI_ANY_ID, ++ 0, 0, ++ pbn_b0_1_115200 }, ++ { PCI_VENDOR_ID_INTASHIELD, 0x401E, ++ PCI_ANY_ID, PCI_ANY_ID, ++ 0, 0, ++ pbn_oxsemi_1_15625000 }, ++ /* ++ * Brainboxes PX-846 ++ */ ++ { PCI_VENDOR_ID_INTASHIELD, 0x4008, ++ PCI_ANY_ID, PCI_ANY_ID, ++ 0, 0, ++ pbn_b0_1_115200 }, ++ { PCI_VENDOR_ID_INTASHIELD, 0x4017, ++ PCI_ANY_ID, PCI_ANY_ID, ++ 0, 0, ++ pbn_oxsemi_1_15625000 }, ++ + /* + * Perle PCI-RAS cards + */ +-- +2.35.1 + diff --git a/queue-5.18/usbnet-smsc95xx-avoid-link-settings-race-on-interrup.patch b/queue-5.18/usbnet-smsc95xx-avoid-link-settings-race-on-interrup.patch new file mode 100644 index 00000000000..affb52bd375 --- /dev/null +++ b/queue-5.18/usbnet-smsc95xx-avoid-link-settings-race-on-interrup.patch @@ -0,0 +1,122 @@ +From a7b73d5774f2b3d6656d48dbee4b2f995629a2c2 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 12 May 2022 10:42:04 +0200 +Subject: usbnet: smsc95xx: Avoid link settings race on interrupt reception + +From: Lukas 
Wunner + +[ Upstream commit 8960f878e39fadc03d74292a6731f1e914cf2019 ] + +When a PHY interrupt is signaled, the SMSC LAN95xx driver updates the +MAC full duplex mode and PHY flow control registers based on cached data +in struct phy_device: + + smsc95xx_status() # raises EVENT_LINK_RESET + usbnet_deferred_kevent() + smsc95xx_link_reset() # uses cached data in phydev + +Simultaneously, phylib polls link status once per second and updates +that cached data: + + phy_state_machine() + phy_check_link_status() + phy_read_status() + lan87xx_read_status() + genphy_read_status() # updates cached data in phydev + +If smsc95xx_link_reset() wins the race against genphy_read_status(), +the registers may be updated based on stale data. + +E.g. if the link was previously down, phydev->duplex is set to +DUPLEX_UNKNOWN and that's what smsc95xx_link_reset() will use, even +though genphy_read_status() may update it to DUPLEX_FULL afterwards. + +PHY interrupts are currently only enabled on suspend to trigger wakeup, +so the impact of the race is limited, but we're about to enable them +perpetually. + +Avoid the race by delaying execution of smsc95xx_link_reset() until +phy_state_machine() has done its job and calls back via +smsc95xx_handle_link_change(). + +Signaling EVENT_LINK_RESET on wakeup is not necessary because phylib +picks up link status changes through polling. So drop the declaration +of a ->link_reset() callback. + +Note that the semicolon on a line by itself added in smsc95xx_status() +is a placeholder for a function call which will be added in a subsequent +commit. That function call will actually handle the INT_ENP_PHY_INT_ +interrupt. + +Tested-by: Oleksij Rempel # LAN9514/9512/9500 +Tested-by: Ferry Toth # LAN9514 +Signed-off-by: Lukas Wunner +Reviewed-by: Andrew Lunn +Signed-off-by: David S. 
Miller +Signed-off-by: Sasha Levin +--- + drivers/net/usb/smsc95xx.c | 16 +++++++++------- + 1 file changed, 9 insertions(+), 7 deletions(-) + +diff --git a/drivers/net/usb/smsc95xx.c b/drivers/net/usb/smsc95xx.c +index 2cb44d65bbc3..f5a208948d22 100644 +--- a/drivers/net/usb/smsc95xx.c ++++ b/drivers/net/usb/smsc95xx.c +@@ -566,7 +566,7 @@ static int smsc95xx_phy_update_flowcontrol(struct usbnet *dev) + return smsc95xx_write_reg(dev, AFC_CFG, afc_cfg); + } + +-static int smsc95xx_link_reset(struct usbnet *dev) ++static void smsc95xx_mac_update_fullduplex(struct usbnet *dev) + { + struct smsc95xx_priv *pdata = dev->driver_priv; + unsigned long flags; +@@ -583,14 +583,16 @@ static int smsc95xx_link_reset(struct usbnet *dev) + spin_unlock_irqrestore(&pdata->mac_cr_lock, flags); + + ret = smsc95xx_write_reg(dev, MAC_CR, pdata->mac_cr); +- if (ret < 0) +- return ret; ++ if (ret < 0) { ++ if (ret != -ENODEV) ++ netdev_warn(dev->net, ++ "Error updating MAC full duplex mode\n"); ++ return; ++ } + + ret = smsc95xx_phy_update_flowcontrol(dev); + if (ret < 0) + netdev_warn(dev->net, "Error updating PHY flow control\n"); +- +- return ret; + } + + static void smsc95xx_status(struct usbnet *dev, struct urb *urb) +@@ -607,7 +609,7 @@ static void smsc95xx_status(struct usbnet *dev, struct urb *urb) + netif_dbg(dev, link, dev->net, "intdata: 0x%08X\n", intdata); + + if (intdata & INT_ENP_PHY_INT_) +- usbnet_defer_kevent(dev, EVENT_LINK_RESET); ++ ; + else + netdev_warn(dev->net, "unexpected interrupt, intdata=0x%08X\n", + intdata); +@@ -1088,6 +1090,7 @@ static void smsc95xx_handle_link_change(struct net_device *net) + struct usbnet *dev = netdev_priv(net); + + phy_print_status(net->phydev); ++ smsc95xx_mac_update_fullduplex(dev); + usbnet_defer_kevent(dev, EVENT_LINK_CHANGE); + } + +@@ -1993,7 +1996,6 @@ static const struct driver_info smsc95xx_info = { + .description = "smsc95xx USB 2.0 Ethernet", + .bind = smsc95xx_bind, + .unbind = smsc95xx_unbind, +- .link_reset = smsc95xx_link_reset, + .reset = smsc95xx_reset, + .check_connect = smsc95xx_start_phy, + .stop = smsc95xx_stop, +-- +2.35.1 + diff --git a/queue-5.18/usbnet-smsc95xx-don-t-clear-read-only-phy-interrupt.patch b/queue-5.18/usbnet-smsc95xx-don-t-clear-read-only-phy-interrupt.patch new file mode 100644 index 00000000000..9ae8efb5392 --- /dev/null +++ b/queue-5.18/usbnet-smsc95xx-don-t-clear-read-only-phy-interrupt.patch @@ -0,0 +1,51 @@ +From 4a8d053acd3acd06d45b12cd8902178fdaf18acc Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 12 May 2022 10:42:02 +0200 +Subject: usbnet: smsc95xx: Don't clear read-only PHY interrupt + +From: Lukas Wunner + +[ Upstream commit 3108871f19221372b251f7da1ac38736928b5b3a ] + +Upon receiving data from the Interrupt Endpoint, the SMSC LAN95xx driver +attempts to clear the signaled interrupts by writing "all ones" to the +Interrupt Status Register. + +However the driver only ever enables a single type of interrupt, namely +the PHY Interrupt. And according to page 119 of the LAN950x datasheet, +its bit in the Interrupt Status Register is read-only. There's no other +way to clear it than in a separate PHY register: + +https://www.microchip.com/content/dam/mchp/documents/UNG/ProductDocuments/DataSheets/LAN950x-Data-Sheet-DS00001875D.pdf + +Consequently, writing "all ones" to the Interrupt Status Register is +pointless and can be dropped. + +Tested-by: Oleksij Rempel # LAN9514/9512/9500 +Tested-by: Ferry Toth # LAN9514 +Signed-off-by: Lukas Wunner +Reviewed-by: Andrew Lunn +Signed-off-by: David S. 
Miller +Signed-off-by: Sasha Levin +--- + drivers/net/usb/smsc95xx.c | 4 ---- + 1 file changed, 4 deletions(-) + +diff --git a/drivers/net/usb/smsc95xx.c b/drivers/net/usb/smsc95xx.c +index edf0492ad489..2cb44d65bbc3 100644 +--- a/drivers/net/usb/smsc95xx.c ++++ b/drivers/net/usb/smsc95xx.c +@@ -572,10 +572,6 @@ static int smsc95xx_link_reset(struct usbnet *dev) + unsigned long flags; + int ret; + +- ret = smsc95xx_write_reg(dev, INT_STS, INT_STS_CLEAR_ALL_); +- if (ret < 0) +- return ret; +- + spin_lock_irqsave(&pdata->mac_cr_lock, flags); + if (pdata->phydev->duplex != DUPLEX_FULL) { + pdata->mac_cr &= ~MAC_CR_FDPX_; +-- +2.35.1 + diff --git a/queue-5.18/usbnet-smsc95xx-fix-deadlock-on-runtime-resume.patch b/queue-5.18/usbnet-smsc95xx-fix-deadlock-on-runtime-resume.patch new file mode 100644 index 00000000000..53522e14306 --- /dev/null +++ b/queue-5.18/usbnet-smsc95xx-fix-deadlock-on-runtime-resume.patch @@ -0,0 +1,193 @@ +From df274cfed28127dbf7d1d06f05318fa92faec8b6 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 1 Jul 2022 22:47:51 +0200 +Subject: usbnet: smsc95xx: Fix deadlock on runtime resume + +From: Lukas Wunner + +[ Upstream commit 7b960c967f2aa01ab8f45c5a0bd78e754cffdeee ] + +Commit 05b35e7eb9a1 ("smsc95xx: add phylib support") amended +smsc95xx_resume() to call phy_init_hw(). That function waits for the +device to runtime resume even though it is placed in the runtime resume +path, causing a deadlock. + +The problem is that phy_init_hw() calls down to smsc95xx_mdiobus_read(), +which never uses the _nopm variant of usbnet_read_cmd(). + +Commit b4df480f68ae ("usbnet: smsc95xx: add reset_resume function with +reset operation") causes a similar deadlock on resume if the device was +already runtime suspended when entering system sleep: + +That's because the commit introduced smsc95xx_reset_resume(), which +calls down to smsc95xx_reset(), which neglects to use _nopm accessors. + +Fix by auto-detecting whether a device access is performed by the +suspend/resume task_struct and use the _nopm variant if so. This works +because the PM core guarantees that suspend/resume callbacks are run in +task context. + +Stacktrace for posterity: + + INFO: task kworker/2:1:49 blocked for more than 122 seconds. + Workqueue: usb_hub_wq hub_event + schedule + rpm_resume + __pm_runtime_resume + usb_autopm_get_interface + usbnet_read_cmd + __smsc95xx_read_reg + __smsc95xx_phy_wait_not_busy + __smsc95xx_mdio_read + smsc95xx_mdiobus_read + __mdiobus_read + mdiobus_read + smsc_phy_reset + phy_init_hw + smsc95xx_resume + usb_resume_interface + usb_resume_both + usb_runtime_resume + __rpm_callback + rpm_callback + rpm_resume + __pm_runtime_resume + usb_autoresume_device + hub_event + process_one_work + +Fixes: b4df480f68ae ("usbnet: smsc95xx: add reset_resume function with reset operation") +Signed-off-by: Lukas Wunner +Cc: stable@vger.kernel.org # v3.16+ +Cc: Andre Edich +Signed-off-by: David S. 
Miller +Signed-off-by: Sasha Levin +--- + drivers/net/usb/smsc95xx.c | 26 ++++++++++++++++++++------ + 1 file changed, 20 insertions(+), 6 deletions(-) + +diff --git a/drivers/net/usb/smsc95xx.c b/drivers/net/usb/smsc95xx.c +index 358b170cc8fb..515363d74078 100644 +--- a/drivers/net/usb/smsc95xx.c ++++ b/drivers/net/usb/smsc95xx.c +@@ -71,6 +71,7 @@ struct smsc95xx_priv { + struct fwnode_handle *irqfwnode; + struct mii_bus *mdiobus; + struct phy_device *phydev; ++ struct task_struct *pm_task; + }; + + static bool turbo_mode = true; +@@ -80,13 +81,14 @@ MODULE_PARM_DESC(turbo_mode, "Enable multiple frames per Rx transaction"); + static int __must_check __smsc95xx_read_reg(struct usbnet *dev, u32 index, + u32 *data, int in_pm) + { ++ struct smsc95xx_priv *pdata = dev->driver_priv; + u32 buf; + int ret; + int (*fn)(struct usbnet *, u8, u8, u16, u16, void *, u16); + + BUG_ON(!dev); + +- if (!in_pm) ++ if (current != pdata->pm_task) + fn = usbnet_read_cmd; + else + fn = usbnet_read_cmd_nopm; +@@ -110,13 +112,14 @@ static int __must_check __smsc95xx_read_reg(struct usbnet *dev, u32 index, + static int __must_check __smsc95xx_write_reg(struct usbnet *dev, u32 index, + u32 data, int in_pm) + { ++ struct smsc95xx_priv *pdata = dev->driver_priv; + u32 buf; + int ret; + int (*fn)(struct usbnet *, u8, u8, u16, u16, const void *, u16); + + BUG_ON(!dev); + +- if (!in_pm) ++ if (current != pdata->pm_task) + fn = usbnet_write_cmd; + else + fn = usbnet_write_cmd_nopm; +@@ -1508,9 +1511,12 @@ static int smsc95xx_suspend(struct usb_interface *intf, pm_message_t message) + u32 val, link_up; + int ret; + ++ pdata->pm_task = current; ++ + ret = usbnet_suspend(intf, message); + if (ret < 0) { + netdev_warn(dev->net, "usbnet_suspend error\n"); ++ pdata->pm_task = NULL; + return ret; + } + +@@ -1750,6 +1756,7 @@ static int smsc95xx_suspend(struct usb_interface *intf, pm_message_t message) + if (ret && PMSG_IS_AUTO(message)) + usbnet_resume(intf); + ++ pdata->pm_task = NULL; + return ret; + } + +@@ -1770,29 +1777,31 @@ static int smsc95xx_resume(struct usb_interface *intf) + /* do this first to ensure it's cleared even in error case */ + pdata->suspend_flags = 0; + ++ pdata->pm_task = current; ++ + if (suspend_flags & SUSPEND_ALLMODES) { + /* clear wake-up sources */ + ret = smsc95xx_read_reg_nopm(dev, WUCSR, &val); + if (ret < 0) +- return ret; ++ goto done; + + val &= ~(WUCSR_WAKE_EN_ | WUCSR_MPEN_); + + ret = smsc95xx_write_reg_nopm(dev, WUCSR, val); + if (ret < 0) +- return ret; ++ goto done; + + /* clear wake-up status */ + ret = smsc95xx_read_reg_nopm(dev, PM_CTRL, &val); + if (ret < 0) +- return ret; ++ goto done; + + val &= ~PM_CTL_WOL_EN_; + val |= PM_CTL_WUPS_; + + ret = smsc95xx_write_reg_nopm(dev, PM_CTRL, val); + if (ret < 0) +- return ret; ++ goto done; + } + + phy_init_hw(pdata->phydev); +@@ -1801,15 +1810,20 @@ static int smsc95xx_resume(struct usb_interface *intf) + if (ret < 0) + netdev_warn(dev->net, "usbnet_resume error\n"); + ++done: ++ pdata->pm_task = NULL; + return ret; + } + + static int smsc95xx_reset_resume(struct usb_interface *intf) + { + struct usbnet *dev = usb_get_intfdata(intf); ++ struct smsc95xx_priv *pdata = dev->driver_priv; + int ret; + ++ pdata->pm_task = current; + ret = smsc95xx_reset(dev); ++ pdata->pm_task = NULL; + if (ret < 0) + return ret; + +-- +2.35.1 + diff --git a/queue-5.18/usbnet-smsc95xx-forward-phy-interrupts-to-phy-driver.patch b/queue-5.18/usbnet-smsc95xx-forward-phy-interrupts-to-phy-driver.patch new file mode 100644 index 00000000000..4fa83b082d3 --- 
/dev/null +++ b/queue-5.18/usbnet-smsc95xx-forward-phy-interrupts-to-phy-driver.patch @@ -0,0 +1,318 @@ +From 0a0b027602d0b6c0b8dcdfc10ac37e8ec0d648d4 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 12 May 2022 10:42:05 +0200 +Subject: usbnet: smsc95xx: Forward PHY interrupts to PHY driver to avoid + polling + +From: Lukas Wunner + +[ Upstream commit 1ce8b37241ed291af56f7a49bbdbf20c08728e88 ] + +Link status of SMSC LAN95xx chips is polled once per second, even though +they're capable of signaling PHY interrupts through the MAC layer. + +Forward those interrupts to the PHY driver to avoid polling. Benefits +are reduced bus traffic, reduced CPU overhead and quicker interface +bringup. + +Polling was introduced in 2016 by commit d69d16949346 ("usbnet: +smsc95xx: fix link detection for disabled autonegotiation"). +Back then, the LAN95xx driver neglected to enable the ENERGYON interrupt, +hence couldn't detect link-up events when auto-negotiation was disabled. +The proper solution would have been to enable the ENERGYON interrupt +instead of polling. + +Since then, PHY handling was moved from the LAN95xx driver to the SMSC +PHY driver with commit 05b35e7eb9a1 ("smsc95xx: add phylib support"). +That PHY driver is capable of link detection with auto-negotiation +disabled because it enables the ENERGYON interrupt. + +Note that signaling interrupts through the MAC layer not only works with +the integrated PHY, but also with an external PHY, provided its +interrupt pin is attached to LAN95xx's nPHY_INT pin. + +In the unlikely event that the interrupt pin of an external PHY is +attached to a GPIO of the SoC (or not connected at all), the driver can +be amended to retrieve the irq from the PHY's of_node. + +To forward PHY interrupts to phylib, it is not sufficient to call +phy_mac_interrupt(). Instead, the PHY's interrupt handler needs to run +so that PHY interrupts are cleared. That's because according to page +119 of the LAN950x datasheet, "The source of this interrupt is a level. +The interrupt persists until it is cleared in the PHY." + +https://www.microchip.com/content/dam/mchp/documents/UNG/ProductDocuments/DataSheets/LAN950x-Data-Sheet-DS00001875D.pdf + +Therefore, create an IRQ domain with a single IRQ for the PHY. In the +future, the IRQ domain may be extended to support the 11 GPIOs on the +LAN95xx. + +Normally the PHY interrupt should be masked until the PHY driver has +cleared it. However masking requires a (sleeping) USB transaction and +interrupts are received in (non-sleepable) softirq context. I decided +not to mask the interrupt at all (by using the dummy_irq_chip's noop +->irq_mask() callback): The USB interrupt endpoint is polled in 1 msec +intervals and normally that's sufficient to wake the PHY driver's IRQ +thread and have it clear the interrupt. If it does take longer, worst +thing that can happen is the IRQ thread is woken again. No big deal. + +Because PHY interrupts are now perpetually enabled, there's no need to +selectively enable them on suspend. So remove all invocations of +smsc95xx_enable_phy_wakeup_interrupts(). + +In smsc95xx_resume(), move the call of phy_init_hw() before +usbnet_resume() (which restarts the status URB) to ensure that the PHY +is fully initialized when an interrupt is handled. + +Tested-by: Oleksij Rempel # LAN9514/9512/9500 +Tested-by: Ferry Toth # LAN9514 +Signed-off-by: Lukas Wunner +Reviewed-by: Andrew Lunn # from a PHY perspective +Cc: Andre Edich +Signed-off-by: David S. 
Miller +Signed-off-by: Sasha Levin +--- + drivers/net/usb/smsc95xx.c | 113 ++++++++++++++++++++----------------- + 1 file changed, 61 insertions(+), 52 deletions(-) + +diff --git a/drivers/net/usb/smsc95xx.c b/drivers/net/usb/smsc95xx.c +index f5a208948d22..358b170cc8fb 100644 +--- a/drivers/net/usb/smsc95xx.c ++++ b/drivers/net/usb/smsc95xx.c +@@ -18,6 +18,8 @@ + #include + #include + #include ++#include ++#include + #include + #include + #include +@@ -53,6 +55,9 @@ + #define SUSPEND_ALLMODES (SUSPEND_SUSPEND0 | SUSPEND_SUSPEND1 | \ + SUSPEND_SUSPEND2 | SUSPEND_SUSPEND3) + ++#define SMSC95XX_NR_IRQS (1) /* raise to 12 for GPIOs */ ++#define PHY_HWIRQ (SMSC95XX_NR_IRQS - 1) ++ + struct smsc95xx_priv { + u32 mac_cr; + u32 hash_hi; +@@ -61,6 +66,9 @@ struct smsc95xx_priv { + spinlock_t mac_cr_lock; + u8 features; + u8 suspend_flags; ++ struct irq_chip irqchip; ++ struct irq_domain *irqdomain; ++ struct fwnode_handle *irqfwnode; + struct mii_bus *mdiobus; + struct phy_device *phydev; + }; +@@ -597,6 +605,8 @@ static void smsc95xx_mac_update_fullduplex(struct usbnet *dev) + + static void smsc95xx_status(struct usbnet *dev, struct urb *urb) + { ++ struct smsc95xx_priv *pdata = dev->driver_priv; ++ unsigned long flags; + u32 intdata; + + if (urb->actual_length != 4) { +@@ -608,11 +618,15 @@ static void smsc95xx_status(struct usbnet *dev, struct urb *urb) + intdata = get_unaligned_le32(urb->transfer_buffer); + netif_dbg(dev, link, dev->net, "intdata: 0x%08X\n", intdata); + ++ local_irq_save(flags); ++ + if (intdata & INT_ENP_PHY_INT_) +- ; ++ generic_handle_domain_irq(pdata->irqdomain, PHY_HWIRQ); + else + netdev_warn(dev->net, "unexpected interrupt, intdata=0x%08X\n", + intdata); ++ ++ local_irq_restore(flags); + } + + /* Enable or disable Tx & Rx checksum offload engines */ +@@ -1098,8 +1112,9 @@ static int smsc95xx_bind(struct usbnet *dev, struct usb_interface *intf) + { + struct smsc95xx_priv *pdata; + bool is_internal_phy; ++ char usb_path[64]; ++ int ret, phy_irq; + u32 val; +- int ret; + + printk(KERN_INFO SMSC_CHIPNAME " v" SMSC_DRIVER_VERSION "\n"); + +@@ -1139,10 +1154,38 @@ static int smsc95xx_bind(struct usbnet *dev, struct usb_interface *intf) + if (ret) + goto free_pdata; + ++ /* create irq domain for use by PHY driver and GPIO consumers */ ++ usb_make_path(dev->udev, usb_path, sizeof(usb_path)); ++ pdata->irqfwnode = irq_domain_alloc_named_fwnode(usb_path); ++ if (!pdata->irqfwnode) { ++ ret = -ENOMEM; ++ goto free_pdata; ++ } ++ ++ pdata->irqdomain = irq_domain_create_linear(pdata->irqfwnode, ++ SMSC95XX_NR_IRQS, ++ &irq_domain_simple_ops, ++ pdata); ++ if (!pdata->irqdomain) { ++ ret = -ENOMEM; ++ goto free_irqfwnode; ++ } ++ ++ phy_irq = irq_create_mapping(pdata->irqdomain, PHY_HWIRQ); ++ if (!phy_irq) { ++ ret = -ENOENT; ++ goto remove_irqdomain; ++ } ++ ++ pdata->irqchip = dummy_irq_chip; ++ pdata->irqchip.name = SMSC_CHIPNAME; ++ irq_set_chip_and_handler_name(phy_irq, &pdata->irqchip, ++ handle_simple_irq, "phy"); ++ + pdata->mdiobus = mdiobus_alloc(); + if (!pdata->mdiobus) { + ret = -ENOMEM; +- goto free_pdata; ++ goto dispose_irq; + } + + ret = smsc95xx_read_reg(dev, HW_CFG, &val); +@@ -1175,6 +1218,7 @@ static int smsc95xx_bind(struct usbnet *dev, struct usb_interface *intf) + goto unregister_mdio; + } + ++ pdata->phydev->irq = phy_irq; + pdata->phydev->is_internal = is_internal_phy; + + /* detect device revision as different features may be available */ +@@ -1217,6 +1261,15 @@ static int smsc95xx_bind(struct usbnet *dev, struct usb_interface *intf) + free_mdio: + 
mdiobus_free(pdata->mdiobus); + ++dispose_irq: ++ irq_dispose_mapping(phy_irq); ++ ++remove_irqdomain: ++ irq_domain_remove(pdata->irqdomain); ++ ++free_irqfwnode: ++ irq_domain_free_fwnode(pdata->irqfwnode); ++ + free_pdata: + kfree(pdata); + return ret; +@@ -1229,6 +1282,9 @@ static void smsc95xx_unbind(struct usbnet *dev, struct usb_interface *intf) + phy_disconnect(dev->net->phydev); + mdiobus_unregister(pdata->mdiobus); + mdiobus_free(pdata->mdiobus); ++ irq_dispose_mapping(irq_find_mapping(pdata->irqdomain, PHY_HWIRQ)); ++ irq_domain_remove(pdata->irqdomain); ++ irq_domain_free_fwnode(pdata->irqfwnode); + netif_dbg(dev, ifdown, dev->net, "free pdata\n"); + kfree(pdata); + } +@@ -1253,29 +1309,6 @@ static u32 smsc_crc(const u8 *buffer, size_t len, int filter) + return crc << ((filter % 2) * 16); + } + +-static int smsc95xx_enable_phy_wakeup_interrupts(struct usbnet *dev, u16 mask) +-{ +- int ret; +- +- netdev_dbg(dev->net, "enabling PHY wakeup interrupts\n"); +- +- /* read to clear */ +- ret = smsc95xx_mdio_read_nopm(dev, PHY_INT_SRC); +- if (ret < 0) +- return ret; +- +- /* enable interrupt source */ +- ret = smsc95xx_mdio_read_nopm(dev, PHY_INT_MASK); +- if (ret < 0) +- return ret; +- +- ret |= mask; +- +- smsc95xx_mdio_write_nopm(dev, PHY_INT_MASK, ret); +- +- return 0; +-} +- + static int smsc95xx_link_ok_nopm(struct usbnet *dev) + { + int ret; +@@ -1442,7 +1475,6 @@ static int smsc95xx_enter_suspend3(struct usbnet *dev) + static int smsc95xx_autosuspend(struct usbnet *dev, u32 link_up) + { + struct smsc95xx_priv *pdata = dev->driver_priv; +- int ret; + + if (!netif_running(dev->net)) { + /* interface is ifconfig down so fully power down hw */ +@@ -1461,27 +1493,10 @@ static int smsc95xx_autosuspend(struct usbnet *dev, u32 link_up) + } + + netdev_dbg(dev->net, "autosuspend entering SUSPEND1\n"); +- +- /* enable PHY wakeup events for if cable is attached */ +- ret = smsc95xx_enable_phy_wakeup_interrupts(dev, +- PHY_INT_MASK_ANEG_COMP_); +- if (ret < 0) { +- netdev_warn(dev->net, "error enabling PHY wakeup ints\n"); +- return ret; +- } +- + netdev_info(dev->net, "entering SUSPEND1 mode\n"); + return smsc95xx_enter_suspend1(dev); + } + +- /* enable PHY wakeup events so we remote wakeup if cable is pulled */ +- ret = smsc95xx_enable_phy_wakeup_interrupts(dev, +- PHY_INT_MASK_LINK_DOWN_); +- if (ret < 0) { +- netdev_warn(dev->net, "error enabling PHY wakeup ints\n"); +- return ret; +- } +- + netdev_dbg(dev->net, "autosuspend entering SUSPEND3\n"); + return smsc95xx_enter_suspend3(dev); + } +@@ -1547,13 +1562,6 @@ static int smsc95xx_suspend(struct usb_interface *intf, pm_message_t message) + } + + if (pdata->wolopts & WAKE_PHY) { +- ret = smsc95xx_enable_phy_wakeup_interrupts(dev, +- (PHY_INT_MASK_ANEG_COMP_ | PHY_INT_MASK_LINK_DOWN_)); +- if (ret < 0) { +- netdev_warn(dev->net, "error enabling PHY wakeup ints\n"); +- goto done; +- } +- + /* if link is down then configure EDPD and enter SUSPEND1, + * otherwise enter SUSPEND0 below + */ +@@ -1787,11 +1795,12 @@ static int smsc95xx_resume(struct usb_interface *intf) + return ret; + } + ++ phy_init_hw(pdata->phydev); ++ + ret = usbnet_resume(intf); + if (ret < 0) + netdev_warn(dev->net, "usbnet_resume error\n"); + +- phy_init_hw(pdata->phydev); + return ret; + } + +-- +2.35.1 + diff --git a/queue-5.18/x86-kprobes-update-kcb-status-flag-after-singlestepp.patch b/queue-5.18/x86-kprobes-update-kcb-status-flag-after-singlestepp.patch new file mode 100644 index 00000000000..0a98447a8f6 --- /dev/null +++ 
b/queue-5.18/x86-kprobes-update-kcb-status-flag-after-singlestepp.patch @@ -0,0 +1,67 @@ +From 2eb8d6ab6b01411d529d883d9a23396b251b4c91 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 2 Aug 2022 15:04:16 +0900 +Subject: x86/kprobes: Update kcb status flag after singlestepping +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Masami Hiramatsu (Google) + +[ Upstream commit dec8784c9088b131a1523f582c2194cfc8107dc0 ] + +Fix kprobes to update kcb (kprobes control block) status flag to +KPROBE_HIT_SSDONE even if the kp->post_handler is not set. + +This bug may cause a kernel panic if another INT3 user runs right +after kprobes because kprobe_int3_handler() misunderstands the +INT3 is kprobe's single stepping INT3. + +Fixes: 6256e668b7af ("x86/kprobes: Use int3 instead of debug trap for single-step") +Reported-by: Daniel Müller +Signed-off-by: Masami Hiramatsu (Google) +Signed-off-by: Ingo Molnar +Tested-by: Daniel Müller +Cc: stable@vger.kernel.org +Link: https://lore.kernel.org/all/20220727210136.jjgc3lpqeq42yr3m@muellerd-fedora-PC2BDTX9 +Link: https://lore.kernel.org/r/165942025658.342061.12452378391879093249.stgit@devnote2 +Signed-off-by: Sasha Levin +--- + arch/x86/kernel/kprobes/core.c | 18 +++++++++++------- + 1 file changed, 11 insertions(+), 7 deletions(-) + +diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c +index 7c4ab8870da4..74167dc5f55e 100644 +--- a/arch/x86/kernel/kprobes/core.c ++++ b/arch/x86/kernel/kprobes/core.c +@@ -814,16 +814,20 @@ set_current_kprobe(struct kprobe *p, struct pt_regs *regs, + static void kprobe_post_process(struct kprobe *cur, struct pt_regs *regs, + struct kprobe_ctlblk *kcb) + { +- if ((kcb->kprobe_status != KPROBE_REENTER) && cur->post_handler) { +- kcb->kprobe_status = KPROBE_HIT_SSDONE; +- cur->post_handler(cur, regs, 0); +- } +- + /* Restore back the original saved kprobes variables and continue. */ +- if (kcb->kprobe_status == KPROBE_REENTER) ++ if (kcb->kprobe_status == KPROBE_REENTER) { ++ /* This will restore both kcb and current_kprobe */ + restore_previous_kprobe(kcb); +- else ++ } else { ++ /* ++ * Always update the kcb status because ++ * reset_curent_kprobe() doesn't update kcb. 
++ */ ++ kcb->kprobe_status = KPROBE_HIT_SSDONE; ++ if (cur->post_handler) ++ cur->post_handler(cur, regs, 0); + reset_current_kprobe(); ++ } + } + NOKPROBE_SYMBOL(kprobe_post_process); + +-- +2.35.1 + diff --git a/queue-5.18/x86-olpc-fix-logical-not-is-only-applied-to-the-left.patch b/queue-5.18/x86-olpc-fix-logical-not-is-only-applied-to-the-left.patch new file mode 100644 index 00000000000..1b881069133 --- /dev/null +++ b/queue-5.18/x86-olpc-fix-logical-not-is-only-applied-to-the-left.patch @@ -0,0 +1,54 @@ +From b63e4693e67826d5eb07e8254daed08277df7133 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 15 Jul 2022 17:15:36 +0200 +Subject: x86/olpc: fix 'logical not is only applied to the left hand side' + +From: Alexander Lobakin + +[ Upstream commit 3a2ba42cbd0b669ce3837ba400905f93dd06c79f ] + +The bitops compile-time optimization series revealed one more +problem in olpc-xo1-sci.c:send_ebook_state(), resulted in GCC +warnings: + +arch/x86/platform/olpc/olpc-xo1-sci.c: In function 'send_ebook_state': +arch/x86/platform/olpc/olpc-xo1-sci.c:83:63: warning: logical not is only applied to the left hand side of comparison [-Wlogical-not-parentheses] + 83 | if (!!test_bit(SW_TABLET_MODE, ebook_switch_idev->sw) == state) + | ^~ +arch/x86/platform/olpc/olpc-xo1-sci.c:83:13: note: add parentheses around left hand side expression to silence this warning + +Despite this code working as intended, this redundant double +negation of boolean value, together with comparing to `char` +with no explicit conversion to bool, makes compilers think +the author made some unintentional logical mistakes here. +Make it the other way around and negate the char instead +to silence the warnings. + +Fixes: d2aa37411b8e ("x86/olpc/xo1/sci: Produce wakeup events for buttons and switches") +Cc: stable@vger.kernel.org # 3.5+ +Reported-by: Guenter Roeck +Reported-by: kernel test robot +Reviewed-and-tested-by: Guenter Roeck +Signed-off-by: Alexander Lobakin +Signed-off-by: Yury Norov +Signed-off-by: Sasha Levin +--- + arch/x86/platform/olpc/olpc-xo1-sci.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/arch/x86/platform/olpc/olpc-xo1-sci.c b/arch/x86/platform/olpc/olpc-xo1-sci.c +index f03a6883dcc6..89f25af4b3c3 100644 +--- a/arch/x86/platform/olpc/olpc-xo1-sci.c ++++ b/arch/x86/platform/olpc/olpc-xo1-sci.c +@@ -80,7 +80,7 @@ static void send_ebook_state(void) + return; + } + +- if (!!test_bit(SW_TABLET_MODE, ebook_switch_idev->sw) == state) ++ if (test_bit(SW_TABLET_MODE, ebook_switch_idev->sw) == !!state) + return; /* Nothing new to report. */ + + input_report_switch(ebook_switch_idev, SW_TABLET_MODE, state); +-- +2.35.1 +