From: Sasha Levin Date: Thu, 30 Mar 2023 11:12:55 +0000 (-0400) Subject: Fixes for 6.1 X-Git-Tag: v4.14.312~66 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=bd41d40ee98c9e0c20d3fcc4dde3741e7b61d7c9;p=thirdparty%2Fkernel%2Fstable-queue.git Fixes for 6.1 Signed-off-by: Sasha Levin --- diff --git a/queue-6.1/arm-dts-aspeed-p10bmc-update-battery-node-name.patch b/queue-6.1/arm-dts-aspeed-p10bmc-update-battery-node-name.patch new file mode 100644 index 00000000000..6768a687b7c --- /dev/null +++ b/queue-6.1/arm-dts-aspeed-p10bmc-update-battery-node-name.patch @@ -0,0 +1,53 @@ +From 158a8bc8a5cfcc2c4a8feaea3ee77ddeda799d53 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 21 Feb 2023 11:03:52 +1030 +Subject: ARM: dts: aspeed: p10bmc: Update battery node name + +From: Eddie James + +[ Upstream commit a8cef541dd5ef9445130660008c029205c4c5aa5 ] + +The ADC sensor for the battery needs to be named "iio-hwmon" for +compatibility with user space applications. + +Signed-off-by: Eddie James +Link: https://lore.kernel.org/r/20230202152759.67069-1-eajames@linux.ibm.com +Fixes: bf1914e2cfed ("ARM: dts: aspeed: p10bmc: Fix ADC iio-hwmon battery node name") +Signed-off-by: Joel Stanley +Link: https://lore.kernel.org/r/20230221003352.1218797-1-joel@jms.id.au +Signed-off-by: Arnd Bergmann +Signed-off-by: Sasha Levin +--- + arch/arm/boot/dts/aspeed-bmc-ibm-everest.dts | 2 +- + arch/arm/boot/dts/aspeed-bmc-ibm-rainier.dts | 2 +- + 2 files changed, 2 insertions(+), 2 deletions(-) + +diff --git a/arch/arm/boot/dts/aspeed-bmc-ibm-everest.dts b/arch/arm/boot/dts/aspeed-bmc-ibm-everest.dts +index fcc890e3ad735..f11feb98fde33 100644 +--- a/arch/arm/boot/dts/aspeed-bmc-ibm-everest.dts ++++ b/arch/arm/boot/dts/aspeed-bmc-ibm-everest.dts +@@ -244,7 +244,7 @@ + }; + }; + +- iio-hwmon-battery { ++ iio-hwmon { + compatible = "iio-hwmon"; + io-channels = <&adc1 7>; + }; +diff --git a/arch/arm/boot/dts/aspeed-bmc-ibm-rainier.dts b/arch/arm/boot/dts/aspeed-bmc-ibm-rainier.dts +index 4879da4cdbd25..77a3a27b04e26 100644 +--- a/arch/arm/boot/dts/aspeed-bmc-ibm-rainier.dts ++++ b/arch/arm/boot/dts/aspeed-bmc-ibm-rainier.dts +@@ -220,7 +220,7 @@ + }; + }; + +- iio-hwmon-battery { ++ iio-hwmon { + compatible = "iio-hwmon"; + io-channels = <&adc1 7>; + }; +-- +2.39.2 + diff --git a/queue-6.1/arm64-efi-set-nx-compat-flag-in-pe-coff-header.patch b/queue-6.1/arm64-efi-set-nx-compat-flag-in-pe-coff-header.patch new file mode 100644 index 00000000000..7562e0d1a4d --- /dev/null +++ b/queue-6.1/arm64-efi-set-nx-compat-flag-in-pe-coff-header.patch @@ -0,0 +1,49 @@ +From 16b47f362260fd260e6be1320d19bf3440fc199a Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 10 Mar 2023 13:30:05 +0100 +Subject: arm64: efi: Set NX compat flag in PE/COFF header + +From: Ard Biesheuvel + +[ Upstream commit 3c66bb1918c262dd52fb4221a8d372619c5da70a ] + +The PE/COFF header has a NX compat flag which informs the firmware that +the application does not rely on memory regions being mapped with both +executable and writable permissions at the same time. + +This is typically used by the firmware to decide whether it can set the +NX attribute on all allocations it returns, but going forward, it may be +used to enforce a policy that only permits applications with the NX flag +set to be loaded to begin wiht in some configurations, e.g., when Secure +Boot is in effect. 
+ +Even though the arm64 version of the EFI stub may relocate the kernel +before executing it, it always did so after disabling the MMU, and so we +were always in line with what the NX compat flag conveys, we just never +bothered to set it. + +So let's set the flag now. + +Cc: +Signed-off-by: Ard Biesheuvel +Signed-off-by: Sasha Levin +--- + arch/arm64/kernel/efi-header.S | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/arch/arm64/kernel/efi-header.S b/arch/arm64/kernel/efi-header.S +index 28d8a5dca5f12..d731b4655df8e 100644 +--- a/arch/arm64/kernel/efi-header.S ++++ b/arch/arm64/kernel/efi-header.S +@@ -66,7 +66,7 @@ + .long .Lefi_header_end - .L_head // SizeOfHeaders + .long 0 // CheckSum + .short IMAGE_SUBSYSTEM_EFI_APPLICATION // Subsystem +- .short 0 // DllCharacteristics ++ .short IMAGE_DLL_CHARACTERISTICS_NX_COMPAT // DllCharacteristics + .quad 0 // SizeOfStackReserve + .quad 0 // SizeOfStackCommit + .quad 0 // SizeOfHeapReserve +-- +2.39.2 + diff --git a/queue-6.1/blk-mq-fix-bad-unlock-balance-detected-on-q-srcu-in-.patch b/queue-6.1/blk-mq-fix-bad-unlock-balance-detected-on-q-srcu-in-.patch new file mode 100644 index 00000000000..4935d04b936 --- /dev/null +++ b/queue-6.1/blk-mq-fix-bad-unlock-balance-detected-on-q-srcu-in-.patch @@ -0,0 +1,52 @@ +From 55cb484ac38dc54e421beb925ae5f7429300b0a6 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 10 Mar 2023 09:09:13 +0800 +Subject: blk-mq: fix "bad unlock balance detected" on q->srcu in + __blk_mq_run_dispatch_ops + +From: Chris Leech + +[ Upstream commit 00e885efcfbb8712d3e1bfc1ae30639c15ca1d3b ] + +The 'q' parameter of the macro __blk_mq_run_dispatch_ops may not be one +local variable, such as, it is rq->q, then request queue pointed by +this variable could be changed to another queue in case of +BLK_MQ_F_TAG_QUEUE_SHARED after 'dispatch_ops' returns, then +'bad unlock balance' is triggered. + +Fixes the issue by adding one local variable for doing srcu lock/unlock. 
+ +Fixes: 2a904d00855f ("blk-mq: remove hctx_lock and hctx_unlock") +Cc: Marco Patalano +Signed-off-by: Chris Leech +Signed-off-by: Ming Lei +Link: https://lore.kernel.org/r/20230310010913.1014789-1-ming.lei@redhat.com +Signed-off-by: Jens Axboe +Signed-off-by: Sasha Levin +--- + block/blk-mq.h | 5 +++-- + 1 file changed, 3 insertions(+), 2 deletions(-) + +diff --git a/block/blk-mq.h b/block/blk-mq.h +index ef59fee62780d..a7482d2cc82e7 100644 +--- a/block/blk-mq.h ++++ b/block/blk-mq.h +@@ -378,12 +378,13 @@ static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx, + #define __blk_mq_run_dispatch_ops(q, check_sleep, dispatch_ops) \ + do { \ + if ((q)->tag_set->flags & BLK_MQ_F_BLOCKING) { \ ++ struct blk_mq_tag_set *__tag_set = (q)->tag_set; \ + int srcu_idx; \ + \ + might_sleep_if(check_sleep); \ +- srcu_idx = srcu_read_lock((q)->tag_set->srcu); \ ++ srcu_idx = srcu_read_lock(__tag_set->srcu); \ + (dispatch_ops); \ +- srcu_read_unlock((q)->tag_set->srcu, srcu_idx); \ ++ srcu_read_unlock(__tag_set->srcu, srcu_idx); \ + } else { \ + rcu_read_lock(); \ + (dispatch_ops); \ +-- +2.39.2 + diff --git a/queue-6.1/blk-mq-move-the-srcu_struct-used-for-quiescing-to-th.patch b/queue-6.1/blk-mq-move-the-srcu_struct-used-for-quiescing-to-th.patch new file mode 100644 index 00000000000..f4c828f3c95 --- /dev/null +++ b/queue-6.1/blk-mq-move-the-srcu_struct-used-for-quiescing-to-th.patch @@ -0,0 +1,358 @@ +From f9f9f7c0add7cd173019a5c920121058e5239c0e Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 1 Nov 2022 16:00:47 +0100 +Subject: blk-mq: move the srcu_struct used for quiescing to the tagset + +From: Christoph Hellwig + +[ Upstream commit 80bd4a7aab4c9ce59bf5e35fdf52aa23d8a3c9f5 ] + +All I/O submissions have fairly similar latencies, and a tagset-wide +quiesce is a fairly common operation. 
+ +Signed-off-by: Christoph Hellwig +Reviewed-by: Keith Busch +Reviewed-by: Ming Lei +Reviewed-by: Chao Leng +Reviewed-by: Sagi Grimberg +Reviewed-by: Hannes Reinecke +Reviewed-by: Chaitanya Kulkarni +Link: https://lore.kernel.org/r/20221101150050.3510-12-hch@lst.de +[axboe: fix whitespace] +Signed-off-by: Jens Axboe +Stable-dep-of: 00e885efcfbb ("blk-mq: fix "bad unlock balance detected" on q->srcu in __blk_mq_run_dispatch_ops") +Signed-off-by: Sasha Levin +--- + block/blk-core.c | 27 +++++---------------------- + block/blk-mq.c | 33 +++++++++++++++++++++++++-------- + block/blk-mq.h | 14 +++++++------- + block/blk-sysfs.c | 9 ++------- + block/blk.h | 9 +-------- + block/genhd.c | 2 +- + include/linux/blk-mq.h | 4 ++++ + include/linux/blkdev.h | 9 --------- + 8 files changed, 45 insertions(+), 62 deletions(-) + +diff --git a/block/blk-core.c b/block/blk-core.c +index 24ee7785a5ad5..d5da62bb4bc06 100644 +--- a/block/blk-core.c ++++ b/block/blk-core.c +@@ -65,7 +65,6 @@ DEFINE_IDA(blk_queue_ida); + * For queue allocation + */ + struct kmem_cache *blk_requestq_cachep; +-struct kmem_cache *blk_requestq_srcu_cachep; + + /* + * Controlling structure to kblockd +@@ -373,26 +372,20 @@ static void blk_timeout_work(struct work_struct *work) + { + } + +-struct request_queue *blk_alloc_queue(int node_id, bool alloc_srcu) ++struct request_queue *blk_alloc_queue(int node_id) + { + struct request_queue *q; + +- q = kmem_cache_alloc_node(blk_get_queue_kmem_cache(alloc_srcu), +- GFP_KERNEL | __GFP_ZERO, node_id); ++ q = kmem_cache_alloc_node(blk_requestq_cachep, GFP_KERNEL | __GFP_ZERO, ++ node_id); + if (!q) + return NULL; + +- if (alloc_srcu) { +- blk_queue_flag_set(QUEUE_FLAG_HAS_SRCU, q); +- if (init_srcu_struct(q->srcu) != 0) +- goto fail_q; +- } +- + q->last_merge = NULL; + + q->id = ida_alloc(&blk_queue_ida, GFP_KERNEL); + if (q->id < 0) +- goto fail_srcu; ++ goto fail_q; + + q->stats = blk_alloc_queue_stats(); + if (!q->stats) +@@ -434,11 +427,8 @@ struct request_queue *blk_alloc_queue(int node_id, bool alloc_srcu) + blk_free_queue_stats(q->stats); + fail_id: + ida_free(&blk_queue_ida, q->id); +-fail_srcu: +- if (alloc_srcu) +- cleanup_srcu_struct(q->srcu); + fail_q: +- kmem_cache_free(blk_get_queue_kmem_cache(alloc_srcu), q); ++ kmem_cache_free(blk_requestq_cachep, q); + return NULL; + } + +@@ -1190,9 +1180,6 @@ int __init blk_dev_init(void) + sizeof_field(struct request, cmd_flags)); + BUILD_BUG_ON(REQ_OP_BITS + REQ_FLAG_BITS > 8 * + sizeof_field(struct bio, bi_opf)); +- BUILD_BUG_ON(ALIGN(offsetof(struct request_queue, srcu), +- __alignof__(struct request_queue)) != +- sizeof(struct request_queue)); + + /* used for unplugging and affects IO latency/throughput - HIGHPRI */ + kblockd_workqueue = alloc_workqueue("kblockd", +@@ -1203,10 +1190,6 @@ int __init blk_dev_init(void) + blk_requestq_cachep = kmem_cache_create("request_queue", + sizeof(struct request_queue), 0, SLAB_PANIC, NULL); + +- blk_requestq_srcu_cachep = kmem_cache_create("request_queue_srcu", +- sizeof(struct request_queue) + +- sizeof(struct srcu_struct), 0, SLAB_PANIC, NULL); +- + blk_debugfs_root = debugfs_create_dir("block", NULL); + + return 0; +diff --git a/block/blk-mq.c b/block/blk-mq.c +index aa67a52c5a069..f8c97d75b8d1a 100644 +--- a/block/blk-mq.c ++++ b/block/blk-mq.c +@@ -261,8 +261,8 @@ EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue_nowait); + */ + void blk_mq_wait_quiesce_done(struct request_queue *q) + { +- if (blk_queue_has_srcu(q)) +- synchronize_srcu(q->srcu); ++ if (q->tag_set->flags & BLK_MQ_F_BLOCKING) ++ 
synchronize_srcu(q->tag_set->srcu); + else + synchronize_rcu(); + } +@@ -4022,7 +4022,7 @@ static struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set, + struct request_queue *q; + int ret; + +- q = blk_alloc_queue(set->numa_node, set->flags & BLK_MQ_F_BLOCKING); ++ q = blk_alloc_queue(set->numa_node); + if (!q) + return ERR_PTR(-ENOMEM); + q->queuedata = queuedata; +@@ -4194,9 +4194,6 @@ static void blk_mq_update_poll_flag(struct request_queue *q) + int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, + struct request_queue *q) + { +- WARN_ON_ONCE(blk_queue_has_srcu(q) != +- !!(set->flags & BLK_MQ_F_BLOCKING)); +- + /* mark the queue as mq asap */ + q->mq_ops = set->ops; + +@@ -4453,8 +4450,18 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) + if (set->nr_maps == 1 && set->nr_hw_queues > nr_cpu_ids) + set->nr_hw_queues = nr_cpu_ids; + +- if (blk_mq_alloc_tag_set_tags(set, set->nr_hw_queues) < 0) +- return -ENOMEM; ++ if (set->flags & BLK_MQ_F_BLOCKING) { ++ set->srcu = kmalloc(sizeof(*set->srcu), GFP_KERNEL); ++ if (!set->srcu) ++ return -ENOMEM; ++ ret = init_srcu_struct(set->srcu); ++ if (ret) ++ goto out_free_srcu; ++ } ++ ++ ret = blk_mq_alloc_tag_set_tags(set, set->nr_hw_queues); ++ if (ret) ++ goto out_cleanup_srcu; + + ret = -ENOMEM; + for (i = 0; i < set->nr_maps; i++) { +@@ -4484,6 +4491,12 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) + } + kfree(set->tags); + set->tags = NULL; ++out_cleanup_srcu: ++ if (set->flags & BLK_MQ_F_BLOCKING) ++ cleanup_srcu_struct(set->srcu); ++out_free_srcu: ++ if (set->flags & BLK_MQ_F_BLOCKING) ++ kfree(set->srcu); + return ret; + } + EXPORT_SYMBOL(blk_mq_alloc_tag_set); +@@ -4523,6 +4536,10 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set) + + kfree(set->tags); + set->tags = NULL; ++ if (set->flags & BLK_MQ_F_BLOCKING) { ++ cleanup_srcu_struct(set->srcu); ++ kfree(set->srcu); ++ } + } + EXPORT_SYMBOL(blk_mq_free_tag_set); + +diff --git a/block/blk-mq.h b/block/blk-mq.h +index 0b2870839cdd6..ef59fee62780d 100644 +--- a/block/blk-mq.h ++++ b/block/blk-mq.h +@@ -377,17 +377,17 @@ static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx, + /* run the code block in @dispatch_ops with rcu/srcu read lock held */ + #define __blk_mq_run_dispatch_ops(q, check_sleep, dispatch_ops) \ + do { \ +- if (!blk_queue_has_srcu(q)) { \ +- rcu_read_lock(); \ +- (dispatch_ops); \ +- rcu_read_unlock(); \ +- } else { \ ++ if ((q)->tag_set->flags & BLK_MQ_F_BLOCKING) { \ + int srcu_idx; \ + \ + might_sleep_if(check_sleep); \ +- srcu_idx = srcu_read_lock((q)->srcu); \ ++ srcu_idx = srcu_read_lock((q)->tag_set->srcu); \ + (dispatch_ops); \ +- srcu_read_unlock((q)->srcu, srcu_idx); \ ++ srcu_read_unlock((q)->tag_set->srcu, srcu_idx); \ ++ } else { \ ++ rcu_read_lock(); \ ++ (dispatch_ops); \ ++ rcu_read_unlock(); \ + } \ + } while (0) + +diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c +index e71b3b43927c0..e7871665825a3 100644 +--- a/block/blk-sysfs.c ++++ b/block/blk-sysfs.c +@@ -739,10 +739,8 @@ queue_attr_store(struct kobject *kobj, struct attribute *attr, + + static void blk_free_queue_rcu(struct rcu_head *rcu_head) + { +- struct request_queue *q = container_of(rcu_head, struct request_queue, +- rcu_head); +- +- kmem_cache_free(blk_get_queue_kmem_cache(blk_queue_has_srcu(q)), q); ++ kmem_cache_free(blk_requestq_cachep, ++ container_of(rcu_head, struct request_queue, rcu_head)); + } + + /** +@@ -779,9 +777,6 @@ static void blk_release_queue(struct kobject *kobj) + if (queue_is_mq(q)) + blk_mq_release(q); + +- if 
(blk_queue_has_srcu(q)) +- cleanup_srcu_struct(q->srcu); +- + ida_free(&blk_queue_ida, q->id); + call_rcu(&q->rcu_head, blk_free_queue_rcu); + } +diff --git a/block/blk.h b/block/blk.h +index a186ea20f39d8..4849a2efa4c50 100644 +--- a/block/blk.h ++++ b/block/blk.h +@@ -27,7 +27,6 @@ struct blk_flush_queue { + }; + + extern struct kmem_cache *blk_requestq_cachep; +-extern struct kmem_cache *blk_requestq_srcu_cachep; + extern struct kobj_type blk_queue_ktype; + extern struct ida blk_queue_ida; + +@@ -428,13 +427,7 @@ int bio_add_hw_page(struct request_queue *q, struct bio *bio, + struct page *page, unsigned int len, unsigned int offset, + unsigned int max_sectors, bool *same_page); + +-static inline struct kmem_cache *blk_get_queue_kmem_cache(bool srcu) +-{ +- if (srcu) +- return blk_requestq_srcu_cachep; +- return blk_requestq_cachep; +-} +-struct request_queue *blk_alloc_queue(int node_id, bool alloc_srcu); ++struct request_queue *blk_alloc_queue(int node_id); + + int disk_scan_partitions(struct gendisk *disk, fmode_t mode); + +diff --git a/block/genhd.c b/block/genhd.c +index 0b6928e948f31..4db1f905514c5 100644 +--- a/block/genhd.c ++++ b/block/genhd.c +@@ -1436,7 +1436,7 @@ struct gendisk *__blk_alloc_disk(int node, struct lock_class_key *lkclass) + struct request_queue *q; + struct gendisk *disk; + +- q = blk_alloc_queue(node, false); ++ q = blk_alloc_queue(node); + if (!q) + return NULL; + +diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h +index a9764cbf7f8d2..8e942e36f1c48 100644 +--- a/include/linux/blk-mq.h ++++ b/include/linux/blk-mq.h +@@ -7,6 +7,7 @@ + #include + #include + #include ++#include + + struct blk_mq_tags; + struct blk_flush_queue; +@@ -507,6 +508,8 @@ enum hctx_type { + * @tag_list_lock: Serializes tag_list accesses. + * @tag_list: List of the request queues that use this tag set. See also + * request_queue.tag_set_list. ++ * @srcu: Use as lock when type of the request queue is blocking ++ * (BLK_MQ_F_BLOCKING). + */ + struct blk_mq_tag_set { + struct blk_mq_queue_map map[HCTX_MAX_TYPES]; +@@ -527,6 +530,7 @@ struct blk_mq_tag_set { + + struct mutex tag_list_lock; + struct list_head tag_list; ++ struct srcu_struct *srcu; + }; + + /** +diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h +index 891f8cbcd0436..36c286d22fb23 100644 +--- a/include/linux/blkdev.h ++++ b/include/linux/blkdev.h +@@ -22,7 +22,6 @@ + #include + #include + #include +-#include + #include + #include + +@@ -544,18 +543,11 @@ struct request_queue { + struct mutex debugfs_mutex; + + bool mq_sysfs_init_done; +- +- /** +- * @srcu: Sleepable RCU. Use as lock when type of the request queue +- * is blocking (BLK_MQ_F_BLOCKING). 
Must be the last member +- */ +- struct srcu_struct srcu[]; + }; + + /* Keep blk_queue_flag_name[] in sync with the definitions below */ + #define QUEUE_FLAG_STOPPED 0 /* queue is stopped */ + #define QUEUE_FLAG_DYING 1 /* queue being torn down */ +-#define QUEUE_FLAG_HAS_SRCU 2 /* SRCU is allocated */ + #define QUEUE_FLAG_NOMERGES 3 /* disable merge attempts */ + #define QUEUE_FLAG_SAME_COMP 4 /* complete on same CPU-group */ + #define QUEUE_FLAG_FAIL_IO 5 /* fake timeout */ +@@ -591,7 +583,6 @@ bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q); + + #define blk_queue_stopped(q) test_bit(QUEUE_FLAG_STOPPED, &(q)->queue_flags) + #define blk_queue_dying(q) test_bit(QUEUE_FLAG_DYING, &(q)->queue_flags) +-#define blk_queue_has_srcu(q) test_bit(QUEUE_FLAG_HAS_SRCU, &(q)->queue_flags) + #define blk_queue_init_done(q) test_bit(QUEUE_FLAG_INIT_DONE, &(q)->queue_flags) + #define blk_queue_nomerges(q) test_bit(QUEUE_FLAG_NOMERGES, &(q)->queue_flags) + #define blk_queue_noxmerges(q) \ +-- +2.39.2 + diff --git a/queue-6.1/btrfs-rename-btrfs_fs_no_overcommit-to-btrfs_fs_acti.patch b/queue-6.1/btrfs-rename-btrfs_fs_no_overcommit-to-btrfs_fs_acti.patch new file mode 100644 index 00000000000..719b9598d51 --- /dev/null +++ b/queue-6.1/btrfs-rename-btrfs_fs_no_overcommit-to-btrfs_fs_acti.patch @@ -0,0 +1,75 @@ +From 3c2f1e320bc487469d4a469e50d0732ff734800e Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 1 Mar 2023 16:14:42 -0500 +Subject: btrfs: rename BTRFS_FS_NO_OVERCOMMIT to BTRFS_FS_ACTIVE_ZONE_TRACKING + +From: Josef Bacik + +[ Upstream commit bf1f1fec2724a33b67ec12032402ea75f2a83622 ] + +This flag only gets set when we're doing active zone tracking, and we're +going to need to use this flag for things related to this behavior. +Rename the flag to represent what it actually means for the file system +so it can be used in other ways and still make sense. + +Reviewed-by: Naohiro Aota +Reviewed-by: Johannes Thumshirn +Reviewed-by: Anand Jain +Signed-off-by: Josef Bacik +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Sasha Levin +--- + fs/btrfs/ctree.h | 7 ++----- + fs/btrfs/space-info.c | 2 +- + fs/btrfs/zoned.c | 3 +-- + 3 files changed, 4 insertions(+), 8 deletions(-) + +diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h +index a3febabacec04..3bcef0c4d6fc4 100644 +--- a/fs/btrfs/ctree.h ++++ b/fs/btrfs/ctree.h +@@ -590,11 +590,8 @@ enum { + /* Indicate we have to finish a zone to do next allocation. */ + BTRFS_FS_NEED_ZONE_FINISH, + +- /* +- * Indicate metadata over-commit is disabled. This is set when active +- * zone tracking is needed. +- */ +- BTRFS_FS_NO_OVERCOMMIT, ++ /* This is set when active zone tracking is needed. 
*/ ++ BTRFS_FS_ACTIVE_ZONE_TRACKING, + + #if BITS_PER_LONG == 32 + /* Indicate if we have error/warn message printed on 32bit systems */ +diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c +index 65c010159fb5f..c7642c00a65d0 100644 +--- a/fs/btrfs/space-info.c ++++ b/fs/btrfs/space-info.c +@@ -404,7 +404,7 @@ int btrfs_can_overcommit(struct btrfs_fs_info *fs_info, + return 0; + + used = btrfs_space_info_used(space_info, true); +- if (test_bit(BTRFS_FS_NO_OVERCOMMIT, &fs_info->flags) && ++ if (test_bit(BTRFS_FS_ACTIVE_ZONE_TRACKING, &fs_info->flags) && + (space_info->flags & BTRFS_BLOCK_GROUP_METADATA)) + avail = 0; + else +diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c +index 1b72004136ef8..0d88cc46ac5db 100644 +--- a/fs/btrfs/zoned.c ++++ b/fs/btrfs/zoned.c +@@ -538,8 +538,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) + } + atomic_set(&zone_info->active_zones_left, + max_active_zones - nactive); +- /* Overcommit does not work well with active zone tacking. */ +- set_bit(BTRFS_FS_NO_OVERCOMMIT, &fs_info->flags); ++ set_bit(BTRFS_FS_ACTIVE_ZONE_TRACKING, &fs_info->flags); + } + + /* Validate superblock log */ +-- +2.39.2 + diff --git a/queue-6.1/btrfs-zoned-count-fresh-bg-region-as-zone-unusable.patch b/queue-6.1/btrfs-zoned-count-fresh-bg-region-as-zone-unusable.patch new file mode 100644 index 00000000000..edaadfacb86 --- /dev/null +++ b/queue-6.1/btrfs-zoned-count-fresh-bg-region-as-zone-unusable.patch @@ -0,0 +1,134 @@ +From f9b02dd808f9c00eff8e1f2ce669b17bde9e444f Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 13 Mar 2023 16:06:13 +0900 +Subject: btrfs: zoned: count fresh BG region as zone unusable + +From: Naohiro Aota + +[ Upstream commit fa2068d7e922b434eba5bfb0131e6d39febfdb48 ] + +The naming of space_info->active_total_bytes is misleading. It counts +not only active block groups but also full ones which are previously +active but now inactive. That confusion results in a bug not counting +the full BGs into active_total_bytes on mount time. + +For a background, there are three kinds of block groups in terms of +activation. + + 1. Block groups never activated + 2. Block groups currently active + 3. Block groups previously active and currently inactive (due to fully + written or zone finish) + +What we really wanted to exclude from "total_bytes" is the total size of +BGs #1. They seem empty and allocatable but since they are not activated, +we cannot rely on them to do the space reservation. + +And, since BGs #1 never get activated, they should have no "used", +"reserved" and "pinned" bytes. + +OTOH, BGs #3 can be counted in the "total", since they are already full +we cannot allocate from them anyway. For them, "total_bytes == used + +reserved + pinned + zone_unusable" should hold. + +Tracking #2 and #3 as "active_total_bytes" (current implementation) is +confusing. And, tracking #1 and subtract that properly from "total_bytes" +every time you need space reservation is cumbersome. + +Instead, we can count the whole region of a newly allocated block group as +zone_unusable. Then, once that block group is activated, release +[0 .. zone_capacity] from the zone_unusable counters. With this, we can +eliminate the confusing ->active_total_bytes and the code will be common +among regular and the zoned mode. Also, no additional counter is needed +with this approach. 
+ +Fixes: 6a921de58992 ("btrfs: zoned: introduce space_info->active_total_bytes") +CC: stable@vger.kernel.org # 6.1+ +Signed-off-by: Naohiro Aota +Signed-off-by: David Sterba +Signed-off-by: Sasha Levin +--- + fs/btrfs/free-space-cache.c | 8 +++++++- + fs/btrfs/zoned.c | 24 +++++++++++++++++++----- + 2 files changed, 26 insertions(+), 6 deletions(-) + +diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c +index f4023651dd68b..6a8f2bd350f4b 100644 +--- a/fs/btrfs/free-space-cache.c ++++ b/fs/btrfs/free-space-cache.c +@@ -2684,8 +2684,13 @@ static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group, + bg_reclaim_threshold = READ_ONCE(sinfo->bg_reclaim_threshold); + + spin_lock(&ctl->tree_lock); ++ /* Count initial region as zone_unusable until it gets activated. */ + if (!used) + to_free = size; ++ else if (initial && ++ test_bit(BTRFS_FS_ACTIVE_ZONE_TRACKING, &block_group->fs_info->flags) && ++ (block_group->flags & (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM))) ++ to_free = 0; + else if (initial) + to_free = block_group->zone_capacity; + else if (offset >= block_group->alloc_offset) +@@ -2713,7 +2718,8 @@ static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group, + reclaimable_unusable = block_group->zone_unusable - + (block_group->length - block_group->zone_capacity); + /* All the region is now unusable. Mark it as unused and reclaim */ +- if (block_group->zone_unusable == block_group->length) { ++ if (block_group->zone_unusable == block_group->length && ++ block_group->alloc_offset) { + btrfs_mark_bg_unused(block_group); + } else if (bg_reclaim_threshold && + reclaimable_unusable >= +diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c +index 0d88cc46ac5db..e97c5a1ac95d6 100644 +--- a/fs/btrfs/zoned.c ++++ b/fs/btrfs/zoned.c +@@ -1575,9 +1575,19 @@ void btrfs_calc_zone_unusable(struct btrfs_block_group *cache) + return; + + WARN_ON(cache->bytes_super != 0); +- unusable = (cache->alloc_offset - cache->used) + +- (cache->length - cache->zone_capacity); +- free = cache->zone_capacity - cache->alloc_offset; ++ ++ /* Check for block groups never get activated */ ++ if (test_bit(BTRFS_FS_ACTIVE_ZONE_TRACKING, &cache->fs_info->flags) && ++ cache->flags & (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM) && ++ !test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &cache->runtime_flags) && ++ cache->alloc_offset == 0) { ++ unusable = cache->length; ++ free = 0; ++ } else { ++ unusable = (cache->alloc_offset - cache->used) + ++ (cache->length - cache->zone_capacity); ++ free = cache->zone_capacity - cache->alloc_offset; ++ } + + /* We only need ->free_space in ALLOC_SEQ block groups */ + cache->cached = BTRFS_CACHE_FINISHED; +@@ -1914,7 +1924,11 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group) + + /* Successfully activated all the zones */ + set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags); +- space_info->active_total_bytes += block_group->length; ++ WARN_ON(block_group->alloc_offset != 0); ++ if (block_group->zone_unusable == block_group->length) { ++ block_group->zone_unusable = block_group->length - block_group->zone_capacity; ++ space_info->bytes_zone_unusable -= block_group->zone_capacity; ++ } + spin_unlock(&block_group->lock); + btrfs_try_granting_tickets(fs_info, space_info); + spin_unlock(&space_info->lock); +@@ -2277,7 +2291,7 @@ int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info) + u64 avail; + + spin_lock(&block_group->lock); +- if (block_group->reserved || ++ if 
(block_group->reserved || block_group->alloc_offset == 0 || + (block_group->flags & BTRFS_BLOCK_GROUP_SYSTEM)) { + spin_unlock(&block_group->lock); + continue; +-- +2.39.2 + diff --git a/queue-6.1/cifs-avoid-race-conditions-with-parallel-reconnects.patch b/queue-6.1/cifs-avoid-race-conditions-with-parallel-reconnects.patch new file mode 100644 index 00000000000..af1a5e4bc55 --- /dev/null +++ b/queue-6.1/cifs-avoid-race-conditions-with-parallel-reconnects.patch @@ -0,0 +1,333 @@ +From 0ab7a952cc892cdfb993bbd2897b5cc1f1e98858 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 20 Mar 2023 06:08:19 +0000 +Subject: cifs: avoid race conditions with parallel reconnects + +From: Shyam Prasad N + +[ Upstream commit bc962159e8e326af634a506508034a375bf2b858 ] + +When multiple processes/channels do reconnects in parallel +we used to return success immediately +negotiate/session-setup/tree-connect, causing race conditions +between processes that enter the function in parallel. +This caused several errors related to session not found to +show up during parallel reconnects. + +Signed-off-by: Shyam Prasad N +Reviewed-by: Paulo Alcantara (SUSE) +Cc: stable@vger.kernel.org +Signed-off-by: Steve French +Signed-off-by: Sasha Levin +--- + fs/cifs/connect.c | 48 ++++++++++++++++++++++++++++++----------- + fs/cifs/smb2pdu.c | 44 +++++++++++++++++++++---------------- + fs/cifs/smb2transport.c | 17 ++++++++++++--- + 3 files changed, 76 insertions(+), 33 deletions(-) + +diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c +index 43637c1283748..077c88c49dfdf 100644 +--- a/fs/cifs/connect.c ++++ b/fs/cifs/connect.c +@@ -261,31 +261,42 @@ cifs_mark_tcp_ses_conns_for_reconnect(struct TCP_Server_Info *server, + cifs_chan_update_iface(ses, server); + + spin_lock(&ses->chan_lock); +- if (!mark_smb_session && cifs_chan_needs_reconnect(ses, server)) +- goto next_session; ++ if (!mark_smb_session && cifs_chan_needs_reconnect(ses, server)) { ++ spin_unlock(&ses->chan_lock); ++ continue; ++ } + + if (mark_smb_session) + CIFS_SET_ALL_CHANS_NEED_RECONNECT(ses); + else + cifs_chan_set_need_reconnect(ses, server); + ++ cifs_dbg(FYI, "%s: channel connect bitmap: 0x%lx\n", ++ __func__, ses->chans_need_reconnect); ++ + /* If all channels need reconnect, then tcon needs reconnect */ +- if (!mark_smb_session && !CIFS_ALL_CHANS_NEED_RECONNECT(ses)) +- goto next_session; ++ if (!mark_smb_session && !CIFS_ALL_CHANS_NEED_RECONNECT(ses)) { ++ spin_unlock(&ses->chan_lock); ++ continue; ++ } ++ spin_unlock(&ses->chan_lock); + ++ spin_lock(&ses->ses_lock); + ses->ses_status = SES_NEED_RECON; ++ spin_unlock(&ses->ses_lock); + + list_for_each_entry(tcon, &ses->tcon_list, tcon_list) { + tcon->need_reconnect = true; ++ spin_lock(&tcon->tc_lock); + tcon->status = TID_NEED_RECON; ++ spin_unlock(&tcon->tc_lock); + } + if (ses->tcon_ipc) { + ses->tcon_ipc->need_reconnect = true; ++ spin_lock(&ses->tcon_ipc->tc_lock); + ses->tcon_ipc->status = TID_NEED_RECON; ++ spin_unlock(&ses->tcon_ipc->tc_lock); + } +- +-next_session: +- spin_unlock(&ses->chan_lock); + } + spin_unlock(&cifs_tcp_ses_lock); + } +@@ -4050,11 +4061,19 @@ cifs_negotiate_protocol(const unsigned int xid, struct cifs_ses *ses, + + /* only send once per connect */ + spin_lock(&server->srv_lock); +- if (!server->ops->need_neg(server) || ++ if (server->tcpStatus != CifsGood && ++ server->tcpStatus != CifsNew && + server->tcpStatus != CifsNeedNegotiate) { ++ spin_unlock(&server->srv_lock); ++ return -EHOSTDOWN; ++ } ++ ++ if (!server->ops->need_neg(server) && ++ server->tcpStatus == 
CifsGood) { + spin_unlock(&server->srv_lock); + return 0; + } ++ + server->tcpStatus = CifsInNegotiate; + spin_unlock(&server->srv_lock); + +@@ -4088,23 +4107,28 @@ cifs_setup_session(const unsigned int xid, struct cifs_ses *ses, + bool is_binding = false; + + spin_lock(&ses->ses_lock); ++ cifs_dbg(FYI, "%s: channel connect bitmap: 0x%lx\n", ++ __func__, ses->chans_need_reconnect); ++ + if (ses->ses_status != SES_GOOD && + ses->ses_status != SES_NEW && + ses->ses_status != SES_NEED_RECON) { + spin_unlock(&ses->ses_lock); +- return 0; ++ return -EHOSTDOWN; + } + + /* only send once per connect */ + spin_lock(&ses->chan_lock); +- if (CIFS_ALL_CHANS_GOOD(ses) || +- cifs_chan_in_reconnect(ses, server)) { ++ if (CIFS_ALL_CHANS_GOOD(ses)) { ++ if (ses->ses_status == SES_NEED_RECON) ++ ses->ses_status = SES_GOOD; + spin_unlock(&ses->chan_lock); + spin_unlock(&ses->ses_lock); + return 0; + } +- is_binding = !CIFS_ALL_CHANS_NEED_RECONNECT(ses); ++ + cifs_chan_set_in_reconnect(ses, server); ++ is_binding = !CIFS_ALL_CHANS_NEED_RECONNECT(ses); + spin_unlock(&ses->chan_lock); + + if (!is_binding) +diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c +index 83d04cd2f9df8..f0b1ae0835d71 100644 +--- a/fs/cifs/smb2pdu.c ++++ b/fs/cifs/smb2pdu.c +@@ -199,6 +199,7 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon, + } + spin_unlock(&server->srv_lock); + ++again: + rc = cifs_wait_for_server_reconnect(server, tcon->retry); + if (rc) + return rc; +@@ -217,6 +218,7 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon, + + nls_codepage = load_nls_default(); + ++ mutex_lock(&ses->session_mutex); + /* + * Recheck after acquire mutex. If another thread is negotiating + * and the server never sends an answer the socket will be closed +@@ -225,6 +227,11 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon, + spin_lock(&server->srv_lock); + if (server->tcpStatus == CifsNeedReconnect) { + spin_unlock(&server->srv_lock); ++ mutex_unlock(&ses->session_mutex); ++ ++ if (tcon->retry) ++ goto again; ++ + rc = -EHOSTDOWN; + goto out; + } +@@ -234,19 +241,22 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon, + * need to prevent multiple threads trying to simultaneously + * reconnect the same SMB session + */ ++ spin_lock(&ses->ses_lock); + spin_lock(&ses->chan_lock); +- if (!cifs_chan_needs_reconnect(ses, server)) { ++ if (!cifs_chan_needs_reconnect(ses, server) && ++ ses->ses_status == SES_GOOD) { + spin_unlock(&ses->chan_lock); +- ++ spin_unlock(&ses->ses_lock); + /* this means that we only need to tree connect */ + if (tcon->need_reconnect) + goto skip_sess_setup; + ++ mutex_unlock(&ses->session_mutex); + goto out; + } + spin_unlock(&ses->chan_lock); ++ spin_unlock(&ses->ses_lock); + +- mutex_lock(&ses->session_mutex); + rc = cifs_negotiate_protocol(0, ses, server); + if (!rc) { + rc = cifs_setup_session(0, ses, server, nls_codepage); +@@ -262,10 +272,8 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon, + mutex_unlock(&ses->session_mutex); + goto out; + } +- mutex_unlock(&ses->session_mutex); + + skip_sess_setup: +- mutex_lock(&ses->session_mutex); + if (!tcon->need_reconnect) { + mutex_unlock(&ses->session_mutex); + goto out; +@@ -280,7 +288,7 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon, + cifs_dbg(FYI, "reconnect tcon rc = %d\n", rc); + if (rc) { + /* If sess reconnected but tcon didn't, something strange ... 
*/ +- pr_warn_once("reconnect tcon failed rc = %d\n", rc); ++ cifs_dbg(VFS, "reconnect tcon failed rc = %d\n", rc); + goto out; + } + +@@ -1252,9 +1260,9 @@ SMB2_sess_alloc_buffer(struct SMB2_sess_data *sess_data) + if (rc) + return rc; + +- spin_lock(&ses->chan_lock); +- is_binding = !CIFS_ALL_CHANS_NEED_RECONNECT(ses); +- spin_unlock(&ses->chan_lock); ++ spin_lock(&ses->ses_lock); ++ is_binding = (ses->ses_status == SES_GOOD); ++ spin_unlock(&ses->ses_lock); + + if (is_binding) { + req->hdr.SessionId = cpu_to_le64(ses->Suid); +@@ -1412,9 +1420,9 @@ SMB2_auth_kerberos(struct SMB2_sess_data *sess_data) + goto out_put_spnego_key; + } + +- spin_lock(&ses->chan_lock); +- is_binding = !CIFS_ALL_CHANS_NEED_RECONNECT(ses); +- spin_unlock(&ses->chan_lock); ++ spin_lock(&ses->ses_lock); ++ is_binding = (ses->ses_status == SES_GOOD); ++ spin_unlock(&ses->ses_lock); + + /* keep session key if binding */ + if (!is_binding) { +@@ -1538,9 +1546,9 @@ SMB2_sess_auth_rawntlmssp_negotiate(struct SMB2_sess_data *sess_data) + + cifs_dbg(FYI, "rawntlmssp session setup challenge phase\n"); + +- spin_lock(&ses->chan_lock); +- is_binding = !CIFS_ALL_CHANS_NEED_RECONNECT(ses); +- spin_unlock(&ses->chan_lock); ++ spin_lock(&ses->ses_lock); ++ is_binding = (ses->ses_status == SES_GOOD); ++ spin_unlock(&ses->ses_lock); + + /* keep existing ses id and flags if binding */ + if (!is_binding) { +@@ -1606,9 +1614,9 @@ SMB2_sess_auth_rawntlmssp_authenticate(struct SMB2_sess_data *sess_data) + + rsp = (struct smb2_sess_setup_rsp *)sess_data->iov[0].iov_base; + +- spin_lock(&ses->chan_lock); +- is_binding = !CIFS_ALL_CHANS_NEED_RECONNECT(ses); +- spin_unlock(&ses->chan_lock); ++ spin_lock(&ses->ses_lock); ++ is_binding = (ses->ses_status == SES_GOOD); ++ spin_unlock(&ses->ses_lock); + + /* keep existing ses id and flags if binding */ + if (!is_binding) { +diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c +index d827b7547ffad..790acf65a0926 100644 +--- a/fs/cifs/smb2transport.c ++++ b/fs/cifs/smb2transport.c +@@ -81,6 +81,7 @@ int smb2_get_sign_key(__u64 ses_id, struct TCP_Server_Info *server, u8 *key) + struct cifs_ses *ses = NULL; + int i; + int rc = 0; ++ bool is_binding = false; + + spin_lock(&cifs_tcp_ses_lock); + +@@ -97,9 +98,12 @@ int smb2_get_sign_key(__u64 ses_id, struct TCP_Server_Info *server, u8 *key) + goto out; + + found: ++ spin_lock(&ses->ses_lock); + spin_lock(&ses->chan_lock); +- if (cifs_chan_needs_reconnect(ses, server) && +- !CIFS_ALL_CHANS_NEED_RECONNECT(ses)) { ++ ++ is_binding = (cifs_chan_needs_reconnect(ses, server) && ++ ses->ses_status == SES_GOOD); ++ if (is_binding) { + /* + * If we are in the process of binding a new channel + * to an existing session, use the master connection +@@ -107,6 +111,7 @@ int smb2_get_sign_key(__u64 ses_id, struct TCP_Server_Info *server, u8 *key) + */ + memcpy(key, ses->smb3signingkey, SMB3_SIGN_KEY_SIZE); + spin_unlock(&ses->chan_lock); ++ spin_unlock(&ses->ses_lock); + goto out; + } + +@@ -119,10 +124,12 @@ int smb2_get_sign_key(__u64 ses_id, struct TCP_Server_Info *server, u8 *key) + if (chan->server == server) { + memcpy(key, chan->signkey, SMB3_SIGN_KEY_SIZE); + spin_unlock(&ses->chan_lock); ++ spin_unlock(&ses->ses_lock); + goto out; + } + } + spin_unlock(&ses->chan_lock); ++ spin_unlock(&ses->ses_lock); + + cifs_dbg(VFS, + "%s: Could not find channel signing key for session 0x%llx\n", +@@ -392,11 +399,15 @@ generate_smb3signingkey(struct cifs_ses *ses, + bool is_binding = false; + int chan_index = 0; + ++ spin_lock(&ses->ses_lock); + 
spin_lock(&ses->chan_lock); +- is_binding = !CIFS_ALL_CHANS_NEED_RECONNECT(ses); ++ is_binding = (cifs_chan_needs_reconnect(ses, server) && ++ ses->ses_status == SES_GOOD); ++ + chan_index = cifs_ses_get_chan_index(ses, server); + /* TODO: introduce ref counting for channels when the can be freed */ + spin_unlock(&ses->chan_lock); ++ spin_unlock(&ses->ses_lock); + + /* + * All channels use the same encryption/decryption keys but +-- +2.39.2 + diff --git a/queue-6.1/cifs-prevent-data-race-in-cifs_reconnect_tcon.patch b/queue-6.1/cifs-prevent-data-race-in-cifs_reconnect_tcon.patch new file mode 100644 index 00000000000..1185e8560aa --- /dev/null +++ b/queue-6.1/cifs-prevent-data-race-in-cifs_reconnect_tcon.patch @@ -0,0 +1,255 @@ +From 3de8e3b54835786fe1ddf4048df5ff2822ba7bd9 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 28 Feb 2023 19:01:55 -0300 +Subject: cifs: prevent data race in cifs_reconnect_tcon() + +From: Paulo Alcantara + +[ Upstream commit 1bcd548d935a33c6fc58331405eb1b82fd6150de ] + +Make sure to get an up-to-date TCP_Server_Info::nr_targets value prior +to waiting the server to be reconnected in cifs_reconnect_tcon(). It +is set in cifs_tcp_ses_needs_reconnect() and protected by +TCP_Server_Info::srv_lock. + +Create a new cifs_wait_for_server_reconnect() helper that can be used +by both SMB2+ and CIFS reconnect code. + +Signed-off-by: Paulo Alcantara (SUSE) +Signed-off-by: Steve French +Stable-dep-of: bc962159e8e3 ("cifs: avoid race conditions with parallel reconnects") +Signed-off-by: Sasha Levin +--- + fs/cifs/cifsproto.h | 1 + + fs/cifs/cifssmb.c | 43 ++---------------------- + fs/cifs/misc.c | 44 ++++++++++++++++++++++++ + fs/cifs/smb2pdu.c | 82 ++++++++++++--------------------------------- + 4 files changed, 69 insertions(+), 101 deletions(-) + +diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h +index bc4475f6c0827..98513f5af3f96 100644 +--- a/fs/cifs/cifsproto.h ++++ b/fs/cifs/cifsproto.h +@@ -691,5 +691,6 @@ static inline int cifs_create_options(struct cifs_sb_info *cifs_sb, int options) + + struct super_block *cifs_get_tcon_super(struct cifs_tcon *tcon); + void cifs_put_tcon_super(struct super_block *sb); ++int cifs_wait_for_server_reconnect(struct TCP_Server_Info *server, bool retry); + + #endif /* _CIFSPROTO_H */ +diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c +index 6c6a7fc47f3e3..4bc6ba87baf4c 100644 +--- a/fs/cifs/cifssmb.c ++++ b/fs/cifs/cifssmb.c +@@ -70,7 +70,6 @@ cifs_reconnect_tcon(struct cifs_tcon *tcon, int smb_command) + struct cifs_ses *ses; + struct TCP_Server_Info *server; + struct nls_table *nls_codepage; +- int retries; + + /* + * SMBs NegProt, SessSetup, uLogoff do not have tcon yet so check for +@@ -98,45 +97,9 @@ cifs_reconnect_tcon(struct cifs_tcon *tcon, int smb_command) + } + spin_unlock(&tcon->tc_lock); + +- retries = server->nr_targets; +- +- /* +- * Give demultiplex thread up to 10 seconds to each target available for +- * reconnect -- should be greater than cifs socket timeout which is 7 +- * seconds. +- */ +- while (server->tcpStatus == CifsNeedReconnect) { +- rc = wait_event_interruptible_timeout(server->response_q, +- (server->tcpStatus != CifsNeedReconnect), +- 10 * HZ); +- if (rc < 0) { +- cifs_dbg(FYI, "%s: aborting reconnect due to a received signal by the process\n", +- __func__); +- return -ERESTARTSYS; +- } +- +- /* are we still trying to reconnect? 
*/ +- spin_lock(&server->srv_lock); +- if (server->tcpStatus != CifsNeedReconnect) { +- spin_unlock(&server->srv_lock); +- break; +- } +- spin_unlock(&server->srv_lock); +- +- if (retries && --retries) +- continue; +- +- /* +- * on "soft" mounts we wait once. Hard mounts keep +- * retrying until process is killed or server comes +- * back on-line +- */ +- if (!tcon->retry) { +- cifs_dbg(FYI, "gave up waiting on reconnect in smb_init\n"); +- return -EHOSTDOWN; +- } +- retries = server->nr_targets; +- } ++ rc = cifs_wait_for_server_reconnect(server, tcon->retry); ++ if (rc) ++ return rc; + + spin_lock(&ses->chan_lock); + if (!cifs_chan_needs_reconnect(ses, server) && !tcon->need_reconnect) { +diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c +index 4e54736a06996..832856aef4b7a 100644 +--- a/fs/cifs/misc.c ++++ b/fs/cifs/misc.c +@@ -1382,3 +1382,47 @@ int cifs_inval_name_dfs_link_error(const unsigned int xid, + return 0; + } + #endif ++ ++int cifs_wait_for_server_reconnect(struct TCP_Server_Info *server, bool retry) ++{ ++ int timeout = 10; ++ int rc; ++ ++ spin_lock(&server->srv_lock); ++ if (server->tcpStatus != CifsNeedReconnect) { ++ spin_unlock(&server->srv_lock); ++ return 0; ++ } ++ timeout *= server->nr_targets; ++ spin_unlock(&server->srv_lock); ++ ++ /* ++ * Give demultiplex thread up to 10 seconds to each target available for ++ * reconnect -- should be greater than cifs socket timeout which is 7 ++ * seconds. ++ * ++ * On "soft" mounts we wait once. Hard mounts keep retrying until ++ * process is killed or server comes back on-line. ++ */ ++ do { ++ rc = wait_event_interruptible_timeout(server->response_q, ++ (server->tcpStatus != CifsNeedReconnect), ++ timeout * HZ); ++ if (rc < 0) { ++ cifs_dbg(FYI, "%s: aborting reconnect due to received signal\n", ++ __func__); ++ return -ERESTARTSYS; ++ } ++ ++ /* are we still trying to reconnect? */ ++ spin_lock(&server->srv_lock); ++ if (server->tcpStatus != CifsNeedReconnect) { ++ spin_unlock(&server->srv_lock); ++ return 0; ++ } ++ spin_unlock(&server->srv_lock); ++ } while (retry); ++ ++ cifs_dbg(FYI, "%s: gave up waiting on reconnect\n", __func__); ++ return -EHOSTDOWN; ++} +diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c +index 6e6e44d8b4c79..83d04cd2f9df8 100644 +--- a/fs/cifs/smb2pdu.c ++++ b/fs/cifs/smb2pdu.c +@@ -139,66 +139,6 @@ smb2_hdr_assemble(struct smb2_hdr *shdr, __le16 smb2_cmd, + return; + } + +-static int wait_for_server_reconnect(struct TCP_Server_Info *server, +- __le16 smb2_command, bool retry) +-{ +- int timeout = 10; +- int rc; +- +- spin_lock(&server->srv_lock); +- if (server->tcpStatus != CifsNeedReconnect) { +- spin_unlock(&server->srv_lock); +- return 0; +- } +- timeout *= server->nr_targets; +- spin_unlock(&server->srv_lock); +- +- /* +- * Return to caller for TREE_DISCONNECT and LOGOFF and CLOSE +- * here since they are implicitly done when session drops. +- */ +- switch (smb2_command) { +- /* +- * BB Should we keep oplock break and add flush to exceptions? +- */ +- case SMB2_TREE_DISCONNECT: +- case SMB2_CANCEL: +- case SMB2_CLOSE: +- case SMB2_OPLOCK_BREAK: +- return -EAGAIN; +- } +- +- /* +- * Give demultiplex thread up to 10 seconds to each target available for +- * reconnect -- should be greater than cifs socket timeout which is 7 +- * seconds. +- * +- * On "soft" mounts we wait once. Hard mounts keep retrying until +- * process is killed or server comes back on-line. 
+- */ +- do { +- rc = wait_event_interruptible_timeout(server->response_q, +- (server->tcpStatus != CifsNeedReconnect), +- timeout * HZ); +- if (rc < 0) { +- cifs_dbg(FYI, "%s: aborting reconnect due to received signal\n", +- __func__); +- return -ERESTARTSYS; +- } +- +- /* are we still trying to reconnect? */ +- spin_lock(&server->srv_lock); +- if (server->tcpStatus != CifsNeedReconnect) { +- spin_unlock(&server->srv_lock); +- return 0; +- } +- spin_unlock(&server->srv_lock); +- } while (retry); +- +- cifs_dbg(FYI, "%s: gave up waiting on reconnect\n", __func__); +- return -EHOSTDOWN; +-} +- + static int + smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon, + struct TCP_Server_Info *server) +@@ -239,7 +179,27 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon, + (!tcon->ses->server) || !server) + return -EIO; + +- rc = wait_for_server_reconnect(server, smb2_command, tcon->retry); ++ spin_lock(&server->srv_lock); ++ if (server->tcpStatus == CifsNeedReconnect) { ++ /* ++ * Return to caller for TREE_DISCONNECT and LOGOFF and CLOSE ++ * here since they are implicitly done when session drops. ++ */ ++ switch (smb2_command) { ++ /* ++ * BB Should we keep oplock break and add flush to exceptions? ++ */ ++ case SMB2_TREE_DISCONNECT: ++ case SMB2_CANCEL: ++ case SMB2_CLOSE: ++ case SMB2_OPLOCK_BREAK: ++ spin_unlock(&server->srv_lock); ++ return -EAGAIN; ++ } ++ } ++ spin_unlock(&server->srv_lock); ++ ++ rc = cifs_wait_for_server_reconnect(server, tcon->retry); + if (rc) + return rc; + +-- +2.39.2 + diff --git a/queue-6.1/cifs-update-ip_addr-for-ses-only-for-primary-chan-se.patch b/queue-6.1/cifs-update-ip_addr-for-ses-only-for-primary-chan-se.patch new file mode 100644 index 00000000000..71baa5fa1dd --- /dev/null +++ b/queue-6.1/cifs-update-ip_addr-for-ses-only-for-primary-chan-se.patch @@ -0,0 +1,64 @@ +From 90cd3a627fdcdebe9437c5a51183285fcae5bc96 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 10 Feb 2023 17:41:17 +0000 +Subject: cifs: update ip_addr for ses only for primary chan setup + +From: Shyam Prasad N + +[ Upstream commit e77978de4765229e09c8fabcf4f8419ff367317f ] + +We update ses->ip_addr whenever we do a session setup. +But this should happen only for primary channel in mchan +scenario. + +Signed-off-by: Shyam Prasad N +Reviewed-by: Paulo Alcantara (SUSE) +Signed-off-by: Steve French +Stable-dep-of: bc962159e8e3 ("cifs: avoid race conditions with parallel reconnects") +Signed-off-by: Sasha Levin +--- + fs/cifs/connect.c | 18 +++++++++++------- + 1 file changed, 11 insertions(+), 7 deletions(-) + +diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c +index 7aecb1646b6fc..43637c1283748 100644 +--- a/fs/cifs/connect.c ++++ b/fs/cifs/connect.c +@@ -4082,16 +4082,12 @@ cifs_setup_session(const unsigned int xid, struct cifs_ses *ses, + struct nls_table *nls_info) + { + int rc = -ENOSYS; +- struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)&server->dstaddr; +- struct sockaddr_in *addr = (struct sockaddr_in *)&server->dstaddr; ++ struct TCP_Server_Info *pserver = CIFS_SERVER_IS_CHAN(server) ? 
server->primary_server : server; ++ struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)&pserver->dstaddr; ++ struct sockaddr_in *addr = (struct sockaddr_in *)&pserver->dstaddr; + bool is_binding = false; + + spin_lock(&ses->ses_lock); +- if (server->dstaddr.ss_family == AF_INET6) +- scnprintf(ses->ip_addr, sizeof(ses->ip_addr), "%pI6", &addr6->sin6_addr); +- else +- scnprintf(ses->ip_addr, sizeof(ses->ip_addr), "%pI4", &addr->sin_addr); +- + if (ses->ses_status != SES_GOOD && + ses->ses_status != SES_NEW && + ses->ses_status != SES_NEED_RECON) { +@@ -4115,6 +4111,14 @@ cifs_setup_session(const unsigned int xid, struct cifs_ses *ses, + ses->ses_status = SES_IN_SETUP; + spin_unlock(&ses->ses_lock); + ++ /* update ses ip_addr only for primary chan */ ++ if (server == pserver) { ++ if (server->dstaddr.ss_family == AF_INET6) ++ scnprintf(ses->ip_addr, sizeof(ses->ip_addr), "%pI6", &addr6->sin6_addr); ++ else ++ scnprintf(ses->ip_addr, sizeof(ses->ip_addr), "%pI4", &addr->sin_addr); ++ } ++ + if (!is_binding) { + ses->capabilities = server->capabilities; + if (!linuxExtEnabled) +-- +2.39.2 + diff --git a/queue-6.1/drm-msm-disp-dpu-fix-sc7280_pp-base-offset.patch b/queue-6.1/drm-msm-disp-dpu-fix-sc7280_pp-base-offset.patch new file mode 100644 index 00000000000..85a865ea11f --- /dev/null +++ b/queue-6.1/drm-msm-disp-dpu-fix-sc7280_pp-base-offset.patch @@ -0,0 +1,46 @@ +From 0dd04eeb2a1b3d70349342acfcccb3b4ec6b899c Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 27 Feb 2023 13:36:40 -0800 +Subject: drm/msm/disp/dpu: fix sc7280_pp base offset + +From: Kuogee Hsieh + +[ Upstream commit ce68153edb5b36ddf87a19ed5a85131498690bbf ] + +At sc7280, pingpong block is used to management the dither effects +to reduce distortion at panel. Currently pingpong-0 base offset is +wrongly set at 0x59000. This mistake will not cause system to crash. +However it will make dither not work. This patch correct sc7280 ping +pong-0 block base offset. 
+ +Changes in v2: +-- add more details info n regrading of pingpong block at commit text + +Fixes: 591e34a091d1 ("drm/msm/disp/dpu1: add support for display for SC7280 target") +Signed-off-by: Kuogee Hsieh +Reviewed-by: Abhinav Kumar +Reviewed-by: Dmitry Baryshkov +Patchwork: https://patchwork.freedesktop.org/patch/524332/ +Link: https://lore.kernel.org/r/1677533800-3125-1-git-send-email-quic_khsieh@quicinc.com +Signed-off-by: Abhinav Kumar +Signed-off-by: Sasha Levin +--- + drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.c b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.c +index b1131860ada17..32a3c42ec45b1 100644 +--- a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.c ++++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.c +@@ -1181,7 +1181,7 @@ static const struct dpu_pingpong_cfg sm8150_pp[] = { + }; + + static const struct dpu_pingpong_cfg sc7280_pp[] = { +- PP_BLK("pingpong_0", PINGPONG_0, 0x59000, 0, sc7280_pp_sblk, -1, -1), ++ PP_BLK("pingpong_0", PINGPONG_0, 0x69000, 0, sc7280_pp_sblk, -1, -1), + PP_BLK("pingpong_1", PINGPONG_1, 0x6a000, 0, sc7280_pp_sblk, -1, -1), + PP_BLK("pingpong_2", PINGPONG_2, 0x6b000, 0, sc7280_pp_sblk, -1, -1), + PP_BLK("pingpong_3", PINGPONG_3, 0x6c000, 0, sc7280_pp_sblk, -1, -1), +-- +2.39.2 + diff --git a/queue-6.1/drm-msm-dpu-correct-sm8250-and-sm8350-scaler.patch b/queue-6.1/drm-msm-dpu-correct-sm8250-and-sm8350-scaler.patch new file mode 100644 index 00000000000..dc9b3d00e64 --- /dev/null +++ b/queue-6.1/drm-msm-dpu-correct-sm8250-and-sm8350-scaler.patch @@ -0,0 +1,71 @@ +From 1be611e5125270a82315622a4066ec1ebb496e6c Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Sun, 12 Feb 2023 01:12:18 +0200 +Subject: drm/msm/dpu: correct sm8250 and sm8350 scaler + +From: Dmitry Baryshkov + +[ Upstream commit 03c0c3cb22a4ff29afba1b43f0330289ea80433f ] + +QSEED4 is a newer variant of QSEED3LITE, which should be used on +sm8250 and sm8350. Fix the DPU caps structure and used feature masks. 
+ +Fixes: d21fc5dfc3df ("drm/msm/dpu1: add support for qseed3lite used on sm8250") +Fixes: 0e91bcbb0016 ("drm/msm/dpu: Add SM8350 to hw catalog") +Signed-off-by: Dmitry Baryshkov +Reviewed-by: Abhinav Kumar +Patchwork: https://patchwork.freedesktop.org/patch/522229/ +Link: https://lore.kernel.org/r/20230211231259.1308718-10-dmitry.baryshkov@linaro.org +Signed-off-by: Abhinav Kumar +Signed-off-by: Sasha Levin +--- + drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.c | 18 +++++++++--------- + 1 file changed, 9 insertions(+), 9 deletions(-) + +diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.c b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.c +index bbd884c8e0cb1..b1131860ada17 100644 +--- a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.c ++++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.c +@@ -356,7 +356,7 @@ static const struct dpu_caps sc8180x_dpu_caps = { + static const struct dpu_caps sm8250_dpu_caps = { + .max_mixer_width = DEFAULT_DPU_OUTPUT_LINE_WIDTH, + .max_mixer_blendstages = 0xb, +- .qseed_type = DPU_SSPP_SCALER_QSEED3LITE, ++ .qseed_type = DPU_SSPP_SCALER_QSEED4, + .smart_dma_rev = DPU_SSPP_SMART_DMA_V2, /* TODO: v2.5 */ + .ubwc_version = DPU_HW_UBWC_VER_40, + .has_src_split = true, +@@ -855,22 +855,22 @@ static const struct dpu_sspp_cfg sc7180_sspp[] = { + }; + + static const struct dpu_sspp_sub_blks sm8250_vig_sblk_0 = +- _VIG_SBLK("0", 5, DPU_SSPP_SCALER_QSEED3LITE); ++ _VIG_SBLK("0", 5, DPU_SSPP_SCALER_QSEED4); + static const struct dpu_sspp_sub_blks sm8250_vig_sblk_1 = +- _VIG_SBLK("1", 6, DPU_SSPP_SCALER_QSEED3LITE); ++ _VIG_SBLK("1", 6, DPU_SSPP_SCALER_QSEED4); + static const struct dpu_sspp_sub_blks sm8250_vig_sblk_2 = +- _VIG_SBLK("2", 7, DPU_SSPP_SCALER_QSEED3LITE); ++ _VIG_SBLK("2", 7, DPU_SSPP_SCALER_QSEED4); + static const struct dpu_sspp_sub_blks sm8250_vig_sblk_3 = +- _VIG_SBLK("3", 8, DPU_SSPP_SCALER_QSEED3LITE); ++ _VIG_SBLK("3", 8, DPU_SSPP_SCALER_QSEED4); + + static const struct dpu_sspp_cfg sm8250_sspp[] = { +- SSPP_BLK("sspp_0", SSPP_VIG0, 0x4000, VIG_SM8250_MASK, ++ SSPP_BLK("sspp_0", SSPP_VIG0, 0x4000, VIG_SC7180_MASK, + sm8250_vig_sblk_0, 0, SSPP_TYPE_VIG, DPU_CLK_CTRL_VIG0), +- SSPP_BLK("sspp_1", SSPP_VIG1, 0x6000, VIG_SM8250_MASK, ++ SSPP_BLK("sspp_1", SSPP_VIG1, 0x6000, VIG_SC7180_MASK, + sm8250_vig_sblk_1, 4, SSPP_TYPE_VIG, DPU_CLK_CTRL_VIG1), +- SSPP_BLK("sspp_2", SSPP_VIG2, 0x8000, VIG_SM8250_MASK, ++ SSPP_BLK("sspp_2", SSPP_VIG2, 0x8000, VIG_SC7180_MASK, + sm8250_vig_sblk_2, 8, SSPP_TYPE_VIG, DPU_CLK_CTRL_VIG2), +- SSPP_BLK("sspp_3", SSPP_VIG3, 0xa000, VIG_SM8250_MASK, ++ SSPP_BLK("sspp_3", SSPP_VIG3, 0xa000, VIG_SC7180_MASK, + sm8250_vig_sblk_3, 12, SSPP_TYPE_VIG, DPU_CLK_CTRL_VIG3), + SSPP_BLK("sspp_8", SSPP_DMA0, 0x24000, DMA_SDM845_MASK, + sdm845_dma_sblk_0, 1, SSPP_TYPE_DMA, DPU_CLK_CTRL_DMA0), +-- +2.39.2 + diff --git a/queue-6.1/drm-msm-dpu-refactor-sc7280_pp-location.patch b/queue-6.1/drm-msm-dpu-refactor-sc7280_pp-location.patch new file mode 100644 index 00000000000..a68a464ec46 --- /dev/null +++ b/queue-6.1/drm-msm-dpu-refactor-sc7280_pp-location.patch @@ -0,0 +1,59 @@ +From 134315db11b46b7cc9edd050b7b29518262e7533 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 28 Oct 2022 14:08:05 +0200 +Subject: drm/msm/dpu: Refactor sc7280_pp location + +From: Robert Foss + +[ Upstream commit 1a5b5372e3b0a4cc65a0cbb724b1b0859f4ac63c ] + +The sc7280_pp declaration is not located by the other _pp +declarations, but rather hidden around the _merge_3d +declarations. Let's fix this to avoid confusion. 
+ +Signed-off-by: Robert Foss +Reviewed-by: Dmitry Baryshkov +Patchwork: https://patchwork.freedesktop.org/patch/509153/ +Link: https://lore.kernel.org/r/20221028120812.339100-3-robert.foss@linaro.org +Signed-off-by: Dmitry Baryshkov +Stable-dep-of: 03c0c3cb22a4 ("drm/msm/dpu: correct sm8250 and sm8350 scaler") +Signed-off-by: Sasha Levin +--- + drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.c | 14 +++++++------- + 1 file changed, 7 insertions(+), 7 deletions(-) + +diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.c b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.c +index 41c93a18d5cb3..bbd884c8e0cb1 100644 +--- a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.c ++++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.c +@@ -1180,6 +1180,13 @@ static const struct dpu_pingpong_cfg sm8150_pp[] = { + -1), + }; + ++static const struct dpu_pingpong_cfg sc7280_pp[] = { ++ PP_BLK("pingpong_0", PINGPONG_0, 0x59000, 0, sc7280_pp_sblk, -1, -1), ++ PP_BLK("pingpong_1", PINGPONG_1, 0x6a000, 0, sc7280_pp_sblk, -1, -1), ++ PP_BLK("pingpong_2", PINGPONG_2, 0x6b000, 0, sc7280_pp_sblk, -1, -1), ++ PP_BLK("pingpong_3", PINGPONG_3, 0x6c000, 0, sc7280_pp_sblk, -1, -1), ++}; ++ + static struct dpu_pingpong_cfg qcm2290_pp[] = { + PP_BLK("pingpong_0", PINGPONG_0, 0x70000, 0, sdm845_pp_sblk, + DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 8), +@@ -1203,13 +1210,6 @@ static const struct dpu_merge_3d_cfg sm8150_merge_3d[] = { + MERGE_3D_BLK("merge_3d_2", MERGE_3D_2, 0x83200), + }; + +-static const struct dpu_pingpong_cfg sc7280_pp[] = { +- PP_BLK("pingpong_0", PINGPONG_0, 0x59000, 0, sc7280_pp_sblk, -1, -1), +- PP_BLK("pingpong_1", PINGPONG_1, 0x6a000, 0, sc7280_pp_sblk, -1, -1), +- PP_BLK("pingpong_2", PINGPONG_2, 0x6b000, 0, sc7280_pp_sblk, -1, -1), +- PP_BLK("pingpong_3", PINGPONG_3, 0x6c000, 0, sc7280_pp_sblk, -1, -1), +-}; +- + /************************************************************* + * DSC sub blocks config + *************************************************************/ +-- +2.39.2 + diff --git a/queue-6.1/fsverity-don-t-drop-pagecache-at-end-of-fs_ioc_enabl.patch b/queue-6.1/fsverity-don-t-drop-pagecache-at-end-of-fs_ioc_enabl.patch new file mode 100644 index 00000000000..76b03c16535 --- /dev/null +++ b/queue-6.1/fsverity-don-t-drop-pagecache-at-end-of-fs_ioc_enabl.patch @@ -0,0 +1,73 @@ +From 7da9b6aea5d624d8b1133bfbd214e1647b3ebc74 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 14 Mar 2023 16:31:32 -0700 +Subject: fsverity: don't drop pagecache at end of FS_IOC_ENABLE_VERITY + +From: Eric Biggers + +[ Upstream commit a075bacde257f755bea0e53400c9f1cdd1b8e8e6 ] + +The full pagecache drop at the end of FS_IOC_ENABLE_VERITY is causing +performance problems and is hindering adoption of fsverity. It was +intended to solve a race condition where unverified pages might be left +in the pagecache. But actually it doesn't solve it fully. + +Since the incomplete solution for this race condition has too much +performance impact for it to be worth it, let's remove it for now. 
+ +Fixes: 3fda4c617e84 ("fs-verity: implement FS_IOC_ENABLE_VERITY ioctl") +Cc: stable@vger.kernel.org +Reviewed-by: Victor Hsieh +Link: https://lore.kernel.org/r/20230314235332.50270-1-ebiggers@kernel.org +Signed-off-by: Eric Biggers +Signed-off-by: Sasha Levin +--- + fs/verity/enable.c | 24 +++++++++++++----------- + 1 file changed, 13 insertions(+), 11 deletions(-) + +diff --git a/fs/verity/enable.c b/fs/verity/enable.c +index df6b499bf6a14..400c264bf8930 100644 +--- a/fs/verity/enable.c ++++ b/fs/verity/enable.c +@@ -390,25 +390,27 @@ int fsverity_ioctl_enable(struct file *filp, const void __user *uarg) + goto out_drop_write; + + err = enable_verity(filp, &arg); +- if (err) +- goto out_allow_write_access; + + /* +- * Some pages of the file may have been evicted from pagecache after +- * being used in the Merkle tree construction, then read into pagecache +- * again by another process reading from the file concurrently. Since +- * these pages didn't undergo verification against the file digest which +- * fs-verity now claims to be enforcing, we have to wipe the pagecache +- * to ensure that all future reads are verified. ++ * We no longer drop the inode's pagecache after enabling verity. This ++ * used to be done to try to avoid a race condition where pages could be ++ * evicted after being used in the Merkle tree construction, then ++ * re-instantiated by a concurrent read. Such pages are unverified, and ++ * the backing storage could have filled them with different content, so ++ * they shouldn't be used to fulfill reads once verity is enabled. ++ * ++ * But, dropping the pagecache has a big performance impact, and it ++ * doesn't fully solve the race condition anyway. So for those reasons, ++ * and also because this race condition isn't very important relatively ++ * speaking (especially for small-ish files, where the chance of a page ++ * being used, evicted, *and* re-instantiated all while enabling verity ++ * is quite small), we no longer drop the inode's pagecache. + */ +- filemap_write_and_wait(inode->i_mapping); +- invalidate_inode_pages2(inode->i_mapping); + + /* + * allow_write_access() is needed to pair with deny_write_access(). + * Regardless, the filesystem won't allow writing to verity files. + */ +-out_allow_write_access: + allow_write_access(filp); + out_drop_write: + mnt_drop_write_file(filp); +-- +2.39.2 + diff --git a/queue-6.1/kcsan-avoid-passing-g-for-test.patch b/queue-6.1/kcsan-avoid-passing-g-for-test.patch new file mode 100644 index 00000000000..73b7fb7fc6a --- /dev/null +++ b/queue-6.1/kcsan-avoid-passing-g-for-test.patch @@ -0,0 +1,50 @@ +From 6df0780c17f9b688e3a9a0921c2b2f62ca3d9820 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 16 Mar 2023 23:47:05 +0100 +Subject: kcsan: avoid passing -g for test + +From: Marco Elver + +[ Upstream commit 5eb39cde1e2487ba5ec1802dc5e58a77e700d99e ] + +Nathan reported that when building with GNU as and a version of clang that +defaults to DWARF5, the assembler will complain with: + + Error: non-constant .uleb128 is not supported + +This is because `-g` defaults to the compiler debug info default. If the +assembler does not support some of the directives used, the above errors +occur. To fix, remove the explicit passing of `-g`. + +All the test wants is that stack traces print valid function names, and +debug info is not required for that. (I currently cannot recall why I +added the explicit `-g`.) 
+ +Link: https://lkml.kernel.org/r/20230316224705.709984-2-elver@google.com +Fixes: 1fe84fd4a402 ("kcsan: Add test suite") +Signed-off-by: Marco Elver +Reported-by: Nathan Chancellor +Cc: Alexander Potapenko +Cc: Dmitry Vyukov +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Sasha Levin +--- + kernel/kcsan/Makefile | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/kernel/kcsan/Makefile b/kernel/kcsan/Makefile +index 8cf70f068d92d..a45f3dfc8d141 100644 +--- a/kernel/kcsan/Makefile ++++ b/kernel/kcsan/Makefile +@@ -16,6 +16,6 @@ obj-y := core.o debugfs.o report.o + KCSAN_INSTRUMENT_BARRIERS_selftest.o := y + obj-$(CONFIG_KCSAN_SELFTEST) += selftest.o + +-CFLAGS_kcsan_test.o := $(CFLAGS_KCSAN) -g -fno-omit-frame-pointer ++CFLAGS_kcsan_test.o := $(CFLAGS_KCSAN) -fno-omit-frame-pointer + CFLAGS_kcsan_test.o += $(DISABLE_STRUCTLEAK_PLUGIN) + obj-$(CONFIG_KCSAN_KUNIT_TEST) += kcsan_test.o +-- +2.39.2 + diff --git a/queue-6.1/kernel-kcsan-kcsan_test-build-without-structleak-plu.patch b/queue-6.1/kernel-kcsan-kcsan_test-build-without-structleak-plu.patch new file mode 100644 index 00000000000..6011d39aabd --- /dev/null +++ b/queue-6.1/kernel-kcsan-kcsan_test-build-without-structleak-plu.patch @@ -0,0 +1,44 @@ +From 0750f19e829df59a44aa66d8476fa2b5608f0352 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 28 Nov 2022 11:43:58 +0100 +Subject: kernel: kcsan: kcsan_test: build without structleak plugin + +From: Anders Roxell + +[ Upstream commit 6fcd4267a840d0536b8e5334ad5f31e4105fce85 ] + +Building kcsan_test with structleak plugin enabled makes the stack frame +size to grow. + +kernel/kcsan/kcsan_test.c:704:1: error: the frame size of 3296 bytes is larger than 2048 bytes [-Werror=frame-larger-than=] + +Turn off the structleak plugin checks for kcsan_test. + +Link: https://lkml.kernel.org/r/20221128104358.2660634-1-anders.roxell@linaro.org +Signed-off-by: Anders Roxell +Suggested-by: Arnd Bergmann +Acked-by: Marco Elver +Cc: Arnd Bergmann +Cc: David Gow +Cc: Jason A. Donenfeld +Cc: Kees Cook +Signed-off-by: Andrew Morton +Stable-dep-of: 5eb39cde1e24 ("kcsan: avoid passing -g for test") +Signed-off-by: Sasha Levin +--- + kernel/kcsan/Makefile | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/kernel/kcsan/Makefile b/kernel/kcsan/Makefile +index 4f35d1bced6a2..8cf70f068d92d 100644 +--- a/kernel/kcsan/Makefile ++++ b/kernel/kcsan/Makefile +@@ -17,4 +17,5 @@ KCSAN_INSTRUMENT_BARRIERS_selftest.o := y + obj-$(CONFIG_KCSAN_SELFTEST) += selftest.o + + CFLAGS_kcsan_test.o := $(CFLAGS_KCSAN) -g -fno-omit-frame-pointer ++CFLAGS_kcsan_test.o += $(DISABLE_STRUCTLEAK_PLUGIN) + obj-$(CONFIG_KCSAN_KUNIT_TEST) += kcsan_test.o +-- +2.39.2 + diff --git a/queue-6.1/net-ethernet-ti-am65-cpsw-cpts-fix-cpts-release-acti.patch b/queue-6.1/net-ethernet-ti-am65-cpsw-cpts-fix-cpts-release-acti.patch new file mode 100644 index 00000000000..5c1b68bc8b2 --- /dev/null +++ b/queue-6.1/net-ethernet-ti-am65-cpsw-cpts-fix-cpts-release-acti.patch @@ -0,0 +1,134 @@ +From 51156bb1ed96e686878d2a39f3ddf65548db677d Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 20 Jan 2023 12:37:31 +0530 +Subject: net: ethernet: ti: am65-cpsw/cpts: Fix CPTS release action + +From: Siddharth Vadapalli + +[ Upstream commit 4ad8766cd3982744e53f107f378d2c65b76ff9a8 ] + +The am65_cpts_release() function is registered as a devm_action in the +am65_cpts_create() function in am65-cpts driver. 
When the am65-cpsw driver +invokes am65_cpts_create(), am65_cpts_release() is added in the set of devm +actions associated with the am65-cpsw driver's device. + +In the event of probe failure or probe deferral, the platform_drv_probe() +function invokes dev_pm_domain_detach() which powers off the CPSW and the +CPSW's CPTS hardware, both of which share the same power domain. Since the +am65_cpts_disable() function invoked by the am65_cpts_release() function +attempts to reset the CPTS hardware by writing to its registers, the CPTS +hardware is assumed to be powered on at this point. However, the hardware +is powered off before the devm actions are executed. + +Fix this by getting rid of the devm action for am65_cpts_release() and +invoking it directly on the cleanup and exit paths. + +Fixes: f6bd59526ca5 ("net: ethernet: ti: introduce am654 common platform time sync driver") +Signed-off-by: Siddharth Vadapalli +Reviewed-by: Leon Romanovsky +Reviewed-by: Tony Nguyen +Reviewed-by: Roger Quadros +Signed-off-by: Paolo Abeni +Signed-off-by: Sasha Levin +--- + drivers/net/ethernet/ti/am65-cpsw-nuss.c | 2 ++ + drivers/net/ethernet/ti/am65-cpts.c | 15 +++++---------- + drivers/net/ethernet/ti/am65-cpts.h | 5 +++++ + 3 files changed, 12 insertions(+), 10 deletions(-) + +diff --git a/drivers/net/ethernet/ti/am65-cpsw-nuss.c b/drivers/net/ethernet/ti/am65-cpsw-nuss.c +index 00911e9360525..8ff1c84a23ce7 100644 +--- a/drivers/net/ethernet/ti/am65-cpsw-nuss.c ++++ b/drivers/net/ethernet/ti/am65-cpsw-nuss.c +@@ -2817,6 +2817,7 @@ static int am65_cpsw_nuss_probe(struct platform_device *pdev) + + err_free_phylink: + am65_cpsw_nuss_phylink_cleanup(common); ++ am65_cpts_release(common->cpts); + err_of_clear: + of_platform_device_destroy(common->mdio_dev, NULL); + err_pm_clear: +@@ -2845,6 +2846,7 @@ static int am65_cpsw_nuss_remove(struct platform_device *pdev) + */ + am65_cpsw_nuss_cleanup_ndev(common); + am65_cpsw_nuss_phylink_cleanup(common); ++ am65_cpts_release(common->cpts); + + of_platform_device_destroy(common->mdio_dev, NULL); + +diff --git a/drivers/net/ethernet/ti/am65-cpts.c b/drivers/net/ethernet/ti/am65-cpts.c +index e2f0fb286143b..9948ac14e68db 100644 +--- a/drivers/net/ethernet/ti/am65-cpts.c ++++ b/drivers/net/ethernet/ti/am65-cpts.c +@@ -918,14 +918,13 @@ static int am65_cpts_of_parse(struct am65_cpts *cpts, struct device_node *node) + return cpts_of_mux_clk_setup(cpts, node); + } + +-static void am65_cpts_release(void *data) ++void am65_cpts_release(struct am65_cpts *cpts) + { +- struct am65_cpts *cpts = data; +- + ptp_clock_unregister(cpts->ptp_clock); + am65_cpts_disable(cpts); + clk_disable_unprepare(cpts->refclk); + } ++EXPORT_SYMBOL_GPL(am65_cpts_release); + + struct am65_cpts *am65_cpts_create(struct device *dev, void __iomem *regs, + struct device_node *node) +@@ -1003,18 +1002,12 @@ struct am65_cpts *am65_cpts_create(struct device *dev, void __iomem *regs, + } + cpts->phc_index = ptp_clock_index(cpts->ptp_clock); + +- ret = devm_add_action_or_reset(dev, am65_cpts_release, cpts); +- if (ret) { +- dev_err(dev, "failed to add ptpclk reset action %d", ret); +- return ERR_PTR(ret); +- } +- + ret = devm_request_threaded_irq(dev, cpts->irq, NULL, + am65_cpts_interrupt, + IRQF_ONESHOT, dev_name(dev), cpts); + if (ret < 0) { + dev_err(cpts->dev, "error attaching irq %d\n", ret); +- return ERR_PTR(ret); ++ goto reset_ptpclk; + } + + dev_info(dev, "CPTS ver 0x%08x, freq:%u, add_val:%u\n", +@@ -1023,6 +1016,8 @@ struct am65_cpts *am65_cpts_create(struct device *dev, void __iomem *regs, + + 
return cpts; + ++reset_ptpclk: ++ am65_cpts_release(cpts); + refclk_disable: + clk_disable_unprepare(cpts->refclk); + return ERR_PTR(ret); +diff --git a/drivers/net/ethernet/ti/am65-cpts.h b/drivers/net/ethernet/ti/am65-cpts.h +index cf9fbc28fd032..c0ae0117e5737 100644 +--- a/drivers/net/ethernet/ti/am65-cpts.h ++++ b/drivers/net/ethernet/ti/am65-cpts.h +@@ -18,6 +18,7 @@ struct am65_cpts_estf_cfg { + }; + + #if IS_ENABLED(CONFIG_TI_K3_AM65_CPTS) ++void am65_cpts_release(struct am65_cpts *cpts); + struct am65_cpts *am65_cpts_create(struct device *dev, void __iomem *regs, + struct device_node *node); + int am65_cpts_phc_index(struct am65_cpts *cpts); +@@ -29,6 +30,10 @@ int am65_cpts_estf_enable(struct am65_cpts *cpts, int idx, + struct am65_cpts_estf_cfg *cfg); + void am65_cpts_estf_disable(struct am65_cpts *cpts, int idx); + #else ++static inline void am65_cpts_release(struct am65_cpts *cpts) ++{ ++} ++ + static inline struct am65_cpts *am65_cpts_create(struct device *dev, + void __iomem *regs, + struct device_node *node) +-- +2.39.2 + diff --git a/queue-6.1/net-mscc-ocelot-fix-stats-region-batching.patch b/queue-6.1/net-mscc-ocelot-fix-stats-region-batching.patch new file mode 100644 index 00000000000..8f1b2c569b0 --- /dev/null +++ b/queue-6.1/net-mscc-ocelot-fix-stats-region-batching.patch @@ -0,0 +1,86 @@ +From 1bfdf14cb0563cc7e444a87f59d86d4977edc782 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 21 Mar 2023 03:03:23 +0200 +Subject: net: mscc: ocelot: fix stats region batching + +From: Vladimir Oltean + +[ Upstream commit 6acc72a43eac78a309160d0a7512bbc59bcdd757 ] + +The blamed commit changed struct ocelot_stat_layout :: "u32 offset" to +"u32 reg". + +However, "u32 reg" is not quite a register address, but an enum +ocelot_reg, which in itself encodes an enum ocelot_target target in the +upper bits, and an index into the ocelot->map[target][] array in the +lower bits. + +So, whereas the previous code comparison between stats_layout[i].offset +and last + 1 was correct (because those "offsets" at the time were +32-bit relative addresses), the new code, comparing layout[i].reg to +last + 4 is not correct, because the "reg" here is an enum/index, not an +actual register address. + +What we want to compare are indeed register addresses, but to do that, +we need to actually go through the same motions as +__ocelot_bulk_read_ix() itself. + +With this bug, all statistics counters are deemed by +ocelot_prepare_stats_regions() as constituting their own region. +(Truncated) log on VSC9959 (Felix) below (prints added by me): + +Before: + +region of 1 contiguous counters starting with SYS:STAT:CNT[0x000] +region of 1 contiguous counters starting with SYS:STAT:CNT[0x001] +region of 1 contiguous counters starting with SYS:STAT:CNT[0x002] +... +region of 1 contiguous counters starting with SYS:STAT:CNT[0x041] +region of 1 contiguous counters starting with SYS:STAT:CNT[0x042] +region of 1 contiguous counters starting with SYS:STAT:CNT[0x080] +region of 1 contiguous counters starting with SYS:STAT:CNT[0x081] +... +region of 1 contiguous counters starting with SYS:STAT:CNT[0x0ac] +region of 1 contiguous counters starting with SYS:STAT:CNT[0x100] +region of 1 contiguous counters starting with SYS:STAT:CNT[0x101] +... 
+region of 1 contiguous counters starting with SYS:STAT:CNT[0x111] + +After: + +region of 67 contiguous counters starting with SYS:STAT:CNT[0x000] +region of 45 contiguous counters starting with SYS:STAT:CNT[0x080] +region of 18 contiguous counters starting with SYS:STAT:CNT[0x100] + +Since commit d87b1c08f38a ("net: mscc: ocelot: use bulk reads for +stats") intended bulking as a performance improvement, and since now, +with trivial-sized regions, performance is even worse than without +bulking at all, this could easily qualify as a performance regression. + +Fixes: d4c367650704 ("net: mscc: ocelot: keep ocelot_stat_layout by reg address, not offset") +Signed-off-by: Vladimir Oltean +Acked-by: Colin Foster +Tested-by: Colin Foster +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + drivers/net/ethernet/mscc/ocelot_stats.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/drivers/net/ethernet/mscc/ocelot_stats.c b/drivers/net/ethernet/mscc/ocelot_stats.c +index dbd20b125ceaf..0066219bb0e89 100644 +--- a/drivers/net/ethernet/mscc/ocelot_stats.c ++++ b/drivers/net/ethernet/mscc/ocelot_stats.c +@@ -392,7 +392,8 @@ static int ocelot_prepare_stats_regions(struct ocelot *ocelot) + if (!ocelot->stats_layout[i].reg) + continue; + +- if (region && ocelot->stats_layout[i].reg == last + 4) { ++ if (region && ocelot->map[SYS][ocelot->stats_layout[i].reg & REG_MASK] == ++ ocelot->map[SYS][last & REG_MASK] + 4) { + region->count++; + } else { + region = devm_kzalloc(ocelot->dev, sizeof(*region), +-- +2.39.2 + diff --git a/queue-6.1/riscv-ftrace-fixup-panic-by-disabling-preemption.patch b/queue-6.1/riscv-ftrace-fixup-panic-by-disabling-preemption.patch new file mode 100644 index 00000000000..15e842a170f --- /dev/null +++ b/queue-6.1/riscv-ftrace-fixup-panic-by-disabling-preemption.patch @@ -0,0 +1,57 @@ +From 46b4b428b523d1e1276c23969bbdde4ad654e1e5 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 12 Jan 2023 04:05:57 -0500 +Subject: riscv: ftrace: Fixup panic by disabling preemption + +From: Andy Chiu + +[ Upstream commit 8547649981e6631328cd64f583667501ae385531 ] + +In RISCV, we must use an AUIPC + JALR pair to encode an immediate, +forming a jump that jumps to an address over 4K. This may cause errors +if we want to enable kernel preemption and remove dependency from +patching code with stop_machine(). For example, if a task was switched +out on auipc. And, if we changed the ftrace function before it was +switched back, then it would jump to an address that has updated 11:0 +bits mixing with previous XLEN:12 part. + +p: patched area performed by dynamic ftrace +ftrace_prologue: +p| REG_S ra, -SZREG(sp) +p| auipc ra, 0x? ------------> preempted + ... + change ftrace function + ... 
+p| jalr -?(ra) <------------- switched back +p| REG_L ra, -SZREG(sp) +func: + xxx + ret + +Fixes: afc76b8b8011 ("riscv: Using PATCHABLE_FUNCTION_ENTRY instead of MCOUNT") +Signed-off-by: Andy Chiu +Signed-off-by: Guo Ren +Link: https://lore.kernel.org/r/20230112090603.1295340-2-guoren@kernel.org +Cc: stable@vger.kernel.org +Signed-off-by: Palmer Dabbelt +Signed-off-by: Sasha Levin +--- + arch/riscv/Kconfig | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig +index ae11d5647f9d4..06b9b2f60b9fb 100644 +--- a/arch/riscv/Kconfig ++++ b/arch/riscv/Kconfig +@@ -278,7 +278,7 @@ config ARCH_RV64I + select HAVE_DYNAMIC_FTRACE_WITH_REGS if HAVE_DYNAMIC_FTRACE + select HAVE_FTRACE_MCOUNT_RECORD if !XIP_KERNEL + select HAVE_FUNCTION_GRAPH_TRACER +- select HAVE_FUNCTION_TRACER if !XIP_KERNEL ++ select HAVE_FUNCTION_TRACER if !XIP_KERNEL && !PREEMPTION + select SWIOTLB if MMU + + endchoice +-- +2.39.2 + diff --git a/queue-6.1/series b/queue-6.1/series new file mode 100644 index 00000000000..441c62da5b1 --- /dev/null +++ b/queue-6.1/series @@ -0,0 +1,29 @@ +thunderbolt-limit-usb3-bandwidth-of-certain-intel-us.patch +cifs-update-ip_addr-for-ses-only-for-primary-chan-se.patch +cifs-prevent-data-race-in-cifs_reconnect_tcon.patch +cifs-avoid-race-conditions-with-parallel-reconnects.patch +zonefs-reorganize-code.patch +zonefs-simplify-io-error-handling.patch +zonefs-reduce-struct-zonefs_inode_info-size.patch +zonefs-separate-zone-information-from-inode-informat.patch +zonefs-fix-error-message-in-zonefs_file_dio_append.patch +fsverity-don-t-drop-pagecache-at-end-of-fs_ioc_enabl.patch +kernel-kcsan-kcsan_test-build-without-structleak-plu.patch +kcsan-avoid-passing-g-for-test.patch +btrfs-rename-btrfs_fs_no_overcommit-to-btrfs_fs_acti.patch +btrfs-zoned-count-fresh-bg-region-as-zone-unusable.patch +net-ethernet-ti-am65-cpsw-cpts-fix-cpts-release-acti.patch +riscv-ftrace-fixup-panic-by-disabling-preemption.patch +arm-dts-aspeed-p10bmc-update-battery-node-name.patch +drm-msm-dpu-refactor-sc7280_pp-location.patch +drm-msm-dpu-correct-sm8250-and-sm8350-scaler.patch +drm-msm-disp-dpu-fix-sc7280_pp-base-offset.patch +blk-mq-move-the-srcu_struct-used-for-quiescing-to-th.patch +blk-mq-fix-bad-unlock-balance-detected-on-q-srcu-in-.patch +tty-serial-fsl_lpuart-switch-to-new-dmaengine_termin.patch +tty-serial-fsl_lpuart-fix-race-on-rx-dma-shutdown.patch +tracing-add-.percent-suffix-option-to-histogram-valu.patch +tracing-add-.graph-suffix-option-to-histogram-value.patch +tracing-do-not-let-histogram-values-have-some-modifi.patch +net-mscc-ocelot-fix-stats-region-batching.patch +arm64-efi-set-nx-compat-flag-in-pe-coff-header.patch diff --git a/queue-6.1/thunderbolt-limit-usb3-bandwidth-of-certain-intel-us.patch b/queue-6.1/thunderbolt-limit-usb3-bandwidth-of-certain-intel-us.patch new file mode 100644 index 00000000000..1ff900e1455 --- /dev/null +++ b/queue-6.1/thunderbolt-limit-usb3-bandwidth-of-certain-intel-us.patch @@ -0,0 +1,138 @@ +From b0882211c51cbf4707c3bd1ea81d8feb116b01e1 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 31 Jan 2023 13:04:52 +0200 +Subject: thunderbolt: Limit USB3 bandwidth of certain Intel USB4 host routers + +From: Gil Fine + +[ Upstream commit f0a57dd33b3eadf540912cd130db727ea824d174 ] + +Current Intel USB4 host routers have hardware limitation that the USB3 +bandwidth cannot go higher than 16376 Mb/s. Work this around by adding a +new quirk that limits the bandwidth for the affected host routers. 
+ +Cc: stable@vger.kernel.org +Signed-off-by: Gil Fine +Signed-off-by: Mika Westerberg +Signed-off-by: Sasha Levin +--- + drivers/thunderbolt/quirks.c | 31 +++++++++++++++++++++++++++++++ + drivers/thunderbolt/tb.h | 3 +++ + drivers/thunderbolt/usb4.c | 17 +++++++++++++++-- + 3 files changed, 49 insertions(+), 2 deletions(-) + +diff --git a/drivers/thunderbolt/quirks.c b/drivers/thunderbolt/quirks.c +index ae28a03fa890b..1157b8869bcca 100644 +--- a/drivers/thunderbolt/quirks.c ++++ b/drivers/thunderbolt/quirks.c +@@ -26,6 +26,19 @@ static void quirk_clx_disable(struct tb_switch *sw) + tb_sw_dbg(sw, "disabling CL states\n"); + } + ++static void quirk_usb3_maximum_bandwidth(struct tb_switch *sw) ++{ ++ struct tb_port *port; ++ ++ tb_switch_for_each_port(sw, port) { ++ if (!tb_port_is_usb3_down(port)) ++ continue; ++ port->max_bw = 16376; ++ tb_port_dbg(port, "USB3 maximum bandwidth limited to %u Mb/s\n", ++ port->max_bw); ++ } ++} ++ + struct tb_quirk { + u16 hw_vendor_id; + u16 hw_device_id; +@@ -43,6 +56,24 @@ static const struct tb_quirk tb_quirks[] = { + * DP buffers. + */ + { 0x8087, 0x0b26, 0x0000, 0x0000, quirk_dp_credit_allocation }, ++ /* ++ * Limit the maximum USB3 bandwidth for the following Intel USB4 ++ * host routers due to a hardware issue. ++ */ ++ { 0x8087, PCI_DEVICE_ID_INTEL_ADL_NHI0, 0x0000, 0x0000, ++ quirk_usb3_maximum_bandwidth }, ++ { 0x8087, PCI_DEVICE_ID_INTEL_ADL_NHI1, 0x0000, 0x0000, ++ quirk_usb3_maximum_bandwidth }, ++ { 0x8087, PCI_DEVICE_ID_INTEL_RPL_NHI0, 0x0000, 0x0000, ++ quirk_usb3_maximum_bandwidth }, ++ { 0x8087, PCI_DEVICE_ID_INTEL_RPL_NHI1, 0x0000, 0x0000, ++ quirk_usb3_maximum_bandwidth }, ++ { 0x8087, PCI_DEVICE_ID_INTEL_MTL_M_NHI0, 0x0000, 0x0000, ++ quirk_usb3_maximum_bandwidth }, ++ { 0x8087, PCI_DEVICE_ID_INTEL_MTL_P_NHI0, 0x0000, 0x0000, ++ quirk_usb3_maximum_bandwidth }, ++ { 0x8087, PCI_DEVICE_ID_INTEL_MTL_P_NHI1, 0x0000, 0x0000, ++ quirk_usb3_maximum_bandwidth }, + /* + * CLx is not supported on AMD USB4 Yellow Carp and Pink Sardine platforms. + */ +diff --git a/drivers/thunderbolt/tb.h b/drivers/thunderbolt/tb.h +index e11d973a8f9b6..f034723b1b40e 100644 +--- a/drivers/thunderbolt/tb.h ++++ b/drivers/thunderbolt/tb.h +@@ -252,6 +252,8 @@ struct tb_switch { + * @ctl_credits: Buffers reserved for control path + * @dma_credits: Number of credits allocated for DMA tunneling for all + * DMA paths through this port. ++ * @max_bw: Maximum possible bandwidth through this adapter if set to ++ * non-zero. + * + * In USB4 terminology this structure represents an adapter (protocol or + * lane adapter). 
+@@ -277,6 +279,7 @@ struct tb_port { + unsigned int total_credits; + unsigned int ctl_credits; + unsigned int dma_credits; ++ unsigned int max_bw; + }; + + /** +diff --git a/drivers/thunderbolt/usb4.c b/drivers/thunderbolt/usb4.c +index cf8d4f769579e..3c821f5e44814 100644 +--- a/drivers/thunderbolt/usb4.c ++++ b/drivers/thunderbolt/usb4.c +@@ -1865,6 +1865,15 @@ int usb4_port_retimer_nvm_read(struct tb_port *port, u8 index, + usb4_port_retimer_nvm_read_block, &info); + } + ++static inline unsigned int ++usb4_usb3_port_max_bandwidth(const struct tb_port *port, unsigned int bw) ++{ ++ /* Take the possible bandwidth limitation into account */ ++ if (port->max_bw) ++ return min(bw, port->max_bw); ++ return bw; ++} ++ + /** + * usb4_usb3_port_max_link_rate() - Maximum support USB3 link rate + * @port: USB3 adapter port +@@ -1886,7 +1895,9 @@ int usb4_usb3_port_max_link_rate(struct tb_port *port) + return ret; + + lr = (val & ADP_USB3_CS_4_MSLR_MASK) >> ADP_USB3_CS_4_MSLR_SHIFT; +- return lr == ADP_USB3_CS_4_MSLR_20G ? 20000 : 10000; ++ ret = lr == ADP_USB3_CS_4_MSLR_20G ? 20000 : 10000; ++ ++ return usb4_usb3_port_max_bandwidth(port, ret); + } + + /** +@@ -1913,7 +1924,9 @@ int usb4_usb3_port_actual_link_rate(struct tb_port *port) + return 0; + + lr = val & ADP_USB3_CS_4_ALR_MASK; +- return lr == ADP_USB3_CS_4_ALR_20G ? 20000 : 10000; ++ ret = lr == ADP_USB3_CS_4_ALR_20G ? 20000 : 10000; ++ ++ return usb4_usb3_port_max_bandwidth(port, ret); + } + + static int usb4_usb3_port_cm_request(struct tb_port *port, bool request) +-- +2.39.2 + diff --git a/queue-6.1/tracing-add-.graph-suffix-option-to-histogram-value.patch b/queue-6.1/tracing-add-.graph-suffix-option-to-histogram-value.patch new file mode 100644 index 00000000000..8e2781ef242 --- /dev/null +++ b/queue-6.1/tracing-add-.graph-suffix-option-to-histogram-value.patch @@ -0,0 +1,240 @@ +From 30335d6f17cf3c839475858ac8a47b51ad446d01 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 20 Oct 2022 00:31:55 +0900 +Subject: tracing: Add .graph suffix option to histogram value + +From: Masami Hiramatsu (Google) + +[ Upstream commit a2c54256dec7510477e2b4f4db187e638f7cac37 ] + +Add the .graph suffix which shows the bar graph of the histogram value. + +For example, the below example shows that the bar graph +of the histogram of the runtime for each tasks. 
+ +------ + # cd /sys/kernel/debug/tracing/ + # echo hist:keys=pid:vals=runtime.graph:sort=pid > \ + events/sched/sched_stat_runtime/trigger + # sleep 10 + # cat events/sched/sched_stat_runtime/hist + # event histogram + # + # trigger info: hist:keys=pid:vals=hitcount,runtime.graph:sort=pid:size=2048 [active] + # + + { pid: 14 } hitcount: 2 runtime: + { pid: 16 } hitcount: 8 runtime: + { pid: 26 } hitcount: 1 runtime: + { pid: 57 } hitcount: 3 runtime: + { pid: 61 } hitcount: 20 runtime: ### + { pid: 66 } hitcount: 2 runtime: + { pid: 70 } hitcount: 3 runtime: + { pid: 72 } hitcount: 2 runtime: + { pid: 145 } hitcount: 14 runtime: #################### + { pid: 152 } hitcount: 5 runtime: ####### + { pid: 153 } hitcount: 2 runtime: #### + + Totals: + Hits: 62 + Entries: 11 + Dropped: 0 +------- + +Link: https://lore.kernel.org/linux-trace-kernel/166610813953.56030.10944148382315789485.stgit@devnote2 + +Signed-off-by: Masami Hiramatsu (Google) +Reviewed-by: Tom Zanussi +Tested-by: Tom Zanussi +Stable-dep-of: e0213434fe3e ("tracing: Do not let histogram values have some modifiers") +Signed-off-by: Sasha Levin +--- + kernel/trace/trace.c | 3 +- + kernel/trace/trace_events_hist.c | 77 +++++++++++++++++++++++++------- + 2 files changed, 63 insertions(+), 17 deletions(-) + +diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c +index f714ed1f1c673..78d69b9488e45 100644 +--- a/kernel/trace/trace.c ++++ b/kernel/trace/trace.c +@@ -5728,7 +5728,8 @@ static const char readme_msg[] = + "\t .log2 display log2 value rather than raw number\n" + "\t .buckets=size display values in groups of size rather than raw number\n" + "\t .usecs display a common_timestamp in microseconds\n" +- "\t .percent display a number of percentage value\n\n" ++ "\t .percent display a number of percentage value\n" ++ "\t .graph display a bar-graph of a value\n\n" + "\t The 'pause' parameter can be used to pause an existing hist\n" + "\t trigger or to start a hist trigger but not log any events\n" + "\t until told to do so. 'continue' can be used to start or\n" +diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c +index 1c207fbf5634f..8e0acf8009bde 100644 +--- a/kernel/trace/trace_events_hist.c ++++ b/kernel/trace/trace_events_hist.c +@@ -507,6 +507,7 @@ enum hist_field_flags { + HIST_FIELD_FL_BUCKET = 1 << 17, + HIST_FIELD_FL_CONST = 1 << 18, + HIST_FIELD_FL_PERCENT = 1 << 19, ++ HIST_FIELD_FL_GRAPH = 1 << 20, + }; + + struct var_defs { +@@ -1711,6 +1712,8 @@ static const char *get_hist_field_flags(struct hist_field *hist_field) + flags_str = "usecs"; + else if (hist_field->flags & HIST_FIELD_FL_PERCENT) + flags_str = "percent"; ++ else if (hist_field->flags & HIST_FIELD_FL_GRAPH) ++ flags_str = "graph"; + + return flags_str; + } +@@ -2327,6 +2330,10 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file, + if (*flags & (HIST_FIELD_FL_VAR | HIST_FIELD_FL_KEY)) + goto error; + *flags |= HIST_FIELD_FL_PERCENT; ++ } else if (strncmp(modifier, "graph", 5) == 0) { ++ if (*flags & (HIST_FIELD_FL_VAR | HIST_FIELD_FL_KEY)) ++ goto error; ++ *flags |= HIST_FIELD_FL_GRAPH; + } else { + error: + hist_err(tr, HIST_ERR_BAD_FIELD_MODIFIER, errpos(modifier)); +@@ -5322,20 +5329,52 @@ static inline unsigned int __get_percentage(u64 val, u64 total) + return val ? 
UINT_MAX : 0; + } + ++#define BAR_CHAR '#' ++ ++static inline const char *__fill_bar_str(char *buf, int size, u64 val, u64 max) ++{ ++ unsigned int len = __get_percentage(val, max); ++ int i; ++ ++ if (len == UINT_MAX) { ++ snprintf(buf, size, "[ERROR]"); ++ return buf; ++ } ++ ++ len = len * size / 10000; ++ for (i = 0; i < len && i < size; i++) ++ buf[i] = BAR_CHAR; ++ while (i < size) ++ buf[i++] = ' '; ++ buf[size] = '\0'; ++ ++ return buf; ++} ++ ++struct hist_val_stat { ++ u64 max; ++ u64 total; ++}; ++ + static void hist_trigger_print_val(struct seq_file *m, unsigned int idx, + const char *field_name, unsigned long flags, +- u64 *totals, struct tracing_map_elt *elt) ++ struct hist_val_stat *stats, ++ struct tracing_map_elt *elt) + { + u64 val = tracing_map_read_sum(elt, idx); + unsigned int pc; ++ char bar[21]; + + if (flags & HIST_FIELD_FL_PERCENT) { +- pc = __get_percentage(val, totals[idx]); ++ pc = __get_percentage(val, stats[idx].total); + if (pc == UINT_MAX) + seq_printf(m, " %s (%%):[ERROR]", field_name); + else + seq_printf(m, " %s (%%): %3u.%02u", field_name, + pc / 100, pc % 100); ++ } else if (flags & HIST_FIELD_FL_GRAPH) { ++ seq_printf(m, " %s: %20s", field_name, ++ __fill_bar_str(bar, 20, val, stats[idx].max)); + } else if (flags & HIST_FIELD_FL_HEX) { + seq_printf(m, " %s: %10llx", field_name, val); + } else { +@@ -5345,7 +5384,7 @@ static void hist_trigger_print_val(struct seq_file *m, unsigned int idx, + + static void hist_trigger_entry_print(struct seq_file *m, + struct hist_trigger_data *hist_data, +- u64 *totals, ++ struct hist_val_stat *stats, + void *key, + struct tracing_map_elt *elt) + { +@@ -5356,7 +5395,7 @@ static void hist_trigger_entry_print(struct seq_file *m, + hist_trigger_print_key(m, hist_data, key, elt); + + /* At first, show the raw hitcount always */ +- hist_trigger_print_val(m, i, "hitcount", 0, totals, elt); ++ hist_trigger_print_val(m, i, "hitcount", 0, stats, elt); + + for (i = 1; i < hist_data->n_vals; i++) { + field_name = hist_field_name(hist_data->fields[i], 0); +@@ -5366,7 +5405,7 @@ static void hist_trigger_entry_print(struct seq_file *m, + continue; + + seq_puts(m, " "); +- hist_trigger_print_val(m, i, field_name, flags, totals, elt); ++ hist_trigger_print_val(m, i, field_name, flags, stats, elt); + } + + print_actions(m, hist_data, elt); +@@ -5380,7 +5419,8 @@ static int print_entries(struct seq_file *m, + struct tracing_map_sort_entry **sort_entries = NULL; + struct tracing_map *map = hist_data->map; + int i, j, n_entries; +- u64 *totals = NULL; ++ struct hist_val_stat *stats = NULL; ++ u64 val; + + n_entries = tracing_map_sort_entries(map, hist_data->sort_keys, + hist_data->n_sort_keys, +@@ -5388,28 +5428,33 @@ static int print_entries(struct seq_file *m, + if (n_entries < 0) + return n_entries; + ++ /* Calculate the max and the total for each field if needed. 
*/ + for (j = 0; j < hist_data->n_vals; j++) { +- if (!(hist_data->fields[j]->flags & HIST_FIELD_FL_PERCENT)) ++ if (!(hist_data->fields[j]->flags & ++ (HIST_FIELD_FL_PERCENT | HIST_FIELD_FL_GRAPH))) + continue; +- if (!totals) { +- totals = kcalloc(hist_data->n_vals, sizeof(u64), +- GFP_KERNEL); +- if (!totals) { ++ if (!stats) { ++ stats = kcalloc(hist_data->n_vals, sizeof(*stats), ++ GFP_KERNEL); ++ if (!stats) { + n_entries = -ENOMEM; + goto out; + } + } +- for (i = 0; i < n_entries; i++) +- totals[j] += tracing_map_read_sum( +- sort_entries[i]->elt, j); ++ for (i = 0; i < n_entries; i++) { ++ val = tracing_map_read_sum(sort_entries[i]->elt, j); ++ stats[j].total += val; ++ if (stats[j].max < val) ++ stats[j].max = val; ++ } + } + + for (i = 0; i < n_entries; i++) +- hist_trigger_entry_print(m, hist_data, totals, ++ hist_trigger_entry_print(m, hist_data, stats, + sort_entries[i]->key, + sort_entries[i]->elt); + +- kfree(totals); ++ kfree(stats); + out: + tracing_map_destroy_sort_entries(sort_entries, n_entries); + +-- +2.39.2 + diff --git a/queue-6.1/tracing-add-.percent-suffix-option-to-histogram-valu.patch b/queue-6.1/tracing-add-.percent-suffix-option-to-histogram-valu.patch new file mode 100644 index 00000000000..a8ec432fc22 --- /dev/null +++ b/queue-6.1/tracing-add-.percent-suffix-option-to-histogram-valu.patch @@ -0,0 +1,226 @@ +From aa5e63131c95836508399c95d7db46f5581e9aa4 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 20 Oct 2022 00:31:55 +0900 +Subject: tracing: Add .percent suffix option to histogram values + +From: Masami Hiramatsu (Google) + +[ Upstream commit abaa5258ce5e5887a9de049f50a85dc023391a1c ] + +Add .percent suffix option to show the histogram values in percentage. +This feature is useful when we need yo undersntand the overall trend +for the histograms of large values. +E.g. this shows the runtime percentage for each tasks. 
+ +------ + # cd /sys/kernel/debug/tracing/ + # echo hist:keys=pid:vals=hitcount,runtime.percent:sort=pid > \ + events/sched/sched_stat_runtime/trigger + # sleep 10 + # cat events/sched/sched_stat_runtime/hist + # event histogram + # + # trigger info: hist:keys=pid:vals=hitcount,runtime.percent:sort=pid:size=2048 [active] + # + + { pid: 8 } hitcount: 7 runtime (%): 4.14 + { pid: 14 } hitcount: 5 runtime (%): 3.69 + { pid: 16 } hitcount: 11 runtime (%): 3.41 + { pid: 61 } hitcount: 41 runtime (%): 19.75 + { pid: 65 } hitcount: 4 runtime (%): 1.48 + { pid: 70 } hitcount: 6 runtime (%): 3.60 + { pid: 72 } hitcount: 2 runtime (%): 1.10 + { pid: 144 } hitcount: 10 runtime (%): 32.01 + { pid: 151 } hitcount: 8 runtime (%): 22.66 + { pid: 152 } hitcount: 2 runtime (%): 8.10 + + Totals: + Hits: 96 + Entries: 10 + Dropped: 0 +----- + +Link: https://lore.kernel.org/linux-trace-kernel/166610813077.56030.4238090506973562347.stgit@devnote2 + +Signed-off-by: Masami Hiramatsu (Google) +Reviewed-by: Tom Zanussi +Tested-by: Tom Zanussi +Stable-dep-of: e0213434fe3e ("tracing: Do not let histogram values have some modifiers") +Signed-off-by: Sasha Levin +--- + kernel/trace/trace.c | 3 +- + kernel/trace/trace_events_hist.c | 90 +++++++++++++++++++++++++++----- + 2 files changed, 78 insertions(+), 15 deletions(-) + +diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c +index 888980257340f..f714ed1f1c673 100644 +--- a/kernel/trace/trace.c ++++ b/kernel/trace/trace.c +@@ -5727,7 +5727,8 @@ static const char readme_msg[] = + "\t .syscall display a syscall id as a syscall name\n" + "\t .log2 display log2 value rather than raw number\n" + "\t .buckets=size display values in groups of size rather than raw number\n" +- "\t .usecs display a common_timestamp in microseconds\n\n" ++ "\t .usecs display a common_timestamp in microseconds\n" ++ "\t .percent display a number of percentage value\n\n" + "\t The 'pause' parameter can be used to pause an existing hist\n" + "\t trigger or to start a hist trigger but not log any events\n" + "\t until told to do so. 
'continue' can be used to start or\n" +diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c +index e3df03cdecbcb..1c207fbf5634f 100644 +--- a/kernel/trace/trace_events_hist.c ++++ b/kernel/trace/trace_events_hist.c +@@ -506,6 +506,7 @@ enum hist_field_flags { + HIST_FIELD_FL_ALIAS = 1 << 16, + HIST_FIELD_FL_BUCKET = 1 << 17, + HIST_FIELD_FL_CONST = 1 << 18, ++ HIST_FIELD_FL_PERCENT = 1 << 19, + }; + + struct var_defs { +@@ -1708,6 +1709,8 @@ static const char *get_hist_field_flags(struct hist_field *hist_field) + flags_str = "buckets"; + else if (hist_field->flags & HIST_FIELD_FL_TIMESTAMP_USECS) + flags_str = "usecs"; ++ else if (hist_field->flags & HIST_FIELD_FL_PERCENT) ++ flags_str = "percent"; + + return flags_str; + } +@@ -2320,6 +2323,10 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file, + if (ret || !(*buckets)) + goto error; + *flags |= HIST_FIELD_FL_BUCKET; ++ } else if (strncmp(modifier, "percent", 7) == 0) { ++ if (*flags & (HIST_FIELD_FL_VAR | HIST_FIELD_FL_KEY)) ++ goto error; ++ *flags |= HIST_FIELD_FL_PERCENT; + } else { + error: + hist_err(tr, HIST_ERR_BAD_FIELD_MODIFIER, errpos(modifier)); +@@ -5297,33 +5304,69 @@ static void hist_trigger_print_key(struct seq_file *m, + seq_puts(m, "}"); + } + ++/* Get the 100 times of the percentage of @val in @total */ ++static inline unsigned int __get_percentage(u64 val, u64 total) ++{ ++ if (!total) ++ goto div0; ++ ++ if (val < (U64_MAX / 10000)) ++ return (unsigned int)div64_ul(val * 10000, total); ++ ++ total = div64_u64(total, 10000); ++ if (!total) ++ goto div0; ++ ++ return (unsigned int)div64_ul(val, total); ++div0: ++ return val ? UINT_MAX : 0; ++} ++ ++static void hist_trigger_print_val(struct seq_file *m, unsigned int idx, ++ const char *field_name, unsigned long flags, ++ u64 *totals, struct tracing_map_elt *elt) ++{ ++ u64 val = tracing_map_read_sum(elt, idx); ++ unsigned int pc; ++ ++ if (flags & HIST_FIELD_FL_PERCENT) { ++ pc = __get_percentage(val, totals[idx]); ++ if (pc == UINT_MAX) ++ seq_printf(m, " %s (%%):[ERROR]", field_name); ++ else ++ seq_printf(m, " %s (%%): %3u.%02u", field_name, ++ pc / 100, pc % 100); ++ } else if (flags & HIST_FIELD_FL_HEX) { ++ seq_printf(m, " %s: %10llx", field_name, val); ++ } else { ++ seq_printf(m, " %s: %10llu", field_name, val); ++ } ++} ++ + static void hist_trigger_entry_print(struct seq_file *m, + struct hist_trigger_data *hist_data, ++ u64 *totals, + void *key, + struct tracing_map_elt *elt) + { + const char *field_name; +- unsigned int i; ++ unsigned int i = HITCOUNT_IDX; ++ unsigned long flags; + + hist_trigger_print_key(m, hist_data, key, elt); + +- seq_printf(m, " hitcount: %10llu", +- tracing_map_read_sum(elt, HITCOUNT_IDX)); ++ /* At first, show the raw hitcount always */ ++ hist_trigger_print_val(m, i, "hitcount", 0, totals, elt); + + for (i = 1; i < hist_data->n_vals; i++) { + field_name = hist_field_name(hist_data->fields[i], 0); ++ flags = hist_data->fields[i]->flags; + +- if (hist_data->fields[i]->flags & HIST_FIELD_FL_VAR || +- hist_data->fields[i]->flags & HIST_FIELD_FL_EXPR) ++ if (flags & HIST_FIELD_FL_VAR || flags & HIST_FIELD_FL_EXPR) + continue; + +- if (hist_data->fields[i]->flags & HIST_FIELD_FL_HEX) { +- seq_printf(m, " %s: %10llx", field_name, +- tracing_map_read_sum(elt, i)); +- } else { +- seq_printf(m, " %s: %10llu", field_name, +- tracing_map_read_sum(elt, i)); +- } ++ seq_puts(m, " "); ++ hist_trigger_print_val(m, i, field_name, flags, totals, elt); + } + + print_actions(m, hist_data, elt); 
+@@ -5336,7 +5379,8 @@ static int print_entries(struct seq_file *m, + { + struct tracing_map_sort_entry **sort_entries = NULL; + struct tracing_map *map = hist_data->map; +- int i, n_entries; ++ int i, j, n_entries; ++ u64 *totals = NULL; + + n_entries = tracing_map_sort_entries(map, hist_data->sort_keys, + hist_data->n_sort_keys, +@@ -5344,11 +5388,29 @@ static int print_entries(struct seq_file *m, + if (n_entries < 0) + return n_entries; + ++ for (j = 0; j < hist_data->n_vals; j++) { ++ if (!(hist_data->fields[j]->flags & HIST_FIELD_FL_PERCENT)) ++ continue; ++ if (!totals) { ++ totals = kcalloc(hist_data->n_vals, sizeof(u64), ++ GFP_KERNEL); ++ if (!totals) { ++ n_entries = -ENOMEM; ++ goto out; ++ } ++ } ++ for (i = 0; i < n_entries; i++) ++ totals[j] += tracing_map_read_sum( ++ sort_entries[i]->elt, j); ++ } ++ + for (i = 0; i < n_entries; i++) +- hist_trigger_entry_print(m, hist_data, ++ hist_trigger_entry_print(m, hist_data, totals, + sort_entries[i]->key, + sort_entries[i]->elt); + ++ kfree(totals); ++out: + tracing_map_destroy_sort_entries(sort_entries, n_entries); + + return n_entries; +-- +2.39.2 + diff --git a/queue-6.1/tracing-do-not-let-histogram-values-have-some-modifi.patch b/queue-6.1/tracing-do-not-let-histogram-values-have-some-modifi.patch new file mode 100644 index 00000000000..35287e79174 --- /dev/null +++ b/queue-6.1/tracing-do-not-let-histogram-values-have-some-modifi.patch @@ -0,0 +1,109 @@ +From 5e1ccb9a8074e6d6d95bf68dcbebc263b326c574 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 1 Mar 2023 20:00:52 -0500 +Subject: tracing: Do not let histogram values have some modifiers + +From: Steven Rostedt (Google) + +[ Upstream commit e0213434fe3e4a0d118923dc98d31e7ff1cd9e45 ] + +Histogram values can not be strings, stacktraces, graphs, symbols, +syscalls, or grouped in buckets or log. Give an error if a value is set to +do so. + +Note, the histogram code was not prepared to handle these modifiers for +histograms and caused a bug. 
+ +Mark Rutland reported: + + # echo 'p:copy_to_user __arch_copy_to_user n=$arg2' >> /sys/kernel/tracing/kprobe_events + # echo 'hist:keys=n:vals=hitcount.buckets=8:sort=hitcount' > /sys/kernel/tracing/events/kprobes/copy_to_user/trigger + # cat /sys/kernel/tracing/events/kprobes/copy_to_user/hist +[ 143.694628] Unable to handle kernel NULL pointer dereference at virtual address 0000000000000000 +[ 143.695190] Mem abort info: +[ 143.695362] ESR = 0x0000000096000004 +[ 143.695604] EC = 0x25: DABT (current EL), IL = 32 bits +[ 143.695889] SET = 0, FnV = 0 +[ 143.696077] EA = 0, S1PTW = 0 +[ 143.696302] FSC = 0x04: level 0 translation fault +[ 143.702381] Data abort info: +[ 143.702614] ISV = 0, ISS = 0x00000004 +[ 143.702832] CM = 0, WnR = 0 +[ 143.703087] user pgtable: 4k pages, 48-bit VAs, pgdp=00000000448f9000 +[ 143.703407] [0000000000000000] pgd=0000000000000000, p4d=0000000000000000 +[ 143.704137] Internal error: Oops: 0000000096000004 [#1] PREEMPT SMP +[ 143.704714] Modules linked in: +[ 143.705273] CPU: 0 PID: 133 Comm: cat Not tainted 6.2.0-00003-g6fc512c10a7c #3 +[ 143.706138] Hardware name: linux,dummy-virt (DT) +[ 143.706723] pstate: 80000005 (Nzcv daif -PAN -UAO -TCO -DIT -SSBS BTYPE=--) +[ 143.707120] pc : hist_field_name.part.0+0x14/0x140 +[ 143.707504] lr : hist_field_name.part.0+0x104/0x140 +[ 143.707774] sp : ffff800008333a30 +[ 143.707952] x29: ffff800008333a30 x28: 0000000000000001 x27: 0000000000400cc0 +[ 143.708429] x26: ffffd7a653b20260 x25: 0000000000000000 x24: ffff10d303ee5800 +[ 143.708776] x23: ffffd7a6539b27b0 x22: ffff10d303fb8c00 x21: 0000000000000001 +[ 143.709127] x20: ffff10d303ec2000 x19: 0000000000000000 x18: 0000000000000000 +[ 143.709478] x17: 0000000000000000 x16: 0000000000000000 x15: 0000000000000000 +[ 143.709824] x14: 0000000000000000 x13: 203a6f666e692072 x12: 6567676972742023 +[ 143.710179] x11: 0a230a6d6172676f x10: 000000000000002c x9 : ffffd7a6521e018c +[ 143.710584] x8 : 000000000000002c x7 : 7f7f7f7f7f7f7f7f x6 : 000000000000002c +[ 143.710915] x5 : ffff10d303b0103e x4 : ffffd7a653b20261 x3 : 000000000000003d +[ 143.711239] x2 : 0000000000020001 x1 : 0000000000000001 x0 : 0000000000000000 +[ 143.711746] Call trace: +[ 143.712115] hist_field_name.part.0+0x14/0x140 +[ 143.712642] hist_field_name.part.0+0x104/0x140 +[ 143.712925] hist_field_print+0x28/0x140 +[ 143.713125] event_hist_trigger_print+0x174/0x4d0 +[ 143.713348] hist_show+0xf8/0x980 +[ 143.713521] seq_read_iter+0x1bc/0x4b0 +[ 143.713711] seq_read+0x8c/0xc4 +[ 143.713876] vfs_read+0xc8/0x2a4 +[ 143.714043] ksys_read+0x70/0xfc +[ 143.714218] __arm64_sys_read+0x24/0x30 +[ 143.714400] invoke_syscall+0x50/0x120 +[ 143.714587] el0_svc_common.constprop.0+0x4c/0x100 +[ 143.714807] do_el0_svc+0x44/0xd0 +[ 143.714970] el0_svc+0x2c/0x84 +[ 143.715134] el0t_64_sync_handler+0xbc/0x140 +[ 143.715334] el0t_64_sync+0x190/0x194 +[ 143.715742] Code: a9bd7bfd 910003fd a90153f3 aa0003f3 (f9400000) +[ 143.716510] ---[ end trace 0000000000000000 ]--- +Segmentation fault + +Link: https://lkml.kernel.org/r/20230302020810.559462599@goodmis.org + +Cc: stable@vger.kernel.org +Cc: Masami Hiramatsu +Cc: Andrew Morton +Fixes: c6afad49d127f ("tracing: Add hist trigger 'sym' and 'sym-offset' modifiers") +Reported-by: Mark Rutland +Tested-by: Mark Rutland +Signed-off-by: Steven Rostedt (Google) +Signed-off-by: Sasha Levin +--- + kernel/trace/trace_events_hist.c | 9 +++++++++ + 1 file changed, 9 insertions(+) + +diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c +index 
8e0acf8009bde..2b2120ed2460f 100644 +--- a/kernel/trace/trace_events_hist.c ++++ b/kernel/trace/trace_events_hist.c +@@ -4193,6 +4193,15 @@ static int __create_val_field(struct hist_trigger_data *hist_data, + goto out; + } + ++ /* Some types cannot be a value */ ++ if (hist_field->flags & (HIST_FIELD_FL_GRAPH | HIST_FIELD_FL_PERCENT | ++ HIST_FIELD_FL_BUCKET | HIST_FIELD_FL_LOG2 | ++ HIST_FIELD_FL_SYM | HIST_FIELD_FL_SYM_OFFSET | ++ HIST_FIELD_FL_SYSCALL | HIST_FIELD_FL_STACKTRACE)) { ++ hist_err(file->tr, HIST_ERR_BAD_FIELD_MODIFIER, errpos(field_str)); ++ ret = -EINVAL; ++ } ++ + hist_data->fields[val_idx] = hist_field; + + ++hist_data->n_vals; +-- +2.39.2 + diff --git a/queue-6.1/tty-serial-fsl_lpuart-fix-race-on-rx-dma-shutdown.patch b/queue-6.1/tty-serial-fsl_lpuart-fix-race-on-rx-dma-shutdown.patch new file mode 100644 index 00000000000..dd193890d77 --- /dev/null +++ b/queue-6.1/tty-serial-fsl_lpuart-fix-race-on-rx-dma-shutdown.patch @@ -0,0 +1,109 @@ +From 8c4c957d731435fb3b6ada7fbd0956769ebdbee3 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 9 Mar 2023 14:43:02 +0100 +Subject: tty: serial: fsl_lpuart: fix race on RX DMA shutdown + +From: Alexander Sverdlin + +[ Upstream commit 1be6f2b15f902c02e055ae0b419ca789200473c9 ] + +From time to time DMA completion can come in the middle of DMA shutdown: + +: : +lpuart32_shutdown() + lpuart_dma_shutdown() + del_timer_sync() + lpuart_dma_rx_complete() + lpuart_copy_rx_to_tty() + mod_timer() + lpuart_dma_rx_free() + +When the timer fires a bit later, sport->dma_rx_desc is NULL: + +Unable to handle kernel NULL pointer dereference at virtual address 0000000000000004 +pc : lpuart_copy_rx_to_tty+0xcc/0x5bc +lr : lpuart_timer_func+0x1c/0x2c +Call trace: + lpuart_copy_rx_to_tty + lpuart_timer_func + call_timer_fn + __run_timers.part.0 + run_timer_softirq + __do_softirq + __irq_exit_rcu + irq_exit + handle_domain_irq + gic_handle_irq + call_on_irq_stack + do_interrupt_handler + ... + +To fix this fold del_timer_sync() into lpuart_dma_rx_free() after +dmaengine_terminate_sync() to make sure timer will not be re-started in +lpuart_copy_rx_to_tty() <= lpuart_dma_rx_complete(). 
+ +Fixes: 4a8588a1cf86 ("serial: fsl_lpuart: delete timer on shutdown") +Cc: stable +Signed-off-by: Alexander Sverdlin +Link: https://lore.kernel.org/r/20230309134302.74940-2-alexander.sverdlin@siemens.com +Signed-off-by: Greg Kroah-Hartman +Signed-off-by: Sasha Levin +--- + drivers/tty/serial/fsl_lpuart.c | 11 +++-------- + 1 file changed, 3 insertions(+), 8 deletions(-) + +diff --git a/drivers/tty/serial/fsl_lpuart.c b/drivers/tty/serial/fsl_lpuart.c +index 86e96696ab26d..cd98c04de0330 100644 +--- a/drivers/tty/serial/fsl_lpuart.c ++++ b/drivers/tty/serial/fsl_lpuart.c +@@ -1334,6 +1334,7 @@ static void lpuart_dma_rx_free(struct uart_port *port) + struct dma_chan *chan = sport->dma_rx_chan; + + dmaengine_terminate_sync(chan); ++ del_timer_sync(&sport->lpuart_timer); + dma_unmap_sg(chan->device->dev, &sport->rx_sgl, 1, DMA_FROM_DEVICE); + kfree(sport->rx_ring.buf); + sport->rx_ring.tail = 0; +@@ -1757,7 +1758,6 @@ static int lpuart32_startup(struct uart_port *port) + static void lpuart_dma_shutdown(struct lpuart_port *sport) + { + if (sport->lpuart_dma_rx_use) { +- del_timer_sync(&sport->lpuart_timer); + lpuart_dma_rx_free(&sport->port); + sport->lpuart_dma_rx_use = false; + } +@@ -1917,10 +1917,8 @@ lpuart_set_termios(struct uart_port *port, struct ktermios *termios, + * Since timer function acqures sport->port.lock, need to stop before + * acquring same lock because otherwise del_timer_sync() can deadlock. + */ +- if (old && sport->lpuart_dma_rx_use) { +- del_timer_sync(&sport->lpuart_timer); ++ if (old && sport->lpuart_dma_rx_use) + lpuart_dma_rx_free(&sport->port); +- } + + spin_lock_irqsave(&sport->port.lock, flags); + +@@ -2154,10 +2152,8 @@ lpuart32_set_termios(struct uart_port *port, struct ktermios *termios, + * Since timer function acqures sport->port.lock, need to stop before + * acquring same lock because otherwise del_timer_sync() can deadlock. + */ +- if (old && sport->lpuart_dma_rx_use) { +- del_timer_sync(&sport->lpuart_timer); ++ if (old && sport->lpuart_dma_rx_use) + lpuart_dma_rx_free(&sport->port); +- } + + spin_lock_irqsave(&sport->port.lock, flags); + +@@ -2850,7 +2846,6 @@ static int __maybe_unused lpuart_suspend(struct device *dev) + * Rx DMA path before suspend and start Rx DMA path on resume. + */ + if (irq_wake) { +- del_timer_sync(&sport->lpuart_timer); + lpuart_dma_rx_free(&sport->port); + } + +-- +2.39.2 + diff --git a/queue-6.1/tty-serial-fsl_lpuart-switch-to-new-dmaengine_termin.patch b/queue-6.1/tty-serial-fsl_lpuart-switch-to-new-dmaengine_termin.patch new file mode 100644 index 00000000000..cb7f37e0485 --- /dev/null +++ b/queue-6.1/tty-serial-fsl_lpuart-switch-to-new-dmaengine_termin.patch @@ -0,0 +1,64 @@ +From 9da1b65e8b605ca30bf0106f430c5bcab727147f Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 23 Nov 2022 10:36:19 +0800 +Subject: tty: serial: fsl_lpuart: switch to new dmaengine_terminate_* API + +From: Sherry Sun + +[ Upstream commit 8682ab0eea89c300ebb120c02ead3999ca5560a8 ] + +Convert dmaengine_terminate_all() calls to synchronous and asynchronous +versions where appropriate. 
+ +Signed-off-by: Sherry Sun +Link: https://lore.kernel.org/r/20221123023619.30173-1-sherry.sun@nxp.com +Signed-off-by: Greg Kroah-Hartman +Stable-dep-of: 1be6f2b15f90 ("tty: serial: fsl_lpuart: fix race on RX DMA shutdown") +Signed-off-by: Sasha Levin +--- + drivers/tty/serial/fsl_lpuart.c | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + +diff --git a/drivers/tty/serial/fsl_lpuart.c b/drivers/tty/serial/fsl_lpuart.c +index c51883f34ac2b..86e96696ab26d 100644 +--- a/drivers/tty/serial/fsl_lpuart.c ++++ b/drivers/tty/serial/fsl_lpuart.c +@@ -582,7 +582,7 @@ static void lpuart_flush_buffer(struct uart_port *port) + sport->dma_tx_nents, DMA_TO_DEVICE); + sport->dma_tx_in_progress = false; + } +- dmaengine_terminate_all(chan); ++ dmaengine_terminate_async(chan); + } + + if (lpuart_is_32(sport)) { +@@ -1333,7 +1333,7 @@ static void lpuart_dma_rx_free(struct uart_port *port) + struct lpuart_port, port); + struct dma_chan *chan = sport->dma_rx_chan; + +- dmaengine_terminate_all(chan); ++ dmaengine_terminate_sync(chan); + dma_unmap_sg(chan->device->dev, &sport->rx_sgl, 1, DMA_FROM_DEVICE); + kfree(sport->rx_ring.buf); + sport->rx_ring.tail = 0; +@@ -1766,7 +1766,7 @@ static void lpuart_dma_shutdown(struct lpuart_port *sport) + if (wait_event_interruptible_timeout(sport->dma_wait, + !sport->dma_tx_in_progress, msecs_to_jiffies(300)) <= 0) { + sport->dma_tx_in_progress = false; +- dmaengine_terminate_all(sport->dma_tx_chan); ++ dmaengine_terminate_sync(sport->dma_tx_chan); + } + sport->lpuart_dma_tx_use = false; + } +@@ -2867,7 +2867,7 @@ static int __maybe_unused lpuart_suspend(struct device *dev) + + if (sport->lpuart_dma_tx_use) { + sport->dma_tx_in_progress = false; +- dmaengine_terminate_all(sport->dma_tx_chan); ++ dmaengine_terminate_sync(sport->dma_tx_chan); + } + + if (sport->port.suspended && !irq_wake) +-- +2.39.2 + diff --git a/queue-6.1/zonefs-fix-error-message-in-zonefs_file_dio_append.patch b/queue-6.1/zonefs-fix-error-message-in-zonefs_file_dio_append.patch new file mode 100644 index 00000000000..88f40cbd4c5 --- /dev/null +++ b/queue-6.1/zonefs-fix-error-message-in-zonefs_file_dio_append.patch @@ -0,0 +1,41 @@ +From afcf97061101c21b9516f39b0495f576a309f29d Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 20 Mar 2023 22:49:15 +0900 +Subject: zonefs: Fix error message in zonefs_file_dio_append() + +From: Damien Le Moal + +[ Upstream commit 88b170088ad2c3e27086fe35769aa49f8a512564 ] + +Since the expected write location in a sequential file is always at the +end of the file (append write), when an invalid write append location is +detected in zonefs_file_dio_append(), print the invalid written location +instead of the expected write location. 
+ +Fixes: a608da3bd730 ("zonefs: Detect append writes at invalid locations") +Cc: stable@vger.kernel.org +Signed-off-by: Damien Le Moal +Reviewed-by: Christoph Hellwig +Reviewed-by: Johannes Thumshirn +Reviewed-by: Himanshu Madhani +Signed-off-by: Sasha Levin +--- + fs/zonefs/file.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/fs/zonefs/file.c b/fs/zonefs/file.c +index 738b0e28d74b5..c71cc0fcb3ec8 100644 +--- a/fs/zonefs/file.c ++++ b/fs/zonefs/file.c +@@ -426,7 +426,7 @@ static ssize_t zonefs_file_dio_append(struct kiocb *iocb, struct iov_iter *from) + if (bio->bi_iter.bi_sector != wpsector) { + zonefs_warn(inode->i_sb, + "Corrupted write pointer %llu for zone at %llu\n", +- wpsector, z->z_sector); ++ bio->bi_iter.bi_sector, z->z_sector); + ret = -EIO; + } + } +-- +2.39.2 + diff --git a/queue-6.1/zonefs-reduce-struct-zonefs_inode_info-size.patch b/queue-6.1/zonefs-reduce-struct-zonefs_inode_info-size.patch new file mode 100644 index 00000000000..24190903977 --- /dev/null +++ b/queue-6.1/zonefs-reduce-struct-zonefs_inode_info-size.patch @@ -0,0 +1,283 @@ +From d8736462ac18e027b5e6882dcb149196d9a265cf Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 24 Nov 2022 19:43:30 +0900 +Subject: zonefs: Reduce struct zonefs_inode_info size + +From: Damien Le Moal + +[ Upstream commit 34422914dc00b291d1c47dbdabe93b154c2f2b25 ] + +Instead of using the i_ztype field in struct zonefs_inode_info to +indicate the zone type of an inode, introduce the new inode flag +ZONEFS_ZONE_CNV to be set in the i_flags field of struct +zonefs_inode_info to identify conventional zones. If this flag is not +set, the zone of an inode is considered to be a sequential zone. + +The helpers zonefs_zone_is_cnv(), zonefs_zone_is_seq(), +zonefs_inode_is_cnv() and zonefs_inode_is_seq() are introduced to +simplify testing the zone type of a struct zonefs_inode_info and of a +struct inode. + +Signed-off-by: Damien Le Moal +Reviewed-by: Johannes Thumshirn +Stable-dep-of: 88b170088ad2 ("zonefs: Fix error message in zonefs_file_dio_append()") +Signed-off-by: Sasha Levin +--- + fs/zonefs/file.c | 35 ++++++++++++++--------------------- + fs/zonefs/super.c | 12 +++++++----- + fs/zonefs/zonefs.h | 24 +++++++++++++++++++++--- + 3 files changed, 42 insertions(+), 29 deletions(-) + +diff --git a/fs/zonefs/file.c b/fs/zonefs/file.c +index ece0f3959b6d1..64873d31d75dd 100644 +--- a/fs/zonefs/file.c ++++ b/fs/zonefs/file.c +@@ -77,8 +77,7 @@ static int zonefs_write_iomap_begin(struct inode *inode, loff_t offset, + * checked when writes are issued, so warn if we see a page writeback + * operation. 
+ */ +- if (WARN_ON_ONCE(zi->i_ztype == ZONEFS_ZTYPE_SEQ && +- !(flags & IOMAP_DIRECT))) ++ if (WARN_ON_ONCE(zonefs_zone_is_seq(zi) && !(flags & IOMAP_DIRECT))) + return -EIO; + + /* +@@ -128,7 +127,7 @@ static int zonefs_write_map_blocks(struct iomap_writepage_ctx *wpc, + { + struct zonefs_inode_info *zi = ZONEFS_I(inode); + +- if (WARN_ON_ONCE(zi->i_ztype != ZONEFS_ZTYPE_CNV)) ++ if (WARN_ON_ONCE(zonefs_zone_is_seq(zi))) + return -EIO; + if (WARN_ON_ONCE(offset >= i_size_read(inode))) + return -EIO; +@@ -158,9 +157,8 @@ static int zonefs_swap_activate(struct swap_info_struct *sis, + struct file *swap_file, sector_t *span) + { + struct inode *inode = file_inode(swap_file); +- struct zonefs_inode_info *zi = ZONEFS_I(inode); + +- if (zi->i_ztype != ZONEFS_ZTYPE_CNV) { ++ if (zonefs_inode_is_seq(inode)) { + zonefs_err(inode->i_sb, + "swap file: not a conventional zone file\n"); + return -EINVAL; +@@ -196,7 +194,7 @@ int zonefs_file_truncate(struct inode *inode, loff_t isize) + * only down to a 0 size, which is equivalent to a zone reset, and to + * the maximum file size, which is equivalent to a zone finish. + */ +- if (zi->i_ztype != ZONEFS_ZTYPE_SEQ) ++ if (!zonefs_zone_is_seq(zi)) + return -EPERM; + + if (!isize) +@@ -266,7 +264,7 @@ static int zonefs_file_fsync(struct file *file, loff_t start, loff_t end, + * Since only direct writes are allowed in sequential files, page cache + * flush is needed only for conventional zone files. + */ +- if (ZONEFS_I(inode)->i_ztype == ZONEFS_ZTYPE_CNV) ++ if (zonefs_inode_is_cnv(inode)) + ret = file_write_and_wait_range(file, start, end); + if (!ret) + ret = blkdev_issue_flush(inode->i_sb->s_bdev); +@@ -280,7 +278,6 @@ static int zonefs_file_fsync(struct file *file, loff_t start, loff_t end, + static vm_fault_t zonefs_filemap_page_mkwrite(struct vm_fault *vmf) + { + struct inode *inode = file_inode(vmf->vma->vm_file); +- struct zonefs_inode_info *zi = ZONEFS_I(inode); + vm_fault_t ret; + + if (unlikely(IS_IMMUTABLE(inode))) +@@ -290,7 +287,7 @@ static vm_fault_t zonefs_filemap_page_mkwrite(struct vm_fault *vmf) + * Sanity check: only conventional zone files can have shared + * writeable mappings. + */ +- if (WARN_ON_ONCE(zi->i_ztype != ZONEFS_ZTYPE_CNV)) ++ if (zonefs_inode_is_seq(inode)) + return VM_FAULT_NOPAGE; + + sb_start_pagefault(inode->i_sb); +@@ -319,7 +316,7 @@ static int zonefs_file_mmap(struct file *file, struct vm_area_struct *vma) + * mappings are possible since there are no guarantees for write + * ordering between msync() and page cache writeback. + */ +- if (ZONEFS_I(file_inode(file))->i_ztype == ZONEFS_ZTYPE_SEQ && ++ if (zonefs_inode_is_seq(file_inode(file)) && + (vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) + return -EINVAL; + +@@ -352,7 +349,7 @@ static int zonefs_file_write_dio_end_io(struct kiocb *iocb, ssize_t size, + return error; + } + +- if (size && zi->i_ztype != ZONEFS_ZTYPE_CNV) { ++ if (size && zonefs_zone_is_seq(zi)) { + /* + * Note that we may be seeing completions out of order, + * but that is not a problem since a write completed +@@ -491,7 +488,7 @@ static ssize_t zonefs_write_checks(struct kiocb *iocb, struct iov_iter *from) + return -EINVAL; + + if (iocb->ki_flags & IOCB_APPEND) { +- if (zi->i_ztype != ZONEFS_ZTYPE_SEQ) ++ if (zonefs_zone_is_cnv(zi)) + return -EINVAL; + mutex_lock(&zi->i_truncate_mutex); + iocb->ki_pos = zi->i_wpoffset; +@@ -531,8 +528,7 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from) + * as this can cause write reordering (e.g. 
the first aio gets EAGAIN + * on the inode lock but the second goes through but is now unaligned). + */ +- if (zi->i_ztype == ZONEFS_ZTYPE_SEQ && !sync && +- (iocb->ki_flags & IOCB_NOWAIT)) ++ if (zonefs_zone_is_seq(zi) && !sync && (iocb->ki_flags & IOCB_NOWAIT)) + return -EOPNOTSUPP; + + if (iocb->ki_flags & IOCB_NOWAIT) { +@@ -554,7 +550,7 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from) + } + + /* Enforce sequential writes (append only) in sequential zones */ +- if (zi->i_ztype == ZONEFS_ZTYPE_SEQ) { ++ if (zonefs_zone_is_seq(zi)) { + mutex_lock(&zi->i_truncate_mutex); + if (iocb->ki_pos != zi->i_wpoffset) { + mutex_unlock(&zi->i_truncate_mutex); +@@ -570,7 +566,7 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from) + else + ret = iomap_dio_rw(iocb, from, &zonefs_write_iomap_ops, + &zonefs_write_dio_ops, 0, NULL, 0); +- if (zi->i_ztype == ZONEFS_ZTYPE_SEQ && ++ if (zonefs_zone_is_seq(zi) && + (ret > 0 || ret == -EIOCBQUEUED)) { + if (ret > 0) + count = ret; +@@ -596,14 +592,13 @@ static ssize_t zonefs_file_buffered_write(struct kiocb *iocb, + struct iov_iter *from) + { + struct inode *inode = file_inode(iocb->ki_filp); +- struct zonefs_inode_info *zi = ZONEFS_I(inode); + ssize_t ret; + + /* + * Direct IO writes are mandatory for sequential zone files so that the + * write IO issuing order is preserved. + */ +- if (zi->i_ztype != ZONEFS_ZTYPE_CNV) ++ if (zonefs_inode_is_seq(inode)) + return -EIO; + + if (iocb->ki_flags & IOCB_NOWAIT) { +@@ -731,9 +726,7 @@ static ssize_t zonefs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) + static inline bool zonefs_seq_file_need_wro(struct inode *inode, + struct file *file) + { +- struct zonefs_inode_info *zi = ZONEFS_I(inode); +- +- if (zi->i_ztype != ZONEFS_ZTYPE_SEQ) ++ if (zonefs_inode_is_cnv(inode)) + return false; + + if (!(file->f_mode & FMODE_WRITE)) +diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c +index 6307cc95be061..a4af29dc32e7d 100644 +--- a/fs/zonefs/super.c ++++ b/fs/zonefs/super.c +@@ -37,7 +37,7 @@ void zonefs_account_active(struct inode *inode) + + lockdep_assert_held(&zi->i_truncate_mutex); + +- if (zi->i_ztype != ZONEFS_ZTYPE_SEQ) ++ if (zonefs_zone_is_cnv(zi)) + return; + + /* +@@ -177,14 +177,14 @@ static loff_t zonefs_check_zone_condition(struct inode *inode, + zonefs_warn(inode->i_sb, "inode %lu: read-only zone\n", + inode->i_ino); + zi->i_flags |= ZONEFS_ZONE_READONLY; +- if (zi->i_ztype == ZONEFS_ZTYPE_CNV) ++ if (zonefs_zone_is_cnv(zi)) + return zi->i_max_size; + return zi->i_wpoffset; + case BLK_ZONE_COND_FULL: + /* The write pointer of full zones is invalid. */ + return zi->i_max_size; + default: +- if (zi->i_ztype == ZONEFS_ZTYPE_CNV) ++ if (zonefs_zone_is_cnv(zi)) + return zi->i_max_size; + return (zone->wp - zone->start) << SECTOR_SHIFT; + } +@@ -260,7 +260,7 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx, + * In all cases, warn about inode size inconsistency and handle the + * IO error according to the zone condition and to the mount options. 
+ */ +- if (zi->i_ztype == ZONEFS_ZTYPE_SEQ && isize != data_size) ++ if (zonefs_zone_is_seq(zi) && isize != data_size) + zonefs_warn(sb, "inode %lu: invalid size %lld (should be %lld)\n", + inode->i_ino, isize, data_size); + +@@ -584,7 +584,9 @@ static int zonefs_init_file_inode(struct inode *inode, struct blk_zone *zone, + inode->i_ino = zone->start >> sbi->s_zone_sectors_shift; + inode->i_mode = S_IFREG | sbi->s_perm; + +- zi->i_ztype = type; ++ if (type == ZONEFS_ZTYPE_CNV) ++ zi->i_flags |= ZONEFS_ZONE_CNV; ++ + zi->i_zsector = zone->start; + zi->i_zone_size = zone->len << SECTOR_SHIFT; + if (zi->i_zone_size > bdev_zone_sectors(sb->s_bdev) << SECTOR_SHIFT && +diff --git a/fs/zonefs/zonefs.h b/fs/zonefs/zonefs.h +index 439096445ee53..1a225f74015a0 100644 +--- a/fs/zonefs/zonefs.h ++++ b/fs/zonefs/zonefs.h +@@ -44,6 +44,7 @@ static inline enum zonefs_ztype zonefs_zone_type(struct blk_zone *zone) + #define ZONEFS_ZONE_ACTIVE (1U << 2) + #define ZONEFS_ZONE_OFFLINE (1U << 3) + #define ZONEFS_ZONE_READONLY (1U << 4) ++#define ZONEFS_ZONE_CNV (1U << 31) + + /* + * In-memory inode data. +@@ -51,9 +52,6 @@ static inline enum zonefs_ztype zonefs_zone_type(struct blk_zone *zone) + struct zonefs_inode_info { + struct inode i_vnode; + +- /* File zone type */ +- enum zonefs_ztype i_ztype; +- + /* File zone start sector (512B unit) */ + sector_t i_zsector; + +@@ -91,6 +89,26 @@ static inline struct zonefs_inode_info *ZONEFS_I(struct inode *inode) + return container_of(inode, struct zonefs_inode_info, i_vnode); + } + ++static inline bool zonefs_zone_is_cnv(struct zonefs_inode_info *zi) ++{ ++ return zi->i_flags & ZONEFS_ZONE_CNV; ++} ++ ++static inline bool zonefs_zone_is_seq(struct zonefs_inode_info *zi) ++{ ++ return !zonefs_zone_is_cnv(zi); ++} ++ ++static inline bool zonefs_inode_is_cnv(struct inode *inode) ++{ ++ return zonefs_zone_is_cnv(ZONEFS_I(inode)); ++} ++ ++static inline bool zonefs_inode_is_seq(struct inode *inode) ++{ ++ return zonefs_zone_is_seq(ZONEFS_I(inode)); ++} ++ + /* + * On-disk super block (block 0). + */ +-- +2.39.2 + diff --git a/queue-6.1/zonefs-reorganize-code.patch b/queue-6.1/zonefs-reorganize-code.patch new file mode 100644 index 00000000000..9b2687bd8f6 --- /dev/null +++ b/queue-6.1/zonefs-reorganize-code.patch @@ -0,0 +1,1990 @@ +From 7984f3ba9ba89242cbae90529b4c1680787a207c Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 25 Nov 2022 09:39:33 +0900 +Subject: zonefs: Reorganize code + +From: Damien Le Moal + +[ Upstream commit 4008e2a0b01aba982356fd15b128a47bf11bd9c7 ] + +Move all code related to zone file operations from super.c to the new +file.c file. Inode and zone management code remains in super.c. 
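+
+In short, after this patch super.c keeps the zone and inode management
+helpers and exports them through zonefs.h, while file.c holds the file and
+address space operations. As a condensed view of the new zonefs.h section
+(the full hunk is at the end of this patch):
+
+	/* In super.c */
+	void zonefs_account_active(struct inode *inode);
+	int zonefs_zone_mgmt(struct inode *inode, enum req_op op);
+	void zonefs_i_size_write(struct inode *inode, loff_t isize);
+	void zonefs_update_stats(struct inode *inode, loff_t new_isize);
+	void __zonefs_io_error(struct inode *inode, bool write);
+
+	/* In file.c */
+	extern const struct address_space_operations zonefs_file_aops;
+	extern const struct file_operations zonefs_file_operations;
+	int zonefs_file_truncate(struct inode *inode, loff_t isize);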
+ +Signed-off-by: Damien Le Moal +Reviewed-by: Johannes Thumshirn +Stable-dep-of: 88b170088ad2 ("zonefs: Fix error message in zonefs_file_dio_append()") +Signed-off-by: Sasha Levin +--- + fs/zonefs/Makefile | 2 +- + fs/zonefs/file.c | 874 ++++++++++++++++++++++++++++++++++++++++ + fs/zonefs/super.c | 973 +++------------------------------------------ + fs/zonefs/zonefs.h | 22 + + 4 files changed, 955 insertions(+), 916 deletions(-) + create mode 100644 fs/zonefs/file.c + +diff --git a/fs/zonefs/Makefile b/fs/zonefs/Makefile +index 9fe54f5319f22..645f7229de4a0 100644 +--- a/fs/zonefs/Makefile ++++ b/fs/zonefs/Makefile +@@ -3,4 +3,4 @@ ccflags-y += -I$(src) + + obj-$(CONFIG_ZONEFS_FS) += zonefs.o + +-zonefs-y := super.o sysfs.o ++zonefs-y := super.o file.o sysfs.o +diff --git a/fs/zonefs/file.c b/fs/zonefs/file.c +new file mode 100644 +index 0000000000000..ece0f3959b6d1 +--- /dev/null ++++ b/fs/zonefs/file.c +@@ -0,0 +1,874 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * Simple file system for zoned block devices exposing zones as files. ++ * ++ * Copyright (C) 2022 Western Digital Corporation or its affiliates. ++ */ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "zonefs.h" ++ ++#include "trace.h" ++ ++static int zonefs_read_iomap_begin(struct inode *inode, loff_t offset, ++ loff_t length, unsigned int flags, ++ struct iomap *iomap, struct iomap *srcmap) ++{ ++ struct zonefs_inode_info *zi = ZONEFS_I(inode); ++ struct super_block *sb = inode->i_sb; ++ loff_t isize; ++ ++ /* ++ * All blocks are always mapped below EOF. If reading past EOF, ++ * act as if there is a hole up to the file maximum size. ++ */ ++ mutex_lock(&zi->i_truncate_mutex); ++ iomap->bdev = inode->i_sb->s_bdev; ++ iomap->offset = ALIGN_DOWN(offset, sb->s_blocksize); ++ isize = i_size_read(inode); ++ if (iomap->offset >= isize) { ++ iomap->type = IOMAP_HOLE; ++ iomap->addr = IOMAP_NULL_ADDR; ++ iomap->length = length; ++ } else { ++ iomap->type = IOMAP_MAPPED; ++ iomap->addr = (zi->i_zsector << SECTOR_SHIFT) + iomap->offset; ++ iomap->length = isize - iomap->offset; ++ } ++ mutex_unlock(&zi->i_truncate_mutex); ++ ++ trace_zonefs_iomap_begin(inode, iomap); ++ ++ return 0; ++} ++ ++static const struct iomap_ops zonefs_read_iomap_ops = { ++ .iomap_begin = zonefs_read_iomap_begin, ++}; ++ ++static int zonefs_write_iomap_begin(struct inode *inode, loff_t offset, ++ loff_t length, unsigned int flags, ++ struct iomap *iomap, struct iomap *srcmap) ++{ ++ struct zonefs_inode_info *zi = ZONEFS_I(inode); ++ struct super_block *sb = inode->i_sb; ++ loff_t isize; ++ ++ /* All write I/Os should always be within the file maximum size */ ++ if (WARN_ON_ONCE(offset + length > zi->i_max_size)) ++ return -EIO; ++ ++ /* ++ * Sequential zones can only accept direct writes. This is already ++ * checked when writes are issued, so warn if we see a page writeback ++ * operation. ++ */ ++ if (WARN_ON_ONCE(zi->i_ztype == ZONEFS_ZTYPE_SEQ && ++ !(flags & IOMAP_DIRECT))) ++ return -EIO; ++ ++ /* ++ * For conventional zones, all blocks are always mapped. For sequential ++ * zones, all blocks after always mapped below the inode size (zone ++ * write pointer) and unwriten beyond. 
++ */ ++ mutex_lock(&zi->i_truncate_mutex); ++ iomap->bdev = inode->i_sb->s_bdev; ++ iomap->offset = ALIGN_DOWN(offset, sb->s_blocksize); ++ iomap->addr = (zi->i_zsector << SECTOR_SHIFT) + iomap->offset; ++ isize = i_size_read(inode); ++ if (iomap->offset >= isize) { ++ iomap->type = IOMAP_UNWRITTEN; ++ iomap->length = zi->i_max_size - iomap->offset; ++ } else { ++ iomap->type = IOMAP_MAPPED; ++ iomap->length = isize - iomap->offset; ++ } ++ mutex_unlock(&zi->i_truncate_mutex); ++ ++ trace_zonefs_iomap_begin(inode, iomap); ++ ++ return 0; ++} ++ ++static const struct iomap_ops zonefs_write_iomap_ops = { ++ .iomap_begin = zonefs_write_iomap_begin, ++}; ++ ++static int zonefs_read_folio(struct file *unused, struct folio *folio) ++{ ++ return iomap_read_folio(folio, &zonefs_read_iomap_ops); ++} ++ ++static void zonefs_readahead(struct readahead_control *rac) ++{ ++ iomap_readahead(rac, &zonefs_read_iomap_ops); ++} ++ ++/* ++ * Map blocks for page writeback. This is used only on conventional zone files, ++ * which implies that the page range can only be within the fixed inode size. ++ */ ++static int zonefs_write_map_blocks(struct iomap_writepage_ctx *wpc, ++ struct inode *inode, loff_t offset) ++{ ++ struct zonefs_inode_info *zi = ZONEFS_I(inode); ++ ++ if (WARN_ON_ONCE(zi->i_ztype != ZONEFS_ZTYPE_CNV)) ++ return -EIO; ++ if (WARN_ON_ONCE(offset >= i_size_read(inode))) ++ return -EIO; ++ ++ /* If the mapping is already OK, nothing needs to be done */ ++ if (offset >= wpc->iomap.offset && ++ offset < wpc->iomap.offset + wpc->iomap.length) ++ return 0; ++ ++ return zonefs_write_iomap_begin(inode, offset, zi->i_max_size - offset, ++ IOMAP_WRITE, &wpc->iomap, NULL); ++} ++ ++static const struct iomap_writeback_ops zonefs_writeback_ops = { ++ .map_blocks = zonefs_write_map_blocks, ++}; ++ ++static int zonefs_writepages(struct address_space *mapping, ++ struct writeback_control *wbc) ++{ ++ struct iomap_writepage_ctx wpc = { }; ++ ++ return iomap_writepages(mapping, wbc, &wpc, &zonefs_writeback_ops); ++} ++ ++static int zonefs_swap_activate(struct swap_info_struct *sis, ++ struct file *swap_file, sector_t *span) ++{ ++ struct inode *inode = file_inode(swap_file); ++ struct zonefs_inode_info *zi = ZONEFS_I(inode); ++ ++ if (zi->i_ztype != ZONEFS_ZTYPE_CNV) { ++ zonefs_err(inode->i_sb, ++ "swap file: not a conventional zone file\n"); ++ return -EINVAL; ++ } ++ ++ return iomap_swapfile_activate(sis, swap_file, span, ++ &zonefs_read_iomap_ops); ++} ++ ++const struct address_space_operations zonefs_file_aops = { ++ .read_folio = zonefs_read_folio, ++ .readahead = zonefs_readahead, ++ .writepages = zonefs_writepages, ++ .dirty_folio = filemap_dirty_folio, ++ .release_folio = iomap_release_folio, ++ .invalidate_folio = iomap_invalidate_folio, ++ .migrate_folio = filemap_migrate_folio, ++ .is_partially_uptodate = iomap_is_partially_uptodate, ++ .error_remove_page = generic_error_remove_page, ++ .direct_IO = noop_direct_IO, ++ .swap_activate = zonefs_swap_activate, ++}; ++ ++int zonefs_file_truncate(struct inode *inode, loff_t isize) ++{ ++ struct zonefs_inode_info *zi = ZONEFS_I(inode); ++ loff_t old_isize; ++ enum req_op op; ++ int ret = 0; ++ ++ /* ++ * Only sequential zone files can be truncated and truncation is allowed ++ * only down to a 0 size, which is equivalent to a zone reset, and to ++ * the maximum file size, which is equivalent to a zone finish. 
++ */ ++ if (zi->i_ztype != ZONEFS_ZTYPE_SEQ) ++ return -EPERM; ++ ++ if (!isize) ++ op = REQ_OP_ZONE_RESET; ++ else if (isize == zi->i_max_size) ++ op = REQ_OP_ZONE_FINISH; ++ else ++ return -EPERM; ++ ++ inode_dio_wait(inode); ++ ++ /* Serialize against page faults */ ++ filemap_invalidate_lock(inode->i_mapping); ++ ++ /* Serialize against zonefs_iomap_begin() */ ++ mutex_lock(&zi->i_truncate_mutex); ++ ++ old_isize = i_size_read(inode); ++ if (isize == old_isize) ++ goto unlock; ++ ++ ret = zonefs_zone_mgmt(inode, op); ++ if (ret) ++ goto unlock; ++ ++ /* ++ * If the mount option ZONEFS_MNTOPT_EXPLICIT_OPEN is set, ++ * take care of open zones. ++ */ ++ if (zi->i_flags & ZONEFS_ZONE_OPEN) { ++ /* ++ * Truncating a zone to EMPTY or FULL is the equivalent of ++ * closing the zone. For a truncation to 0, we need to ++ * re-open the zone to ensure new writes can be processed. ++ * For a truncation to the maximum file size, the zone is ++ * closed and writes cannot be accepted anymore, so clear ++ * the open flag. ++ */ ++ if (!isize) ++ ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_OPEN); ++ else ++ zi->i_flags &= ~ZONEFS_ZONE_OPEN; ++ } ++ ++ zonefs_update_stats(inode, isize); ++ truncate_setsize(inode, isize); ++ zi->i_wpoffset = isize; ++ zonefs_account_active(inode); ++ ++unlock: ++ mutex_unlock(&zi->i_truncate_mutex); ++ filemap_invalidate_unlock(inode->i_mapping); ++ ++ return ret; ++} ++ ++static int zonefs_file_fsync(struct file *file, loff_t start, loff_t end, ++ int datasync) ++{ ++ struct inode *inode = file_inode(file); ++ int ret = 0; ++ ++ if (unlikely(IS_IMMUTABLE(inode))) ++ return -EPERM; ++ ++ /* ++ * Since only direct writes are allowed in sequential files, page cache ++ * flush is needed only for conventional zone files. ++ */ ++ if (ZONEFS_I(inode)->i_ztype == ZONEFS_ZTYPE_CNV) ++ ret = file_write_and_wait_range(file, start, end); ++ if (!ret) ++ ret = blkdev_issue_flush(inode->i_sb->s_bdev); ++ ++ if (ret) ++ zonefs_io_error(inode, true); ++ ++ return ret; ++} ++ ++static vm_fault_t zonefs_filemap_page_mkwrite(struct vm_fault *vmf) ++{ ++ struct inode *inode = file_inode(vmf->vma->vm_file); ++ struct zonefs_inode_info *zi = ZONEFS_I(inode); ++ vm_fault_t ret; ++ ++ if (unlikely(IS_IMMUTABLE(inode))) ++ return VM_FAULT_SIGBUS; ++ ++ /* ++ * Sanity check: only conventional zone files can have shared ++ * writeable mappings. ++ */ ++ if (WARN_ON_ONCE(zi->i_ztype != ZONEFS_ZTYPE_CNV)) ++ return VM_FAULT_NOPAGE; ++ ++ sb_start_pagefault(inode->i_sb); ++ file_update_time(vmf->vma->vm_file); ++ ++ /* Serialize against truncates */ ++ filemap_invalidate_lock_shared(inode->i_mapping); ++ ret = iomap_page_mkwrite(vmf, &zonefs_write_iomap_ops); ++ filemap_invalidate_unlock_shared(inode->i_mapping); ++ ++ sb_end_pagefault(inode->i_sb); ++ return ret; ++} ++ ++static const struct vm_operations_struct zonefs_file_vm_ops = { ++ .fault = filemap_fault, ++ .map_pages = filemap_map_pages, ++ .page_mkwrite = zonefs_filemap_page_mkwrite, ++}; ++ ++static int zonefs_file_mmap(struct file *file, struct vm_area_struct *vma) ++{ ++ /* ++ * Conventional zones accept random writes, so their files can support ++ * shared writable mappings. For sequential zone files, only read ++ * mappings are possible since there are no guarantees for write ++ * ordering between msync() and page cache writeback. 
++ */ ++ if (ZONEFS_I(file_inode(file))->i_ztype == ZONEFS_ZTYPE_SEQ && ++ (vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) ++ return -EINVAL; ++ ++ file_accessed(file); ++ vma->vm_ops = &zonefs_file_vm_ops; ++ ++ return 0; ++} ++ ++static loff_t zonefs_file_llseek(struct file *file, loff_t offset, int whence) ++{ ++ loff_t isize = i_size_read(file_inode(file)); ++ ++ /* ++ * Seeks are limited to below the zone size for conventional zones ++ * and below the zone write pointer for sequential zones. In both ++ * cases, this limit is the inode size. ++ */ ++ return generic_file_llseek_size(file, offset, whence, isize, isize); ++} ++ ++static int zonefs_file_write_dio_end_io(struct kiocb *iocb, ssize_t size, ++ int error, unsigned int flags) ++{ ++ struct inode *inode = file_inode(iocb->ki_filp); ++ struct zonefs_inode_info *zi = ZONEFS_I(inode); ++ ++ if (error) { ++ zonefs_io_error(inode, true); ++ return error; ++ } ++ ++ if (size && zi->i_ztype != ZONEFS_ZTYPE_CNV) { ++ /* ++ * Note that we may be seeing completions out of order, ++ * but that is not a problem since a write completed ++ * successfully necessarily means that all preceding writes ++ * were also successful. So we can safely increase the inode ++ * size to the write end location. ++ */ ++ mutex_lock(&zi->i_truncate_mutex); ++ if (i_size_read(inode) < iocb->ki_pos + size) { ++ zonefs_update_stats(inode, iocb->ki_pos + size); ++ zonefs_i_size_write(inode, iocb->ki_pos + size); ++ } ++ mutex_unlock(&zi->i_truncate_mutex); ++ } ++ ++ return 0; ++} ++ ++static const struct iomap_dio_ops zonefs_write_dio_ops = { ++ .end_io = zonefs_file_write_dio_end_io, ++}; ++ ++static ssize_t zonefs_file_dio_append(struct kiocb *iocb, struct iov_iter *from) ++{ ++ struct inode *inode = file_inode(iocb->ki_filp); ++ struct zonefs_inode_info *zi = ZONEFS_I(inode); ++ struct block_device *bdev = inode->i_sb->s_bdev; ++ unsigned int max = bdev_max_zone_append_sectors(bdev); ++ struct bio *bio; ++ ssize_t size; ++ int nr_pages; ++ ssize_t ret; ++ ++ max = ALIGN_DOWN(max << SECTOR_SHIFT, inode->i_sb->s_blocksize); ++ iov_iter_truncate(from, max); ++ ++ nr_pages = iov_iter_npages(from, BIO_MAX_VECS); ++ if (!nr_pages) ++ return 0; ++ ++ bio = bio_alloc(bdev, nr_pages, ++ REQ_OP_ZONE_APPEND | REQ_SYNC | REQ_IDLE, GFP_NOFS); ++ bio->bi_iter.bi_sector = zi->i_zsector; ++ bio->bi_ioprio = iocb->ki_ioprio; ++ if (iocb_is_dsync(iocb)) ++ bio->bi_opf |= REQ_FUA; ++ ++ ret = bio_iov_iter_get_pages(bio, from); ++ if (unlikely(ret)) ++ goto out_release; ++ ++ size = bio->bi_iter.bi_size; ++ task_io_account_write(size); ++ ++ if (iocb->ki_flags & IOCB_HIPRI) ++ bio_set_polled(bio, iocb); ++ ++ ret = submit_bio_wait(bio); ++ ++ /* ++ * If the file zone was written underneath the file system, the zone ++ * write pointer may not be where we expect it to be, but the zone ++ * append write can still succeed. So check manually that we wrote where ++ * we intended to, that is, at zi->i_wpoffset. 
++ */ ++ if (!ret) { ++ sector_t wpsector = ++ zi->i_zsector + (zi->i_wpoffset >> SECTOR_SHIFT); ++ ++ if (bio->bi_iter.bi_sector != wpsector) { ++ zonefs_warn(inode->i_sb, ++ "Corrupted write pointer %llu for zone at %llu\n", ++ wpsector, zi->i_zsector); ++ ret = -EIO; ++ } ++ } ++ ++ zonefs_file_write_dio_end_io(iocb, size, ret, 0); ++ trace_zonefs_file_dio_append(inode, size, ret); ++ ++out_release: ++ bio_release_pages(bio, false); ++ bio_put(bio); ++ ++ if (ret >= 0) { ++ iocb->ki_pos += size; ++ return size; ++ } ++ ++ return ret; ++} ++ ++/* ++ * Do not exceed the LFS limits nor the file zone size. If pos is under the ++ * limit it becomes a short access. If it exceeds the limit, return -EFBIG. ++ */ ++static loff_t zonefs_write_check_limits(struct file *file, loff_t pos, ++ loff_t count) ++{ ++ struct inode *inode = file_inode(file); ++ struct zonefs_inode_info *zi = ZONEFS_I(inode); ++ loff_t limit = rlimit(RLIMIT_FSIZE); ++ loff_t max_size = zi->i_max_size; ++ ++ if (limit != RLIM_INFINITY) { ++ if (pos >= limit) { ++ send_sig(SIGXFSZ, current, 0); ++ return -EFBIG; ++ } ++ count = min(count, limit - pos); ++ } ++ ++ if (!(file->f_flags & O_LARGEFILE)) ++ max_size = min_t(loff_t, MAX_NON_LFS, max_size); ++ ++ if (unlikely(pos >= max_size)) ++ return -EFBIG; ++ ++ return min(count, max_size - pos); ++} ++ ++static ssize_t zonefs_write_checks(struct kiocb *iocb, struct iov_iter *from) ++{ ++ struct file *file = iocb->ki_filp; ++ struct inode *inode = file_inode(file); ++ struct zonefs_inode_info *zi = ZONEFS_I(inode); ++ loff_t count; ++ ++ if (IS_SWAPFILE(inode)) ++ return -ETXTBSY; ++ ++ if (!iov_iter_count(from)) ++ return 0; ++ ++ if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT)) ++ return -EINVAL; ++ ++ if (iocb->ki_flags & IOCB_APPEND) { ++ if (zi->i_ztype != ZONEFS_ZTYPE_SEQ) ++ return -EINVAL; ++ mutex_lock(&zi->i_truncate_mutex); ++ iocb->ki_pos = zi->i_wpoffset; ++ mutex_unlock(&zi->i_truncate_mutex); ++ } ++ ++ count = zonefs_write_check_limits(file, iocb->ki_pos, ++ iov_iter_count(from)); ++ if (count < 0) ++ return count; ++ ++ iov_iter_truncate(from, count); ++ return iov_iter_count(from); ++} ++ ++/* ++ * Handle direct writes. For sequential zone files, this is the only possible ++ * write path. For these files, check that the user is issuing writes ++ * sequentially from the end of the file. This code assumes that the block layer ++ * delivers write requests to the device in sequential order. This is always the ++ * case if a block IO scheduler implementing the ELEVATOR_F_ZBD_SEQ_WRITE ++ * elevator feature is being used (e.g. mq-deadline). The block layer always ++ * automatically select such an elevator for zoned block devices during the ++ * device initialization. ++ */ ++static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from) ++{ ++ struct inode *inode = file_inode(iocb->ki_filp); ++ struct zonefs_inode_info *zi = ZONEFS_I(inode); ++ struct super_block *sb = inode->i_sb; ++ bool sync = is_sync_kiocb(iocb); ++ bool append = false; ++ ssize_t ret, count; ++ ++ /* ++ * For async direct IOs to sequential zone files, refuse IOCB_NOWAIT ++ * as this can cause write reordering (e.g. the first aio gets EAGAIN ++ * on the inode lock but the second goes through but is now unaligned). 
++ */ ++ if (zi->i_ztype == ZONEFS_ZTYPE_SEQ && !sync && ++ (iocb->ki_flags & IOCB_NOWAIT)) ++ return -EOPNOTSUPP; ++ ++ if (iocb->ki_flags & IOCB_NOWAIT) { ++ if (!inode_trylock(inode)) ++ return -EAGAIN; ++ } else { ++ inode_lock(inode); ++ } ++ ++ count = zonefs_write_checks(iocb, from); ++ if (count <= 0) { ++ ret = count; ++ goto inode_unlock; ++ } ++ ++ if ((iocb->ki_pos | count) & (sb->s_blocksize - 1)) { ++ ret = -EINVAL; ++ goto inode_unlock; ++ } ++ ++ /* Enforce sequential writes (append only) in sequential zones */ ++ if (zi->i_ztype == ZONEFS_ZTYPE_SEQ) { ++ mutex_lock(&zi->i_truncate_mutex); ++ if (iocb->ki_pos != zi->i_wpoffset) { ++ mutex_unlock(&zi->i_truncate_mutex); ++ ret = -EINVAL; ++ goto inode_unlock; ++ } ++ mutex_unlock(&zi->i_truncate_mutex); ++ append = sync; ++ } ++ ++ if (append) ++ ret = zonefs_file_dio_append(iocb, from); ++ else ++ ret = iomap_dio_rw(iocb, from, &zonefs_write_iomap_ops, ++ &zonefs_write_dio_ops, 0, NULL, 0); ++ if (zi->i_ztype == ZONEFS_ZTYPE_SEQ && ++ (ret > 0 || ret == -EIOCBQUEUED)) { ++ if (ret > 0) ++ count = ret; ++ ++ /* ++ * Update the zone write pointer offset assuming the write ++ * operation succeeded. If it did not, the error recovery path ++ * will correct it. Also do active seq file accounting. ++ */ ++ mutex_lock(&zi->i_truncate_mutex); ++ zi->i_wpoffset += count; ++ zonefs_account_active(inode); ++ mutex_unlock(&zi->i_truncate_mutex); ++ } ++ ++inode_unlock: ++ inode_unlock(inode); ++ ++ return ret; ++} ++ ++static ssize_t zonefs_file_buffered_write(struct kiocb *iocb, ++ struct iov_iter *from) ++{ ++ struct inode *inode = file_inode(iocb->ki_filp); ++ struct zonefs_inode_info *zi = ZONEFS_I(inode); ++ ssize_t ret; ++ ++ /* ++ * Direct IO writes are mandatory for sequential zone files so that the ++ * write IO issuing order is preserved. 
++ */ ++ if (zi->i_ztype != ZONEFS_ZTYPE_CNV) ++ return -EIO; ++ ++ if (iocb->ki_flags & IOCB_NOWAIT) { ++ if (!inode_trylock(inode)) ++ return -EAGAIN; ++ } else { ++ inode_lock(inode); ++ } ++ ++ ret = zonefs_write_checks(iocb, from); ++ if (ret <= 0) ++ goto inode_unlock; ++ ++ ret = iomap_file_buffered_write(iocb, from, &zonefs_write_iomap_ops); ++ if (ret > 0) ++ iocb->ki_pos += ret; ++ else if (ret == -EIO) ++ zonefs_io_error(inode, true); ++ ++inode_unlock: ++ inode_unlock(inode); ++ if (ret > 0) ++ ret = generic_write_sync(iocb, ret); ++ ++ return ret; ++} ++ ++static ssize_t zonefs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) ++{ ++ struct inode *inode = file_inode(iocb->ki_filp); ++ ++ if (unlikely(IS_IMMUTABLE(inode))) ++ return -EPERM; ++ ++ if (sb_rdonly(inode->i_sb)) ++ return -EROFS; ++ ++ /* Write operations beyond the zone size are not allowed */ ++ if (iocb->ki_pos >= ZONEFS_I(inode)->i_max_size) ++ return -EFBIG; ++ ++ if (iocb->ki_flags & IOCB_DIRECT) { ++ ssize_t ret = zonefs_file_dio_write(iocb, from); ++ ++ if (ret != -ENOTBLK) ++ return ret; ++ } ++ ++ return zonefs_file_buffered_write(iocb, from); ++} ++ ++static int zonefs_file_read_dio_end_io(struct kiocb *iocb, ssize_t size, ++ int error, unsigned int flags) ++{ ++ if (error) { ++ zonefs_io_error(file_inode(iocb->ki_filp), false); ++ return error; ++ } ++ ++ return 0; ++} ++ ++static const struct iomap_dio_ops zonefs_read_dio_ops = { ++ .end_io = zonefs_file_read_dio_end_io, ++}; ++ ++static ssize_t zonefs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) ++{ ++ struct inode *inode = file_inode(iocb->ki_filp); ++ struct zonefs_inode_info *zi = ZONEFS_I(inode); ++ struct super_block *sb = inode->i_sb; ++ loff_t isize; ++ ssize_t ret; ++ ++ /* Offline zones cannot be read */ ++ if (unlikely(IS_IMMUTABLE(inode) && !(inode->i_mode & 0777))) ++ return -EPERM; ++ ++ if (iocb->ki_pos >= zi->i_max_size) ++ return 0; ++ ++ if (iocb->ki_flags & IOCB_NOWAIT) { ++ if (!inode_trylock_shared(inode)) ++ return -EAGAIN; ++ } else { ++ inode_lock_shared(inode); ++ } ++ ++ /* Limit read operations to written data */ ++ mutex_lock(&zi->i_truncate_mutex); ++ isize = i_size_read(inode); ++ if (iocb->ki_pos >= isize) { ++ mutex_unlock(&zi->i_truncate_mutex); ++ ret = 0; ++ goto inode_unlock; ++ } ++ iov_iter_truncate(to, isize - iocb->ki_pos); ++ mutex_unlock(&zi->i_truncate_mutex); ++ ++ if (iocb->ki_flags & IOCB_DIRECT) { ++ size_t count = iov_iter_count(to); ++ ++ if ((iocb->ki_pos | count) & (sb->s_blocksize - 1)) { ++ ret = -EINVAL; ++ goto inode_unlock; ++ } ++ file_accessed(iocb->ki_filp); ++ ret = iomap_dio_rw(iocb, to, &zonefs_read_iomap_ops, ++ &zonefs_read_dio_ops, 0, NULL, 0); ++ } else { ++ ret = generic_file_read_iter(iocb, to); ++ if (ret == -EIO) ++ zonefs_io_error(inode, false); ++ } ++ ++inode_unlock: ++ inode_unlock_shared(inode); ++ ++ return ret; ++} ++ ++/* ++ * Write open accounting is done only for sequential files. 
++ */ ++static inline bool zonefs_seq_file_need_wro(struct inode *inode, ++ struct file *file) ++{ ++ struct zonefs_inode_info *zi = ZONEFS_I(inode); ++ ++ if (zi->i_ztype != ZONEFS_ZTYPE_SEQ) ++ return false; ++ ++ if (!(file->f_mode & FMODE_WRITE)) ++ return false; ++ ++ return true; ++} ++ ++static int zonefs_seq_file_write_open(struct inode *inode) ++{ ++ struct zonefs_inode_info *zi = ZONEFS_I(inode); ++ int ret = 0; ++ ++ mutex_lock(&zi->i_truncate_mutex); ++ ++ if (!zi->i_wr_refcnt) { ++ struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb); ++ unsigned int wro = atomic_inc_return(&sbi->s_wro_seq_files); ++ ++ if (sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN) { ++ ++ if (sbi->s_max_wro_seq_files ++ && wro > sbi->s_max_wro_seq_files) { ++ atomic_dec(&sbi->s_wro_seq_files); ++ ret = -EBUSY; ++ goto unlock; ++ } ++ ++ if (i_size_read(inode) < zi->i_max_size) { ++ ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_OPEN); ++ if (ret) { ++ atomic_dec(&sbi->s_wro_seq_files); ++ goto unlock; ++ } ++ zi->i_flags |= ZONEFS_ZONE_OPEN; ++ zonefs_account_active(inode); ++ } ++ } ++ } ++ ++ zi->i_wr_refcnt++; ++ ++unlock: ++ mutex_unlock(&zi->i_truncate_mutex); ++ ++ return ret; ++} ++ ++static int zonefs_file_open(struct inode *inode, struct file *file) ++{ ++ int ret; ++ ++ ret = generic_file_open(inode, file); ++ if (ret) ++ return ret; ++ ++ if (zonefs_seq_file_need_wro(inode, file)) ++ return zonefs_seq_file_write_open(inode); ++ ++ return 0; ++} ++ ++static void zonefs_seq_file_write_close(struct inode *inode) ++{ ++ struct zonefs_inode_info *zi = ZONEFS_I(inode); ++ struct super_block *sb = inode->i_sb; ++ struct zonefs_sb_info *sbi = ZONEFS_SB(sb); ++ int ret = 0; ++ ++ mutex_lock(&zi->i_truncate_mutex); ++ ++ zi->i_wr_refcnt--; ++ if (zi->i_wr_refcnt) ++ goto unlock; ++ ++ /* ++ * The file zone may not be open anymore (e.g. the file was truncated to ++ * its maximum size or it was fully written). For this case, we only ++ * need to decrement the write open count. ++ */ ++ if (zi->i_flags & ZONEFS_ZONE_OPEN) { ++ ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_CLOSE); ++ if (ret) { ++ __zonefs_io_error(inode, false); ++ /* ++ * Leaving zones explicitly open may lead to a state ++ * where most zones cannot be written (zone resources ++ * exhausted). So take preventive action by remounting ++ * read-only. ++ */ ++ if (zi->i_flags & ZONEFS_ZONE_OPEN && ++ !(sb->s_flags & SB_RDONLY)) { ++ zonefs_warn(sb, ++ "closing zone at %llu failed %d\n", ++ zi->i_zsector, ret); ++ zonefs_warn(sb, ++ "remounting filesystem read-only\n"); ++ sb->s_flags |= SB_RDONLY; ++ } ++ goto unlock; ++ } ++ ++ zi->i_flags &= ~ZONEFS_ZONE_OPEN; ++ zonefs_account_active(inode); ++ } ++ ++ atomic_dec(&sbi->s_wro_seq_files); ++ ++unlock: ++ mutex_unlock(&zi->i_truncate_mutex); ++} ++ ++static int zonefs_file_release(struct inode *inode, struct file *file) ++{ ++ /* ++ * If we explicitly open a zone we must close it again as well, but the ++ * zone management operation can fail (either due to an IO error or as ++ * the zone has gone offline or read-only). Make sure we don't fail the ++ * close(2) for user-space. 
++ */ ++ if (zonefs_seq_file_need_wro(inode, file)) ++ zonefs_seq_file_write_close(inode); ++ ++ return 0; ++} ++ ++const struct file_operations zonefs_file_operations = { ++ .open = zonefs_file_open, ++ .release = zonefs_file_release, ++ .fsync = zonefs_file_fsync, ++ .mmap = zonefs_file_mmap, ++ .llseek = zonefs_file_llseek, ++ .read_iter = zonefs_file_read_iter, ++ .write_iter = zonefs_file_write_iter, ++ .splice_read = generic_file_splice_read, ++ .splice_write = iter_file_splice_write, ++ .iopoll = iocb_bio_iopoll, ++}; +diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c +index a9c5c3f720adf..e808276b88018 100644 +--- a/fs/zonefs/super.c ++++ b/fs/zonefs/super.c +@@ -30,7 +30,7 @@ + /* + * Manage the active zone count. Called with zi->i_truncate_mutex held. + */ +-static void zonefs_account_active(struct inode *inode) ++void zonefs_account_active(struct inode *inode) + { + struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb); + struct zonefs_inode_info *zi = ZONEFS_I(inode); +@@ -68,7 +68,7 @@ static void zonefs_account_active(struct inode *inode) + } + } + +-static inline int zonefs_zone_mgmt(struct inode *inode, enum req_op op) ++int zonefs_zone_mgmt(struct inode *inode, enum req_op op) + { + struct zonefs_inode_info *zi = ZONEFS_I(inode); + int ret; +@@ -99,7 +99,7 @@ static inline int zonefs_zone_mgmt(struct inode *inode, enum req_op op) + return 0; + } + +-static inline void zonefs_i_size_write(struct inode *inode, loff_t isize) ++void zonefs_i_size_write(struct inode *inode, loff_t isize) + { + struct zonefs_inode_info *zi = ZONEFS_I(inode); + +@@ -117,167 +117,7 @@ static inline void zonefs_i_size_write(struct inode *inode, loff_t isize) + } + } + +-static int zonefs_read_iomap_begin(struct inode *inode, loff_t offset, +- loff_t length, unsigned int flags, +- struct iomap *iomap, struct iomap *srcmap) +-{ +- struct zonefs_inode_info *zi = ZONEFS_I(inode); +- struct super_block *sb = inode->i_sb; +- loff_t isize; +- +- /* +- * All blocks are always mapped below EOF. If reading past EOF, +- * act as if there is a hole up to the file maximum size. +- */ +- mutex_lock(&zi->i_truncate_mutex); +- iomap->bdev = inode->i_sb->s_bdev; +- iomap->offset = ALIGN_DOWN(offset, sb->s_blocksize); +- isize = i_size_read(inode); +- if (iomap->offset >= isize) { +- iomap->type = IOMAP_HOLE; +- iomap->addr = IOMAP_NULL_ADDR; +- iomap->length = length; +- } else { +- iomap->type = IOMAP_MAPPED; +- iomap->addr = (zi->i_zsector << SECTOR_SHIFT) + iomap->offset; +- iomap->length = isize - iomap->offset; +- } +- mutex_unlock(&zi->i_truncate_mutex); +- +- trace_zonefs_iomap_begin(inode, iomap); +- +- return 0; +-} +- +-static const struct iomap_ops zonefs_read_iomap_ops = { +- .iomap_begin = zonefs_read_iomap_begin, +-}; +- +-static int zonefs_write_iomap_begin(struct inode *inode, loff_t offset, +- loff_t length, unsigned int flags, +- struct iomap *iomap, struct iomap *srcmap) +-{ +- struct zonefs_inode_info *zi = ZONEFS_I(inode); +- struct super_block *sb = inode->i_sb; +- loff_t isize; +- +- /* All write I/Os should always be within the file maximum size */ +- if (WARN_ON_ONCE(offset + length > zi->i_max_size)) +- return -EIO; +- +- /* +- * Sequential zones can only accept direct writes. This is already +- * checked when writes are issued, so warn if we see a page writeback +- * operation. +- */ +- if (WARN_ON_ONCE(zi->i_ztype == ZONEFS_ZTYPE_SEQ && +- !(flags & IOMAP_DIRECT))) +- return -EIO; +- +- /* +- * For conventional zones, all blocks are always mapped. 
For sequential +- * zones, all blocks after always mapped below the inode size (zone +- * write pointer) and unwriten beyond. +- */ +- mutex_lock(&zi->i_truncate_mutex); +- iomap->bdev = inode->i_sb->s_bdev; +- iomap->offset = ALIGN_DOWN(offset, sb->s_blocksize); +- iomap->addr = (zi->i_zsector << SECTOR_SHIFT) + iomap->offset; +- isize = i_size_read(inode); +- if (iomap->offset >= isize) { +- iomap->type = IOMAP_UNWRITTEN; +- iomap->length = zi->i_max_size - iomap->offset; +- } else { +- iomap->type = IOMAP_MAPPED; +- iomap->length = isize - iomap->offset; +- } +- mutex_unlock(&zi->i_truncate_mutex); +- +- trace_zonefs_iomap_begin(inode, iomap); +- +- return 0; +-} +- +-static const struct iomap_ops zonefs_write_iomap_ops = { +- .iomap_begin = zonefs_write_iomap_begin, +-}; +- +-static int zonefs_read_folio(struct file *unused, struct folio *folio) +-{ +- return iomap_read_folio(folio, &zonefs_read_iomap_ops); +-} +- +-static void zonefs_readahead(struct readahead_control *rac) +-{ +- iomap_readahead(rac, &zonefs_read_iomap_ops); +-} +- +-/* +- * Map blocks for page writeback. This is used only on conventional zone files, +- * which implies that the page range can only be within the fixed inode size. +- */ +-static int zonefs_write_map_blocks(struct iomap_writepage_ctx *wpc, +- struct inode *inode, loff_t offset) +-{ +- struct zonefs_inode_info *zi = ZONEFS_I(inode); +- +- if (WARN_ON_ONCE(zi->i_ztype != ZONEFS_ZTYPE_CNV)) +- return -EIO; +- if (WARN_ON_ONCE(offset >= i_size_read(inode))) +- return -EIO; +- +- /* If the mapping is already OK, nothing needs to be done */ +- if (offset >= wpc->iomap.offset && +- offset < wpc->iomap.offset + wpc->iomap.length) +- return 0; +- +- return zonefs_write_iomap_begin(inode, offset, zi->i_max_size - offset, +- IOMAP_WRITE, &wpc->iomap, NULL); +-} +- +-static const struct iomap_writeback_ops zonefs_writeback_ops = { +- .map_blocks = zonefs_write_map_blocks, +-}; +- +-static int zonefs_writepages(struct address_space *mapping, +- struct writeback_control *wbc) +-{ +- struct iomap_writepage_ctx wpc = { }; +- +- return iomap_writepages(mapping, wbc, &wpc, &zonefs_writeback_ops); +-} +- +-static int zonefs_swap_activate(struct swap_info_struct *sis, +- struct file *swap_file, sector_t *span) +-{ +- struct inode *inode = file_inode(swap_file); +- struct zonefs_inode_info *zi = ZONEFS_I(inode); +- +- if (zi->i_ztype != ZONEFS_ZTYPE_CNV) { +- zonefs_err(inode->i_sb, +- "swap file: not a conventional zone file\n"); +- return -EINVAL; +- } +- +- return iomap_swapfile_activate(sis, swap_file, span, +- &zonefs_read_iomap_ops); +-} +- +-static const struct address_space_operations zonefs_file_aops = { +- .read_folio = zonefs_read_folio, +- .readahead = zonefs_readahead, +- .writepages = zonefs_writepages, +- .dirty_folio = filemap_dirty_folio, +- .release_folio = iomap_release_folio, +- .invalidate_folio = iomap_invalidate_folio, +- .migrate_folio = filemap_migrate_folio, +- .is_partially_uptodate = iomap_is_partially_uptodate, +- .error_remove_page = generic_error_remove_page, +- .direct_IO = noop_direct_IO, +- .swap_activate = zonefs_swap_activate, +-}; +- +-static void zonefs_update_stats(struct inode *inode, loff_t new_isize) ++void zonefs_update_stats(struct inode *inode, loff_t new_isize) + { + struct super_block *sb = inode->i_sb; + struct zonefs_sb_info *sbi = ZONEFS_SB(sb); +@@ -487,7 +327,7 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx, + * eventually correct the file size and zonefs inode write pointer offset + * (which can 
be out of sync with the drive due to partial write failures). + */ +-static void __zonefs_io_error(struct inode *inode, bool write) ++void __zonefs_io_error(struct inode *inode, bool write) + { + struct zonefs_inode_info *zi = ZONEFS_I(inode); + struct super_block *sb = inode->i_sb; +@@ -526,749 +366,6 @@ static void __zonefs_io_error(struct inode *inode, bool write) + memalloc_noio_restore(noio_flag); + } + +-static void zonefs_io_error(struct inode *inode, bool write) +-{ +- struct zonefs_inode_info *zi = ZONEFS_I(inode); +- +- mutex_lock(&zi->i_truncate_mutex); +- __zonefs_io_error(inode, write); +- mutex_unlock(&zi->i_truncate_mutex); +-} +- +-static int zonefs_file_truncate(struct inode *inode, loff_t isize) +-{ +- struct zonefs_inode_info *zi = ZONEFS_I(inode); +- loff_t old_isize; +- enum req_op op; +- int ret = 0; +- +- /* +- * Only sequential zone files can be truncated and truncation is allowed +- * only down to a 0 size, which is equivalent to a zone reset, and to +- * the maximum file size, which is equivalent to a zone finish. +- */ +- if (zi->i_ztype != ZONEFS_ZTYPE_SEQ) +- return -EPERM; +- +- if (!isize) +- op = REQ_OP_ZONE_RESET; +- else if (isize == zi->i_max_size) +- op = REQ_OP_ZONE_FINISH; +- else +- return -EPERM; +- +- inode_dio_wait(inode); +- +- /* Serialize against page faults */ +- filemap_invalidate_lock(inode->i_mapping); +- +- /* Serialize against zonefs_iomap_begin() */ +- mutex_lock(&zi->i_truncate_mutex); +- +- old_isize = i_size_read(inode); +- if (isize == old_isize) +- goto unlock; +- +- ret = zonefs_zone_mgmt(inode, op); +- if (ret) +- goto unlock; +- +- /* +- * If the mount option ZONEFS_MNTOPT_EXPLICIT_OPEN is set, +- * take care of open zones. +- */ +- if (zi->i_flags & ZONEFS_ZONE_OPEN) { +- /* +- * Truncating a zone to EMPTY or FULL is the equivalent of +- * closing the zone. For a truncation to 0, we need to +- * re-open the zone to ensure new writes can be processed. +- * For a truncation to the maximum file size, the zone is +- * closed and writes cannot be accepted anymore, so clear +- * the open flag. +- */ +- if (!isize) +- ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_OPEN); +- else +- zi->i_flags &= ~ZONEFS_ZONE_OPEN; +- } +- +- zonefs_update_stats(inode, isize); +- truncate_setsize(inode, isize); +- zi->i_wpoffset = isize; +- zonefs_account_active(inode); +- +-unlock: +- mutex_unlock(&zi->i_truncate_mutex); +- filemap_invalidate_unlock(inode->i_mapping); +- +- return ret; +-} +- +-static int zonefs_inode_setattr(struct user_namespace *mnt_userns, +- struct dentry *dentry, struct iattr *iattr) +-{ +- struct inode *inode = d_inode(dentry); +- int ret; +- +- if (unlikely(IS_IMMUTABLE(inode))) +- return -EPERM; +- +- ret = setattr_prepare(&init_user_ns, dentry, iattr); +- if (ret) +- return ret; +- +- /* +- * Since files and directories cannot be created nor deleted, do not +- * allow setting any write attributes on the sub-directories grouping +- * files by zone type. 
+- */ +- if ((iattr->ia_valid & ATTR_MODE) && S_ISDIR(inode->i_mode) && +- (iattr->ia_mode & 0222)) +- return -EPERM; +- +- if (((iattr->ia_valid & ATTR_UID) && +- !uid_eq(iattr->ia_uid, inode->i_uid)) || +- ((iattr->ia_valid & ATTR_GID) && +- !gid_eq(iattr->ia_gid, inode->i_gid))) { +- ret = dquot_transfer(mnt_userns, inode, iattr); +- if (ret) +- return ret; +- } +- +- if (iattr->ia_valid & ATTR_SIZE) { +- ret = zonefs_file_truncate(inode, iattr->ia_size); +- if (ret) +- return ret; +- } +- +- setattr_copy(&init_user_ns, inode, iattr); +- +- return 0; +-} +- +-static const struct inode_operations zonefs_file_inode_operations = { +- .setattr = zonefs_inode_setattr, +-}; +- +-static int zonefs_file_fsync(struct file *file, loff_t start, loff_t end, +- int datasync) +-{ +- struct inode *inode = file_inode(file); +- int ret = 0; +- +- if (unlikely(IS_IMMUTABLE(inode))) +- return -EPERM; +- +- /* +- * Since only direct writes are allowed in sequential files, page cache +- * flush is needed only for conventional zone files. +- */ +- if (ZONEFS_I(inode)->i_ztype == ZONEFS_ZTYPE_CNV) +- ret = file_write_and_wait_range(file, start, end); +- if (!ret) +- ret = blkdev_issue_flush(inode->i_sb->s_bdev); +- +- if (ret) +- zonefs_io_error(inode, true); +- +- return ret; +-} +- +-static vm_fault_t zonefs_filemap_page_mkwrite(struct vm_fault *vmf) +-{ +- struct inode *inode = file_inode(vmf->vma->vm_file); +- struct zonefs_inode_info *zi = ZONEFS_I(inode); +- vm_fault_t ret; +- +- if (unlikely(IS_IMMUTABLE(inode))) +- return VM_FAULT_SIGBUS; +- +- /* +- * Sanity check: only conventional zone files can have shared +- * writeable mappings. +- */ +- if (WARN_ON_ONCE(zi->i_ztype != ZONEFS_ZTYPE_CNV)) +- return VM_FAULT_NOPAGE; +- +- sb_start_pagefault(inode->i_sb); +- file_update_time(vmf->vma->vm_file); +- +- /* Serialize against truncates */ +- filemap_invalidate_lock_shared(inode->i_mapping); +- ret = iomap_page_mkwrite(vmf, &zonefs_write_iomap_ops); +- filemap_invalidate_unlock_shared(inode->i_mapping); +- +- sb_end_pagefault(inode->i_sb); +- return ret; +-} +- +-static const struct vm_operations_struct zonefs_file_vm_ops = { +- .fault = filemap_fault, +- .map_pages = filemap_map_pages, +- .page_mkwrite = zonefs_filemap_page_mkwrite, +-}; +- +-static int zonefs_file_mmap(struct file *file, struct vm_area_struct *vma) +-{ +- /* +- * Conventional zones accept random writes, so their files can support +- * shared writable mappings. For sequential zone files, only read +- * mappings are possible since there are no guarantees for write +- * ordering between msync() and page cache writeback. +- */ +- if (ZONEFS_I(file_inode(file))->i_ztype == ZONEFS_ZTYPE_SEQ && +- (vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) +- return -EINVAL; +- +- file_accessed(file); +- vma->vm_ops = &zonefs_file_vm_ops; +- +- return 0; +-} +- +-static loff_t zonefs_file_llseek(struct file *file, loff_t offset, int whence) +-{ +- loff_t isize = i_size_read(file_inode(file)); +- +- /* +- * Seeks are limited to below the zone size for conventional zones +- * and below the zone write pointer for sequential zones. In both +- * cases, this limit is the inode size. 
+- */ +- return generic_file_llseek_size(file, offset, whence, isize, isize); +-} +- +-static int zonefs_file_write_dio_end_io(struct kiocb *iocb, ssize_t size, +- int error, unsigned int flags) +-{ +- struct inode *inode = file_inode(iocb->ki_filp); +- struct zonefs_inode_info *zi = ZONEFS_I(inode); +- +- if (error) { +- zonefs_io_error(inode, true); +- return error; +- } +- +- if (size && zi->i_ztype != ZONEFS_ZTYPE_CNV) { +- /* +- * Note that we may be seeing completions out of order, +- * but that is not a problem since a write completed +- * successfully necessarily means that all preceding writes +- * were also successful. So we can safely increase the inode +- * size to the write end location. +- */ +- mutex_lock(&zi->i_truncate_mutex); +- if (i_size_read(inode) < iocb->ki_pos + size) { +- zonefs_update_stats(inode, iocb->ki_pos + size); +- zonefs_i_size_write(inode, iocb->ki_pos + size); +- } +- mutex_unlock(&zi->i_truncate_mutex); +- } +- +- return 0; +-} +- +-static const struct iomap_dio_ops zonefs_write_dio_ops = { +- .end_io = zonefs_file_write_dio_end_io, +-}; +- +-static ssize_t zonefs_file_dio_append(struct kiocb *iocb, struct iov_iter *from) +-{ +- struct inode *inode = file_inode(iocb->ki_filp); +- struct zonefs_inode_info *zi = ZONEFS_I(inode); +- struct block_device *bdev = inode->i_sb->s_bdev; +- unsigned int max = bdev_max_zone_append_sectors(bdev); +- struct bio *bio; +- ssize_t size; +- int nr_pages; +- ssize_t ret; +- +- max = ALIGN_DOWN(max << SECTOR_SHIFT, inode->i_sb->s_blocksize); +- iov_iter_truncate(from, max); +- +- nr_pages = iov_iter_npages(from, BIO_MAX_VECS); +- if (!nr_pages) +- return 0; +- +- bio = bio_alloc(bdev, nr_pages, +- REQ_OP_ZONE_APPEND | REQ_SYNC | REQ_IDLE, GFP_NOFS); +- bio->bi_iter.bi_sector = zi->i_zsector; +- bio->bi_ioprio = iocb->ki_ioprio; +- if (iocb_is_dsync(iocb)) +- bio->bi_opf |= REQ_FUA; +- +- ret = bio_iov_iter_get_pages(bio, from); +- if (unlikely(ret)) +- goto out_release; +- +- size = bio->bi_iter.bi_size; +- task_io_account_write(size); +- +- if (iocb->ki_flags & IOCB_HIPRI) +- bio_set_polled(bio, iocb); +- +- ret = submit_bio_wait(bio); +- +- /* +- * If the file zone was written underneath the file system, the zone +- * write pointer may not be where we expect it to be, but the zone +- * append write can still succeed. So check manually that we wrote where +- * we intended to, that is, at zi->i_wpoffset. +- */ +- if (!ret) { +- sector_t wpsector = +- zi->i_zsector + (zi->i_wpoffset >> SECTOR_SHIFT); +- +- if (bio->bi_iter.bi_sector != wpsector) { +- zonefs_warn(inode->i_sb, +- "Corrupted write pointer %llu for zone at %llu\n", +- wpsector, zi->i_zsector); +- ret = -EIO; +- } +- } +- +- zonefs_file_write_dio_end_io(iocb, size, ret, 0); +- trace_zonefs_file_dio_append(inode, size, ret); +- +-out_release: +- bio_release_pages(bio, false); +- bio_put(bio); +- +- if (ret >= 0) { +- iocb->ki_pos += size; +- return size; +- } +- +- return ret; +-} +- +-/* +- * Do not exceed the LFS limits nor the file zone size. If pos is under the +- * limit it becomes a short access. If it exceeds the limit, return -EFBIG. 
+- */ +-static loff_t zonefs_write_check_limits(struct file *file, loff_t pos, +- loff_t count) +-{ +- struct inode *inode = file_inode(file); +- struct zonefs_inode_info *zi = ZONEFS_I(inode); +- loff_t limit = rlimit(RLIMIT_FSIZE); +- loff_t max_size = zi->i_max_size; +- +- if (limit != RLIM_INFINITY) { +- if (pos >= limit) { +- send_sig(SIGXFSZ, current, 0); +- return -EFBIG; +- } +- count = min(count, limit - pos); +- } +- +- if (!(file->f_flags & O_LARGEFILE)) +- max_size = min_t(loff_t, MAX_NON_LFS, max_size); +- +- if (unlikely(pos >= max_size)) +- return -EFBIG; +- +- return min(count, max_size - pos); +-} +- +-static ssize_t zonefs_write_checks(struct kiocb *iocb, struct iov_iter *from) +-{ +- struct file *file = iocb->ki_filp; +- struct inode *inode = file_inode(file); +- struct zonefs_inode_info *zi = ZONEFS_I(inode); +- loff_t count; +- +- if (IS_SWAPFILE(inode)) +- return -ETXTBSY; +- +- if (!iov_iter_count(from)) +- return 0; +- +- if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT)) +- return -EINVAL; +- +- if (iocb->ki_flags & IOCB_APPEND) { +- if (zi->i_ztype != ZONEFS_ZTYPE_SEQ) +- return -EINVAL; +- mutex_lock(&zi->i_truncate_mutex); +- iocb->ki_pos = zi->i_wpoffset; +- mutex_unlock(&zi->i_truncate_mutex); +- } +- +- count = zonefs_write_check_limits(file, iocb->ki_pos, +- iov_iter_count(from)); +- if (count < 0) +- return count; +- +- iov_iter_truncate(from, count); +- return iov_iter_count(from); +-} +- +-/* +- * Handle direct writes. For sequential zone files, this is the only possible +- * write path. For these files, check that the user is issuing writes +- * sequentially from the end of the file. This code assumes that the block layer +- * delivers write requests to the device in sequential order. This is always the +- * case if a block IO scheduler implementing the ELEVATOR_F_ZBD_SEQ_WRITE +- * elevator feature is being used (e.g. mq-deadline). The block layer always +- * automatically select such an elevator for zoned block devices during the +- * device initialization. +- */ +-static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from) +-{ +- struct inode *inode = file_inode(iocb->ki_filp); +- struct zonefs_inode_info *zi = ZONEFS_I(inode); +- struct super_block *sb = inode->i_sb; +- bool sync = is_sync_kiocb(iocb); +- bool append = false; +- ssize_t ret, count; +- +- /* +- * For async direct IOs to sequential zone files, refuse IOCB_NOWAIT +- * as this can cause write reordering (e.g. the first aio gets EAGAIN +- * on the inode lock but the second goes through but is now unaligned). 
+- */ +- if (zi->i_ztype == ZONEFS_ZTYPE_SEQ && !sync && +- (iocb->ki_flags & IOCB_NOWAIT)) +- return -EOPNOTSUPP; +- +- if (iocb->ki_flags & IOCB_NOWAIT) { +- if (!inode_trylock(inode)) +- return -EAGAIN; +- } else { +- inode_lock(inode); +- } +- +- count = zonefs_write_checks(iocb, from); +- if (count <= 0) { +- ret = count; +- goto inode_unlock; +- } +- +- if ((iocb->ki_pos | count) & (sb->s_blocksize - 1)) { +- ret = -EINVAL; +- goto inode_unlock; +- } +- +- /* Enforce sequential writes (append only) in sequential zones */ +- if (zi->i_ztype == ZONEFS_ZTYPE_SEQ) { +- mutex_lock(&zi->i_truncate_mutex); +- if (iocb->ki_pos != zi->i_wpoffset) { +- mutex_unlock(&zi->i_truncate_mutex); +- ret = -EINVAL; +- goto inode_unlock; +- } +- mutex_unlock(&zi->i_truncate_mutex); +- append = sync; +- } +- +- if (append) +- ret = zonefs_file_dio_append(iocb, from); +- else +- ret = iomap_dio_rw(iocb, from, &zonefs_write_iomap_ops, +- &zonefs_write_dio_ops, 0, NULL, 0); +- if (zi->i_ztype == ZONEFS_ZTYPE_SEQ && +- (ret > 0 || ret == -EIOCBQUEUED)) { +- if (ret > 0) +- count = ret; +- +- /* +- * Update the zone write pointer offset assuming the write +- * operation succeeded. If it did not, the error recovery path +- * will correct it. Also do active seq file accounting. +- */ +- mutex_lock(&zi->i_truncate_mutex); +- zi->i_wpoffset += count; +- zonefs_account_active(inode); +- mutex_unlock(&zi->i_truncate_mutex); +- } +- +-inode_unlock: +- inode_unlock(inode); +- +- return ret; +-} +- +-static ssize_t zonefs_file_buffered_write(struct kiocb *iocb, +- struct iov_iter *from) +-{ +- struct inode *inode = file_inode(iocb->ki_filp); +- struct zonefs_inode_info *zi = ZONEFS_I(inode); +- ssize_t ret; +- +- /* +- * Direct IO writes are mandatory for sequential zone files so that the +- * write IO issuing order is preserved. 
+- */ +- if (zi->i_ztype != ZONEFS_ZTYPE_CNV) +- return -EIO; +- +- if (iocb->ki_flags & IOCB_NOWAIT) { +- if (!inode_trylock(inode)) +- return -EAGAIN; +- } else { +- inode_lock(inode); +- } +- +- ret = zonefs_write_checks(iocb, from); +- if (ret <= 0) +- goto inode_unlock; +- +- ret = iomap_file_buffered_write(iocb, from, &zonefs_write_iomap_ops); +- if (ret > 0) +- iocb->ki_pos += ret; +- else if (ret == -EIO) +- zonefs_io_error(inode, true); +- +-inode_unlock: +- inode_unlock(inode); +- if (ret > 0) +- ret = generic_write_sync(iocb, ret); +- +- return ret; +-} +- +-static ssize_t zonefs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) +-{ +- struct inode *inode = file_inode(iocb->ki_filp); +- +- if (unlikely(IS_IMMUTABLE(inode))) +- return -EPERM; +- +- if (sb_rdonly(inode->i_sb)) +- return -EROFS; +- +- /* Write operations beyond the zone size are not allowed */ +- if (iocb->ki_pos >= ZONEFS_I(inode)->i_max_size) +- return -EFBIG; +- +- if (iocb->ki_flags & IOCB_DIRECT) { +- ssize_t ret = zonefs_file_dio_write(iocb, from); +- if (ret != -ENOTBLK) +- return ret; +- } +- +- return zonefs_file_buffered_write(iocb, from); +-} +- +-static int zonefs_file_read_dio_end_io(struct kiocb *iocb, ssize_t size, +- int error, unsigned int flags) +-{ +- if (error) { +- zonefs_io_error(file_inode(iocb->ki_filp), false); +- return error; +- } +- +- return 0; +-} +- +-static const struct iomap_dio_ops zonefs_read_dio_ops = { +- .end_io = zonefs_file_read_dio_end_io, +-}; +- +-static ssize_t zonefs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) +-{ +- struct inode *inode = file_inode(iocb->ki_filp); +- struct zonefs_inode_info *zi = ZONEFS_I(inode); +- struct super_block *sb = inode->i_sb; +- loff_t isize; +- ssize_t ret; +- +- /* Offline zones cannot be read */ +- if (unlikely(IS_IMMUTABLE(inode) && !(inode->i_mode & 0777))) +- return -EPERM; +- +- if (iocb->ki_pos >= zi->i_max_size) +- return 0; +- +- if (iocb->ki_flags & IOCB_NOWAIT) { +- if (!inode_trylock_shared(inode)) +- return -EAGAIN; +- } else { +- inode_lock_shared(inode); +- } +- +- /* Limit read operations to written data */ +- mutex_lock(&zi->i_truncate_mutex); +- isize = i_size_read(inode); +- if (iocb->ki_pos >= isize) { +- mutex_unlock(&zi->i_truncate_mutex); +- ret = 0; +- goto inode_unlock; +- } +- iov_iter_truncate(to, isize - iocb->ki_pos); +- mutex_unlock(&zi->i_truncate_mutex); +- +- if (iocb->ki_flags & IOCB_DIRECT) { +- size_t count = iov_iter_count(to); +- +- if ((iocb->ki_pos | count) & (sb->s_blocksize - 1)) { +- ret = -EINVAL; +- goto inode_unlock; +- } +- file_accessed(iocb->ki_filp); +- ret = iomap_dio_rw(iocb, to, &zonefs_read_iomap_ops, +- &zonefs_read_dio_ops, 0, NULL, 0); +- } else { +- ret = generic_file_read_iter(iocb, to); +- if (ret == -EIO) +- zonefs_io_error(inode, false); +- } +- +-inode_unlock: +- inode_unlock_shared(inode); +- +- return ret; +-} +- +-/* +- * Write open accounting is done only for sequential files. 
+- */ +-static inline bool zonefs_seq_file_need_wro(struct inode *inode, +- struct file *file) +-{ +- struct zonefs_inode_info *zi = ZONEFS_I(inode); +- +- if (zi->i_ztype != ZONEFS_ZTYPE_SEQ) +- return false; +- +- if (!(file->f_mode & FMODE_WRITE)) +- return false; +- +- return true; +-} +- +-static int zonefs_seq_file_write_open(struct inode *inode) +-{ +- struct zonefs_inode_info *zi = ZONEFS_I(inode); +- int ret = 0; +- +- mutex_lock(&zi->i_truncate_mutex); +- +- if (!zi->i_wr_refcnt) { +- struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb); +- unsigned int wro = atomic_inc_return(&sbi->s_wro_seq_files); +- +- if (sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN) { +- +- if (sbi->s_max_wro_seq_files +- && wro > sbi->s_max_wro_seq_files) { +- atomic_dec(&sbi->s_wro_seq_files); +- ret = -EBUSY; +- goto unlock; +- } +- +- if (i_size_read(inode) < zi->i_max_size) { +- ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_OPEN); +- if (ret) { +- atomic_dec(&sbi->s_wro_seq_files); +- goto unlock; +- } +- zi->i_flags |= ZONEFS_ZONE_OPEN; +- zonefs_account_active(inode); +- } +- } +- } +- +- zi->i_wr_refcnt++; +- +-unlock: +- mutex_unlock(&zi->i_truncate_mutex); +- +- return ret; +-} +- +-static int zonefs_file_open(struct inode *inode, struct file *file) +-{ +- int ret; +- +- ret = generic_file_open(inode, file); +- if (ret) +- return ret; +- +- if (zonefs_seq_file_need_wro(inode, file)) +- return zonefs_seq_file_write_open(inode); +- +- return 0; +-} +- +-static void zonefs_seq_file_write_close(struct inode *inode) +-{ +- struct zonefs_inode_info *zi = ZONEFS_I(inode); +- struct super_block *sb = inode->i_sb; +- struct zonefs_sb_info *sbi = ZONEFS_SB(sb); +- int ret = 0; +- +- mutex_lock(&zi->i_truncate_mutex); +- +- zi->i_wr_refcnt--; +- if (zi->i_wr_refcnt) +- goto unlock; +- +- /* +- * The file zone may not be open anymore (e.g. the file was truncated to +- * its maximum size or it was fully written). For this case, we only +- * need to decrement the write open count. +- */ +- if (zi->i_flags & ZONEFS_ZONE_OPEN) { +- ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_CLOSE); +- if (ret) { +- __zonefs_io_error(inode, false); +- /* +- * Leaving zones explicitly open may lead to a state +- * where most zones cannot be written (zone resources +- * exhausted). So take preventive action by remounting +- * read-only. +- */ +- if (zi->i_flags & ZONEFS_ZONE_OPEN && +- !(sb->s_flags & SB_RDONLY)) { +- zonefs_warn(sb, +- "closing zone at %llu failed %d\n", +- zi->i_zsector, ret); +- zonefs_warn(sb, +- "remounting filesystem read-only\n"); +- sb->s_flags |= SB_RDONLY; +- } +- goto unlock; +- } +- +- zi->i_flags &= ~ZONEFS_ZONE_OPEN; +- zonefs_account_active(inode); +- } +- +- atomic_dec(&sbi->s_wro_seq_files); +- +-unlock: +- mutex_unlock(&zi->i_truncate_mutex); +-} +- +-static int zonefs_file_release(struct inode *inode, struct file *file) +-{ +- /* +- * If we explicitly open a zone we must close it again as well, but the +- * zone management operation can fail (either due to an IO error or as +- * the zone has gone offline or read-only). Make sure we don't fail the +- * close(2) for user-space. 
+- */ +- if (zonefs_seq_file_need_wro(inode, file)) +- zonefs_seq_file_write_close(inode); +- +- return 0; +-} +- +-static const struct file_operations zonefs_file_operations = { +- .open = zonefs_file_open, +- .release = zonefs_file_release, +- .fsync = zonefs_file_fsync, +- .mmap = zonefs_file_mmap, +- .llseek = zonefs_file_llseek, +- .read_iter = zonefs_file_read_iter, +- .write_iter = zonefs_file_write_iter, +- .splice_read = generic_file_splice_read, +- .splice_write = iter_file_splice_write, +- .iopoll = iocb_bio_iopoll, +-}; +- + static struct kmem_cache *zonefs_inode_cachep; + + static struct inode *zonefs_alloc_inode(struct super_block *sb) +@@ -1408,13 +505,47 @@ static int zonefs_remount(struct super_block *sb, int *flags, char *data) + return zonefs_parse_options(sb, data); + } + +-static const struct super_operations zonefs_sops = { +- .alloc_inode = zonefs_alloc_inode, +- .free_inode = zonefs_free_inode, +- .statfs = zonefs_statfs, +- .remount_fs = zonefs_remount, +- .show_options = zonefs_show_options, +-}; ++static int zonefs_inode_setattr(struct user_namespace *mnt_userns, ++ struct dentry *dentry, struct iattr *iattr) ++{ ++ struct inode *inode = d_inode(dentry); ++ int ret; ++ ++ if (unlikely(IS_IMMUTABLE(inode))) ++ return -EPERM; ++ ++ ret = setattr_prepare(&init_user_ns, dentry, iattr); ++ if (ret) ++ return ret; ++ ++ /* ++ * Since files and directories cannot be created nor deleted, do not ++ * allow setting any write attributes on the sub-directories grouping ++ * files by zone type. ++ */ ++ if ((iattr->ia_valid & ATTR_MODE) && S_ISDIR(inode->i_mode) && ++ (iattr->ia_mode & 0222)) ++ return -EPERM; ++ ++ if (((iattr->ia_valid & ATTR_UID) && ++ !uid_eq(iattr->ia_uid, inode->i_uid)) || ++ ((iattr->ia_valid & ATTR_GID) && ++ !gid_eq(iattr->ia_gid, inode->i_gid))) { ++ ret = dquot_transfer(mnt_userns, inode, iattr); ++ if (ret) ++ return ret; ++ } ++ ++ if (iattr->ia_valid & ATTR_SIZE) { ++ ret = zonefs_file_truncate(inode, iattr->ia_size); ++ if (ret) ++ return ret; ++ } ++ ++ setattr_copy(&init_user_ns, inode, iattr); ++ ++ return 0; ++} + + static const struct inode_operations zonefs_dir_inode_operations = { + .lookup = simple_lookup, +@@ -1434,6 +565,10 @@ static void zonefs_init_dir_inode(struct inode *parent, struct inode *inode, + inc_nlink(parent); + } + ++static const struct inode_operations zonefs_file_inode_operations = { ++ .setattr = zonefs_inode_setattr, ++}; ++ + static int zonefs_init_file_inode(struct inode *inode, struct blk_zone *zone, + enum zonefs_ztype type) + { +@@ -1785,6 +920,14 @@ static int zonefs_read_super(struct super_block *sb) + return ret; + } + ++static const struct super_operations zonefs_sops = { ++ .alloc_inode = zonefs_alloc_inode, ++ .free_inode = zonefs_free_inode, ++ .statfs = zonefs_statfs, ++ .remount_fs = zonefs_remount, ++ .show_options = zonefs_show_options, ++}; ++ + /* + * Check that the device is zoned. If it is, get the list of zones and create + * sub-directories and files according to the device zone configuration and +diff --git a/fs/zonefs/zonefs.h b/fs/zonefs/zonefs.h +index 1dbe78119ff16..839ebe9afb6c1 100644 +--- a/fs/zonefs/zonefs.h ++++ b/fs/zonefs/zonefs.h +@@ -209,6 +209,28 @@ static inline struct zonefs_sb_info *ZONEFS_SB(struct super_block *sb) + #define zonefs_warn(sb, format, args...) 
\ + pr_warn("zonefs (%s) WARNING: " format, sb->s_id, ## args) + ++/* In super.c */ ++void zonefs_account_active(struct inode *inode); ++int zonefs_zone_mgmt(struct inode *inode, enum req_op op); ++void zonefs_i_size_write(struct inode *inode, loff_t isize); ++void zonefs_update_stats(struct inode *inode, loff_t new_isize); ++void __zonefs_io_error(struct inode *inode, bool write); ++ ++static inline void zonefs_io_error(struct inode *inode, bool write) ++{ ++ struct zonefs_inode_info *zi = ZONEFS_I(inode); ++ ++ mutex_lock(&zi->i_truncate_mutex); ++ __zonefs_io_error(inode, write); ++ mutex_unlock(&zi->i_truncate_mutex); ++} ++ ++/* In file.c */ ++extern const struct address_space_operations zonefs_file_aops; ++extern const struct file_operations zonefs_file_operations; ++int zonefs_file_truncate(struct inode *inode, loff_t isize); ++ ++/* In sysfs.c */ + int zonefs_sysfs_register(struct super_block *sb); + void zonefs_sysfs_unregister(struct super_block *sb); + int zonefs_sysfs_init(void); +-- +2.39.2 + diff --git a/queue-6.1/zonefs-separate-zone-information-from-inode-informat.patch b/queue-6.1/zonefs-separate-zone-information-from-inode-informat.patch new file mode 100644 index 00000000000..64a88d8b794 --- /dev/null +++ b/queue-6.1/zonefs-separate-zone-information-from-inode-informat.patch @@ -0,0 +1,1485 @@ +From 7fa0c6f6351e25a9e83feab49308a1b92daf841c Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 16 Nov 2022 18:15:40 +0900 +Subject: zonefs: Separate zone information from inode information + +From: Damien Le Moal + +[ Upstream commit aa7f243f32e1d18036ee00d71d3ccfad70ae2121 ] + +In preparation for adding dynamic inode allocation, separate an inode +zone information from the zonefs inode structure. The new data structure +zonefs_zone is introduced to store in memory information about a zone +that must be kept throughout the lifetime of the device mount. + +Linking between a zone file inode and its zone information is done by +setting the inode i_private field to point to a struct zonefs_zone. +Using the i_private pointer avoids the need for adding a pointer in +struct zonefs_inode_info. Beside the vfs inode, this structure is +reduced to a mutex and a write open counter. + +One struct zonefs_zone is created per file inode on mount. These +structures are organized in an array using the new struct +zonefs_zone_group data structure to represent zone groups. The +zonefs_zone arrays are indexed per file number (the index of a struct +zonefs_zone in its array directly gives the file number/name for that +zone file inode). 
+ +Signed-off-by: Damien Le Moal +Reviewed-by: Johannes Thumshirn +Stable-dep-of: 88b170088ad2 ("zonefs: Fix error message in zonefs_file_dio_append()") +Signed-off-by: Sasha Levin +--- + fs/zonefs/file.c | 99 ++++---- + fs/zonefs/super.c | 571 +++++++++++++++++++++++++++------------------ + fs/zonefs/trace.h | 20 +- + fs/zonefs/zonefs.h | 63 +++-- + 4 files changed, 449 insertions(+), 304 deletions(-) + +diff --git a/fs/zonefs/file.c b/fs/zonefs/file.c +index 64873d31d75dd..738b0e28d74b5 100644 +--- a/fs/zonefs/file.c ++++ b/fs/zonefs/file.c +@@ -29,6 +29,7 @@ static int zonefs_read_iomap_begin(struct inode *inode, loff_t offset, + struct iomap *iomap, struct iomap *srcmap) + { + struct zonefs_inode_info *zi = ZONEFS_I(inode); ++ struct zonefs_zone *z = zonefs_inode_zone(inode); + struct super_block *sb = inode->i_sb; + loff_t isize; + +@@ -46,7 +47,7 @@ static int zonefs_read_iomap_begin(struct inode *inode, loff_t offset, + iomap->length = length; + } else { + iomap->type = IOMAP_MAPPED; +- iomap->addr = (zi->i_zsector << SECTOR_SHIFT) + iomap->offset; ++ iomap->addr = (z->z_sector << SECTOR_SHIFT) + iomap->offset; + iomap->length = isize - iomap->offset; + } + mutex_unlock(&zi->i_truncate_mutex); +@@ -65,11 +66,12 @@ static int zonefs_write_iomap_begin(struct inode *inode, loff_t offset, + struct iomap *iomap, struct iomap *srcmap) + { + struct zonefs_inode_info *zi = ZONEFS_I(inode); ++ struct zonefs_zone *z = zonefs_inode_zone(inode); + struct super_block *sb = inode->i_sb; + loff_t isize; + + /* All write I/Os should always be within the file maximum size */ +- if (WARN_ON_ONCE(offset + length > zi->i_max_size)) ++ if (WARN_ON_ONCE(offset + length > z->z_capacity)) + return -EIO; + + /* +@@ -77,7 +79,7 @@ static int zonefs_write_iomap_begin(struct inode *inode, loff_t offset, + * checked when writes are issued, so warn if we see a page writeback + * operation. 
+ */ +- if (WARN_ON_ONCE(zonefs_zone_is_seq(zi) && !(flags & IOMAP_DIRECT))) ++ if (WARN_ON_ONCE(zonefs_zone_is_seq(z) && !(flags & IOMAP_DIRECT))) + return -EIO; + + /* +@@ -88,11 +90,11 @@ static int zonefs_write_iomap_begin(struct inode *inode, loff_t offset, + mutex_lock(&zi->i_truncate_mutex); + iomap->bdev = inode->i_sb->s_bdev; + iomap->offset = ALIGN_DOWN(offset, sb->s_blocksize); +- iomap->addr = (zi->i_zsector << SECTOR_SHIFT) + iomap->offset; ++ iomap->addr = (z->z_sector << SECTOR_SHIFT) + iomap->offset; + isize = i_size_read(inode); + if (iomap->offset >= isize) { + iomap->type = IOMAP_UNWRITTEN; +- iomap->length = zi->i_max_size - iomap->offset; ++ iomap->length = z->z_capacity - iomap->offset; + } else { + iomap->type = IOMAP_MAPPED; + iomap->length = isize - iomap->offset; +@@ -125,9 +127,9 @@ static void zonefs_readahead(struct readahead_control *rac) + static int zonefs_write_map_blocks(struct iomap_writepage_ctx *wpc, + struct inode *inode, loff_t offset) + { +- struct zonefs_inode_info *zi = ZONEFS_I(inode); ++ struct zonefs_zone *z = zonefs_inode_zone(inode); + +- if (WARN_ON_ONCE(zonefs_zone_is_seq(zi))) ++ if (WARN_ON_ONCE(zonefs_zone_is_seq(z))) + return -EIO; + if (WARN_ON_ONCE(offset >= i_size_read(inode))) + return -EIO; +@@ -137,7 +139,8 @@ static int zonefs_write_map_blocks(struct iomap_writepage_ctx *wpc, + offset < wpc->iomap.offset + wpc->iomap.length) + return 0; + +- return zonefs_write_iomap_begin(inode, offset, zi->i_max_size - offset, ++ return zonefs_write_iomap_begin(inode, offset, ++ z->z_capacity - offset, + IOMAP_WRITE, &wpc->iomap, NULL); + } + +@@ -185,6 +188,7 @@ const struct address_space_operations zonefs_file_aops = { + int zonefs_file_truncate(struct inode *inode, loff_t isize) + { + struct zonefs_inode_info *zi = ZONEFS_I(inode); ++ struct zonefs_zone *z = zonefs_inode_zone(inode); + loff_t old_isize; + enum req_op op; + int ret = 0; +@@ -194,12 +198,12 @@ int zonefs_file_truncate(struct inode *inode, loff_t isize) + * only down to a 0 size, which is equivalent to a zone reset, and to + * the maximum file size, which is equivalent to a zone finish. + */ +- if (!zonefs_zone_is_seq(zi)) ++ if (!zonefs_zone_is_seq(z)) + return -EPERM; + + if (!isize) + op = REQ_OP_ZONE_RESET; +- else if (isize == zi->i_max_size) ++ else if (isize == z->z_capacity) + op = REQ_OP_ZONE_FINISH; + else + return -EPERM; +@@ -216,7 +220,7 @@ int zonefs_file_truncate(struct inode *inode, loff_t isize) + if (isize == old_isize) + goto unlock; + +- ret = zonefs_zone_mgmt(inode, op); ++ ret = zonefs_inode_zone_mgmt(inode, op); + if (ret) + goto unlock; + +@@ -224,7 +228,7 @@ int zonefs_file_truncate(struct inode *inode, loff_t isize) + * If the mount option ZONEFS_MNTOPT_EXPLICIT_OPEN is set, + * take care of open zones. + */ +- if (zi->i_flags & ZONEFS_ZONE_OPEN) { ++ if (z->z_flags & ZONEFS_ZONE_OPEN) { + /* + * Truncating a zone to EMPTY or FULL is the equivalent of + * closing the zone. For a truncation to 0, we need to +@@ -234,15 +238,15 @@ int zonefs_file_truncate(struct inode *inode, loff_t isize) + * the open flag. 
+ */ + if (!isize) +- ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_OPEN); ++ ret = zonefs_inode_zone_mgmt(inode, REQ_OP_ZONE_OPEN); + else +- zi->i_flags &= ~ZONEFS_ZONE_OPEN; ++ z->z_flags &= ~ZONEFS_ZONE_OPEN; + } + + zonefs_update_stats(inode, isize); + truncate_setsize(inode, isize); +- zi->i_wpoffset = isize; +- zonefs_account_active(inode); ++ z->z_wpoffset = isize; ++ zonefs_inode_account_active(inode); + + unlock: + mutex_unlock(&zi->i_truncate_mutex); +@@ -349,7 +353,7 @@ static int zonefs_file_write_dio_end_io(struct kiocb *iocb, ssize_t size, + return error; + } + +- if (size && zonefs_zone_is_seq(zi)) { ++ if (size && zonefs_inode_is_seq(inode)) { + /* + * Note that we may be seeing completions out of order, + * but that is not a problem since a write completed +@@ -375,7 +379,7 @@ static const struct iomap_dio_ops zonefs_write_dio_ops = { + static ssize_t zonefs_file_dio_append(struct kiocb *iocb, struct iov_iter *from) + { + struct inode *inode = file_inode(iocb->ki_filp); +- struct zonefs_inode_info *zi = ZONEFS_I(inode); ++ struct zonefs_zone *z = zonefs_inode_zone(inode); + struct block_device *bdev = inode->i_sb->s_bdev; + unsigned int max = bdev_max_zone_append_sectors(bdev); + struct bio *bio; +@@ -392,7 +396,7 @@ static ssize_t zonefs_file_dio_append(struct kiocb *iocb, struct iov_iter *from) + + bio = bio_alloc(bdev, nr_pages, + REQ_OP_ZONE_APPEND | REQ_SYNC | REQ_IDLE, GFP_NOFS); +- bio->bi_iter.bi_sector = zi->i_zsector; ++ bio->bi_iter.bi_sector = z->z_sector; + bio->bi_ioprio = iocb->ki_ioprio; + if (iocb_is_dsync(iocb)) + bio->bi_opf |= REQ_FUA; +@@ -417,12 +421,12 @@ static ssize_t zonefs_file_dio_append(struct kiocb *iocb, struct iov_iter *from) + */ + if (!ret) { + sector_t wpsector = +- zi->i_zsector + (zi->i_wpoffset >> SECTOR_SHIFT); ++ z->z_sector + (z->z_wpoffset >> SECTOR_SHIFT); + + if (bio->bi_iter.bi_sector != wpsector) { + zonefs_warn(inode->i_sb, + "Corrupted write pointer %llu for zone at %llu\n", +- wpsector, zi->i_zsector); ++ wpsector, z->z_sector); + ret = -EIO; + } + } +@@ -450,9 +454,9 @@ static loff_t zonefs_write_check_limits(struct file *file, loff_t pos, + loff_t count) + { + struct inode *inode = file_inode(file); +- struct zonefs_inode_info *zi = ZONEFS_I(inode); ++ struct zonefs_zone *z = zonefs_inode_zone(inode); + loff_t limit = rlimit(RLIMIT_FSIZE); +- loff_t max_size = zi->i_max_size; ++ loff_t max_size = z->z_capacity; + + if (limit != RLIM_INFINITY) { + if (pos >= limit) { +@@ -476,6 +480,7 @@ static ssize_t zonefs_write_checks(struct kiocb *iocb, struct iov_iter *from) + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); + struct zonefs_inode_info *zi = ZONEFS_I(inode); ++ struct zonefs_zone *z = zonefs_inode_zone(inode); + loff_t count; + + if (IS_SWAPFILE(inode)) +@@ -488,10 +493,10 @@ static ssize_t zonefs_write_checks(struct kiocb *iocb, struct iov_iter *from) + return -EINVAL; + + if (iocb->ki_flags & IOCB_APPEND) { +- if (zonefs_zone_is_cnv(zi)) ++ if (zonefs_zone_is_cnv(z)) + return -EINVAL; + mutex_lock(&zi->i_truncate_mutex); +- iocb->ki_pos = zi->i_wpoffset; ++ iocb->ki_pos = z->z_wpoffset; + mutex_unlock(&zi->i_truncate_mutex); + } + +@@ -518,6 +523,7 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from) + { + struct inode *inode = file_inode(iocb->ki_filp); + struct zonefs_inode_info *zi = ZONEFS_I(inode); ++ struct zonefs_zone *z = zonefs_inode_zone(inode); + struct super_block *sb = inode->i_sb; + bool sync = is_sync_kiocb(iocb); + bool append = false; +@@ -528,7 
+534,7 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from) + * as this can cause write reordering (e.g. the first aio gets EAGAIN + * on the inode lock but the second goes through but is now unaligned). + */ +- if (zonefs_zone_is_seq(zi) && !sync && (iocb->ki_flags & IOCB_NOWAIT)) ++ if (zonefs_zone_is_seq(z) && !sync && (iocb->ki_flags & IOCB_NOWAIT)) + return -EOPNOTSUPP; + + if (iocb->ki_flags & IOCB_NOWAIT) { +@@ -550,9 +556,9 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from) + } + + /* Enforce sequential writes (append only) in sequential zones */ +- if (zonefs_zone_is_seq(zi)) { ++ if (zonefs_zone_is_seq(z)) { + mutex_lock(&zi->i_truncate_mutex); +- if (iocb->ki_pos != zi->i_wpoffset) { ++ if (iocb->ki_pos != z->z_wpoffset) { + mutex_unlock(&zi->i_truncate_mutex); + ret = -EINVAL; + goto inode_unlock; +@@ -566,7 +572,7 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from) + else + ret = iomap_dio_rw(iocb, from, &zonefs_write_iomap_ops, + &zonefs_write_dio_ops, 0, NULL, 0); +- if (zonefs_zone_is_seq(zi) && ++ if (zonefs_zone_is_seq(z) && + (ret > 0 || ret == -EIOCBQUEUED)) { + if (ret > 0) + count = ret; +@@ -577,8 +583,8 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from) + * will correct it. Also do active seq file accounting. + */ + mutex_lock(&zi->i_truncate_mutex); +- zi->i_wpoffset += count; +- zonefs_account_active(inode); ++ z->z_wpoffset += count; ++ zonefs_inode_account_active(inode); + mutex_unlock(&zi->i_truncate_mutex); + } + +@@ -629,6 +635,7 @@ static ssize_t zonefs_file_buffered_write(struct kiocb *iocb, + static ssize_t zonefs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) + { + struct inode *inode = file_inode(iocb->ki_filp); ++ struct zonefs_zone *z = zonefs_inode_zone(inode); + + if (unlikely(IS_IMMUTABLE(inode))) + return -EPERM; +@@ -636,8 +643,8 @@ static ssize_t zonefs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) + if (sb_rdonly(inode->i_sb)) + return -EROFS; + +- /* Write operations beyond the zone size are not allowed */ +- if (iocb->ki_pos >= ZONEFS_I(inode)->i_max_size) ++ /* Write operations beyond the zone capacity are not allowed */ ++ if (iocb->ki_pos >= z->z_capacity) + return -EFBIG; + + if (iocb->ki_flags & IOCB_DIRECT) { +@@ -669,6 +676,7 @@ static ssize_t zonefs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) + { + struct inode *inode = file_inode(iocb->ki_filp); + struct zonefs_inode_info *zi = ZONEFS_I(inode); ++ struct zonefs_zone *z = zonefs_inode_zone(inode); + struct super_block *sb = inode->i_sb; + loff_t isize; + ssize_t ret; +@@ -677,7 +685,7 @@ static ssize_t zonefs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) + if (unlikely(IS_IMMUTABLE(inode) && !(inode->i_mode & 0777))) + return -EPERM; + +- if (iocb->ki_pos >= zi->i_max_size) ++ if (iocb->ki_pos >= z->z_capacity) + return 0; + + if (iocb->ki_flags & IOCB_NOWAIT) { +@@ -738,6 +746,7 @@ static inline bool zonefs_seq_file_need_wro(struct inode *inode, + static int zonefs_seq_file_write_open(struct inode *inode) + { + struct zonefs_inode_info *zi = ZONEFS_I(inode); ++ struct zonefs_zone *z = zonefs_inode_zone(inode); + int ret = 0; + + mutex_lock(&zi->i_truncate_mutex); +@@ -755,14 +764,15 @@ static int zonefs_seq_file_write_open(struct inode *inode) + goto unlock; + } + +- if (i_size_read(inode) < zi->i_max_size) { +- ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_OPEN); ++ if (i_size_read(inode) < z->z_capacity) { ++ ret = 
zonefs_inode_zone_mgmt(inode, ++ REQ_OP_ZONE_OPEN); + if (ret) { + atomic_dec(&sbi->s_wro_seq_files); + goto unlock; + } +- zi->i_flags |= ZONEFS_ZONE_OPEN; +- zonefs_account_active(inode); ++ z->z_flags |= ZONEFS_ZONE_OPEN; ++ zonefs_inode_account_active(inode); + } + } + } +@@ -792,6 +802,7 @@ static int zonefs_file_open(struct inode *inode, struct file *file) + static void zonefs_seq_file_write_close(struct inode *inode) + { + struct zonefs_inode_info *zi = ZONEFS_I(inode); ++ struct zonefs_zone *z = zonefs_inode_zone(inode); + struct super_block *sb = inode->i_sb; + struct zonefs_sb_info *sbi = ZONEFS_SB(sb); + int ret = 0; +@@ -807,8 +818,8 @@ static void zonefs_seq_file_write_close(struct inode *inode) + * its maximum size or it was fully written). For this case, we only + * need to decrement the write open count. + */ +- if (zi->i_flags & ZONEFS_ZONE_OPEN) { +- ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_CLOSE); ++ if (z->z_flags & ZONEFS_ZONE_OPEN) { ++ ret = zonefs_inode_zone_mgmt(inode, REQ_OP_ZONE_CLOSE); + if (ret) { + __zonefs_io_error(inode, false); + /* +@@ -817,11 +828,11 @@ static void zonefs_seq_file_write_close(struct inode *inode) + * exhausted). So take preventive action by remounting + * read-only. + */ +- if (zi->i_flags & ZONEFS_ZONE_OPEN && ++ if (z->z_flags & ZONEFS_ZONE_OPEN && + !(sb->s_flags & SB_RDONLY)) { + zonefs_warn(sb, + "closing zone at %llu failed %d\n", +- zi->i_zsector, ret); ++ z->z_sector, ret); + zonefs_warn(sb, + "remounting filesystem read-only\n"); + sb->s_flags |= SB_RDONLY; +@@ -829,8 +840,8 @@ static void zonefs_seq_file_write_close(struct inode *inode) + goto unlock; + } + +- zi->i_flags &= ~ZONEFS_ZONE_OPEN; +- zonefs_account_active(inode); ++ z->z_flags &= ~ZONEFS_ZONE_OPEN; ++ zonefs_inode_account_active(inode); + } + + atomic_dec(&sbi->s_wro_seq_files); +diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c +index a4af29dc32e7d..270ded209dde5 100644 +--- a/fs/zonefs/super.c ++++ b/fs/zonefs/super.c +@@ -28,33 +28,47 @@ + #include "trace.h" + + /* +- * Manage the active zone count. Called with zi->i_truncate_mutex held. ++ * Get the name of a zone group directory. + */ +-void zonefs_account_active(struct inode *inode) ++static const char *zonefs_zgroup_name(enum zonefs_ztype ztype) + { +- struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb); +- struct zonefs_inode_info *zi = ZONEFS_I(inode); ++ switch (ztype) { ++ case ZONEFS_ZTYPE_CNV: ++ return "cnv"; ++ case ZONEFS_ZTYPE_SEQ: ++ return "seq"; ++ default: ++ WARN_ON_ONCE(1); ++ return "???"; ++ } ++} + +- lockdep_assert_held(&zi->i_truncate_mutex); ++/* ++ * Manage the active zone count. ++ */ ++static void zonefs_account_active(struct super_block *sb, ++ struct zonefs_zone *z) ++{ ++ struct zonefs_sb_info *sbi = ZONEFS_SB(sb); + +- if (zonefs_zone_is_cnv(zi)) ++ if (zonefs_zone_is_cnv(z)) + return; + + /* + * For zones that transitioned to the offline or readonly condition, + * we only need to clear the active state. + */ +- if (zi->i_flags & (ZONEFS_ZONE_OFFLINE | ZONEFS_ZONE_READONLY)) ++ if (z->z_flags & (ZONEFS_ZONE_OFFLINE | ZONEFS_ZONE_READONLY)) + goto out; + + /* + * If the zone is active, that is, if it is explicitly open or + * partially written, check if it was already accounted as active. 
+ */ +- if ((zi->i_flags & ZONEFS_ZONE_OPEN) || +- (zi->i_wpoffset > 0 && zi->i_wpoffset < zi->i_max_size)) { +- if (!(zi->i_flags & ZONEFS_ZONE_ACTIVE)) { +- zi->i_flags |= ZONEFS_ZONE_ACTIVE; ++ if ((z->z_flags & ZONEFS_ZONE_OPEN) || ++ (z->z_wpoffset > 0 && z->z_wpoffset < z->z_capacity)) { ++ if (!(z->z_flags & ZONEFS_ZONE_ACTIVE)) { ++ z->z_flags |= ZONEFS_ZONE_ACTIVE; + atomic_inc(&sbi->s_active_seq_files); + } + return; +@@ -62,18 +76,29 @@ void zonefs_account_active(struct inode *inode) + + out: + /* The zone is not active. If it was, update the active count */ +- if (zi->i_flags & ZONEFS_ZONE_ACTIVE) { +- zi->i_flags &= ~ZONEFS_ZONE_ACTIVE; ++ if (z->z_flags & ZONEFS_ZONE_ACTIVE) { ++ z->z_flags &= ~ZONEFS_ZONE_ACTIVE; + atomic_dec(&sbi->s_active_seq_files); + } + } + +-int zonefs_zone_mgmt(struct inode *inode, enum req_op op) ++/* ++ * Manage the active zone count. Called with zi->i_truncate_mutex held. ++ */ ++void zonefs_inode_account_active(struct inode *inode) + { +- struct zonefs_inode_info *zi = ZONEFS_I(inode); +- int ret; ++ lockdep_assert_held(&ZONEFS_I(inode)->i_truncate_mutex); + +- lockdep_assert_held(&zi->i_truncate_mutex); ++ return zonefs_account_active(inode->i_sb, zonefs_inode_zone(inode)); ++} ++ ++/* ++ * Execute a zone management operation. ++ */ ++static int zonefs_zone_mgmt(struct super_block *sb, ++ struct zonefs_zone *z, enum req_op op) ++{ ++ int ret; + + /* + * With ZNS drives, closing an explicitly open zone that has not been +@@ -83,37 +108,45 @@ int zonefs_zone_mgmt(struct inode *inode, enum req_op op) + * are exceeded, make sure that the zone does not remain active by + * resetting it. + */ +- if (op == REQ_OP_ZONE_CLOSE && !zi->i_wpoffset) ++ if (op == REQ_OP_ZONE_CLOSE && !z->z_wpoffset) + op = REQ_OP_ZONE_RESET; + +- trace_zonefs_zone_mgmt(inode, op); +- ret = blkdev_zone_mgmt(inode->i_sb->s_bdev, op, zi->i_zsector, +- zi->i_zone_size >> SECTOR_SHIFT, GFP_NOFS); ++ trace_zonefs_zone_mgmt(sb, z, op); ++ ret = blkdev_zone_mgmt(sb->s_bdev, op, z->z_sector, ++ z->z_size >> SECTOR_SHIFT, GFP_NOFS); + if (ret) { +- zonefs_err(inode->i_sb, ++ zonefs_err(sb, + "Zone management operation %s at %llu failed %d\n", +- blk_op_str(op), zi->i_zsector, ret); ++ blk_op_str(op), z->z_sector, ret); + return ret; + } + + return 0; + } + ++int zonefs_inode_zone_mgmt(struct inode *inode, enum req_op op) ++{ ++ lockdep_assert_held(&ZONEFS_I(inode)->i_truncate_mutex); ++ ++ return zonefs_zone_mgmt(inode->i_sb, zonefs_inode_zone(inode), op); ++} ++ + void zonefs_i_size_write(struct inode *inode, loff_t isize) + { +- struct zonefs_inode_info *zi = ZONEFS_I(inode); ++ struct zonefs_zone *z = zonefs_inode_zone(inode); + + i_size_write(inode, isize); ++ + /* + * A full zone is no longer open/active and does not need + * explicit closing. + */ +- if (isize >= zi->i_max_size) { ++ if (isize >= z->z_capacity) { + struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb); + +- if (zi->i_flags & ZONEFS_ZONE_ACTIVE) ++ if (z->z_flags & ZONEFS_ZONE_ACTIVE) + atomic_dec(&sbi->s_active_seq_files); +- zi->i_flags &= ~(ZONEFS_ZONE_OPEN | ZONEFS_ZONE_ACTIVE); ++ z->z_flags &= ~(ZONEFS_ZONE_OPEN | ZONEFS_ZONE_ACTIVE); + } + } + +@@ -150,20 +183,18 @@ void zonefs_update_stats(struct inode *inode, loff_t new_isize) + } + + /* +- * Check a zone condition and adjust its file inode access permissions for +- * offline and readonly zones. Return the inode size corresponding to the +- * amount of readable data in the zone. ++ * Check a zone condition. 
Return the amount of written (and still readable) ++ * data in the zone. + */ +-static loff_t zonefs_check_zone_condition(struct inode *inode, ++static loff_t zonefs_check_zone_condition(struct super_block *sb, ++ struct zonefs_zone *z, + struct blk_zone *zone) + { +- struct zonefs_inode_info *zi = ZONEFS_I(inode); +- + switch (zone->cond) { + case BLK_ZONE_COND_OFFLINE: +- zonefs_warn(inode->i_sb, "inode %lu: offline zone\n", +- inode->i_ino); +- zi->i_flags |= ZONEFS_ZONE_OFFLINE; ++ zonefs_warn(sb, "Zone %llu: offline zone\n", ++ z->z_sector); ++ z->z_flags |= ZONEFS_ZONE_OFFLINE; + return 0; + case BLK_ZONE_COND_READONLY: + /* +@@ -174,18 +205,18 @@ static loff_t zonefs_check_zone_condition(struct inode *inode, + * the inode size as it was when last updated so that the user + * can recover data. + */ +- zonefs_warn(inode->i_sb, "inode %lu: read-only zone\n", +- inode->i_ino); +- zi->i_flags |= ZONEFS_ZONE_READONLY; +- if (zonefs_zone_is_cnv(zi)) +- return zi->i_max_size; +- return zi->i_wpoffset; ++ zonefs_warn(sb, "Zone %llu: read-only zone\n", ++ z->z_sector); ++ z->z_flags |= ZONEFS_ZONE_READONLY; ++ if (zonefs_zone_is_cnv(z)) ++ return z->z_capacity; ++ return z->z_wpoffset; + case BLK_ZONE_COND_FULL: + /* The write pointer of full zones is invalid. */ +- return zi->i_max_size; ++ return z->z_capacity; + default: +- if (zonefs_zone_is_cnv(zi)) +- return zi->i_max_size; ++ if (zonefs_zone_is_cnv(z)) ++ return z->z_capacity; + return (zone->wp - zone->start) << SECTOR_SHIFT; + } + } +@@ -196,22 +227,22 @@ static loff_t zonefs_check_zone_condition(struct inode *inode, + */ + static void zonefs_inode_update_mode(struct inode *inode) + { +- struct zonefs_inode_info *zi = ZONEFS_I(inode); ++ struct zonefs_zone *z = zonefs_inode_zone(inode); + +- if (zi->i_flags & ZONEFS_ZONE_OFFLINE) { ++ if (z->z_flags & ZONEFS_ZONE_OFFLINE) { + /* Offline zones cannot be read nor written */ + inode->i_flags |= S_IMMUTABLE; + inode->i_mode &= ~0777; +- } else if (zi->i_flags & ZONEFS_ZONE_READONLY) { ++ } else if (z->z_flags & ZONEFS_ZONE_READONLY) { + /* Readonly zones cannot be written */ + inode->i_flags |= S_IMMUTABLE; +- if (zi->i_flags & ZONEFS_ZONE_INIT_MODE) ++ if (z->z_flags & ZONEFS_ZONE_INIT_MODE) + inode->i_mode &= ~0777; + else + inode->i_mode &= ~0222; + } + +- zi->i_flags &= ~ZONEFS_ZONE_INIT_MODE; ++ z->z_flags &= ~ZONEFS_ZONE_INIT_MODE; + } + + struct zonefs_ioerr_data { +@@ -224,7 +255,7 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx, + { + struct zonefs_ioerr_data *err = data; + struct inode *inode = err->inode; +- struct zonefs_inode_info *zi = ZONEFS_I(inode); ++ struct zonefs_zone *z = zonefs_inode_zone(inode); + struct super_block *sb = inode->i_sb; + struct zonefs_sb_info *sbi = ZONEFS_SB(sb); + loff_t isize, data_size; +@@ -235,9 +266,9 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx, + * as there is no inconsistency between the inode size and the amount of + * data writen in the zone (data_size). 
+ */ +- data_size = zonefs_check_zone_condition(inode, zone); ++ data_size = zonefs_check_zone_condition(sb, z, zone); + isize = i_size_read(inode); +- if (!(zi->i_flags & (ZONEFS_ZONE_READONLY | ZONEFS_ZONE_OFFLINE)) && ++ if (!(z->z_flags & (ZONEFS_ZONE_READONLY | ZONEFS_ZONE_OFFLINE)) && + !err->write && isize == data_size) + return 0; + +@@ -260,8 +291,9 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx, + * In all cases, warn about inode size inconsistency and handle the + * IO error according to the zone condition and to the mount options. + */ +- if (zonefs_zone_is_seq(zi) && isize != data_size) +- zonefs_warn(sb, "inode %lu: invalid size %lld (should be %lld)\n", ++ if (zonefs_zone_is_seq(z) && isize != data_size) ++ zonefs_warn(sb, ++ "inode %lu: invalid size %lld (should be %lld)\n", + inode->i_ino, isize, data_size); + + /* +@@ -270,20 +302,20 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx, + * zone condition to read-only and offline respectively, as if the + * condition was signaled by the hardware. + */ +- if ((zi->i_flags & ZONEFS_ZONE_OFFLINE) || ++ if ((z->z_flags & ZONEFS_ZONE_OFFLINE) || + (sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_ZOL)) { + zonefs_warn(sb, "inode %lu: read/write access disabled\n", + inode->i_ino); +- if (!(zi->i_flags & ZONEFS_ZONE_OFFLINE)) +- zi->i_flags |= ZONEFS_ZONE_OFFLINE; ++ if (!(z->z_flags & ZONEFS_ZONE_OFFLINE)) ++ z->z_flags |= ZONEFS_ZONE_OFFLINE; + zonefs_inode_update_mode(inode); + data_size = 0; +- } else if ((zi->i_flags & ZONEFS_ZONE_READONLY) || ++ } else if ((z->z_flags & ZONEFS_ZONE_READONLY) || + (sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_ZRO)) { + zonefs_warn(sb, "inode %lu: write access disabled\n", + inode->i_ino); +- if (!(zi->i_flags & ZONEFS_ZONE_READONLY)) +- zi->i_flags |= ZONEFS_ZONE_READONLY; ++ if (!(z->z_flags & ZONEFS_ZONE_READONLY)) ++ z->z_flags |= ZONEFS_ZONE_READONLY; + zonefs_inode_update_mode(inode); + data_size = isize; + } else if (sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_RO && +@@ -299,8 +331,8 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx, + * close of the zone when the inode file is closed. + */ + if ((sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN) && +- (zi->i_flags & (ZONEFS_ZONE_READONLY | ZONEFS_ZONE_OFFLINE))) +- zi->i_flags &= ~ZONEFS_ZONE_OPEN; ++ (z->z_flags & (ZONEFS_ZONE_READONLY | ZONEFS_ZONE_OFFLINE))) ++ z->z_flags &= ~ZONEFS_ZONE_OPEN; + + /* + * If error=remount-ro was specified, any error result in remounting +@@ -317,8 +349,8 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx, + */ + zonefs_update_stats(inode, data_size); + zonefs_i_size_write(inode, data_size); +- zi->i_wpoffset = data_size; +- zonefs_account_active(inode); ++ z->z_wpoffset = data_size; ++ zonefs_inode_account_active(inode); + + return 0; + } +@@ -332,7 +364,7 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx, + */ + void __zonefs_io_error(struct inode *inode, bool write) + { +- struct zonefs_inode_info *zi = ZONEFS_I(inode); ++ struct zonefs_zone *z = zonefs_inode_zone(inode); + struct super_block *sb = inode->i_sb; + struct zonefs_sb_info *sbi = ZONEFS_SB(sb); + unsigned int noio_flag; +@@ -348,8 +380,8 @@ void __zonefs_io_error(struct inode *inode, bool write) + * files with aggregated conventional zones, for which the inode zone + * size is always larger than the device zone size. 
+ */ +- if (zi->i_zone_size > bdev_zone_sectors(sb->s_bdev)) +- nr_zones = zi->i_zone_size >> ++ if (z->z_size > bdev_zone_sectors(sb->s_bdev)) ++ nr_zones = z->z_size >> + (sbi->s_zone_sectors_shift + SECTOR_SHIFT); + + /* +@@ -361,7 +393,7 @@ void __zonefs_io_error(struct inode *inode, bool write) + * the GFP_NOIO context avoids both problems. + */ + noio_flag = memalloc_noio_save(); +- ret = blkdev_report_zones(sb->s_bdev, zi->i_zsector, nr_zones, ++ ret = blkdev_report_zones(sb->s_bdev, z->z_sector, nr_zones, + zonefs_io_error_cb, &err); + if (ret != nr_zones) + zonefs_err(sb, "Get inode %lu zone information failed %d\n", +@@ -381,9 +413,7 @@ static struct inode *zonefs_alloc_inode(struct super_block *sb) + + inode_init_once(&zi->i_vnode); + mutex_init(&zi->i_truncate_mutex); +- zi->i_wpoffset = 0; + zi->i_wr_refcnt = 0; +- zi->i_flags = 0; + + return &zi->i_vnode; + } +@@ -416,8 +446,8 @@ static int zonefs_statfs(struct dentry *dentry, struct kstatfs *buf) + buf->f_bavail = buf->f_bfree; + + for (t = 0; t < ZONEFS_ZTYPE_MAX; t++) { +- if (sbi->s_nr_files[t]) +- buf->f_files += sbi->s_nr_files[t] + 1; ++ if (sbi->s_zgroup[t].g_nr_zones) ++ buf->f_files += sbi->s_zgroup[t].g_nr_zones + 1; + } + buf->f_ffree = 0; + +@@ -557,11 +587,11 @@ static const struct inode_operations zonefs_dir_inode_operations = { + }; + + static void zonefs_init_dir_inode(struct inode *parent, struct inode *inode, +- enum zonefs_ztype type) ++ enum zonefs_ztype ztype) + { + struct super_block *sb = parent->i_sb; + +- inode->i_ino = bdev_nr_zones(sb->s_bdev) + type + 1; ++ inode->i_ino = bdev_nr_zones(sb->s_bdev) + ztype + 1; + inode_init_owner(&init_user_ns, inode, parent, S_IFDIR | 0555); + inode->i_op = &zonefs_dir_inode_operations; + inode->i_fop = &simple_dir_operations; +@@ -573,79 +603,34 @@ static const struct inode_operations zonefs_file_inode_operations = { + .setattr = zonefs_inode_setattr, + }; + +-static int zonefs_init_file_inode(struct inode *inode, struct blk_zone *zone, +- enum zonefs_ztype type) ++static void zonefs_init_file_inode(struct inode *inode, ++ struct zonefs_zone *z) + { + struct super_block *sb = inode->i_sb; + struct zonefs_sb_info *sbi = ZONEFS_SB(sb); +- struct zonefs_inode_info *zi = ZONEFS_I(inode); +- int ret = 0; +- +- inode->i_ino = zone->start >> sbi->s_zone_sectors_shift; +- inode->i_mode = S_IFREG | sbi->s_perm; + +- if (type == ZONEFS_ZTYPE_CNV) +- zi->i_flags |= ZONEFS_ZONE_CNV; +- +- zi->i_zsector = zone->start; +- zi->i_zone_size = zone->len << SECTOR_SHIFT; +- if (zi->i_zone_size > bdev_zone_sectors(sb->s_bdev) << SECTOR_SHIFT && +- !(sbi->s_features & ZONEFS_F_AGGRCNV)) { +- zonefs_err(sb, +- "zone size %llu doesn't match device's zone sectors %llu\n", +- zi->i_zone_size, +- bdev_zone_sectors(sb->s_bdev) << SECTOR_SHIFT); +- return -EINVAL; +- } +- +- zi->i_max_size = min_t(loff_t, MAX_LFS_FILESIZE, +- zone->capacity << SECTOR_SHIFT); +- zi->i_wpoffset = zonefs_check_zone_condition(inode, zone); ++ inode->i_private = z; + ++ inode->i_ino = z->z_sector >> sbi->s_zone_sectors_shift; ++ inode->i_mode = S_IFREG | sbi->s_perm; + inode->i_uid = sbi->s_uid; + inode->i_gid = sbi->s_gid; +- inode->i_size = zi->i_wpoffset; +- inode->i_blocks = zi->i_max_size >> SECTOR_SHIFT; ++ inode->i_size = z->z_wpoffset; ++ inode->i_blocks = z->z_capacity >> SECTOR_SHIFT; + + inode->i_op = &zonefs_file_inode_operations; + inode->i_fop = &zonefs_file_operations; + inode->i_mapping->a_ops = &zonefs_file_aops; + + /* Update the inode access rights depending on the zone condition */ +- 
zi->i_flags |= ZONEFS_ZONE_INIT_MODE; ++ z->z_flags |= ZONEFS_ZONE_INIT_MODE; + zonefs_inode_update_mode(inode); +- +- sb->s_maxbytes = max(zi->i_max_size, sb->s_maxbytes); +- sbi->s_blocks += zi->i_max_size >> sb->s_blocksize_bits; +- sbi->s_used_blocks += zi->i_wpoffset >> sb->s_blocksize_bits; +- +- mutex_lock(&zi->i_truncate_mutex); +- +- /* +- * For sequential zones, make sure that any open zone is closed first +- * to ensure that the initial number of open zones is 0, in sync with +- * the open zone accounting done when the mount option +- * ZONEFS_MNTOPT_EXPLICIT_OPEN is used. +- */ +- if (type == ZONEFS_ZTYPE_SEQ && +- (zone->cond == BLK_ZONE_COND_IMP_OPEN || +- zone->cond == BLK_ZONE_COND_EXP_OPEN)) { +- ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_CLOSE); +- if (ret) +- goto unlock; +- } +- +- zonefs_account_active(inode); +- +-unlock: +- mutex_unlock(&zi->i_truncate_mutex); +- +- return ret; + } + + static struct dentry *zonefs_create_inode(struct dentry *parent, +- const char *name, struct blk_zone *zone, +- enum zonefs_ztype type) ++ const char *name, ++ struct zonefs_zone *z, ++ enum zonefs_ztype ztype) + { + struct inode *dir = d_inode(parent); + struct dentry *dentry; +@@ -661,15 +646,10 @@ static struct dentry *zonefs_create_inode(struct dentry *parent, + goto dput; + + inode->i_ctime = inode->i_mtime = inode->i_atime = dir->i_ctime; +- if (zone) { +- ret = zonefs_init_file_inode(inode, zone, type); +- if (ret) { +- iput(inode); +- goto dput; +- } +- } else { +- zonefs_init_dir_inode(dir, inode, type); +- } ++ if (z) ++ zonefs_init_file_inode(inode, z); ++ else ++ zonefs_init_dir_inode(dir, inode, ztype); + + d_add(dentry, inode); + dir->i_size++; +@@ -685,100 +665,51 @@ static struct dentry *zonefs_create_inode(struct dentry *parent, + struct zonefs_zone_data { + struct super_block *sb; + unsigned int nr_zones[ZONEFS_ZTYPE_MAX]; ++ sector_t cnv_zone_start; + struct blk_zone *zones; + }; + + /* +- * Create a zone group and populate it with zone files. ++ * Create the inodes for a zone group. + */ +-static int zonefs_create_zgroup(struct zonefs_zone_data *zd, +- enum zonefs_ztype type) ++static int zonefs_create_zgroup_inodes(struct super_block *sb, ++ enum zonefs_ztype ztype) + { +- struct super_block *sb = zd->sb; + struct zonefs_sb_info *sbi = ZONEFS_SB(sb); +- struct blk_zone *zone, *next, *end; +- const char *zgroup_name; +- char *file_name; ++ struct zonefs_zone_group *zgroup = &sbi->s_zgroup[ztype]; + struct dentry *dir, *dent; +- unsigned int n = 0; +- int ret; ++ char *file_name; ++ int i, ret = 0; ++ ++ if (!zgroup) ++ return -ENOMEM; + + /* If the group is empty, there is nothing to do */ +- if (!zd->nr_zones[type]) ++ if (!zgroup->g_nr_zones) + return 0; + + file_name = kmalloc(ZONEFS_NAME_MAX, GFP_KERNEL); + if (!file_name) + return -ENOMEM; + +- if (type == ZONEFS_ZTYPE_CNV) +- zgroup_name = "cnv"; +- else +- zgroup_name = "seq"; +- +- dir = zonefs_create_inode(sb->s_root, zgroup_name, NULL, type); ++ dir = zonefs_create_inode(sb->s_root, zonefs_zgroup_name(ztype), ++ NULL, ztype); + if (IS_ERR(dir)) { + ret = PTR_ERR(dir); + goto free; + } + +- /* +- * The first zone contains the super block: skip it. +- */ +- end = zd->zones + bdev_nr_zones(sb->s_bdev); +- for (zone = &zd->zones[1]; zone < end; zone = next) { +- +- next = zone + 1; +- if (zonefs_zone_type(zone) != type) +- continue; +- +- /* +- * For conventional zones, contiguous zones can be aggregated +- * together to form larger files. 
Note that this overwrites the +- * length of the first zone of the set of contiguous zones +- * aggregated together. If one offline or read-only zone is +- * found, assume that all zones aggregated have the same +- * condition. +- */ +- if (type == ZONEFS_ZTYPE_CNV && +- (sbi->s_features & ZONEFS_F_AGGRCNV)) { +- for (; next < end; next++) { +- if (zonefs_zone_type(next) != type) +- break; +- zone->len += next->len; +- zone->capacity += next->capacity; +- if (next->cond == BLK_ZONE_COND_READONLY && +- zone->cond != BLK_ZONE_COND_OFFLINE) +- zone->cond = BLK_ZONE_COND_READONLY; +- else if (next->cond == BLK_ZONE_COND_OFFLINE) +- zone->cond = BLK_ZONE_COND_OFFLINE; +- } +- if (zone->capacity != zone->len) { +- zonefs_err(sb, "Invalid conventional zone capacity\n"); +- ret = -EINVAL; +- goto free; +- } +- } +- +- /* +- * Use the file number within its group as file name. +- */ +- snprintf(file_name, ZONEFS_NAME_MAX - 1, "%u", n); +- dent = zonefs_create_inode(dir, file_name, zone, type); ++ for (i = 0; i < zgroup->g_nr_zones; i++) { ++ /* Use the zone number within its group as the file name */ ++ snprintf(file_name, ZONEFS_NAME_MAX - 1, "%u", i); ++ dent = zonefs_create_inode(dir, file_name, ++ &zgroup->g_zones[i], ztype); + if (IS_ERR(dent)) { + ret = PTR_ERR(dent); +- goto free; ++ break; + } +- +- n++; + } + +- zonefs_info(sb, "Zone group \"%s\" has %u file%s\n", +- zgroup_name, n, n > 1 ? "s" : ""); +- +- sbi->s_nr_files[type] = n; +- ret = 0; +- + free: + kfree(file_name); + +@@ -789,21 +720,38 @@ static int zonefs_get_zone_info_cb(struct blk_zone *zone, unsigned int idx, + void *data) + { + struct zonefs_zone_data *zd = data; ++ struct super_block *sb = zd->sb; ++ struct zonefs_sb_info *sbi = ZONEFS_SB(sb); ++ ++ /* ++ * We do not care about the first zone: it contains the super block ++ * and not exposed as a file. ++ */ ++ if (!idx) ++ return 0; + + /* +- * Count the number of usable zones: the first zone at index 0 contains +- * the super block and is ignored. ++ * Count the number of zones that will be exposed as files. ++ * For sequential zones, we always have as many files as zones. ++ * FOr conventional zones, the number of files depends on if we have ++ * conventional zones aggregation enabled. + */ + switch (zone->type) { + case BLK_ZONE_TYPE_CONVENTIONAL: +- zone->wp = zone->start + zone->len; +- if (idx) +- zd->nr_zones[ZONEFS_ZTYPE_CNV]++; ++ if (sbi->s_features & ZONEFS_F_AGGRCNV) { ++ /* One file per set of contiguous conventional zones */ ++ if (!(sbi->s_zgroup[ZONEFS_ZTYPE_CNV].g_nr_zones) || ++ zone->start != zd->cnv_zone_start) ++ sbi->s_zgroup[ZONEFS_ZTYPE_CNV].g_nr_zones++; ++ zd->cnv_zone_start = zone->start + zone->len; ++ } else { ++ /* One file per zone */ ++ sbi->s_zgroup[ZONEFS_ZTYPE_CNV].g_nr_zones++; ++ } + break; + case BLK_ZONE_TYPE_SEQWRITE_REQ: + case BLK_ZONE_TYPE_SEQWRITE_PREF: +- if (idx) +- zd->nr_zones[ZONEFS_ZTYPE_SEQ]++; ++ sbi->s_zgroup[ZONEFS_ZTYPE_SEQ].g_nr_zones++; + break; + default: + zonefs_err(zd->sb, "Unsupported zone type 0x%x\n", +@@ -843,11 +791,173 @@ static int zonefs_get_zone_info(struct zonefs_zone_data *zd) + return 0; + } + +-static inline void zonefs_cleanup_zone_info(struct zonefs_zone_data *zd) ++static inline void zonefs_free_zone_info(struct zonefs_zone_data *zd) + { + kvfree(zd->zones); + } + ++/* ++ * Create a zone group and populate it with zone files. 
++ */ ++static int zonefs_init_zgroup(struct super_block *sb, ++ struct zonefs_zone_data *zd, ++ enum zonefs_ztype ztype) ++{ ++ struct zonefs_sb_info *sbi = ZONEFS_SB(sb); ++ struct zonefs_zone_group *zgroup = &sbi->s_zgroup[ztype]; ++ struct blk_zone *zone, *next, *end; ++ struct zonefs_zone *z; ++ unsigned int n = 0; ++ int ret; ++ ++ /* Allocate the zone group. If it is empty, we have nothing to do. */ ++ if (!zgroup->g_nr_zones) ++ return 0; ++ ++ zgroup->g_zones = kvcalloc(zgroup->g_nr_zones, ++ sizeof(struct zonefs_zone), GFP_KERNEL); ++ if (!zgroup->g_zones) ++ return -ENOMEM; ++ ++ /* ++ * Initialize the zone groups using the device zone information. ++ * We always skip the first zone as it contains the super block ++ * and is not use to back a file. ++ */ ++ end = zd->zones + bdev_nr_zones(sb->s_bdev); ++ for (zone = &zd->zones[1]; zone < end; zone = next) { ++ ++ next = zone + 1; ++ if (zonefs_zone_type(zone) != ztype) ++ continue; ++ ++ if (WARN_ON_ONCE(n >= zgroup->g_nr_zones)) ++ return -EINVAL; ++ ++ /* ++ * For conventional zones, contiguous zones can be aggregated ++ * together to form larger files. Note that this overwrites the ++ * length of the first zone of the set of contiguous zones ++ * aggregated together. If one offline or read-only zone is ++ * found, assume that all zones aggregated have the same ++ * condition. ++ */ ++ if (ztype == ZONEFS_ZTYPE_CNV && ++ (sbi->s_features & ZONEFS_F_AGGRCNV)) { ++ for (; next < end; next++) { ++ if (zonefs_zone_type(next) != ztype) ++ break; ++ zone->len += next->len; ++ zone->capacity += next->capacity; ++ if (next->cond == BLK_ZONE_COND_READONLY && ++ zone->cond != BLK_ZONE_COND_OFFLINE) ++ zone->cond = BLK_ZONE_COND_READONLY; ++ else if (next->cond == BLK_ZONE_COND_OFFLINE) ++ zone->cond = BLK_ZONE_COND_OFFLINE; ++ } ++ } ++ ++ z = &zgroup->g_zones[n]; ++ if (ztype == ZONEFS_ZTYPE_CNV) ++ z->z_flags |= ZONEFS_ZONE_CNV; ++ z->z_sector = zone->start; ++ z->z_size = zone->len << SECTOR_SHIFT; ++ if (z->z_size > bdev_zone_sectors(sb->s_bdev) << SECTOR_SHIFT && ++ !(sbi->s_features & ZONEFS_F_AGGRCNV)) { ++ zonefs_err(sb, ++ "Invalid zone size %llu (device zone sectors %llu)\n", ++ z->z_size, ++ bdev_zone_sectors(sb->s_bdev) << SECTOR_SHIFT); ++ return -EINVAL; ++ } ++ ++ z->z_capacity = min_t(loff_t, MAX_LFS_FILESIZE, ++ zone->capacity << SECTOR_SHIFT); ++ z->z_wpoffset = zonefs_check_zone_condition(sb, z, zone); ++ ++ sb->s_maxbytes = max(z->z_capacity, sb->s_maxbytes); ++ sbi->s_blocks += z->z_capacity >> sb->s_blocksize_bits; ++ sbi->s_used_blocks += z->z_wpoffset >> sb->s_blocksize_bits; ++ ++ /* ++ * For sequential zones, make sure that any open zone is closed ++ * first to ensure that the initial number of open zones is 0, ++ * in sync with the open zone accounting done when the mount ++ * option ZONEFS_MNTOPT_EXPLICIT_OPEN is used. ++ */ ++ if (ztype == ZONEFS_ZTYPE_SEQ && ++ (zone->cond == BLK_ZONE_COND_IMP_OPEN || ++ zone->cond == BLK_ZONE_COND_EXP_OPEN)) { ++ ret = zonefs_zone_mgmt(sb, z, REQ_OP_ZONE_CLOSE); ++ if (ret) ++ return ret; ++ } ++ ++ zonefs_account_active(sb, z); ++ ++ n++; ++ } ++ ++ if (WARN_ON_ONCE(n != zgroup->g_nr_zones)) ++ return -EINVAL; ++ ++ zonefs_info(sb, "Zone group \"%s\" has %u file%s\n", ++ zonefs_zgroup_name(ztype), ++ zgroup->g_nr_zones, ++ zgroup->g_nr_zones > 1 ? 
"s" : ""); ++ ++ return 0; ++} ++ ++static void zonefs_free_zgroups(struct super_block *sb) ++{ ++ struct zonefs_sb_info *sbi = ZONEFS_SB(sb); ++ enum zonefs_ztype ztype; ++ ++ if (!sbi) ++ return; ++ ++ for (ztype = 0; ztype < ZONEFS_ZTYPE_MAX; ztype++) { ++ kvfree(sbi->s_zgroup[ztype].g_zones); ++ sbi->s_zgroup[ztype].g_zones = NULL; ++ } ++} ++ ++/* ++ * Create a zone group and populate it with zone files. ++ */ ++static int zonefs_init_zgroups(struct super_block *sb) ++{ ++ struct zonefs_zone_data zd; ++ enum zonefs_ztype ztype; ++ int ret; ++ ++ /* First get the device zone information */ ++ memset(&zd, 0, sizeof(struct zonefs_zone_data)); ++ zd.sb = sb; ++ ret = zonefs_get_zone_info(&zd); ++ if (ret) ++ goto cleanup; ++ ++ /* Allocate and initialize the zone groups */ ++ for (ztype = 0; ztype < ZONEFS_ZTYPE_MAX; ztype++) { ++ ret = zonefs_init_zgroup(sb, &zd, ztype); ++ if (ret) { ++ zonefs_info(sb, ++ "Zone group \"%s\" initialization failed\n", ++ zonefs_zgroup_name(ztype)); ++ break; ++ } ++ } ++ ++cleanup: ++ zonefs_free_zone_info(&zd); ++ if (ret) ++ zonefs_free_zgroups(sb); ++ ++ return ret; ++} ++ + /* + * Read super block information from the device. + */ +@@ -945,7 +1055,6 @@ static const struct super_operations zonefs_sops = { + */ + static int zonefs_fill_super(struct super_block *sb, void *data, int silent) + { +- struct zonefs_zone_data zd; + struct zonefs_sb_info *sbi; + struct inode *inode; + enum zonefs_ztype t; +@@ -998,16 +1107,6 @@ static int zonefs_fill_super(struct super_block *sb, void *data, int silent) + if (ret) + return ret; + +- memset(&zd, 0, sizeof(struct zonefs_zone_data)); +- zd.sb = sb; +- ret = zonefs_get_zone_info(&zd); +- if (ret) +- goto cleanup; +- +- ret = zonefs_sysfs_register(sb); +- if (ret) +- goto cleanup; +- + zonefs_info(sb, "Mounting %u zones", bdev_nr_zones(sb->s_bdev)); + + if (!sbi->s_max_wro_seq_files && +@@ -1018,6 +1117,11 @@ static int zonefs_fill_super(struct super_block *sb, void *data, int silent) + sbi->s_mount_opts &= ~ZONEFS_MNTOPT_EXPLICIT_OPEN; + } + ++ /* Initialize the zone groups */ ++ ret = zonefs_init_zgroups(sb); ++ if (ret) ++ goto cleanup; ++ + /* Create root directory inode */ + ret = -ENOMEM; + inode = new_inode(sb); +@@ -1037,13 +1141,19 @@ static int zonefs_fill_super(struct super_block *sb, void *data, int silent) + + /* Create and populate files in zone groups directories */ + for (t = 0; t < ZONEFS_ZTYPE_MAX; t++) { +- ret = zonefs_create_zgroup(&zd, t); ++ ret = zonefs_create_zgroup_inodes(sb, t); + if (ret) +- break; ++ goto cleanup; + } + ++ ret = zonefs_sysfs_register(sb); ++ if (ret) ++ goto cleanup; ++ ++ return 0; ++ + cleanup: +- zonefs_cleanup_zone_info(&zd); ++ zonefs_free_zgroups(sb); + + return ret; + } +@@ -1062,6 +1172,7 @@ static void zonefs_kill_super(struct super_block *sb) + d_genocide(sb->s_root); + + zonefs_sysfs_unregister(sb); ++ zonefs_free_zgroups(sb); + kill_block_super(sb); + kfree(sbi); + } +diff --git a/fs/zonefs/trace.h b/fs/zonefs/trace.h +index 42edcfd393ed2..9969db3a9c7dc 100644 +--- a/fs/zonefs/trace.h ++++ b/fs/zonefs/trace.h +@@ -20,8 +20,9 @@ + #define show_dev(dev) MAJOR(dev), MINOR(dev) + + TRACE_EVENT(zonefs_zone_mgmt, +- TP_PROTO(struct inode *inode, enum req_op op), +- TP_ARGS(inode, op), ++ TP_PROTO(struct super_block *sb, struct zonefs_zone *z, ++ enum req_op op), ++ TP_ARGS(sb, z, op), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(ino_t, ino) +@@ -30,12 +31,12 @@ TRACE_EVENT(zonefs_zone_mgmt, + __field(sector_t, nr_sectors) + ), + TP_fast_assign( +- 
__entry->dev = inode->i_sb->s_dev; +- __entry->ino = inode->i_ino; ++ __entry->dev = sb->s_dev; ++ __entry->ino = ++ z->z_sector >> ZONEFS_SB(sb)->s_zone_sectors_shift; + __entry->op = op; +- __entry->sector = ZONEFS_I(inode)->i_zsector; +- __entry->nr_sectors = +- ZONEFS_I(inode)->i_zone_size >> SECTOR_SHIFT; ++ __entry->sector = z->z_sector; ++ __entry->nr_sectors = z->z_size >> SECTOR_SHIFT; + ), + TP_printk("bdev=(%d,%d), ino=%lu op=%s, sector=%llu, nr_sectors=%llu", + show_dev(__entry->dev), (unsigned long)__entry->ino, +@@ -58,9 +59,10 @@ TRACE_EVENT(zonefs_file_dio_append, + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; +- __entry->sector = ZONEFS_I(inode)->i_zsector; ++ __entry->sector = zonefs_inode_zone(inode)->z_sector; + __entry->size = size; +- __entry->wpoffset = ZONEFS_I(inode)->i_wpoffset; ++ __entry->wpoffset = ++ zonefs_inode_zone(inode)->z_wpoffset; + __entry->ret = ret; + ), + TP_printk("bdev=(%d, %d), ino=%lu, sector=%llu, size=%zu, wpoffset=%llu, ret=%zu", +diff --git a/fs/zonefs/zonefs.h b/fs/zonefs/zonefs.h +index 1a225f74015a0..2d626e18b1411 100644 +--- a/fs/zonefs/zonefs.h ++++ b/fs/zonefs/zonefs.h +@@ -47,22 +47,39 @@ static inline enum zonefs_ztype zonefs_zone_type(struct blk_zone *zone) + #define ZONEFS_ZONE_CNV (1U << 31) + + /* +- * In-memory inode data. ++ * In-memory per-file inode zone data. + */ +-struct zonefs_inode_info { +- struct inode i_vnode; ++struct zonefs_zone { ++ /* Zone state flags */ ++ unsigned int z_flags; + +- /* File zone start sector (512B unit) */ +- sector_t i_zsector; ++ /* Zone start sector (512B unit) */ ++ sector_t z_sector; + +- /* File zone write pointer position (sequential zones only) */ +- loff_t i_wpoffset; ++ /* Zone size (bytes) */ ++ loff_t z_size; + +- /* File maximum size */ +- loff_t i_max_size; ++ /* Zone capacity (file maximum size, bytes) */ ++ loff_t z_capacity; + +- /* File zone size */ +- loff_t i_zone_size; ++ /* Write pointer offset in the zone (sequential zones only, bytes) */ ++ loff_t z_wpoffset; ++}; ++ ++/* ++ * In memory zone group information: all zones of a group are exposed ++ * as files, one file per zone. ++ */ ++struct zonefs_zone_group { ++ unsigned int g_nr_zones; ++ struct zonefs_zone *g_zones; ++}; ++ ++/* ++ * In-memory inode data. 
++ */ ++struct zonefs_inode_info { ++ struct inode i_vnode; + + /* + * To serialise fully against both syscall and mmap based IO and +@@ -81,7 +98,6 @@ struct zonefs_inode_info { + + /* guarded by i_truncate_mutex */ + unsigned int i_wr_refcnt; +- unsigned int i_flags; + }; + + static inline struct zonefs_inode_info *ZONEFS_I(struct inode *inode) +@@ -89,24 +105,29 @@ static inline struct zonefs_inode_info *ZONEFS_I(struct inode *inode) + return container_of(inode, struct zonefs_inode_info, i_vnode); + } + +-static inline bool zonefs_zone_is_cnv(struct zonefs_inode_info *zi) ++static inline bool zonefs_zone_is_cnv(struct zonefs_zone *z) ++{ ++ return z->z_flags & ZONEFS_ZONE_CNV; ++} ++ ++static inline bool zonefs_zone_is_seq(struct zonefs_zone *z) + { +- return zi->i_flags & ZONEFS_ZONE_CNV; ++ return !zonefs_zone_is_cnv(z); + } + +-static inline bool zonefs_zone_is_seq(struct zonefs_inode_info *zi) ++static inline struct zonefs_zone *zonefs_inode_zone(struct inode *inode) + { +- return !zonefs_zone_is_cnv(zi); ++ return inode->i_private; + } + + static inline bool zonefs_inode_is_cnv(struct inode *inode) + { +- return zonefs_zone_is_cnv(ZONEFS_I(inode)); ++ return zonefs_zone_is_cnv(zonefs_inode_zone(inode)); + } + + static inline bool zonefs_inode_is_seq(struct inode *inode) + { +- return zonefs_zone_is_seq(ZONEFS_I(inode)); ++ return zonefs_zone_is_seq(zonefs_inode_zone(inode)); + } + + /* +@@ -200,7 +221,7 @@ struct zonefs_sb_info { + uuid_t s_uuid; + unsigned int s_zone_sectors_shift; + +- unsigned int s_nr_files[ZONEFS_ZTYPE_MAX]; ++ struct zonefs_zone_group s_zgroup[ZONEFS_ZTYPE_MAX]; + + loff_t s_blocks; + loff_t s_used_blocks; +@@ -229,8 +250,8 @@ static inline struct zonefs_sb_info *ZONEFS_SB(struct super_block *sb) + pr_warn("zonefs (%s) WARNING: " format, sb->s_id, ## args) + + /* In super.c */ +-void zonefs_account_active(struct inode *inode); +-int zonefs_zone_mgmt(struct inode *inode, enum req_op op); ++void zonefs_inode_account_active(struct inode *inode); ++int zonefs_inode_zone_mgmt(struct inode *inode, enum req_op op); + void zonefs_i_size_write(struct inode *inode, loff_t isize); + void zonefs_update_stats(struct inode *inode, loff_t new_isize); + void __zonefs_io_error(struct inode *inode, bool write); +-- +2.39.2 + diff --git a/queue-6.1/zonefs-simplify-io-error-handling.patch b/queue-6.1/zonefs-simplify-io-error-handling.patch new file mode 100644 index 00000000000..0bc06889a2a --- /dev/null +++ b/queue-6.1/zonefs-simplify-io-error-handling.patch @@ -0,0 +1,244 @@ +From 236111cac1592239f3295e8e5de2e95dd808a786 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 25 Nov 2022 11:06:20 +0900 +Subject: zonefs: Simplify IO error handling + +From: Damien Le Moal + +[ Upstream commit 46a9c526eef7fb68a00321e2a9591ce5276ae92b ] + +Simplify zonefs_check_zone_condition() by moving the code that changes +an inode access rights to the new function zonefs_inode_update_mode(). +Furthermore, since on mount an inode wpoffset is always zero when +zonefs_check_zone_condition() is called during an inode initialization, +the "mount" boolean argument is not necessary for the readonly zone +case. This argument is thus removed. + +zonefs_io_error_cb() is also modified to use the inode offline and +zone state flags instead of checking the device zone condition. The +multiple calls to zonefs_check_zone_condition() are reduced to the first +call on entry, which allows removing the "warn" argument. 
+zonefs_inode_update_mode() is also used to update an inode access rights +as zonefs_io_error_cb() modifies the inode flags depending on the volume +error handling mode (defined with a mount option). Since an inode mode +change differs for read-only zones between mount time and IO error time, +the flag ZONEFS_ZONE_INIT_MODE is used to differentiate both cases. + +Signed-off-by: Damien Le Moal +Reviewed-by: Johannes Thumshirn +Stable-dep-of: 88b170088ad2 ("zonefs: Fix error message in zonefs_file_dio_append()") +Signed-off-by: Sasha Levin +--- + fs/zonefs/super.c | 110 ++++++++++++++++++++++++--------------------- + fs/zonefs/zonefs.h | 9 ++-- + 2 files changed, 64 insertions(+), 55 deletions(-) + +diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c +index e808276b88018..6307cc95be061 100644 +--- a/fs/zonefs/super.c ++++ b/fs/zonefs/super.c +@@ -155,48 +155,31 @@ void zonefs_update_stats(struct inode *inode, loff_t new_isize) + * amount of readable data in the zone. + */ + static loff_t zonefs_check_zone_condition(struct inode *inode, +- struct blk_zone *zone, bool warn, +- bool mount) ++ struct blk_zone *zone) + { + struct zonefs_inode_info *zi = ZONEFS_I(inode); + + switch (zone->cond) { + case BLK_ZONE_COND_OFFLINE: +- /* +- * Dead zone: make the inode immutable, disable all accesses +- * and set the file size to 0 (zone wp set to zone start). +- */ +- if (warn) +- zonefs_warn(inode->i_sb, "inode %lu: offline zone\n", +- inode->i_ino); +- inode->i_flags |= S_IMMUTABLE; +- inode->i_mode &= ~0777; +- zone->wp = zone->start; ++ zonefs_warn(inode->i_sb, "inode %lu: offline zone\n", ++ inode->i_ino); + zi->i_flags |= ZONEFS_ZONE_OFFLINE; + return 0; + case BLK_ZONE_COND_READONLY: + /* +- * The write pointer of read-only zones is invalid. If such a +- * zone is found during mount, the file size cannot be retrieved +- * so we treat the zone as offline (mount == true case). +- * Otherwise, keep the file size as it was when last updated +- * so that the user can recover data. In both cases, writes are +- * always disabled for the zone. ++ * The write pointer of read-only zones is invalid, so we cannot ++ * determine the zone wpoffset (inode size). We thus keep the ++ * zone wpoffset as is, which leads to an empty file ++ * (wpoffset == 0) on mount. For a runtime error, this keeps ++ * the inode size as it was when last updated so that the user ++ * can recover data. + */ +- if (warn) +- zonefs_warn(inode->i_sb, "inode %lu: read-only zone\n", +- inode->i_ino); +- inode->i_flags |= S_IMMUTABLE; +- if (mount) { +- zone->cond = BLK_ZONE_COND_OFFLINE; +- inode->i_mode &= ~0777; +- zone->wp = zone->start; +- zi->i_flags |= ZONEFS_ZONE_OFFLINE; +- return 0; +- } ++ zonefs_warn(inode->i_sb, "inode %lu: read-only zone\n", ++ inode->i_ino); + zi->i_flags |= ZONEFS_ZONE_READONLY; +- inode->i_mode &= ~0222; +- return i_size_read(inode); ++ if (zi->i_ztype == ZONEFS_ZTYPE_CNV) ++ return zi->i_max_size; ++ return zi->i_wpoffset; + case BLK_ZONE_COND_FULL: + /* The write pointer of full zones is invalid. */ + return zi->i_max_size; +@@ -207,6 +190,30 @@ static loff_t zonefs_check_zone_condition(struct inode *inode, + } + } + ++/* ++ * Check a zone condition and adjust its inode access permissions for ++ * offline and readonly zones. 
++ */ ++static void zonefs_inode_update_mode(struct inode *inode) ++{ ++ struct zonefs_inode_info *zi = ZONEFS_I(inode); ++ ++ if (zi->i_flags & ZONEFS_ZONE_OFFLINE) { ++ /* Offline zones cannot be read nor written */ ++ inode->i_flags |= S_IMMUTABLE; ++ inode->i_mode &= ~0777; ++ } else if (zi->i_flags & ZONEFS_ZONE_READONLY) { ++ /* Readonly zones cannot be written */ ++ inode->i_flags |= S_IMMUTABLE; ++ if (zi->i_flags & ZONEFS_ZONE_INIT_MODE) ++ inode->i_mode &= ~0777; ++ else ++ inode->i_mode &= ~0222; ++ } ++ ++ zi->i_flags &= ~ZONEFS_ZONE_INIT_MODE; ++} ++ + struct zonefs_ioerr_data { + struct inode *inode; + bool write; +@@ -228,10 +235,9 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx, + * as there is no inconsistency between the inode size and the amount of + * data writen in the zone (data_size). + */ +- data_size = zonefs_check_zone_condition(inode, zone, true, false); ++ data_size = zonefs_check_zone_condition(inode, zone); + isize = i_size_read(inode); +- if (zone->cond != BLK_ZONE_COND_OFFLINE && +- zone->cond != BLK_ZONE_COND_READONLY && ++ if (!(zi->i_flags & (ZONEFS_ZONE_READONLY | ZONEFS_ZONE_OFFLINE)) && + !err->write && isize == data_size) + return 0; + +@@ -264,24 +270,22 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx, + * zone condition to read-only and offline respectively, as if the + * condition was signaled by the hardware. + */ +- if (zone->cond == BLK_ZONE_COND_OFFLINE || +- sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_ZOL) { ++ if ((zi->i_flags & ZONEFS_ZONE_OFFLINE) || ++ (sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_ZOL)) { + zonefs_warn(sb, "inode %lu: read/write access disabled\n", + inode->i_ino); +- if (zone->cond != BLK_ZONE_COND_OFFLINE) { +- zone->cond = BLK_ZONE_COND_OFFLINE; +- data_size = zonefs_check_zone_condition(inode, zone, +- false, false); +- } +- } else if (zone->cond == BLK_ZONE_COND_READONLY || +- sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_ZRO) { ++ if (!(zi->i_flags & ZONEFS_ZONE_OFFLINE)) ++ zi->i_flags |= ZONEFS_ZONE_OFFLINE; ++ zonefs_inode_update_mode(inode); ++ data_size = 0; ++ } else if ((zi->i_flags & ZONEFS_ZONE_READONLY) || ++ (sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_ZRO)) { + zonefs_warn(sb, "inode %lu: write access disabled\n", + inode->i_ino); +- if (zone->cond != BLK_ZONE_COND_READONLY) { +- zone->cond = BLK_ZONE_COND_READONLY; +- data_size = zonefs_check_zone_condition(inode, zone, +- false, false); +- } ++ if (!(zi->i_flags & ZONEFS_ZONE_READONLY)) ++ zi->i_flags |= ZONEFS_ZONE_READONLY; ++ zonefs_inode_update_mode(inode); ++ data_size = isize; + } else if (sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_RO && + data_size > isize) { + /* Do not expose garbage data */ +@@ -295,8 +299,7 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx, + * close of the zone when the inode file is closed. 
+ */ + if ((sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN) && +- (zone->cond == BLK_ZONE_COND_OFFLINE || +- zone->cond == BLK_ZONE_COND_READONLY)) ++ (zi->i_flags & (ZONEFS_ZONE_READONLY | ZONEFS_ZONE_OFFLINE))) + zi->i_flags &= ~ZONEFS_ZONE_OPEN; + + /* +@@ -378,6 +381,7 @@ static struct inode *zonefs_alloc_inode(struct super_block *sb) + + inode_init_once(&zi->i_vnode); + mutex_init(&zi->i_truncate_mutex); ++ zi->i_wpoffset = 0; + zi->i_wr_refcnt = 0; + zi->i_flags = 0; + +@@ -594,7 +598,7 @@ static int zonefs_init_file_inode(struct inode *inode, struct blk_zone *zone, + + zi->i_max_size = min_t(loff_t, MAX_LFS_FILESIZE, + zone->capacity << SECTOR_SHIFT); +- zi->i_wpoffset = zonefs_check_zone_condition(inode, zone, true, true); ++ zi->i_wpoffset = zonefs_check_zone_condition(inode, zone); + + inode->i_uid = sbi->s_uid; + inode->i_gid = sbi->s_gid; +@@ -605,6 +609,10 @@ static int zonefs_init_file_inode(struct inode *inode, struct blk_zone *zone, + inode->i_fop = &zonefs_file_operations; + inode->i_mapping->a_ops = &zonefs_file_aops; + ++ /* Update the inode access rights depending on the zone condition */ ++ zi->i_flags |= ZONEFS_ZONE_INIT_MODE; ++ zonefs_inode_update_mode(inode); ++ + sb->s_maxbytes = max(zi->i_max_size, sb->s_maxbytes); + sbi->s_blocks += zi->i_max_size >> sb->s_blocksize_bits; + sbi->s_used_blocks += zi->i_wpoffset >> sb->s_blocksize_bits; +diff --git a/fs/zonefs/zonefs.h b/fs/zonefs/zonefs.h +index 839ebe9afb6c1..439096445ee53 100644 +--- a/fs/zonefs/zonefs.h ++++ b/fs/zonefs/zonefs.h +@@ -39,10 +39,11 @@ static inline enum zonefs_ztype zonefs_zone_type(struct blk_zone *zone) + return ZONEFS_ZTYPE_SEQ; + } + +-#define ZONEFS_ZONE_OPEN (1U << 0) +-#define ZONEFS_ZONE_ACTIVE (1U << 1) +-#define ZONEFS_ZONE_OFFLINE (1U << 2) +-#define ZONEFS_ZONE_READONLY (1U << 3) ++#define ZONEFS_ZONE_INIT_MODE (1U << 0) ++#define ZONEFS_ZONE_OPEN (1U << 1) ++#define ZONEFS_ZONE_ACTIVE (1U << 2) ++#define ZONEFS_ZONE_OFFLINE (1U << 3) ++#define ZONEFS_ZONE_READONLY (1U << 4) + + /* + * In-memory inode data. +-- +2.39.2 +
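For reference, the behavioural core of the "zonefs: Simplify IO error handling" patch above is that inode access permissions are now derived purely from the ZONEFS_ZONE_* flags instead of being re-derived from the device zone condition at each call site. The stand-alone C sketch below models that flag logic in user space under simplified assumptions: struct model_inode, model_update_mode() and the plain immutable field are illustrative stand-ins rather than kernel types; only the flag bits and the 0777/0222 permission masks are taken from the patch itself.

/*
 * Simplified user-space model of the flag-based permission update
 * introduced by zonefs_inode_update_mode() in the patch above.
 * The struct is a stand-in, not the kernel's struct zonefs_inode_info,
 * and S_IMMUTABLE handling is reduced to a plain integer field.
 */
#include <stdio.h>

#define ZONEFS_ZONE_INIT_MODE  (1U << 0)
#define ZONEFS_ZONE_OFFLINE    (1U << 3)
#define ZONEFS_ZONE_READONLY   (1U << 4)

struct model_inode {
	unsigned int zone_flags;   /* ZONEFS_ZONE_* bits */
	unsigned int mode;         /* 0777-style permission bits */
	int immutable;             /* stand-in for S_IMMUTABLE */
};

/* Mirrors the branches of zonefs_inode_update_mode() in the patch. */
static void model_update_mode(struct model_inode *mi)
{
	if (mi->zone_flags & ZONEFS_ZONE_OFFLINE) {
		/* Offline zones can be neither read nor written. */
		mi->immutable = 1;
		mi->mode &= ~0777;
	} else if (mi->zone_flags & ZONEFS_ZONE_READONLY) {
		/* Read-only zones cannot be written. */
		mi->immutable = 1;
		if (mi->zone_flags & ZONEFS_ZONE_INIT_MODE)
			/* Read-only at mount time: hide the file entirely. */
			mi->mode &= ~0777;
		else
			/* Runtime error: keep the file readable. */
			mi->mode &= ~0222;
	}

	/* INIT_MODE only applies to the first update after inode init. */
	mi->zone_flags &= ~ZONEFS_ZONE_INIT_MODE;
}

int main(void)
{
	struct model_inode mnt = { ZONEFS_ZONE_READONLY | ZONEFS_ZONE_INIT_MODE, 0640, 0 };
	struct model_inode err = { ZONEFS_ZONE_READONLY, 0640, 0 };

	model_update_mode(&mnt);
	model_update_mode(&err);

	printf("mount: %04o, runtime: %04o\n", mnt.mode, err.mode);
	return 0;
}

Built with any C compiler, the sketch prints "mount: 0000, runtime: 0440" for an initial mode of 0640, reflecting the distinction the commit message draws between a read-only zone found at mount time and one reported by a runtime I/O error.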