From: Sasha Levin Date: Thu, 30 Mar 2023 11:12:54 +0000 (-0400) Subject: Fixes for 6.2 X-Git-Tag: v4.14.312~67 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=2138213cd021d406d11a70bffe500f3915b65461;p=thirdparty%2Fkernel%2Fstable-queue.git Fixes for 6.2 Signed-off-by: Sasha Levin --- diff --git a/queue-6.2/btrfs-rename-btrfs_fs_no_overcommit-to-btrfs_fs_acti.patch b/queue-6.2/btrfs-rename-btrfs_fs_no_overcommit-to-btrfs_fs_acti.patch new file mode 100644 index 00000000000..7b9fdf24ab0 --- /dev/null +++ b/queue-6.2/btrfs-rename-btrfs_fs_no_overcommit-to-btrfs_fs_acti.patch @@ -0,0 +1,76 @@ +From 8a61e08dbc8fc089eca3ffba2e3a40df1b0a790e Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 1 Mar 2023 16:14:42 -0500 +Subject: btrfs: rename BTRFS_FS_NO_OVERCOMMIT to BTRFS_FS_ACTIVE_ZONE_TRACKING + +From: Josef Bacik + +[ Upstream commit bf1f1fec2724a33b67ec12032402ea75f2a83622 ] + +This flag only gets set when we're doing active zone tracking, and we're +going to need to use this flag for things related to this behavior. +Rename the flag to represent what it actually means for the file system +so it can be used in other ways and still make sense. + +Reviewed-by: Naohiro Aota +Reviewed-by: Johannes Thumshirn +Reviewed-by: Anand Jain +Signed-off-by: Josef Bacik +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Stable-dep-of: e15acc25880c ("btrfs: zoned: drop space_info->active_total_bytes") +Signed-off-by: Sasha Levin +--- + fs/btrfs/fs.h | 7 ++----- + fs/btrfs/space-info.c | 2 +- + fs/btrfs/zoned.c | 3 +-- + 3 files changed, 4 insertions(+), 8 deletions(-) + +diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h +index 3d8156fc8523f..f180ca061aef4 100644 +--- a/fs/btrfs/fs.h ++++ b/fs/btrfs/fs.h +@@ -119,11 +119,8 @@ enum { + /* Indicate that we want to commit the transaction. */ + BTRFS_FS_NEED_TRANS_COMMIT, + +- /* +- * Indicate metadata over-commit is disabled. This is set when active +- * zone tracking is needed. +- */ +- BTRFS_FS_NO_OVERCOMMIT, ++ /* This is set when active zone tracking is needed. */ ++ BTRFS_FS_ACTIVE_ZONE_TRACKING, + + /* + * Indicate if we have some features changed, this is mostly for +diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c +index 69c09508afb50..2237685d1ed0c 100644 +--- a/fs/btrfs/space-info.c ++++ b/fs/btrfs/space-info.c +@@ -407,7 +407,7 @@ int btrfs_can_overcommit(struct btrfs_fs_info *fs_info, + return 0; + + used = btrfs_space_info_used(space_info, true); +- if (test_bit(BTRFS_FS_NO_OVERCOMMIT, &fs_info->flags) && ++ if (test_bit(BTRFS_FS_ACTIVE_ZONE_TRACKING, &fs_info->flags) && + (space_info->flags & BTRFS_BLOCK_GROUP_METADATA)) + avail = 0; + else +diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c +index f3b7d8ae93a9f..a6a8bc112fc42 100644 +--- a/fs/btrfs/zoned.c ++++ b/fs/btrfs/zoned.c +@@ -539,8 +539,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) + } + atomic_set(&zone_info->active_zones_left, + max_active_zones - nactive); +- /* Overcommit does not work well with active zone tacking. 
*/ +- set_bit(BTRFS_FS_NO_OVERCOMMIT, &fs_info->flags); ++ set_bit(BTRFS_FS_ACTIVE_ZONE_TRACKING, &fs_info->flags); + } + + /* Validate superblock log */ +-- +2.39.2 + diff --git a/queue-6.2/btrfs-zoned-count-fresh-bg-region-as-zone-unusable.patch b/queue-6.2/btrfs-zoned-count-fresh-bg-region-as-zone-unusable.patch new file mode 100644 index 00000000000..2de26b26729 --- /dev/null +++ b/queue-6.2/btrfs-zoned-count-fresh-bg-region-as-zone-unusable.patch @@ -0,0 +1,135 @@ +From fb89e8debf0fa9e2c31a8441294231789d9ba76a Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 13 Mar 2023 16:06:13 +0900 +Subject: btrfs: zoned: count fresh BG region as zone unusable + +From: Naohiro Aota + +[ Upstream commit fa2068d7e922b434eba5bfb0131e6d39febfdb48 ] + +The naming of space_info->active_total_bytes is misleading. It counts +not only active block groups but also full ones which are previously +active but now inactive. That confusion results in a bug not counting +the full BGs into active_total_bytes on mount time. + +For a background, there are three kinds of block groups in terms of +activation. + + 1. Block groups never activated + 2. Block groups currently active + 3. Block groups previously active and currently inactive (due to fully + written or zone finish) + +What we really wanted to exclude from "total_bytes" is the total size of +BGs #1. They seem empty and allocatable but since they are not activated, +we cannot rely on them to do the space reservation. + +And, since BGs #1 never get activated, they should have no "used", +"reserved" and "pinned" bytes. + +OTOH, BGs #3 can be counted in the "total", since they are already full +we cannot allocate from them anyway. For them, "total_bytes == used + +reserved + pinned + zone_unusable" should hold. + +Tracking #2 and #3 as "active_total_bytes" (current implementation) is +confusing. And, tracking #1 and subtract that properly from "total_bytes" +every time you need space reservation is cumbersome. + +Instead, we can count the whole region of a newly allocated block group as +zone_unusable. Then, once that block group is activated, release +[0 .. zone_capacity] from the zone_unusable counters. With this, we can +eliminate the confusing ->active_total_bytes and the code will be common +among regular and the zoned mode. Also, no additional counter is needed +with this approach. + +Fixes: 6a921de58992 ("btrfs: zoned: introduce space_info->active_total_bytes") +CC: stable@vger.kernel.org # 6.1+ +Signed-off-by: Naohiro Aota +Signed-off-by: David Sterba +Stable-dep-of: e15acc25880c ("btrfs: zoned: drop space_info->active_total_bytes") +Signed-off-by: Sasha Levin +--- + fs/btrfs/free-space-cache.c | 8 +++++++- + fs/btrfs/zoned.c | 24 +++++++++++++++++++----- + 2 files changed, 26 insertions(+), 6 deletions(-) + +diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c +index 0d250d052487c..d84cef89cdff5 100644 +--- a/fs/btrfs/free-space-cache.c ++++ b/fs/btrfs/free-space-cache.c +@@ -2693,8 +2693,13 @@ static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group, + bg_reclaim_threshold = READ_ONCE(sinfo->bg_reclaim_threshold); + + spin_lock(&ctl->tree_lock); ++ /* Count initial region as zone_unusable until it gets activated. 
*/ + if (!used) + to_free = size; ++ else if (initial && ++ test_bit(BTRFS_FS_ACTIVE_ZONE_TRACKING, &block_group->fs_info->flags) && ++ (block_group->flags & (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM))) ++ to_free = 0; + else if (initial) + to_free = block_group->zone_capacity; + else if (offset >= block_group->alloc_offset) +@@ -2722,7 +2727,8 @@ static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group, + reclaimable_unusable = block_group->zone_unusable - + (block_group->length - block_group->zone_capacity); + /* All the region is now unusable. Mark it as unused and reclaim */ +- if (block_group->zone_unusable == block_group->length) { ++ if (block_group->zone_unusable == block_group->length && ++ block_group->alloc_offset) { + btrfs_mark_bg_unused(block_group); + } else if (bg_reclaim_threshold && + reclaimable_unusable >= +diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c +index a6a8bc112fc42..c3c763cc06399 100644 +--- a/fs/btrfs/zoned.c ++++ b/fs/btrfs/zoned.c +@@ -1576,9 +1576,19 @@ void btrfs_calc_zone_unusable(struct btrfs_block_group *cache) + return; + + WARN_ON(cache->bytes_super != 0); +- unusable = (cache->alloc_offset - cache->used) + +- (cache->length - cache->zone_capacity); +- free = cache->zone_capacity - cache->alloc_offset; ++ ++ /* Check for block groups never get activated */ ++ if (test_bit(BTRFS_FS_ACTIVE_ZONE_TRACKING, &cache->fs_info->flags) && ++ cache->flags & (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM) && ++ !test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &cache->runtime_flags) && ++ cache->alloc_offset == 0) { ++ unusable = cache->length; ++ free = 0; ++ } else { ++ unusable = (cache->alloc_offset - cache->used) + ++ (cache->length - cache->zone_capacity); ++ free = cache->zone_capacity - cache->alloc_offset; ++ } + + /* We only need ->free_space in ALLOC_SEQ block groups */ + cache->cached = BTRFS_CACHE_FINISHED; +@@ -1915,7 +1925,11 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group) + + /* Successfully activated all the zones */ + set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags); +- space_info->active_total_bytes += block_group->length; ++ WARN_ON(block_group->alloc_offset != 0); ++ if (block_group->zone_unusable == block_group->length) { ++ block_group->zone_unusable = block_group->length - block_group->zone_capacity; ++ space_info->bytes_zone_unusable -= block_group->zone_capacity; ++ } + spin_unlock(&block_group->lock); + btrfs_try_granting_tickets(fs_info, space_info); + spin_unlock(&space_info->lock); +@@ -2279,7 +2293,7 @@ int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info) + u64 avail; + + spin_lock(&block_group->lock); +- if (block_group->reserved || ++ if (block_group->reserved || block_group->alloc_offset == 0 || + (block_group->flags & BTRFS_BLOCK_GROUP_SYSTEM)) { + spin_unlock(&block_group->lock); + continue; +-- +2.39.2 + diff --git a/queue-6.2/btrfs-zoned-drop-space_info-active_total_bytes.patch b/queue-6.2/btrfs-zoned-drop-space_info-active_total_bytes.patch new file mode 100644 index 00000000000..e48bc20f8e2 --- /dev/null +++ b/queue-6.2/btrfs-zoned-drop-space_info-active_total_bytes.patch @@ -0,0 +1,192 @@ +From 56c14d707e8baa1a944f01457f5509fa7188c304 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 13 Mar 2023 16:06:14 +0900 +Subject: btrfs: zoned: drop space_info->active_total_bytes + +From: Naohiro Aota + +[ Upstream commit e15acc25880cf048dba9df94d76ed7e7e10040e6 ] + +The space_info->active_total_bytes is no longer necessary as we now +count the 
region of newly allocated block group as zone_unusable. Drop +its usage. + +Fixes: 6a921de58992 ("btrfs: zoned: introduce space_info->active_total_bytes") +CC: stable@vger.kernel.org # 6.1+ +Signed-off-by: Naohiro Aota +Signed-off-by: David Sterba +Signed-off-by: Sasha Levin +--- + fs/btrfs/block-group.c | 6 ------ + fs/btrfs/space-info.c | 40 +++++++++------------------------------- + fs/btrfs/space-info.h | 2 -- + fs/btrfs/zoned.c | 4 ---- + 4 files changed, 9 insertions(+), 43 deletions(-) + +diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c +index d628d545ffea7..8eb625318e785 100644 +--- a/fs/btrfs/block-group.c ++++ b/fs/btrfs/block-group.c +@@ -1036,14 +1036,8 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, + < block_group->zone_unusable); + WARN_ON(block_group->space_info->disk_total + < block_group->length * factor); +- WARN_ON(test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, +- &block_group->runtime_flags) && +- block_group->space_info->active_total_bytes +- < block_group->length); + } + block_group->space_info->total_bytes -= block_group->length; +- if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags)) +- block_group->space_info->active_total_bytes -= block_group->length; + block_group->space_info->bytes_readonly -= + (block_group->length - block_group->zone_unusable); + block_group->space_info->bytes_zone_unusable -= +diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c +index 2237685d1ed0c..3eecce86f63fc 100644 +--- a/fs/btrfs/space-info.c ++++ b/fs/btrfs/space-info.c +@@ -308,8 +308,6 @@ void btrfs_add_bg_to_space_info(struct btrfs_fs_info *info, + ASSERT(found); + spin_lock(&found->lock); + found->total_bytes += block_group->length; +- if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags)) +- found->active_total_bytes += block_group->length; + found->disk_total += block_group->length * factor; + found->bytes_used += block_group->used; + found->disk_used += block_group->used * factor; +@@ -379,22 +377,6 @@ static u64 calc_available_free_space(struct btrfs_fs_info *fs_info, + return avail; + } + +-static inline u64 writable_total_bytes(struct btrfs_fs_info *fs_info, +- struct btrfs_space_info *space_info) +-{ +- /* +- * On regular filesystem, all total_bytes are always writable. On zoned +- * filesystem, there may be a limitation imposed by max_active_zones. +- * For metadata allocation, we cannot finish an existing active block +- * group to avoid a deadlock. Thus, we need to consider only the active +- * groups to be writable for metadata space. +- */ +- if (!btrfs_is_zoned(fs_info) || (space_info->flags & BTRFS_BLOCK_GROUP_DATA)) +- return space_info->total_bytes; +- +- return space_info->active_total_bytes; +-} +- + int btrfs_can_overcommit(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, u64 bytes, + enum btrfs_reserve_flush_enum flush) +@@ -413,7 +395,7 @@ int btrfs_can_overcommit(struct btrfs_fs_info *fs_info, + else + avail = calc_available_free_space(fs_info, space_info, flush); + +- if (used + bytes < writable_total_bytes(fs_info, space_info) + avail) ++ if (used + bytes < space_info->total_bytes + avail) + return 1; + return 0; + } +@@ -449,7 +431,7 @@ void btrfs_try_granting_tickets(struct btrfs_fs_info *fs_info, + ticket = list_first_entry(head, struct reserve_ticket, list); + + /* Check and see if our ticket can be satisfied now. 
*/ +- if ((used + ticket->bytes <= writable_total_bytes(fs_info, space_info)) || ++ if ((used + ticket->bytes <= space_info->total_bytes) || + btrfs_can_overcommit(fs_info, space_info, ticket->bytes, + flush)) { + btrfs_space_info_update_bytes_may_use(fs_info, +@@ -829,7 +811,6 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info, + { + u64 used; + u64 avail; +- u64 total; + u64 to_reclaim = space_info->reclaim_size; + + lockdep_assert_held(&space_info->lock); +@@ -844,9 +825,8 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info, + * space. If that's the case add in our overage so we make sure to put + * appropriate pressure on the flushing state machine. + */ +- total = writable_total_bytes(fs_info, space_info); +- if (total + avail < used) +- to_reclaim += used - (total + avail); ++ if (space_info->total_bytes + avail < used) ++ to_reclaim += used - (space_info->total_bytes + avail); + + return to_reclaim; + } +@@ -856,11 +836,10 @@ static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info, + { + u64 global_rsv_size = fs_info->global_block_rsv.reserved; + u64 ordered, delalloc; +- u64 total = writable_total_bytes(fs_info, space_info); + u64 thresh; + u64 used; + +- thresh = mult_perc(total, 90); ++ thresh = mult_perc(space_info->total_bytes, 90); + + lockdep_assert_held(&space_info->lock); + +@@ -923,8 +902,8 @@ static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info, + BTRFS_RESERVE_FLUSH_ALL); + used = space_info->bytes_used + space_info->bytes_reserved + + space_info->bytes_readonly + global_rsv_size; +- if (used < total) +- thresh += total - used; ++ if (used < space_info->total_bytes) ++ thresh += space_info->total_bytes - used; + thresh >>= space_info->clamp; + + used = space_info->bytes_pinned; +@@ -1651,7 +1630,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, + * can_overcommit() to ensure we can overcommit to continue. + */ + if (!pending_tickets && +- ((used + orig_bytes <= writable_total_bytes(fs_info, space_info)) || ++ ((used + orig_bytes <= space_info->total_bytes) || + btrfs_can_overcommit(fs_info, space_info, orig_bytes, flush))) { + btrfs_space_info_update_bytes_may_use(fs_info, space_info, + orig_bytes); +@@ -1665,8 +1644,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, + */ + if (ret && unlikely(flush == BTRFS_RESERVE_FLUSH_EMERGENCY)) { + used = btrfs_space_info_used(space_info, false); +- if (used + orig_bytes <= +- writable_total_bytes(fs_info, space_info)) { ++ if (used + orig_bytes <= space_info->total_bytes) { + btrfs_space_info_update_bytes_may_use(fs_info, space_info, + orig_bytes); + ret = 0; +diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h +index fc99ea2b0c34f..2033b71b18cec 100644 +--- a/fs/btrfs/space-info.h ++++ b/fs/btrfs/space-info.h +@@ -96,8 +96,6 @@ struct btrfs_space_info { + u64 bytes_may_use; /* number of bytes that may be used for + delalloc/allocations */ + u64 bytes_readonly; /* total bytes that are read only */ +- /* Total bytes in the space, but only accounts active block groups. 
*/ +- u64 active_total_bytes; + u64 bytes_zone_unusable; /* total bytes that are unusable until + resetting the device zone */ + +diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c +index c3c763cc06399..ce5ebba7fdd9a 100644 +--- a/fs/btrfs/zoned.c ++++ b/fs/btrfs/zoned.c +@@ -2330,10 +2330,6 @@ int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info, + if (!btrfs_is_zoned(fs_info) || (space_info->flags & BTRFS_BLOCK_GROUP_DATA)) + return 0; + +- /* No more block groups to activate */ +- if (space_info->active_total_bytes == space_info->total_bytes) +- return 0; +- + for (;;) { + int ret; + bool need_finish = false; +-- +2.39.2 + diff --git a/queue-6.2/cifs-avoid-race-conditions-with-parallel-reconnects.patch b/queue-6.2/cifs-avoid-race-conditions-with-parallel-reconnects.patch new file mode 100644 index 00000000000..5c15f0928cb --- /dev/null +++ b/queue-6.2/cifs-avoid-race-conditions-with-parallel-reconnects.patch @@ -0,0 +1,333 @@ +From 1f3fa825883f51944f3e0d3d92717251a8844cb3 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 20 Mar 2023 06:08:19 +0000 +Subject: cifs: avoid race conditions with parallel reconnects + +From: Shyam Prasad N + +[ Upstream commit bc962159e8e326af634a506508034a375bf2b858 ] + +When multiple processes/channels do reconnects in parallel +we used to return success immediately +negotiate/session-setup/tree-connect, causing race conditions +between processes that enter the function in parallel. +This caused several errors related to session not found to +show up during parallel reconnects. + +Signed-off-by: Shyam Prasad N +Reviewed-by: Paulo Alcantara (SUSE) +Cc: stable@vger.kernel.org +Signed-off-by: Steve French +Signed-off-by: Sasha Levin +--- + fs/cifs/connect.c | 48 ++++++++++++++++++++++++++++++----------- + fs/cifs/smb2pdu.c | 44 +++++++++++++++++++++---------------- + fs/cifs/smb2transport.c | 17 ++++++++++++--- + 3 files changed, 76 insertions(+), 33 deletions(-) + +diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c +index f53837f436d08..985e962cf0858 100644 +--- a/fs/cifs/connect.c ++++ b/fs/cifs/connect.c +@@ -244,31 +244,42 @@ cifs_mark_tcp_ses_conns_for_reconnect(struct TCP_Server_Info *server, + cifs_chan_update_iface(ses, server); + + spin_lock(&ses->chan_lock); +- if (!mark_smb_session && cifs_chan_needs_reconnect(ses, server)) +- goto next_session; ++ if (!mark_smb_session && cifs_chan_needs_reconnect(ses, server)) { ++ spin_unlock(&ses->chan_lock); ++ continue; ++ } + + if (mark_smb_session) + CIFS_SET_ALL_CHANS_NEED_RECONNECT(ses); + else + cifs_chan_set_need_reconnect(ses, server); + ++ cifs_dbg(FYI, "%s: channel connect bitmap: 0x%lx\n", ++ __func__, ses->chans_need_reconnect); ++ + /* If all channels need reconnect, then tcon needs reconnect */ +- if (!mark_smb_session && !CIFS_ALL_CHANS_NEED_RECONNECT(ses)) +- goto next_session; ++ if (!mark_smb_session && !CIFS_ALL_CHANS_NEED_RECONNECT(ses)) { ++ spin_unlock(&ses->chan_lock); ++ continue; ++ } ++ spin_unlock(&ses->chan_lock); + ++ spin_lock(&ses->ses_lock); + ses->ses_status = SES_NEED_RECON; ++ spin_unlock(&ses->ses_lock); + + list_for_each_entry(tcon, &ses->tcon_list, tcon_list) { + tcon->need_reconnect = true; ++ spin_lock(&tcon->tc_lock); + tcon->status = TID_NEED_RECON; ++ spin_unlock(&tcon->tc_lock); + } + if (ses->tcon_ipc) { + ses->tcon_ipc->need_reconnect = true; ++ spin_lock(&ses->tcon_ipc->tc_lock); + ses->tcon_ipc->status = TID_NEED_RECON; ++ spin_unlock(&ses->tcon_ipc->tc_lock); + } +- +-next_session: +- spin_unlock(&ses->chan_lock); + } + 
spin_unlock(&cifs_tcp_ses_lock); + } +@@ -3703,11 +3714,19 @@ cifs_negotiate_protocol(const unsigned int xid, struct cifs_ses *ses, + + /* only send once per connect */ + spin_lock(&server->srv_lock); +- if (!server->ops->need_neg(server) || ++ if (server->tcpStatus != CifsGood && ++ server->tcpStatus != CifsNew && + server->tcpStatus != CifsNeedNegotiate) { ++ spin_unlock(&server->srv_lock); ++ return -EHOSTDOWN; ++ } ++ ++ if (!server->ops->need_neg(server) && ++ server->tcpStatus == CifsGood) { + spin_unlock(&server->srv_lock); + return 0; + } ++ + server->tcpStatus = CifsInNegotiate; + spin_unlock(&server->srv_lock); + +@@ -3741,23 +3760,28 @@ cifs_setup_session(const unsigned int xid, struct cifs_ses *ses, + bool is_binding = false; + + spin_lock(&ses->ses_lock); ++ cifs_dbg(FYI, "%s: channel connect bitmap: 0x%lx\n", ++ __func__, ses->chans_need_reconnect); ++ + if (ses->ses_status != SES_GOOD && + ses->ses_status != SES_NEW && + ses->ses_status != SES_NEED_RECON) { + spin_unlock(&ses->ses_lock); +- return 0; ++ return -EHOSTDOWN; + } + + /* only send once per connect */ + spin_lock(&ses->chan_lock); +- if (CIFS_ALL_CHANS_GOOD(ses) || +- cifs_chan_in_reconnect(ses, server)) { ++ if (CIFS_ALL_CHANS_GOOD(ses)) { ++ if (ses->ses_status == SES_NEED_RECON) ++ ses->ses_status = SES_GOOD; + spin_unlock(&ses->chan_lock); + spin_unlock(&ses->ses_lock); + return 0; + } +- is_binding = !CIFS_ALL_CHANS_NEED_RECONNECT(ses); ++ + cifs_chan_set_in_reconnect(ses, server); ++ is_binding = !CIFS_ALL_CHANS_NEED_RECONNECT(ses); + spin_unlock(&ses->chan_lock); + + if (!is_binding) +diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c +index 83d04cd2f9df8..f0b1ae0835d71 100644 +--- a/fs/cifs/smb2pdu.c ++++ b/fs/cifs/smb2pdu.c +@@ -199,6 +199,7 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon, + } + spin_unlock(&server->srv_lock); + ++again: + rc = cifs_wait_for_server_reconnect(server, tcon->retry); + if (rc) + return rc; +@@ -217,6 +218,7 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon, + + nls_codepage = load_nls_default(); + ++ mutex_lock(&ses->session_mutex); + /* + * Recheck after acquire mutex. 
If another thread is negotiating + * and the server never sends an answer the socket will be closed +@@ -225,6 +227,11 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon, + spin_lock(&server->srv_lock); + if (server->tcpStatus == CifsNeedReconnect) { + spin_unlock(&server->srv_lock); ++ mutex_unlock(&ses->session_mutex); ++ ++ if (tcon->retry) ++ goto again; ++ + rc = -EHOSTDOWN; + goto out; + } +@@ -234,19 +241,22 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon, + * need to prevent multiple threads trying to simultaneously + * reconnect the same SMB session + */ ++ spin_lock(&ses->ses_lock); + spin_lock(&ses->chan_lock); +- if (!cifs_chan_needs_reconnect(ses, server)) { ++ if (!cifs_chan_needs_reconnect(ses, server) && ++ ses->ses_status == SES_GOOD) { + spin_unlock(&ses->chan_lock); +- ++ spin_unlock(&ses->ses_lock); + /* this means that we only need to tree connect */ + if (tcon->need_reconnect) + goto skip_sess_setup; + ++ mutex_unlock(&ses->session_mutex); + goto out; + } + spin_unlock(&ses->chan_lock); ++ spin_unlock(&ses->ses_lock); + +- mutex_lock(&ses->session_mutex); + rc = cifs_negotiate_protocol(0, ses, server); + if (!rc) { + rc = cifs_setup_session(0, ses, server, nls_codepage); +@@ -262,10 +272,8 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon, + mutex_unlock(&ses->session_mutex); + goto out; + } +- mutex_unlock(&ses->session_mutex); + + skip_sess_setup: +- mutex_lock(&ses->session_mutex); + if (!tcon->need_reconnect) { + mutex_unlock(&ses->session_mutex); + goto out; +@@ -280,7 +288,7 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon, + cifs_dbg(FYI, "reconnect tcon rc = %d\n", rc); + if (rc) { + /* If sess reconnected but tcon didn't, something strange ... */ +- pr_warn_once("reconnect tcon failed rc = %d\n", rc); ++ cifs_dbg(VFS, "reconnect tcon failed rc = %d\n", rc); + goto out; + } + +@@ -1252,9 +1260,9 @@ SMB2_sess_alloc_buffer(struct SMB2_sess_data *sess_data) + if (rc) + return rc; + +- spin_lock(&ses->chan_lock); +- is_binding = !CIFS_ALL_CHANS_NEED_RECONNECT(ses); +- spin_unlock(&ses->chan_lock); ++ spin_lock(&ses->ses_lock); ++ is_binding = (ses->ses_status == SES_GOOD); ++ spin_unlock(&ses->ses_lock); + + if (is_binding) { + req->hdr.SessionId = cpu_to_le64(ses->Suid); +@@ -1412,9 +1420,9 @@ SMB2_auth_kerberos(struct SMB2_sess_data *sess_data) + goto out_put_spnego_key; + } + +- spin_lock(&ses->chan_lock); +- is_binding = !CIFS_ALL_CHANS_NEED_RECONNECT(ses); +- spin_unlock(&ses->chan_lock); ++ spin_lock(&ses->ses_lock); ++ is_binding = (ses->ses_status == SES_GOOD); ++ spin_unlock(&ses->ses_lock); + + /* keep session key if binding */ + if (!is_binding) { +@@ -1538,9 +1546,9 @@ SMB2_sess_auth_rawntlmssp_negotiate(struct SMB2_sess_data *sess_data) + + cifs_dbg(FYI, "rawntlmssp session setup challenge phase\n"); + +- spin_lock(&ses->chan_lock); +- is_binding = !CIFS_ALL_CHANS_NEED_RECONNECT(ses); +- spin_unlock(&ses->chan_lock); ++ spin_lock(&ses->ses_lock); ++ is_binding = (ses->ses_status == SES_GOOD); ++ spin_unlock(&ses->ses_lock); + + /* keep existing ses id and flags if binding */ + if (!is_binding) { +@@ -1606,9 +1614,9 @@ SMB2_sess_auth_rawntlmssp_authenticate(struct SMB2_sess_data *sess_data) + + rsp = (struct smb2_sess_setup_rsp *)sess_data->iov[0].iov_base; + +- spin_lock(&ses->chan_lock); +- is_binding = !CIFS_ALL_CHANS_NEED_RECONNECT(ses); +- spin_unlock(&ses->chan_lock); ++ spin_lock(&ses->ses_lock); ++ is_binding = (ses->ses_status == SES_GOOD); ++ spin_unlock(&ses->ses_lock); + + /* keep existing 
ses id and flags if binding */ + if (!is_binding) { +diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c +index d827b7547ffad..790acf65a0926 100644 +--- a/fs/cifs/smb2transport.c ++++ b/fs/cifs/smb2transport.c +@@ -81,6 +81,7 @@ int smb2_get_sign_key(__u64 ses_id, struct TCP_Server_Info *server, u8 *key) + struct cifs_ses *ses = NULL; + int i; + int rc = 0; ++ bool is_binding = false; + + spin_lock(&cifs_tcp_ses_lock); + +@@ -97,9 +98,12 @@ int smb2_get_sign_key(__u64 ses_id, struct TCP_Server_Info *server, u8 *key) + goto out; + + found: ++ spin_lock(&ses->ses_lock); + spin_lock(&ses->chan_lock); +- if (cifs_chan_needs_reconnect(ses, server) && +- !CIFS_ALL_CHANS_NEED_RECONNECT(ses)) { ++ ++ is_binding = (cifs_chan_needs_reconnect(ses, server) && ++ ses->ses_status == SES_GOOD); ++ if (is_binding) { + /* + * If we are in the process of binding a new channel + * to an existing session, use the master connection +@@ -107,6 +111,7 @@ int smb2_get_sign_key(__u64 ses_id, struct TCP_Server_Info *server, u8 *key) + */ + memcpy(key, ses->smb3signingkey, SMB3_SIGN_KEY_SIZE); + spin_unlock(&ses->chan_lock); ++ spin_unlock(&ses->ses_lock); + goto out; + } + +@@ -119,10 +124,12 @@ int smb2_get_sign_key(__u64 ses_id, struct TCP_Server_Info *server, u8 *key) + if (chan->server == server) { + memcpy(key, chan->signkey, SMB3_SIGN_KEY_SIZE); + spin_unlock(&ses->chan_lock); ++ spin_unlock(&ses->ses_lock); + goto out; + } + } + spin_unlock(&ses->chan_lock); ++ spin_unlock(&ses->ses_lock); + + cifs_dbg(VFS, + "%s: Could not find channel signing key for session 0x%llx\n", +@@ -392,11 +399,15 @@ generate_smb3signingkey(struct cifs_ses *ses, + bool is_binding = false; + int chan_index = 0; + ++ spin_lock(&ses->ses_lock); + spin_lock(&ses->chan_lock); +- is_binding = !CIFS_ALL_CHANS_NEED_RECONNECT(ses); ++ is_binding = (cifs_chan_needs_reconnect(ses, server) && ++ ses->ses_status == SES_GOOD); ++ + chan_index = cifs_ses_get_chan_index(ses, server); + /* TODO: introduce ref counting for channels when the can be freed */ + spin_unlock(&ses->chan_lock); ++ spin_unlock(&ses->ses_lock); + + /* + * All channels use the same encryption/decryption keys but +-- +2.39.2 + diff --git a/queue-6.2/cifs-prevent-data-race-in-cifs_reconnect_tcon.patch b/queue-6.2/cifs-prevent-data-race-in-cifs_reconnect_tcon.patch new file mode 100644 index 00000000000..e9c4e04da68 --- /dev/null +++ b/queue-6.2/cifs-prevent-data-race-in-cifs_reconnect_tcon.patch @@ -0,0 +1,255 @@ +From 9b1a97c7aa9103eaed0e2fde976931dbb03b6622 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 28 Feb 2023 19:01:55 -0300 +Subject: cifs: prevent data race in cifs_reconnect_tcon() + +From: Paulo Alcantara + +[ Upstream commit 1bcd548d935a33c6fc58331405eb1b82fd6150de ] + +Make sure to get an up-to-date TCP_Server_Info::nr_targets value prior +to waiting the server to be reconnected in cifs_reconnect_tcon(). It +is set in cifs_tcp_ses_needs_reconnect() and protected by +TCP_Server_Info::srv_lock. + +Create a new cifs_wait_for_server_reconnect() helper that can be used +by both SMB2+ and CIFS reconnect code. 
+ +Signed-off-by: Paulo Alcantara (SUSE) +Signed-off-by: Steve French +Stable-dep-of: bc962159e8e3 ("cifs: avoid race conditions with parallel reconnects") +Signed-off-by: Sasha Levin +--- + fs/cifs/cifsproto.h | 1 + + fs/cifs/cifssmb.c | 43 ++---------------------- + fs/cifs/misc.c | 44 ++++++++++++++++++++++++ + fs/cifs/smb2pdu.c | 82 ++++++++++++--------------------------------- + 4 files changed, 69 insertions(+), 101 deletions(-) + +diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h +index e75184544ecb4..639df85dafd6c 100644 +--- a/fs/cifs/cifsproto.h ++++ b/fs/cifs/cifsproto.h +@@ -697,5 +697,6 @@ static inline int cifs_create_options(struct cifs_sb_info *cifs_sb, int options) + + struct super_block *cifs_get_tcon_super(struct cifs_tcon *tcon); + void cifs_put_tcon_super(struct super_block *sb); ++int cifs_wait_for_server_reconnect(struct TCP_Server_Info *server, bool retry); + + #endif /* _CIFSPROTO_H */ +diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c +index 566e6a26b897c..5ca4a5383aaae 100644 +--- a/fs/cifs/cifssmb.c ++++ b/fs/cifs/cifssmb.c +@@ -70,7 +70,6 @@ cifs_reconnect_tcon(struct cifs_tcon *tcon, int smb_command) + struct cifs_ses *ses; + struct TCP_Server_Info *server; + struct nls_table *nls_codepage; +- int retries; + + /* + * SMBs NegProt, SessSetup, uLogoff do not have tcon yet so check for +@@ -98,45 +97,9 @@ cifs_reconnect_tcon(struct cifs_tcon *tcon, int smb_command) + } + spin_unlock(&tcon->tc_lock); + +- retries = server->nr_targets; +- +- /* +- * Give demultiplex thread up to 10 seconds to each target available for +- * reconnect -- should be greater than cifs socket timeout which is 7 +- * seconds. +- */ +- while (server->tcpStatus == CifsNeedReconnect) { +- rc = wait_event_interruptible_timeout(server->response_q, +- (server->tcpStatus != CifsNeedReconnect), +- 10 * HZ); +- if (rc < 0) { +- cifs_dbg(FYI, "%s: aborting reconnect due to a received signal by the process\n", +- __func__); +- return -ERESTARTSYS; +- } +- +- /* are we still trying to reconnect? */ +- spin_lock(&server->srv_lock); +- if (server->tcpStatus != CifsNeedReconnect) { +- spin_unlock(&server->srv_lock); +- break; +- } +- spin_unlock(&server->srv_lock); +- +- if (retries && --retries) +- continue; +- +- /* +- * on "soft" mounts we wait once. Hard mounts keep +- * retrying until process is killed or server comes +- * back on-line +- */ +- if (!tcon->retry) { +- cifs_dbg(FYI, "gave up waiting on reconnect in smb_init\n"); +- return -EHOSTDOWN; +- } +- retries = server->nr_targets; +- } ++ rc = cifs_wait_for_server_reconnect(server, tcon->retry); ++ if (rc) ++ return rc; + + spin_lock(&ses->chan_lock); + if (!cifs_chan_needs_reconnect(ses, server) && !tcon->need_reconnect) { +diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c +index 9f4486b705d5c..5542893ef03f7 100644 +--- a/fs/cifs/misc.c ++++ b/fs/cifs/misc.c +@@ -1376,3 +1376,47 @@ int cifs_inval_name_dfs_link_error(const unsigned int xid, + return 0; + } + #endif ++ ++int cifs_wait_for_server_reconnect(struct TCP_Server_Info *server, bool retry) ++{ ++ int timeout = 10; ++ int rc; ++ ++ spin_lock(&server->srv_lock); ++ if (server->tcpStatus != CifsNeedReconnect) { ++ spin_unlock(&server->srv_lock); ++ return 0; ++ } ++ timeout *= server->nr_targets; ++ spin_unlock(&server->srv_lock); ++ ++ /* ++ * Give demultiplex thread up to 10 seconds to each target available for ++ * reconnect -- should be greater than cifs socket timeout which is 7 ++ * seconds. ++ * ++ * On "soft" mounts we wait once. 
Hard mounts keep retrying until ++ * process is killed or server comes back on-line. ++ */ ++ do { ++ rc = wait_event_interruptible_timeout(server->response_q, ++ (server->tcpStatus != CifsNeedReconnect), ++ timeout * HZ); ++ if (rc < 0) { ++ cifs_dbg(FYI, "%s: aborting reconnect due to received signal\n", ++ __func__); ++ return -ERESTARTSYS; ++ } ++ ++ /* are we still trying to reconnect? */ ++ spin_lock(&server->srv_lock); ++ if (server->tcpStatus != CifsNeedReconnect) { ++ spin_unlock(&server->srv_lock); ++ return 0; ++ } ++ spin_unlock(&server->srv_lock); ++ } while (retry); ++ ++ cifs_dbg(FYI, "%s: gave up waiting on reconnect\n", __func__); ++ return -EHOSTDOWN; ++} +diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c +index 6e6e44d8b4c79..83d04cd2f9df8 100644 +--- a/fs/cifs/smb2pdu.c ++++ b/fs/cifs/smb2pdu.c +@@ -139,66 +139,6 @@ smb2_hdr_assemble(struct smb2_hdr *shdr, __le16 smb2_cmd, + return; + } + +-static int wait_for_server_reconnect(struct TCP_Server_Info *server, +- __le16 smb2_command, bool retry) +-{ +- int timeout = 10; +- int rc; +- +- spin_lock(&server->srv_lock); +- if (server->tcpStatus != CifsNeedReconnect) { +- spin_unlock(&server->srv_lock); +- return 0; +- } +- timeout *= server->nr_targets; +- spin_unlock(&server->srv_lock); +- +- /* +- * Return to caller for TREE_DISCONNECT and LOGOFF and CLOSE +- * here since they are implicitly done when session drops. +- */ +- switch (smb2_command) { +- /* +- * BB Should we keep oplock break and add flush to exceptions? +- */ +- case SMB2_TREE_DISCONNECT: +- case SMB2_CANCEL: +- case SMB2_CLOSE: +- case SMB2_OPLOCK_BREAK: +- return -EAGAIN; +- } +- +- /* +- * Give demultiplex thread up to 10 seconds to each target available for +- * reconnect -- should be greater than cifs socket timeout which is 7 +- * seconds. +- * +- * On "soft" mounts we wait once. Hard mounts keep retrying until +- * process is killed or server comes back on-line. +- */ +- do { +- rc = wait_event_interruptible_timeout(server->response_q, +- (server->tcpStatus != CifsNeedReconnect), +- timeout * HZ); +- if (rc < 0) { +- cifs_dbg(FYI, "%s: aborting reconnect due to received signal\n", +- __func__); +- return -ERESTARTSYS; +- } +- +- /* are we still trying to reconnect? */ +- spin_lock(&server->srv_lock); +- if (server->tcpStatus != CifsNeedReconnect) { +- spin_unlock(&server->srv_lock); +- return 0; +- } +- spin_unlock(&server->srv_lock); +- } while (retry); +- +- cifs_dbg(FYI, "%s: gave up waiting on reconnect\n", __func__); +- return -EHOSTDOWN; +-} +- + static int + smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon, + struct TCP_Server_Info *server) +@@ -239,7 +179,27 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon, + (!tcon->ses->server) || !server) + return -EIO; + +- rc = wait_for_server_reconnect(server, smb2_command, tcon->retry); ++ spin_lock(&server->srv_lock); ++ if (server->tcpStatus == CifsNeedReconnect) { ++ /* ++ * Return to caller for TREE_DISCONNECT and LOGOFF and CLOSE ++ * here since they are implicitly done when session drops. ++ */ ++ switch (smb2_command) { ++ /* ++ * BB Should we keep oplock break and add flush to exceptions? 
++ */ ++ case SMB2_TREE_DISCONNECT: ++ case SMB2_CANCEL: ++ case SMB2_CLOSE: ++ case SMB2_OPLOCK_BREAK: ++ spin_unlock(&server->srv_lock); ++ return -EAGAIN; ++ } ++ } ++ spin_unlock(&server->srv_lock); ++ ++ rc = cifs_wait_for_server_reconnect(server, tcon->retry); + if (rc) + return rc; + +-- +2.39.2 + diff --git a/queue-6.2/cifs-update-ip_addr-for-ses-only-for-primary-chan-se.patch b/queue-6.2/cifs-update-ip_addr-for-ses-only-for-primary-chan-se.patch new file mode 100644 index 00000000000..93f14e46b22 --- /dev/null +++ b/queue-6.2/cifs-update-ip_addr-for-ses-only-for-primary-chan-se.patch @@ -0,0 +1,64 @@ +From 6ada185dd647bb8d36c926924cab533738bf13de Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 10 Feb 2023 17:41:17 +0000 +Subject: cifs: update ip_addr for ses only for primary chan setup + +From: Shyam Prasad N + +[ Upstream commit e77978de4765229e09c8fabcf4f8419ff367317f ] + +We update ses->ip_addr whenever we do a session setup. +But this should happen only for primary channel in mchan +scenario. + +Signed-off-by: Shyam Prasad N +Reviewed-by: Paulo Alcantara (SUSE) +Signed-off-by: Steve French +Stable-dep-of: bc962159e8e3 ("cifs: avoid race conditions with parallel reconnects") +Signed-off-by: Sasha Levin +--- + fs/cifs/connect.c | 18 +++++++++++------- + 1 file changed, 11 insertions(+), 7 deletions(-) + +diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c +index 6da2af97b8bac..f53837f436d08 100644 +--- a/fs/cifs/connect.c ++++ b/fs/cifs/connect.c +@@ -3735,16 +3735,12 @@ cifs_setup_session(const unsigned int xid, struct cifs_ses *ses, + struct nls_table *nls_info) + { + int rc = -ENOSYS; +- struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)&server->dstaddr; +- struct sockaddr_in *addr = (struct sockaddr_in *)&server->dstaddr; ++ struct TCP_Server_Info *pserver = CIFS_SERVER_IS_CHAN(server) ? 
server->primary_server : server; ++ struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)&pserver->dstaddr; ++ struct sockaddr_in *addr = (struct sockaddr_in *)&pserver->dstaddr; + bool is_binding = false; + + spin_lock(&ses->ses_lock); +- if (server->dstaddr.ss_family == AF_INET6) +- scnprintf(ses->ip_addr, sizeof(ses->ip_addr), "%pI6", &addr6->sin6_addr); +- else +- scnprintf(ses->ip_addr, sizeof(ses->ip_addr), "%pI4", &addr->sin_addr); +- + if (ses->ses_status != SES_GOOD && + ses->ses_status != SES_NEW && + ses->ses_status != SES_NEED_RECON) { +@@ -3768,6 +3764,14 @@ cifs_setup_session(const unsigned int xid, struct cifs_ses *ses, + ses->ses_status = SES_IN_SETUP; + spin_unlock(&ses->ses_lock); + ++ /* update ses ip_addr only for primary chan */ ++ if (server == pserver) { ++ if (server->dstaddr.ss_family == AF_INET6) ++ scnprintf(ses->ip_addr, sizeof(ses->ip_addr), "%pI6", &addr6->sin6_addr); ++ else ++ scnprintf(ses->ip_addr, sizeof(ses->ip_addr), "%pI4", &addr->sin_addr); ++ } ++ + if (!is_binding) { + ses->capabilities = server->capabilities; + if (!linuxExtEnabled) +-- +2.39.2 + diff --git a/queue-6.2/fsverity-don-t-drop-pagecache-at-end-of-fs_ioc_enabl.patch b/queue-6.2/fsverity-don-t-drop-pagecache-at-end-of-fs_ioc_enabl.patch new file mode 100644 index 00000000000..1872cd632db --- /dev/null +++ b/queue-6.2/fsverity-don-t-drop-pagecache-at-end-of-fs_ioc_enabl.patch @@ -0,0 +1,73 @@ +From 8e38bc5af3d0e0d81a9ccb3a22c97e9361acd81d Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 14 Mar 2023 16:31:32 -0700 +Subject: fsverity: don't drop pagecache at end of FS_IOC_ENABLE_VERITY + +From: Eric Biggers + +[ Upstream commit a075bacde257f755bea0e53400c9f1cdd1b8e8e6 ] + +The full pagecache drop at the end of FS_IOC_ENABLE_VERITY is causing +performance problems and is hindering adoption of fsverity. It was +intended to solve a race condition where unverified pages might be left +in the pagecache. But actually it doesn't solve it fully. + +Since the incomplete solution for this race condition has too much +performance impact for it to be worth it, let's remove it for now. + +Fixes: 3fda4c617e84 ("fs-verity: implement FS_IOC_ENABLE_VERITY ioctl") +Cc: stable@vger.kernel.org +Reviewed-by: Victor Hsieh +Link: https://lore.kernel.org/r/20230314235332.50270-1-ebiggers@kernel.org +Signed-off-by: Eric Biggers +Signed-off-by: Sasha Levin +--- + fs/verity/enable.c | 24 +++++++++++++----------- + 1 file changed, 13 insertions(+), 11 deletions(-) + +diff --git a/fs/verity/enable.c b/fs/verity/enable.c +index df6b499bf6a14..400c264bf8930 100644 +--- a/fs/verity/enable.c ++++ b/fs/verity/enable.c +@@ -390,25 +390,27 @@ int fsverity_ioctl_enable(struct file *filp, const void __user *uarg) + goto out_drop_write; + + err = enable_verity(filp, &arg); +- if (err) +- goto out_allow_write_access; + + /* +- * Some pages of the file may have been evicted from pagecache after +- * being used in the Merkle tree construction, then read into pagecache +- * again by another process reading from the file concurrently. Since +- * these pages didn't undergo verification against the file digest which +- * fs-verity now claims to be enforcing, we have to wipe the pagecache +- * to ensure that all future reads are verified. ++ * We no longer drop the inode's pagecache after enabling verity. This ++ * used to be done to try to avoid a race condition where pages could be ++ * evicted after being used in the Merkle tree construction, then ++ * re-instantiated by a concurrent read. 
Such pages are unverified, and ++ * the backing storage could have filled them with different content, so ++ * they shouldn't be used to fulfill reads once verity is enabled. ++ * ++ * But, dropping the pagecache has a big performance impact, and it ++ * doesn't fully solve the race condition anyway. So for those reasons, ++ * and also because this race condition isn't very important relatively ++ * speaking (especially for small-ish files, where the chance of a page ++ * being used, evicted, *and* re-instantiated all while enabling verity ++ * is quite small), we no longer drop the inode's pagecache. + */ +- filemap_write_and_wait(inode->i_mapping); +- invalidate_inode_pages2(inode->i_mapping); + + /* + * allow_write_access() is needed to pair with deny_write_access(). + * Regardless, the filesystem won't allow writing to verity files. + */ +-out_allow_write_access: + allow_write_access(filp); + out_drop_write: + mnt_drop_write_file(filp); +-- +2.39.2 + diff --git a/queue-6.2/series b/queue-6.2/series new file mode 100644 index 00000000000..17cefee76b0 --- /dev/null +++ b/queue-6.2/series @@ -0,0 +1,13 @@ +thunderbolt-limit-usb3-bandwidth-of-certain-intel-us.patch +cifs-update-ip_addr-for-ses-only-for-primary-chan-se.patch +cifs-prevent-data-race-in-cifs_reconnect_tcon.patch +cifs-avoid-race-conditions-with-parallel-reconnects.patch +zonefs-reorganize-code.patch +zonefs-simplify-io-error-handling.patch +zonefs-reduce-struct-zonefs_inode_info-size.patch +zonefs-separate-zone-information-from-inode-informat.patch +zonefs-fix-error-message-in-zonefs_file_dio_append.patch +btrfs-rename-btrfs_fs_no_overcommit-to-btrfs_fs_acti.patch +btrfs-zoned-count-fresh-bg-region-as-zone-unusable.patch +btrfs-zoned-drop-space_info-active_total_bytes.patch +fsverity-don-t-drop-pagecache-at-end-of-fs_ioc_enabl.patch diff --git a/queue-6.2/thunderbolt-limit-usb3-bandwidth-of-certain-intel-us.patch b/queue-6.2/thunderbolt-limit-usb3-bandwidth-of-certain-intel-us.patch new file mode 100644 index 00000000000..a7ccee3242d --- /dev/null +++ b/queue-6.2/thunderbolt-limit-usb3-bandwidth-of-certain-intel-us.patch @@ -0,0 +1,138 @@ +From 37e2a48598a23c799b1dd4fabaa5b7d6bec7ab4c Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 31 Jan 2023 13:04:52 +0200 +Subject: thunderbolt: Limit USB3 bandwidth of certain Intel USB4 host routers + +From: Gil Fine + +[ Upstream commit f0a57dd33b3eadf540912cd130db727ea824d174 ] + +Current Intel USB4 host routers have hardware limitation that the USB3 +bandwidth cannot go higher than 16376 Mb/s. Work this around by adding a +new quirk that limits the bandwidth for the affected host routers. 
+ +Cc: stable@vger.kernel.org +Signed-off-by: Gil Fine +Signed-off-by: Mika Westerberg +Signed-off-by: Sasha Levin +--- + drivers/thunderbolt/quirks.c | 31 +++++++++++++++++++++++++++++++ + drivers/thunderbolt/tb.h | 3 +++ + drivers/thunderbolt/usb4.c | 17 +++++++++++++++-- + 3 files changed, 49 insertions(+), 2 deletions(-) + +diff --git a/drivers/thunderbolt/quirks.c b/drivers/thunderbolt/quirks.c +index ae28a03fa890b..1157b8869bcca 100644 +--- a/drivers/thunderbolt/quirks.c ++++ b/drivers/thunderbolt/quirks.c +@@ -26,6 +26,19 @@ static void quirk_clx_disable(struct tb_switch *sw) + tb_sw_dbg(sw, "disabling CL states\n"); + } + ++static void quirk_usb3_maximum_bandwidth(struct tb_switch *sw) ++{ ++ struct tb_port *port; ++ ++ tb_switch_for_each_port(sw, port) { ++ if (!tb_port_is_usb3_down(port)) ++ continue; ++ port->max_bw = 16376; ++ tb_port_dbg(port, "USB3 maximum bandwidth limited to %u Mb/s\n", ++ port->max_bw); ++ } ++} ++ + struct tb_quirk { + u16 hw_vendor_id; + u16 hw_device_id; +@@ -43,6 +56,24 @@ static const struct tb_quirk tb_quirks[] = { + * DP buffers. + */ + { 0x8087, 0x0b26, 0x0000, 0x0000, quirk_dp_credit_allocation }, ++ /* ++ * Limit the maximum USB3 bandwidth for the following Intel USB4 ++ * host routers due to a hardware issue. ++ */ ++ { 0x8087, PCI_DEVICE_ID_INTEL_ADL_NHI0, 0x0000, 0x0000, ++ quirk_usb3_maximum_bandwidth }, ++ { 0x8087, PCI_DEVICE_ID_INTEL_ADL_NHI1, 0x0000, 0x0000, ++ quirk_usb3_maximum_bandwidth }, ++ { 0x8087, PCI_DEVICE_ID_INTEL_RPL_NHI0, 0x0000, 0x0000, ++ quirk_usb3_maximum_bandwidth }, ++ { 0x8087, PCI_DEVICE_ID_INTEL_RPL_NHI1, 0x0000, 0x0000, ++ quirk_usb3_maximum_bandwidth }, ++ { 0x8087, PCI_DEVICE_ID_INTEL_MTL_M_NHI0, 0x0000, 0x0000, ++ quirk_usb3_maximum_bandwidth }, ++ { 0x8087, PCI_DEVICE_ID_INTEL_MTL_P_NHI0, 0x0000, 0x0000, ++ quirk_usb3_maximum_bandwidth }, ++ { 0x8087, PCI_DEVICE_ID_INTEL_MTL_P_NHI1, 0x0000, 0x0000, ++ quirk_usb3_maximum_bandwidth }, + /* + * CLx is not supported on AMD USB4 Yellow Carp and Pink Sardine platforms. + */ +diff --git a/drivers/thunderbolt/tb.h b/drivers/thunderbolt/tb.h +index e11d973a8f9b6..f034723b1b40e 100644 +--- a/drivers/thunderbolt/tb.h ++++ b/drivers/thunderbolt/tb.h +@@ -252,6 +252,8 @@ struct tb_switch { + * @ctl_credits: Buffers reserved for control path + * @dma_credits: Number of credits allocated for DMA tunneling for all + * DMA paths through this port. ++ * @max_bw: Maximum possible bandwidth through this adapter if set to ++ * non-zero. + * + * In USB4 terminology this structure represents an adapter (protocol or + * lane adapter). 
+@@ -277,6 +279,7 @@ struct tb_port { + unsigned int total_credits; + unsigned int ctl_credits; + unsigned int dma_credits; ++ unsigned int max_bw; + }; + + /** +diff --git a/drivers/thunderbolt/usb4.c b/drivers/thunderbolt/usb4.c +index d5cd219ee9e6b..3a11b30b6c86a 100644 +--- a/drivers/thunderbolt/usb4.c ++++ b/drivers/thunderbolt/usb4.c +@@ -1882,6 +1882,15 @@ int usb4_port_retimer_nvm_read(struct tb_port *port, u8 index, + usb4_port_retimer_nvm_read_block, &info); + } + ++static inline unsigned int ++usb4_usb3_port_max_bandwidth(const struct tb_port *port, unsigned int bw) ++{ ++ /* Take the possible bandwidth limitation into account */ ++ if (port->max_bw) ++ return min(bw, port->max_bw); ++ return bw; ++} ++ + /** + * usb4_usb3_port_max_link_rate() - Maximum support USB3 link rate + * @port: USB3 adapter port +@@ -1903,7 +1912,9 @@ int usb4_usb3_port_max_link_rate(struct tb_port *port) + return ret; + + lr = (val & ADP_USB3_CS_4_MSLR_MASK) >> ADP_USB3_CS_4_MSLR_SHIFT; +- return lr == ADP_USB3_CS_4_MSLR_20G ? 20000 : 10000; ++ ret = lr == ADP_USB3_CS_4_MSLR_20G ? 20000 : 10000; ++ ++ return usb4_usb3_port_max_bandwidth(port, ret); + } + + /** +@@ -1930,7 +1941,9 @@ int usb4_usb3_port_actual_link_rate(struct tb_port *port) + return 0; + + lr = val & ADP_USB3_CS_4_ALR_MASK; +- return lr == ADP_USB3_CS_4_ALR_20G ? 20000 : 10000; ++ ret = lr == ADP_USB3_CS_4_ALR_20G ? 20000 : 10000; ++ ++ return usb4_usb3_port_max_bandwidth(port, ret); + } + + static int usb4_usb3_port_cm_request(struct tb_port *port, bool request) +-- +2.39.2 + diff --git a/queue-6.2/zonefs-fix-error-message-in-zonefs_file_dio_append.patch b/queue-6.2/zonefs-fix-error-message-in-zonefs_file_dio_append.patch new file mode 100644 index 00000000000..87e59c4d50d --- /dev/null +++ b/queue-6.2/zonefs-fix-error-message-in-zonefs_file_dio_append.patch @@ -0,0 +1,41 @@ +From 9c08f088ff35a6aceeb24a61989b6866012caeb4 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 20 Mar 2023 22:49:15 +0900 +Subject: zonefs: Fix error message in zonefs_file_dio_append() + +From: Damien Le Moal + +[ Upstream commit 88b170088ad2c3e27086fe35769aa49f8a512564 ] + +Since the expected write location in a sequential file is always at the +end of the file (append write), when an invalid write append location is +detected in zonefs_file_dio_append(), print the invalid written location +instead of the expected write location. 
+ +Fixes: a608da3bd730 ("zonefs: Detect append writes at invalid locations") +Cc: stable@vger.kernel.org +Signed-off-by: Damien Le Moal +Reviewed-by: Christoph Hellwig +Reviewed-by: Johannes Thumshirn +Reviewed-by: Himanshu Madhani +Signed-off-by: Sasha Levin +--- + fs/zonefs/file.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/fs/zonefs/file.c b/fs/zonefs/file.c +index 738b0e28d74b5..c71cc0fcb3ec8 100644 +--- a/fs/zonefs/file.c ++++ b/fs/zonefs/file.c +@@ -426,7 +426,7 @@ static ssize_t zonefs_file_dio_append(struct kiocb *iocb, struct iov_iter *from) + if (bio->bi_iter.bi_sector != wpsector) { + zonefs_warn(inode->i_sb, + "Corrupted write pointer %llu for zone at %llu\n", +- wpsector, z->z_sector); ++ bio->bi_iter.bi_sector, z->z_sector); + ret = -EIO; + } + } +-- +2.39.2 + diff --git a/queue-6.2/zonefs-reduce-struct-zonefs_inode_info-size.patch b/queue-6.2/zonefs-reduce-struct-zonefs_inode_info-size.patch new file mode 100644 index 00000000000..ff3bba79206 --- /dev/null +++ b/queue-6.2/zonefs-reduce-struct-zonefs_inode_info-size.patch @@ -0,0 +1,283 @@ +From 9f7ff5e239ee3af9be4cfb57edccf0c120518282 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 24 Nov 2022 19:43:30 +0900 +Subject: zonefs: Reduce struct zonefs_inode_info size + +From: Damien Le Moal + +[ Upstream commit 34422914dc00b291d1c47dbdabe93b154c2f2b25 ] + +Instead of using the i_ztype field in struct zonefs_inode_info to +indicate the zone type of an inode, introduce the new inode flag +ZONEFS_ZONE_CNV to be set in the i_flags field of struct +zonefs_inode_info to identify conventional zones. If this flag is not +set, the zone of an inode is considered to be a sequential zone. + +The helpers zonefs_zone_is_cnv(), zonefs_zone_is_seq(), +zonefs_inode_is_cnv() and zonefs_inode_is_seq() are introduced to +simplify testing the zone type of a struct zonefs_inode_info and of a +struct inode. + +Signed-off-by: Damien Le Moal +Reviewed-by: Johannes Thumshirn +Stable-dep-of: 88b170088ad2 ("zonefs: Fix error message in zonefs_file_dio_append()") +Signed-off-by: Sasha Levin +--- + fs/zonefs/file.c | 35 ++++++++++++++--------------------- + fs/zonefs/super.c | 12 +++++++----- + fs/zonefs/zonefs.h | 24 +++++++++++++++++++++--- + 3 files changed, 42 insertions(+), 29 deletions(-) + +diff --git a/fs/zonefs/file.c b/fs/zonefs/file.c +index ece0f3959b6d1..64873d31d75dd 100644 +--- a/fs/zonefs/file.c ++++ b/fs/zonefs/file.c +@@ -77,8 +77,7 @@ static int zonefs_write_iomap_begin(struct inode *inode, loff_t offset, + * checked when writes are issued, so warn if we see a page writeback + * operation. 
+ */ +- if (WARN_ON_ONCE(zi->i_ztype == ZONEFS_ZTYPE_SEQ && +- !(flags & IOMAP_DIRECT))) ++ if (WARN_ON_ONCE(zonefs_zone_is_seq(zi) && !(flags & IOMAP_DIRECT))) + return -EIO; + + /* +@@ -128,7 +127,7 @@ static int zonefs_write_map_blocks(struct iomap_writepage_ctx *wpc, + { + struct zonefs_inode_info *zi = ZONEFS_I(inode); + +- if (WARN_ON_ONCE(zi->i_ztype != ZONEFS_ZTYPE_CNV)) ++ if (WARN_ON_ONCE(zonefs_zone_is_seq(zi))) + return -EIO; + if (WARN_ON_ONCE(offset >= i_size_read(inode))) + return -EIO; +@@ -158,9 +157,8 @@ static int zonefs_swap_activate(struct swap_info_struct *sis, + struct file *swap_file, sector_t *span) + { + struct inode *inode = file_inode(swap_file); +- struct zonefs_inode_info *zi = ZONEFS_I(inode); + +- if (zi->i_ztype != ZONEFS_ZTYPE_CNV) { ++ if (zonefs_inode_is_seq(inode)) { + zonefs_err(inode->i_sb, + "swap file: not a conventional zone file\n"); + return -EINVAL; +@@ -196,7 +194,7 @@ int zonefs_file_truncate(struct inode *inode, loff_t isize) + * only down to a 0 size, which is equivalent to a zone reset, and to + * the maximum file size, which is equivalent to a zone finish. + */ +- if (zi->i_ztype != ZONEFS_ZTYPE_SEQ) ++ if (!zonefs_zone_is_seq(zi)) + return -EPERM; + + if (!isize) +@@ -266,7 +264,7 @@ static int zonefs_file_fsync(struct file *file, loff_t start, loff_t end, + * Since only direct writes are allowed in sequential files, page cache + * flush is needed only for conventional zone files. + */ +- if (ZONEFS_I(inode)->i_ztype == ZONEFS_ZTYPE_CNV) ++ if (zonefs_inode_is_cnv(inode)) + ret = file_write_and_wait_range(file, start, end); + if (!ret) + ret = blkdev_issue_flush(inode->i_sb->s_bdev); +@@ -280,7 +278,6 @@ static int zonefs_file_fsync(struct file *file, loff_t start, loff_t end, + static vm_fault_t zonefs_filemap_page_mkwrite(struct vm_fault *vmf) + { + struct inode *inode = file_inode(vmf->vma->vm_file); +- struct zonefs_inode_info *zi = ZONEFS_I(inode); + vm_fault_t ret; + + if (unlikely(IS_IMMUTABLE(inode))) +@@ -290,7 +287,7 @@ static vm_fault_t zonefs_filemap_page_mkwrite(struct vm_fault *vmf) + * Sanity check: only conventional zone files can have shared + * writeable mappings. + */ +- if (WARN_ON_ONCE(zi->i_ztype != ZONEFS_ZTYPE_CNV)) ++ if (zonefs_inode_is_seq(inode)) + return VM_FAULT_NOPAGE; + + sb_start_pagefault(inode->i_sb); +@@ -319,7 +316,7 @@ static int zonefs_file_mmap(struct file *file, struct vm_area_struct *vma) + * mappings are possible since there are no guarantees for write + * ordering between msync() and page cache writeback. + */ +- if (ZONEFS_I(file_inode(file))->i_ztype == ZONEFS_ZTYPE_SEQ && ++ if (zonefs_inode_is_seq(file_inode(file)) && + (vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) + return -EINVAL; + +@@ -352,7 +349,7 @@ static int zonefs_file_write_dio_end_io(struct kiocb *iocb, ssize_t size, + return error; + } + +- if (size && zi->i_ztype != ZONEFS_ZTYPE_CNV) { ++ if (size && zonefs_zone_is_seq(zi)) { + /* + * Note that we may be seeing completions out of order, + * but that is not a problem since a write completed +@@ -491,7 +488,7 @@ static ssize_t zonefs_write_checks(struct kiocb *iocb, struct iov_iter *from) + return -EINVAL; + + if (iocb->ki_flags & IOCB_APPEND) { +- if (zi->i_ztype != ZONEFS_ZTYPE_SEQ) ++ if (zonefs_zone_is_cnv(zi)) + return -EINVAL; + mutex_lock(&zi->i_truncate_mutex); + iocb->ki_pos = zi->i_wpoffset; +@@ -531,8 +528,7 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from) + * as this can cause write reordering (e.g. 
the first aio gets EAGAIN + * on the inode lock but the second goes through but is now unaligned). + */ +- if (zi->i_ztype == ZONEFS_ZTYPE_SEQ && !sync && +- (iocb->ki_flags & IOCB_NOWAIT)) ++ if (zonefs_zone_is_seq(zi) && !sync && (iocb->ki_flags & IOCB_NOWAIT)) + return -EOPNOTSUPP; + + if (iocb->ki_flags & IOCB_NOWAIT) { +@@ -554,7 +550,7 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from) + } + + /* Enforce sequential writes (append only) in sequential zones */ +- if (zi->i_ztype == ZONEFS_ZTYPE_SEQ) { ++ if (zonefs_zone_is_seq(zi)) { + mutex_lock(&zi->i_truncate_mutex); + if (iocb->ki_pos != zi->i_wpoffset) { + mutex_unlock(&zi->i_truncate_mutex); +@@ -570,7 +566,7 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from) + else + ret = iomap_dio_rw(iocb, from, &zonefs_write_iomap_ops, + &zonefs_write_dio_ops, 0, NULL, 0); +- if (zi->i_ztype == ZONEFS_ZTYPE_SEQ && ++ if (zonefs_zone_is_seq(zi) && + (ret > 0 || ret == -EIOCBQUEUED)) { + if (ret > 0) + count = ret; +@@ -596,14 +592,13 @@ static ssize_t zonefs_file_buffered_write(struct kiocb *iocb, + struct iov_iter *from) + { + struct inode *inode = file_inode(iocb->ki_filp); +- struct zonefs_inode_info *zi = ZONEFS_I(inode); + ssize_t ret; + + /* + * Direct IO writes are mandatory for sequential zone files so that the + * write IO issuing order is preserved. + */ +- if (zi->i_ztype != ZONEFS_ZTYPE_CNV) ++ if (zonefs_inode_is_seq(inode)) + return -EIO; + + if (iocb->ki_flags & IOCB_NOWAIT) { +@@ -731,9 +726,7 @@ static ssize_t zonefs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) + static inline bool zonefs_seq_file_need_wro(struct inode *inode, + struct file *file) + { +- struct zonefs_inode_info *zi = ZONEFS_I(inode); +- +- if (zi->i_ztype != ZONEFS_ZTYPE_SEQ) ++ if (zonefs_inode_is_cnv(inode)) + return false; + + if (!(file->f_mode & FMODE_WRITE)) +diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c +index 6307cc95be061..a4af29dc32e7d 100644 +--- a/fs/zonefs/super.c ++++ b/fs/zonefs/super.c +@@ -37,7 +37,7 @@ void zonefs_account_active(struct inode *inode) + + lockdep_assert_held(&zi->i_truncate_mutex); + +- if (zi->i_ztype != ZONEFS_ZTYPE_SEQ) ++ if (zonefs_zone_is_cnv(zi)) + return; + + /* +@@ -177,14 +177,14 @@ static loff_t zonefs_check_zone_condition(struct inode *inode, + zonefs_warn(inode->i_sb, "inode %lu: read-only zone\n", + inode->i_ino); + zi->i_flags |= ZONEFS_ZONE_READONLY; +- if (zi->i_ztype == ZONEFS_ZTYPE_CNV) ++ if (zonefs_zone_is_cnv(zi)) + return zi->i_max_size; + return zi->i_wpoffset; + case BLK_ZONE_COND_FULL: + /* The write pointer of full zones is invalid. */ + return zi->i_max_size; + default: +- if (zi->i_ztype == ZONEFS_ZTYPE_CNV) ++ if (zonefs_zone_is_cnv(zi)) + return zi->i_max_size; + return (zone->wp - zone->start) << SECTOR_SHIFT; + } +@@ -260,7 +260,7 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx, + * In all cases, warn about inode size inconsistency and handle the + * IO error according to the zone condition and to the mount options. 
+ */ +- if (zi->i_ztype == ZONEFS_ZTYPE_SEQ && isize != data_size) ++ if (zonefs_zone_is_seq(zi) && isize != data_size) + zonefs_warn(sb, "inode %lu: invalid size %lld (should be %lld)\n", + inode->i_ino, isize, data_size); + +@@ -584,7 +584,9 @@ static int zonefs_init_file_inode(struct inode *inode, struct blk_zone *zone, + inode->i_ino = zone->start >> sbi->s_zone_sectors_shift; + inode->i_mode = S_IFREG | sbi->s_perm; + +- zi->i_ztype = type; ++ if (type == ZONEFS_ZTYPE_CNV) ++ zi->i_flags |= ZONEFS_ZONE_CNV; ++ + zi->i_zsector = zone->start; + zi->i_zone_size = zone->len << SECTOR_SHIFT; + if (zi->i_zone_size > bdev_zone_sectors(sb->s_bdev) << SECTOR_SHIFT && +diff --git a/fs/zonefs/zonefs.h b/fs/zonefs/zonefs.h +index 439096445ee53..1a225f74015a0 100644 +--- a/fs/zonefs/zonefs.h ++++ b/fs/zonefs/zonefs.h +@@ -44,6 +44,7 @@ static inline enum zonefs_ztype zonefs_zone_type(struct blk_zone *zone) + #define ZONEFS_ZONE_ACTIVE (1U << 2) + #define ZONEFS_ZONE_OFFLINE (1U << 3) + #define ZONEFS_ZONE_READONLY (1U << 4) ++#define ZONEFS_ZONE_CNV (1U << 31) + + /* + * In-memory inode data. +@@ -51,9 +52,6 @@ static inline enum zonefs_ztype zonefs_zone_type(struct blk_zone *zone) + struct zonefs_inode_info { + struct inode i_vnode; + +- /* File zone type */ +- enum zonefs_ztype i_ztype; +- + /* File zone start sector (512B unit) */ + sector_t i_zsector; + +@@ -91,6 +89,26 @@ static inline struct zonefs_inode_info *ZONEFS_I(struct inode *inode) + return container_of(inode, struct zonefs_inode_info, i_vnode); + } + ++static inline bool zonefs_zone_is_cnv(struct zonefs_inode_info *zi) ++{ ++ return zi->i_flags & ZONEFS_ZONE_CNV; ++} ++ ++static inline bool zonefs_zone_is_seq(struct zonefs_inode_info *zi) ++{ ++ return !zonefs_zone_is_cnv(zi); ++} ++ ++static inline bool zonefs_inode_is_cnv(struct inode *inode) ++{ ++ return zonefs_zone_is_cnv(ZONEFS_I(inode)); ++} ++ ++static inline bool zonefs_inode_is_seq(struct inode *inode) ++{ ++ return zonefs_zone_is_seq(ZONEFS_I(inode)); ++} ++ + /* + * On-disk super block (block 0). + */ +-- +2.39.2 + diff --git a/queue-6.2/zonefs-reorganize-code.patch b/queue-6.2/zonefs-reorganize-code.patch new file mode 100644 index 00000000000..b340322beed --- /dev/null +++ b/queue-6.2/zonefs-reorganize-code.patch @@ -0,0 +1,1990 @@ +From 05bc54065b57e4fb5ff76d916d5de3526881794c Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 25 Nov 2022 09:39:33 +0900 +Subject: zonefs: Reorganize code + +From: Damien Le Moal + +[ Upstream commit 4008e2a0b01aba982356fd15b128a47bf11bd9c7 ] + +Move all code related to zone file operations from super.c to the new +file.c file. Inode and zone management code remains in super.c. 
+ +Signed-off-by: Damien Le Moal +Reviewed-by: Johannes Thumshirn +Stable-dep-of: 88b170088ad2 ("zonefs: Fix error message in zonefs_file_dio_append()") +Signed-off-by: Sasha Levin +--- + fs/zonefs/Makefile | 2 +- + fs/zonefs/file.c | 874 ++++++++++++++++++++++++++++++++++++++++ + fs/zonefs/super.c | 973 +++------------------------------------------ + fs/zonefs/zonefs.h | 22 + + 4 files changed, 955 insertions(+), 916 deletions(-) + create mode 100644 fs/zonefs/file.c + +diff --git a/fs/zonefs/Makefile b/fs/zonefs/Makefile +index 9fe54f5319f22..645f7229de4a0 100644 +--- a/fs/zonefs/Makefile ++++ b/fs/zonefs/Makefile +@@ -3,4 +3,4 @@ ccflags-y += -I$(src) + + obj-$(CONFIG_ZONEFS_FS) += zonefs.o + +-zonefs-y := super.o sysfs.o ++zonefs-y := super.o file.o sysfs.o +diff --git a/fs/zonefs/file.c b/fs/zonefs/file.c +new file mode 100644 +index 0000000000000..ece0f3959b6d1 +--- /dev/null ++++ b/fs/zonefs/file.c +@@ -0,0 +1,874 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * Simple file system for zoned block devices exposing zones as files. ++ * ++ * Copyright (C) 2022 Western Digital Corporation or its affiliates. ++ */ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "zonefs.h" ++ ++#include "trace.h" ++ ++static int zonefs_read_iomap_begin(struct inode *inode, loff_t offset, ++ loff_t length, unsigned int flags, ++ struct iomap *iomap, struct iomap *srcmap) ++{ ++ struct zonefs_inode_info *zi = ZONEFS_I(inode); ++ struct super_block *sb = inode->i_sb; ++ loff_t isize; ++ ++ /* ++ * All blocks are always mapped below EOF. If reading past EOF, ++ * act as if there is a hole up to the file maximum size. ++ */ ++ mutex_lock(&zi->i_truncate_mutex); ++ iomap->bdev = inode->i_sb->s_bdev; ++ iomap->offset = ALIGN_DOWN(offset, sb->s_blocksize); ++ isize = i_size_read(inode); ++ if (iomap->offset >= isize) { ++ iomap->type = IOMAP_HOLE; ++ iomap->addr = IOMAP_NULL_ADDR; ++ iomap->length = length; ++ } else { ++ iomap->type = IOMAP_MAPPED; ++ iomap->addr = (zi->i_zsector << SECTOR_SHIFT) + iomap->offset; ++ iomap->length = isize - iomap->offset; ++ } ++ mutex_unlock(&zi->i_truncate_mutex); ++ ++ trace_zonefs_iomap_begin(inode, iomap); ++ ++ return 0; ++} ++ ++static const struct iomap_ops zonefs_read_iomap_ops = { ++ .iomap_begin = zonefs_read_iomap_begin, ++}; ++ ++static int zonefs_write_iomap_begin(struct inode *inode, loff_t offset, ++ loff_t length, unsigned int flags, ++ struct iomap *iomap, struct iomap *srcmap) ++{ ++ struct zonefs_inode_info *zi = ZONEFS_I(inode); ++ struct super_block *sb = inode->i_sb; ++ loff_t isize; ++ ++ /* All write I/Os should always be within the file maximum size */ ++ if (WARN_ON_ONCE(offset + length > zi->i_max_size)) ++ return -EIO; ++ ++ /* ++ * Sequential zones can only accept direct writes. This is already ++ * checked when writes are issued, so warn if we see a page writeback ++ * operation. ++ */ ++ if (WARN_ON_ONCE(zi->i_ztype == ZONEFS_ZTYPE_SEQ && ++ !(flags & IOMAP_DIRECT))) ++ return -EIO; ++ ++ /* ++ * For conventional zones, all blocks are always mapped. For sequential ++ * zones, all blocks after always mapped below the inode size (zone ++ * write pointer) and unwriten beyond. 
++ */ ++ mutex_lock(&zi->i_truncate_mutex); ++ iomap->bdev = inode->i_sb->s_bdev; ++ iomap->offset = ALIGN_DOWN(offset, sb->s_blocksize); ++ iomap->addr = (zi->i_zsector << SECTOR_SHIFT) + iomap->offset; ++ isize = i_size_read(inode); ++ if (iomap->offset >= isize) { ++ iomap->type = IOMAP_UNWRITTEN; ++ iomap->length = zi->i_max_size - iomap->offset; ++ } else { ++ iomap->type = IOMAP_MAPPED; ++ iomap->length = isize - iomap->offset; ++ } ++ mutex_unlock(&zi->i_truncate_mutex); ++ ++ trace_zonefs_iomap_begin(inode, iomap); ++ ++ return 0; ++} ++ ++static const struct iomap_ops zonefs_write_iomap_ops = { ++ .iomap_begin = zonefs_write_iomap_begin, ++}; ++ ++static int zonefs_read_folio(struct file *unused, struct folio *folio) ++{ ++ return iomap_read_folio(folio, &zonefs_read_iomap_ops); ++} ++ ++static void zonefs_readahead(struct readahead_control *rac) ++{ ++ iomap_readahead(rac, &zonefs_read_iomap_ops); ++} ++ ++/* ++ * Map blocks for page writeback. This is used only on conventional zone files, ++ * which implies that the page range can only be within the fixed inode size. ++ */ ++static int zonefs_write_map_blocks(struct iomap_writepage_ctx *wpc, ++ struct inode *inode, loff_t offset) ++{ ++ struct zonefs_inode_info *zi = ZONEFS_I(inode); ++ ++ if (WARN_ON_ONCE(zi->i_ztype != ZONEFS_ZTYPE_CNV)) ++ return -EIO; ++ if (WARN_ON_ONCE(offset >= i_size_read(inode))) ++ return -EIO; ++ ++ /* If the mapping is already OK, nothing needs to be done */ ++ if (offset >= wpc->iomap.offset && ++ offset < wpc->iomap.offset + wpc->iomap.length) ++ return 0; ++ ++ return zonefs_write_iomap_begin(inode, offset, zi->i_max_size - offset, ++ IOMAP_WRITE, &wpc->iomap, NULL); ++} ++ ++static const struct iomap_writeback_ops zonefs_writeback_ops = { ++ .map_blocks = zonefs_write_map_blocks, ++}; ++ ++static int zonefs_writepages(struct address_space *mapping, ++ struct writeback_control *wbc) ++{ ++ struct iomap_writepage_ctx wpc = { }; ++ ++ return iomap_writepages(mapping, wbc, &wpc, &zonefs_writeback_ops); ++} ++ ++static int zonefs_swap_activate(struct swap_info_struct *sis, ++ struct file *swap_file, sector_t *span) ++{ ++ struct inode *inode = file_inode(swap_file); ++ struct zonefs_inode_info *zi = ZONEFS_I(inode); ++ ++ if (zi->i_ztype != ZONEFS_ZTYPE_CNV) { ++ zonefs_err(inode->i_sb, ++ "swap file: not a conventional zone file\n"); ++ return -EINVAL; ++ } ++ ++ return iomap_swapfile_activate(sis, swap_file, span, ++ &zonefs_read_iomap_ops); ++} ++ ++const struct address_space_operations zonefs_file_aops = { ++ .read_folio = zonefs_read_folio, ++ .readahead = zonefs_readahead, ++ .writepages = zonefs_writepages, ++ .dirty_folio = filemap_dirty_folio, ++ .release_folio = iomap_release_folio, ++ .invalidate_folio = iomap_invalidate_folio, ++ .migrate_folio = filemap_migrate_folio, ++ .is_partially_uptodate = iomap_is_partially_uptodate, ++ .error_remove_page = generic_error_remove_page, ++ .direct_IO = noop_direct_IO, ++ .swap_activate = zonefs_swap_activate, ++}; ++ ++int zonefs_file_truncate(struct inode *inode, loff_t isize) ++{ ++ struct zonefs_inode_info *zi = ZONEFS_I(inode); ++ loff_t old_isize; ++ enum req_op op; ++ int ret = 0; ++ ++ /* ++ * Only sequential zone files can be truncated and truncation is allowed ++ * only down to a 0 size, which is equivalent to a zone reset, and to ++ * the maximum file size, which is equivalent to a zone finish. 
++ */ ++ if (zi->i_ztype != ZONEFS_ZTYPE_SEQ) ++ return -EPERM; ++ ++ if (!isize) ++ op = REQ_OP_ZONE_RESET; ++ else if (isize == zi->i_max_size) ++ op = REQ_OP_ZONE_FINISH; ++ else ++ return -EPERM; ++ ++ inode_dio_wait(inode); ++ ++ /* Serialize against page faults */ ++ filemap_invalidate_lock(inode->i_mapping); ++ ++ /* Serialize against zonefs_iomap_begin() */ ++ mutex_lock(&zi->i_truncate_mutex); ++ ++ old_isize = i_size_read(inode); ++ if (isize == old_isize) ++ goto unlock; ++ ++ ret = zonefs_zone_mgmt(inode, op); ++ if (ret) ++ goto unlock; ++ ++ /* ++ * If the mount option ZONEFS_MNTOPT_EXPLICIT_OPEN is set, ++ * take care of open zones. ++ */ ++ if (zi->i_flags & ZONEFS_ZONE_OPEN) { ++ /* ++ * Truncating a zone to EMPTY or FULL is the equivalent of ++ * closing the zone. For a truncation to 0, we need to ++ * re-open the zone to ensure new writes can be processed. ++ * For a truncation to the maximum file size, the zone is ++ * closed and writes cannot be accepted anymore, so clear ++ * the open flag. ++ */ ++ if (!isize) ++ ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_OPEN); ++ else ++ zi->i_flags &= ~ZONEFS_ZONE_OPEN; ++ } ++ ++ zonefs_update_stats(inode, isize); ++ truncate_setsize(inode, isize); ++ zi->i_wpoffset = isize; ++ zonefs_account_active(inode); ++ ++unlock: ++ mutex_unlock(&zi->i_truncate_mutex); ++ filemap_invalidate_unlock(inode->i_mapping); ++ ++ return ret; ++} ++ ++static int zonefs_file_fsync(struct file *file, loff_t start, loff_t end, ++ int datasync) ++{ ++ struct inode *inode = file_inode(file); ++ int ret = 0; ++ ++ if (unlikely(IS_IMMUTABLE(inode))) ++ return -EPERM; ++ ++ /* ++ * Since only direct writes are allowed in sequential files, page cache ++ * flush is needed only for conventional zone files. ++ */ ++ if (ZONEFS_I(inode)->i_ztype == ZONEFS_ZTYPE_CNV) ++ ret = file_write_and_wait_range(file, start, end); ++ if (!ret) ++ ret = blkdev_issue_flush(inode->i_sb->s_bdev); ++ ++ if (ret) ++ zonefs_io_error(inode, true); ++ ++ return ret; ++} ++ ++static vm_fault_t zonefs_filemap_page_mkwrite(struct vm_fault *vmf) ++{ ++ struct inode *inode = file_inode(vmf->vma->vm_file); ++ struct zonefs_inode_info *zi = ZONEFS_I(inode); ++ vm_fault_t ret; ++ ++ if (unlikely(IS_IMMUTABLE(inode))) ++ return VM_FAULT_SIGBUS; ++ ++ /* ++ * Sanity check: only conventional zone files can have shared ++ * writeable mappings. ++ */ ++ if (WARN_ON_ONCE(zi->i_ztype != ZONEFS_ZTYPE_CNV)) ++ return VM_FAULT_NOPAGE; ++ ++ sb_start_pagefault(inode->i_sb); ++ file_update_time(vmf->vma->vm_file); ++ ++ /* Serialize against truncates */ ++ filemap_invalidate_lock_shared(inode->i_mapping); ++ ret = iomap_page_mkwrite(vmf, &zonefs_write_iomap_ops); ++ filemap_invalidate_unlock_shared(inode->i_mapping); ++ ++ sb_end_pagefault(inode->i_sb); ++ return ret; ++} ++ ++static const struct vm_operations_struct zonefs_file_vm_ops = { ++ .fault = filemap_fault, ++ .map_pages = filemap_map_pages, ++ .page_mkwrite = zonefs_filemap_page_mkwrite, ++}; ++ ++static int zonefs_file_mmap(struct file *file, struct vm_area_struct *vma) ++{ ++ /* ++ * Conventional zones accept random writes, so their files can support ++ * shared writable mappings. For sequential zone files, only read ++ * mappings are possible since there are no guarantees for write ++ * ordering between msync() and page cache writeback. 
++ */ ++ if (ZONEFS_I(file_inode(file))->i_ztype == ZONEFS_ZTYPE_SEQ && ++ (vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) ++ return -EINVAL; ++ ++ file_accessed(file); ++ vma->vm_ops = &zonefs_file_vm_ops; ++ ++ return 0; ++} ++ ++static loff_t zonefs_file_llseek(struct file *file, loff_t offset, int whence) ++{ ++ loff_t isize = i_size_read(file_inode(file)); ++ ++ /* ++ * Seeks are limited to below the zone size for conventional zones ++ * and below the zone write pointer for sequential zones. In both ++ * cases, this limit is the inode size. ++ */ ++ return generic_file_llseek_size(file, offset, whence, isize, isize); ++} ++ ++static int zonefs_file_write_dio_end_io(struct kiocb *iocb, ssize_t size, ++ int error, unsigned int flags) ++{ ++ struct inode *inode = file_inode(iocb->ki_filp); ++ struct zonefs_inode_info *zi = ZONEFS_I(inode); ++ ++ if (error) { ++ zonefs_io_error(inode, true); ++ return error; ++ } ++ ++ if (size && zi->i_ztype != ZONEFS_ZTYPE_CNV) { ++ /* ++ * Note that we may be seeing completions out of order, ++ * but that is not a problem since a write completed ++ * successfully necessarily means that all preceding writes ++ * were also successful. So we can safely increase the inode ++ * size to the write end location. ++ */ ++ mutex_lock(&zi->i_truncate_mutex); ++ if (i_size_read(inode) < iocb->ki_pos + size) { ++ zonefs_update_stats(inode, iocb->ki_pos + size); ++ zonefs_i_size_write(inode, iocb->ki_pos + size); ++ } ++ mutex_unlock(&zi->i_truncate_mutex); ++ } ++ ++ return 0; ++} ++ ++static const struct iomap_dio_ops zonefs_write_dio_ops = { ++ .end_io = zonefs_file_write_dio_end_io, ++}; ++ ++static ssize_t zonefs_file_dio_append(struct kiocb *iocb, struct iov_iter *from) ++{ ++ struct inode *inode = file_inode(iocb->ki_filp); ++ struct zonefs_inode_info *zi = ZONEFS_I(inode); ++ struct block_device *bdev = inode->i_sb->s_bdev; ++ unsigned int max = bdev_max_zone_append_sectors(bdev); ++ struct bio *bio; ++ ssize_t size; ++ int nr_pages; ++ ssize_t ret; ++ ++ max = ALIGN_DOWN(max << SECTOR_SHIFT, inode->i_sb->s_blocksize); ++ iov_iter_truncate(from, max); ++ ++ nr_pages = iov_iter_npages(from, BIO_MAX_VECS); ++ if (!nr_pages) ++ return 0; ++ ++ bio = bio_alloc(bdev, nr_pages, ++ REQ_OP_ZONE_APPEND | REQ_SYNC | REQ_IDLE, GFP_NOFS); ++ bio->bi_iter.bi_sector = zi->i_zsector; ++ bio->bi_ioprio = iocb->ki_ioprio; ++ if (iocb_is_dsync(iocb)) ++ bio->bi_opf |= REQ_FUA; ++ ++ ret = bio_iov_iter_get_pages(bio, from); ++ if (unlikely(ret)) ++ goto out_release; ++ ++ size = bio->bi_iter.bi_size; ++ task_io_account_write(size); ++ ++ if (iocb->ki_flags & IOCB_HIPRI) ++ bio_set_polled(bio, iocb); ++ ++ ret = submit_bio_wait(bio); ++ ++ /* ++ * If the file zone was written underneath the file system, the zone ++ * write pointer may not be where we expect it to be, but the zone ++ * append write can still succeed. So check manually that we wrote where ++ * we intended to, that is, at zi->i_wpoffset. 
++ */ ++ if (!ret) { ++ sector_t wpsector = ++ zi->i_zsector + (zi->i_wpoffset >> SECTOR_SHIFT); ++ ++ if (bio->bi_iter.bi_sector != wpsector) { ++ zonefs_warn(inode->i_sb, ++ "Corrupted write pointer %llu for zone at %llu\n", ++ wpsector, zi->i_zsector); ++ ret = -EIO; ++ } ++ } ++ ++ zonefs_file_write_dio_end_io(iocb, size, ret, 0); ++ trace_zonefs_file_dio_append(inode, size, ret); ++ ++out_release: ++ bio_release_pages(bio, false); ++ bio_put(bio); ++ ++ if (ret >= 0) { ++ iocb->ki_pos += size; ++ return size; ++ } ++ ++ return ret; ++} ++ ++/* ++ * Do not exceed the LFS limits nor the file zone size. If pos is under the ++ * limit it becomes a short access. If it exceeds the limit, return -EFBIG. ++ */ ++static loff_t zonefs_write_check_limits(struct file *file, loff_t pos, ++ loff_t count) ++{ ++ struct inode *inode = file_inode(file); ++ struct zonefs_inode_info *zi = ZONEFS_I(inode); ++ loff_t limit = rlimit(RLIMIT_FSIZE); ++ loff_t max_size = zi->i_max_size; ++ ++ if (limit != RLIM_INFINITY) { ++ if (pos >= limit) { ++ send_sig(SIGXFSZ, current, 0); ++ return -EFBIG; ++ } ++ count = min(count, limit - pos); ++ } ++ ++ if (!(file->f_flags & O_LARGEFILE)) ++ max_size = min_t(loff_t, MAX_NON_LFS, max_size); ++ ++ if (unlikely(pos >= max_size)) ++ return -EFBIG; ++ ++ return min(count, max_size - pos); ++} ++ ++static ssize_t zonefs_write_checks(struct kiocb *iocb, struct iov_iter *from) ++{ ++ struct file *file = iocb->ki_filp; ++ struct inode *inode = file_inode(file); ++ struct zonefs_inode_info *zi = ZONEFS_I(inode); ++ loff_t count; ++ ++ if (IS_SWAPFILE(inode)) ++ return -ETXTBSY; ++ ++ if (!iov_iter_count(from)) ++ return 0; ++ ++ if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT)) ++ return -EINVAL; ++ ++ if (iocb->ki_flags & IOCB_APPEND) { ++ if (zi->i_ztype != ZONEFS_ZTYPE_SEQ) ++ return -EINVAL; ++ mutex_lock(&zi->i_truncate_mutex); ++ iocb->ki_pos = zi->i_wpoffset; ++ mutex_unlock(&zi->i_truncate_mutex); ++ } ++ ++ count = zonefs_write_check_limits(file, iocb->ki_pos, ++ iov_iter_count(from)); ++ if (count < 0) ++ return count; ++ ++ iov_iter_truncate(from, count); ++ return iov_iter_count(from); ++} ++ ++/* ++ * Handle direct writes. For sequential zone files, this is the only possible ++ * write path. For these files, check that the user is issuing writes ++ * sequentially from the end of the file. This code assumes that the block layer ++ * delivers write requests to the device in sequential order. This is always the ++ * case if a block IO scheduler implementing the ELEVATOR_F_ZBD_SEQ_WRITE ++ * elevator feature is being used (e.g. mq-deadline). The block layer always ++ * automatically select such an elevator for zoned block devices during the ++ * device initialization. ++ */ ++static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from) ++{ ++ struct inode *inode = file_inode(iocb->ki_filp); ++ struct zonefs_inode_info *zi = ZONEFS_I(inode); ++ struct super_block *sb = inode->i_sb; ++ bool sync = is_sync_kiocb(iocb); ++ bool append = false; ++ ssize_t ret, count; ++ ++ /* ++ * For async direct IOs to sequential zone files, refuse IOCB_NOWAIT ++ * as this can cause write reordering (e.g. the first aio gets EAGAIN ++ * on the inode lock but the second goes through but is now unaligned). 
++ */ ++ if (zi->i_ztype == ZONEFS_ZTYPE_SEQ && !sync && ++ (iocb->ki_flags & IOCB_NOWAIT)) ++ return -EOPNOTSUPP; ++ ++ if (iocb->ki_flags & IOCB_NOWAIT) { ++ if (!inode_trylock(inode)) ++ return -EAGAIN; ++ } else { ++ inode_lock(inode); ++ } ++ ++ count = zonefs_write_checks(iocb, from); ++ if (count <= 0) { ++ ret = count; ++ goto inode_unlock; ++ } ++ ++ if ((iocb->ki_pos | count) & (sb->s_blocksize - 1)) { ++ ret = -EINVAL; ++ goto inode_unlock; ++ } ++ ++ /* Enforce sequential writes (append only) in sequential zones */ ++ if (zi->i_ztype == ZONEFS_ZTYPE_SEQ) { ++ mutex_lock(&zi->i_truncate_mutex); ++ if (iocb->ki_pos != zi->i_wpoffset) { ++ mutex_unlock(&zi->i_truncate_mutex); ++ ret = -EINVAL; ++ goto inode_unlock; ++ } ++ mutex_unlock(&zi->i_truncate_mutex); ++ append = sync; ++ } ++ ++ if (append) ++ ret = zonefs_file_dio_append(iocb, from); ++ else ++ ret = iomap_dio_rw(iocb, from, &zonefs_write_iomap_ops, ++ &zonefs_write_dio_ops, 0, NULL, 0); ++ if (zi->i_ztype == ZONEFS_ZTYPE_SEQ && ++ (ret > 0 || ret == -EIOCBQUEUED)) { ++ if (ret > 0) ++ count = ret; ++ ++ /* ++ * Update the zone write pointer offset assuming the write ++ * operation succeeded. If it did not, the error recovery path ++ * will correct it. Also do active seq file accounting. ++ */ ++ mutex_lock(&zi->i_truncate_mutex); ++ zi->i_wpoffset += count; ++ zonefs_account_active(inode); ++ mutex_unlock(&zi->i_truncate_mutex); ++ } ++ ++inode_unlock: ++ inode_unlock(inode); ++ ++ return ret; ++} ++ ++static ssize_t zonefs_file_buffered_write(struct kiocb *iocb, ++ struct iov_iter *from) ++{ ++ struct inode *inode = file_inode(iocb->ki_filp); ++ struct zonefs_inode_info *zi = ZONEFS_I(inode); ++ ssize_t ret; ++ ++ /* ++ * Direct IO writes are mandatory for sequential zone files so that the ++ * write IO issuing order is preserved. 
++ */ ++ if (zi->i_ztype != ZONEFS_ZTYPE_CNV) ++ return -EIO; ++ ++ if (iocb->ki_flags & IOCB_NOWAIT) { ++ if (!inode_trylock(inode)) ++ return -EAGAIN; ++ } else { ++ inode_lock(inode); ++ } ++ ++ ret = zonefs_write_checks(iocb, from); ++ if (ret <= 0) ++ goto inode_unlock; ++ ++ ret = iomap_file_buffered_write(iocb, from, &zonefs_write_iomap_ops); ++ if (ret > 0) ++ iocb->ki_pos += ret; ++ else if (ret == -EIO) ++ zonefs_io_error(inode, true); ++ ++inode_unlock: ++ inode_unlock(inode); ++ if (ret > 0) ++ ret = generic_write_sync(iocb, ret); ++ ++ return ret; ++} ++ ++static ssize_t zonefs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) ++{ ++ struct inode *inode = file_inode(iocb->ki_filp); ++ ++ if (unlikely(IS_IMMUTABLE(inode))) ++ return -EPERM; ++ ++ if (sb_rdonly(inode->i_sb)) ++ return -EROFS; ++ ++ /* Write operations beyond the zone size are not allowed */ ++ if (iocb->ki_pos >= ZONEFS_I(inode)->i_max_size) ++ return -EFBIG; ++ ++ if (iocb->ki_flags & IOCB_DIRECT) { ++ ssize_t ret = zonefs_file_dio_write(iocb, from); ++ ++ if (ret != -ENOTBLK) ++ return ret; ++ } ++ ++ return zonefs_file_buffered_write(iocb, from); ++} ++ ++static int zonefs_file_read_dio_end_io(struct kiocb *iocb, ssize_t size, ++ int error, unsigned int flags) ++{ ++ if (error) { ++ zonefs_io_error(file_inode(iocb->ki_filp), false); ++ return error; ++ } ++ ++ return 0; ++} ++ ++static const struct iomap_dio_ops zonefs_read_dio_ops = { ++ .end_io = zonefs_file_read_dio_end_io, ++}; ++ ++static ssize_t zonefs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) ++{ ++ struct inode *inode = file_inode(iocb->ki_filp); ++ struct zonefs_inode_info *zi = ZONEFS_I(inode); ++ struct super_block *sb = inode->i_sb; ++ loff_t isize; ++ ssize_t ret; ++ ++ /* Offline zones cannot be read */ ++ if (unlikely(IS_IMMUTABLE(inode) && !(inode->i_mode & 0777))) ++ return -EPERM; ++ ++ if (iocb->ki_pos >= zi->i_max_size) ++ return 0; ++ ++ if (iocb->ki_flags & IOCB_NOWAIT) { ++ if (!inode_trylock_shared(inode)) ++ return -EAGAIN; ++ } else { ++ inode_lock_shared(inode); ++ } ++ ++ /* Limit read operations to written data */ ++ mutex_lock(&zi->i_truncate_mutex); ++ isize = i_size_read(inode); ++ if (iocb->ki_pos >= isize) { ++ mutex_unlock(&zi->i_truncate_mutex); ++ ret = 0; ++ goto inode_unlock; ++ } ++ iov_iter_truncate(to, isize - iocb->ki_pos); ++ mutex_unlock(&zi->i_truncate_mutex); ++ ++ if (iocb->ki_flags & IOCB_DIRECT) { ++ size_t count = iov_iter_count(to); ++ ++ if ((iocb->ki_pos | count) & (sb->s_blocksize - 1)) { ++ ret = -EINVAL; ++ goto inode_unlock; ++ } ++ file_accessed(iocb->ki_filp); ++ ret = iomap_dio_rw(iocb, to, &zonefs_read_iomap_ops, ++ &zonefs_read_dio_ops, 0, NULL, 0); ++ } else { ++ ret = generic_file_read_iter(iocb, to); ++ if (ret == -EIO) ++ zonefs_io_error(inode, false); ++ } ++ ++inode_unlock: ++ inode_unlock_shared(inode); ++ ++ return ret; ++} ++ ++/* ++ * Write open accounting is done only for sequential files. 
++ */ ++static inline bool zonefs_seq_file_need_wro(struct inode *inode, ++ struct file *file) ++{ ++ struct zonefs_inode_info *zi = ZONEFS_I(inode); ++ ++ if (zi->i_ztype != ZONEFS_ZTYPE_SEQ) ++ return false; ++ ++ if (!(file->f_mode & FMODE_WRITE)) ++ return false; ++ ++ return true; ++} ++ ++static int zonefs_seq_file_write_open(struct inode *inode) ++{ ++ struct zonefs_inode_info *zi = ZONEFS_I(inode); ++ int ret = 0; ++ ++ mutex_lock(&zi->i_truncate_mutex); ++ ++ if (!zi->i_wr_refcnt) { ++ struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb); ++ unsigned int wro = atomic_inc_return(&sbi->s_wro_seq_files); ++ ++ if (sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN) { ++ ++ if (sbi->s_max_wro_seq_files ++ && wro > sbi->s_max_wro_seq_files) { ++ atomic_dec(&sbi->s_wro_seq_files); ++ ret = -EBUSY; ++ goto unlock; ++ } ++ ++ if (i_size_read(inode) < zi->i_max_size) { ++ ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_OPEN); ++ if (ret) { ++ atomic_dec(&sbi->s_wro_seq_files); ++ goto unlock; ++ } ++ zi->i_flags |= ZONEFS_ZONE_OPEN; ++ zonefs_account_active(inode); ++ } ++ } ++ } ++ ++ zi->i_wr_refcnt++; ++ ++unlock: ++ mutex_unlock(&zi->i_truncate_mutex); ++ ++ return ret; ++} ++ ++static int zonefs_file_open(struct inode *inode, struct file *file) ++{ ++ int ret; ++ ++ ret = generic_file_open(inode, file); ++ if (ret) ++ return ret; ++ ++ if (zonefs_seq_file_need_wro(inode, file)) ++ return zonefs_seq_file_write_open(inode); ++ ++ return 0; ++} ++ ++static void zonefs_seq_file_write_close(struct inode *inode) ++{ ++ struct zonefs_inode_info *zi = ZONEFS_I(inode); ++ struct super_block *sb = inode->i_sb; ++ struct zonefs_sb_info *sbi = ZONEFS_SB(sb); ++ int ret = 0; ++ ++ mutex_lock(&zi->i_truncate_mutex); ++ ++ zi->i_wr_refcnt--; ++ if (zi->i_wr_refcnt) ++ goto unlock; ++ ++ /* ++ * The file zone may not be open anymore (e.g. the file was truncated to ++ * its maximum size or it was fully written). For this case, we only ++ * need to decrement the write open count. ++ */ ++ if (zi->i_flags & ZONEFS_ZONE_OPEN) { ++ ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_CLOSE); ++ if (ret) { ++ __zonefs_io_error(inode, false); ++ /* ++ * Leaving zones explicitly open may lead to a state ++ * where most zones cannot be written (zone resources ++ * exhausted). So take preventive action by remounting ++ * read-only. ++ */ ++ if (zi->i_flags & ZONEFS_ZONE_OPEN && ++ !(sb->s_flags & SB_RDONLY)) { ++ zonefs_warn(sb, ++ "closing zone at %llu failed %d\n", ++ zi->i_zsector, ret); ++ zonefs_warn(sb, ++ "remounting filesystem read-only\n"); ++ sb->s_flags |= SB_RDONLY; ++ } ++ goto unlock; ++ } ++ ++ zi->i_flags &= ~ZONEFS_ZONE_OPEN; ++ zonefs_account_active(inode); ++ } ++ ++ atomic_dec(&sbi->s_wro_seq_files); ++ ++unlock: ++ mutex_unlock(&zi->i_truncate_mutex); ++} ++ ++static int zonefs_file_release(struct inode *inode, struct file *file) ++{ ++ /* ++ * If we explicitly open a zone we must close it again as well, but the ++ * zone management operation can fail (either due to an IO error or as ++ * the zone has gone offline or read-only). Make sure we don't fail the ++ * close(2) for user-space. 
++ */ ++ if (zonefs_seq_file_need_wro(inode, file)) ++ zonefs_seq_file_write_close(inode); ++ ++ return 0; ++} ++ ++const struct file_operations zonefs_file_operations = { ++ .open = zonefs_file_open, ++ .release = zonefs_file_release, ++ .fsync = zonefs_file_fsync, ++ .mmap = zonefs_file_mmap, ++ .llseek = zonefs_file_llseek, ++ .read_iter = zonefs_file_read_iter, ++ .write_iter = zonefs_file_write_iter, ++ .splice_read = generic_file_splice_read, ++ .splice_write = iter_file_splice_write, ++ .iopoll = iocb_bio_iopoll, ++}; +diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c +index a9c5c3f720adf..e808276b88018 100644 +--- a/fs/zonefs/super.c ++++ b/fs/zonefs/super.c +@@ -30,7 +30,7 @@ + /* + * Manage the active zone count. Called with zi->i_truncate_mutex held. + */ +-static void zonefs_account_active(struct inode *inode) ++void zonefs_account_active(struct inode *inode) + { + struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb); + struct zonefs_inode_info *zi = ZONEFS_I(inode); +@@ -68,7 +68,7 @@ static void zonefs_account_active(struct inode *inode) + } + } + +-static inline int zonefs_zone_mgmt(struct inode *inode, enum req_op op) ++int zonefs_zone_mgmt(struct inode *inode, enum req_op op) + { + struct zonefs_inode_info *zi = ZONEFS_I(inode); + int ret; +@@ -99,7 +99,7 @@ static inline int zonefs_zone_mgmt(struct inode *inode, enum req_op op) + return 0; + } + +-static inline void zonefs_i_size_write(struct inode *inode, loff_t isize) ++void zonefs_i_size_write(struct inode *inode, loff_t isize) + { + struct zonefs_inode_info *zi = ZONEFS_I(inode); + +@@ -117,167 +117,7 @@ static inline void zonefs_i_size_write(struct inode *inode, loff_t isize) + } + } + +-static int zonefs_read_iomap_begin(struct inode *inode, loff_t offset, +- loff_t length, unsigned int flags, +- struct iomap *iomap, struct iomap *srcmap) +-{ +- struct zonefs_inode_info *zi = ZONEFS_I(inode); +- struct super_block *sb = inode->i_sb; +- loff_t isize; +- +- /* +- * All blocks are always mapped below EOF. If reading past EOF, +- * act as if there is a hole up to the file maximum size. +- */ +- mutex_lock(&zi->i_truncate_mutex); +- iomap->bdev = inode->i_sb->s_bdev; +- iomap->offset = ALIGN_DOWN(offset, sb->s_blocksize); +- isize = i_size_read(inode); +- if (iomap->offset >= isize) { +- iomap->type = IOMAP_HOLE; +- iomap->addr = IOMAP_NULL_ADDR; +- iomap->length = length; +- } else { +- iomap->type = IOMAP_MAPPED; +- iomap->addr = (zi->i_zsector << SECTOR_SHIFT) + iomap->offset; +- iomap->length = isize - iomap->offset; +- } +- mutex_unlock(&zi->i_truncate_mutex); +- +- trace_zonefs_iomap_begin(inode, iomap); +- +- return 0; +-} +- +-static const struct iomap_ops zonefs_read_iomap_ops = { +- .iomap_begin = zonefs_read_iomap_begin, +-}; +- +-static int zonefs_write_iomap_begin(struct inode *inode, loff_t offset, +- loff_t length, unsigned int flags, +- struct iomap *iomap, struct iomap *srcmap) +-{ +- struct zonefs_inode_info *zi = ZONEFS_I(inode); +- struct super_block *sb = inode->i_sb; +- loff_t isize; +- +- /* All write I/Os should always be within the file maximum size */ +- if (WARN_ON_ONCE(offset + length > zi->i_max_size)) +- return -EIO; +- +- /* +- * Sequential zones can only accept direct writes. This is already +- * checked when writes are issued, so warn if we see a page writeback +- * operation. +- */ +- if (WARN_ON_ONCE(zi->i_ztype == ZONEFS_ZTYPE_SEQ && +- !(flags & IOMAP_DIRECT))) +- return -EIO; +- +- /* +- * For conventional zones, all blocks are always mapped. 
For sequential +- * zones, all blocks after always mapped below the inode size (zone +- * write pointer) and unwriten beyond. +- */ +- mutex_lock(&zi->i_truncate_mutex); +- iomap->bdev = inode->i_sb->s_bdev; +- iomap->offset = ALIGN_DOWN(offset, sb->s_blocksize); +- iomap->addr = (zi->i_zsector << SECTOR_SHIFT) + iomap->offset; +- isize = i_size_read(inode); +- if (iomap->offset >= isize) { +- iomap->type = IOMAP_UNWRITTEN; +- iomap->length = zi->i_max_size - iomap->offset; +- } else { +- iomap->type = IOMAP_MAPPED; +- iomap->length = isize - iomap->offset; +- } +- mutex_unlock(&zi->i_truncate_mutex); +- +- trace_zonefs_iomap_begin(inode, iomap); +- +- return 0; +-} +- +-static const struct iomap_ops zonefs_write_iomap_ops = { +- .iomap_begin = zonefs_write_iomap_begin, +-}; +- +-static int zonefs_read_folio(struct file *unused, struct folio *folio) +-{ +- return iomap_read_folio(folio, &zonefs_read_iomap_ops); +-} +- +-static void zonefs_readahead(struct readahead_control *rac) +-{ +- iomap_readahead(rac, &zonefs_read_iomap_ops); +-} +- +-/* +- * Map blocks for page writeback. This is used only on conventional zone files, +- * which implies that the page range can only be within the fixed inode size. +- */ +-static int zonefs_write_map_blocks(struct iomap_writepage_ctx *wpc, +- struct inode *inode, loff_t offset) +-{ +- struct zonefs_inode_info *zi = ZONEFS_I(inode); +- +- if (WARN_ON_ONCE(zi->i_ztype != ZONEFS_ZTYPE_CNV)) +- return -EIO; +- if (WARN_ON_ONCE(offset >= i_size_read(inode))) +- return -EIO; +- +- /* If the mapping is already OK, nothing needs to be done */ +- if (offset >= wpc->iomap.offset && +- offset < wpc->iomap.offset + wpc->iomap.length) +- return 0; +- +- return zonefs_write_iomap_begin(inode, offset, zi->i_max_size - offset, +- IOMAP_WRITE, &wpc->iomap, NULL); +-} +- +-static const struct iomap_writeback_ops zonefs_writeback_ops = { +- .map_blocks = zonefs_write_map_blocks, +-}; +- +-static int zonefs_writepages(struct address_space *mapping, +- struct writeback_control *wbc) +-{ +- struct iomap_writepage_ctx wpc = { }; +- +- return iomap_writepages(mapping, wbc, &wpc, &zonefs_writeback_ops); +-} +- +-static int zonefs_swap_activate(struct swap_info_struct *sis, +- struct file *swap_file, sector_t *span) +-{ +- struct inode *inode = file_inode(swap_file); +- struct zonefs_inode_info *zi = ZONEFS_I(inode); +- +- if (zi->i_ztype != ZONEFS_ZTYPE_CNV) { +- zonefs_err(inode->i_sb, +- "swap file: not a conventional zone file\n"); +- return -EINVAL; +- } +- +- return iomap_swapfile_activate(sis, swap_file, span, +- &zonefs_read_iomap_ops); +-} +- +-static const struct address_space_operations zonefs_file_aops = { +- .read_folio = zonefs_read_folio, +- .readahead = zonefs_readahead, +- .writepages = zonefs_writepages, +- .dirty_folio = filemap_dirty_folio, +- .release_folio = iomap_release_folio, +- .invalidate_folio = iomap_invalidate_folio, +- .migrate_folio = filemap_migrate_folio, +- .is_partially_uptodate = iomap_is_partially_uptodate, +- .error_remove_page = generic_error_remove_page, +- .direct_IO = noop_direct_IO, +- .swap_activate = zonefs_swap_activate, +-}; +- +-static void zonefs_update_stats(struct inode *inode, loff_t new_isize) ++void zonefs_update_stats(struct inode *inode, loff_t new_isize) + { + struct super_block *sb = inode->i_sb; + struct zonefs_sb_info *sbi = ZONEFS_SB(sb); +@@ -487,7 +327,7 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx, + * eventually correct the file size and zonefs inode write pointer offset + * (which can 
be out of sync with the drive due to partial write failures). + */ +-static void __zonefs_io_error(struct inode *inode, bool write) ++void __zonefs_io_error(struct inode *inode, bool write) + { + struct zonefs_inode_info *zi = ZONEFS_I(inode); + struct super_block *sb = inode->i_sb; +@@ -526,749 +366,6 @@ static void __zonefs_io_error(struct inode *inode, bool write) + memalloc_noio_restore(noio_flag); + } + +-static void zonefs_io_error(struct inode *inode, bool write) +-{ +- struct zonefs_inode_info *zi = ZONEFS_I(inode); +- +- mutex_lock(&zi->i_truncate_mutex); +- __zonefs_io_error(inode, write); +- mutex_unlock(&zi->i_truncate_mutex); +-} +- +-static int zonefs_file_truncate(struct inode *inode, loff_t isize) +-{ +- struct zonefs_inode_info *zi = ZONEFS_I(inode); +- loff_t old_isize; +- enum req_op op; +- int ret = 0; +- +- /* +- * Only sequential zone files can be truncated and truncation is allowed +- * only down to a 0 size, which is equivalent to a zone reset, and to +- * the maximum file size, which is equivalent to a zone finish. +- */ +- if (zi->i_ztype != ZONEFS_ZTYPE_SEQ) +- return -EPERM; +- +- if (!isize) +- op = REQ_OP_ZONE_RESET; +- else if (isize == zi->i_max_size) +- op = REQ_OP_ZONE_FINISH; +- else +- return -EPERM; +- +- inode_dio_wait(inode); +- +- /* Serialize against page faults */ +- filemap_invalidate_lock(inode->i_mapping); +- +- /* Serialize against zonefs_iomap_begin() */ +- mutex_lock(&zi->i_truncate_mutex); +- +- old_isize = i_size_read(inode); +- if (isize == old_isize) +- goto unlock; +- +- ret = zonefs_zone_mgmt(inode, op); +- if (ret) +- goto unlock; +- +- /* +- * If the mount option ZONEFS_MNTOPT_EXPLICIT_OPEN is set, +- * take care of open zones. +- */ +- if (zi->i_flags & ZONEFS_ZONE_OPEN) { +- /* +- * Truncating a zone to EMPTY or FULL is the equivalent of +- * closing the zone. For a truncation to 0, we need to +- * re-open the zone to ensure new writes can be processed. +- * For a truncation to the maximum file size, the zone is +- * closed and writes cannot be accepted anymore, so clear +- * the open flag. +- */ +- if (!isize) +- ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_OPEN); +- else +- zi->i_flags &= ~ZONEFS_ZONE_OPEN; +- } +- +- zonefs_update_stats(inode, isize); +- truncate_setsize(inode, isize); +- zi->i_wpoffset = isize; +- zonefs_account_active(inode); +- +-unlock: +- mutex_unlock(&zi->i_truncate_mutex); +- filemap_invalidate_unlock(inode->i_mapping); +- +- return ret; +-} +- +-static int zonefs_inode_setattr(struct user_namespace *mnt_userns, +- struct dentry *dentry, struct iattr *iattr) +-{ +- struct inode *inode = d_inode(dentry); +- int ret; +- +- if (unlikely(IS_IMMUTABLE(inode))) +- return -EPERM; +- +- ret = setattr_prepare(&init_user_ns, dentry, iattr); +- if (ret) +- return ret; +- +- /* +- * Since files and directories cannot be created nor deleted, do not +- * allow setting any write attributes on the sub-directories grouping +- * files by zone type. 
+- */ +- if ((iattr->ia_valid & ATTR_MODE) && S_ISDIR(inode->i_mode) && +- (iattr->ia_mode & 0222)) +- return -EPERM; +- +- if (((iattr->ia_valid & ATTR_UID) && +- !uid_eq(iattr->ia_uid, inode->i_uid)) || +- ((iattr->ia_valid & ATTR_GID) && +- !gid_eq(iattr->ia_gid, inode->i_gid))) { +- ret = dquot_transfer(mnt_userns, inode, iattr); +- if (ret) +- return ret; +- } +- +- if (iattr->ia_valid & ATTR_SIZE) { +- ret = zonefs_file_truncate(inode, iattr->ia_size); +- if (ret) +- return ret; +- } +- +- setattr_copy(&init_user_ns, inode, iattr); +- +- return 0; +-} +- +-static const struct inode_operations zonefs_file_inode_operations = { +- .setattr = zonefs_inode_setattr, +-}; +- +-static int zonefs_file_fsync(struct file *file, loff_t start, loff_t end, +- int datasync) +-{ +- struct inode *inode = file_inode(file); +- int ret = 0; +- +- if (unlikely(IS_IMMUTABLE(inode))) +- return -EPERM; +- +- /* +- * Since only direct writes are allowed in sequential files, page cache +- * flush is needed only for conventional zone files. +- */ +- if (ZONEFS_I(inode)->i_ztype == ZONEFS_ZTYPE_CNV) +- ret = file_write_and_wait_range(file, start, end); +- if (!ret) +- ret = blkdev_issue_flush(inode->i_sb->s_bdev); +- +- if (ret) +- zonefs_io_error(inode, true); +- +- return ret; +-} +- +-static vm_fault_t zonefs_filemap_page_mkwrite(struct vm_fault *vmf) +-{ +- struct inode *inode = file_inode(vmf->vma->vm_file); +- struct zonefs_inode_info *zi = ZONEFS_I(inode); +- vm_fault_t ret; +- +- if (unlikely(IS_IMMUTABLE(inode))) +- return VM_FAULT_SIGBUS; +- +- /* +- * Sanity check: only conventional zone files can have shared +- * writeable mappings. +- */ +- if (WARN_ON_ONCE(zi->i_ztype != ZONEFS_ZTYPE_CNV)) +- return VM_FAULT_NOPAGE; +- +- sb_start_pagefault(inode->i_sb); +- file_update_time(vmf->vma->vm_file); +- +- /* Serialize against truncates */ +- filemap_invalidate_lock_shared(inode->i_mapping); +- ret = iomap_page_mkwrite(vmf, &zonefs_write_iomap_ops); +- filemap_invalidate_unlock_shared(inode->i_mapping); +- +- sb_end_pagefault(inode->i_sb); +- return ret; +-} +- +-static const struct vm_operations_struct zonefs_file_vm_ops = { +- .fault = filemap_fault, +- .map_pages = filemap_map_pages, +- .page_mkwrite = zonefs_filemap_page_mkwrite, +-}; +- +-static int zonefs_file_mmap(struct file *file, struct vm_area_struct *vma) +-{ +- /* +- * Conventional zones accept random writes, so their files can support +- * shared writable mappings. For sequential zone files, only read +- * mappings are possible since there are no guarantees for write +- * ordering between msync() and page cache writeback. +- */ +- if (ZONEFS_I(file_inode(file))->i_ztype == ZONEFS_ZTYPE_SEQ && +- (vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) +- return -EINVAL; +- +- file_accessed(file); +- vma->vm_ops = &zonefs_file_vm_ops; +- +- return 0; +-} +- +-static loff_t zonefs_file_llseek(struct file *file, loff_t offset, int whence) +-{ +- loff_t isize = i_size_read(file_inode(file)); +- +- /* +- * Seeks are limited to below the zone size for conventional zones +- * and below the zone write pointer for sequential zones. In both +- * cases, this limit is the inode size. 
+- */ +- return generic_file_llseek_size(file, offset, whence, isize, isize); +-} +- +-static int zonefs_file_write_dio_end_io(struct kiocb *iocb, ssize_t size, +- int error, unsigned int flags) +-{ +- struct inode *inode = file_inode(iocb->ki_filp); +- struct zonefs_inode_info *zi = ZONEFS_I(inode); +- +- if (error) { +- zonefs_io_error(inode, true); +- return error; +- } +- +- if (size && zi->i_ztype != ZONEFS_ZTYPE_CNV) { +- /* +- * Note that we may be seeing completions out of order, +- * but that is not a problem since a write completed +- * successfully necessarily means that all preceding writes +- * were also successful. So we can safely increase the inode +- * size to the write end location. +- */ +- mutex_lock(&zi->i_truncate_mutex); +- if (i_size_read(inode) < iocb->ki_pos + size) { +- zonefs_update_stats(inode, iocb->ki_pos + size); +- zonefs_i_size_write(inode, iocb->ki_pos + size); +- } +- mutex_unlock(&zi->i_truncate_mutex); +- } +- +- return 0; +-} +- +-static const struct iomap_dio_ops zonefs_write_dio_ops = { +- .end_io = zonefs_file_write_dio_end_io, +-}; +- +-static ssize_t zonefs_file_dio_append(struct kiocb *iocb, struct iov_iter *from) +-{ +- struct inode *inode = file_inode(iocb->ki_filp); +- struct zonefs_inode_info *zi = ZONEFS_I(inode); +- struct block_device *bdev = inode->i_sb->s_bdev; +- unsigned int max = bdev_max_zone_append_sectors(bdev); +- struct bio *bio; +- ssize_t size; +- int nr_pages; +- ssize_t ret; +- +- max = ALIGN_DOWN(max << SECTOR_SHIFT, inode->i_sb->s_blocksize); +- iov_iter_truncate(from, max); +- +- nr_pages = iov_iter_npages(from, BIO_MAX_VECS); +- if (!nr_pages) +- return 0; +- +- bio = bio_alloc(bdev, nr_pages, +- REQ_OP_ZONE_APPEND | REQ_SYNC | REQ_IDLE, GFP_NOFS); +- bio->bi_iter.bi_sector = zi->i_zsector; +- bio->bi_ioprio = iocb->ki_ioprio; +- if (iocb_is_dsync(iocb)) +- bio->bi_opf |= REQ_FUA; +- +- ret = bio_iov_iter_get_pages(bio, from); +- if (unlikely(ret)) +- goto out_release; +- +- size = bio->bi_iter.bi_size; +- task_io_account_write(size); +- +- if (iocb->ki_flags & IOCB_HIPRI) +- bio_set_polled(bio, iocb); +- +- ret = submit_bio_wait(bio); +- +- /* +- * If the file zone was written underneath the file system, the zone +- * write pointer may not be where we expect it to be, but the zone +- * append write can still succeed. So check manually that we wrote where +- * we intended to, that is, at zi->i_wpoffset. +- */ +- if (!ret) { +- sector_t wpsector = +- zi->i_zsector + (zi->i_wpoffset >> SECTOR_SHIFT); +- +- if (bio->bi_iter.bi_sector != wpsector) { +- zonefs_warn(inode->i_sb, +- "Corrupted write pointer %llu for zone at %llu\n", +- wpsector, zi->i_zsector); +- ret = -EIO; +- } +- } +- +- zonefs_file_write_dio_end_io(iocb, size, ret, 0); +- trace_zonefs_file_dio_append(inode, size, ret); +- +-out_release: +- bio_release_pages(bio, false); +- bio_put(bio); +- +- if (ret >= 0) { +- iocb->ki_pos += size; +- return size; +- } +- +- return ret; +-} +- +-/* +- * Do not exceed the LFS limits nor the file zone size. If pos is under the +- * limit it becomes a short access. If it exceeds the limit, return -EFBIG. 
+- */ +-static loff_t zonefs_write_check_limits(struct file *file, loff_t pos, +- loff_t count) +-{ +- struct inode *inode = file_inode(file); +- struct zonefs_inode_info *zi = ZONEFS_I(inode); +- loff_t limit = rlimit(RLIMIT_FSIZE); +- loff_t max_size = zi->i_max_size; +- +- if (limit != RLIM_INFINITY) { +- if (pos >= limit) { +- send_sig(SIGXFSZ, current, 0); +- return -EFBIG; +- } +- count = min(count, limit - pos); +- } +- +- if (!(file->f_flags & O_LARGEFILE)) +- max_size = min_t(loff_t, MAX_NON_LFS, max_size); +- +- if (unlikely(pos >= max_size)) +- return -EFBIG; +- +- return min(count, max_size - pos); +-} +- +-static ssize_t zonefs_write_checks(struct kiocb *iocb, struct iov_iter *from) +-{ +- struct file *file = iocb->ki_filp; +- struct inode *inode = file_inode(file); +- struct zonefs_inode_info *zi = ZONEFS_I(inode); +- loff_t count; +- +- if (IS_SWAPFILE(inode)) +- return -ETXTBSY; +- +- if (!iov_iter_count(from)) +- return 0; +- +- if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT)) +- return -EINVAL; +- +- if (iocb->ki_flags & IOCB_APPEND) { +- if (zi->i_ztype != ZONEFS_ZTYPE_SEQ) +- return -EINVAL; +- mutex_lock(&zi->i_truncate_mutex); +- iocb->ki_pos = zi->i_wpoffset; +- mutex_unlock(&zi->i_truncate_mutex); +- } +- +- count = zonefs_write_check_limits(file, iocb->ki_pos, +- iov_iter_count(from)); +- if (count < 0) +- return count; +- +- iov_iter_truncate(from, count); +- return iov_iter_count(from); +-} +- +-/* +- * Handle direct writes. For sequential zone files, this is the only possible +- * write path. For these files, check that the user is issuing writes +- * sequentially from the end of the file. This code assumes that the block layer +- * delivers write requests to the device in sequential order. This is always the +- * case if a block IO scheduler implementing the ELEVATOR_F_ZBD_SEQ_WRITE +- * elevator feature is being used (e.g. mq-deadline). The block layer always +- * automatically select such an elevator for zoned block devices during the +- * device initialization. +- */ +-static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from) +-{ +- struct inode *inode = file_inode(iocb->ki_filp); +- struct zonefs_inode_info *zi = ZONEFS_I(inode); +- struct super_block *sb = inode->i_sb; +- bool sync = is_sync_kiocb(iocb); +- bool append = false; +- ssize_t ret, count; +- +- /* +- * For async direct IOs to sequential zone files, refuse IOCB_NOWAIT +- * as this can cause write reordering (e.g. the first aio gets EAGAIN +- * on the inode lock but the second goes through but is now unaligned). 
+- */ +- if (zi->i_ztype == ZONEFS_ZTYPE_SEQ && !sync && +- (iocb->ki_flags & IOCB_NOWAIT)) +- return -EOPNOTSUPP; +- +- if (iocb->ki_flags & IOCB_NOWAIT) { +- if (!inode_trylock(inode)) +- return -EAGAIN; +- } else { +- inode_lock(inode); +- } +- +- count = zonefs_write_checks(iocb, from); +- if (count <= 0) { +- ret = count; +- goto inode_unlock; +- } +- +- if ((iocb->ki_pos | count) & (sb->s_blocksize - 1)) { +- ret = -EINVAL; +- goto inode_unlock; +- } +- +- /* Enforce sequential writes (append only) in sequential zones */ +- if (zi->i_ztype == ZONEFS_ZTYPE_SEQ) { +- mutex_lock(&zi->i_truncate_mutex); +- if (iocb->ki_pos != zi->i_wpoffset) { +- mutex_unlock(&zi->i_truncate_mutex); +- ret = -EINVAL; +- goto inode_unlock; +- } +- mutex_unlock(&zi->i_truncate_mutex); +- append = sync; +- } +- +- if (append) +- ret = zonefs_file_dio_append(iocb, from); +- else +- ret = iomap_dio_rw(iocb, from, &zonefs_write_iomap_ops, +- &zonefs_write_dio_ops, 0, NULL, 0); +- if (zi->i_ztype == ZONEFS_ZTYPE_SEQ && +- (ret > 0 || ret == -EIOCBQUEUED)) { +- if (ret > 0) +- count = ret; +- +- /* +- * Update the zone write pointer offset assuming the write +- * operation succeeded. If it did not, the error recovery path +- * will correct it. Also do active seq file accounting. +- */ +- mutex_lock(&zi->i_truncate_mutex); +- zi->i_wpoffset += count; +- zonefs_account_active(inode); +- mutex_unlock(&zi->i_truncate_mutex); +- } +- +-inode_unlock: +- inode_unlock(inode); +- +- return ret; +-} +- +-static ssize_t zonefs_file_buffered_write(struct kiocb *iocb, +- struct iov_iter *from) +-{ +- struct inode *inode = file_inode(iocb->ki_filp); +- struct zonefs_inode_info *zi = ZONEFS_I(inode); +- ssize_t ret; +- +- /* +- * Direct IO writes are mandatory for sequential zone files so that the +- * write IO issuing order is preserved. 
+- */ +- if (zi->i_ztype != ZONEFS_ZTYPE_CNV) +- return -EIO; +- +- if (iocb->ki_flags & IOCB_NOWAIT) { +- if (!inode_trylock(inode)) +- return -EAGAIN; +- } else { +- inode_lock(inode); +- } +- +- ret = zonefs_write_checks(iocb, from); +- if (ret <= 0) +- goto inode_unlock; +- +- ret = iomap_file_buffered_write(iocb, from, &zonefs_write_iomap_ops); +- if (ret > 0) +- iocb->ki_pos += ret; +- else if (ret == -EIO) +- zonefs_io_error(inode, true); +- +-inode_unlock: +- inode_unlock(inode); +- if (ret > 0) +- ret = generic_write_sync(iocb, ret); +- +- return ret; +-} +- +-static ssize_t zonefs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) +-{ +- struct inode *inode = file_inode(iocb->ki_filp); +- +- if (unlikely(IS_IMMUTABLE(inode))) +- return -EPERM; +- +- if (sb_rdonly(inode->i_sb)) +- return -EROFS; +- +- /* Write operations beyond the zone size are not allowed */ +- if (iocb->ki_pos >= ZONEFS_I(inode)->i_max_size) +- return -EFBIG; +- +- if (iocb->ki_flags & IOCB_DIRECT) { +- ssize_t ret = zonefs_file_dio_write(iocb, from); +- if (ret != -ENOTBLK) +- return ret; +- } +- +- return zonefs_file_buffered_write(iocb, from); +-} +- +-static int zonefs_file_read_dio_end_io(struct kiocb *iocb, ssize_t size, +- int error, unsigned int flags) +-{ +- if (error) { +- zonefs_io_error(file_inode(iocb->ki_filp), false); +- return error; +- } +- +- return 0; +-} +- +-static const struct iomap_dio_ops zonefs_read_dio_ops = { +- .end_io = zonefs_file_read_dio_end_io, +-}; +- +-static ssize_t zonefs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) +-{ +- struct inode *inode = file_inode(iocb->ki_filp); +- struct zonefs_inode_info *zi = ZONEFS_I(inode); +- struct super_block *sb = inode->i_sb; +- loff_t isize; +- ssize_t ret; +- +- /* Offline zones cannot be read */ +- if (unlikely(IS_IMMUTABLE(inode) && !(inode->i_mode & 0777))) +- return -EPERM; +- +- if (iocb->ki_pos >= zi->i_max_size) +- return 0; +- +- if (iocb->ki_flags & IOCB_NOWAIT) { +- if (!inode_trylock_shared(inode)) +- return -EAGAIN; +- } else { +- inode_lock_shared(inode); +- } +- +- /* Limit read operations to written data */ +- mutex_lock(&zi->i_truncate_mutex); +- isize = i_size_read(inode); +- if (iocb->ki_pos >= isize) { +- mutex_unlock(&zi->i_truncate_mutex); +- ret = 0; +- goto inode_unlock; +- } +- iov_iter_truncate(to, isize - iocb->ki_pos); +- mutex_unlock(&zi->i_truncate_mutex); +- +- if (iocb->ki_flags & IOCB_DIRECT) { +- size_t count = iov_iter_count(to); +- +- if ((iocb->ki_pos | count) & (sb->s_blocksize - 1)) { +- ret = -EINVAL; +- goto inode_unlock; +- } +- file_accessed(iocb->ki_filp); +- ret = iomap_dio_rw(iocb, to, &zonefs_read_iomap_ops, +- &zonefs_read_dio_ops, 0, NULL, 0); +- } else { +- ret = generic_file_read_iter(iocb, to); +- if (ret == -EIO) +- zonefs_io_error(inode, false); +- } +- +-inode_unlock: +- inode_unlock_shared(inode); +- +- return ret; +-} +- +-/* +- * Write open accounting is done only for sequential files. 
+- */ +-static inline bool zonefs_seq_file_need_wro(struct inode *inode, +- struct file *file) +-{ +- struct zonefs_inode_info *zi = ZONEFS_I(inode); +- +- if (zi->i_ztype != ZONEFS_ZTYPE_SEQ) +- return false; +- +- if (!(file->f_mode & FMODE_WRITE)) +- return false; +- +- return true; +-} +- +-static int zonefs_seq_file_write_open(struct inode *inode) +-{ +- struct zonefs_inode_info *zi = ZONEFS_I(inode); +- int ret = 0; +- +- mutex_lock(&zi->i_truncate_mutex); +- +- if (!zi->i_wr_refcnt) { +- struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb); +- unsigned int wro = atomic_inc_return(&sbi->s_wro_seq_files); +- +- if (sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN) { +- +- if (sbi->s_max_wro_seq_files +- && wro > sbi->s_max_wro_seq_files) { +- atomic_dec(&sbi->s_wro_seq_files); +- ret = -EBUSY; +- goto unlock; +- } +- +- if (i_size_read(inode) < zi->i_max_size) { +- ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_OPEN); +- if (ret) { +- atomic_dec(&sbi->s_wro_seq_files); +- goto unlock; +- } +- zi->i_flags |= ZONEFS_ZONE_OPEN; +- zonefs_account_active(inode); +- } +- } +- } +- +- zi->i_wr_refcnt++; +- +-unlock: +- mutex_unlock(&zi->i_truncate_mutex); +- +- return ret; +-} +- +-static int zonefs_file_open(struct inode *inode, struct file *file) +-{ +- int ret; +- +- ret = generic_file_open(inode, file); +- if (ret) +- return ret; +- +- if (zonefs_seq_file_need_wro(inode, file)) +- return zonefs_seq_file_write_open(inode); +- +- return 0; +-} +- +-static void zonefs_seq_file_write_close(struct inode *inode) +-{ +- struct zonefs_inode_info *zi = ZONEFS_I(inode); +- struct super_block *sb = inode->i_sb; +- struct zonefs_sb_info *sbi = ZONEFS_SB(sb); +- int ret = 0; +- +- mutex_lock(&zi->i_truncate_mutex); +- +- zi->i_wr_refcnt--; +- if (zi->i_wr_refcnt) +- goto unlock; +- +- /* +- * The file zone may not be open anymore (e.g. the file was truncated to +- * its maximum size or it was fully written). For this case, we only +- * need to decrement the write open count. +- */ +- if (zi->i_flags & ZONEFS_ZONE_OPEN) { +- ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_CLOSE); +- if (ret) { +- __zonefs_io_error(inode, false); +- /* +- * Leaving zones explicitly open may lead to a state +- * where most zones cannot be written (zone resources +- * exhausted). So take preventive action by remounting +- * read-only. +- */ +- if (zi->i_flags & ZONEFS_ZONE_OPEN && +- !(sb->s_flags & SB_RDONLY)) { +- zonefs_warn(sb, +- "closing zone at %llu failed %d\n", +- zi->i_zsector, ret); +- zonefs_warn(sb, +- "remounting filesystem read-only\n"); +- sb->s_flags |= SB_RDONLY; +- } +- goto unlock; +- } +- +- zi->i_flags &= ~ZONEFS_ZONE_OPEN; +- zonefs_account_active(inode); +- } +- +- atomic_dec(&sbi->s_wro_seq_files); +- +-unlock: +- mutex_unlock(&zi->i_truncate_mutex); +-} +- +-static int zonefs_file_release(struct inode *inode, struct file *file) +-{ +- /* +- * If we explicitly open a zone we must close it again as well, but the +- * zone management operation can fail (either due to an IO error or as +- * the zone has gone offline or read-only). Make sure we don't fail the +- * close(2) for user-space. 
+- */ +- if (zonefs_seq_file_need_wro(inode, file)) +- zonefs_seq_file_write_close(inode); +- +- return 0; +-} +- +-static const struct file_operations zonefs_file_operations = { +- .open = zonefs_file_open, +- .release = zonefs_file_release, +- .fsync = zonefs_file_fsync, +- .mmap = zonefs_file_mmap, +- .llseek = zonefs_file_llseek, +- .read_iter = zonefs_file_read_iter, +- .write_iter = zonefs_file_write_iter, +- .splice_read = generic_file_splice_read, +- .splice_write = iter_file_splice_write, +- .iopoll = iocb_bio_iopoll, +-}; +- + static struct kmem_cache *zonefs_inode_cachep; + + static struct inode *zonefs_alloc_inode(struct super_block *sb) +@@ -1408,13 +505,47 @@ static int zonefs_remount(struct super_block *sb, int *flags, char *data) + return zonefs_parse_options(sb, data); + } + +-static const struct super_operations zonefs_sops = { +- .alloc_inode = zonefs_alloc_inode, +- .free_inode = zonefs_free_inode, +- .statfs = zonefs_statfs, +- .remount_fs = zonefs_remount, +- .show_options = zonefs_show_options, +-}; ++static int zonefs_inode_setattr(struct user_namespace *mnt_userns, ++ struct dentry *dentry, struct iattr *iattr) ++{ ++ struct inode *inode = d_inode(dentry); ++ int ret; ++ ++ if (unlikely(IS_IMMUTABLE(inode))) ++ return -EPERM; ++ ++ ret = setattr_prepare(&init_user_ns, dentry, iattr); ++ if (ret) ++ return ret; ++ ++ /* ++ * Since files and directories cannot be created nor deleted, do not ++ * allow setting any write attributes on the sub-directories grouping ++ * files by zone type. ++ */ ++ if ((iattr->ia_valid & ATTR_MODE) && S_ISDIR(inode->i_mode) && ++ (iattr->ia_mode & 0222)) ++ return -EPERM; ++ ++ if (((iattr->ia_valid & ATTR_UID) && ++ !uid_eq(iattr->ia_uid, inode->i_uid)) || ++ ((iattr->ia_valid & ATTR_GID) && ++ !gid_eq(iattr->ia_gid, inode->i_gid))) { ++ ret = dquot_transfer(mnt_userns, inode, iattr); ++ if (ret) ++ return ret; ++ } ++ ++ if (iattr->ia_valid & ATTR_SIZE) { ++ ret = zonefs_file_truncate(inode, iattr->ia_size); ++ if (ret) ++ return ret; ++ } ++ ++ setattr_copy(&init_user_ns, inode, iattr); ++ ++ return 0; ++} + + static const struct inode_operations zonefs_dir_inode_operations = { + .lookup = simple_lookup, +@@ -1434,6 +565,10 @@ static void zonefs_init_dir_inode(struct inode *parent, struct inode *inode, + inc_nlink(parent); + } + ++static const struct inode_operations zonefs_file_inode_operations = { ++ .setattr = zonefs_inode_setattr, ++}; ++ + static int zonefs_init_file_inode(struct inode *inode, struct blk_zone *zone, + enum zonefs_ztype type) + { +@@ -1785,6 +920,14 @@ static int zonefs_read_super(struct super_block *sb) + return ret; + } + ++static const struct super_operations zonefs_sops = { ++ .alloc_inode = zonefs_alloc_inode, ++ .free_inode = zonefs_free_inode, ++ .statfs = zonefs_statfs, ++ .remount_fs = zonefs_remount, ++ .show_options = zonefs_show_options, ++}; ++ + /* + * Check that the device is zoned. If it is, get the list of zones and create + * sub-directories and files according to the device zone configuration and +diff --git a/fs/zonefs/zonefs.h b/fs/zonefs/zonefs.h +index 1dbe78119ff16..839ebe9afb6c1 100644 +--- a/fs/zonefs/zonefs.h ++++ b/fs/zonefs/zonefs.h +@@ -209,6 +209,28 @@ static inline struct zonefs_sb_info *ZONEFS_SB(struct super_block *sb) + #define zonefs_warn(sb, format, args...) 
\ + pr_warn("zonefs (%s) WARNING: " format, sb->s_id, ## args) + ++/* In super.c */ ++void zonefs_account_active(struct inode *inode); ++int zonefs_zone_mgmt(struct inode *inode, enum req_op op); ++void zonefs_i_size_write(struct inode *inode, loff_t isize); ++void zonefs_update_stats(struct inode *inode, loff_t new_isize); ++void __zonefs_io_error(struct inode *inode, bool write); ++ ++static inline void zonefs_io_error(struct inode *inode, bool write) ++{ ++ struct zonefs_inode_info *zi = ZONEFS_I(inode); ++ ++ mutex_lock(&zi->i_truncate_mutex); ++ __zonefs_io_error(inode, write); ++ mutex_unlock(&zi->i_truncate_mutex); ++} ++ ++/* In file.c */ ++extern const struct address_space_operations zonefs_file_aops; ++extern const struct file_operations zonefs_file_operations; ++int zonefs_file_truncate(struct inode *inode, loff_t isize); ++ ++/* In sysfs.c */ + int zonefs_sysfs_register(struct super_block *sb); + void zonefs_sysfs_unregister(struct super_block *sb); + int zonefs_sysfs_init(void); +-- +2.39.2 + diff --git a/queue-6.2/zonefs-separate-zone-information-from-inode-informat.patch b/queue-6.2/zonefs-separate-zone-information-from-inode-informat.patch new file mode 100644 index 00000000000..748ec4ec25b --- /dev/null +++ b/queue-6.2/zonefs-separate-zone-information-from-inode-informat.patch @@ -0,0 +1,1485 @@ +From 60aaa2368870616ab6b4f218c9194d826ee72f64 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 16 Nov 2022 18:15:40 +0900 +Subject: zonefs: Separate zone information from inode information + +From: Damien Le Moal + +[ Upstream commit aa7f243f32e1d18036ee00d71d3ccfad70ae2121 ] + +In preparation for adding dynamic inode allocation, separate an inode +zone information from the zonefs inode structure. The new data structure +zonefs_zone is introduced to store in memory information about a zone +that must be kept throughout the lifetime of the device mount. + +Linking between a zone file inode and its zone information is done by +setting the inode i_private field to point to a struct zonefs_zone. +Using the i_private pointer avoids the need for adding a pointer in +struct zonefs_inode_info. Beside the vfs inode, this structure is +reduced to a mutex and a write open counter. + +One struct zonefs_zone is created per file inode on mount. These +structures are organized in an array using the new struct +zonefs_zone_group data structure to represent zone groups. The +zonefs_zone arrays are indexed per file number (the index of a struct +zonefs_zone in its array directly gives the file number/name for that +zone file inode). 
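
As a rough, freestanding sketch of the layout this patch introduces (the struct field names follow the fs/zonefs/zonefs.h hunk further down in this patch; the inode stub, the user-space accessor and the example sector/capacity numbers are only illustrative and are not part of the kernel code):

#include <stdio.h>

struct zonefs_zone {
	unsigned int		z_flags;	/* ZONEFS_ZONE_* state bits */
	unsigned long long	z_sector;	/* zone start sector (512B units) */
	long long		z_size;		/* zone size (bytes) */
	long long		z_capacity;	/* zone capacity = file max size (bytes) */
	long long		z_wpoffset;	/* write pointer offset (bytes) */
};

struct zonefs_zone_group {
	unsigned int		 g_nr_zones;	/* number of zone files in the group */
	struct zonefs_zone	*g_zones;	/* array indexed by file number */
};

/* Stub standing in for struct inode: only the i_private link matters here. */
struct inode_stub {
	void *i_private;
};

static struct zonefs_zone *zonefs_inode_zone(struct inode_stub *inode)
{
	return inode->i_private;
}

int main(void)
{
	struct zonefs_zone zones[2] = {
		{ .z_sector = 524288,  .z_capacity = 256 << 20 },
		{ .z_sector = 1048576, .z_capacity = 256 << 20 },
	};
	struct zonefs_zone_group seq = { .g_nr_zones = 2, .g_zones = zones };
	struct inode_stub seq1 = { .i_private = &seq.g_zones[1] };

	/* The inode for file "seq/1" resolves to the second zone of its group. */
	printf("seq/1 starts at sector %llu, capacity %lld bytes\n",
	       zonefs_inode_zone(&seq1)->z_sector,
	       zonefs_inode_zone(&seq1)->z_capacity);
	return 0;
}

Keeping the zone state in per-group arrays reachable through i_private, rather than in struct zonefs_inode_info, is what lets that state persist for the lifetime of the mount independently of the VFS inode, as the message above notes in preparation for dynamic inode allocation.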
+ +Signed-off-by: Damien Le Moal +Reviewed-by: Johannes Thumshirn +Stable-dep-of: 88b170088ad2 ("zonefs: Fix error message in zonefs_file_dio_append()") +Signed-off-by: Sasha Levin +--- + fs/zonefs/file.c | 99 ++++---- + fs/zonefs/super.c | 571 +++++++++++++++++++++++++++------------------ + fs/zonefs/trace.h | 20 +- + fs/zonefs/zonefs.h | 63 +++-- + 4 files changed, 449 insertions(+), 304 deletions(-) + +diff --git a/fs/zonefs/file.c b/fs/zonefs/file.c +index 64873d31d75dd..738b0e28d74b5 100644 +--- a/fs/zonefs/file.c ++++ b/fs/zonefs/file.c +@@ -29,6 +29,7 @@ static int zonefs_read_iomap_begin(struct inode *inode, loff_t offset, + struct iomap *iomap, struct iomap *srcmap) + { + struct zonefs_inode_info *zi = ZONEFS_I(inode); ++ struct zonefs_zone *z = zonefs_inode_zone(inode); + struct super_block *sb = inode->i_sb; + loff_t isize; + +@@ -46,7 +47,7 @@ static int zonefs_read_iomap_begin(struct inode *inode, loff_t offset, + iomap->length = length; + } else { + iomap->type = IOMAP_MAPPED; +- iomap->addr = (zi->i_zsector << SECTOR_SHIFT) + iomap->offset; ++ iomap->addr = (z->z_sector << SECTOR_SHIFT) + iomap->offset; + iomap->length = isize - iomap->offset; + } + mutex_unlock(&zi->i_truncate_mutex); +@@ -65,11 +66,12 @@ static int zonefs_write_iomap_begin(struct inode *inode, loff_t offset, + struct iomap *iomap, struct iomap *srcmap) + { + struct zonefs_inode_info *zi = ZONEFS_I(inode); ++ struct zonefs_zone *z = zonefs_inode_zone(inode); + struct super_block *sb = inode->i_sb; + loff_t isize; + + /* All write I/Os should always be within the file maximum size */ +- if (WARN_ON_ONCE(offset + length > zi->i_max_size)) ++ if (WARN_ON_ONCE(offset + length > z->z_capacity)) + return -EIO; + + /* +@@ -77,7 +79,7 @@ static int zonefs_write_iomap_begin(struct inode *inode, loff_t offset, + * checked when writes are issued, so warn if we see a page writeback + * operation. 
+ */ +- if (WARN_ON_ONCE(zonefs_zone_is_seq(zi) && !(flags & IOMAP_DIRECT))) ++ if (WARN_ON_ONCE(zonefs_zone_is_seq(z) && !(flags & IOMAP_DIRECT))) + return -EIO; + + /* +@@ -88,11 +90,11 @@ static int zonefs_write_iomap_begin(struct inode *inode, loff_t offset, + mutex_lock(&zi->i_truncate_mutex); + iomap->bdev = inode->i_sb->s_bdev; + iomap->offset = ALIGN_DOWN(offset, sb->s_blocksize); +- iomap->addr = (zi->i_zsector << SECTOR_SHIFT) + iomap->offset; ++ iomap->addr = (z->z_sector << SECTOR_SHIFT) + iomap->offset; + isize = i_size_read(inode); + if (iomap->offset >= isize) { + iomap->type = IOMAP_UNWRITTEN; +- iomap->length = zi->i_max_size - iomap->offset; ++ iomap->length = z->z_capacity - iomap->offset; + } else { + iomap->type = IOMAP_MAPPED; + iomap->length = isize - iomap->offset; +@@ -125,9 +127,9 @@ static void zonefs_readahead(struct readahead_control *rac) + static int zonefs_write_map_blocks(struct iomap_writepage_ctx *wpc, + struct inode *inode, loff_t offset) + { +- struct zonefs_inode_info *zi = ZONEFS_I(inode); ++ struct zonefs_zone *z = zonefs_inode_zone(inode); + +- if (WARN_ON_ONCE(zonefs_zone_is_seq(zi))) ++ if (WARN_ON_ONCE(zonefs_zone_is_seq(z))) + return -EIO; + if (WARN_ON_ONCE(offset >= i_size_read(inode))) + return -EIO; +@@ -137,7 +139,8 @@ static int zonefs_write_map_blocks(struct iomap_writepage_ctx *wpc, + offset < wpc->iomap.offset + wpc->iomap.length) + return 0; + +- return zonefs_write_iomap_begin(inode, offset, zi->i_max_size - offset, ++ return zonefs_write_iomap_begin(inode, offset, ++ z->z_capacity - offset, + IOMAP_WRITE, &wpc->iomap, NULL); + } + +@@ -185,6 +188,7 @@ const struct address_space_operations zonefs_file_aops = { + int zonefs_file_truncate(struct inode *inode, loff_t isize) + { + struct zonefs_inode_info *zi = ZONEFS_I(inode); ++ struct zonefs_zone *z = zonefs_inode_zone(inode); + loff_t old_isize; + enum req_op op; + int ret = 0; +@@ -194,12 +198,12 @@ int zonefs_file_truncate(struct inode *inode, loff_t isize) + * only down to a 0 size, which is equivalent to a zone reset, and to + * the maximum file size, which is equivalent to a zone finish. + */ +- if (!zonefs_zone_is_seq(zi)) ++ if (!zonefs_zone_is_seq(z)) + return -EPERM; + + if (!isize) + op = REQ_OP_ZONE_RESET; +- else if (isize == zi->i_max_size) ++ else if (isize == z->z_capacity) + op = REQ_OP_ZONE_FINISH; + else + return -EPERM; +@@ -216,7 +220,7 @@ int zonefs_file_truncate(struct inode *inode, loff_t isize) + if (isize == old_isize) + goto unlock; + +- ret = zonefs_zone_mgmt(inode, op); ++ ret = zonefs_inode_zone_mgmt(inode, op); + if (ret) + goto unlock; + +@@ -224,7 +228,7 @@ int zonefs_file_truncate(struct inode *inode, loff_t isize) + * If the mount option ZONEFS_MNTOPT_EXPLICIT_OPEN is set, + * take care of open zones. + */ +- if (zi->i_flags & ZONEFS_ZONE_OPEN) { ++ if (z->z_flags & ZONEFS_ZONE_OPEN) { + /* + * Truncating a zone to EMPTY or FULL is the equivalent of + * closing the zone. For a truncation to 0, we need to +@@ -234,15 +238,15 @@ int zonefs_file_truncate(struct inode *inode, loff_t isize) + * the open flag. 
+ */ + if (!isize) +- ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_OPEN); ++ ret = zonefs_inode_zone_mgmt(inode, REQ_OP_ZONE_OPEN); + else +- zi->i_flags &= ~ZONEFS_ZONE_OPEN; ++ z->z_flags &= ~ZONEFS_ZONE_OPEN; + } + + zonefs_update_stats(inode, isize); + truncate_setsize(inode, isize); +- zi->i_wpoffset = isize; +- zonefs_account_active(inode); ++ z->z_wpoffset = isize; ++ zonefs_inode_account_active(inode); + + unlock: + mutex_unlock(&zi->i_truncate_mutex); +@@ -349,7 +353,7 @@ static int zonefs_file_write_dio_end_io(struct kiocb *iocb, ssize_t size, + return error; + } + +- if (size && zonefs_zone_is_seq(zi)) { ++ if (size && zonefs_inode_is_seq(inode)) { + /* + * Note that we may be seeing completions out of order, + * but that is not a problem since a write completed +@@ -375,7 +379,7 @@ static const struct iomap_dio_ops zonefs_write_dio_ops = { + static ssize_t zonefs_file_dio_append(struct kiocb *iocb, struct iov_iter *from) + { + struct inode *inode = file_inode(iocb->ki_filp); +- struct zonefs_inode_info *zi = ZONEFS_I(inode); ++ struct zonefs_zone *z = zonefs_inode_zone(inode); + struct block_device *bdev = inode->i_sb->s_bdev; + unsigned int max = bdev_max_zone_append_sectors(bdev); + struct bio *bio; +@@ -392,7 +396,7 @@ static ssize_t zonefs_file_dio_append(struct kiocb *iocb, struct iov_iter *from) + + bio = bio_alloc(bdev, nr_pages, + REQ_OP_ZONE_APPEND | REQ_SYNC | REQ_IDLE, GFP_NOFS); +- bio->bi_iter.bi_sector = zi->i_zsector; ++ bio->bi_iter.bi_sector = z->z_sector; + bio->bi_ioprio = iocb->ki_ioprio; + if (iocb_is_dsync(iocb)) + bio->bi_opf |= REQ_FUA; +@@ -417,12 +421,12 @@ static ssize_t zonefs_file_dio_append(struct kiocb *iocb, struct iov_iter *from) + */ + if (!ret) { + sector_t wpsector = +- zi->i_zsector + (zi->i_wpoffset >> SECTOR_SHIFT); ++ z->z_sector + (z->z_wpoffset >> SECTOR_SHIFT); + + if (bio->bi_iter.bi_sector != wpsector) { + zonefs_warn(inode->i_sb, + "Corrupted write pointer %llu for zone at %llu\n", +- wpsector, zi->i_zsector); ++ wpsector, z->z_sector); + ret = -EIO; + } + } +@@ -450,9 +454,9 @@ static loff_t zonefs_write_check_limits(struct file *file, loff_t pos, + loff_t count) + { + struct inode *inode = file_inode(file); +- struct zonefs_inode_info *zi = ZONEFS_I(inode); ++ struct zonefs_zone *z = zonefs_inode_zone(inode); + loff_t limit = rlimit(RLIMIT_FSIZE); +- loff_t max_size = zi->i_max_size; ++ loff_t max_size = z->z_capacity; + + if (limit != RLIM_INFINITY) { + if (pos >= limit) { +@@ -476,6 +480,7 @@ static ssize_t zonefs_write_checks(struct kiocb *iocb, struct iov_iter *from) + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); + struct zonefs_inode_info *zi = ZONEFS_I(inode); ++ struct zonefs_zone *z = zonefs_inode_zone(inode); + loff_t count; + + if (IS_SWAPFILE(inode)) +@@ -488,10 +493,10 @@ static ssize_t zonefs_write_checks(struct kiocb *iocb, struct iov_iter *from) + return -EINVAL; + + if (iocb->ki_flags & IOCB_APPEND) { +- if (zonefs_zone_is_cnv(zi)) ++ if (zonefs_zone_is_cnv(z)) + return -EINVAL; + mutex_lock(&zi->i_truncate_mutex); +- iocb->ki_pos = zi->i_wpoffset; ++ iocb->ki_pos = z->z_wpoffset; + mutex_unlock(&zi->i_truncate_mutex); + } + +@@ -518,6 +523,7 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from) + { + struct inode *inode = file_inode(iocb->ki_filp); + struct zonefs_inode_info *zi = ZONEFS_I(inode); ++ struct zonefs_zone *z = zonefs_inode_zone(inode); + struct super_block *sb = inode->i_sb; + bool sync = is_sync_kiocb(iocb); + bool append = false; +@@ -528,7 
+534,7 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from) + * as this can cause write reordering (e.g. the first aio gets EAGAIN + * on the inode lock but the second goes through but is now unaligned). + */ +- if (zonefs_zone_is_seq(zi) && !sync && (iocb->ki_flags & IOCB_NOWAIT)) ++ if (zonefs_zone_is_seq(z) && !sync && (iocb->ki_flags & IOCB_NOWAIT)) + return -EOPNOTSUPP; + + if (iocb->ki_flags & IOCB_NOWAIT) { +@@ -550,9 +556,9 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from) + } + + /* Enforce sequential writes (append only) in sequential zones */ +- if (zonefs_zone_is_seq(zi)) { ++ if (zonefs_zone_is_seq(z)) { + mutex_lock(&zi->i_truncate_mutex); +- if (iocb->ki_pos != zi->i_wpoffset) { ++ if (iocb->ki_pos != z->z_wpoffset) { + mutex_unlock(&zi->i_truncate_mutex); + ret = -EINVAL; + goto inode_unlock; +@@ -566,7 +572,7 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from) + else + ret = iomap_dio_rw(iocb, from, &zonefs_write_iomap_ops, + &zonefs_write_dio_ops, 0, NULL, 0); +- if (zonefs_zone_is_seq(zi) && ++ if (zonefs_zone_is_seq(z) && + (ret > 0 || ret == -EIOCBQUEUED)) { + if (ret > 0) + count = ret; +@@ -577,8 +583,8 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from) + * will correct it. Also do active seq file accounting. + */ + mutex_lock(&zi->i_truncate_mutex); +- zi->i_wpoffset += count; +- zonefs_account_active(inode); ++ z->z_wpoffset += count; ++ zonefs_inode_account_active(inode); + mutex_unlock(&zi->i_truncate_mutex); + } + +@@ -629,6 +635,7 @@ static ssize_t zonefs_file_buffered_write(struct kiocb *iocb, + static ssize_t zonefs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) + { + struct inode *inode = file_inode(iocb->ki_filp); ++ struct zonefs_zone *z = zonefs_inode_zone(inode); + + if (unlikely(IS_IMMUTABLE(inode))) + return -EPERM; +@@ -636,8 +643,8 @@ static ssize_t zonefs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) + if (sb_rdonly(inode->i_sb)) + return -EROFS; + +- /* Write operations beyond the zone size are not allowed */ +- if (iocb->ki_pos >= ZONEFS_I(inode)->i_max_size) ++ /* Write operations beyond the zone capacity are not allowed */ ++ if (iocb->ki_pos >= z->z_capacity) + return -EFBIG; + + if (iocb->ki_flags & IOCB_DIRECT) { +@@ -669,6 +676,7 @@ static ssize_t zonefs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) + { + struct inode *inode = file_inode(iocb->ki_filp); + struct zonefs_inode_info *zi = ZONEFS_I(inode); ++ struct zonefs_zone *z = zonefs_inode_zone(inode); + struct super_block *sb = inode->i_sb; + loff_t isize; + ssize_t ret; +@@ -677,7 +685,7 @@ static ssize_t zonefs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) + if (unlikely(IS_IMMUTABLE(inode) && !(inode->i_mode & 0777))) + return -EPERM; + +- if (iocb->ki_pos >= zi->i_max_size) ++ if (iocb->ki_pos >= z->z_capacity) + return 0; + + if (iocb->ki_flags & IOCB_NOWAIT) { +@@ -738,6 +746,7 @@ static inline bool zonefs_seq_file_need_wro(struct inode *inode, + static int zonefs_seq_file_write_open(struct inode *inode) + { + struct zonefs_inode_info *zi = ZONEFS_I(inode); ++ struct zonefs_zone *z = zonefs_inode_zone(inode); + int ret = 0; + + mutex_lock(&zi->i_truncate_mutex); +@@ -755,14 +764,15 @@ static int zonefs_seq_file_write_open(struct inode *inode) + goto unlock; + } + +- if (i_size_read(inode) < zi->i_max_size) { +- ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_OPEN); ++ if (i_size_read(inode) < z->z_capacity) { ++ ret = 
zonefs_inode_zone_mgmt(inode, ++ REQ_OP_ZONE_OPEN); + if (ret) { + atomic_dec(&sbi->s_wro_seq_files); + goto unlock; + } +- zi->i_flags |= ZONEFS_ZONE_OPEN; +- zonefs_account_active(inode); ++ z->z_flags |= ZONEFS_ZONE_OPEN; ++ zonefs_inode_account_active(inode); + } + } + } +@@ -792,6 +802,7 @@ static int zonefs_file_open(struct inode *inode, struct file *file) + static void zonefs_seq_file_write_close(struct inode *inode) + { + struct zonefs_inode_info *zi = ZONEFS_I(inode); ++ struct zonefs_zone *z = zonefs_inode_zone(inode); + struct super_block *sb = inode->i_sb; + struct zonefs_sb_info *sbi = ZONEFS_SB(sb); + int ret = 0; +@@ -807,8 +818,8 @@ static void zonefs_seq_file_write_close(struct inode *inode) + * its maximum size or it was fully written). For this case, we only + * need to decrement the write open count. + */ +- if (zi->i_flags & ZONEFS_ZONE_OPEN) { +- ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_CLOSE); ++ if (z->z_flags & ZONEFS_ZONE_OPEN) { ++ ret = zonefs_inode_zone_mgmt(inode, REQ_OP_ZONE_CLOSE); + if (ret) { + __zonefs_io_error(inode, false); + /* +@@ -817,11 +828,11 @@ static void zonefs_seq_file_write_close(struct inode *inode) + * exhausted). So take preventive action by remounting + * read-only. + */ +- if (zi->i_flags & ZONEFS_ZONE_OPEN && ++ if (z->z_flags & ZONEFS_ZONE_OPEN && + !(sb->s_flags & SB_RDONLY)) { + zonefs_warn(sb, + "closing zone at %llu failed %d\n", +- zi->i_zsector, ret); ++ z->z_sector, ret); + zonefs_warn(sb, + "remounting filesystem read-only\n"); + sb->s_flags |= SB_RDONLY; +@@ -829,8 +840,8 @@ static void zonefs_seq_file_write_close(struct inode *inode) + goto unlock; + } + +- zi->i_flags &= ~ZONEFS_ZONE_OPEN; +- zonefs_account_active(inode); ++ z->z_flags &= ~ZONEFS_ZONE_OPEN; ++ zonefs_inode_account_active(inode); + } + + atomic_dec(&sbi->s_wro_seq_files); +diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c +index a4af29dc32e7d..270ded209dde5 100644 +--- a/fs/zonefs/super.c ++++ b/fs/zonefs/super.c +@@ -28,33 +28,47 @@ + #include "trace.h" + + /* +- * Manage the active zone count. Called with zi->i_truncate_mutex held. ++ * Get the name of a zone group directory. + */ +-void zonefs_account_active(struct inode *inode) ++static const char *zonefs_zgroup_name(enum zonefs_ztype ztype) + { +- struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb); +- struct zonefs_inode_info *zi = ZONEFS_I(inode); ++ switch (ztype) { ++ case ZONEFS_ZTYPE_CNV: ++ return "cnv"; ++ case ZONEFS_ZTYPE_SEQ: ++ return "seq"; ++ default: ++ WARN_ON_ONCE(1); ++ return "???"; ++ } ++} + +- lockdep_assert_held(&zi->i_truncate_mutex); ++/* ++ * Manage the active zone count. ++ */ ++static void zonefs_account_active(struct super_block *sb, ++ struct zonefs_zone *z) ++{ ++ struct zonefs_sb_info *sbi = ZONEFS_SB(sb); + +- if (zonefs_zone_is_cnv(zi)) ++ if (zonefs_zone_is_cnv(z)) + return; + + /* + * For zones that transitioned to the offline or readonly condition, + * we only need to clear the active state. + */ +- if (zi->i_flags & (ZONEFS_ZONE_OFFLINE | ZONEFS_ZONE_READONLY)) ++ if (z->z_flags & (ZONEFS_ZONE_OFFLINE | ZONEFS_ZONE_READONLY)) + goto out; + + /* + * If the zone is active, that is, if it is explicitly open or + * partially written, check if it was already accounted as active. 
+ */ +- if ((zi->i_flags & ZONEFS_ZONE_OPEN) || +- (zi->i_wpoffset > 0 && zi->i_wpoffset < zi->i_max_size)) { +- if (!(zi->i_flags & ZONEFS_ZONE_ACTIVE)) { +- zi->i_flags |= ZONEFS_ZONE_ACTIVE; ++ if ((z->z_flags & ZONEFS_ZONE_OPEN) || ++ (z->z_wpoffset > 0 && z->z_wpoffset < z->z_capacity)) { ++ if (!(z->z_flags & ZONEFS_ZONE_ACTIVE)) { ++ z->z_flags |= ZONEFS_ZONE_ACTIVE; + atomic_inc(&sbi->s_active_seq_files); + } + return; +@@ -62,18 +76,29 @@ void zonefs_account_active(struct inode *inode) + + out: + /* The zone is not active. If it was, update the active count */ +- if (zi->i_flags & ZONEFS_ZONE_ACTIVE) { +- zi->i_flags &= ~ZONEFS_ZONE_ACTIVE; ++ if (z->z_flags & ZONEFS_ZONE_ACTIVE) { ++ z->z_flags &= ~ZONEFS_ZONE_ACTIVE; + atomic_dec(&sbi->s_active_seq_files); + } + } + +-int zonefs_zone_mgmt(struct inode *inode, enum req_op op) ++/* ++ * Manage the active zone count. Called with zi->i_truncate_mutex held. ++ */ ++void zonefs_inode_account_active(struct inode *inode) + { +- struct zonefs_inode_info *zi = ZONEFS_I(inode); +- int ret; ++ lockdep_assert_held(&ZONEFS_I(inode)->i_truncate_mutex); + +- lockdep_assert_held(&zi->i_truncate_mutex); ++ return zonefs_account_active(inode->i_sb, zonefs_inode_zone(inode)); ++} ++ ++/* ++ * Execute a zone management operation. ++ */ ++static int zonefs_zone_mgmt(struct super_block *sb, ++ struct zonefs_zone *z, enum req_op op) ++{ ++ int ret; + + /* + * With ZNS drives, closing an explicitly open zone that has not been +@@ -83,37 +108,45 @@ int zonefs_zone_mgmt(struct inode *inode, enum req_op op) + * are exceeded, make sure that the zone does not remain active by + * resetting it. + */ +- if (op == REQ_OP_ZONE_CLOSE && !zi->i_wpoffset) ++ if (op == REQ_OP_ZONE_CLOSE && !z->z_wpoffset) + op = REQ_OP_ZONE_RESET; + +- trace_zonefs_zone_mgmt(inode, op); +- ret = blkdev_zone_mgmt(inode->i_sb->s_bdev, op, zi->i_zsector, +- zi->i_zone_size >> SECTOR_SHIFT, GFP_NOFS); ++ trace_zonefs_zone_mgmt(sb, z, op); ++ ret = blkdev_zone_mgmt(sb->s_bdev, op, z->z_sector, ++ z->z_size >> SECTOR_SHIFT, GFP_NOFS); + if (ret) { +- zonefs_err(inode->i_sb, ++ zonefs_err(sb, + "Zone management operation %s at %llu failed %d\n", +- blk_op_str(op), zi->i_zsector, ret); ++ blk_op_str(op), z->z_sector, ret); + return ret; + } + + return 0; + } + ++int zonefs_inode_zone_mgmt(struct inode *inode, enum req_op op) ++{ ++ lockdep_assert_held(&ZONEFS_I(inode)->i_truncate_mutex); ++ ++ return zonefs_zone_mgmt(inode->i_sb, zonefs_inode_zone(inode), op); ++} ++ + void zonefs_i_size_write(struct inode *inode, loff_t isize) + { +- struct zonefs_inode_info *zi = ZONEFS_I(inode); ++ struct zonefs_zone *z = zonefs_inode_zone(inode); + + i_size_write(inode, isize); ++ + /* + * A full zone is no longer open/active and does not need + * explicit closing. + */ +- if (isize >= zi->i_max_size) { ++ if (isize >= z->z_capacity) { + struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb); + +- if (zi->i_flags & ZONEFS_ZONE_ACTIVE) ++ if (z->z_flags & ZONEFS_ZONE_ACTIVE) + atomic_dec(&sbi->s_active_seq_files); +- zi->i_flags &= ~(ZONEFS_ZONE_OPEN | ZONEFS_ZONE_ACTIVE); ++ z->z_flags &= ~(ZONEFS_ZONE_OPEN | ZONEFS_ZONE_ACTIVE); + } + } + +@@ -150,20 +183,18 @@ void zonefs_update_stats(struct inode *inode, loff_t new_isize) + } + + /* +- * Check a zone condition and adjust its file inode access permissions for +- * offline and readonly zones. Return the inode size corresponding to the +- * amount of readable data in the zone. ++ * Check a zone condition. 
Return the amount of written (and still readable) ++ * data in the zone. + */ +-static loff_t zonefs_check_zone_condition(struct inode *inode, ++static loff_t zonefs_check_zone_condition(struct super_block *sb, ++ struct zonefs_zone *z, + struct blk_zone *zone) + { +- struct zonefs_inode_info *zi = ZONEFS_I(inode); +- + switch (zone->cond) { + case BLK_ZONE_COND_OFFLINE: +- zonefs_warn(inode->i_sb, "inode %lu: offline zone\n", +- inode->i_ino); +- zi->i_flags |= ZONEFS_ZONE_OFFLINE; ++ zonefs_warn(sb, "Zone %llu: offline zone\n", ++ z->z_sector); ++ z->z_flags |= ZONEFS_ZONE_OFFLINE; + return 0; + case BLK_ZONE_COND_READONLY: + /* +@@ -174,18 +205,18 @@ static loff_t zonefs_check_zone_condition(struct inode *inode, + * the inode size as it was when last updated so that the user + * can recover data. + */ +- zonefs_warn(inode->i_sb, "inode %lu: read-only zone\n", +- inode->i_ino); +- zi->i_flags |= ZONEFS_ZONE_READONLY; +- if (zonefs_zone_is_cnv(zi)) +- return zi->i_max_size; +- return zi->i_wpoffset; ++ zonefs_warn(sb, "Zone %llu: read-only zone\n", ++ z->z_sector); ++ z->z_flags |= ZONEFS_ZONE_READONLY; ++ if (zonefs_zone_is_cnv(z)) ++ return z->z_capacity; ++ return z->z_wpoffset; + case BLK_ZONE_COND_FULL: + /* The write pointer of full zones is invalid. */ +- return zi->i_max_size; ++ return z->z_capacity; + default: +- if (zonefs_zone_is_cnv(zi)) +- return zi->i_max_size; ++ if (zonefs_zone_is_cnv(z)) ++ return z->z_capacity; + return (zone->wp - zone->start) << SECTOR_SHIFT; + } + } +@@ -196,22 +227,22 @@ static loff_t zonefs_check_zone_condition(struct inode *inode, + */ + static void zonefs_inode_update_mode(struct inode *inode) + { +- struct zonefs_inode_info *zi = ZONEFS_I(inode); ++ struct zonefs_zone *z = zonefs_inode_zone(inode); + +- if (zi->i_flags & ZONEFS_ZONE_OFFLINE) { ++ if (z->z_flags & ZONEFS_ZONE_OFFLINE) { + /* Offline zones cannot be read nor written */ + inode->i_flags |= S_IMMUTABLE; + inode->i_mode &= ~0777; +- } else if (zi->i_flags & ZONEFS_ZONE_READONLY) { ++ } else if (z->z_flags & ZONEFS_ZONE_READONLY) { + /* Readonly zones cannot be written */ + inode->i_flags |= S_IMMUTABLE; +- if (zi->i_flags & ZONEFS_ZONE_INIT_MODE) ++ if (z->z_flags & ZONEFS_ZONE_INIT_MODE) + inode->i_mode &= ~0777; + else + inode->i_mode &= ~0222; + } + +- zi->i_flags &= ~ZONEFS_ZONE_INIT_MODE; ++ z->z_flags &= ~ZONEFS_ZONE_INIT_MODE; + } + + struct zonefs_ioerr_data { +@@ -224,7 +255,7 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx, + { + struct zonefs_ioerr_data *err = data; + struct inode *inode = err->inode; +- struct zonefs_inode_info *zi = ZONEFS_I(inode); ++ struct zonefs_zone *z = zonefs_inode_zone(inode); + struct super_block *sb = inode->i_sb; + struct zonefs_sb_info *sbi = ZONEFS_SB(sb); + loff_t isize, data_size; +@@ -235,9 +266,9 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx, + * as there is no inconsistency between the inode size and the amount of + * data writen in the zone (data_size). 
+ */ +- data_size = zonefs_check_zone_condition(inode, zone); ++ data_size = zonefs_check_zone_condition(sb, z, zone); + isize = i_size_read(inode); +- if (!(zi->i_flags & (ZONEFS_ZONE_READONLY | ZONEFS_ZONE_OFFLINE)) && ++ if (!(z->z_flags & (ZONEFS_ZONE_READONLY | ZONEFS_ZONE_OFFLINE)) && + !err->write && isize == data_size) + return 0; + +@@ -260,8 +291,9 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx, + * In all cases, warn about inode size inconsistency and handle the + * IO error according to the zone condition and to the mount options. + */ +- if (zonefs_zone_is_seq(zi) && isize != data_size) +- zonefs_warn(sb, "inode %lu: invalid size %lld (should be %lld)\n", ++ if (zonefs_zone_is_seq(z) && isize != data_size) ++ zonefs_warn(sb, ++ "inode %lu: invalid size %lld (should be %lld)\n", + inode->i_ino, isize, data_size); + + /* +@@ -270,20 +302,20 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx, + * zone condition to read-only and offline respectively, as if the + * condition was signaled by the hardware. + */ +- if ((zi->i_flags & ZONEFS_ZONE_OFFLINE) || ++ if ((z->z_flags & ZONEFS_ZONE_OFFLINE) || + (sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_ZOL)) { + zonefs_warn(sb, "inode %lu: read/write access disabled\n", + inode->i_ino); +- if (!(zi->i_flags & ZONEFS_ZONE_OFFLINE)) +- zi->i_flags |= ZONEFS_ZONE_OFFLINE; ++ if (!(z->z_flags & ZONEFS_ZONE_OFFLINE)) ++ z->z_flags |= ZONEFS_ZONE_OFFLINE; + zonefs_inode_update_mode(inode); + data_size = 0; +- } else if ((zi->i_flags & ZONEFS_ZONE_READONLY) || ++ } else if ((z->z_flags & ZONEFS_ZONE_READONLY) || + (sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_ZRO)) { + zonefs_warn(sb, "inode %lu: write access disabled\n", + inode->i_ino); +- if (!(zi->i_flags & ZONEFS_ZONE_READONLY)) +- zi->i_flags |= ZONEFS_ZONE_READONLY; ++ if (!(z->z_flags & ZONEFS_ZONE_READONLY)) ++ z->z_flags |= ZONEFS_ZONE_READONLY; + zonefs_inode_update_mode(inode); + data_size = isize; + } else if (sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_RO && +@@ -299,8 +331,8 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx, + * close of the zone when the inode file is closed. + */ + if ((sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN) && +- (zi->i_flags & (ZONEFS_ZONE_READONLY | ZONEFS_ZONE_OFFLINE))) +- zi->i_flags &= ~ZONEFS_ZONE_OPEN; ++ (z->z_flags & (ZONEFS_ZONE_READONLY | ZONEFS_ZONE_OFFLINE))) ++ z->z_flags &= ~ZONEFS_ZONE_OPEN; + + /* + * If error=remount-ro was specified, any error result in remounting +@@ -317,8 +349,8 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx, + */ + zonefs_update_stats(inode, data_size); + zonefs_i_size_write(inode, data_size); +- zi->i_wpoffset = data_size; +- zonefs_account_active(inode); ++ z->z_wpoffset = data_size; ++ zonefs_inode_account_active(inode); + + return 0; + } +@@ -332,7 +364,7 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx, + */ + void __zonefs_io_error(struct inode *inode, bool write) + { +- struct zonefs_inode_info *zi = ZONEFS_I(inode); ++ struct zonefs_zone *z = zonefs_inode_zone(inode); + struct super_block *sb = inode->i_sb; + struct zonefs_sb_info *sbi = ZONEFS_SB(sb); + unsigned int noio_flag; +@@ -348,8 +380,8 @@ void __zonefs_io_error(struct inode *inode, bool write) + * files with aggregated conventional zones, for which the inode zone + * size is always larger than the device zone size. 
+ */ +- if (zi->i_zone_size > bdev_zone_sectors(sb->s_bdev)) +- nr_zones = zi->i_zone_size >> ++ if (z->z_size > bdev_zone_sectors(sb->s_bdev)) ++ nr_zones = z->z_size >> + (sbi->s_zone_sectors_shift + SECTOR_SHIFT); + + /* +@@ -361,7 +393,7 @@ void __zonefs_io_error(struct inode *inode, bool write) + * the GFP_NOIO context avoids both problems. + */ + noio_flag = memalloc_noio_save(); +- ret = blkdev_report_zones(sb->s_bdev, zi->i_zsector, nr_zones, ++ ret = blkdev_report_zones(sb->s_bdev, z->z_sector, nr_zones, + zonefs_io_error_cb, &err); + if (ret != nr_zones) + zonefs_err(sb, "Get inode %lu zone information failed %d\n", +@@ -381,9 +413,7 @@ static struct inode *zonefs_alloc_inode(struct super_block *sb) + + inode_init_once(&zi->i_vnode); + mutex_init(&zi->i_truncate_mutex); +- zi->i_wpoffset = 0; + zi->i_wr_refcnt = 0; +- zi->i_flags = 0; + + return &zi->i_vnode; + } +@@ -416,8 +446,8 @@ static int zonefs_statfs(struct dentry *dentry, struct kstatfs *buf) + buf->f_bavail = buf->f_bfree; + + for (t = 0; t < ZONEFS_ZTYPE_MAX; t++) { +- if (sbi->s_nr_files[t]) +- buf->f_files += sbi->s_nr_files[t] + 1; ++ if (sbi->s_zgroup[t].g_nr_zones) ++ buf->f_files += sbi->s_zgroup[t].g_nr_zones + 1; + } + buf->f_ffree = 0; + +@@ -557,11 +587,11 @@ static const struct inode_operations zonefs_dir_inode_operations = { + }; + + static void zonefs_init_dir_inode(struct inode *parent, struct inode *inode, +- enum zonefs_ztype type) ++ enum zonefs_ztype ztype) + { + struct super_block *sb = parent->i_sb; + +- inode->i_ino = bdev_nr_zones(sb->s_bdev) + type + 1; ++ inode->i_ino = bdev_nr_zones(sb->s_bdev) + ztype + 1; + inode_init_owner(&init_user_ns, inode, parent, S_IFDIR | 0555); + inode->i_op = &zonefs_dir_inode_operations; + inode->i_fop = &simple_dir_operations; +@@ -573,79 +603,34 @@ static const struct inode_operations zonefs_file_inode_operations = { + .setattr = zonefs_inode_setattr, + }; + +-static int zonefs_init_file_inode(struct inode *inode, struct blk_zone *zone, +- enum zonefs_ztype type) ++static void zonefs_init_file_inode(struct inode *inode, ++ struct zonefs_zone *z) + { + struct super_block *sb = inode->i_sb; + struct zonefs_sb_info *sbi = ZONEFS_SB(sb); +- struct zonefs_inode_info *zi = ZONEFS_I(inode); +- int ret = 0; +- +- inode->i_ino = zone->start >> sbi->s_zone_sectors_shift; +- inode->i_mode = S_IFREG | sbi->s_perm; + +- if (type == ZONEFS_ZTYPE_CNV) +- zi->i_flags |= ZONEFS_ZONE_CNV; +- +- zi->i_zsector = zone->start; +- zi->i_zone_size = zone->len << SECTOR_SHIFT; +- if (zi->i_zone_size > bdev_zone_sectors(sb->s_bdev) << SECTOR_SHIFT && +- !(sbi->s_features & ZONEFS_F_AGGRCNV)) { +- zonefs_err(sb, +- "zone size %llu doesn't match device's zone sectors %llu\n", +- zi->i_zone_size, +- bdev_zone_sectors(sb->s_bdev) << SECTOR_SHIFT); +- return -EINVAL; +- } +- +- zi->i_max_size = min_t(loff_t, MAX_LFS_FILESIZE, +- zone->capacity << SECTOR_SHIFT); +- zi->i_wpoffset = zonefs_check_zone_condition(inode, zone); ++ inode->i_private = z; + ++ inode->i_ino = z->z_sector >> sbi->s_zone_sectors_shift; ++ inode->i_mode = S_IFREG | sbi->s_perm; + inode->i_uid = sbi->s_uid; + inode->i_gid = sbi->s_gid; +- inode->i_size = zi->i_wpoffset; +- inode->i_blocks = zi->i_max_size >> SECTOR_SHIFT; ++ inode->i_size = z->z_wpoffset; ++ inode->i_blocks = z->z_capacity >> SECTOR_SHIFT; + + inode->i_op = &zonefs_file_inode_operations; + inode->i_fop = &zonefs_file_operations; + inode->i_mapping->a_ops = &zonefs_file_aops; + + /* Update the inode access rights depending on the zone condition */ +- 
zi->i_flags |= ZONEFS_ZONE_INIT_MODE; ++ z->z_flags |= ZONEFS_ZONE_INIT_MODE; + zonefs_inode_update_mode(inode); +- +- sb->s_maxbytes = max(zi->i_max_size, sb->s_maxbytes); +- sbi->s_blocks += zi->i_max_size >> sb->s_blocksize_bits; +- sbi->s_used_blocks += zi->i_wpoffset >> sb->s_blocksize_bits; +- +- mutex_lock(&zi->i_truncate_mutex); +- +- /* +- * For sequential zones, make sure that any open zone is closed first +- * to ensure that the initial number of open zones is 0, in sync with +- * the open zone accounting done when the mount option +- * ZONEFS_MNTOPT_EXPLICIT_OPEN is used. +- */ +- if (type == ZONEFS_ZTYPE_SEQ && +- (zone->cond == BLK_ZONE_COND_IMP_OPEN || +- zone->cond == BLK_ZONE_COND_EXP_OPEN)) { +- ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_CLOSE); +- if (ret) +- goto unlock; +- } +- +- zonefs_account_active(inode); +- +-unlock: +- mutex_unlock(&zi->i_truncate_mutex); +- +- return ret; + } + + static struct dentry *zonefs_create_inode(struct dentry *parent, +- const char *name, struct blk_zone *zone, +- enum zonefs_ztype type) ++ const char *name, ++ struct zonefs_zone *z, ++ enum zonefs_ztype ztype) + { + struct inode *dir = d_inode(parent); + struct dentry *dentry; +@@ -661,15 +646,10 @@ static struct dentry *zonefs_create_inode(struct dentry *parent, + goto dput; + + inode->i_ctime = inode->i_mtime = inode->i_atime = dir->i_ctime; +- if (zone) { +- ret = zonefs_init_file_inode(inode, zone, type); +- if (ret) { +- iput(inode); +- goto dput; +- } +- } else { +- zonefs_init_dir_inode(dir, inode, type); +- } ++ if (z) ++ zonefs_init_file_inode(inode, z); ++ else ++ zonefs_init_dir_inode(dir, inode, ztype); + + d_add(dentry, inode); + dir->i_size++; +@@ -685,100 +665,51 @@ static struct dentry *zonefs_create_inode(struct dentry *parent, + struct zonefs_zone_data { + struct super_block *sb; + unsigned int nr_zones[ZONEFS_ZTYPE_MAX]; ++ sector_t cnv_zone_start; + struct blk_zone *zones; + }; + + /* +- * Create a zone group and populate it with zone files. ++ * Create the inodes for a zone group. + */ +-static int zonefs_create_zgroup(struct zonefs_zone_data *zd, +- enum zonefs_ztype type) ++static int zonefs_create_zgroup_inodes(struct super_block *sb, ++ enum zonefs_ztype ztype) + { +- struct super_block *sb = zd->sb; + struct zonefs_sb_info *sbi = ZONEFS_SB(sb); +- struct blk_zone *zone, *next, *end; +- const char *zgroup_name; +- char *file_name; ++ struct zonefs_zone_group *zgroup = &sbi->s_zgroup[ztype]; + struct dentry *dir, *dent; +- unsigned int n = 0; +- int ret; ++ char *file_name; ++ int i, ret = 0; ++ ++ if (!zgroup) ++ return -ENOMEM; + + /* If the group is empty, there is nothing to do */ +- if (!zd->nr_zones[type]) ++ if (!zgroup->g_nr_zones) + return 0; + + file_name = kmalloc(ZONEFS_NAME_MAX, GFP_KERNEL); + if (!file_name) + return -ENOMEM; + +- if (type == ZONEFS_ZTYPE_CNV) +- zgroup_name = "cnv"; +- else +- zgroup_name = "seq"; +- +- dir = zonefs_create_inode(sb->s_root, zgroup_name, NULL, type); ++ dir = zonefs_create_inode(sb->s_root, zonefs_zgroup_name(ztype), ++ NULL, ztype); + if (IS_ERR(dir)) { + ret = PTR_ERR(dir); + goto free; + } + +- /* +- * The first zone contains the super block: skip it. +- */ +- end = zd->zones + bdev_nr_zones(sb->s_bdev); +- for (zone = &zd->zones[1]; zone < end; zone = next) { +- +- next = zone + 1; +- if (zonefs_zone_type(zone) != type) +- continue; +- +- /* +- * For conventional zones, contiguous zones can be aggregated +- * together to form larger files. 
Note that this overwrites the +- * length of the first zone of the set of contiguous zones +- * aggregated together. If one offline or read-only zone is +- * found, assume that all zones aggregated have the same +- * condition. +- */ +- if (type == ZONEFS_ZTYPE_CNV && +- (sbi->s_features & ZONEFS_F_AGGRCNV)) { +- for (; next < end; next++) { +- if (zonefs_zone_type(next) != type) +- break; +- zone->len += next->len; +- zone->capacity += next->capacity; +- if (next->cond == BLK_ZONE_COND_READONLY && +- zone->cond != BLK_ZONE_COND_OFFLINE) +- zone->cond = BLK_ZONE_COND_READONLY; +- else if (next->cond == BLK_ZONE_COND_OFFLINE) +- zone->cond = BLK_ZONE_COND_OFFLINE; +- } +- if (zone->capacity != zone->len) { +- zonefs_err(sb, "Invalid conventional zone capacity\n"); +- ret = -EINVAL; +- goto free; +- } +- } +- +- /* +- * Use the file number within its group as file name. +- */ +- snprintf(file_name, ZONEFS_NAME_MAX - 1, "%u", n); +- dent = zonefs_create_inode(dir, file_name, zone, type); ++ for (i = 0; i < zgroup->g_nr_zones; i++) { ++ /* Use the zone number within its group as the file name */ ++ snprintf(file_name, ZONEFS_NAME_MAX - 1, "%u", i); ++ dent = zonefs_create_inode(dir, file_name, ++ &zgroup->g_zones[i], ztype); + if (IS_ERR(dent)) { + ret = PTR_ERR(dent); +- goto free; ++ break; + } +- +- n++; + } + +- zonefs_info(sb, "Zone group \"%s\" has %u file%s\n", +- zgroup_name, n, n > 1 ? "s" : ""); +- +- sbi->s_nr_files[type] = n; +- ret = 0; +- + free: + kfree(file_name); + +@@ -789,21 +720,38 @@ static int zonefs_get_zone_info_cb(struct blk_zone *zone, unsigned int idx, + void *data) + { + struct zonefs_zone_data *zd = data; ++ struct super_block *sb = zd->sb; ++ struct zonefs_sb_info *sbi = ZONEFS_SB(sb); ++ ++ /* ++ * We do not care about the first zone: it contains the super block ++ * and not exposed as a file. ++ */ ++ if (!idx) ++ return 0; + + /* +- * Count the number of usable zones: the first zone at index 0 contains +- * the super block and is ignored. ++ * Count the number of zones that will be exposed as files. ++ * For sequential zones, we always have as many files as zones. ++ * FOr conventional zones, the number of files depends on if we have ++ * conventional zones aggregation enabled. + */ + switch (zone->type) { + case BLK_ZONE_TYPE_CONVENTIONAL: +- zone->wp = zone->start + zone->len; +- if (idx) +- zd->nr_zones[ZONEFS_ZTYPE_CNV]++; ++ if (sbi->s_features & ZONEFS_F_AGGRCNV) { ++ /* One file per set of contiguous conventional zones */ ++ if (!(sbi->s_zgroup[ZONEFS_ZTYPE_CNV].g_nr_zones) || ++ zone->start != zd->cnv_zone_start) ++ sbi->s_zgroup[ZONEFS_ZTYPE_CNV].g_nr_zones++; ++ zd->cnv_zone_start = zone->start + zone->len; ++ } else { ++ /* One file per zone */ ++ sbi->s_zgroup[ZONEFS_ZTYPE_CNV].g_nr_zones++; ++ } + break; + case BLK_ZONE_TYPE_SEQWRITE_REQ: + case BLK_ZONE_TYPE_SEQWRITE_PREF: +- if (idx) +- zd->nr_zones[ZONEFS_ZTYPE_SEQ]++; ++ sbi->s_zgroup[ZONEFS_ZTYPE_SEQ].g_nr_zones++; + break; + default: + zonefs_err(zd->sb, "Unsupported zone type 0x%x\n", +@@ -843,11 +791,173 @@ static int zonefs_get_zone_info(struct zonefs_zone_data *zd) + return 0; + } + +-static inline void zonefs_cleanup_zone_info(struct zonefs_zone_data *zd) ++static inline void zonefs_free_zone_info(struct zonefs_zone_data *zd) + { + kvfree(zd->zones); + } + ++/* ++ * Create a zone group and populate it with zone files. 
++ */ ++static int zonefs_init_zgroup(struct super_block *sb, ++ struct zonefs_zone_data *zd, ++ enum zonefs_ztype ztype) ++{ ++ struct zonefs_sb_info *sbi = ZONEFS_SB(sb); ++ struct zonefs_zone_group *zgroup = &sbi->s_zgroup[ztype]; ++ struct blk_zone *zone, *next, *end; ++ struct zonefs_zone *z; ++ unsigned int n = 0; ++ int ret; ++ ++ /* Allocate the zone group. If it is empty, we have nothing to do. */ ++ if (!zgroup->g_nr_zones) ++ return 0; ++ ++ zgroup->g_zones = kvcalloc(zgroup->g_nr_zones, ++ sizeof(struct zonefs_zone), GFP_KERNEL); ++ if (!zgroup->g_zones) ++ return -ENOMEM; ++ ++ /* ++ * Initialize the zone groups using the device zone information. ++ * We always skip the first zone as it contains the super block ++ * and is not use to back a file. ++ */ ++ end = zd->zones + bdev_nr_zones(sb->s_bdev); ++ for (zone = &zd->zones[1]; zone < end; zone = next) { ++ ++ next = zone + 1; ++ if (zonefs_zone_type(zone) != ztype) ++ continue; ++ ++ if (WARN_ON_ONCE(n >= zgroup->g_nr_zones)) ++ return -EINVAL; ++ ++ /* ++ * For conventional zones, contiguous zones can be aggregated ++ * together to form larger files. Note that this overwrites the ++ * length of the first zone of the set of contiguous zones ++ * aggregated together. If one offline or read-only zone is ++ * found, assume that all zones aggregated have the same ++ * condition. ++ */ ++ if (ztype == ZONEFS_ZTYPE_CNV && ++ (sbi->s_features & ZONEFS_F_AGGRCNV)) { ++ for (; next < end; next++) { ++ if (zonefs_zone_type(next) != ztype) ++ break; ++ zone->len += next->len; ++ zone->capacity += next->capacity; ++ if (next->cond == BLK_ZONE_COND_READONLY && ++ zone->cond != BLK_ZONE_COND_OFFLINE) ++ zone->cond = BLK_ZONE_COND_READONLY; ++ else if (next->cond == BLK_ZONE_COND_OFFLINE) ++ zone->cond = BLK_ZONE_COND_OFFLINE; ++ } ++ } ++ ++ z = &zgroup->g_zones[n]; ++ if (ztype == ZONEFS_ZTYPE_CNV) ++ z->z_flags |= ZONEFS_ZONE_CNV; ++ z->z_sector = zone->start; ++ z->z_size = zone->len << SECTOR_SHIFT; ++ if (z->z_size > bdev_zone_sectors(sb->s_bdev) << SECTOR_SHIFT && ++ !(sbi->s_features & ZONEFS_F_AGGRCNV)) { ++ zonefs_err(sb, ++ "Invalid zone size %llu (device zone sectors %llu)\n", ++ z->z_size, ++ bdev_zone_sectors(sb->s_bdev) << SECTOR_SHIFT); ++ return -EINVAL; ++ } ++ ++ z->z_capacity = min_t(loff_t, MAX_LFS_FILESIZE, ++ zone->capacity << SECTOR_SHIFT); ++ z->z_wpoffset = zonefs_check_zone_condition(sb, z, zone); ++ ++ sb->s_maxbytes = max(z->z_capacity, sb->s_maxbytes); ++ sbi->s_blocks += z->z_capacity >> sb->s_blocksize_bits; ++ sbi->s_used_blocks += z->z_wpoffset >> sb->s_blocksize_bits; ++ ++ /* ++ * For sequential zones, make sure that any open zone is closed ++ * first to ensure that the initial number of open zones is 0, ++ * in sync with the open zone accounting done when the mount ++ * option ZONEFS_MNTOPT_EXPLICIT_OPEN is used. ++ */ ++ if (ztype == ZONEFS_ZTYPE_SEQ && ++ (zone->cond == BLK_ZONE_COND_IMP_OPEN || ++ zone->cond == BLK_ZONE_COND_EXP_OPEN)) { ++ ret = zonefs_zone_mgmt(sb, z, REQ_OP_ZONE_CLOSE); ++ if (ret) ++ return ret; ++ } ++ ++ zonefs_account_active(sb, z); ++ ++ n++; ++ } ++ ++ if (WARN_ON_ONCE(n != zgroup->g_nr_zones)) ++ return -EINVAL; ++ ++ zonefs_info(sb, "Zone group \"%s\" has %u file%s\n", ++ zonefs_zgroup_name(ztype), ++ zgroup->g_nr_zones, ++ zgroup->g_nr_zones > 1 ? 
"s" : ""); ++ ++ return 0; ++} ++ ++static void zonefs_free_zgroups(struct super_block *sb) ++{ ++ struct zonefs_sb_info *sbi = ZONEFS_SB(sb); ++ enum zonefs_ztype ztype; ++ ++ if (!sbi) ++ return; ++ ++ for (ztype = 0; ztype < ZONEFS_ZTYPE_MAX; ztype++) { ++ kvfree(sbi->s_zgroup[ztype].g_zones); ++ sbi->s_zgroup[ztype].g_zones = NULL; ++ } ++} ++ ++/* ++ * Create a zone group and populate it with zone files. ++ */ ++static int zonefs_init_zgroups(struct super_block *sb) ++{ ++ struct zonefs_zone_data zd; ++ enum zonefs_ztype ztype; ++ int ret; ++ ++ /* First get the device zone information */ ++ memset(&zd, 0, sizeof(struct zonefs_zone_data)); ++ zd.sb = sb; ++ ret = zonefs_get_zone_info(&zd); ++ if (ret) ++ goto cleanup; ++ ++ /* Allocate and initialize the zone groups */ ++ for (ztype = 0; ztype < ZONEFS_ZTYPE_MAX; ztype++) { ++ ret = zonefs_init_zgroup(sb, &zd, ztype); ++ if (ret) { ++ zonefs_info(sb, ++ "Zone group \"%s\" initialization failed\n", ++ zonefs_zgroup_name(ztype)); ++ break; ++ } ++ } ++ ++cleanup: ++ zonefs_free_zone_info(&zd); ++ if (ret) ++ zonefs_free_zgroups(sb); ++ ++ return ret; ++} ++ + /* + * Read super block information from the device. + */ +@@ -945,7 +1055,6 @@ static const struct super_operations zonefs_sops = { + */ + static int zonefs_fill_super(struct super_block *sb, void *data, int silent) + { +- struct zonefs_zone_data zd; + struct zonefs_sb_info *sbi; + struct inode *inode; + enum zonefs_ztype t; +@@ -998,16 +1107,6 @@ static int zonefs_fill_super(struct super_block *sb, void *data, int silent) + if (ret) + return ret; + +- memset(&zd, 0, sizeof(struct zonefs_zone_data)); +- zd.sb = sb; +- ret = zonefs_get_zone_info(&zd); +- if (ret) +- goto cleanup; +- +- ret = zonefs_sysfs_register(sb); +- if (ret) +- goto cleanup; +- + zonefs_info(sb, "Mounting %u zones", bdev_nr_zones(sb->s_bdev)); + + if (!sbi->s_max_wro_seq_files && +@@ -1018,6 +1117,11 @@ static int zonefs_fill_super(struct super_block *sb, void *data, int silent) + sbi->s_mount_opts &= ~ZONEFS_MNTOPT_EXPLICIT_OPEN; + } + ++ /* Initialize the zone groups */ ++ ret = zonefs_init_zgroups(sb); ++ if (ret) ++ goto cleanup; ++ + /* Create root directory inode */ + ret = -ENOMEM; + inode = new_inode(sb); +@@ -1037,13 +1141,19 @@ static int zonefs_fill_super(struct super_block *sb, void *data, int silent) + + /* Create and populate files in zone groups directories */ + for (t = 0; t < ZONEFS_ZTYPE_MAX; t++) { +- ret = zonefs_create_zgroup(&zd, t); ++ ret = zonefs_create_zgroup_inodes(sb, t); + if (ret) +- break; ++ goto cleanup; + } + ++ ret = zonefs_sysfs_register(sb); ++ if (ret) ++ goto cleanup; ++ ++ return 0; ++ + cleanup: +- zonefs_cleanup_zone_info(&zd); ++ zonefs_free_zgroups(sb); + + return ret; + } +@@ -1062,6 +1172,7 @@ static void zonefs_kill_super(struct super_block *sb) + d_genocide(sb->s_root); + + zonefs_sysfs_unregister(sb); ++ zonefs_free_zgroups(sb); + kill_block_super(sb); + kfree(sbi); + } +diff --git a/fs/zonefs/trace.h b/fs/zonefs/trace.h +index 42edcfd393ed2..9969db3a9c7dc 100644 +--- a/fs/zonefs/trace.h ++++ b/fs/zonefs/trace.h +@@ -20,8 +20,9 @@ + #define show_dev(dev) MAJOR(dev), MINOR(dev) + + TRACE_EVENT(zonefs_zone_mgmt, +- TP_PROTO(struct inode *inode, enum req_op op), +- TP_ARGS(inode, op), ++ TP_PROTO(struct super_block *sb, struct zonefs_zone *z, ++ enum req_op op), ++ TP_ARGS(sb, z, op), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(ino_t, ino) +@@ -30,12 +31,12 @@ TRACE_EVENT(zonefs_zone_mgmt, + __field(sector_t, nr_sectors) + ), + TP_fast_assign( +- 
__entry->dev = inode->i_sb->s_dev; +- __entry->ino = inode->i_ino; ++ __entry->dev = sb->s_dev; ++ __entry->ino = ++ z->z_sector >> ZONEFS_SB(sb)->s_zone_sectors_shift; + __entry->op = op; +- __entry->sector = ZONEFS_I(inode)->i_zsector; +- __entry->nr_sectors = +- ZONEFS_I(inode)->i_zone_size >> SECTOR_SHIFT; ++ __entry->sector = z->z_sector; ++ __entry->nr_sectors = z->z_size >> SECTOR_SHIFT; + ), + TP_printk("bdev=(%d,%d), ino=%lu op=%s, sector=%llu, nr_sectors=%llu", + show_dev(__entry->dev), (unsigned long)__entry->ino, +@@ -58,9 +59,10 @@ TRACE_EVENT(zonefs_file_dio_append, + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; +- __entry->sector = ZONEFS_I(inode)->i_zsector; ++ __entry->sector = zonefs_inode_zone(inode)->z_sector; + __entry->size = size; +- __entry->wpoffset = ZONEFS_I(inode)->i_wpoffset; ++ __entry->wpoffset = ++ zonefs_inode_zone(inode)->z_wpoffset; + __entry->ret = ret; + ), + TP_printk("bdev=(%d, %d), ino=%lu, sector=%llu, size=%zu, wpoffset=%llu, ret=%zu", +diff --git a/fs/zonefs/zonefs.h b/fs/zonefs/zonefs.h +index 1a225f74015a0..2d626e18b1411 100644 +--- a/fs/zonefs/zonefs.h ++++ b/fs/zonefs/zonefs.h +@@ -47,22 +47,39 @@ static inline enum zonefs_ztype zonefs_zone_type(struct blk_zone *zone) + #define ZONEFS_ZONE_CNV (1U << 31) + + /* +- * In-memory inode data. ++ * In-memory per-file inode zone data. + */ +-struct zonefs_inode_info { +- struct inode i_vnode; ++struct zonefs_zone { ++ /* Zone state flags */ ++ unsigned int z_flags; + +- /* File zone start sector (512B unit) */ +- sector_t i_zsector; ++ /* Zone start sector (512B unit) */ ++ sector_t z_sector; + +- /* File zone write pointer position (sequential zones only) */ +- loff_t i_wpoffset; ++ /* Zone size (bytes) */ ++ loff_t z_size; + +- /* File maximum size */ +- loff_t i_max_size; ++ /* Zone capacity (file maximum size, bytes) */ ++ loff_t z_capacity; + +- /* File zone size */ +- loff_t i_zone_size; ++ /* Write pointer offset in the zone (sequential zones only, bytes) */ ++ loff_t z_wpoffset; ++}; ++ ++/* ++ * In memory zone group information: all zones of a group are exposed ++ * as files, one file per zone. ++ */ ++struct zonefs_zone_group { ++ unsigned int g_nr_zones; ++ struct zonefs_zone *g_zones; ++}; ++ ++/* ++ * In-memory inode data. 
++ */ ++struct zonefs_inode_info { ++ struct inode i_vnode; + + /* + * To serialise fully against both syscall and mmap based IO and +@@ -81,7 +98,6 @@ struct zonefs_inode_info { + + /* guarded by i_truncate_mutex */ + unsigned int i_wr_refcnt; +- unsigned int i_flags; + }; + + static inline struct zonefs_inode_info *ZONEFS_I(struct inode *inode) +@@ -89,24 +105,29 @@ static inline struct zonefs_inode_info *ZONEFS_I(struct inode *inode) + return container_of(inode, struct zonefs_inode_info, i_vnode); + } + +-static inline bool zonefs_zone_is_cnv(struct zonefs_inode_info *zi) ++static inline bool zonefs_zone_is_cnv(struct zonefs_zone *z) ++{ ++ return z->z_flags & ZONEFS_ZONE_CNV; ++} ++ ++static inline bool zonefs_zone_is_seq(struct zonefs_zone *z) + { +- return zi->i_flags & ZONEFS_ZONE_CNV; ++ return !zonefs_zone_is_cnv(z); + } + +-static inline bool zonefs_zone_is_seq(struct zonefs_inode_info *zi) ++static inline struct zonefs_zone *zonefs_inode_zone(struct inode *inode) + { +- return !zonefs_zone_is_cnv(zi); ++ return inode->i_private; + } + + static inline bool zonefs_inode_is_cnv(struct inode *inode) + { +- return zonefs_zone_is_cnv(ZONEFS_I(inode)); ++ return zonefs_zone_is_cnv(zonefs_inode_zone(inode)); + } + + static inline bool zonefs_inode_is_seq(struct inode *inode) + { +- return zonefs_zone_is_seq(ZONEFS_I(inode)); ++ return zonefs_zone_is_seq(zonefs_inode_zone(inode)); + } + + /* +@@ -200,7 +221,7 @@ struct zonefs_sb_info { + uuid_t s_uuid; + unsigned int s_zone_sectors_shift; + +- unsigned int s_nr_files[ZONEFS_ZTYPE_MAX]; ++ struct zonefs_zone_group s_zgroup[ZONEFS_ZTYPE_MAX]; + + loff_t s_blocks; + loff_t s_used_blocks; +@@ -229,8 +250,8 @@ static inline struct zonefs_sb_info *ZONEFS_SB(struct super_block *sb) + pr_warn("zonefs (%s) WARNING: " format, sb->s_id, ## args) + + /* In super.c */ +-void zonefs_account_active(struct inode *inode); +-int zonefs_zone_mgmt(struct inode *inode, enum req_op op); ++void zonefs_inode_account_active(struct inode *inode); ++int zonefs_inode_zone_mgmt(struct inode *inode, enum req_op op); + void zonefs_i_size_write(struct inode *inode, loff_t isize); + void zonefs_update_stats(struct inode *inode, loff_t new_isize); + void __zonefs_io_error(struct inode *inode, bool write); +-- +2.39.2 + diff --git a/queue-6.2/zonefs-simplify-io-error-handling.patch b/queue-6.2/zonefs-simplify-io-error-handling.patch new file mode 100644 index 00000000000..413df010c69 --- /dev/null +++ b/queue-6.2/zonefs-simplify-io-error-handling.patch @@ -0,0 +1,244 @@ +From c0aa18f0ded6555cb0c0a9063a9295abb942e405 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 25 Nov 2022 11:06:20 +0900 +Subject: zonefs: Simplify IO error handling + +From: Damien Le Moal + +[ Upstream commit 46a9c526eef7fb68a00321e2a9591ce5276ae92b ] + +Simplify zonefs_check_zone_condition() by moving the code that changes +an inode access rights to the new function zonefs_inode_update_mode(). +Furthermore, since on mount an inode wpoffset is always zero when +zonefs_check_zone_condition() is called during an inode initialization, +the "mount" boolean argument is not necessary for the readonly zone +case. This argument is thus removed. + +zonefs_io_error_cb() is also modified to use the inode offline and +zone state flags instead of checking the device zone condition. The +multiple calls to zonefs_check_zone_condition() are reduced to the first +call on entry, which allows removing the "warn" argument. 
+zonefs_inode_update_mode() is also used to update an inode access rights +as zonefs_io_error_cb() modifies the inode flags depending on the volume +error handling mode (defined with a mount option). Since an inode mode +change differs for read-only zones between mount time and IO error time, +the flag ZONEFS_ZONE_INIT_MODE is used to differentiate both cases. + +Signed-off-by: Damien Le Moal +Reviewed-by: Johannes Thumshirn +Stable-dep-of: 88b170088ad2 ("zonefs: Fix error message in zonefs_file_dio_append()") +Signed-off-by: Sasha Levin +--- + fs/zonefs/super.c | 110 ++++++++++++++++++++++++--------------------- + fs/zonefs/zonefs.h | 9 ++-- + 2 files changed, 64 insertions(+), 55 deletions(-) + +diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c +index e808276b88018..6307cc95be061 100644 +--- a/fs/zonefs/super.c ++++ b/fs/zonefs/super.c +@@ -155,48 +155,31 @@ void zonefs_update_stats(struct inode *inode, loff_t new_isize) + * amount of readable data in the zone. + */ + static loff_t zonefs_check_zone_condition(struct inode *inode, +- struct blk_zone *zone, bool warn, +- bool mount) ++ struct blk_zone *zone) + { + struct zonefs_inode_info *zi = ZONEFS_I(inode); + + switch (zone->cond) { + case BLK_ZONE_COND_OFFLINE: +- /* +- * Dead zone: make the inode immutable, disable all accesses +- * and set the file size to 0 (zone wp set to zone start). +- */ +- if (warn) +- zonefs_warn(inode->i_sb, "inode %lu: offline zone\n", +- inode->i_ino); +- inode->i_flags |= S_IMMUTABLE; +- inode->i_mode &= ~0777; +- zone->wp = zone->start; ++ zonefs_warn(inode->i_sb, "inode %lu: offline zone\n", ++ inode->i_ino); + zi->i_flags |= ZONEFS_ZONE_OFFLINE; + return 0; + case BLK_ZONE_COND_READONLY: + /* +- * The write pointer of read-only zones is invalid. If such a +- * zone is found during mount, the file size cannot be retrieved +- * so we treat the zone as offline (mount == true case). +- * Otherwise, keep the file size as it was when last updated +- * so that the user can recover data. In both cases, writes are +- * always disabled for the zone. ++ * The write pointer of read-only zones is invalid, so we cannot ++ * determine the zone wpoffset (inode size). We thus keep the ++ * zone wpoffset as is, which leads to an empty file ++ * (wpoffset == 0) on mount. For a runtime error, this keeps ++ * the inode size as it was when last updated so that the user ++ * can recover data. + */ +- if (warn) +- zonefs_warn(inode->i_sb, "inode %lu: read-only zone\n", +- inode->i_ino); +- inode->i_flags |= S_IMMUTABLE; +- if (mount) { +- zone->cond = BLK_ZONE_COND_OFFLINE; +- inode->i_mode &= ~0777; +- zone->wp = zone->start; +- zi->i_flags |= ZONEFS_ZONE_OFFLINE; +- return 0; +- } ++ zonefs_warn(inode->i_sb, "inode %lu: read-only zone\n", ++ inode->i_ino); + zi->i_flags |= ZONEFS_ZONE_READONLY; +- inode->i_mode &= ~0222; +- return i_size_read(inode); ++ if (zi->i_ztype == ZONEFS_ZTYPE_CNV) ++ return zi->i_max_size; ++ return zi->i_wpoffset; + case BLK_ZONE_COND_FULL: + /* The write pointer of full zones is invalid. */ + return zi->i_max_size; +@@ -207,6 +190,30 @@ static loff_t zonefs_check_zone_condition(struct inode *inode, + } + } + ++/* ++ * Check a zone condition and adjust its inode access permissions for ++ * offline and readonly zones. 
++ */ ++static void zonefs_inode_update_mode(struct inode *inode) ++{ ++ struct zonefs_inode_info *zi = ZONEFS_I(inode); ++ ++ if (zi->i_flags & ZONEFS_ZONE_OFFLINE) { ++ /* Offline zones cannot be read nor written */ ++ inode->i_flags |= S_IMMUTABLE; ++ inode->i_mode &= ~0777; ++ } else if (zi->i_flags & ZONEFS_ZONE_READONLY) { ++ /* Readonly zones cannot be written */ ++ inode->i_flags |= S_IMMUTABLE; ++ if (zi->i_flags & ZONEFS_ZONE_INIT_MODE) ++ inode->i_mode &= ~0777; ++ else ++ inode->i_mode &= ~0222; ++ } ++ ++ zi->i_flags &= ~ZONEFS_ZONE_INIT_MODE; ++} ++ + struct zonefs_ioerr_data { + struct inode *inode; + bool write; +@@ -228,10 +235,9 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx, + * as there is no inconsistency between the inode size and the amount of + * data writen in the zone (data_size). + */ +- data_size = zonefs_check_zone_condition(inode, zone, true, false); ++ data_size = zonefs_check_zone_condition(inode, zone); + isize = i_size_read(inode); +- if (zone->cond != BLK_ZONE_COND_OFFLINE && +- zone->cond != BLK_ZONE_COND_READONLY && ++ if (!(zi->i_flags & (ZONEFS_ZONE_READONLY | ZONEFS_ZONE_OFFLINE)) && + !err->write && isize == data_size) + return 0; + +@@ -264,24 +270,22 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx, + * zone condition to read-only and offline respectively, as if the + * condition was signaled by the hardware. + */ +- if (zone->cond == BLK_ZONE_COND_OFFLINE || +- sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_ZOL) { ++ if ((zi->i_flags & ZONEFS_ZONE_OFFLINE) || ++ (sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_ZOL)) { + zonefs_warn(sb, "inode %lu: read/write access disabled\n", + inode->i_ino); +- if (zone->cond != BLK_ZONE_COND_OFFLINE) { +- zone->cond = BLK_ZONE_COND_OFFLINE; +- data_size = zonefs_check_zone_condition(inode, zone, +- false, false); +- } +- } else if (zone->cond == BLK_ZONE_COND_READONLY || +- sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_ZRO) { ++ if (!(zi->i_flags & ZONEFS_ZONE_OFFLINE)) ++ zi->i_flags |= ZONEFS_ZONE_OFFLINE; ++ zonefs_inode_update_mode(inode); ++ data_size = 0; ++ } else if ((zi->i_flags & ZONEFS_ZONE_READONLY) || ++ (sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_ZRO)) { + zonefs_warn(sb, "inode %lu: write access disabled\n", + inode->i_ino); +- if (zone->cond != BLK_ZONE_COND_READONLY) { +- zone->cond = BLK_ZONE_COND_READONLY; +- data_size = zonefs_check_zone_condition(inode, zone, +- false, false); +- } ++ if (!(zi->i_flags & ZONEFS_ZONE_READONLY)) ++ zi->i_flags |= ZONEFS_ZONE_READONLY; ++ zonefs_inode_update_mode(inode); ++ data_size = isize; + } else if (sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_RO && + data_size > isize) { + /* Do not expose garbage data */ +@@ -295,8 +299,7 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx, + * close of the zone when the inode file is closed. 
+ */ + if ((sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN) && +- (zone->cond == BLK_ZONE_COND_OFFLINE || +- zone->cond == BLK_ZONE_COND_READONLY)) ++ (zi->i_flags & (ZONEFS_ZONE_READONLY | ZONEFS_ZONE_OFFLINE))) + zi->i_flags &= ~ZONEFS_ZONE_OPEN; + + /* +@@ -378,6 +381,7 @@ static struct inode *zonefs_alloc_inode(struct super_block *sb) + + inode_init_once(&zi->i_vnode); + mutex_init(&zi->i_truncate_mutex); ++ zi->i_wpoffset = 0; + zi->i_wr_refcnt = 0; + zi->i_flags = 0; + +@@ -594,7 +598,7 @@ static int zonefs_init_file_inode(struct inode *inode, struct blk_zone *zone, + + zi->i_max_size = min_t(loff_t, MAX_LFS_FILESIZE, + zone->capacity << SECTOR_SHIFT); +- zi->i_wpoffset = zonefs_check_zone_condition(inode, zone, true, true); ++ zi->i_wpoffset = zonefs_check_zone_condition(inode, zone); + + inode->i_uid = sbi->s_uid; + inode->i_gid = sbi->s_gid; +@@ -605,6 +609,10 @@ static int zonefs_init_file_inode(struct inode *inode, struct blk_zone *zone, + inode->i_fop = &zonefs_file_operations; + inode->i_mapping->a_ops = &zonefs_file_aops; + ++ /* Update the inode access rights depending on the zone condition */ ++ zi->i_flags |= ZONEFS_ZONE_INIT_MODE; ++ zonefs_inode_update_mode(inode); ++ + sb->s_maxbytes = max(zi->i_max_size, sb->s_maxbytes); + sbi->s_blocks += zi->i_max_size >> sb->s_blocksize_bits; + sbi->s_used_blocks += zi->i_wpoffset >> sb->s_blocksize_bits; +diff --git a/fs/zonefs/zonefs.h b/fs/zonefs/zonefs.h +index 839ebe9afb6c1..439096445ee53 100644 +--- a/fs/zonefs/zonefs.h ++++ b/fs/zonefs/zonefs.h +@@ -39,10 +39,11 @@ static inline enum zonefs_ztype zonefs_zone_type(struct blk_zone *zone) + return ZONEFS_ZTYPE_SEQ; + } + +-#define ZONEFS_ZONE_OPEN (1U << 0) +-#define ZONEFS_ZONE_ACTIVE (1U << 1) +-#define ZONEFS_ZONE_OFFLINE (1U << 2) +-#define ZONEFS_ZONE_READONLY (1U << 3) ++#define ZONEFS_ZONE_INIT_MODE (1U << 0) ++#define ZONEFS_ZONE_OPEN (1U << 1) ++#define ZONEFS_ZONE_ACTIVE (1U << 2) ++#define ZONEFS_ZONE_OFFLINE (1U << 3) ++#define ZONEFS_ZONE_READONLY (1U << 4) + + /* + * In-memory inode data. +-- +2.39.2 +
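
The permission handling that the last patch above concentrates in zonefs_inode_update_mode() can be modelled outside the kernel roughly as follows (the flag values follow the fs/zonefs/zonefs.h hunk above; the standalone function, the bare octal mode word and the sample 0640 mode are only illustrative, and the S_IMMUTABLE handling is left out):

#include <stdio.h>

#define ZONEFS_ZONE_INIT_MODE	(1U << 0)
#define ZONEFS_ZONE_OFFLINE	(1U << 3)
#define ZONEFS_ZONE_READONLY	(1U << 4)

/* Return the adjusted access mode for a zone file, given its state flags. */
static unsigned int zone_access_mode(unsigned int z_flags, unsigned int mode)
{
	if (z_flags & ZONEFS_ZONE_OFFLINE) {
		/* Offline zones can neither be read nor written. */
		mode &= ~0777u;
	} else if (z_flags & ZONEFS_ZONE_READONLY) {
		/* At mount time all access is dropped, at runtime only writes. */
		if (z_flags & ZONEFS_ZONE_INIT_MODE)
			mode &= ~0777u;
		else
			mode &= ~0222u;
	}
	return mode;
}

int main(void)
{
	printf("runtime read-only zone:    %04o\n",
	       zone_access_mode(ZONEFS_ZONE_READONLY, 0640));
	printf("mount-time read-only zone: %04o\n",
	       zone_access_mode(ZONEFS_ZONE_READONLY | ZONEFS_ZONE_INIT_MODE, 0640));
	printf("offline zone:              %04o\n",
	       zone_access_mode(ZONEFS_ZONE_OFFLINE, 0640));
	return 0;
}

A read-only zone found at mount time loses all access bits because its write pointer, and therefore the file size, cannot be trusted, whereas a zone that turns read-only at runtime only loses its write bits so that already-written data stays readable, matching the zonefs_check_zone_condition() comments in the patch.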