git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
Fixes for 6.2
author     Sasha Levin <sashal@kernel.org>
           Thu, 30 Mar 2023 11:12:54 +0000 (07:12 -0400)
committer  Sasha Levin <sashal@kernel.org>
           Thu, 30 Mar 2023 11:12:54 +0000 (07:12 -0400)
Signed-off-by: Sasha Levin <sashal@kernel.org>
14 files changed:
queue-6.2/btrfs-rename-btrfs_fs_no_overcommit-to-btrfs_fs_acti.patch [new file with mode: 0644]
queue-6.2/btrfs-zoned-count-fresh-bg-region-as-zone-unusable.patch [new file with mode: 0644]
queue-6.2/btrfs-zoned-drop-space_info-active_total_bytes.patch [new file with mode: 0644]
queue-6.2/cifs-avoid-race-conditions-with-parallel-reconnects.patch [new file with mode: 0644]
queue-6.2/cifs-prevent-data-race-in-cifs_reconnect_tcon.patch [new file with mode: 0644]
queue-6.2/cifs-update-ip_addr-for-ses-only-for-primary-chan-se.patch [new file with mode: 0644]
queue-6.2/fsverity-don-t-drop-pagecache-at-end-of-fs_ioc_enabl.patch [new file with mode: 0644]
queue-6.2/series [new file with mode: 0644]
queue-6.2/thunderbolt-limit-usb3-bandwidth-of-certain-intel-us.patch [new file with mode: 0644]
queue-6.2/zonefs-fix-error-message-in-zonefs_file_dio_append.patch [new file with mode: 0644]
queue-6.2/zonefs-reduce-struct-zonefs_inode_info-size.patch [new file with mode: 0644]
queue-6.2/zonefs-reorganize-code.patch [new file with mode: 0644]
queue-6.2/zonefs-separate-zone-information-from-inode-informat.patch [new file with mode: 0644]
queue-6.2/zonefs-simplify-io-error-handling.patch [new file with mode: 0644]

diff --git a/queue-6.2/btrfs-rename-btrfs_fs_no_overcommit-to-btrfs_fs_acti.patch b/queue-6.2/btrfs-rename-btrfs_fs_no_overcommit-to-btrfs_fs_acti.patch
new file mode 100644 (file)
index 0000000..7b9fdf2
--- /dev/null
@@ -0,0 +1,76 @@
+From 8a61e08dbc8fc089eca3ffba2e3a40df1b0a790e Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 1 Mar 2023 16:14:42 -0500
+Subject: btrfs: rename BTRFS_FS_NO_OVERCOMMIT to BTRFS_FS_ACTIVE_ZONE_TRACKING
+
+From: Josef Bacik <josef@toxicpanda.com>
+
+[ Upstream commit bf1f1fec2724a33b67ec12032402ea75f2a83622 ]
+
+This flag only gets set when we're doing active zone tracking, and we're
+going to need to use this flag for things related to this behavior.
+Rename the flag to represent what it actually means for the file system
+so it can be used in other ways and still make sense.
+
+Reviewed-by: Naohiro Aota <naohiro.aota@wdc.com>
+Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
+Reviewed-by: Anand Jain <anand.jain@oracle.com>
+Signed-off-by: Josef Bacik <josef@toxicpanda.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Stable-dep-of: e15acc25880c ("btrfs: zoned: drop space_info->active_total_bytes")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/btrfs/fs.h         | 7 ++-----
+ fs/btrfs/space-info.c | 2 +-
+ fs/btrfs/zoned.c      | 3 +--
+ 3 files changed, 4 insertions(+), 8 deletions(-)
+
+diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h
+index 3d8156fc8523f..f180ca061aef4 100644
+--- a/fs/btrfs/fs.h
++++ b/fs/btrfs/fs.h
+@@ -119,11 +119,8 @@ enum {
+       /* Indicate that we want to commit the transaction. */
+       BTRFS_FS_NEED_TRANS_COMMIT,
+-      /*
+-       * Indicate metadata over-commit is disabled. This is set when active
+-       * zone tracking is needed.
+-       */
+-      BTRFS_FS_NO_OVERCOMMIT,
++      /* This is set when active zone tracking is needed. */
++      BTRFS_FS_ACTIVE_ZONE_TRACKING,
+       /*
+        * Indicate if we have some features changed, this is mostly for
+diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
+index 69c09508afb50..2237685d1ed0c 100644
+--- a/fs/btrfs/space-info.c
++++ b/fs/btrfs/space-info.c
+@@ -407,7 +407,7 @@ int btrfs_can_overcommit(struct btrfs_fs_info *fs_info,
+               return 0;
+       used = btrfs_space_info_used(space_info, true);
+-      if (test_bit(BTRFS_FS_NO_OVERCOMMIT, &fs_info->flags) &&
++      if (test_bit(BTRFS_FS_ACTIVE_ZONE_TRACKING, &fs_info->flags) &&
+           (space_info->flags & BTRFS_BLOCK_GROUP_METADATA))
+               avail = 0;
+       else
+diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
+index f3b7d8ae93a9f..a6a8bc112fc42 100644
+--- a/fs/btrfs/zoned.c
++++ b/fs/btrfs/zoned.c
+@@ -539,8 +539,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
+               }
+               atomic_set(&zone_info->active_zones_left,
+                          max_active_zones - nactive);
+-              /* Overcommit does not work well with active zone tacking. */
+-              set_bit(BTRFS_FS_NO_OVERCOMMIT, &fs_info->flags);
++              set_bit(BTRFS_FS_ACTIVE_ZONE_TRACKING, &fs_info->flags);
+       }
+       /* Validate superblock log */
+-- 
+2.39.2
+
diff --git a/queue-6.2/btrfs-zoned-count-fresh-bg-region-as-zone-unusable.patch b/queue-6.2/btrfs-zoned-count-fresh-bg-region-as-zone-unusable.patch
new file mode 100644 (file)
index 0000000..2de26b2
--- /dev/null
@@ -0,0 +1,135 @@
+From fb89e8debf0fa9e2c31a8441294231789d9ba76a Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 13 Mar 2023 16:06:13 +0900
+Subject: btrfs: zoned: count fresh BG region as zone unusable
+
+From: Naohiro Aota <naohiro.aota@wdc.com>
+
+[ Upstream commit fa2068d7e922b434eba5bfb0131e6d39febfdb48 ]
+
+The naming of space_info->active_total_bytes is misleading. It counts
+not only active block groups but also full ones which are previously
+active but now inactive. That confusion results in a bug not counting
+the full BGs into active_total_bytes on mount time.
+
+For a background, there are three kinds of block groups in terms of
+activation.
+
+  1. Block groups never activated
+  2. Block groups currently active
+  3. Block groups previously active and currently inactive (due to fully
+     written or zone finish)
+
+What we really wanted to exclude from "total_bytes" is the total size of
+BGs #1. They seem empty and allocatable but since they are not activated,
+we cannot rely on them to do the space reservation.
+
+And, since BGs #1 never get activated, they should have no "used",
+"reserved" and "pinned" bytes.
+
+OTOH, BGs #3 can be counted in the "total", since they are already full
+we cannot allocate from them anyway. For them, "total_bytes == used +
+reserved + pinned + zone_unusable" should hold.
+
+Tracking #2 and #3 as "active_total_bytes" (current implementation) is
+confusing. And, tracking #1 and subtract that properly from "total_bytes"
+every time you need space reservation is cumbersome.
+
+Instead, we can count the whole region of a newly allocated block group as
+zone_unusable. Then, once that block group is activated, release
+[0 ..  zone_capacity] from the zone_unusable counters. With this, we can
+eliminate the confusing ->active_total_bytes and the code will be common
+among regular and the zoned mode. Also, no additional counter is needed
+with this approach.
+
+Fixes: 6a921de58992 ("btrfs: zoned: introduce space_info->active_total_bytes")
+CC: stable@vger.kernel.org # 6.1+
+Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Stable-dep-of: e15acc25880c ("btrfs: zoned: drop space_info->active_total_bytes")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/btrfs/free-space-cache.c |  8 +++++++-
+ fs/btrfs/zoned.c            | 24 +++++++++++++++++++-----
+ 2 files changed, 26 insertions(+), 6 deletions(-)
+
+diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
+index 0d250d052487c..d84cef89cdff5 100644
+--- a/fs/btrfs/free-space-cache.c
++++ b/fs/btrfs/free-space-cache.c
+@@ -2693,8 +2693,13 @@ static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group,
+               bg_reclaim_threshold = READ_ONCE(sinfo->bg_reclaim_threshold);
+       spin_lock(&ctl->tree_lock);
++      /* Count initial region as zone_unusable until it gets activated. */
+       if (!used)
+               to_free = size;
++      else if (initial &&
++               test_bit(BTRFS_FS_ACTIVE_ZONE_TRACKING, &block_group->fs_info->flags) &&
++               (block_group->flags & (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM)))
++              to_free = 0;
+       else if (initial)
+               to_free = block_group->zone_capacity;
+       else if (offset >= block_group->alloc_offset)
+@@ -2722,7 +2727,8 @@ static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group,
+       reclaimable_unusable = block_group->zone_unusable -
+                              (block_group->length - block_group->zone_capacity);
+       /* All the region is now unusable. Mark it as unused and reclaim */
+-      if (block_group->zone_unusable == block_group->length) {
++      if (block_group->zone_unusable == block_group->length &&
++          block_group->alloc_offset) {
+               btrfs_mark_bg_unused(block_group);
+       } else if (bg_reclaim_threshold &&
+                  reclaimable_unusable >=
+diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
+index a6a8bc112fc42..c3c763cc06399 100644
+--- a/fs/btrfs/zoned.c
++++ b/fs/btrfs/zoned.c
+@@ -1576,9 +1576,19 @@ void btrfs_calc_zone_unusable(struct btrfs_block_group *cache)
+               return;
+       WARN_ON(cache->bytes_super != 0);
+-      unusable = (cache->alloc_offset - cache->used) +
+-                 (cache->length - cache->zone_capacity);
+-      free = cache->zone_capacity - cache->alloc_offset;
++
++      /* Check for block groups never get activated */
++      if (test_bit(BTRFS_FS_ACTIVE_ZONE_TRACKING, &cache->fs_info->flags) &&
++          cache->flags & (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM) &&
++          !test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &cache->runtime_flags) &&
++          cache->alloc_offset == 0) {
++              unusable = cache->length;
++              free = 0;
++      } else {
++              unusable = (cache->alloc_offset - cache->used) +
++                         (cache->length - cache->zone_capacity);
++              free = cache->zone_capacity - cache->alloc_offset;
++      }
+       /* We only need ->free_space in ALLOC_SEQ block groups */
+       cache->cached = BTRFS_CACHE_FINISHED;
+@@ -1915,7 +1925,11 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group)
+       /* Successfully activated all the zones */
+       set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags);
+-      space_info->active_total_bytes += block_group->length;
++      WARN_ON(block_group->alloc_offset != 0);
++      if (block_group->zone_unusable == block_group->length) {
++              block_group->zone_unusable = block_group->length - block_group->zone_capacity;
++              space_info->bytes_zone_unusable -= block_group->zone_capacity;
++      }
+       spin_unlock(&block_group->lock);
+       btrfs_try_granting_tickets(fs_info, space_info);
+       spin_unlock(&space_info->lock);
+@@ -2279,7 +2293,7 @@ int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info)
+               u64 avail;
+               spin_lock(&block_group->lock);
+-              if (block_group->reserved ||
++              if (block_group->reserved || block_group->alloc_offset == 0 ||
+                   (block_group->flags & BTRFS_BLOCK_GROUP_SYSTEM)) {
+                       spin_unlock(&block_group->lock);
+                       continue;
+-- 
+2.39.2
+
diff --git a/queue-6.2/btrfs-zoned-drop-space_info-active_total_bytes.patch b/queue-6.2/btrfs-zoned-drop-space_info-active_total_bytes.patch
new file mode 100644 (file)
index 0000000..e48bc20
--- /dev/null
@@ -0,0 +1,192 @@
+From 56c14d707e8baa1a944f01457f5509fa7188c304 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 13 Mar 2023 16:06:14 +0900
+Subject: btrfs: zoned: drop space_info->active_total_bytes
+
+From: Naohiro Aota <naohiro.aota@wdc.com>
+
+[ Upstream commit e15acc25880cf048dba9df94d76ed7e7e10040e6 ]
+
+The space_info->active_total_bytes is no longer necessary as we now
+count the region of newly allocated block group as zone_unusable. Drop
+its usage.
+
+Fixes: 6a921de58992 ("btrfs: zoned: introduce space_info->active_total_bytes")
+CC: stable@vger.kernel.org # 6.1+
+Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/btrfs/block-group.c |  6 ------
+ fs/btrfs/space-info.c  | 40 +++++++++-------------------------------
+ fs/btrfs/space-info.h  |  2 --
+ fs/btrfs/zoned.c       |  4 ----
+ 4 files changed, 9 insertions(+), 43 deletions(-)
+
+diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
+index d628d545ffea7..8eb625318e785 100644
+--- a/fs/btrfs/block-group.c
++++ b/fs/btrfs/block-group.c
+@@ -1036,14 +1036,8 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
+                       < block_group->zone_unusable);
+               WARN_ON(block_group->space_info->disk_total
+                       < block_group->length * factor);
+-              WARN_ON(test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE,
+-                               &block_group->runtime_flags) &&
+-                      block_group->space_info->active_total_bytes
+-                      < block_group->length);
+       }
+       block_group->space_info->total_bytes -= block_group->length;
+-      if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags))
+-              block_group->space_info->active_total_bytes -= block_group->length;
+       block_group->space_info->bytes_readonly -=
+               (block_group->length - block_group->zone_unusable);
+       block_group->space_info->bytes_zone_unusable -=
+diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
+index 2237685d1ed0c..3eecce86f63fc 100644
+--- a/fs/btrfs/space-info.c
++++ b/fs/btrfs/space-info.c
+@@ -308,8 +308,6 @@ void btrfs_add_bg_to_space_info(struct btrfs_fs_info *info,
+       ASSERT(found);
+       spin_lock(&found->lock);
+       found->total_bytes += block_group->length;
+-      if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags))
+-              found->active_total_bytes += block_group->length;
+       found->disk_total += block_group->length * factor;
+       found->bytes_used += block_group->used;
+       found->disk_used += block_group->used * factor;
+@@ -379,22 +377,6 @@ static u64 calc_available_free_space(struct btrfs_fs_info *fs_info,
+       return avail;
+ }
+-static inline u64 writable_total_bytes(struct btrfs_fs_info *fs_info,
+-                                     struct btrfs_space_info *space_info)
+-{
+-      /*
+-       * On regular filesystem, all total_bytes are always writable. On zoned
+-       * filesystem, there may be a limitation imposed by max_active_zones.
+-       * For metadata allocation, we cannot finish an existing active block
+-       * group to avoid a deadlock. Thus, we need to consider only the active
+-       * groups to be writable for metadata space.
+-       */
+-      if (!btrfs_is_zoned(fs_info) || (space_info->flags & BTRFS_BLOCK_GROUP_DATA))
+-              return space_info->total_bytes;
+-
+-      return space_info->active_total_bytes;
+-}
+-
+ int btrfs_can_overcommit(struct btrfs_fs_info *fs_info,
+                        struct btrfs_space_info *space_info, u64 bytes,
+                        enum btrfs_reserve_flush_enum flush)
+@@ -413,7 +395,7 @@ int btrfs_can_overcommit(struct btrfs_fs_info *fs_info,
+       else
+               avail = calc_available_free_space(fs_info, space_info, flush);
+-      if (used + bytes < writable_total_bytes(fs_info, space_info) + avail)
++      if (used + bytes < space_info->total_bytes + avail)
+               return 1;
+       return 0;
+ }
+@@ -449,7 +431,7 @@ void btrfs_try_granting_tickets(struct btrfs_fs_info *fs_info,
+               ticket = list_first_entry(head, struct reserve_ticket, list);
+               /* Check and see if our ticket can be satisfied now. */
+-              if ((used + ticket->bytes <= writable_total_bytes(fs_info, space_info)) ||
++              if ((used + ticket->bytes <= space_info->total_bytes) ||
+                   btrfs_can_overcommit(fs_info, space_info, ticket->bytes,
+                                        flush)) {
+                       btrfs_space_info_update_bytes_may_use(fs_info,
+@@ -829,7 +811,6 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
+ {
+       u64 used;
+       u64 avail;
+-      u64 total;
+       u64 to_reclaim = space_info->reclaim_size;
+       lockdep_assert_held(&space_info->lock);
+@@ -844,9 +825,8 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
+        * space.  If that's the case add in our overage so we make sure to put
+        * appropriate pressure on the flushing state machine.
+        */
+-      total = writable_total_bytes(fs_info, space_info);
+-      if (total + avail < used)
+-              to_reclaim += used - (total + avail);
++      if (space_info->total_bytes + avail < used)
++              to_reclaim += used - (space_info->total_bytes + avail);
+       return to_reclaim;
+ }
+@@ -856,11 +836,10 @@ static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info,
+ {
+       u64 global_rsv_size = fs_info->global_block_rsv.reserved;
+       u64 ordered, delalloc;
+-      u64 total = writable_total_bytes(fs_info, space_info);
+       u64 thresh;
+       u64 used;
+-      thresh = mult_perc(total, 90);
++      thresh = mult_perc(space_info->total_bytes, 90);
+       lockdep_assert_held(&space_info->lock);
+@@ -923,8 +902,8 @@ static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info,
+                                          BTRFS_RESERVE_FLUSH_ALL);
+       used = space_info->bytes_used + space_info->bytes_reserved +
+              space_info->bytes_readonly + global_rsv_size;
+-      if (used < total)
+-              thresh += total - used;
++      if (used < space_info->total_bytes)
++              thresh += space_info->total_bytes - used;
+       thresh >>= space_info->clamp;
+       used = space_info->bytes_pinned;
+@@ -1651,7 +1630,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info,
+        * can_overcommit() to ensure we can overcommit to continue.
+        */
+       if (!pending_tickets &&
+-          ((used + orig_bytes <= writable_total_bytes(fs_info, space_info)) ||
++          ((used + orig_bytes <= space_info->total_bytes) ||
+            btrfs_can_overcommit(fs_info, space_info, orig_bytes, flush))) {
+               btrfs_space_info_update_bytes_may_use(fs_info, space_info,
+                                                     orig_bytes);
+@@ -1665,8 +1644,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info,
+        */
+       if (ret && unlikely(flush == BTRFS_RESERVE_FLUSH_EMERGENCY)) {
+               used = btrfs_space_info_used(space_info, false);
+-              if (used + orig_bytes <=
+-                  writable_total_bytes(fs_info, space_info)) {
++              if (used + orig_bytes <= space_info->total_bytes) {
+                       btrfs_space_info_update_bytes_may_use(fs_info, space_info,
+                                                             orig_bytes);
+                       ret = 0;
+diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h
+index fc99ea2b0c34f..2033b71b18cec 100644
+--- a/fs/btrfs/space-info.h
++++ b/fs/btrfs/space-info.h
+@@ -96,8 +96,6 @@ struct btrfs_space_info {
+       u64 bytes_may_use;      /* number of bytes that may be used for
+                                  delalloc/allocations */
+       u64 bytes_readonly;     /* total bytes that are read only */
+-      /* Total bytes in the space, but only accounts active block groups. */
+-      u64 active_total_bytes;
+       u64 bytes_zone_unusable;        /* total bytes that are unusable until
+                                          resetting the device zone */
+diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
+index c3c763cc06399..ce5ebba7fdd9a 100644
+--- a/fs/btrfs/zoned.c
++++ b/fs/btrfs/zoned.c
+@@ -2330,10 +2330,6 @@ int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info,
+       if (!btrfs_is_zoned(fs_info) || (space_info->flags & BTRFS_BLOCK_GROUP_DATA))
+               return 0;
+-      /* No more block groups to activate */
+-      if (space_info->active_total_bytes == space_info->total_bytes)
+-              return 0;
+-
+       for (;;) {
+               int ret;
+               bool need_finish = false;
+-- 
+2.39.2
+
diff --git a/queue-6.2/cifs-avoid-race-conditions-with-parallel-reconnects.patch b/queue-6.2/cifs-avoid-race-conditions-with-parallel-reconnects.patch
new file mode 100644 (file)
index 0000000..5c15f09
--- /dev/null
@@ -0,0 +1,333 @@
+From 1f3fa825883f51944f3e0d3d92717251a8844cb3 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 20 Mar 2023 06:08:19 +0000
+Subject: cifs: avoid race conditions with parallel reconnects
+
+From: Shyam Prasad N <sprasad@microsoft.com>
+
+[ Upstream commit bc962159e8e326af634a506508034a375bf2b858 ]
+
+When multiple processes/channels do reconnects in parallel
+we used to return success immediately
+negotiate/session-setup/tree-connect, causing race conditions
+between processes that enter the function in parallel.
+This caused several errors related to session not found to
+show up during parallel reconnects.
+
+Signed-off-by: Shyam Prasad N <sprasad@microsoft.com>
+Reviewed-by: Paulo Alcantara (SUSE) <pc@manguebit.com>
+Cc: stable@vger.kernel.org
+Signed-off-by: Steve French <stfrench@microsoft.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/cifs/connect.c       | 48 ++++++++++++++++++++++++++++++-----------
+ fs/cifs/smb2pdu.c       | 44 +++++++++++++++++++++----------------
+ fs/cifs/smb2transport.c | 17 ++++++++++++---
+ 3 files changed, 76 insertions(+), 33 deletions(-)
+
+diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
+index f53837f436d08..985e962cf0858 100644
+--- a/fs/cifs/connect.c
++++ b/fs/cifs/connect.c
+@@ -244,31 +244,42 @@ cifs_mark_tcp_ses_conns_for_reconnect(struct TCP_Server_Info *server,
+                       cifs_chan_update_iface(ses, server);
+               spin_lock(&ses->chan_lock);
+-              if (!mark_smb_session && cifs_chan_needs_reconnect(ses, server))
+-                      goto next_session;
++              if (!mark_smb_session && cifs_chan_needs_reconnect(ses, server)) {
++                      spin_unlock(&ses->chan_lock);
++                      continue;
++              }
+               if (mark_smb_session)
+                       CIFS_SET_ALL_CHANS_NEED_RECONNECT(ses);
+               else
+                       cifs_chan_set_need_reconnect(ses, server);
++              cifs_dbg(FYI, "%s: channel connect bitmap: 0x%lx\n",
++                       __func__, ses->chans_need_reconnect);
++
+               /* If all channels need reconnect, then tcon needs reconnect */
+-              if (!mark_smb_session && !CIFS_ALL_CHANS_NEED_RECONNECT(ses))
+-                      goto next_session;
++              if (!mark_smb_session && !CIFS_ALL_CHANS_NEED_RECONNECT(ses)) {
++                      spin_unlock(&ses->chan_lock);
++                      continue;
++              }
++              spin_unlock(&ses->chan_lock);
++              spin_lock(&ses->ses_lock);
+               ses->ses_status = SES_NEED_RECON;
++              spin_unlock(&ses->ses_lock);
+               list_for_each_entry(tcon, &ses->tcon_list, tcon_list) {
+                       tcon->need_reconnect = true;
++                      spin_lock(&tcon->tc_lock);
+                       tcon->status = TID_NEED_RECON;
++                      spin_unlock(&tcon->tc_lock);
+               }
+               if (ses->tcon_ipc) {
+                       ses->tcon_ipc->need_reconnect = true;
++                      spin_lock(&ses->tcon_ipc->tc_lock);
+                       ses->tcon_ipc->status = TID_NEED_RECON;
++                      spin_unlock(&ses->tcon_ipc->tc_lock);
+               }
+-
+-next_session:
+-              spin_unlock(&ses->chan_lock);
+       }
+       spin_unlock(&cifs_tcp_ses_lock);
+ }
+@@ -3703,11 +3714,19 @@ cifs_negotiate_protocol(const unsigned int xid, struct cifs_ses *ses,
+       /* only send once per connect */
+       spin_lock(&server->srv_lock);
+-      if (!server->ops->need_neg(server) ||
++      if (server->tcpStatus != CifsGood &&
++          server->tcpStatus != CifsNew &&
+           server->tcpStatus != CifsNeedNegotiate) {
++              spin_unlock(&server->srv_lock);
++              return -EHOSTDOWN;
++      }
++
++      if (!server->ops->need_neg(server) &&
++          server->tcpStatus == CifsGood) {
+               spin_unlock(&server->srv_lock);
+               return 0;
+       }
++
+       server->tcpStatus = CifsInNegotiate;
+       spin_unlock(&server->srv_lock);
+@@ -3741,23 +3760,28 @@ cifs_setup_session(const unsigned int xid, struct cifs_ses *ses,
+       bool is_binding = false;
+       spin_lock(&ses->ses_lock);
++      cifs_dbg(FYI, "%s: channel connect bitmap: 0x%lx\n",
++               __func__, ses->chans_need_reconnect);
++
+       if (ses->ses_status != SES_GOOD &&
+           ses->ses_status != SES_NEW &&
+           ses->ses_status != SES_NEED_RECON) {
+               spin_unlock(&ses->ses_lock);
+-              return 0;
++              return -EHOSTDOWN;
+       }
+       /* only send once per connect */
+       spin_lock(&ses->chan_lock);
+-      if (CIFS_ALL_CHANS_GOOD(ses) ||
+-          cifs_chan_in_reconnect(ses, server)) {
++      if (CIFS_ALL_CHANS_GOOD(ses)) {
++              if (ses->ses_status == SES_NEED_RECON)
++                      ses->ses_status = SES_GOOD;
+               spin_unlock(&ses->chan_lock);
+               spin_unlock(&ses->ses_lock);
+               return 0;
+       }
+-      is_binding = !CIFS_ALL_CHANS_NEED_RECONNECT(ses);
++
+       cifs_chan_set_in_reconnect(ses, server);
++      is_binding = !CIFS_ALL_CHANS_NEED_RECONNECT(ses);
+       spin_unlock(&ses->chan_lock);
+       if (!is_binding)
+diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
+index 83d04cd2f9df8..f0b1ae0835d71 100644
+--- a/fs/cifs/smb2pdu.c
++++ b/fs/cifs/smb2pdu.c
+@@ -199,6 +199,7 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon,
+       }
+       spin_unlock(&server->srv_lock);
++again:
+       rc = cifs_wait_for_server_reconnect(server, tcon->retry);
+       if (rc)
+               return rc;
+@@ -217,6 +218,7 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon,
+       nls_codepage = load_nls_default();
++      mutex_lock(&ses->session_mutex);
+       /*
+        * Recheck after acquire mutex. If another thread is negotiating
+        * and the server never sends an answer the socket will be closed
+@@ -225,6 +227,11 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon,
+       spin_lock(&server->srv_lock);
+       if (server->tcpStatus == CifsNeedReconnect) {
+               spin_unlock(&server->srv_lock);
++              mutex_unlock(&ses->session_mutex);
++
++              if (tcon->retry)
++                      goto again;
++
+               rc = -EHOSTDOWN;
+               goto out;
+       }
+@@ -234,19 +241,22 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon,
+        * need to prevent multiple threads trying to simultaneously
+        * reconnect the same SMB session
+        */
++      spin_lock(&ses->ses_lock);
+       spin_lock(&ses->chan_lock);
+-      if (!cifs_chan_needs_reconnect(ses, server)) {
++      if (!cifs_chan_needs_reconnect(ses, server) &&
++          ses->ses_status == SES_GOOD) {
+               spin_unlock(&ses->chan_lock);
+-
++              spin_unlock(&ses->ses_lock);
+               /* this means that we only need to tree connect */
+               if (tcon->need_reconnect)
+                       goto skip_sess_setup;
++              mutex_unlock(&ses->session_mutex);
+               goto out;
+       }
+       spin_unlock(&ses->chan_lock);
++      spin_unlock(&ses->ses_lock);
+-      mutex_lock(&ses->session_mutex);
+       rc = cifs_negotiate_protocol(0, ses, server);
+       if (!rc) {
+               rc = cifs_setup_session(0, ses, server, nls_codepage);
+@@ -262,10 +272,8 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon,
+               mutex_unlock(&ses->session_mutex);
+               goto out;
+       }
+-      mutex_unlock(&ses->session_mutex);
+ skip_sess_setup:
+-      mutex_lock(&ses->session_mutex);
+       if (!tcon->need_reconnect) {
+               mutex_unlock(&ses->session_mutex);
+               goto out;
+@@ -280,7 +288,7 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon,
+       cifs_dbg(FYI, "reconnect tcon rc = %d\n", rc);
+       if (rc) {
+               /* If sess reconnected but tcon didn't, something strange ... */
+-              pr_warn_once("reconnect tcon failed rc = %d\n", rc);
++              cifs_dbg(VFS, "reconnect tcon failed rc = %d\n", rc);
+               goto out;
+       }
+@@ -1252,9 +1260,9 @@ SMB2_sess_alloc_buffer(struct SMB2_sess_data *sess_data)
+       if (rc)
+               return rc;
+-      spin_lock(&ses->chan_lock);
+-      is_binding = !CIFS_ALL_CHANS_NEED_RECONNECT(ses);
+-      spin_unlock(&ses->chan_lock);
++      spin_lock(&ses->ses_lock);
++      is_binding = (ses->ses_status == SES_GOOD);
++      spin_unlock(&ses->ses_lock);
+       if (is_binding) {
+               req->hdr.SessionId = cpu_to_le64(ses->Suid);
+@@ -1412,9 +1420,9 @@ SMB2_auth_kerberos(struct SMB2_sess_data *sess_data)
+               goto out_put_spnego_key;
+       }
+-      spin_lock(&ses->chan_lock);
+-      is_binding = !CIFS_ALL_CHANS_NEED_RECONNECT(ses);
+-      spin_unlock(&ses->chan_lock);
++      spin_lock(&ses->ses_lock);
++      is_binding = (ses->ses_status == SES_GOOD);
++      spin_unlock(&ses->ses_lock);
+       /* keep session key if binding */
+       if (!is_binding) {
+@@ -1538,9 +1546,9 @@ SMB2_sess_auth_rawntlmssp_negotiate(struct SMB2_sess_data *sess_data)
+       cifs_dbg(FYI, "rawntlmssp session setup challenge phase\n");
+-      spin_lock(&ses->chan_lock);
+-      is_binding = !CIFS_ALL_CHANS_NEED_RECONNECT(ses);
+-      spin_unlock(&ses->chan_lock);
++      spin_lock(&ses->ses_lock);
++      is_binding = (ses->ses_status == SES_GOOD);
++      spin_unlock(&ses->ses_lock);
+       /* keep existing ses id and flags if binding */
+       if (!is_binding) {
+@@ -1606,9 +1614,9 @@ SMB2_sess_auth_rawntlmssp_authenticate(struct SMB2_sess_data *sess_data)
+       rsp = (struct smb2_sess_setup_rsp *)sess_data->iov[0].iov_base;
+-      spin_lock(&ses->chan_lock);
+-      is_binding = !CIFS_ALL_CHANS_NEED_RECONNECT(ses);
+-      spin_unlock(&ses->chan_lock);
++      spin_lock(&ses->ses_lock);
++      is_binding = (ses->ses_status == SES_GOOD);
++      spin_unlock(&ses->ses_lock);
+       /* keep existing ses id and flags if binding */
+       if (!is_binding) {
+diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c
+index d827b7547ffad..790acf65a0926 100644
+--- a/fs/cifs/smb2transport.c
++++ b/fs/cifs/smb2transport.c
+@@ -81,6 +81,7 @@ int smb2_get_sign_key(__u64 ses_id, struct TCP_Server_Info *server, u8 *key)
+       struct cifs_ses *ses = NULL;
+       int i;
+       int rc = 0;
++      bool is_binding = false;
+       spin_lock(&cifs_tcp_ses_lock);
+@@ -97,9 +98,12 @@ int smb2_get_sign_key(__u64 ses_id, struct TCP_Server_Info *server, u8 *key)
+       goto out;
+ found:
++      spin_lock(&ses->ses_lock);
+       spin_lock(&ses->chan_lock);
+-      if (cifs_chan_needs_reconnect(ses, server) &&
+-          !CIFS_ALL_CHANS_NEED_RECONNECT(ses)) {
++
++      is_binding = (cifs_chan_needs_reconnect(ses, server) &&
++                    ses->ses_status == SES_GOOD);
++      if (is_binding) {
+               /*
+                * If we are in the process of binding a new channel
+                * to an existing session, use the master connection
+@@ -107,6 +111,7 @@ int smb2_get_sign_key(__u64 ses_id, struct TCP_Server_Info *server, u8 *key)
+                */
+               memcpy(key, ses->smb3signingkey, SMB3_SIGN_KEY_SIZE);
+               spin_unlock(&ses->chan_lock);
++              spin_unlock(&ses->ses_lock);
+               goto out;
+       }
+@@ -119,10 +124,12 @@ int smb2_get_sign_key(__u64 ses_id, struct TCP_Server_Info *server, u8 *key)
+               if (chan->server == server) {
+                       memcpy(key, chan->signkey, SMB3_SIGN_KEY_SIZE);
+                       spin_unlock(&ses->chan_lock);
++                      spin_unlock(&ses->ses_lock);
+                       goto out;
+               }
+       }
+       spin_unlock(&ses->chan_lock);
++      spin_unlock(&ses->ses_lock);
+       cifs_dbg(VFS,
+                "%s: Could not find channel signing key for session 0x%llx\n",
+@@ -392,11 +399,15 @@ generate_smb3signingkey(struct cifs_ses *ses,
+       bool is_binding = false;
+       int chan_index = 0;
++      spin_lock(&ses->ses_lock);
+       spin_lock(&ses->chan_lock);
+-      is_binding = !CIFS_ALL_CHANS_NEED_RECONNECT(ses);
++      is_binding = (cifs_chan_needs_reconnect(ses, server) &&
++                    ses->ses_status == SES_GOOD);
++
+       chan_index = cifs_ses_get_chan_index(ses, server);
+       /* TODO: introduce ref counting for channels when the can be freed */
+       spin_unlock(&ses->chan_lock);
++      spin_unlock(&ses->ses_lock);
+       /*
+        * All channels use the same encryption/decryption keys but
+-- 
+2.39.2
+
diff --git a/queue-6.2/cifs-prevent-data-race-in-cifs_reconnect_tcon.patch b/queue-6.2/cifs-prevent-data-race-in-cifs_reconnect_tcon.patch
new file mode 100644 (file)
index 0000000..e9c4e04
--- /dev/null
@@ -0,0 +1,255 @@
+From 9b1a97c7aa9103eaed0e2fde976931dbb03b6622 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 28 Feb 2023 19:01:55 -0300
+Subject: cifs: prevent data race in cifs_reconnect_tcon()
+
+From: Paulo Alcantara <pc@manguebit.com>
+
+[ Upstream commit 1bcd548d935a33c6fc58331405eb1b82fd6150de ]
+
+Make sure to get an up-to-date TCP_Server_Info::nr_targets value prior
+to waiting the server to be reconnected in cifs_reconnect_tcon().  It
+is set in cifs_tcp_ses_needs_reconnect() and protected by
+TCP_Server_Info::srv_lock.
+
+Create a new cifs_wait_for_server_reconnect() helper that can be used
+by both SMB2+ and CIFS reconnect code.
+
+Signed-off-by: Paulo Alcantara (SUSE) <pc@manguebit.com>
+Signed-off-by: Steve French <stfrench@microsoft.com>
+Stable-dep-of: bc962159e8e3 ("cifs: avoid race conditions with parallel reconnects")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/cifs/cifsproto.h |  1 +
+ fs/cifs/cifssmb.c   | 43 ++----------------------
+ fs/cifs/misc.c      | 44 ++++++++++++++++++++++++
+ fs/cifs/smb2pdu.c   | 82 ++++++++++++---------------------------------
+ 4 files changed, 69 insertions(+), 101 deletions(-)
+
+diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
+index e75184544ecb4..639df85dafd6c 100644
+--- a/fs/cifs/cifsproto.h
++++ b/fs/cifs/cifsproto.h
+@@ -697,5 +697,6 @@ static inline int cifs_create_options(struct cifs_sb_info *cifs_sb, int options)
+ struct super_block *cifs_get_tcon_super(struct cifs_tcon *tcon);
+ void cifs_put_tcon_super(struct super_block *sb);
++int cifs_wait_for_server_reconnect(struct TCP_Server_Info *server, bool retry);
+ #endif                        /* _CIFSPROTO_H */
+diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
+index 566e6a26b897c..5ca4a5383aaae 100644
+--- a/fs/cifs/cifssmb.c
++++ b/fs/cifs/cifssmb.c
+@@ -70,7 +70,6 @@ cifs_reconnect_tcon(struct cifs_tcon *tcon, int smb_command)
+       struct cifs_ses *ses;
+       struct TCP_Server_Info *server;
+       struct nls_table *nls_codepage;
+-      int retries;
+       /*
+        * SMBs NegProt, SessSetup, uLogoff do not have tcon yet so check for
+@@ -98,45 +97,9 @@ cifs_reconnect_tcon(struct cifs_tcon *tcon, int smb_command)
+       }
+       spin_unlock(&tcon->tc_lock);
+-      retries = server->nr_targets;
+-
+-      /*
+-       * Give demultiplex thread up to 10 seconds to each target available for
+-       * reconnect -- should be greater than cifs socket timeout which is 7
+-       * seconds.
+-       */
+-      while (server->tcpStatus == CifsNeedReconnect) {
+-              rc = wait_event_interruptible_timeout(server->response_q,
+-                                                    (server->tcpStatus != CifsNeedReconnect),
+-                                                    10 * HZ);
+-              if (rc < 0) {
+-                      cifs_dbg(FYI, "%s: aborting reconnect due to a received signal by the process\n",
+-                               __func__);
+-                      return -ERESTARTSYS;
+-              }
+-
+-              /* are we still trying to reconnect? */
+-              spin_lock(&server->srv_lock);
+-              if (server->tcpStatus != CifsNeedReconnect) {
+-                      spin_unlock(&server->srv_lock);
+-                      break;
+-              }
+-              spin_unlock(&server->srv_lock);
+-
+-              if (retries && --retries)
+-                      continue;
+-
+-              /*
+-               * on "soft" mounts we wait once. Hard mounts keep
+-               * retrying until process is killed or server comes
+-               * back on-line
+-               */
+-              if (!tcon->retry) {
+-                      cifs_dbg(FYI, "gave up waiting on reconnect in smb_init\n");
+-                      return -EHOSTDOWN;
+-              }
+-              retries = server->nr_targets;
+-      }
++      rc = cifs_wait_for_server_reconnect(server, tcon->retry);
++      if (rc)
++              return rc;
+       spin_lock(&ses->chan_lock);
+       if (!cifs_chan_needs_reconnect(ses, server) && !tcon->need_reconnect) {
+diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
+index 9f4486b705d5c..5542893ef03f7 100644
+--- a/fs/cifs/misc.c
++++ b/fs/cifs/misc.c
+@@ -1376,3 +1376,47 @@ int cifs_inval_name_dfs_link_error(const unsigned int xid,
+       return 0;
+ }
+ #endif
++
++int cifs_wait_for_server_reconnect(struct TCP_Server_Info *server, bool retry)
++{
++      int timeout = 10;
++      int rc;
++
++      spin_lock(&server->srv_lock);
++      if (server->tcpStatus != CifsNeedReconnect) {
++              spin_unlock(&server->srv_lock);
++              return 0;
++      }
++      timeout *= server->nr_targets;
++      spin_unlock(&server->srv_lock);
++
++      /*
++       * Give demultiplex thread up to 10 seconds to each target available for
++       * reconnect -- should be greater than cifs socket timeout which is 7
++       * seconds.
++       *
++       * On "soft" mounts we wait once. Hard mounts keep retrying until
++       * process is killed or server comes back on-line.
++       */
++      do {
++              rc = wait_event_interruptible_timeout(server->response_q,
++                                                    (server->tcpStatus != CifsNeedReconnect),
++                                                    timeout * HZ);
++              if (rc < 0) {
++                      cifs_dbg(FYI, "%s: aborting reconnect due to received signal\n",
++                               __func__);
++                      return -ERESTARTSYS;
++              }
++
++              /* are we still trying to reconnect? */
++              spin_lock(&server->srv_lock);
++              if (server->tcpStatus != CifsNeedReconnect) {
++                      spin_unlock(&server->srv_lock);
++                      return 0;
++              }
++              spin_unlock(&server->srv_lock);
++      } while (retry);
++
++      cifs_dbg(FYI, "%s: gave up waiting on reconnect\n", __func__);
++      return -EHOSTDOWN;
++}
+diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
+index 6e6e44d8b4c79..83d04cd2f9df8 100644
+--- a/fs/cifs/smb2pdu.c
++++ b/fs/cifs/smb2pdu.c
+@@ -139,66 +139,6 @@ smb2_hdr_assemble(struct smb2_hdr *shdr, __le16 smb2_cmd,
+       return;
+ }
+-static int wait_for_server_reconnect(struct TCP_Server_Info *server,
+-                                   __le16 smb2_command, bool retry)
+-{
+-      int timeout = 10;
+-      int rc;
+-
+-      spin_lock(&server->srv_lock);
+-      if (server->tcpStatus != CifsNeedReconnect) {
+-              spin_unlock(&server->srv_lock);
+-              return 0;
+-      }
+-      timeout *= server->nr_targets;
+-      spin_unlock(&server->srv_lock);
+-
+-      /*
+-       * Return to caller for TREE_DISCONNECT and LOGOFF and CLOSE
+-       * here since they are implicitly done when session drops.
+-       */
+-      switch (smb2_command) {
+-      /*
+-       * BB Should we keep oplock break and add flush to exceptions?
+-       */
+-      case SMB2_TREE_DISCONNECT:
+-      case SMB2_CANCEL:
+-      case SMB2_CLOSE:
+-      case SMB2_OPLOCK_BREAK:
+-              return -EAGAIN;
+-      }
+-
+-      /*
+-       * Give demultiplex thread up to 10 seconds to each target available for
+-       * reconnect -- should be greater than cifs socket timeout which is 7
+-       * seconds.
+-       *
+-       * On "soft" mounts we wait once. Hard mounts keep retrying until
+-       * process is killed or server comes back on-line.
+-       */
+-      do {
+-              rc = wait_event_interruptible_timeout(server->response_q,
+-                                                    (server->tcpStatus != CifsNeedReconnect),
+-                                                    timeout * HZ);
+-              if (rc < 0) {
+-                      cifs_dbg(FYI, "%s: aborting reconnect due to received signal\n",
+-                               __func__);
+-                      return -ERESTARTSYS;
+-              }
+-
+-              /* are we still trying to reconnect? */
+-              spin_lock(&server->srv_lock);
+-              if (server->tcpStatus != CifsNeedReconnect) {
+-                      spin_unlock(&server->srv_lock);
+-                      return 0;
+-              }
+-              spin_unlock(&server->srv_lock);
+-      } while (retry);
+-
+-      cifs_dbg(FYI, "%s: gave up waiting on reconnect\n", __func__);
+-      return -EHOSTDOWN;
+-}
+-
+ static int
+ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon,
+              struct TCP_Server_Info *server)
+@@ -239,7 +179,27 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon,
+           (!tcon->ses->server) || !server)
+               return -EIO;
+-      rc = wait_for_server_reconnect(server, smb2_command, tcon->retry);
++      spin_lock(&server->srv_lock);
++      if (server->tcpStatus == CifsNeedReconnect) {
++              /*
++               * Return to caller for TREE_DISCONNECT and LOGOFF and CLOSE
++               * here since they are implicitly done when session drops.
++               */
++              switch (smb2_command) {
++              /*
++               * BB Should we keep oplock break and add flush to exceptions?
++               */
++              case SMB2_TREE_DISCONNECT:
++              case SMB2_CANCEL:
++              case SMB2_CLOSE:
++              case SMB2_OPLOCK_BREAK:
++                      spin_unlock(&server->srv_lock);
++                      return -EAGAIN;
++              }
++      }
++      spin_unlock(&server->srv_lock);
++
++      rc = cifs_wait_for_server_reconnect(server, tcon->retry);
+       if (rc)
+               return rc;
+-- 
+2.39.2
+
diff --git a/queue-6.2/cifs-update-ip_addr-for-ses-only-for-primary-chan-se.patch b/queue-6.2/cifs-update-ip_addr-for-ses-only-for-primary-chan-se.patch
new file mode 100644 (file)
index 0000000..93f14e4
--- /dev/null
@@ -0,0 +1,64 @@
+From 6ada185dd647bb8d36c926924cab533738bf13de Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 10 Feb 2023 17:41:17 +0000
+Subject: cifs: update ip_addr for ses only for primary chan setup
+
+From: Shyam Prasad N <sprasad@microsoft.com>
+
+[ Upstream commit e77978de4765229e09c8fabcf4f8419ff367317f ]
+
+We update ses->ip_addr whenever we do a session setup.
+But this should happen only for primary channel in mchan
+scenario.
+
+Signed-off-by: Shyam Prasad N <sprasad@microsoft.com>
+Reviewed-by: Paulo Alcantara (SUSE) <pc@cjr.nz>
+Signed-off-by: Steve French <stfrench@microsoft.com>
+Stable-dep-of: bc962159e8e3 ("cifs: avoid race conditions with parallel reconnects")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/cifs/connect.c | 18 +++++++++++-------
+ 1 file changed, 11 insertions(+), 7 deletions(-)
+
+diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
+index 6da2af97b8bac..f53837f436d08 100644
+--- a/fs/cifs/connect.c
++++ b/fs/cifs/connect.c
+@@ -3735,16 +3735,12 @@ cifs_setup_session(const unsigned int xid, struct cifs_ses *ses,
+                  struct nls_table *nls_info)
+ {
+       int rc = -ENOSYS;
+-      struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)&server->dstaddr;
+-      struct sockaddr_in *addr = (struct sockaddr_in *)&server->dstaddr;
++      struct TCP_Server_Info *pserver = CIFS_SERVER_IS_CHAN(server) ? server->primary_server : server;
++      struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)&pserver->dstaddr;
++      struct sockaddr_in *addr = (struct sockaddr_in *)&pserver->dstaddr;
+       bool is_binding = false;
+       spin_lock(&ses->ses_lock);
+-      if (server->dstaddr.ss_family == AF_INET6)
+-              scnprintf(ses->ip_addr, sizeof(ses->ip_addr), "%pI6", &addr6->sin6_addr);
+-      else
+-              scnprintf(ses->ip_addr, sizeof(ses->ip_addr), "%pI4", &addr->sin_addr);
+-
+       if (ses->ses_status != SES_GOOD &&
+           ses->ses_status != SES_NEW &&
+           ses->ses_status != SES_NEED_RECON) {
+@@ -3768,6 +3764,14 @@ cifs_setup_session(const unsigned int xid, struct cifs_ses *ses,
+               ses->ses_status = SES_IN_SETUP;
+       spin_unlock(&ses->ses_lock);
++      /* update ses ip_addr only for primary chan */
++      if (server == pserver) {
++              if (server->dstaddr.ss_family == AF_INET6)
++                      scnprintf(ses->ip_addr, sizeof(ses->ip_addr), "%pI6", &addr6->sin6_addr);
++              else
++                      scnprintf(ses->ip_addr, sizeof(ses->ip_addr), "%pI4", &addr->sin_addr);
++      }
++
+       if (!is_binding) {
+               ses->capabilities = server->capabilities;
+               if (!linuxExtEnabled)
+-- 
+2.39.2
+
diff --git a/queue-6.2/fsverity-don-t-drop-pagecache-at-end-of-fs_ioc_enabl.patch b/queue-6.2/fsverity-don-t-drop-pagecache-at-end-of-fs_ioc_enabl.patch
new file mode 100644 (file)
index 0000000..1872cd6
--- /dev/null
@@ -0,0 +1,73 @@
+From 8e38bc5af3d0e0d81a9ccb3a22c97e9361acd81d Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 14 Mar 2023 16:31:32 -0700
+Subject: fsverity: don't drop pagecache at end of FS_IOC_ENABLE_VERITY
+
+From: Eric Biggers <ebiggers@google.com>
+
+[ Upstream commit a075bacde257f755bea0e53400c9f1cdd1b8e8e6 ]
+
+The full pagecache drop at the end of FS_IOC_ENABLE_VERITY is causing
+performance problems and is hindering adoption of fsverity.  It was
+intended to solve a race condition where unverified pages might be left
+in the pagecache.  But actually it doesn't solve it fully.
+
+Since the incomplete solution for this race condition has too much
+performance impact for it to be worth it, let's remove it for now.
+
+Fixes: 3fda4c617e84 ("fs-verity: implement FS_IOC_ENABLE_VERITY ioctl")
+Cc: stable@vger.kernel.org
+Reviewed-by: Victor Hsieh <victorhsieh@google.com>
+Link: https://lore.kernel.org/r/20230314235332.50270-1-ebiggers@kernel.org
+Signed-off-by: Eric Biggers <ebiggers@google.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/verity/enable.c | 24 +++++++++++++-----------
+ 1 file changed, 13 insertions(+), 11 deletions(-)
+
+diff --git a/fs/verity/enable.c b/fs/verity/enable.c
+index df6b499bf6a14..400c264bf8930 100644
+--- a/fs/verity/enable.c
++++ b/fs/verity/enable.c
+@@ -390,25 +390,27 @@ int fsverity_ioctl_enable(struct file *filp, const void __user *uarg)
+               goto out_drop_write;
+       err = enable_verity(filp, &arg);
+-      if (err)
+-              goto out_allow_write_access;
+       /*
+-       * Some pages of the file may have been evicted from pagecache after
+-       * being used in the Merkle tree construction, then read into pagecache
+-       * again by another process reading from the file concurrently.  Since
+-       * these pages didn't undergo verification against the file digest which
+-       * fs-verity now claims to be enforcing, we have to wipe the pagecache
+-       * to ensure that all future reads are verified.
++       * We no longer drop the inode's pagecache after enabling verity.  This
++       * used to be done to try to avoid a race condition where pages could be
++       * evicted after being used in the Merkle tree construction, then
++       * re-instantiated by a concurrent read.  Such pages are unverified, and
++       * the backing storage could have filled them with different content, so
++       * they shouldn't be used to fulfill reads once verity is enabled.
++       *
++       * But, dropping the pagecache has a big performance impact, and it
++       * doesn't fully solve the race condition anyway.  So for those reasons,
++       * and also because this race condition isn't very important relatively
++       * speaking (especially for small-ish files, where the chance of a page
++       * being used, evicted, *and* re-instantiated all while enabling verity
++       * is quite small), we no longer drop the inode's pagecache.
+        */
+-      filemap_write_and_wait(inode->i_mapping);
+-      invalidate_inode_pages2(inode->i_mapping);
+       /*
+        * allow_write_access() is needed to pair with deny_write_access().
+        * Regardless, the filesystem won't allow writing to verity files.
+        */
+-out_allow_write_access:
+       allow_write_access(filp);
+ out_drop_write:
+       mnt_drop_write_file(filp);
+-- 
+2.39.2
+
diff --git a/queue-6.2/series b/queue-6.2/series
new file mode 100644 (file)
index 0000000..17cefee
--- /dev/null
@@ -0,0 +1,13 @@
+thunderbolt-limit-usb3-bandwidth-of-certain-intel-us.patch
+cifs-update-ip_addr-for-ses-only-for-primary-chan-se.patch
+cifs-prevent-data-race-in-cifs_reconnect_tcon.patch
+cifs-avoid-race-conditions-with-parallel-reconnects.patch
+zonefs-reorganize-code.patch
+zonefs-simplify-io-error-handling.patch
+zonefs-reduce-struct-zonefs_inode_info-size.patch
+zonefs-separate-zone-information-from-inode-informat.patch
+zonefs-fix-error-message-in-zonefs_file_dio_append.patch
+btrfs-rename-btrfs_fs_no_overcommit-to-btrfs_fs_acti.patch
+btrfs-zoned-count-fresh-bg-region-as-zone-unusable.patch
+btrfs-zoned-drop-space_info-active_total_bytes.patch
+fsverity-don-t-drop-pagecache-at-end-of-fs_ioc_enabl.patch
diff --git a/queue-6.2/thunderbolt-limit-usb3-bandwidth-of-certain-intel-us.patch b/queue-6.2/thunderbolt-limit-usb3-bandwidth-of-certain-intel-us.patch
new file mode 100644 (file)
index 0000000..a7ccee3
--- /dev/null
@@ -0,0 +1,138 @@
+From 37e2a48598a23c799b1dd4fabaa5b7d6bec7ab4c Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 31 Jan 2023 13:04:52 +0200
+Subject: thunderbolt: Limit USB3 bandwidth of certain Intel USB4 host routers
+
+From: Gil Fine <gil.fine@linux.intel.com>
+
+[ Upstream commit f0a57dd33b3eadf540912cd130db727ea824d174 ]
+
+Current Intel USB4 host routers have hardware limitation that the USB3
+bandwidth cannot go higher than 16376 Mb/s. Work this around by adding a
+new quirk that limits the bandwidth for the affected host routers.
+
+Cc: stable@vger.kernel.org
+Signed-off-by: Gil Fine <gil.fine@linux.intel.com>
+Signed-off-by: Mika Westerberg <mika.westerberg@linux.intel.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/thunderbolt/quirks.c | 31 +++++++++++++++++++++++++++++++
+ drivers/thunderbolt/tb.h     |  3 +++
+ drivers/thunderbolt/usb4.c   | 17 +++++++++++++++--
+ 3 files changed, 49 insertions(+), 2 deletions(-)
+
+diff --git a/drivers/thunderbolt/quirks.c b/drivers/thunderbolt/quirks.c
+index ae28a03fa890b..1157b8869bcca 100644
+--- a/drivers/thunderbolt/quirks.c
++++ b/drivers/thunderbolt/quirks.c
+@@ -26,6 +26,19 @@ static void quirk_clx_disable(struct tb_switch *sw)
+       tb_sw_dbg(sw, "disabling CL states\n");
+ }
++static void quirk_usb3_maximum_bandwidth(struct tb_switch *sw)
++{
++      struct tb_port *port;
++
++      tb_switch_for_each_port(sw, port) {
++              if (!tb_port_is_usb3_down(port))
++                      continue;
++              port->max_bw = 16376;
++              tb_port_dbg(port, "USB3 maximum bandwidth limited to %u Mb/s\n",
++                          port->max_bw);
++      }
++}
++
+ struct tb_quirk {
+       u16 hw_vendor_id;
+       u16 hw_device_id;
+@@ -43,6 +56,24 @@ static const struct tb_quirk tb_quirks[] = {
+        * DP buffers.
+        */
+       { 0x8087, 0x0b26, 0x0000, 0x0000, quirk_dp_credit_allocation },
++      /*
++       * Limit the maximum USB3 bandwidth for the following Intel USB4
++       * host routers due to a hardware issue.
++       */
++      { 0x8087, PCI_DEVICE_ID_INTEL_ADL_NHI0, 0x0000, 0x0000,
++                quirk_usb3_maximum_bandwidth },
++      { 0x8087, PCI_DEVICE_ID_INTEL_ADL_NHI1, 0x0000, 0x0000,
++                quirk_usb3_maximum_bandwidth },
++      { 0x8087, PCI_DEVICE_ID_INTEL_RPL_NHI0, 0x0000, 0x0000,
++                quirk_usb3_maximum_bandwidth },
++      { 0x8087, PCI_DEVICE_ID_INTEL_RPL_NHI1, 0x0000, 0x0000,
++                quirk_usb3_maximum_bandwidth },
++      { 0x8087, PCI_DEVICE_ID_INTEL_MTL_M_NHI0, 0x0000, 0x0000,
++                quirk_usb3_maximum_bandwidth },
++      { 0x8087, PCI_DEVICE_ID_INTEL_MTL_P_NHI0, 0x0000, 0x0000,
++                quirk_usb3_maximum_bandwidth },
++      { 0x8087, PCI_DEVICE_ID_INTEL_MTL_P_NHI1, 0x0000, 0x0000,
++                quirk_usb3_maximum_bandwidth },
+       /*
+        * CLx is not supported on AMD USB4 Yellow Carp and Pink Sardine platforms.
+        */
+diff --git a/drivers/thunderbolt/tb.h b/drivers/thunderbolt/tb.h
+index e11d973a8f9b6..f034723b1b40e 100644
+--- a/drivers/thunderbolt/tb.h
++++ b/drivers/thunderbolt/tb.h
+@@ -252,6 +252,8 @@ struct tb_switch {
+  * @ctl_credits: Buffers reserved for control path
+  * @dma_credits: Number of credits allocated for DMA tunneling for all
+  *             DMA paths through this port.
++ * @max_bw: Maximum possible bandwidth through this adapter if set to
++ *        non-zero.
+  *
+  * In USB4 terminology this structure represents an adapter (protocol or
+  * lane adapter).
+@@ -277,6 +279,7 @@ struct tb_port {
+       unsigned int total_credits;
+       unsigned int ctl_credits;
+       unsigned int dma_credits;
++      unsigned int max_bw;
+ };
+ /**
+diff --git a/drivers/thunderbolt/usb4.c b/drivers/thunderbolt/usb4.c
+index d5cd219ee9e6b..3a11b30b6c86a 100644
+--- a/drivers/thunderbolt/usb4.c
++++ b/drivers/thunderbolt/usb4.c
+@@ -1882,6 +1882,15 @@ int usb4_port_retimer_nvm_read(struct tb_port *port, u8 index,
+                               usb4_port_retimer_nvm_read_block, &info);
+ }
++static inline unsigned int
++usb4_usb3_port_max_bandwidth(const struct tb_port *port, unsigned int bw)
++{
++      /* Take the possible bandwidth limitation into account */
++      if (port->max_bw)
++              return min(bw, port->max_bw);
++      return bw;
++}
++
+ /**
+  * usb4_usb3_port_max_link_rate() - Maximum support USB3 link rate
+  * @port: USB3 adapter port
+@@ -1903,7 +1912,9 @@ int usb4_usb3_port_max_link_rate(struct tb_port *port)
+               return ret;
+       lr = (val & ADP_USB3_CS_4_MSLR_MASK) >> ADP_USB3_CS_4_MSLR_SHIFT;
+-      return lr == ADP_USB3_CS_4_MSLR_20G ? 20000 : 10000;
++      ret = lr == ADP_USB3_CS_4_MSLR_20G ? 20000 : 10000;
++
++      return usb4_usb3_port_max_bandwidth(port, ret);
+ }
+ /**
+@@ -1930,7 +1941,9 @@ int usb4_usb3_port_actual_link_rate(struct tb_port *port)
+               return 0;
+       lr = val & ADP_USB3_CS_4_ALR_MASK;
+-      return lr == ADP_USB3_CS_4_ALR_20G ? 20000 : 10000;
++      ret = lr == ADP_USB3_CS_4_ALR_20G ? 20000 : 10000;
++
++      return usb4_usb3_port_max_bandwidth(port, ret);
+ }
+ static int usb4_usb3_port_cm_request(struct tb_port *port, bool request)
+-- 
+2.39.2
+
diff --git a/queue-6.2/zonefs-fix-error-message-in-zonefs_file_dio_append.patch b/queue-6.2/zonefs-fix-error-message-in-zonefs_file_dio_append.patch
new file mode 100644 (file)
index 0000000..87e59c4
--- /dev/null
@@ -0,0 +1,41 @@
+From 9c08f088ff35a6aceeb24a61989b6866012caeb4 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 20 Mar 2023 22:49:15 +0900
+Subject: zonefs: Fix error message in zonefs_file_dio_append()
+
+From: Damien Le Moal <damien.lemoal@opensource.wdc.com>
+
+[ Upstream commit 88b170088ad2c3e27086fe35769aa49f8a512564 ]
+
+Since the expected write location in a sequential file is always at the
+end of the file (append write), when an invalid write append location is
+detected in zonefs_file_dio_append(), print the invalid written location
+instead of the expected write location.
+
+Fixes: a608da3bd730 ("zonefs: Detect append writes at invalid locations")
+Cc: stable@vger.kernel.org
+Signed-off-by: Damien Le Moal <damien.lemoal@opensource.wdc.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
+Reviewed-by: Himanshu Madhani <himanshu.madhani@oracle.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/zonefs/file.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/fs/zonefs/file.c b/fs/zonefs/file.c
+index 738b0e28d74b5..c71cc0fcb3ec8 100644
+--- a/fs/zonefs/file.c
++++ b/fs/zonefs/file.c
+@@ -426,7 +426,7 @@ static ssize_t zonefs_file_dio_append(struct kiocb *iocb, struct iov_iter *from)
+               if (bio->bi_iter.bi_sector != wpsector) {
+                       zonefs_warn(inode->i_sb,
+                               "Corrupted write pointer %llu for zone at %llu\n",
+-                              wpsector, z->z_sector);
++                              bio->bi_iter.bi_sector, z->z_sector);
+                       ret = -EIO;
+               }
+       }
+-- 
+2.39.2
+
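To make the one-line fix above concrete: after a zone append completes, zonefs compares the sector the write actually landed at (bio->bi_iter.bi_sector) with the expected write pointer, and the fix reports that written sector rather than the expected one when they differ. The sketch below is a standalone illustration, not kernel code; the ex_check_append() name and the sample sector numbers are invented.

#include <stdio.h>

typedef unsigned long long sector_t;

/* Sketch of the post-append check: when the device wrote somewhere unexpected,
 * report the *written* location (as the fix does), not the expected one. */
static int ex_check_append(sector_t written, sector_t expected, sector_t zone_start)
{
	if (written != expected) {
		fprintf(stderr, "Corrupted write pointer %llu for zone at %llu\n",
			written, zone_start);
		return -1;	/* stands in for -EIO */
	}
	return 0;
}

int main(void)
{
	/* Expected the append at sector 4096 of the zone starting at 4096,
	 * but the completion says the data landed at sector 4104. */
	return ex_check_append(4104, 4096, 4096) ? 1 : 0;
}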
diff --git a/queue-6.2/zonefs-reduce-struct-zonefs_inode_info-size.patch b/queue-6.2/zonefs-reduce-struct-zonefs_inode_info-size.patch
new file mode 100644 (file)
index 0000000..ff3bba7
--- /dev/null
@@ -0,0 +1,283 @@
+From 9f7ff5e239ee3af9be4cfb57edccf0c120518282 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 24 Nov 2022 19:43:30 +0900
+Subject: zonefs: Reduce struct zonefs_inode_info size
+
+From: Damien Le Moal <damien.lemoal@opensource.wdc.com>
+
+[ Upstream commit 34422914dc00b291d1c47dbdabe93b154c2f2b25 ]
+
+Instead of using the i_ztype field in struct zonefs_inode_info to
+indicate the zone type of an inode, introduce the new inode flag
+ZONEFS_ZONE_CNV to be set in the i_flags field of struct
+zonefs_inode_info to identify conventional zones. If this flag is not
+set, the zone of an inode is considered to be a sequential zone.
+
+The helpers zonefs_zone_is_cnv(), zonefs_zone_is_seq(),
+zonefs_inode_is_cnv() and zonefs_inode_is_seq() are introduced to
+simplify testing the zone type of a struct zonefs_inode_info and of a
+struct inode.
+
+Signed-off-by: Damien Le Moal <damien.lemoal@opensource.wdc.com>
+Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
+Stable-dep-of: 88b170088ad2 ("zonefs: Fix error message in zonefs_file_dio_append()")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/zonefs/file.c   | 35 ++++++++++++++---------------------
+ fs/zonefs/super.c  | 12 +++++++-----
+ fs/zonefs/zonefs.h | 24 +++++++++++++++++++++---
+ 3 files changed, 42 insertions(+), 29 deletions(-)
+
+diff --git a/fs/zonefs/file.c b/fs/zonefs/file.c
+index ece0f3959b6d1..64873d31d75dd 100644
+--- a/fs/zonefs/file.c
++++ b/fs/zonefs/file.c
+@@ -77,8 +77,7 @@ static int zonefs_write_iomap_begin(struct inode *inode, loff_t offset,
+        * checked when writes are issued, so warn if we see a page writeback
+        * operation.
+        */
+-      if (WARN_ON_ONCE(zi->i_ztype == ZONEFS_ZTYPE_SEQ &&
+-                       !(flags & IOMAP_DIRECT)))
++      if (WARN_ON_ONCE(zonefs_zone_is_seq(zi) && !(flags & IOMAP_DIRECT)))
+               return -EIO;
+       /*
+@@ -128,7 +127,7 @@ static int zonefs_write_map_blocks(struct iomap_writepage_ctx *wpc,
+ {
+       struct zonefs_inode_info *zi = ZONEFS_I(inode);
+-      if (WARN_ON_ONCE(zi->i_ztype != ZONEFS_ZTYPE_CNV))
++      if (WARN_ON_ONCE(zonefs_zone_is_seq(zi)))
+               return -EIO;
+       if (WARN_ON_ONCE(offset >= i_size_read(inode)))
+               return -EIO;
+@@ -158,9 +157,8 @@ static int zonefs_swap_activate(struct swap_info_struct *sis,
+                               struct file *swap_file, sector_t *span)
+ {
+       struct inode *inode = file_inode(swap_file);
+-      struct zonefs_inode_info *zi = ZONEFS_I(inode);
+-      if (zi->i_ztype != ZONEFS_ZTYPE_CNV) {
++      if (zonefs_inode_is_seq(inode)) {
+               zonefs_err(inode->i_sb,
+                          "swap file: not a conventional zone file\n");
+               return -EINVAL;
+@@ -196,7 +194,7 @@ int zonefs_file_truncate(struct inode *inode, loff_t isize)
+        * only down to a 0 size, which is equivalent to a zone reset, and to
+        * the maximum file size, which is equivalent to a zone finish.
+        */
+-      if (zi->i_ztype != ZONEFS_ZTYPE_SEQ)
++      if (!zonefs_zone_is_seq(zi))
+               return -EPERM;
+       if (!isize)
+@@ -266,7 +264,7 @@ static int zonefs_file_fsync(struct file *file, loff_t start, loff_t end,
+        * Since only direct writes are allowed in sequential files, page cache
+        * flush is needed only for conventional zone files.
+        */
+-      if (ZONEFS_I(inode)->i_ztype == ZONEFS_ZTYPE_CNV)
++      if (zonefs_inode_is_cnv(inode))
+               ret = file_write_and_wait_range(file, start, end);
+       if (!ret)
+               ret = blkdev_issue_flush(inode->i_sb->s_bdev);
+@@ -280,7 +278,6 @@ static int zonefs_file_fsync(struct file *file, loff_t start, loff_t end,
+ static vm_fault_t zonefs_filemap_page_mkwrite(struct vm_fault *vmf)
+ {
+       struct inode *inode = file_inode(vmf->vma->vm_file);
+-      struct zonefs_inode_info *zi = ZONEFS_I(inode);
+       vm_fault_t ret;
+       if (unlikely(IS_IMMUTABLE(inode)))
+@@ -290,7 +287,7 @@ static vm_fault_t zonefs_filemap_page_mkwrite(struct vm_fault *vmf)
+        * Sanity check: only conventional zone files can have shared
+        * writeable mappings.
+        */
+-      if (WARN_ON_ONCE(zi->i_ztype != ZONEFS_ZTYPE_CNV))
++      if (zonefs_inode_is_seq(inode))
+               return VM_FAULT_NOPAGE;
+       sb_start_pagefault(inode->i_sb);
+@@ -319,7 +316,7 @@ static int zonefs_file_mmap(struct file *file, struct vm_area_struct *vma)
+        * mappings are possible since there are no guarantees for write
+        * ordering between msync() and page cache writeback.
+        */
+-      if (ZONEFS_I(file_inode(file))->i_ztype == ZONEFS_ZTYPE_SEQ &&
++      if (zonefs_inode_is_seq(file_inode(file)) &&
+           (vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
+               return -EINVAL;
+@@ -352,7 +349,7 @@ static int zonefs_file_write_dio_end_io(struct kiocb *iocb, ssize_t size,
+               return error;
+       }
+-      if (size && zi->i_ztype != ZONEFS_ZTYPE_CNV) {
++      if (size && zonefs_zone_is_seq(zi)) {
+               /*
+                * Note that we may be seeing completions out of order,
+                * but that is not a problem since a write completed
+@@ -491,7 +488,7 @@ static ssize_t zonefs_write_checks(struct kiocb *iocb, struct iov_iter *from)
+               return -EINVAL;
+       if (iocb->ki_flags & IOCB_APPEND) {
+-              if (zi->i_ztype != ZONEFS_ZTYPE_SEQ)
++              if (zonefs_zone_is_cnv(zi))
+                       return -EINVAL;
+               mutex_lock(&zi->i_truncate_mutex);
+               iocb->ki_pos = zi->i_wpoffset;
+@@ -531,8 +528,7 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from)
+        * as this can cause write reordering (e.g. the first aio gets EAGAIN
+        * on the inode lock but the second goes through but is now unaligned).
+        */
+-      if (zi->i_ztype == ZONEFS_ZTYPE_SEQ && !sync &&
+-          (iocb->ki_flags & IOCB_NOWAIT))
++      if (zonefs_zone_is_seq(zi) && !sync && (iocb->ki_flags & IOCB_NOWAIT))
+               return -EOPNOTSUPP;
+       if (iocb->ki_flags & IOCB_NOWAIT) {
+@@ -554,7 +550,7 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from)
+       }
+       /* Enforce sequential writes (append only) in sequential zones */
+-      if (zi->i_ztype == ZONEFS_ZTYPE_SEQ) {
++      if (zonefs_zone_is_seq(zi)) {
+               mutex_lock(&zi->i_truncate_mutex);
+               if (iocb->ki_pos != zi->i_wpoffset) {
+                       mutex_unlock(&zi->i_truncate_mutex);
+@@ -570,7 +566,7 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from)
+       else
+               ret = iomap_dio_rw(iocb, from, &zonefs_write_iomap_ops,
+                                  &zonefs_write_dio_ops, 0, NULL, 0);
+-      if (zi->i_ztype == ZONEFS_ZTYPE_SEQ &&
++      if (zonefs_zone_is_seq(zi) &&
+           (ret > 0 || ret == -EIOCBQUEUED)) {
+               if (ret > 0)
+                       count = ret;
+@@ -596,14 +592,13 @@ static ssize_t zonefs_file_buffered_write(struct kiocb *iocb,
+                                         struct iov_iter *from)
+ {
+       struct inode *inode = file_inode(iocb->ki_filp);
+-      struct zonefs_inode_info *zi = ZONEFS_I(inode);
+       ssize_t ret;
+       /*
+        * Direct IO writes are mandatory for sequential zone files so that the
+        * write IO issuing order is preserved.
+        */
+-      if (zi->i_ztype != ZONEFS_ZTYPE_CNV)
++      if (zonefs_inode_is_seq(inode))
+               return -EIO;
+       if (iocb->ki_flags & IOCB_NOWAIT) {
+@@ -731,9 +726,7 @@ static ssize_t zonefs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
+ static inline bool zonefs_seq_file_need_wro(struct inode *inode,
+                                           struct file *file)
+ {
+-      struct zonefs_inode_info *zi = ZONEFS_I(inode);
+-
+-      if (zi->i_ztype != ZONEFS_ZTYPE_SEQ)
++      if (zonefs_inode_is_cnv(inode))
+               return false;
+       if (!(file->f_mode & FMODE_WRITE))
+diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
+index 6307cc95be061..a4af29dc32e7d 100644
+--- a/fs/zonefs/super.c
++++ b/fs/zonefs/super.c
+@@ -37,7 +37,7 @@ void zonefs_account_active(struct inode *inode)
+       lockdep_assert_held(&zi->i_truncate_mutex);
+-      if (zi->i_ztype != ZONEFS_ZTYPE_SEQ)
++      if (zonefs_zone_is_cnv(zi))
+               return;
+       /*
+@@ -177,14 +177,14 @@ static loff_t zonefs_check_zone_condition(struct inode *inode,
+               zonefs_warn(inode->i_sb, "inode %lu: read-only zone\n",
+                           inode->i_ino);
+               zi->i_flags |= ZONEFS_ZONE_READONLY;
+-              if (zi->i_ztype == ZONEFS_ZTYPE_CNV)
++              if (zonefs_zone_is_cnv(zi))
+                       return zi->i_max_size;
+               return zi->i_wpoffset;
+       case BLK_ZONE_COND_FULL:
+               /* The write pointer of full zones is invalid. */
+               return zi->i_max_size;
+       default:
+-              if (zi->i_ztype == ZONEFS_ZTYPE_CNV)
++              if (zonefs_zone_is_cnv(zi))
+                       return zi->i_max_size;
+               return (zone->wp - zone->start) << SECTOR_SHIFT;
+       }
+@@ -260,7 +260,7 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx,
+        * In all cases, warn about inode size inconsistency and handle the
+        * IO error according to the zone condition and to the mount options.
+        */
+-      if (zi->i_ztype == ZONEFS_ZTYPE_SEQ && isize != data_size)
++      if (zonefs_zone_is_seq(zi) && isize != data_size)
+               zonefs_warn(sb, "inode %lu: invalid size %lld (should be %lld)\n",
+                           inode->i_ino, isize, data_size);
+@@ -584,7 +584,9 @@ static int zonefs_init_file_inode(struct inode *inode, struct blk_zone *zone,
+       inode->i_ino = zone->start >> sbi->s_zone_sectors_shift;
+       inode->i_mode = S_IFREG | sbi->s_perm;
+-      zi->i_ztype = type;
++      if (type == ZONEFS_ZTYPE_CNV)
++              zi->i_flags |= ZONEFS_ZONE_CNV;
++
+       zi->i_zsector = zone->start;
+       zi->i_zone_size = zone->len << SECTOR_SHIFT;
+       if (zi->i_zone_size > bdev_zone_sectors(sb->s_bdev) << SECTOR_SHIFT &&
+diff --git a/fs/zonefs/zonefs.h b/fs/zonefs/zonefs.h
+index 439096445ee53..1a225f74015a0 100644
+--- a/fs/zonefs/zonefs.h
++++ b/fs/zonefs/zonefs.h
+@@ -44,6 +44,7 @@ static inline enum zonefs_ztype zonefs_zone_type(struct blk_zone *zone)
+ #define ZONEFS_ZONE_ACTIVE    (1U << 2)
+ #define ZONEFS_ZONE_OFFLINE   (1U << 3)
+ #define ZONEFS_ZONE_READONLY  (1U << 4)
++#define ZONEFS_ZONE_CNV               (1U << 31)
+ /*
+  * In-memory inode data.
+@@ -51,9 +52,6 @@ static inline enum zonefs_ztype zonefs_zone_type(struct blk_zone *zone)
+ struct zonefs_inode_info {
+       struct inode            i_vnode;
+-      /* File zone type */
+-      enum zonefs_ztype       i_ztype;
+-
+       /* File zone start sector (512B unit) */
+       sector_t                i_zsector;
+@@ -91,6 +89,26 @@ static inline struct zonefs_inode_info *ZONEFS_I(struct inode *inode)
+       return container_of(inode, struct zonefs_inode_info, i_vnode);
+ }
++static inline bool zonefs_zone_is_cnv(struct zonefs_inode_info *zi)
++{
++      return zi->i_flags & ZONEFS_ZONE_CNV;
++}
++
++static inline bool zonefs_zone_is_seq(struct zonefs_inode_info *zi)
++{
++      return !zonefs_zone_is_cnv(zi);
++}
++
++static inline bool zonefs_inode_is_cnv(struct inode *inode)
++{
++      return zonefs_zone_is_cnv(ZONEFS_I(inode));
++}
++
++static inline bool zonefs_inode_is_seq(struct inode *inode)
++{
++      return zonefs_zone_is_seq(ZONEFS_I(inode));
++}
++
+ /*
+  * On-disk super block (block 0).
+  */
+-- 
+2.39.2
+
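The patch above swaps the per-inode i_ztype enum for a single bit in i_flags: conventional zones set ZONEFS_ZONE_CNV, and anything without the bit is treated as sequential. The standalone sketch below shows that pattern in isolation; the ex_-prefixed names are invented for the illustration, and only the flag test mirrors the new zonefs_zone_is_cnv()/zonefs_zone_is_seq() helpers.

#include <stdbool.h>
#include <stdio.h>

#define EX_ZONE_CNV	(1U << 31)	/* mirrors ZONEFS_ZONE_CNV */

struct ex_inode_info {
	unsigned int i_flags;
};

/* A zone is conventional if the flag is set ... */
static bool ex_zone_is_cnv(const struct ex_inode_info *zi)
{
	return zi->i_flags & EX_ZONE_CNV;
}

/* ... and sequential otherwise, so no separate enum field is needed. */
static bool ex_zone_is_seq(const struct ex_inode_info *zi)
{
	return !ex_zone_is_cnv(zi);
}

int main(void)
{
	struct ex_inode_info cnv = { .i_flags = EX_ZONE_CNV };
	struct ex_inode_info seq = { .i_flags = 0 };

	printf("cnv file: is_cnv=%d is_seq=%d\n", ex_zone_is_cnv(&cnv), ex_zone_is_seq(&cnv));
	printf("seq file: is_cnv=%d is_seq=%d\n", ex_zone_is_cnv(&seq), ex_zone_is_seq(&seq));
	return 0;
}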
diff --git a/queue-6.2/zonefs-reorganize-code.patch b/queue-6.2/zonefs-reorganize-code.patch
new file mode 100644 (file)
index 0000000..b340322
--- /dev/null
@@ -0,0 +1,1990 @@
+From 05bc54065b57e4fb5ff76d916d5de3526881794c Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 25 Nov 2022 09:39:33 +0900
+Subject: zonefs: Reorganize code
+
+From: Damien Le Moal <damien.lemoal@opensource.wdc.com>
+
+[ Upstream commit 4008e2a0b01aba982356fd15b128a47bf11bd9c7 ]
+
+Move all code related to zone file operations from super.c to the new
+file.c file. Inode and zone management code remains in super.c.
+
+Signed-off-by: Damien Le Moal <damien.lemoal@opensource.wdc.com>
+Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
+Stable-dep-of: 88b170088ad2 ("zonefs: Fix error message in zonefs_file_dio_append()")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/zonefs/Makefile |   2 +-
+ fs/zonefs/file.c   | 874 ++++++++++++++++++++++++++++++++++++++++
+ fs/zonefs/super.c  | 973 +++------------------------------------------
+ fs/zonefs/zonefs.h |  22 +
+ 4 files changed, 955 insertions(+), 916 deletions(-)
+ create mode 100644 fs/zonefs/file.c
+
+diff --git a/fs/zonefs/Makefile b/fs/zonefs/Makefile
+index 9fe54f5319f22..645f7229de4a0 100644
+--- a/fs/zonefs/Makefile
++++ b/fs/zonefs/Makefile
+@@ -3,4 +3,4 @@ ccflags-y                              += -I$(src)
+ obj-$(CONFIG_ZONEFS_FS) += zonefs.o
+-zonefs-y      := super.o sysfs.o
++zonefs-y      := super.o file.o sysfs.o
+diff --git a/fs/zonefs/file.c b/fs/zonefs/file.c
+new file mode 100644
+index 0000000000000..ece0f3959b6d1
+--- /dev/null
++++ b/fs/zonefs/file.c
+@@ -0,0 +1,874 @@
++// SPDX-License-Identifier: GPL-2.0
++/*
++ * Simple file system for zoned block devices exposing zones as files.
++ *
++ * Copyright (C) 2022 Western Digital Corporation or its affiliates.
++ */
++#include <linux/module.h>
++#include <linux/pagemap.h>
++#include <linux/iomap.h>
++#include <linux/init.h>
++#include <linux/slab.h>
++#include <linux/blkdev.h>
++#include <linux/statfs.h>
++#include <linux/writeback.h>
++#include <linux/quotaops.h>
++#include <linux/seq_file.h>
++#include <linux/parser.h>
++#include <linux/uio.h>
++#include <linux/mman.h>
++#include <linux/sched/mm.h>
++#include <linux/task_io_accounting_ops.h>
++
++#include "zonefs.h"
++
++#include "trace.h"
++
++static int zonefs_read_iomap_begin(struct inode *inode, loff_t offset,
++                                 loff_t length, unsigned int flags,
++                                 struct iomap *iomap, struct iomap *srcmap)
++{
++      struct zonefs_inode_info *zi = ZONEFS_I(inode);
++      struct super_block *sb = inode->i_sb;
++      loff_t isize;
++
++      /*
++       * All blocks are always mapped below EOF. If reading past EOF,
++       * act as if there is a hole up to the file maximum size.
++       */
++      mutex_lock(&zi->i_truncate_mutex);
++      iomap->bdev = inode->i_sb->s_bdev;
++      iomap->offset = ALIGN_DOWN(offset, sb->s_blocksize);
++      isize = i_size_read(inode);
++      if (iomap->offset >= isize) {
++              iomap->type = IOMAP_HOLE;
++              iomap->addr = IOMAP_NULL_ADDR;
++              iomap->length = length;
++      } else {
++              iomap->type = IOMAP_MAPPED;
++              iomap->addr = (zi->i_zsector << SECTOR_SHIFT) + iomap->offset;
++              iomap->length = isize - iomap->offset;
++      }
++      mutex_unlock(&zi->i_truncate_mutex);
++
++      trace_zonefs_iomap_begin(inode, iomap);
++
++      return 0;
++}
++
++static const struct iomap_ops zonefs_read_iomap_ops = {
++      .iomap_begin    = zonefs_read_iomap_begin,
++};
++
++static int zonefs_write_iomap_begin(struct inode *inode, loff_t offset,
++                                  loff_t length, unsigned int flags,
++                                  struct iomap *iomap, struct iomap *srcmap)
++{
++      struct zonefs_inode_info *zi = ZONEFS_I(inode);
++      struct super_block *sb = inode->i_sb;
++      loff_t isize;
++
++      /* All write I/Os should always be within the file maximum size */
++      if (WARN_ON_ONCE(offset + length > zi->i_max_size))
++              return -EIO;
++
++      /*
++       * Sequential zones can only accept direct writes. This is already
++       * checked when writes are issued, so warn if we see a page writeback
++       * operation.
++       */
++      if (WARN_ON_ONCE(zi->i_ztype == ZONEFS_ZTYPE_SEQ &&
++                       !(flags & IOMAP_DIRECT)))
++              return -EIO;
++
++      /*
++       * For conventional zones, all blocks are always mapped. For sequential
++       * zones, all blocks are always mapped below the inode size (zone
++       * write pointer) and unwritten beyond.
++       */
++      mutex_lock(&zi->i_truncate_mutex);
++      iomap->bdev = inode->i_sb->s_bdev;
++      iomap->offset = ALIGN_DOWN(offset, sb->s_blocksize);
++      iomap->addr = (zi->i_zsector << SECTOR_SHIFT) + iomap->offset;
++      isize = i_size_read(inode);
++      if (iomap->offset >= isize) {
++              iomap->type = IOMAP_UNWRITTEN;
++              iomap->length = zi->i_max_size - iomap->offset;
++      } else {
++              iomap->type = IOMAP_MAPPED;
++              iomap->length = isize - iomap->offset;
++      }
++      mutex_unlock(&zi->i_truncate_mutex);
++
++      trace_zonefs_iomap_begin(inode, iomap);
++
++      return 0;
++}
++
++static const struct iomap_ops zonefs_write_iomap_ops = {
++      .iomap_begin    = zonefs_write_iomap_begin,
++};
++
++static int zonefs_read_folio(struct file *unused, struct folio *folio)
++{
++      return iomap_read_folio(folio, &zonefs_read_iomap_ops);
++}
++
++static void zonefs_readahead(struct readahead_control *rac)
++{
++      iomap_readahead(rac, &zonefs_read_iomap_ops);
++}
++
++/*
++ * Map blocks for page writeback. This is used only on conventional zone files,
++ * which implies that the page range can only be within the fixed inode size.
++ */
++static int zonefs_write_map_blocks(struct iomap_writepage_ctx *wpc,
++                                 struct inode *inode, loff_t offset)
++{
++      struct zonefs_inode_info *zi = ZONEFS_I(inode);
++
++      if (WARN_ON_ONCE(zi->i_ztype != ZONEFS_ZTYPE_CNV))
++              return -EIO;
++      if (WARN_ON_ONCE(offset >= i_size_read(inode)))
++              return -EIO;
++
++      /* If the mapping is already OK, nothing needs to be done */
++      if (offset >= wpc->iomap.offset &&
++          offset < wpc->iomap.offset + wpc->iomap.length)
++              return 0;
++
++      return zonefs_write_iomap_begin(inode, offset, zi->i_max_size - offset,
++                                      IOMAP_WRITE, &wpc->iomap, NULL);
++}
++
++static const struct iomap_writeback_ops zonefs_writeback_ops = {
++      .map_blocks             = zonefs_write_map_blocks,
++};
++
++static int zonefs_writepages(struct address_space *mapping,
++                           struct writeback_control *wbc)
++{
++      struct iomap_writepage_ctx wpc = { };
++
++      return iomap_writepages(mapping, wbc, &wpc, &zonefs_writeback_ops);
++}
++
++static int zonefs_swap_activate(struct swap_info_struct *sis,
++                              struct file *swap_file, sector_t *span)
++{
++      struct inode *inode = file_inode(swap_file);
++      struct zonefs_inode_info *zi = ZONEFS_I(inode);
++
++      if (zi->i_ztype != ZONEFS_ZTYPE_CNV) {
++              zonefs_err(inode->i_sb,
++                         "swap file: not a conventional zone file\n");
++              return -EINVAL;
++      }
++
++      return iomap_swapfile_activate(sis, swap_file, span,
++                                     &zonefs_read_iomap_ops);
++}
++
++const struct address_space_operations zonefs_file_aops = {
++      .read_folio             = zonefs_read_folio,
++      .readahead              = zonefs_readahead,
++      .writepages             = zonefs_writepages,
++      .dirty_folio            = filemap_dirty_folio,
++      .release_folio          = iomap_release_folio,
++      .invalidate_folio       = iomap_invalidate_folio,
++      .migrate_folio          = filemap_migrate_folio,
++      .is_partially_uptodate  = iomap_is_partially_uptodate,
++      .error_remove_page      = generic_error_remove_page,
++      .direct_IO              = noop_direct_IO,
++      .swap_activate          = zonefs_swap_activate,
++};
++
++int zonefs_file_truncate(struct inode *inode, loff_t isize)
++{
++      struct zonefs_inode_info *zi = ZONEFS_I(inode);
++      loff_t old_isize;
++      enum req_op op;
++      int ret = 0;
++
++      /*
++       * Only sequential zone files can be truncated and truncation is allowed
++       * only down to a 0 size, which is equivalent to a zone reset, and to
++       * the maximum file size, which is equivalent to a zone finish.
++       */
++      if (zi->i_ztype != ZONEFS_ZTYPE_SEQ)
++              return -EPERM;
++
++      if (!isize)
++              op = REQ_OP_ZONE_RESET;
++      else if (isize == zi->i_max_size)
++              op = REQ_OP_ZONE_FINISH;
++      else
++              return -EPERM;
++
++      inode_dio_wait(inode);
++
++      /* Serialize against page faults */
++      filemap_invalidate_lock(inode->i_mapping);
++
++      /* Serialize against zonefs_iomap_begin() */
++      mutex_lock(&zi->i_truncate_mutex);
++
++      old_isize = i_size_read(inode);
++      if (isize == old_isize)
++              goto unlock;
++
++      ret = zonefs_zone_mgmt(inode, op);
++      if (ret)
++              goto unlock;
++
++      /*
++       * If the mount option ZONEFS_MNTOPT_EXPLICIT_OPEN is set,
++       * take care of open zones.
++       */
++      if (zi->i_flags & ZONEFS_ZONE_OPEN) {
++              /*
++               * Truncating a zone to EMPTY or FULL is the equivalent of
++               * closing the zone. For a truncation to 0, we need to
++               * re-open the zone to ensure new writes can be processed.
++               * For a truncation to the maximum file size, the zone is
++               * closed and writes cannot be accepted anymore, so clear
++               * the open flag.
++               */
++              if (!isize)
++                      ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_OPEN);
++              else
++                      zi->i_flags &= ~ZONEFS_ZONE_OPEN;
++      }
++
++      zonefs_update_stats(inode, isize);
++      truncate_setsize(inode, isize);
++      zi->i_wpoffset = isize;
++      zonefs_account_active(inode);
++
++unlock:
++      mutex_unlock(&zi->i_truncate_mutex);
++      filemap_invalidate_unlock(inode->i_mapping);
++
++      return ret;
++}
++
++static int zonefs_file_fsync(struct file *file, loff_t start, loff_t end,
++                           int datasync)
++{
++      struct inode *inode = file_inode(file);
++      int ret = 0;
++
++      if (unlikely(IS_IMMUTABLE(inode)))
++              return -EPERM;
++
++      /*
++       * Since only direct writes are allowed in sequential files, page cache
++       * flush is needed only for conventional zone files.
++       */
++      if (ZONEFS_I(inode)->i_ztype == ZONEFS_ZTYPE_CNV)
++              ret = file_write_and_wait_range(file, start, end);
++      if (!ret)
++              ret = blkdev_issue_flush(inode->i_sb->s_bdev);
++
++      if (ret)
++              zonefs_io_error(inode, true);
++
++      return ret;
++}
++
++static vm_fault_t zonefs_filemap_page_mkwrite(struct vm_fault *vmf)
++{
++      struct inode *inode = file_inode(vmf->vma->vm_file);
++      struct zonefs_inode_info *zi = ZONEFS_I(inode);
++      vm_fault_t ret;
++
++      if (unlikely(IS_IMMUTABLE(inode)))
++              return VM_FAULT_SIGBUS;
++
++      /*
++       * Sanity check: only conventional zone files can have shared
++       * writeable mappings.
++       */
++      if (WARN_ON_ONCE(zi->i_ztype != ZONEFS_ZTYPE_CNV))
++              return VM_FAULT_NOPAGE;
++
++      sb_start_pagefault(inode->i_sb);
++      file_update_time(vmf->vma->vm_file);
++
++      /* Serialize against truncates */
++      filemap_invalidate_lock_shared(inode->i_mapping);
++      ret = iomap_page_mkwrite(vmf, &zonefs_write_iomap_ops);
++      filemap_invalidate_unlock_shared(inode->i_mapping);
++
++      sb_end_pagefault(inode->i_sb);
++      return ret;
++}
++
++static const struct vm_operations_struct zonefs_file_vm_ops = {
++      .fault          = filemap_fault,
++      .map_pages      = filemap_map_pages,
++      .page_mkwrite   = zonefs_filemap_page_mkwrite,
++};
++
++static int zonefs_file_mmap(struct file *file, struct vm_area_struct *vma)
++{
++      /*
++       * Conventional zones accept random writes, so their files can support
++       * shared writable mappings. For sequential zone files, only read
++       * mappings are possible since there are no guarantees for write
++       * ordering between msync() and page cache writeback.
++       */
++      if (ZONEFS_I(file_inode(file))->i_ztype == ZONEFS_ZTYPE_SEQ &&
++          (vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
++              return -EINVAL;
++
++      file_accessed(file);
++      vma->vm_ops = &zonefs_file_vm_ops;
++
++      return 0;
++}
++
++static loff_t zonefs_file_llseek(struct file *file, loff_t offset, int whence)
++{
++      loff_t isize = i_size_read(file_inode(file));
++
++      /*
++       * Seeks are limited to below the zone size for conventional zones
++       * and below the zone write pointer for sequential zones. In both
++       * cases, this limit is the inode size.
++       */
++      return generic_file_llseek_size(file, offset, whence, isize, isize);
++}
++
++static int zonefs_file_write_dio_end_io(struct kiocb *iocb, ssize_t size,
++                                      int error, unsigned int flags)
++{
++      struct inode *inode = file_inode(iocb->ki_filp);
++      struct zonefs_inode_info *zi = ZONEFS_I(inode);
++
++      if (error) {
++              zonefs_io_error(inode, true);
++              return error;
++      }
++
++      if (size && zi->i_ztype != ZONEFS_ZTYPE_CNV) {
++              /*
++               * Note that we may be seeing completions out of order,
++               * but that is not a problem since a write completed
++               * successfully necessarily means that all preceding writes
++               * were also successful. So we can safely increase the inode
++               * size to the write end location.
++               */
++              mutex_lock(&zi->i_truncate_mutex);
++              if (i_size_read(inode) < iocb->ki_pos + size) {
++                      zonefs_update_stats(inode, iocb->ki_pos + size);
++                      zonefs_i_size_write(inode, iocb->ki_pos + size);
++              }
++              mutex_unlock(&zi->i_truncate_mutex);
++      }
++
++      return 0;
++}
++
++static const struct iomap_dio_ops zonefs_write_dio_ops = {
++      .end_io                 = zonefs_file_write_dio_end_io,
++};
++
++static ssize_t zonefs_file_dio_append(struct kiocb *iocb, struct iov_iter *from)
++{
++      struct inode *inode = file_inode(iocb->ki_filp);
++      struct zonefs_inode_info *zi = ZONEFS_I(inode);
++      struct block_device *bdev = inode->i_sb->s_bdev;
++      unsigned int max = bdev_max_zone_append_sectors(bdev);
++      struct bio *bio;
++      ssize_t size;
++      int nr_pages;
++      ssize_t ret;
++
++      max = ALIGN_DOWN(max << SECTOR_SHIFT, inode->i_sb->s_blocksize);
++      iov_iter_truncate(from, max);
++
++      nr_pages = iov_iter_npages(from, BIO_MAX_VECS);
++      if (!nr_pages)
++              return 0;
++
++      bio = bio_alloc(bdev, nr_pages,
++                      REQ_OP_ZONE_APPEND | REQ_SYNC | REQ_IDLE, GFP_NOFS);
++      bio->bi_iter.bi_sector = zi->i_zsector;
++      bio->bi_ioprio = iocb->ki_ioprio;
++      if (iocb_is_dsync(iocb))
++              bio->bi_opf |= REQ_FUA;
++
++      ret = bio_iov_iter_get_pages(bio, from);
++      if (unlikely(ret))
++              goto out_release;
++
++      size = bio->bi_iter.bi_size;
++      task_io_account_write(size);
++
++      if (iocb->ki_flags & IOCB_HIPRI)
++              bio_set_polled(bio, iocb);
++
++      ret = submit_bio_wait(bio);
++
++      /*
++       * If the file zone was written underneath the file system, the zone
++       * write pointer may not be where we expect it to be, but the zone
++       * append write can still succeed. So check manually that we wrote where
++       * we intended to, that is, at zi->i_wpoffset.
++       */
++      if (!ret) {
++              sector_t wpsector =
++                      zi->i_zsector + (zi->i_wpoffset >> SECTOR_SHIFT);
++
++              if (bio->bi_iter.bi_sector != wpsector) {
++                      zonefs_warn(inode->i_sb,
++                              "Corrupted write pointer %llu for zone at %llu\n",
++                              wpsector, zi->i_zsector);
++                      ret = -EIO;
++              }
++      }
++
++      zonefs_file_write_dio_end_io(iocb, size, ret, 0);
++      trace_zonefs_file_dio_append(inode, size, ret);
++
++out_release:
++      bio_release_pages(bio, false);
++      bio_put(bio);
++
++      if (ret >= 0) {
++              iocb->ki_pos += size;
++              return size;
++      }
++
++      return ret;
++}
++
++/*
++ * Do not exceed the LFS limits nor the file zone size. If pos is under the
++ * limit it becomes a short access. If it exceeds the limit, return -EFBIG.
++ */
++static loff_t zonefs_write_check_limits(struct file *file, loff_t pos,
++                                      loff_t count)
++{
++      struct inode *inode = file_inode(file);
++      struct zonefs_inode_info *zi = ZONEFS_I(inode);
++      loff_t limit = rlimit(RLIMIT_FSIZE);
++      loff_t max_size = zi->i_max_size;
++
++      if (limit != RLIM_INFINITY) {
++              if (pos >= limit) {
++                      send_sig(SIGXFSZ, current, 0);
++                      return -EFBIG;
++              }
++              count = min(count, limit - pos);
++      }
++
++      if (!(file->f_flags & O_LARGEFILE))
++              max_size = min_t(loff_t, MAX_NON_LFS, max_size);
++
++      if (unlikely(pos >= max_size))
++              return -EFBIG;
++
++      return min(count, max_size - pos);
++}
++
++static ssize_t zonefs_write_checks(struct kiocb *iocb, struct iov_iter *from)
++{
++      struct file *file = iocb->ki_filp;
++      struct inode *inode = file_inode(file);
++      struct zonefs_inode_info *zi = ZONEFS_I(inode);
++      loff_t count;
++
++      if (IS_SWAPFILE(inode))
++              return -ETXTBSY;
++
++      if (!iov_iter_count(from))
++              return 0;
++
++      if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT))
++              return -EINVAL;
++
++      if (iocb->ki_flags & IOCB_APPEND) {
++              if (zi->i_ztype != ZONEFS_ZTYPE_SEQ)
++                      return -EINVAL;
++              mutex_lock(&zi->i_truncate_mutex);
++              iocb->ki_pos = zi->i_wpoffset;
++              mutex_unlock(&zi->i_truncate_mutex);
++      }
++
++      count = zonefs_write_check_limits(file, iocb->ki_pos,
++                                        iov_iter_count(from));
++      if (count < 0)
++              return count;
++
++      iov_iter_truncate(from, count);
++      return iov_iter_count(from);
++}
++
++/*
++ * Handle direct writes. For sequential zone files, this is the only possible
++ * write path. For these files, check that the user is issuing writes
++ * sequentially from the end of the file. This code assumes that the block layer
++ * delivers write requests to the device in sequential order. This is always the
++ * case if a block IO scheduler implementing the ELEVATOR_F_ZBD_SEQ_WRITE
++ * elevator feature is being used (e.g. mq-deadline). The block layer always
++ * automatically select such an elevator for zoned block devices during the
++ * device initialization.
++ */
++static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from)
++{
++      struct inode *inode = file_inode(iocb->ki_filp);
++      struct zonefs_inode_info *zi = ZONEFS_I(inode);
++      struct super_block *sb = inode->i_sb;
++      bool sync = is_sync_kiocb(iocb);
++      bool append = false;
++      ssize_t ret, count;
++
++      /*
++       * For async direct IOs to sequential zone files, refuse IOCB_NOWAIT
++       * as this can cause write reordering (e.g. the first aio gets EAGAIN
++       * on the inode lock but the second goes through but is now unaligned).
++       */
++      if (zi->i_ztype == ZONEFS_ZTYPE_SEQ && !sync &&
++          (iocb->ki_flags & IOCB_NOWAIT))
++              return -EOPNOTSUPP;
++
++      if (iocb->ki_flags & IOCB_NOWAIT) {
++              if (!inode_trylock(inode))
++                      return -EAGAIN;
++      } else {
++              inode_lock(inode);
++      }
++
++      count = zonefs_write_checks(iocb, from);
++      if (count <= 0) {
++              ret = count;
++              goto inode_unlock;
++      }
++
++      if ((iocb->ki_pos | count) & (sb->s_blocksize - 1)) {
++              ret = -EINVAL;
++              goto inode_unlock;
++      }
++
++      /* Enforce sequential writes (append only) in sequential zones */
++      if (zi->i_ztype == ZONEFS_ZTYPE_SEQ) {
++              mutex_lock(&zi->i_truncate_mutex);
++              if (iocb->ki_pos != zi->i_wpoffset) {
++                      mutex_unlock(&zi->i_truncate_mutex);
++                      ret = -EINVAL;
++                      goto inode_unlock;
++              }
++              mutex_unlock(&zi->i_truncate_mutex);
++              append = sync;
++      }
++
++      if (append)
++              ret = zonefs_file_dio_append(iocb, from);
++      else
++              ret = iomap_dio_rw(iocb, from, &zonefs_write_iomap_ops,
++                                 &zonefs_write_dio_ops, 0, NULL, 0);
++      if (zi->i_ztype == ZONEFS_ZTYPE_SEQ &&
++          (ret > 0 || ret == -EIOCBQUEUED)) {
++              if (ret > 0)
++                      count = ret;
++
++              /*
++               * Update the zone write pointer offset assuming the write
++               * operation succeeded. If it did not, the error recovery path
++               * will correct it. Also do active seq file accounting.
++               */
++              mutex_lock(&zi->i_truncate_mutex);
++              zi->i_wpoffset += count;
++              zonefs_account_active(inode);
++              mutex_unlock(&zi->i_truncate_mutex);
++      }
++
++inode_unlock:
++      inode_unlock(inode);
++
++      return ret;
++}
++
++static ssize_t zonefs_file_buffered_write(struct kiocb *iocb,
++                                        struct iov_iter *from)
++{
++      struct inode *inode = file_inode(iocb->ki_filp);
++      struct zonefs_inode_info *zi = ZONEFS_I(inode);
++      ssize_t ret;
++
++      /*
++       * Direct IO writes are mandatory for sequential zone files so that the
++       * write IO issuing order is preserved.
++       */
++      if (zi->i_ztype != ZONEFS_ZTYPE_CNV)
++              return -EIO;
++
++      if (iocb->ki_flags & IOCB_NOWAIT) {
++              if (!inode_trylock(inode))
++                      return -EAGAIN;
++      } else {
++              inode_lock(inode);
++      }
++
++      ret = zonefs_write_checks(iocb, from);
++      if (ret <= 0)
++              goto inode_unlock;
++
++      ret = iomap_file_buffered_write(iocb, from, &zonefs_write_iomap_ops);
++      if (ret > 0)
++              iocb->ki_pos += ret;
++      else if (ret == -EIO)
++              zonefs_io_error(inode, true);
++
++inode_unlock:
++      inode_unlock(inode);
++      if (ret > 0)
++              ret = generic_write_sync(iocb, ret);
++
++      return ret;
++}
++
++static ssize_t zonefs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
++{
++      struct inode *inode = file_inode(iocb->ki_filp);
++
++      if (unlikely(IS_IMMUTABLE(inode)))
++              return -EPERM;
++
++      if (sb_rdonly(inode->i_sb))
++              return -EROFS;
++
++      /* Write operations beyond the zone size are not allowed */
++      if (iocb->ki_pos >= ZONEFS_I(inode)->i_max_size)
++              return -EFBIG;
++
++      if (iocb->ki_flags & IOCB_DIRECT) {
++              ssize_t ret = zonefs_file_dio_write(iocb, from);
++
++              if (ret != -ENOTBLK)
++                      return ret;
++      }
++
++      return zonefs_file_buffered_write(iocb, from);
++}
++
++static int zonefs_file_read_dio_end_io(struct kiocb *iocb, ssize_t size,
++                                     int error, unsigned int flags)
++{
++      if (error) {
++              zonefs_io_error(file_inode(iocb->ki_filp), false);
++              return error;
++      }
++
++      return 0;
++}
++
++static const struct iomap_dio_ops zonefs_read_dio_ops = {
++      .end_io                 = zonefs_file_read_dio_end_io,
++};
++
++static ssize_t zonefs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
++{
++      struct inode *inode = file_inode(iocb->ki_filp);
++      struct zonefs_inode_info *zi = ZONEFS_I(inode);
++      struct super_block *sb = inode->i_sb;
++      loff_t isize;
++      ssize_t ret;
++
++      /* Offline zones cannot be read */
++      if (unlikely(IS_IMMUTABLE(inode) && !(inode->i_mode & 0777)))
++              return -EPERM;
++
++      if (iocb->ki_pos >= zi->i_max_size)
++              return 0;
++
++      if (iocb->ki_flags & IOCB_NOWAIT) {
++              if (!inode_trylock_shared(inode))
++                      return -EAGAIN;
++      } else {
++              inode_lock_shared(inode);
++      }
++
++      /* Limit read operations to written data */
++      mutex_lock(&zi->i_truncate_mutex);
++      isize = i_size_read(inode);
++      if (iocb->ki_pos >= isize) {
++              mutex_unlock(&zi->i_truncate_mutex);
++              ret = 0;
++              goto inode_unlock;
++      }
++      iov_iter_truncate(to, isize - iocb->ki_pos);
++      mutex_unlock(&zi->i_truncate_mutex);
++
++      if (iocb->ki_flags & IOCB_DIRECT) {
++              size_t count = iov_iter_count(to);
++
++              if ((iocb->ki_pos | count) & (sb->s_blocksize - 1)) {
++                      ret = -EINVAL;
++                      goto inode_unlock;
++              }
++              file_accessed(iocb->ki_filp);
++              ret = iomap_dio_rw(iocb, to, &zonefs_read_iomap_ops,
++                                 &zonefs_read_dio_ops, 0, NULL, 0);
++      } else {
++              ret = generic_file_read_iter(iocb, to);
++              if (ret == -EIO)
++                      zonefs_io_error(inode, false);
++      }
++
++inode_unlock:
++      inode_unlock_shared(inode);
++
++      return ret;
++}
++
++/*
++ * Write open accounting is done only for sequential files.
++ */
++static inline bool zonefs_seq_file_need_wro(struct inode *inode,
++                                          struct file *file)
++{
++      struct zonefs_inode_info *zi = ZONEFS_I(inode);
++
++      if (zi->i_ztype != ZONEFS_ZTYPE_SEQ)
++              return false;
++
++      if (!(file->f_mode & FMODE_WRITE))
++              return false;
++
++      return true;
++}
++
++static int zonefs_seq_file_write_open(struct inode *inode)
++{
++      struct zonefs_inode_info *zi = ZONEFS_I(inode);
++      int ret = 0;
++
++      mutex_lock(&zi->i_truncate_mutex);
++
++      if (!zi->i_wr_refcnt) {
++              struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb);
++              unsigned int wro = atomic_inc_return(&sbi->s_wro_seq_files);
++
++              if (sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN) {
++
++                      if (sbi->s_max_wro_seq_files
++                          && wro > sbi->s_max_wro_seq_files) {
++                              atomic_dec(&sbi->s_wro_seq_files);
++                              ret = -EBUSY;
++                              goto unlock;
++                      }
++
++                      if (i_size_read(inode) < zi->i_max_size) {
++                              ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_OPEN);
++                              if (ret) {
++                                      atomic_dec(&sbi->s_wro_seq_files);
++                                      goto unlock;
++                              }
++                              zi->i_flags |= ZONEFS_ZONE_OPEN;
++                              zonefs_account_active(inode);
++                      }
++              }
++      }
++
++      zi->i_wr_refcnt++;
++
++unlock:
++      mutex_unlock(&zi->i_truncate_mutex);
++
++      return ret;
++}
++
++static int zonefs_file_open(struct inode *inode, struct file *file)
++{
++      int ret;
++
++      ret = generic_file_open(inode, file);
++      if (ret)
++              return ret;
++
++      if (zonefs_seq_file_need_wro(inode, file))
++              return zonefs_seq_file_write_open(inode);
++
++      return 0;
++}
++
++static void zonefs_seq_file_write_close(struct inode *inode)
++{
++      struct zonefs_inode_info *zi = ZONEFS_I(inode);
++      struct super_block *sb = inode->i_sb;
++      struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
++      int ret = 0;
++
++      mutex_lock(&zi->i_truncate_mutex);
++
++      zi->i_wr_refcnt--;
++      if (zi->i_wr_refcnt)
++              goto unlock;
++
++      /*
++       * The file zone may not be open anymore (e.g. the file was truncated to
++       * its maximum size or it was fully written). For this case, we only
++       * need to decrement the write open count.
++       */
++      if (zi->i_flags & ZONEFS_ZONE_OPEN) {
++              ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_CLOSE);
++              if (ret) {
++                      __zonefs_io_error(inode, false);
++                      /*
++                       * Leaving zones explicitly open may lead to a state
++                       * where most zones cannot be written (zone resources
++                       * exhausted). So take preventive action by remounting
++                       * read-only.
++                       */
++                      if (zi->i_flags & ZONEFS_ZONE_OPEN &&
++                          !(sb->s_flags & SB_RDONLY)) {
++                              zonefs_warn(sb,
++                                      "closing zone at %llu failed %d\n",
++                                      zi->i_zsector, ret);
++                              zonefs_warn(sb,
++                                      "remounting filesystem read-only\n");
++                              sb->s_flags |= SB_RDONLY;
++                      }
++                      goto unlock;
++              }
++
++              zi->i_flags &= ~ZONEFS_ZONE_OPEN;
++              zonefs_account_active(inode);
++      }
++
++      atomic_dec(&sbi->s_wro_seq_files);
++
++unlock:
++      mutex_unlock(&zi->i_truncate_mutex);
++}
++
++static int zonefs_file_release(struct inode *inode, struct file *file)
++{
++      /*
++       * If we explicitly open a zone we must close it again as well, but the
++       * zone management operation can fail (either due to an IO error or as
++       * the zone has gone offline or read-only). Make sure we don't fail the
++       * close(2) for user-space.
++       */
++      if (zonefs_seq_file_need_wro(inode, file))
++              zonefs_seq_file_write_close(inode);
++
++      return 0;
++}
++
++const struct file_operations zonefs_file_operations = {
++      .open           = zonefs_file_open,
++      .release        = zonefs_file_release,
++      .fsync          = zonefs_file_fsync,
++      .mmap           = zonefs_file_mmap,
++      .llseek         = zonefs_file_llseek,
++      .read_iter      = zonefs_file_read_iter,
++      .write_iter     = zonefs_file_write_iter,
++      .splice_read    = generic_file_splice_read,
++      .splice_write   = iter_file_splice_write,
++      .iopoll         = iocb_bio_iopoll,
++};
+diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
+index a9c5c3f720adf..e808276b88018 100644
+--- a/fs/zonefs/super.c
++++ b/fs/zonefs/super.c
+@@ -30,7 +30,7 @@
+ /*
+  * Manage the active zone count. Called with zi->i_truncate_mutex held.
+  */
+-static void zonefs_account_active(struct inode *inode)
++void zonefs_account_active(struct inode *inode)
+ {
+       struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb);
+       struct zonefs_inode_info *zi = ZONEFS_I(inode);
+@@ -68,7 +68,7 @@ static void zonefs_account_active(struct inode *inode)
+       }
+ }
+-static inline int zonefs_zone_mgmt(struct inode *inode, enum req_op op)
++int zonefs_zone_mgmt(struct inode *inode, enum req_op op)
+ {
+       struct zonefs_inode_info *zi = ZONEFS_I(inode);
+       int ret;
+@@ -99,7 +99,7 @@ static inline int zonefs_zone_mgmt(struct inode *inode, enum req_op op)
+       return 0;
+ }
+-static inline void zonefs_i_size_write(struct inode *inode, loff_t isize)
++void zonefs_i_size_write(struct inode *inode, loff_t isize)
+ {
+       struct zonefs_inode_info *zi = ZONEFS_I(inode);
+@@ -117,167 +117,7 @@ static inline void zonefs_i_size_write(struct inode *inode, loff_t isize)
+       }
+ }
+-static int zonefs_read_iomap_begin(struct inode *inode, loff_t offset,
+-                                 loff_t length, unsigned int flags,
+-                                 struct iomap *iomap, struct iomap *srcmap)
+-{
+-      struct zonefs_inode_info *zi = ZONEFS_I(inode);
+-      struct super_block *sb = inode->i_sb;
+-      loff_t isize;
+-
+-      /*
+-       * All blocks are always mapped below EOF. If reading past EOF,
+-       * act as if there is a hole up to the file maximum size.
+-       */
+-      mutex_lock(&zi->i_truncate_mutex);
+-      iomap->bdev = inode->i_sb->s_bdev;
+-      iomap->offset = ALIGN_DOWN(offset, sb->s_blocksize);
+-      isize = i_size_read(inode);
+-      if (iomap->offset >= isize) {
+-              iomap->type = IOMAP_HOLE;
+-              iomap->addr = IOMAP_NULL_ADDR;
+-              iomap->length = length;
+-      } else {
+-              iomap->type = IOMAP_MAPPED;
+-              iomap->addr = (zi->i_zsector << SECTOR_SHIFT) + iomap->offset;
+-              iomap->length = isize - iomap->offset;
+-      }
+-      mutex_unlock(&zi->i_truncate_mutex);
+-
+-      trace_zonefs_iomap_begin(inode, iomap);
+-
+-      return 0;
+-}
+-
+-static const struct iomap_ops zonefs_read_iomap_ops = {
+-      .iomap_begin    = zonefs_read_iomap_begin,
+-};
+-
+-static int zonefs_write_iomap_begin(struct inode *inode, loff_t offset,
+-                                  loff_t length, unsigned int flags,
+-                                  struct iomap *iomap, struct iomap *srcmap)
+-{
+-      struct zonefs_inode_info *zi = ZONEFS_I(inode);
+-      struct super_block *sb = inode->i_sb;
+-      loff_t isize;
+-
+-      /* All write I/Os should always be within the file maximum size */
+-      if (WARN_ON_ONCE(offset + length > zi->i_max_size))
+-              return -EIO;
+-
+-      /*
+-       * Sequential zones can only accept direct writes. This is already
+-       * checked when writes are issued, so warn if we see a page writeback
+-       * operation.
+-       */
+-      if (WARN_ON_ONCE(zi->i_ztype == ZONEFS_ZTYPE_SEQ &&
+-                       !(flags & IOMAP_DIRECT)))
+-              return -EIO;
+-
+-      /*
+-       * For conventional zones, all blocks are always mapped. For sequential
+-       * zones, all blocks are always mapped below the inode size (zone
+-       * write pointer) and unwritten beyond.
+-       */
+-      mutex_lock(&zi->i_truncate_mutex);
+-      iomap->bdev = inode->i_sb->s_bdev;
+-      iomap->offset = ALIGN_DOWN(offset, sb->s_blocksize);
+-      iomap->addr = (zi->i_zsector << SECTOR_SHIFT) + iomap->offset;
+-      isize = i_size_read(inode);
+-      if (iomap->offset >= isize) {
+-              iomap->type = IOMAP_UNWRITTEN;
+-              iomap->length = zi->i_max_size - iomap->offset;
+-      } else {
+-              iomap->type = IOMAP_MAPPED;
+-              iomap->length = isize - iomap->offset;
+-      }
+-      mutex_unlock(&zi->i_truncate_mutex);
+-
+-      trace_zonefs_iomap_begin(inode, iomap);
+-
+-      return 0;
+-}
+-
+-static const struct iomap_ops zonefs_write_iomap_ops = {
+-      .iomap_begin    = zonefs_write_iomap_begin,
+-};
+-
+-static int zonefs_read_folio(struct file *unused, struct folio *folio)
+-{
+-      return iomap_read_folio(folio, &zonefs_read_iomap_ops);
+-}
+-
+-static void zonefs_readahead(struct readahead_control *rac)
+-{
+-      iomap_readahead(rac, &zonefs_read_iomap_ops);
+-}
+-
+-/*
+- * Map blocks for page writeback. This is used only on conventional zone files,
+- * which implies that the page range can only be within the fixed inode size.
+- */
+-static int zonefs_write_map_blocks(struct iomap_writepage_ctx *wpc,
+-                                 struct inode *inode, loff_t offset)
+-{
+-      struct zonefs_inode_info *zi = ZONEFS_I(inode);
+-
+-      if (WARN_ON_ONCE(zi->i_ztype != ZONEFS_ZTYPE_CNV))
+-              return -EIO;
+-      if (WARN_ON_ONCE(offset >= i_size_read(inode)))
+-              return -EIO;
+-
+-      /* If the mapping is already OK, nothing needs to be done */
+-      if (offset >= wpc->iomap.offset &&
+-          offset < wpc->iomap.offset + wpc->iomap.length)
+-              return 0;
+-
+-      return zonefs_write_iomap_begin(inode, offset, zi->i_max_size - offset,
+-                                      IOMAP_WRITE, &wpc->iomap, NULL);
+-}
+-
+-static const struct iomap_writeback_ops zonefs_writeback_ops = {
+-      .map_blocks             = zonefs_write_map_blocks,
+-};
+-
+-static int zonefs_writepages(struct address_space *mapping,
+-                           struct writeback_control *wbc)
+-{
+-      struct iomap_writepage_ctx wpc = { };
+-
+-      return iomap_writepages(mapping, wbc, &wpc, &zonefs_writeback_ops);
+-}
+-
+-static int zonefs_swap_activate(struct swap_info_struct *sis,
+-                              struct file *swap_file, sector_t *span)
+-{
+-      struct inode *inode = file_inode(swap_file);
+-      struct zonefs_inode_info *zi = ZONEFS_I(inode);
+-
+-      if (zi->i_ztype != ZONEFS_ZTYPE_CNV) {
+-              zonefs_err(inode->i_sb,
+-                         "swap file: not a conventional zone file\n");
+-              return -EINVAL;
+-      }
+-
+-      return iomap_swapfile_activate(sis, swap_file, span,
+-                                     &zonefs_read_iomap_ops);
+-}
+-
+-static const struct address_space_operations zonefs_file_aops = {
+-      .read_folio             = zonefs_read_folio,
+-      .readahead              = zonefs_readahead,
+-      .writepages             = zonefs_writepages,
+-      .dirty_folio            = filemap_dirty_folio,
+-      .release_folio          = iomap_release_folio,
+-      .invalidate_folio       = iomap_invalidate_folio,
+-      .migrate_folio          = filemap_migrate_folio,
+-      .is_partially_uptodate  = iomap_is_partially_uptodate,
+-      .error_remove_page      = generic_error_remove_page,
+-      .direct_IO              = noop_direct_IO,
+-      .swap_activate          = zonefs_swap_activate,
+-};
+-
+-static void zonefs_update_stats(struct inode *inode, loff_t new_isize)
++void zonefs_update_stats(struct inode *inode, loff_t new_isize)
+ {
+       struct super_block *sb = inode->i_sb;
+       struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
+@@ -487,7 +327,7 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx,
+  * eventually correct the file size and zonefs inode write pointer offset
+  * (which can be out of sync with the drive due to partial write failures).
+  */
+-static void __zonefs_io_error(struct inode *inode, bool write)
++void __zonefs_io_error(struct inode *inode, bool write)
+ {
+       struct zonefs_inode_info *zi = ZONEFS_I(inode);
+       struct super_block *sb = inode->i_sb;
+@@ -526,749 +366,6 @@ static void __zonefs_io_error(struct inode *inode, bool write)
+       memalloc_noio_restore(noio_flag);
+ }
+-static void zonefs_io_error(struct inode *inode, bool write)
+-{
+-      struct zonefs_inode_info *zi = ZONEFS_I(inode);
+-
+-      mutex_lock(&zi->i_truncate_mutex);
+-      __zonefs_io_error(inode, write);
+-      mutex_unlock(&zi->i_truncate_mutex);
+-}
+-
+-static int zonefs_file_truncate(struct inode *inode, loff_t isize)
+-{
+-      struct zonefs_inode_info *zi = ZONEFS_I(inode);
+-      loff_t old_isize;
+-      enum req_op op;
+-      int ret = 0;
+-
+-      /*
+-       * Only sequential zone files can be truncated and truncation is allowed
+-       * only down to a 0 size, which is equivalent to a zone reset, and to
+-       * the maximum file size, which is equivalent to a zone finish.
+-       */
+-      if (zi->i_ztype != ZONEFS_ZTYPE_SEQ)
+-              return -EPERM;
+-
+-      if (!isize)
+-              op = REQ_OP_ZONE_RESET;
+-      else if (isize == zi->i_max_size)
+-              op = REQ_OP_ZONE_FINISH;
+-      else
+-              return -EPERM;
+-
+-      inode_dio_wait(inode);
+-
+-      /* Serialize against page faults */
+-      filemap_invalidate_lock(inode->i_mapping);
+-
+-      /* Serialize against zonefs_iomap_begin() */
+-      mutex_lock(&zi->i_truncate_mutex);
+-
+-      old_isize = i_size_read(inode);
+-      if (isize == old_isize)
+-              goto unlock;
+-
+-      ret = zonefs_zone_mgmt(inode, op);
+-      if (ret)
+-              goto unlock;
+-
+-      /*
+-       * If the mount option ZONEFS_MNTOPT_EXPLICIT_OPEN is set,
+-       * take care of open zones.
+-       */
+-      if (zi->i_flags & ZONEFS_ZONE_OPEN) {
+-              /*
+-               * Truncating a zone to EMPTY or FULL is the equivalent of
+-               * closing the zone. For a truncation to 0, we need to
+-               * re-open the zone to ensure new writes can be processed.
+-               * For a truncation to the maximum file size, the zone is
+-               * closed and writes cannot be accepted anymore, so clear
+-               * the open flag.
+-               */
+-              if (!isize)
+-                      ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_OPEN);
+-              else
+-                      zi->i_flags &= ~ZONEFS_ZONE_OPEN;
+-      }
+-
+-      zonefs_update_stats(inode, isize);
+-      truncate_setsize(inode, isize);
+-      zi->i_wpoffset = isize;
+-      zonefs_account_active(inode);
+-
+-unlock:
+-      mutex_unlock(&zi->i_truncate_mutex);
+-      filemap_invalidate_unlock(inode->i_mapping);
+-
+-      return ret;
+-}
+-
+-static int zonefs_inode_setattr(struct user_namespace *mnt_userns,
+-                              struct dentry *dentry, struct iattr *iattr)
+-{
+-      struct inode *inode = d_inode(dentry);
+-      int ret;
+-
+-      if (unlikely(IS_IMMUTABLE(inode)))
+-              return -EPERM;
+-
+-      ret = setattr_prepare(&init_user_ns, dentry, iattr);
+-      if (ret)
+-              return ret;
+-
+-      /*
+-       * Since files and directories cannot be created nor deleted, do not
+-       * allow setting any write attributes on the sub-directories grouping
+-       * files by zone type.
+-       */
+-      if ((iattr->ia_valid & ATTR_MODE) && S_ISDIR(inode->i_mode) &&
+-          (iattr->ia_mode & 0222))
+-              return -EPERM;
+-
+-      if (((iattr->ia_valid & ATTR_UID) &&
+-           !uid_eq(iattr->ia_uid, inode->i_uid)) ||
+-          ((iattr->ia_valid & ATTR_GID) &&
+-           !gid_eq(iattr->ia_gid, inode->i_gid))) {
+-              ret = dquot_transfer(mnt_userns, inode, iattr);
+-              if (ret)
+-                      return ret;
+-      }
+-
+-      if (iattr->ia_valid & ATTR_SIZE) {
+-              ret = zonefs_file_truncate(inode, iattr->ia_size);
+-              if (ret)
+-                      return ret;
+-      }
+-
+-      setattr_copy(&init_user_ns, inode, iattr);
+-
+-      return 0;
+-}
+-
+-static const struct inode_operations zonefs_file_inode_operations = {
+-      .setattr        = zonefs_inode_setattr,
+-};
+-
+-static int zonefs_file_fsync(struct file *file, loff_t start, loff_t end,
+-                           int datasync)
+-{
+-      struct inode *inode = file_inode(file);
+-      int ret = 0;
+-
+-      if (unlikely(IS_IMMUTABLE(inode)))
+-              return -EPERM;
+-
+-      /*
+-       * Since only direct writes are allowed in sequential files, page cache
+-       * flush is needed only for conventional zone files.
+-       */
+-      if (ZONEFS_I(inode)->i_ztype == ZONEFS_ZTYPE_CNV)
+-              ret = file_write_and_wait_range(file, start, end);
+-      if (!ret)
+-              ret = blkdev_issue_flush(inode->i_sb->s_bdev);
+-
+-      if (ret)
+-              zonefs_io_error(inode, true);
+-
+-      return ret;
+-}
+-
+-static vm_fault_t zonefs_filemap_page_mkwrite(struct vm_fault *vmf)
+-{
+-      struct inode *inode = file_inode(vmf->vma->vm_file);
+-      struct zonefs_inode_info *zi = ZONEFS_I(inode);
+-      vm_fault_t ret;
+-
+-      if (unlikely(IS_IMMUTABLE(inode)))
+-              return VM_FAULT_SIGBUS;
+-
+-      /*
+-       * Sanity check: only conventional zone files can have shared
+-       * writeable mappings.
+-       */
+-      if (WARN_ON_ONCE(zi->i_ztype != ZONEFS_ZTYPE_CNV))
+-              return VM_FAULT_NOPAGE;
+-
+-      sb_start_pagefault(inode->i_sb);
+-      file_update_time(vmf->vma->vm_file);
+-
+-      /* Serialize against truncates */
+-      filemap_invalidate_lock_shared(inode->i_mapping);
+-      ret = iomap_page_mkwrite(vmf, &zonefs_write_iomap_ops);
+-      filemap_invalidate_unlock_shared(inode->i_mapping);
+-
+-      sb_end_pagefault(inode->i_sb);
+-      return ret;
+-}
+-
+-static const struct vm_operations_struct zonefs_file_vm_ops = {
+-      .fault          = filemap_fault,
+-      .map_pages      = filemap_map_pages,
+-      .page_mkwrite   = zonefs_filemap_page_mkwrite,
+-};
+-
+-static int zonefs_file_mmap(struct file *file, struct vm_area_struct *vma)
+-{
+-      /*
+-       * Conventional zones accept random writes, so their files can support
+-       * shared writable mappings. For sequential zone files, only read
+-       * mappings are possible since there are no guarantees for write
+-       * ordering between msync() and page cache writeback.
+-       */
+-      if (ZONEFS_I(file_inode(file))->i_ztype == ZONEFS_ZTYPE_SEQ &&
+-          (vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
+-              return -EINVAL;
+-
+-      file_accessed(file);
+-      vma->vm_ops = &zonefs_file_vm_ops;
+-
+-      return 0;
+-}
+-
+-static loff_t zonefs_file_llseek(struct file *file, loff_t offset, int whence)
+-{
+-      loff_t isize = i_size_read(file_inode(file));
+-
+-      /*
+-       * Seeks are limited to below the zone size for conventional zones
+-       * and below the zone write pointer for sequential zones. In both
+-       * cases, this limit is the inode size.
+-       */
+-      return generic_file_llseek_size(file, offset, whence, isize, isize);
+-}
+-
+-static int zonefs_file_write_dio_end_io(struct kiocb *iocb, ssize_t size,
+-                                      int error, unsigned int flags)
+-{
+-      struct inode *inode = file_inode(iocb->ki_filp);
+-      struct zonefs_inode_info *zi = ZONEFS_I(inode);
+-
+-      if (error) {
+-              zonefs_io_error(inode, true);
+-              return error;
+-      }
+-
+-      if (size && zi->i_ztype != ZONEFS_ZTYPE_CNV) {
+-              /*
+-               * Note that we may be seeing completions out of order,
+-               * but that is not a problem since a write completed
+-               * successfully necessarily means that all preceding writes
+-               * were also successful. So we can safely increase the inode
+-               * size to the write end location.
+-               */
+-              mutex_lock(&zi->i_truncate_mutex);
+-              if (i_size_read(inode) < iocb->ki_pos + size) {
+-                      zonefs_update_stats(inode, iocb->ki_pos + size);
+-                      zonefs_i_size_write(inode, iocb->ki_pos + size);
+-              }
+-              mutex_unlock(&zi->i_truncate_mutex);
+-      }
+-
+-      return 0;
+-}
+-
+-static const struct iomap_dio_ops zonefs_write_dio_ops = {
+-      .end_io                 = zonefs_file_write_dio_end_io,
+-};
+-
+-static ssize_t zonefs_file_dio_append(struct kiocb *iocb, struct iov_iter *from)
+-{
+-      struct inode *inode = file_inode(iocb->ki_filp);
+-      struct zonefs_inode_info *zi = ZONEFS_I(inode);
+-      struct block_device *bdev = inode->i_sb->s_bdev;
+-      unsigned int max = bdev_max_zone_append_sectors(bdev);
+-      struct bio *bio;
+-      ssize_t size;
+-      int nr_pages;
+-      ssize_t ret;
+-
+-      max = ALIGN_DOWN(max << SECTOR_SHIFT, inode->i_sb->s_blocksize);
+-      iov_iter_truncate(from, max);
+-
+-      nr_pages = iov_iter_npages(from, BIO_MAX_VECS);
+-      if (!nr_pages)
+-              return 0;
+-
+-      bio = bio_alloc(bdev, nr_pages,
+-                      REQ_OP_ZONE_APPEND | REQ_SYNC | REQ_IDLE, GFP_NOFS);
+-      bio->bi_iter.bi_sector = zi->i_zsector;
+-      bio->bi_ioprio = iocb->ki_ioprio;
+-      if (iocb_is_dsync(iocb))
+-              bio->bi_opf |= REQ_FUA;
+-
+-      ret = bio_iov_iter_get_pages(bio, from);
+-      if (unlikely(ret))
+-              goto out_release;
+-
+-      size = bio->bi_iter.bi_size;
+-      task_io_account_write(size);
+-
+-      if (iocb->ki_flags & IOCB_HIPRI)
+-              bio_set_polled(bio, iocb);
+-
+-      ret = submit_bio_wait(bio);
+-
+-      /*
+-       * If the file zone was written underneath the file system, the zone
+-       * write pointer may not be where we expect it to be, but the zone
+-       * append write can still succeed. So check manually that we wrote where
+-       * we intended to, that is, at zi->i_wpoffset.
+-       */
+-      if (!ret) {
+-              sector_t wpsector =
+-                      zi->i_zsector + (zi->i_wpoffset >> SECTOR_SHIFT);
+-
+-              if (bio->bi_iter.bi_sector != wpsector) {
+-                      zonefs_warn(inode->i_sb,
+-                              "Corrupted write pointer %llu for zone at %llu\n",
+-                              wpsector, zi->i_zsector);
+-                      ret = -EIO;
+-              }
+-      }
+-
+-      zonefs_file_write_dio_end_io(iocb, size, ret, 0);
+-      trace_zonefs_file_dio_append(inode, size, ret);
+-
+-out_release:
+-      bio_release_pages(bio, false);
+-      bio_put(bio);
+-
+-      if (ret >= 0) {
+-              iocb->ki_pos += size;
+-              return size;
+-      }
+-
+-      return ret;
+-}
+-
+-/*
+- * Do not exceed the LFS limits nor the file zone size. If pos is under the
+- * limit it becomes a short access. If it exceeds the limit, return -EFBIG.
+- */
+-static loff_t zonefs_write_check_limits(struct file *file, loff_t pos,
+-                                      loff_t count)
+-{
+-      struct inode *inode = file_inode(file);
+-      struct zonefs_inode_info *zi = ZONEFS_I(inode);
+-      loff_t limit = rlimit(RLIMIT_FSIZE);
+-      loff_t max_size = zi->i_max_size;
+-
+-      if (limit != RLIM_INFINITY) {
+-              if (pos >= limit) {
+-                      send_sig(SIGXFSZ, current, 0);
+-                      return -EFBIG;
+-              }
+-              count = min(count, limit - pos);
+-      }
+-
+-      if (!(file->f_flags & O_LARGEFILE))
+-              max_size = min_t(loff_t, MAX_NON_LFS, max_size);
+-
+-      if (unlikely(pos >= max_size))
+-              return -EFBIG;
+-
+-      return min(count, max_size - pos);
+-}
+-
+-static ssize_t zonefs_write_checks(struct kiocb *iocb, struct iov_iter *from)
+-{
+-      struct file *file = iocb->ki_filp;
+-      struct inode *inode = file_inode(file);
+-      struct zonefs_inode_info *zi = ZONEFS_I(inode);
+-      loff_t count;
+-
+-      if (IS_SWAPFILE(inode))
+-              return -ETXTBSY;
+-
+-      if (!iov_iter_count(from))
+-              return 0;
+-
+-      if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT))
+-              return -EINVAL;
+-
+-      if (iocb->ki_flags & IOCB_APPEND) {
+-              if (zi->i_ztype != ZONEFS_ZTYPE_SEQ)
+-                      return -EINVAL;
+-              mutex_lock(&zi->i_truncate_mutex);
+-              iocb->ki_pos = zi->i_wpoffset;
+-              mutex_unlock(&zi->i_truncate_mutex);
+-      }
+-
+-      count = zonefs_write_check_limits(file, iocb->ki_pos,
+-                                        iov_iter_count(from));
+-      if (count < 0)
+-              return count;
+-
+-      iov_iter_truncate(from, count);
+-      return iov_iter_count(from);
+-}
+-
+-/*
+- * Handle direct writes. For sequential zone files, this is the only possible
+- * write path. For these files, check that the user is issuing writes
+- * sequentially from the end of the file. This code assumes that the block layer
+- * delivers write requests to the device in sequential order. This is always the
+- * case if a block IO scheduler implementing the ELEVATOR_F_ZBD_SEQ_WRITE
+- * elevator feature is being used (e.g. mq-deadline). The block layer always
+- * automatically selects such an elevator for zoned block devices during the
+- * device initialization.
+- */
+-static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from)
+-{
+-      struct inode *inode = file_inode(iocb->ki_filp);
+-      struct zonefs_inode_info *zi = ZONEFS_I(inode);
+-      struct super_block *sb = inode->i_sb;
+-      bool sync = is_sync_kiocb(iocb);
+-      bool append = false;
+-      ssize_t ret, count;
+-
+-      /*
+-       * For async direct IOs to sequential zone files, refuse IOCB_NOWAIT
+-       * as this can cause write reordering (e.g. the first aio gets EAGAIN
+-       * on the inode lock but the second goes through and is now unaligned).
+-       */
+-      if (zi->i_ztype == ZONEFS_ZTYPE_SEQ && !sync &&
+-          (iocb->ki_flags & IOCB_NOWAIT))
+-              return -EOPNOTSUPP;
+-
+-      if (iocb->ki_flags & IOCB_NOWAIT) {
+-              if (!inode_trylock(inode))
+-                      return -EAGAIN;
+-      } else {
+-              inode_lock(inode);
+-      }
+-
+-      count = zonefs_write_checks(iocb, from);
+-      if (count <= 0) {
+-              ret = count;
+-              goto inode_unlock;
+-      }
+-
+-      if ((iocb->ki_pos | count) & (sb->s_blocksize - 1)) {
+-              ret = -EINVAL;
+-              goto inode_unlock;
+-      }
+-
+-      /* Enforce sequential writes (append only) in sequential zones */
+-      if (zi->i_ztype == ZONEFS_ZTYPE_SEQ) {
+-              mutex_lock(&zi->i_truncate_mutex);
+-              if (iocb->ki_pos != zi->i_wpoffset) {
+-                      mutex_unlock(&zi->i_truncate_mutex);
+-                      ret = -EINVAL;
+-                      goto inode_unlock;
+-              }
+-              mutex_unlock(&zi->i_truncate_mutex);
+-              append = sync;
+-      }
+-
+-      if (append)
+-              ret = zonefs_file_dio_append(iocb, from);
+-      else
+-              ret = iomap_dio_rw(iocb, from, &zonefs_write_iomap_ops,
+-                                 &zonefs_write_dio_ops, 0, NULL, 0);
+-      if (zi->i_ztype == ZONEFS_ZTYPE_SEQ &&
+-          (ret > 0 || ret == -EIOCBQUEUED)) {
+-              if (ret > 0)
+-                      count = ret;
+-
+-              /*
+-               * Update the zone write pointer offset assuming the write
+-               * operation succeeded. If it did not, the error recovery path
+-               * will correct it. Also do active seq file accounting.
+-               */
+-              mutex_lock(&zi->i_truncate_mutex);
+-              zi->i_wpoffset += count;
+-              zonefs_account_active(inode);
+-              mutex_unlock(&zi->i_truncate_mutex);
+-      }
+-
+-inode_unlock:
+-      inode_unlock(inode);
+-
+-      return ret;
+-}
+-
+-static ssize_t zonefs_file_buffered_write(struct kiocb *iocb,
+-                                        struct iov_iter *from)
+-{
+-      struct inode *inode = file_inode(iocb->ki_filp);
+-      struct zonefs_inode_info *zi = ZONEFS_I(inode);
+-      ssize_t ret;
+-
+-      /*
+-       * Direct IO writes are mandatory for sequential zone files so that the
+-       * write IO issuing order is preserved.
+-       */
+-      if (zi->i_ztype != ZONEFS_ZTYPE_CNV)
+-              return -EIO;
+-
+-      if (iocb->ki_flags & IOCB_NOWAIT) {
+-              if (!inode_trylock(inode))
+-                      return -EAGAIN;
+-      } else {
+-              inode_lock(inode);
+-      }
+-
+-      ret = zonefs_write_checks(iocb, from);
+-      if (ret <= 0)
+-              goto inode_unlock;
+-
+-      ret = iomap_file_buffered_write(iocb, from, &zonefs_write_iomap_ops);
+-      if (ret > 0)
+-              iocb->ki_pos += ret;
+-      else if (ret == -EIO)
+-              zonefs_io_error(inode, true);
+-
+-inode_unlock:
+-      inode_unlock(inode);
+-      if (ret > 0)
+-              ret = generic_write_sync(iocb, ret);
+-
+-      return ret;
+-}
+-
+-static ssize_t zonefs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
+-{
+-      struct inode *inode = file_inode(iocb->ki_filp);
+-
+-      if (unlikely(IS_IMMUTABLE(inode)))
+-              return -EPERM;
+-
+-      if (sb_rdonly(inode->i_sb))
+-              return -EROFS;
+-
+-      /* Write operations beyond the zone size are not allowed */
+-      if (iocb->ki_pos >= ZONEFS_I(inode)->i_max_size)
+-              return -EFBIG;
+-
+-      if (iocb->ki_flags & IOCB_DIRECT) {
+-              ssize_t ret = zonefs_file_dio_write(iocb, from);
+-              if (ret != -ENOTBLK)
+-                      return ret;
+-      }
+-
+-      return zonefs_file_buffered_write(iocb, from);
+-}
+-
+-static int zonefs_file_read_dio_end_io(struct kiocb *iocb, ssize_t size,
+-                                     int error, unsigned int flags)
+-{
+-      if (error) {
+-              zonefs_io_error(file_inode(iocb->ki_filp), false);
+-              return error;
+-      }
+-
+-      return 0;
+-}
+-
+-static const struct iomap_dio_ops zonefs_read_dio_ops = {
+-      .end_io                 = zonefs_file_read_dio_end_io,
+-};
+-
+-static ssize_t zonefs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
+-{
+-      struct inode *inode = file_inode(iocb->ki_filp);
+-      struct zonefs_inode_info *zi = ZONEFS_I(inode);
+-      struct super_block *sb = inode->i_sb;
+-      loff_t isize;
+-      ssize_t ret;
+-
+-      /* Offline zones cannot be read */
+-      if (unlikely(IS_IMMUTABLE(inode) && !(inode->i_mode & 0777)))
+-              return -EPERM;
+-
+-      if (iocb->ki_pos >= zi->i_max_size)
+-              return 0;
+-
+-      if (iocb->ki_flags & IOCB_NOWAIT) {
+-              if (!inode_trylock_shared(inode))
+-                      return -EAGAIN;
+-      } else {
+-              inode_lock_shared(inode);
+-      }
+-
+-      /* Limit read operations to written data */
+-      mutex_lock(&zi->i_truncate_mutex);
+-      isize = i_size_read(inode);
+-      if (iocb->ki_pos >= isize) {
+-              mutex_unlock(&zi->i_truncate_mutex);
+-              ret = 0;
+-              goto inode_unlock;
+-      }
+-      iov_iter_truncate(to, isize - iocb->ki_pos);
+-      mutex_unlock(&zi->i_truncate_mutex);
+-
+-      if (iocb->ki_flags & IOCB_DIRECT) {
+-              size_t count = iov_iter_count(to);
+-
+-              if ((iocb->ki_pos | count) & (sb->s_blocksize - 1)) {
+-                      ret = -EINVAL;
+-                      goto inode_unlock;
+-              }
+-              file_accessed(iocb->ki_filp);
+-              ret = iomap_dio_rw(iocb, to, &zonefs_read_iomap_ops,
+-                                 &zonefs_read_dio_ops, 0, NULL, 0);
+-      } else {
+-              ret = generic_file_read_iter(iocb, to);
+-              if (ret == -EIO)
+-                      zonefs_io_error(inode, false);
+-      }
+-
+-inode_unlock:
+-      inode_unlock_shared(inode);
+-
+-      return ret;
+-}
+-
+-/*
+- * Write open accounting is done only for sequential files.
+- */
+-static inline bool zonefs_seq_file_need_wro(struct inode *inode,
+-                                          struct file *file)
+-{
+-      struct zonefs_inode_info *zi = ZONEFS_I(inode);
+-
+-      if (zi->i_ztype != ZONEFS_ZTYPE_SEQ)
+-              return false;
+-
+-      if (!(file->f_mode & FMODE_WRITE))
+-              return false;
+-
+-      return true;
+-}
+-
+-static int zonefs_seq_file_write_open(struct inode *inode)
+-{
+-      struct zonefs_inode_info *zi = ZONEFS_I(inode);
+-      int ret = 0;
+-
+-      mutex_lock(&zi->i_truncate_mutex);
+-
+-      if (!zi->i_wr_refcnt) {
+-              struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb);
+-              unsigned int wro = atomic_inc_return(&sbi->s_wro_seq_files);
+-
+-              if (sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN) {
+-
+-                      if (sbi->s_max_wro_seq_files
+-                          && wro > sbi->s_max_wro_seq_files) {
+-                              atomic_dec(&sbi->s_wro_seq_files);
+-                              ret = -EBUSY;
+-                              goto unlock;
+-                      }
+-
+-                      if (i_size_read(inode) < zi->i_max_size) {
+-                              ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_OPEN);
+-                              if (ret) {
+-                                      atomic_dec(&sbi->s_wro_seq_files);
+-                                      goto unlock;
+-                              }
+-                              zi->i_flags |= ZONEFS_ZONE_OPEN;
+-                              zonefs_account_active(inode);
+-                      }
+-              }
+-      }
+-
+-      zi->i_wr_refcnt++;
+-
+-unlock:
+-      mutex_unlock(&zi->i_truncate_mutex);
+-
+-      return ret;
+-}
+-
+-static int zonefs_file_open(struct inode *inode, struct file *file)
+-{
+-      int ret;
+-
+-      ret = generic_file_open(inode, file);
+-      if (ret)
+-              return ret;
+-
+-      if (zonefs_seq_file_need_wro(inode, file))
+-              return zonefs_seq_file_write_open(inode);
+-
+-      return 0;
+-}
+-
+-static void zonefs_seq_file_write_close(struct inode *inode)
+-{
+-      struct zonefs_inode_info *zi = ZONEFS_I(inode);
+-      struct super_block *sb = inode->i_sb;
+-      struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
+-      int ret = 0;
+-
+-      mutex_lock(&zi->i_truncate_mutex);
+-
+-      zi->i_wr_refcnt--;
+-      if (zi->i_wr_refcnt)
+-              goto unlock;
+-
+-      /*
+-       * The file zone may not be open anymore (e.g. the file was truncated to
+-       * its maximum size or it was fully written). For this case, we only
+-       * need to decrement the write open count.
+-       */
+-      if (zi->i_flags & ZONEFS_ZONE_OPEN) {
+-              ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_CLOSE);
+-              if (ret) {
+-                      __zonefs_io_error(inode, false);
+-                      /*
+-                       * Leaving zones explicitly open may lead to a state
+-                       * where most zones cannot be written (zone resources
+-                       * exhausted). So take preventive action by remounting
+-                       * read-only.
+-                       */
+-                      if (zi->i_flags & ZONEFS_ZONE_OPEN &&
+-                          !(sb->s_flags & SB_RDONLY)) {
+-                              zonefs_warn(sb,
+-                                      "closing zone at %llu failed %d\n",
+-                                      zi->i_zsector, ret);
+-                              zonefs_warn(sb,
+-                                      "remounting filesystem read-only\n");
+-                              sb->s_flags |= SB_RDONLY;
+-                      }
+-                      goto unlock;
+-              }
+-
+-              zi->i_flags &= ~ZONEFS_ZONE_OPEN;
+-              zonefs_account_active(inode);
+-      }
+-
+-      atomic_dec(&sbi->s_wro_seq_files);
+-
+-unlock:
+-      mutex_unlock(&zi->i_truncate_mutex);
+-}
+-
+-static int zonefs_file_release(struct inode *inode, struct file *file)
+-{
+-      /*
+-       * If we explicitly open a zone we must close it again as well, but the
+-       * zone management operation can fail (either due to an IO error or as
+-       * the zone has gone offline or read-only). Make sure we don't fail the
+-       * close(2) for user-space.
+-       */
+-      if (zonefs_seq_file_need_wro(inode, file))
+-              zonefs_seq_file_write_close(inode);
+-
+-      return 0;
+-}
+-
+-static const struct file_operations zonefs_file_operations = {
+-      .open           = zonefs_file_open,
+-      .release        = zonefs_file_release,
+-      .fsync          = zonefs_file_fsync,
+-      .mmap           = zonefs_file_mmap,
+-      .llseek         = zonefs_file_llseek,
+-      .read_iter      = zonefs_file_read_iter,
+-      .write_iter     = zonefs_file_write_iter,
+-      .splice_read    = generic_file_splice_read,
+-      .splice_write   = iter_file_splice_write,
+-      .iopoll         = iocb_bio_iopoll,
+-};
+-
+ static struct kmem_cache *zonefs_inode_cachep;
+ static struct inode *zonefs_alloc_inode(struct super_block *sb)
+@@ -1408,13 +505,47 @@ static int zonefs_remount(struct super_block *sb, int *flags, char *data)
+       return zonefs_parse_options(sb, data);
+ }
+-static const struct super_operations zonefs_sops = {
+-      .alloc_inode    = zonefs_alloc_inode,
+-      .free_inode     = zonefs_free_inode,
+-      .statfs         = zonefs_statfs,
+-      .remount_fs     = zonefs_remount,
+-      .show_options   = zonefs_show_options,
+-};
++static int zonefs_inode_setattr(struct user_namespace *mnt_userns,
++                              struct dentry *dentry, struct iattr *iattr)
++{
++      struct inode *inode = d_inode(dentry);
++      int ret;
++
++      if (unlikely(IS_IMMUTABLE(inode)))
++              return -EPERM;
++
++      ret = setattr_prepare(&init_user_ns, dentry, iattr);
++      if (ret)
++              return ret;
++
++      /*
++       * Since files and directories cannot be created nor deleted, do not
++       * allow setting any write attributes on the sub-directories grouping
++       * files by zone type.
++       */
++      if ((iattr->ia_valid & ATTR_MODE) && S_ISDIR(inode->i_mode) &&
++          (iattr->ia_mode & 0222))
++              return -EPERM;
++
++      if (((iattr->ia_valid & ATTR_UID) &&
++           !uid_eq(iattr->ia_uid, inode->i_uid)) ||
++          ((iattr->ia_valid & ATTR_GID) &&
++           !gid_eq(iattr->ia_gid, inode->i_gid))) {
++              ret = dquot_transfer(mnt_userns, inode, iattr);
++              if (ret)
++                      return ret;
++      }
++
++      if (iattr->ia_valid & ATTR_SIZE) {
++              ret = zonefs_file_truncate(inode, iattr->ia_size);
++              if (ret)
++                      return ret;
++      }
++
++      setattr_copy(&init_user_ns, inode, iattr);
++
++      return 0;
++}
+ static const struct inode_operations zonefs_dir_inode_operations = {
+       .lookup         = simple_lookup,
+@@ -1434,6 +565,10 @@ static void zonefs_init_dir_inode(struct inode *parent, struct inode *inode,
+       inc_nlink(parent);
+ }
++static const struct inode_operations zonefs_file_inode_operations = {
++      .setattr        = zonefs_inode_setattr,
++};
++
+ static int zonefs_init_file_inode(struct inode *inode, struct blk_zone *zone,
+                                 enum zonefs_ztype type)
+ {
+@@ -1785,6 +920,14 @@ static int zonefs_read_super(struct super_block *sb)
+       return ret;
+ }
++static const struct super_operations zonefs_sops = {
++      .alloc_inode    = zonefs_alloc_inode,
++      .free_inode     = zonefs_free_inode,
++      .statfs         = zonefs_statfs,
++      .remount_fs     = zonefs_remount,
++      .show_options   = zonefs_show_options,
++};
++
+ /*
+  * Check that the device is zoned. If it is, get the list of zones and create
+  * sub-directories and files according to the device zone configuration and
+diff --git a/fs/zonefs/zonefs.h b/fs/zonefs/zonefs.h
+index 1dbe78119ff16..839ebe9afb6c1 100644
+--- a/fs/zonefs/zonefs.h
++++ b/fs/zonefs/zonefs.h
+@@ -209,6 +209,28 @@ static inline struct zonefs_sb_info *ZONEFS_SB(struct super_block *sb)
+ #define zonefs_warn(sb, format, args...)      \
+       pr_warn("zonefs (%s) WARNING: " format, sb->s_id, ## args)
++/* In super.c */
++void zonefs_account_active(struct inode *inode);
++int zonefs_zone_mgmt(struct inode *inode, enum req_op op);
++void zonefs_i_size_write(struct inode *inode, loff_t isize);
++void zonefs_update_stats(struct inode *inode, loff_t new_isize);
++void __zonefs_io_error(struct inode *inode, bool write);
++
++static inline void zonefs_io_error(struct inode *inode, bool write)
++{
++      struct zonefs_inode_info *zi = ZONEFS_I(inode);
++
++      mutex_lock(&zi->i_truncate_mutex);
++      __zonefs_io_error(inode, write);
++      mutex_unlock(&zi->i_truncate_mutex);
++}
++
++/* In file.c */
++extern const struct address_space_operations zonefs_file_aops;
++extern const struct file_operations zonefs_file_operations;
++int zonefs_file_truncate(struct inode *inode, loff_t isize);
++
++/* In sysfs.c */
+ int zonefs_sysfs_register(struct super_block *sb);
+ void zonefs_sysfs_unregister(struct super_block *sb);
+ int zonefs_sysfs_init(void);
+-- 
+2.39.2
+
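The comments in the patch above spell out the user-visible zonefs contract: sequential zone files accept only direct, append-only writes issued at the write pointer, mmap is read-only for them, and truncation is limited to 0 (a zone reset) or the maximum file size (a zone finish). The following minimal userspace sketch exercises that contract; the mount point, file name, and 4096-byte alignment are assumptions for illustration, not anything the patch defines.

#define _GNU_SOURCE             /* for O_DIRECT */
#include <fcntl.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	/* Assumed zonefs mount point and first sequential zone file. */
	const char *path = "/mnt/zonefs/seq/0";
	size_t align = 4096;    /* assumed logical block size */
	void *buf = NULL;
	int fd;

	fd = open(path, O_WRONLY | O_DIRECT);
	if (fd < 0)
		return 1;

	/* Truncating to 0 resets the zone: the write pointer moves to 0. */
	if (ftruncate(fd, 0) || posix_memalign(&buf, align, align))
		goto err;
	memset(buf, 0xab, align);

	/* Sequential zone files are append only: write at the current end. */
	if (write(fd, buf, align) != (ssize_t)align)
		goto err;

	fsync(fd);
	free(buf);
	close(fd);
	return 0;
err:
	free(buf);
	close(fd);
	return 1;
}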
diff --git a/queue-6.2/zonefs-separate-zone-information-from-inode-informat.patch b/queue-6.2/zonefs-separate-zone-information-from-inode-informat.patch
new file mode 100644 (file)
index 0000000..748ec4e
--- /dev/null
@@ -0,0 +1,1485 @@
+From 60aaa2368870616ab6b4f218c9194d826ee72f64 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 16 Nov 2022 18:15:40 +0900
+Subject: zonefs: Separate zone information from inode information
+
+From: Damien Le Moal <damien.lemoal@opensource.wdc.com>
+
+[ Upstream commit aa7f243f32e1d18036ee00d71d3ccfad70ae2121 ]
+
+In preparation for adding dynamic inode allocation, separate the inode
+zone information from the zonefs inode structure. The new data structure
+zonefs_zone is introduced to store in-memory information about a zone
+that must be kept throughout the lifetime of the device mount.
+
+Linking between a zone file inode and its zone information is done by
+setting the inode i_private field to point to a struct zonefs_zone.
+Using the i_private pointer avoids the need for adding a pointer in
+struct zonefs_inode_info. Besides the vfs inode, this structure is
+reduced to a mutex and a write open counter.
+
+One struct zonefs_zone is created per file inode on mount. These
+structures are organized in an array using the new struct
+zonefs_zone_group data structure to represent zone groups. The
+zonefs_zone arrays are indexed per file number (the index of a struct
+zonefs_zone in its array directly gives the file number/name for that
+zone file inode).
+
+Signed-off-by: Damien Le Moal <damien.lemoal@opensource.wdc.com>
+Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
+Stable-dep-of: 88b170088ad2 ("zonefs: Fix error message in zonefs_file_dio_append()")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/zonefs/file.c   |  99 ++++----
+ fs/zonefs/super.c  | 571 +++++++++++++++++++++++++++------------------
+ fs/zonefs/trace.h  |  20 +-
+ fs/zonefs/zonefs.h |  63 +++--
+ 4 files changed, 449 insertions(+), 304 deletions(-)
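The separation described above can be pictured with a short sketch. The structure and helper names below match the ones used in the hunks that follow, but the layouts are condensed for illustration (the g_zones array name is an assumption) and are not the full definitions from fs/zonefs/zonefs.h.

#include <linux/fs.h>
#include <linux/types.h>

/* Per-zone state kept for the lifetime of the mount (condensed sketch). */
struct zonefs_zone {
	unsigned int	z_flags;	/* ZONEFS_ZONE_OPEN, _ACTIVE, ... */
	sector_t	z_sector;	/* zone start sector */
	loff_t		z_size;		/* zone size in bytes */
	loff_t		z_capacity;	/* maximum file size in bytes */
	loff_t		z_wpoffset;	/* write pointer offset in the file */
};

/* One group per zone type ("cnv", "seq"), indexed by file number. */
struct zonefs_zone_group {
	unsigned int		g_nr_zones;
	struct zonefs_zone	*g_zones;	/* array name is assumed */
};

/* A zone file inode finds its zone through inode->i_private. */
static inline struct zonefs_zone *zonefs_inode_zone(struct inode *inode)
{
	return inode->i_private;
}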
+
+diff --git a/fs/zonefs/file.c b/fs/zonefs/file.c
+index 64873d31d75dd..738b0e28d74b5 100644
+--- a/fs/zonefs/file.c
++++ b/fs/zonefs/file.c
+@@ -29,6 +29,7 @@ static int zonefs_read_iomap_begin(struct inode *inode, loff_t offset,
+                                  struct iomap *iomap, struct iomap *srcmap)
+ {
+       struct zonefs_inode_info *zi = ZONEFS_I(inode);
++      struct zonefs_zone *z = zonefs_inode_zone(inode);
+       struct super_block *sb = inode->i_sb;
+       loff_t isize;
+@@ -46,7 +47,7 @@ static int zonefs_read_iomap_begin(struct inode *inode, loff_t offset,
+               iomap->length = length;
+       } else {
+               iomap->type = IOMAP_MAPPED;
+-              iomap->addr = (zi->i_zsector << SECTOR_SHIFT) + iomap->offset;
++              iomap->addr = (z->z_sector << SECTOR_SHIFT) + iomap->offset;
+               iomap->length = isize - iomap->offset;
+       }
+       mutex_unlock(&zi->i_truncate_mutex);
+@@ -65,11 +66,12 @@ static int zonefs_write_iomap_begin(struct inode *inode, loff_t offset,
+                                   struct iomap *iomap, struct iomap *srcmap)
+ {
+       struct zonefs_inode_info *zi = ZONEFS_I(inode);
++      struct zonefs_zone *z = zonefs_inode_zone(inode);
+       struct super_block *sb = inode->i_sb;
+       loff_t isize;
+       /* All write I/Os should always be within the file maximum size */
+-      if (WARN_ON_ONCE(offset + length > zi->i_max_size))
++      if (WARN_ON_ONCE(offset + length > z->z_capacity))
+               return -EIO;
+       /*
+@@ -77,7 +79,7 @@ static int zonefs_write_iomap_begin(struct inode *inode, loff_t offset,
+        * checked when writes are issued, so warn if we see a page writeback
+        * operation.
+        */
+-      if (WARN_ON_ONCE(zonefs_zone_is_seq(zi) && !(flags & IOMAP_DIRECT)))
++      if (WARN_ON_ONCE(zonefs_zone_is_seq(z) && !(flags & IOMAP_DIRECT)))
+               return -EIO;
+       /*
+@@ -88,11 +90,11 @@ static int zonefs_write_iomap_begin(struct inode *inode, loff_t offset,
+       mutex_lock(&zi->i_truncate_mutex);
+       iomap->bdev = inode->i_sb->s_bdev;
+       iomap->offset = ALIGN_DOWN(offset, sb->s_blocksize);
+-      iomap->addr = (zi->i_zsector << SECTOR_SHIFT) + iomap->offset;
++      iomap->addr = (z->z_sector << SECTOR_SHIFT) + iomap->offset;
+       isize = i_size_read(inode);
+       if (iomap->offset >= isize) {
+               iomap->type = IOMAP_UNWRITTEN;
+-              iomap->length = zi->i_max_size - iomap->offset;
++              iomap->length = z->z_capacity - iomap->offset;
+       } else {
+               iomap->type = IOMAP_MAPPED;
+               iomap->length = isize - iomap->offset;
+@@ -125,9 +127,9 @@ static void zonefs_readahead(struct readahead_control *rac)
+ static int zonefs_write_map_blocks(struct iomap_writepage_ctx *wpc,
+                                  struct inode *inode, loff_t offset)
+ {
+-      struct zonefs_inode_info *zi = ZONEFS_I(inode);
++      struct zonefs_zone *z = zonefs_inode_zone(inode);
+-      if (WARN_ON_ONCE(zonefs_zone_is_seq(zi)))
++      if (WARN_ON_ONCE(zonefs_zone_is_seq(z)))
+               return -EIO;
+       if (WARN_ON_ONCE(offset >= i_size_read(inode)))
+               return -EIO;
+@@ -137,7 +139,8 @@ static int zonefs_write_map_blocks(struct iomap_writepage_ctx *wpc,
+           offset < wpc->iomap.offset + wpc->iomap.length)
+               return 0;
+-      return zonefs_write_iomap_begin(inode, offset, zi->i_max_size - offset,
++      return zonefs_write_iomap_begin(inode, offset,
++                                      z->z_capacity - offset,
+                                       IOMAP_WRITE, &wpc->iomap, NULL);
+ }
+@@ -185,6 +188,7 @@ const struct address_space_operations zonefs_file_aops = {
+ int zonefs_file_truncate(struct inode *inode, loff_t isize)
+ {
+       struct zonefs_inode_info *zi = ZONEFS_I(inode);
++      struct zonefs_zone *z = zonefs_inode_zone(inode);
+       loff_t old_isize;
+       enum req_op op;
+       int ret = 0;
+@@ -194,12 +198,12 @@ int zonefs_file_truncate(struct inode *inode, loff_t isize)
+        * only down to a 0 size, which is equivalent to a zone reset, and to
+        * the maximum file size, which is equivalent to a zone finish.
+        */
+-      if (!zonefs_zone_is_seq(zi))
++      if (!zonefs_zone_is_seq(z))
+               return -EPERM;
+       if (!isize)
+               op = REQ_OP_ZONE_RESET;
+-      else if (isize == zi->i_max_size)
++      else if (isize == z->z_capacity)
+               op = REQ_OP_ZONE_FINISH;
+       else
+               return -EPERM;
+@@ -216,7 +220,7 @@ int zonefs_file_truncate(struct inode *inode, loff_t isize)
+       if (isize == old_isize)
+               goto unlock;
+-      ret = zonefs_zone_mgmt(inode, op);
++      ret = zonefs_inode_zone_mgmt(inode, op);
+       if (ret)
+               goto unlock;
+@@ -224,7 +228,7 @@ int zonefs_file_truncate(struct inode *inode, loff_t isize)
+        * If the mount option ZONEFS_MNTOPT_EXPLICIT_OPEN is set,
+        * take care of open zones.
+        */
+-      if (zi->i_flags & ZONEFS_ZONE_OPEN) {
++      if (z->z_flags & ZONEFS_ZONE_OPEN) {
+               /*
+                * Truncating a zone to EMPTY or FULL is the equivalent of
+                * closing the zone. For a truncation to 0, we need to
+@@ -234,15 +238,15 @@ int zonefs_file_truncate(struct inode *inode, loff_t isize)
+                * the open flag.
+                */
+               if (!isize)
+-                      ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_OPEN);
++                      ret = zonefs_inode_zone_mgmt(inode, REQ_OP_ZONE_OPEN);
+               else
+-                      zi->i_flags &= ~ZONEFS_ZONE_OPEN;
++                      z->z_flags &= ~ZONEFS_ZONE_OPEN;
+       }
+       zonefs_update_stats(inode, isize);
+       truncate_setsize(inode, isize);
+-      zi->i_wpoffset = isize;
+-      zonefs_account_active(inode);
++      z->z_wpoffset = isize;
++      zonefs_inode_account_active(inode);
+ unlock:
+       mutex_unlock(&zi->i_truncate_mutex);
+@@ -349,7 +353,7 @@ static int zonefs_file_write_dio_end_io(struct kiocb *iocb, ssize_t size,
+               return error;
+       }
+-      if (size && zonefs_zone_is_seq(zi)) {
++      if (size && zonefs_inode_is_seq(inode)) {
+               /*
+                * Note that we may be seeing completions out of order,
+                * but that is not a problem since a write completed
+@@ -375,7 +379,7 @@ static const struct iomap_dio_ops zonefs_write_dio_ops = {
+ static ssize_t zonefs_file_dio_append(struct kiocb *iocb, struct iov_iter *from)
+ {
+       struct inode *inode = file_inode(iocb->ki_filp);
+-      struct zonefs_inode_info *zi = ZONEFS_I(inode);
++      struct zonefs_zone *z = zonefs_inode_zone(inode);
+       struct block_device *bdev = inode->i_sb->s_bdev;
+       unsigned int max = bdev_max_zone_append_sectors(bdev);
+       struct bio *bio;
+@@ -392,7 +396,7 @@ static ssize_t zonefs_file_dio_append(struct kiocb *iocb, struct iov_iter *from)
+       bio = bio_alloc(bdev, nr_pages,
+                       REQ_OP_ZONE_APPEND | REQ_SYNC | REQ_IDLE, GFP_NOFS);
+-      bio->bi_iter.bi_sector = zi->i_zsector;
++      bio->bi_iter.bi_sector = z->z_sector;
+       bio->bi_ioprio = iocb->ki_ioprio;
+       if (iocb_is_dsync(iocb))
+               bio->bi_opf |= REQ_FUA;
+@@ -417,12 +421,12 @@ static ssize_t zonefs_file_dio_append(struct kiocb *iocb, struct iov_iter *from)
+        */
+       if (!ret) {
+               sector_t wpsector =
+-                      zi->i_zsector + (zi->i_wpoffset >> SECTOR_SHIFT);
++                      z->z_sector + (z->z_wpoffset >> SECTOR_SHIFT);
+               if (bio->bi_iter.bi_sector != wpsector) {
+                       zonefs_warn(inode->i_sb,
+                               "Corrupted write pointer %llu for zone at %llu\n",
+-                              wpsector, zi->i_zsector);
++                              wpsector, z->z_sector);
+                       ret = -EIO;
+               }
+       }
+@@ -450,9 +454,9 @@ static loff_t zonefs_write_check_limits(struct file *file, loff_t pos,
+                                       loff_t count)
+ {
+       struct inode *inode = file_inode(file);
+-      struct zonefs_inode_info *zi = ZONEFS_I(inode);
++      struct zonefs_zone *z = zonefs_inode_zone(inode);
+       loff_t limit = rlimit(RLIMIT_FSIZE);
+-      loff_t max_size = zi->i_max_size;
++      loff_t max_size = z->z_capacity;
+       if (limit != RLIM_INFINITY) {
+               if (pos >= limit) {
+@@ -476,6 +480,7 @@ static ssize_t zonefs_write_checks(struct kiocb *iocb, struct iov_iter *from)
+       struct file *file = iocb->ki_filp;
+       struct inode *inode = file_inode(file);
+       struct zonefs_inode_info *zi = ZONEFS_I(inode);
++      struct zonefs_zone *z = zonefs_inode_zone(inode);
+       loff_t count;
+       if (IS_SWAPFILE(inode))
+@@ -488,10 +493,10 @@ static ssize_t zonefs_write_checks(struct kiocb *iocb, struct iov_iter *from)
+               return -EINVAL;
+       if (iocb->ki_flags & IOCB_APPEND) {
+-              if (zonefs_zone_is_cnv(zi))
++              if (zonefs_zone_is_cnv(z))
+                       return -EINVAL;
+               mutex_lock(&zi->i_truncate_mutex);
+-              iocb->ki_pos = zi->i_wpoffset;
++              iocb->ki_pos = z->z_wpoffset;
+               mutex_unlock(&zi->i_truncate_mutex);
+       }
+@@ -518,6 +523,7 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from)
+ {
+       struct inode *inode = file_inode(iocb->ki_filp);
+       struct zonefs_inode_info *zi = ZONEFS_I(inode);
++      struct zonefs_zone *z = zonefs_inode_zone(inode);
+       struct super_block *sb = inode->i_sb;
+       bool sync = is_sync_kiocb(iocb);
+       bool append = false;
+@@ -528,7 +534,7 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from)
+        * as this can cause write reordering (e.g. the first aio gets EAGAIN
+        * on the inode lock but the second goes through but is now unaligned).
+        */
+-      if (zonefs_zone_is_seq(zi) && !sync && (iocb->ki_flags & IOCB_NOWAIT))
++      if (zonefs_zone_is_seq(z) && !sync && (iocb->ki_flags & IOCB_NOWAIT))
+               return -EOPNOTSUPP;
+       if (iocb->ki_flags & IOCB_NOWAIT) {
+@@ -550,9 +556,9 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from)
+       }
+       /* Enforce sequential writes (append only) in sequential zones */
+-      if (zonefs_zone_is_seq(zi)) {
++      if (zonefs_zone_is_seq(z)) {
+               mutex_lock(&zi->i_truncate_mutex);
+-              if (iocb->ki_pos != zi->i_wpoffset) {
++              if (iocb->ki_pos != z->z_wpoffset) {
+                       mutex_unlock(&zi->i_truncate_mutex);
+                       ret = -EINVAL;
+                       goto inode_unlock;
+@@ -566,7 +572,7 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from)
+       else
+               ret = iomap_dio_rw(iocb, from, &zonefs_write_iomap_ops,
+                                  &zonefs_write_dio_ops, 0, NULL, 0);
+-      if (zonefs_zone_is_seq(zi) &&
++      if (zonefs_zone_is_seq(z) &&
+           (ret > 0 || ret == -EIOCBQUEUED)) {
+               if (ret > 0)
+                       count = ret;
+@@ -577,8 +583,8 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from)
+                * will correct it. Also do active seq file accounting.
+                */
+               mutex_lock(&zi->i_truncate_mutex);
+-              zi->i_wpoffset += count;
+-              zonefs_account_active(inode);
++              z->z_wpoffset += count;
++              zonefs_inode_account_active(inode);
+               mutex_unlock(&zi->i_truncate_mutex);
+       }
+@@ -629,6 +635,7 @@ static ssize_t zonefs_file_buffered_write(struct kiocb *iocb,
+ static ssize_t zonefs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
+ {
+       struct inode *inode = file_inode(iocb->ki_filp);
++      struct zonefs_zone *z = zonefs_inode_zone(inode);
+       if (unlikely(IS_IMMUTABLE(inode)))
+               return -EPERM;
+@@ -636,8 +643,8 @@ static ssize_t zonefs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
+       if (sb_rdonly(inode->i_sb))
+               return -EROFS;
+-      /* Write operations beyond the zone size are not allowed */
+-      if (iocb->ki_pos >= ZONEFS_I(inode)->i_max_size)
++      /* Write operations beyond the zone capacity are not allowed */
++      if (iocb->ki_pos >= z->z_capacity)
+               return -EFBIG;
+       if (iocb->ki_flags & IOCB_DIRECT) {
+@@ -669,6 +676,7 @@ static ssize_t zonefs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
+ {
+       struct inode *inode = file_inode(iocb->ki_filp);
+       struct zonefs_inode_info *zi = ZONEFS_I(inode);
++      struct zonefs_zone *z = zonefs_inode_zone(inode);
+       struct super_block *sb = inode->i_sb;
+       loff_t isize;
+       ssize_t ret;
+@@ -677,7 +685,7 @@ static ssize_t zonefs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
+       if (unlikely(IS_IMMUTABLE(inode) && !(inode->i_mode & 0777)))
+               return -EPERM;
+-      if (iocb->ki_pos >= zi->i_max_size)
++      if (iocb->ki_pos >= z->z_capacity)
+               return 0;
+       if (iocb->ki_flags & IOCB_NOWAIT) {
+@@ -738,6 +746,7 @@ static inline bool zonefs_seq_file_need_wro(struct inode *inode,
+ static int zonefs_seq_file_write_open(struct inode *inode)
+ {
+       struct zonefs_inode_info *zi = ZONEFS_I(inode);
++      struct zonefs_zone *z = zonefs_inode_zone(inode);
+       int ret = 0;
+       mutex_lock(&zi->i_truncate_mutex);
+@@ -755,14 +764,15 @@ static int zonefs_seq_file_write_open(struct inode *inode)
+                               goto unlock;
+                       }
+-                      if (i_size_read(inode) < zi->i_max_size) {
+-                              ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_OPEN);
++                      if (i_size_read(inode) < z->z_capacity) {
++                              ret = zonefs_inode_zone_mgmt(inode,
++                                                           REQ_OP_ZONE_OPEN);
+                               if (ret) {
+                                       atomic_dec(&sbi->s_wro_seq_files);
+                                       goto unlock;
+                               }
+-                              zi->i_flags |= ZONEFS_ZONE_OPEN;
+-                              zonefs_account_active(inode);
++                              z->z_flags |= ZONEFS_ZONE_OPEN;
++                              zonefs_inode_account_active(inode);
+                       }
+               }
+       }
+@@ -792,6 +802,7 @@ static int zonefs_file_open(struct inode *inode, struct file *file)
+ static void zonefs_seq_file_write_close(struct inode *inode)
+ {
+       struct zonefs_inode_info *zi = ZONEFS_I(inode);
++      struct zonefs_zone *z = zonefs_inode_zone(inode);
+       struct super_block *sb = inode->i_sb;
+       struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
+       int ret = 0;
+@@ -807,8 +818,8 @@ static void zonefs_seq_file_write_close(struct inode *inode)
+        * its maximum size or it was fully written). For this case, we only
+        * need to decrement the write open count.
+        */
+-      if (zi->i_flags & ZONEFS_ZONE_OPEN) {
+-              ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_CLOSE);
++      if (z->z_flags & ZONEFS_ZONE_OPEN) {
++              ret = zonefs_inode_zone_mgmt(inode, REQ_OP_ZONE_CLOSE);
+               if (ret) {
+                       __zonefs_io_error(inode, false);
+                       /*
+@@ -817,11 +828,11 @@ static void zonefs_seq_file_write_close(struct inode *inode)
+                        * exhausted). So take preventive action by remounting
+                        * read-only.
+                        */
+-                      if (zi->i_flags & ZONEFS_ZONE_OPEN &&
++                      if (z->z_flags & ZONEFS_ZONE_OPEN &&
+                           !(sb->s_flags & SB_RDONLY)) {
+                               zonefs_warn(sb,
+                                       "closing zone at %llu failed %d\n",
+-                                      zi->i_zsector, ret);
++                                      z->z_sector, ret);
+                               zonefs_warn(sb,
+                                       "remounting filesystem read-only\n");
+                               sb->s_flags |= SB_RDONLY;
+@@ -829,8 +840,8 @@ static void zonefs_seq_file_write_close(struct inode *inode)
+                       goto unlock;
+               }
+-              zi->i_flags &= ~ZONEFS_ZONE_OPEN;
+-              zonefs_account_active(inode);
++              z->z_flags &= ~ZONEFS_ZONE_OPEN;
++              zonefs_inode_account_active(inode);
+       }
+       atomic_dec(&sbi->s_wro_seq_files);
+diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
+index a4af29dc32e7d..270ded209dde5 100644
+--- a/fs/zonefs/super.c
++++ b/fs/zonefs/super.c
+@@ -28,33 +28,47 @@
+ #include "trace.h"
+ /*
+- * Manage the active zone count. Called with zi->i_truncate_mutex held.
++ * Get the name of a zone group directory.
+  */
+-void zonefs_account_active(struct inode *inode)
++static const char *zonefs_zgroup_name(enum zonefs_ztype ztype)
+ {
+-      struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb);
+-      struct zonefs_inode_info *zi = ZONEFS_I(inode);
++      switch (ztype) {
++      case ZONEFS_ZTYPE_CNV:
++              return "cnv";
++      case ZONEFS_ZTYPE_SEQ:
++              return "seq";
++      default:
++              WARN_ON_ONCE(1);
++              return "???";
++      }
++}
+-      lockdep_assert_held(&zi->i_truncate_mutex);
++/*
++ * Manage the active zone count.
++ */
++static void zonefs_account_active(struct super_block *sb,
++                                struct zonefs_zone *z)
++{
++      struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
+-      if (zonefs_zone_is_cnv(zi))
++      if (zonefs_zone_is_cnv(z))
+               return;
+       /*
+        * For zones that transitioned to the offline or readonly condition,
+        * we only need to clear the active state.
+        */
+-      if (zi->i_flags & (ZONEFS_ZONE_OFFLINE | ZONEFS_ZONE_READONLY))
++      if (z->z_flags & (ZONEFS_ZONE_OFFLINE | ZONEFS_ZONE_READONLY))
+               goto out;
+       /*
+        * If the zone is active, that is, if it is explicitly open or
+        * partially written, check if it was already accounted as active.
+        */
+-      if ((zi->i_flags & ZONEFS_ZONE_OPEN) ||
+-          (zi->i_wpoffset > 0 && zi->i_wpoffset < zi->i_max_size)) {
+-              if (!(zi->i_flags & ZONEFS_ZONE_ACTIVE)) {
+-                      zi->i_flags |= ZONEFS_ZONE_ACTIVE;
++      if ((z->z_flags & ZONEFS_ZONE_OPEN) ||
++          (z->z_wpoffset > 0 && z->z_wpoffset < z->z_capacity)) {
++              if (!(z->z_flags & ZONEFS_ZONE_ACTIVE)) {
++                      z->z_flags |= ZONEFS_ZONE_ACTIVE;
+                       atomic_inc(&sbi->s_active_seq_files);
+               }
+               return;
+@@ -62,18 +76,29 @@ void zonefs_account_active(struct inode *inode)
+ out:
+       /* The zone is not active. If it was, update the active count */
+-      if (zi->i_flags & ZONEFS_ZONE_ACTIVE) {
+-              zi->i_flags &= ~ZONEFS_ZONE_ACTIVE;
++      if (z->z_flags & ZONEFS_ZONE_ACTIVE) {
++              z->z_flags &= ~ZONEFS_ZONE_ACTIVE;
+               atomic_dec(&sbi->s_active_seq_files);
+       }
+ }
+-int zonefs_zone_mgmt(struct inode *inode, enum req_op op)
++/*
++ * Manage the active zone count. Called with zi->i_truncate_mutex held.
++ */
++void zonefs_inode_account_active(struct inode *inode)
+ {
+-      struct zonefs_inode_info *zi = ZONEFS_I(inode);
+-      int ret;
++      lockdep_assert_held(&ZONEFS_I(inode)->i_truncate_mutex);
+-      lockdep_assert_held(&zi->i_truncate_mutex);
++      return zonefs_account_active(inode->i_sb, zonefs_inode_zone(inode));
++}
++
++/*
++ * Execute a zone management operation.
++ */
++static int zonefs_zone_mgmt(struct super_block *sb,
++                          struct zonefs_zone *z, enum req_op op)
++{
++      int ret;
+       /*
+        * With ZNS drives, closing an explicitly open zone that has not been
+@@ -83,37 +108,45 @@ int zonefs_zone_mgmt(struct inode *inode, enum req_op op)
+        * are exceeded, make sure that the zone does not remain active by
+        * resetting it.
+        */
+-      if (op == REQ_OP_ZONE_CLOSE && !zi->i_wpoffset)
++      if (op == REQ_OP_ZONE_CLOSE && !z->z_wpoffset)
+               op = REQ_OP_ZONE_RESET;
+-      trace_zonefs_zone_mgmt(inode, op);
+-      ret = blkdev_zone_mgmt(inode->i_sb->s_bdev, op, zi->i_zsector,
+-                             zi->i_zone_size >> SECTOR_SHIFT, GFP_NOFS);
++      trace_zonefs_zone_mgmt(sb, z, op);
++      ret = blkdev_zone_mgmt(sb->s_bdev, op, z->z_sector,
++                             z->z_size >> SECTOR_SHIFT, GFP_NOFS);
+       if (ret) {
+-              zonefs_err(inode->i_sb,
++              zonefs_err(sb,
+                          "Zone management operation %s at %llu failed %d\n",
+-                         blk_op_str(op), zi->i_zsector, ret);
++                         blk_op_str(op), z->z_sector, ret);
+               return ret;
+       }
+       return 0;
+ }
++int zonefs_inode_zone_mgmt(struct inode *inode, enum req_op op)
++{
++      lockdep_assert_held(&ZONEFS_I(inode)->i_truncate_mutex);
++
++      return zonefs_zone_mgmt(inode->i_sb, zonefs_inode_zone(inode), op);
++}
++
+ void zonefs_i_size_write(struct inode *inode, loff_t isize)
+ {
+-      struct zonefs_inode_info *zi = ZONEFS_I(inode);
++      struct zonefs_zone *z = zonefs_inode_zone(inode);
+       i_size_write(inode, isize);
++
+       /*
+        * A full zone is no longer open/active and does not need
+        * explicit closing.
+        */
+-      if (isize >= zi->i_max_size) {
++      if (isize >= z->z_capacity) {
+               struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb);
+-              if (zi->i_flags & ZONEFS_ZONE_ACTIVE)
++              if (z->z_flags & ZONEFS_ZONE_ACTIVE)
+                       atomic_dec(&sbi->s_active_seq_files);
+-              zi->i_flags &= ~(ZONEFS_ZONE_OPEN | ZONEFS_ZONE_ACTIVE);
++              z->z_flags &= ~(ZONEFS_ZONE_OPEN | ZONEFS_ZONE_ACTIVE);
+       }
+ }
+@@ -150,20 +183,18 @@ void zonefs_update_stats(struct inode *inode, loff_t new_isize)
+ }
+ /*
+- * Check a zone condition and adjust its file inode access permissions for
+- * offline and readonly zones. Return the inode size corresponding to the
+- * amount of readable data in the zone.
++ * Check a zone condition. Return the amount of written (and still readable)
++ * data in the zone.
+  */
+-static loff_t zonefs_check_zone_condition(struct inode *inode,
++static loff_t zonefs_check_zone_condition(struct super_block *sb,
++                                        struct zonefs_zone *z,
+                                         struct blk_zone *zone)
+ {
+-      struct zonefs_inode_info *zi = ZONEFS_I(inode);
+-
+       switch (zone->cond) {
+       case BLK_ZONE_COND_OFFLINE:
+-              zonefs_warn(inode->i_sb, "inode %lu: offline zone\n",
+-                          inode->i_ino);
+-              zi->i_flags |= ZONEFS_ZONE_OFFLINE;
++              zonefs_warn(sb, "Zone %llu: offline zone\n",
++                          z->z_sector);
++              z->z_flags |= ZONEFS_ZONE_OFFLINE;
+               return 0;
+       case BLK_ZONE_COND_READONLY:
+               /*
+@@ -174,18 +205,18 @@ static loff_t zonefs_check_zone_condition(struct inode *inode,
+                * the inode size as it was when last updated so that the user
+                * can recover data.
+                */
+-              zonefs_warn(inode->i_sb, "inode %lu: read-only zone\n",
+-                          inode->i_ino);
+-              zi->i_flags |= ZONEFS_ZONE_READONLY;
+-              if (zonefs_zone_is_cnv(zi))
+-                      return zi->i_max_size;
+-              return zi->i_wpoffset;
++              zonefs_warn(sb, "Zone %llu: read-only zone\n",
++                          z->z_sector);
++              z->z_flags |= ZONEFS_ZONE_READONLY;
++              if (zonefs_zone_is_cnv(z))
++                      return z->z_capacity;
++              return z->z_wpoffset;
+       case BLK_ZONE_COND_FULL:
+               /* The write pointer of full zones is invalid. */
+-              return zi->i_max_size;
++              return z->z_capacity;
+       default:
+-              if (zonefs_zone_is_cnv(zi))
+-                      return zi->i_max_size;
++              if (zonefs_zone_is_cnv(z))
++                      return z->z_capacity;
+               return (zone->wp - zone->start) << SECTOR_SHIFT;
+       }
+ }
+@@ -196,22 +227,22 @@ static loff_t zonefs_check_zone_condition(struct inode *inode,
+  */
+ static void zonefs_inode_update_mode(struct inode *inode)
+ {
+-      struct zonefs_inode_info *zi = ZONEFS_I(inode);
++      struct zonefs_zone *z = zonefs_inode_zone(inode);
+-      if (zi->i_flags & ZONEFS_ZONE_OFFLINE) {
++      if (z->z_flags & ZONEFS_ZONE_OFFLINE) {
+               /* Offline zones cannot be read nor written */
+               inode->i_flags |= S_IMMUTABLE;
+               inode->i_mode &= ~0777;
+-      } else if (zi->i_flags & ZONEFS_ZONE_READONLY) {
++      } else if (z->z_flags & ZONEFS_ZONE_READONLY) {
+               /* Readonly zones cannot be written */
+               inode->i_flags |= S_IMMUTABLE;
+-              if (zi->i_flags & ZONEFS_ZONE_INIT_MODE)
++              if (z->z_flags & ZONEFS_ZONE_INIT_MODE)
+                       inode->i_mode &= ~0777;
+               else
+                       inode->i_mode &= ~0222;
+       }
+-      zi->i_flags &= ~ZONEFS_ZONE_INIT_MODE;
++      z->z_flags &= ~ZONEFS_ZONE_INIT_MODE;
+ }
+ struct zonefs_ioerr_data {
+@@ -224,7 +255,7 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx,
+ {
+       struct zonefs_ioerr_data *err = data;
+       struct inode *inode = err->inode;
+-      struct zonefs_inode_info *zi = ZONEFS_I(inode);
++      struct zonefs_zone *z = zonefs_inode_zone(inode);
+       struct super_block *sb = inode->i_sb;
+       struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
+       loff_t isize, data_size;
+@@ -235,9 +266,9 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx,
+        * as there is no inconsistency between the inode size and the amount of
+        * data written in the zone (data_size).
+        */
+-      data_size = zonefs_check_zone_condition(inode, zone);
++      data_size = zonefs_check_zone_condition(sb, z, zone);
+       isize = i_size_read(inode);
+-      if (!(zi->i_flags & (ZONEFS_ZONE_READONLY | ZONEFS_ZONE_OFFLINE)) &&
++      if (!(z->z_flags & (ZONEFS_ZONE_READONLY | ZONEFS_ZONE_OFFLINE)) &&
+           !err->write && isize == data_size)
+               return 0;
+@@ -260,8 +291,9 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx,
+        * In all cases, warn about inode size inconsistency and handle the
+        * IO error according to the zone condition and to the mount options.
+        */
+-      if (zonefs_zone_is_seq(zi) && isize != data_size)
+-              zonefs_warn(sb, "inode %lu: invalid size %lld (should be %lld)\n",
++      if (zonefs_zone_is_seq(z) && isize != data_size)
++              zonefs_warn(sb,
++                          "inode %lu: invalid size %lld (should be %lld)\n",
+                           inode->i_ino, isize, data_size);
+       /*
+@@ -270,20 +302,20 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx,
+        * zone condition to read-only and offline respectively, as if the
+        * condition was signaled by the hardware.
+        */
+-      if ((zi->i_flags & ZONEFS_ZONE_OFFLINE) ||
++      if ((z->z_flags & ZONEFS_ZONE_OFFLINE) ||
+           (sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_ZOL)) {
+               zonefs_warn(sb, "inode %lu: read/write access disabled\n",
+                           inode->i_ino);
+-              if (!(zi->i_flags & ZONEFS_ZONE_OFFLINE))
+-                      zi->i_flags |= ZONEFS_ZONE_OFFLINE;
++              if (!(z->z_flags & ZONEFS_ZONE_OFFLINE))
++                      z->z_flags |= ZONEFS_ZONE_OFFLINE;
+               zonefs_inode_update_mode(inode);
+               data_size = 0;
+-      } else if ((zi->i_flags & ZONEFS_ZONE_READONLY) ||
++      } else if ((z->z_flags & ZONEFS_ZONE_READONLY) ||
+                  (sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_ZRO)) {
+               zonefs_warn(sb, "inode %lu: write access disabled\n",
+                           inode->i_ino);
+-              if (!(zi->i_flags & ZONEFS_ZONE_READONLY))
+-                      zi->i_flags |= ZONEFS_ZONE_READONLY;
++              if (!(z->z_flags & ZONEFS_ZONE_READONLY))
++                      z->z_flags |= ZONEFS_ZONE_READONLY;
+               zonefs_inode_update_mode(inode);
+               data_size = isize;
+       } else if (sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_RO &&
+@@ -299,8 +331,8 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx,
+        * close of the zone when the inode file is closed.
+        */
+       if ((sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN) &&
+-          (zi->i_flags & (ZONEFS_ZONE_READONLY | ZONEFS_ZONE_OFFLINE)))
+-              zi->i_flags &= ~ZONEFS_ZONE_OPEN;
++          (z->z_flags & (ZONEFS_ZONE_READONLY | ZONEFS_ZONE_OFFLINE)))
++              z->z_flags &= ~ZONEFS_ZONE_OPEN;
+       /*
+       * If error=remount-ro was specified, any error results in remounting
+@@ -317,8 +349,8 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx,
+        */
+       zonefs_update_stats(inode, data_size);
+       zonefs_i_size_write(inode, data_size);
+-      zi->i_wpoffset = data_size;
+-      zonefs_account_active(inode);
++      z->z_wpoffset = data_size;
++      zonefs_inode_account_active(inode);
+       return 0;
+ }
+@@ -332,7 +364,7 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx,
+  */
+ void __zonefs_io_error(struct inode *inode, bool write)
+ {
+-      struct zonefs_inode_info *zi = ZONEFS_I(inode);
++      struct zonefs_zone *z = zonefs_inode_zone(inode);
+       struct super_block *sb = inode->i_sb;
+       struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
+       unsigned int noio_flag;
+@@ -348,8 +380,8 @@ void __zonefs_io_error(struct inode *inode, bool write)
+        * files with aggregated conventional zones, for which the inode zone
+        * size is always larger than the device zone size.
+        */
+-      if (zi->i_zone_size > bdev_zone_sectors(sb->s_bdev))
+-              nr_zones = zi->i_zone_size >>
++      if (z->z_size > bdev_zone_sectors(sb->s_bdev))
++              nr_zones = z->z_size >>
+                       (sbi->s_zone_sectors_shift + SECTOR_SHIFT);
+       /*
+@@ -361,7 +393,7 @@ void __zonefs_io_error(struct inode *inode, bool write)
+        * the GFP_NOIO context avoids both problems.
+        */
+       noio_flag = memalloc_noio_save();
+-      ret = blkdev_report_zones(sb->s_bdev, zi->i_zsector, nr_zones,
++      ret = blkdev_report_zones(sb->s_bdev, z->z_sector, nr_zones,
+                                 zonefs_io_error_cb, &err);
+       if (ret != nr_zones)
+               zonefs_err(sb, "Get inode %lu zone information failed %d\n",
+@@ -381,9 +413,7 @@ static struct inode *zonefs_alloc_inode(struct super_block *sb)
+       inode_init_once(&zi->i_vnode);
+       mutex_init(&zi->i_truncate_mutex);
+-      zi->i_wpoffset = 0;
+       zi->i_wr_refcnt = 0;
+-      zi->i_flags = 0;
+       return &zi->i_vnode;
+ }
+@@ -416,8 +446,8 @@ static int zonefs_statfs(struct dentry *dentry, struct kstatfs *buf)
+       buf->f_bavail = buf->f_bfree;
+       for (t = 0; t < ZONEFS_ZTYPE_MAX; t++) {
+-              if (sbi->s_nr_files[t])
+-                      buf->f_files += sbi->s_nr_files[t] + 1;
++              if (sbi->s_zgroup[t].g_nr_zones)
++                      buf->f_files += sbi->s_zgroup[t].g_nr_zones + 1;
+       }
+       buf->f_ffree = 0;
+@@ -557,11 +587,11 @@ static const struct inode_operations zonefs_dir_inode_operations = {
+ };
+ static void zonefs_init_dir_inode(struct inode *parent, struct inode *inode,
+-                                enum zonefs_ztype type)
++                                enum zonefs_ztype ztype)
+ {
+       struct super_block *sb = parent->i_sb;
+-      inode->i_ino = bdev_nr_zones(sb->s_bdev) + type + 1;
++      inode->i_ino = bdev_nr_zones(sb->s_bdev) + ztype + 1;
+       inode_init_owner(&init_user_ns, inode, parent, S_IFDIR | 0555);
+       inode->i_op = &zonefs_dir_inode_operations;
+       inode->i_fop = &simple_dir_operations;
+@@ -573,79 +603,34 @@ static const struct inode_operations zonefs_file_inode_operations = {
+       .setattr        = zonefs_inode_setattr,
+ };
+-static int zonefs_init_file_inode(struct inode *inode, struct blk_zone *zone,
+-                                enum zonefs_ztype type)
++static void zonefs_init_file_inode(struct inode *inode,
++                                 struct zonefs_zone *z)
+ {
+       struct super_block *sb = inode->i_sb;
+       struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
+-      struct zonefs_inode_info *zi = ZONEFS_I(inode);
+-      int ret = 0;
+-
+-      inode->i_ino = zone->start >> sbi->s_zone_sectors_shift;
+-      inode->i_mode = S_IFREG | sbi->s_perm;
+-      if (type == ZONEFS_ZTYPE_CNV)
+-              zi->i_flags |= ZONEFS_ZONE_CNV;
+-
+-      zi->i_zsector = zone->start;
+-      zi->i_zone_size = zone->len << SECTOR_SHIFT;
+-      if (zi->i_zone_size > bdev_zone_sectors(sb->s_bdev) << SECTOR_SHIFT &&
+-          !(sbi->s_features & ZONEFS_F_AGGRCNV)) {
+-              zonefs_err(sb,
+-                         "zone size %llu doesn't match device's zone sectors %llu\n",
+-                         zi->i_zone_size,
+-                         bdev_zone_sectors(sb->s_bdev) << SECTOR_SHIFT);
+-              return -EINVAL;
+-      }
+-
+-      zi->i_max_size = min_t(loff_t, MAX_LFS_FILESIZE,
+-                             zone->capacity << SECTOR_SHIFT);
+-      zi->i_wpoffset = zonefs_check_zone_condition(inode, zone);
++      inode->i_private = z;
++      inode->i_ino = z->z_sector >> sbi->s_zone_sectors_shift;
++      inode->i_mode = S_IFREG | sbi->s_perm;
+       inode->i_uid = sbi->s_uid;
+       inode->i_gid = sbi->s_gid;
+-      inode->i_size = zi->i_wpoffset;
+-      inode->i_blocks = zi->i_max_size >> SECTOR_SHIFT;
++      inode->i_size = z->z_wpoffset;
++      inode->i_blocks = z->z_capacity >> SECTOR_SHIFT;
+       inode->i_op = &zonefs_file_inode_operations;
+       inode->i_fop = &zonefs_file_operations;
+       inode->i_mapping->a_ops = &zonefs_file_aops;
+       /* Update the inode access rights depending on the zone condition */
+-      zi->i_flags |= ZONEFS_ZONE_INIT_MODE;
++      z->z_flags |= ZONEFS_ZONE_INIT_MODE;
+       zonefs_inode_update_mode(inode);
+-
+-      sb->s_maxbytes = max(zi->i_max_size, sb->s_maxbytes);
+-      sbi->s_blocks += zi->i_max_size >> sb->s_blocksize_bits;
+-      sbi->s_used_blocks += zi->i_wpoffset >> sb->s_blocksize_bits;
+-
+-      mutex_lock(&zi->i_truncate_mutex);
+-
+-      /*
+-       * For sequential zones, make sure that any open zone is closed first
+-       * to ensure that the initial number of open zones is 0, in sync with
+-       * the open zone accounting done when the mount option
+-       * ZONEFS_MNTOPT_EXPLICIT_OPEN is used.
+-       */
+-      if (type == ZONEFS_ZTYPE_SEQ &&
+-          (zone->cond == BLK_ZONE_COND_IMP_OPEN ||
+-           zone->cond == BLK_ZONE_COND_EXP_OPEN)) {
+-              ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_CLOSE);
+-              if (ret)
+-                      goto unlock;
+-      }
+-
+-      zonefs_account_active(inode);
+-
+-unlock:
+-      mutex_unlock(&zi->i_truncate_mutex);
+-
+-      return ret;
+ }
+ static struct dentry *zonefs_create_inode(struct dentry *parent,
+-                                      const char *name, struct blk_zone *zone,
+-                                      enum zonefs_ztype type)
++                                        const char *name,
++                                        struct zonefs_zone *z,
++                                        enum zonefs_ztype ztype)
+ {
+       struct inode *dir = d_inode(parent);
+       struct dentry *dentry;
+@@ -661,15 +646,10 @@ static struct dentry *zonefs_create_inode(struct dentry *parent,
+               goto dput;
+       inode->i_ctime = inode->i_mtime = inode->i_atime = dir->i_ctime;
+-      if (zone) {
+-              ret = zonefs_init_file_inode(inode, zone, type);
+-              if (ret) {
+-                      iput(inode);
+-                      goto dput;
+-              }
+-      } else {
+-              zonefs_init_dir_inode(dir, inode, type);
+-      }
++      if (z)
++              zonefs_init_file_inode(inode, z);
++      else
++              zonefs_init_dir_inode(dir, inode, ztype);
+       d_add(dentry, inode);
+       dir->i_size++;
+@@ -685,100 +665,51 @@ static struct dentry *zonefs_create_inode(struct dentry *parent,
+ struct zonefs_zone_data {
+       struct super_block      *sb;
+       unsigned int            nr_zones[ZONEFS_ZTYPE_MAX];
++      sector_t                cnv_zone_start;
+       struct blk_zone         *zones;
+ };
+ /*
+- * Create a zone group and populate it with zone files.
++ * Create the inodes for a zone group.
+  */
+-static int zonefs_create_zgroup(struct zonefs_zone_data *zd,
+-                              enum zonefs_ztype type)
++static int zonefs_create_zgroup_inodes(struct super_block *sb,
++                                     enum zonefs_ztype ztype)
+ {
+-      struct super_block *sb = zd->sb;
+       struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
+-      struct blk_zone *zone, *next, *end;
+-      const char *zgroup_name;
+-      char *file_name;
++      struct zonefs_zone_group *zgroup = &sbi->s_zgroup[ztype];
+       struct dentry *dir, *dent;
+-      unsigned int n = 0;
+-      int ret;
++      char *file_name;
++      int i, ret = 0;
++
++      if (!zgroup)
++              return -ENOMEM;
+       /* If the group is empty, there is nothing to do */
+-      if (!zd->nr_zones[type])
++      if (!zgroup->g_nr_zones)
+               return 0;
+       file_name = kmalloc(ZONEFS_NAME_MAX, GFP_KERNEL);
+       if (!file_name)
+               return -ENOMEM;
+-      if (type == ZONEFS_ZTYPE_CNV)
+-              zgroup_name = "cnv";
+-      else
+-              zgroup_name = "seq";
+-
+-      dir = zonefs_create_inode(sb->s_root, zgroup_name, NULL, type);
++      dir = zonefs_create_inode(sb->s_root, zonefs_zgroup_name(ztype),
++                                NULL, ztype);
+       if (IS_ERR(dir)) {
+               ret = PTR_ERR(dir);
+               goto free;
+       }
+-      /*
+-       * The first zone contains the super block: skip it.
+-       */
+-      end = zd->zones + bdev_nr_zones(sb->s_bdev);
+-      for (zone = &zd->zones[1]; zone < end; zone = next) {
+-
+-              next = zone + 1;
+-              if (zonefs_zone_type(zone) != type)
+-                      continue;
+-
+-              /*
+-               * For conventional zones, contiguous zones can be aggregated
+-               * together to form larger files. Note that this overwrites the
+-               * length of the first zone of the set of contiguous zones
+-               * aggregated together. If one offline or read-only zone is
+-               * found, assume that all zones aggregated have the same
+-               * condition.
+-               */
+-              if (type == ZONEFS_ZTYPE_CNV &&
+-                  (sbi->s_features & ZONEFS_F_AGGRCNV)) {
+-                      for (; next < end; next++) {
+-                              if (zonefs_zone_type(next) != type)
+-                                      break;
+-                              zone->len += next->len;
+-                              zone->capacity += next->capacity;
+-                              if (next->cond == BLK_ZONE_COND_READONLY &&
+-                                  zone->cond != BLK_ZONE_COND_OFFLINE)
+-                                      zone->cond = BLK_ZONE_COND_READONLY;
+-                              else if (next->cond == BLK_ZONE_COND_OFFLINE)
+-                                      zone->cond = BLK_ZONE_COND_OFFLINE;
+-                      }
+-                      if (zone->capacity != zone->len) {
+-                              zonefs_err(sb, "Invalid conventional zone capacity\n");
+-                              ret = -EINVAL;
+-                              goto free;
+-                      }
+-              }
+-
+-              /*
+-               * Use the file number within its group as file name.
+-               */
+-              snprintf(file_name, ZONEFS_NAME_MAX - 1, "%u", n);
+-              dent = zonefs_create_inode(dir, file_name, zone, type);
++      for (i = 0; i < zgroup->g_nr_zones; i++) {
++              /* Use the zone number within its group as the file name */
++              snprintf(file_name, ZONEFS_NAME_MAX - 1, "%u", i);
++              dent = zonefs_create_inode(dir, file_name,
++                                         &zgroup->g_zones[i], ztype);
+               if (IS_ERR(dent)) {
+                       ret = PTR_ERR(dent);
+-                      goto free;
++                      break;
+               }
+-
+-              n++;
+       }
+-      zonefs_info(sb, "Zone group \"%s\" has %u file%s\n",
+-                  zgroup_name, n, n > 1 ? "s" : "");
+-
+-      sbi->s_nr_files[type] = n;
+-      ret = 0;
+-
+ free:
+       kfree(file_name);
+@@ -789,21 +720,38 @@ static int zonefs_get_zone_info_cb(struct blk_zone *zone, unsigned int idx,
+                                  void *data)
+ {
+       struct zonefs_zone_data *zd = data;
++      struct super_block *sb = zd->sb;
++      struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
++
++      /*
++       * We do not care about the first zone: it contains the super block
++       * and is not exposed as a file.
++       */
++      if (!idx)
++              return 0;
+       /*
+-       * Count the number of usable zones: the first zone at index 0 contains
+-       * the super block and is ignored.
++       * Count the number of zones that will be exposed as files.
++       * For sequential zones, we always have as many files as zones.
++       * For conventional zones, the number of files depends on whether
++       * conventional zone aggregation is enabled.
+        */
+       switch (zone->type) {
+       case BLK_ZONE_TYPE_CONVENTIONAL:
+-              zone->wp = zone->start + zone->len;
+-              if (idx)
+-                      zd->nr_zones[ZONEFS_ZTYPE_CNV]++;
++              if (sbi->s_features & ZONEFS_F_AGGRCNV) {
++                      /* One file per set of contiguous conventional zones */
++                      if (!(sbi->s_zgroup[ZONEFS_ZTYPE_CNV].g_nr_zones) ||
++                          zone->start != zd->cnv_zone_start)
++                              sbi->s_zgroup[ZONEFS_ZTYPE_CNV].g_nr_zones++;
++                      zd->cnv_zone_start = zone->start + zone->len;
++              } else {
++                      /* One file per zone */
++                      sbi->s_zgroup[ZONEFS_ZTYPE_CNV].g_nr_zones++;
++              }
+               break;
+       case BLK_ZONE_TYPE_SEQWRITE_REQ:
+       case BLK_ZONE_TYPE_SEQWRITE_PREF:
+-              if (idx)
+-                      zd->nr_zones[ZONEFS_ZTYPE_SEQ]++;
++              sbi->s_zgroup[ZONEFS_ZTYPE_SEQ].g_nr_zones++;
+               break;
+       default:
+               zonefs_err(zd->sb, "Unsupported zone type 0x%x\n",
+@@ -843,11 +791,173 @@ static int zonefs_get_zone_info(struct zonefs_zone_data *zd)
+       return 0;
+ }
+-static inline void zonefs_cleanup_zone_info(struct zonefs_zone_data *zd)
++static inline void zonefs_free_zone_info(struct zonefs_zone_data *zd)
+ {
+       kvfree(zd->zones);
+ }
++/*
++ * Create a zone group and populate it with zone files.
++ */
++static int zonefs_init_zgroup(struct super_block *sb,
++                            struct zonefs_zone_data *zd,
++                            enum zonefs_ztype ztype)
++{
++      struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
++      struct zonefs_zone_group *zgroup = &sbi->s_zgroup[ztype];
++      struct blk_zone *zone, *next, *end;
++      struct zonefs_zone *z;
++      unsigned int n = 0;
++      int ret;
++
++      /* Allocate the zone group. If it is empty, we have nothing to do. */
++      if (!zgroup->g_nr_zones)
++              return 0;
++
++      zgroup->g_zones = kvcalloc(zgroup->g_nr_zones,
++                                 sizeof(struct zonefs_zone), GFP_KERNEL);
++      if (!zgroup->g_zones)
++              return -ENOMEM;
++
++      /*
++       * Initialize the zone groups using the device zone information.
++       * We always skip the first zone as it contains the super block
++       * and is not used to back a file.
++       */
++      end = zd->zones + bdev_nr_zones(sb->s_bdev);
++      for (zone = &zd->zones[1]; zone < end; zone = next) {
++
++              next = zone + 1;
++              if (zonefs_zone_type(zone) != ztype)
++                      continue;
++
++              if (WARN_ON_ONCE(n >= zgroup->g_nr_zones))
++                      return -EINVAL;
++
++              /*
++               * For conventional zones, contiguous zones can be aggregated
++               * together to form larger files. Note that this overwrites the
++               * length of the first zone of the set of contiguous zones
++               * aggregated together. If one offline or read-only zone is
++               * found, assume that all zones aggregated have the same
++               * condition.
++               */
++              if (ztype == ZONEFS_ZTYPE_CNV &&
++                  (sbi->s_features & ZONEFS_F_AGGRCNV)) {
++                      for (; next < end; next++) {
++                              if (zonefs_zone_type(next) != ztype)
++                                      break;
++                              zone->len += next->len;
++                              zone->capacity += next->capacity;
++                              if (next->cond == BLK_ZONE_COND_READONLY &&
++                                  zone->cond != BLK_ZONE_COND_OFFLINE)
++                                      zone->cond = BLK_ZONE_COND_READONLY;
++                              else if (next->cond == BLK_ZONE_COND_OFFLINE)
++                                      zone->cond = BLK_ZONE_COND_OFFLINE;
++                      }
++              }
++
++              z = &zgroup->g_zones[n];
++              if (ztype == ZONEFS_ZTYPE_CNV)
++                      z->z_flags |= ZONEFS_ZONE_CNV;
++              z->z_sector = zone->start;
++              z->z_size = zone->len << SECTOR_SHIFT;
++              if (z->z_size > bdev_zone_sectors(sb->s_bdev) << SECTOR_SHIFT &&
++                  !(sbi->s_features & ZONEFS_F_AGGRCNV)) {
++                      zonefs_err(sb,
++                              "Invalid zone size %llu (device zone sectors %llu)\n",
++                              z->z_size,
++                              bdev_zone_sectors(sb->s_bdev) << SECTOR_SHIFT);
++                      return -EINVAL;
++              }
++
++              z->z_capacity = min_t(loff_t, MAX_LFS_FILESIZE,
++                                    zone->capacity << SECTOR_SHIFT);
++              z->z_wpoffset = zonefs_check_zone_condition(sb, z, zone);
++
++              sb->s_maxbytes = max(z->z_capacity, sb->s_maxbytes);
++              sbi->s_blocks += z->z_capacity >> sb->s_blocksize_bits;
++              sbi->s_used_blocks += z->z_wpoffset >> sb->s_blocksize_bits;
++
++              /*
++               * For sequential zones, make sure that any open zone is closed
++               * first to ensure that the initial number of open zones is 0,
++               * in sync with the open zone accounting done when the mount
++               * option ZONEFS_MNTOPT_EXPLICIT_OPEN is used.
++               */
++              if (ztype == ZONEFS_ZTYPE_SEQ &&
++                  (zone->cond == BLK_ZONE_COND_IMP_OPEN ||
++                   zone->cond == BLK_ZONE_COND_EXP_OPEN)) {
++                      ret = zonefs_zone_mgmt(sb, z, REQ_OP_ZONE_CLOSE);
++                      if (ret)
++                              return ret;
++              }
++
++              zonefs_account_active(sb, z);
++
++              n++;
++      }
++
++      if (WARN_ON_ONCE(n != zgroup->g_nr_zones))
++              return -EINVAL;
++
++      zonefs_info(sb, "Zone group \"%s\" has %u file%s\n",
++                  zonefs_zgroup_name(ztype),
++                  zgroup->g_nr_zones,
++                  zgroup->g_nr_zones > 1 ? "s" : "");
++
++      return 0;
++}
++
++static void zonefs_free_zgroups(struct super_block *sb)
++{
++      struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
++      enum zonefs_ztype ztype;
++
++      if (!sbi)
++              return;
++
++      for (ztype = 0; ztype < ZONEFS_ZTYPE_MAX; ztype++) {
++              kvfree(sbi->s_zgroup[ztype].g_zones);
++              sbi->s_zgroup[ztype].g_zones = NULL;
++      }
++}
++
++/*
++ * Create a zone group and populate it with zone files.
++ */
++static int zonefs_init_zgroups(struct super_block *sb)
++{
++      struct zonefs_zone_data zd;
++      enum zonefs_ztype ztype;
++      int ret;
++
++      /* First get the device zone information */
++      memset(&zd, 0, sizeof(struct zonefs_zone_data));
++      zd.sb = sb;
++      ret = zonefs_get_zone_info(&zd);
++      if (ret)
++              goto cleanup;
++
++      /* Allocate and initialize the zone groups */
++      for (ztype = 0; ztype < ZONEFS_ZTYPE_MAX; ztype++) {
++              ret = zonefs_init_zgroup(sb, &zd, ztype);
++              if (ret) {
++                      zonefs_info(sb,
++                                  "Zone group \"%s\" initialization failed\n",
++                                  zonefs_zgroup_name(ztype));
++                      break;
++              }
++      }
++
++cleanup:
++      zonefs_free_zone_info(&zd);
++      if (ret)
++              zonefs_free_zgroups(sb);
++
++      return ret;
++}
++
+ /*
+  * Read super block information from the device.
+  */
+@@ -945,7 +1055,6 @@ static const struct super_operations zonefs_sops = {
+  */
+ static int zonefs_fill_super(struct super_block *sb, void *data, int silent)
+ {
+-      struct zonefs_zone_data zd;
+       struct zonefs_sb_info *sbi;
+       struct inode *inode;
+       enum zonefs_ztype t;
+@@ -998,16 +1107,6 @@ static int zonefs_fill_super(struct super_block *sb, void *data, int silent)
+       if (ret)
+               return ret;
+-      memset(&zd, 0, sizeof(struct zonefs_zone_data));
+-      zd.sb = sb;
+-      ret = zonefs_get_zone_info(&zd);
+-      if (ret)
+-              goto cleanup;
+-
+-      ret = zonefs_sysfs_register(sb);
+-      if (ret)
+-              goto cleanup;
+-
+       zonefs_info(sb, "Mounting %u zones", bdev_nr_zones(sb->s_bdev));
+       if (!sbi->s_max_wro_seq_files &&
+@@ -1018,6 +1117,11 @@ static int zonefs_fill_super(struct super_block *sb, void *data, int silent)
+               sbi->s_mount_opts &= ~ZONEFS_MNTOPT_EXPLICIT_OPEN;
+       }
++      /* Initialize the zone groups */
++      ret = zonefs_init_zgroups(sb);
++      if (ret)
++              goto cleanup;
++
+       /* Create root directory inode */
+       ret = -ENOMEM;
+       inode = new_inode(sb);
+@@ -1037,13 +1141,19 @@ static int zonefs_fill_super(struct super_block *sb, void *data, int silent)
+       /* Create and populate files in zone groups directories */
+       for (t = 0; t < ZONEFS_ZTYPE_MAX; t++) {
+-              ret = zonefs_create_zgroup(&zd, t);
++              ret = zonefs_create_zgroup_inodes(sb, t);
+               if (ret)
+-                      break;
++                      goto cleanup;
+       }
++      ret = zonefs_sysfs_register(sb);
++      if (ret)
++              goto cleanup;
++
++      return 0;
++
+ cleanup:
+-      zonefs_cleanup_zone_info(&zd);
++      zonefs_free_zgroups(sb);
+       return ret;
+ }
+@@ -1062,6 +1172,7 @@ static void zonefs_kill_super(struct super_block *sb)
+               d_genocide(sb->s_root);
+       zonefs_sysfs_unregister(sb);
++      zonefs_free_zgroups(sb);
+       kill_block_super(sb);
+       kfree(sbi);
+ }
+diff --git a/fs/zonefs/trace.h b/fs/zonefs/trace.h
+index 42edcfd393ed2..9969db3a9c7dc 100644
+--- a/fs/zonefs/trace.h
++++ b/fs/zonefs/trace.h
+@@ -20,8 +20,9 @@
+ #define show_dev(dev) MAJOR(dev), MINOR(dev)
+ TRACE_EVENT(zonefs_zone_mgmt,
+-          TP_PROTO(struct inode *inode, enum req_op op),
+-          TP_ARGS(inode, op),
++          TP_PROTO(struct super_block *sb, struct zonefs_zone *z,
++                   enum req_op op),
++          TP_ARGS(sb, z, op),
+           TP_STRUCT__entry(
+                            __field(dev_t, dev)
+                            __field(ino_t, ino)
+@@ -30,12 +31,12 @@ TRACE_EVENT(zonefs_zone_mgmt,
+                            __field(sector_t, nr_sectors)
+           ),
+           TP_fast_assign(
+-                         __entry->dev = inode->i_sb->s_dev;
+-                         __entry->ino = inode->i_ino;
++                         __entry->dev = sb->s_dev;
++                         __entry->ino =
++                              z->z_sector >> ZONEFS_SB(sb)->s_zone_sectors_shift;
+                          __entry->op = op;
+-                         __entry->sector = ZONEFS_I(inode)->i_zsector;
+-                         __entry->nr_sectors =
+-                                 ZONEFS_I(inode)->i_zone_size >> SECTOR_SHIFT;
++                         __entry->sector = z->z_sector;
++                         __entry->nr_sectors = z->z_size >> SECTOR_SHIFT;
+           ),
+           TP_printk("bdev=(%d,%d), ino=%lu op=%s, sector=%llu, nr_sectors=%llu",
+                     show_dev(__entry->dev), (unsigned long)__entry->ino,
+@@ -58,9 +59,10 @@ TRACE_EVENT(zonefs_file_dio_append,
+           TP_fast_assign(
+                          __entry->dev = inode->i_sb->s_dev;
+                          __entry->ino = inode->i_ino;
+-                         __entry->sector = ZONEFS_I(inode)->i_zsector;
++                         __entry->sector = zonefs_inode_zone(inode)->z_sector;
+                          __entry->size = size;
+-                         __entry->wpoffset = ZONEFS_I(inode)->i_wpoffset;
++                         __entry->wpoffset =
++                              zonefs_inode_zone(inode)->z_wpoffset;
+                          __entry->ret = ret;
+           ),
+           TP_printk("bdev=(%d, %d), ino=%lu, sector=%llu, size=%zu, wpoffset=%llu, ret=%zu",
+diff --git a/fs/zonefs/zonefs.h b/fs/zonefs/zonefs.h
+index 1a225f74015a0..2d626e18b1411 100644
+--- a/fs/zonefs/zonefs.h
++++ b/fs/zonefs/zonefs.h
+@@ -47,22 +47,39 @@ static inline enum zonefs_ztype zonefs_zone_type(struct blk_zone *zone)
+ #define ZONEFS_ZONE_CNV               (1U << 31)
+ /*
+- * In-memory inode data.
++ * In-memory per-file inode zone data.
+  */
+-struct zonefs_inode_info {
+-      struct inode            i_vnode;
++struct zonefs_zone {
++      /* Zone state flags */
++      unsigned int            z_flags;
+-      /* File zone start sector (512B unit) */
+-      sector_t                i_zsector;
++      /* Zone start sector (512B unit) */
++      sector_t                z_sector;
+-      /* File zone write pointer position (sequential zones only) */
+-      loff_t                  i_wpoffset;
++      /* Zone size (bytes) */
++      loff_t                  z_size;
+-      /* File maximum size */
+-      loff_t                  i_max_size;
++      /* Zone capacity (file maximum size, bytes) */
++      loff_t                  z_capacity;
+-      /* File zone size */
+-      loff_t                  i_zone_size;
++      /* Write pointer offset in the zone (sequential zones only, bytes) */
++      loff_t                  z_wpoffset;
++};
++
++/*
++ * In memory zone group information: all zones of a group are exposed
++ * as files, one file per zone.
++ */
++struct zonefs_zone_group {
++      unsigned int            g_nr_zones;
++      struct zonefs_zone      *g_zones;
++};
++
++/*
++ * In-memory inode data.
++ */
++struct zonefs_inode_info {
++      struct inode            i_vnode;
+       /*
+        * To serialise fully against both syscall and mmap based IO and
+@@ -81,7 +98,6 @@ struct zonefs_inode_info {
+       /* guarded by i_truncate_mutex */
+       unsigned int            i_wr_refcnt;
+-      unsigned int            i_flags;
+ };
+ static inline struct zonefs_inode_info *ZONEFS_I(struct inode *inode)
+@@ -89,24 +105,29 @@ static inline struct zonefs_inode_info *ZONEFS_I(struct inode *inode)
+       return container_of(inode, struct zonefs_inode_info, i_vnode);
+ }
+-static inline bool zonefs_zone_is_cnv(struct zonefs_inode_info *zi)
++static inline bool zonefs_zone_is_cnv(struct zonefs_zone *z)
++{
++      return z->z_flags & ZONEFS_ZONE_CNV;
++}
++
++static inline bool zonefs_zone_is_seq(struct zonefs_zone *z)
+ {
+-      return zi->i_flags & ZONEFS_ZONE_CNV;
++      return !zonefs_zone_is_cnv(z);
+ }
+-static inline bool zonefs_zone_is_seq(struct zonefs_inode_info *zi)
++static inline struct zonefs_zone *zonefs_inode_zone(struct inode *inode)
+ {
+-      return !zonefs_zone_is_cnv(zi);
++      return inode->i_private;
+ }
+ static inline bool zonefs_inode_is_cnv(struct inode *inode)
+ {
+-      return zonefs_zone_is_cnv(ZONEFS_I(inode));
++      return zonefs_zone_is_cnv(zonefs_inode_zone(inode));
+ }
+ static inline bool zonefs_inode_is_seq(struct inode *inode)
+ {
+-      return zonefs_zone_is_seq(ZONEFS_I(inode));
++      return zonefs_zone_is_seq(zonefs_inode_zone(inode));
+ }
+ /*
+@@ -200,7 +221,7 @@ struct zonefs_sb_info {
+       uuid_t                  s_uuid;
+       unsigned int            s_zone_sectors_shift;
+-      unsigned int            s_nr_files[ZONEFS_ZTYPE_MAX];
++      struct zonefs_zone_group s_zgroup[ZONEFS_ZTYPE_MAX];
+       loff_t                  s_blocks;
+       loff_t                  s_used_blocks;
+@@ -229,8 +250,8 @@ static inline struct zonefs_sb_info *ZONEFS_SB(struct super_block *sb)
+       pr_warn("zonefs (%s) WARNING: " format, sb->s_id, ## args)
+ /* In super.c */
+-void zonefs_account_active(struct inode *inode);
+-int zonefs_zone_mgmt(struct inode *inode, enum req_op op);
++void zonefs_inode_account_active(struct inode *inode);
++int zonefs_inode_zone_mgmt(struct inode *inode, enum req_op op);
+ void zonefs_i_size_write(struct inode *inode, loff_t isize);
+ void zonefs_update_stats(struct inode *inode, loff_t new_isize);
+ void __zonefs_io_error(struct inode *inode, bool write);
+-- 
+2.39.2
+
diff --git a/queue-6.2/zonefs-simplify-io-error-handling.patch b/queue-6.2/zonefs-simplify-io-error-handling.patch
new file mode 100644 (file)
index 0000000..413df01
--- /dev/null
@@ -0,0 +1,244 @@
+From c0aa18f0ded6555cb0c0a9063a9295abb942e405 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 25 Nov 2022 11:06:20 +0900
+Subject: zonefs: Simplify IO error handling
+
+From: Damien Le Moal <damien.lemoal@opensource.wdc.com>
+
+[ Upstream commit 46a9c526eef7fb68a00321e2a9591ce5276ae92b ]
+
+Simplify zonefs_check_zone_condition() by moving the code that changes
+an inode's access rights to the new function zonefs_inode_update_mode().
+Furthermore, since on mount an inode's wpoffset is always zero when
+zonefs_check_zone_condition() is called during inode initialization,
+the "mount" boolean argument is not necessary for the readonly zone
+case. This argument is thus removed.
+
+zonefs_io_error_cb() is also modified to use the inode offline and
+zone state flags instead of checking the device zone condition. The
+multiple calls to zonefs_check_zone_condition() are reduced to the first
+call on entry, which allows removing the "warn" argument.
+zonefs_inode_update_mode() is also used to update an inode's access rights
+as zonefs_io_error_cb() modifies the inode flags depending on the volume
+error handling mode (defined with a mount option). Since an inode mode
+change differs for read-only zones between mount time and IO error time,
+the flag ZONEFS_ZONE_INIT_MODE is used to differentiate both cases.
+
+Signed-off-by: Damien Le Moal <damien.lemoal@opensource.wdc.com>
+Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
+Stable-dep-of: 88b170088ad2 ("zonefs: Fix error message in zonefs_file_dio_append()")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/zonefs/super.c  | 110 ++++++++++++++++++++++++---------------------
+ fs/zonefs/zonefs.h |   9 ++--
+ 2 files changed, 64 insertions(+), 55 deletions(-)
+
+diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
+index e808276b88018..6307cc95be061 100644
+--- a/fs/zonefs/super.c
++++ b/fs/zonefs/super.c
+@@ -155,48 +155,31 @@ void zonefs_update_stats(struct inode *inode, loff_t new_isize)
+  * amount of readable data in the zone.
+  */
+ static loff_t zonefs_check_zone_condition(struct inode *inode,
+-                                        struct blk_zone *zone, bool warn,
+-                                        bool mount)
++                                        struct blk_zone *zone)
+ {
+       struct zonefs_inode_info *zi = ZONEFS_I(inode);
+       switch (zone->cond) {
+       case BLK_ZONE_COND_OFFLINE:
+-              /*
+-               * Dead zone: make the inode immutable, disable all accesses
+-               * and set the file size to 0 (zone wp set to zone start).
+-               */
+-              if (warn)
+-                      zonefs_warn(inode->i_sb, "inode %lu: offline zone\n",
+-                                  inode->i_ino);
+-              inode->i_flags |= S_IMMUTABLE;
+-              inode->i_mode &= ~0777;
+-              zone->wp = zone->start;
++              zonefs_warn(inode->i_sb, "inode %lu: offline zone\n",
++                          inode->i_ino);
+               zi->i_flags |= ZONEFS_ZONE_OFFLINE;
+               return 0;
+       case BLK_ZONE_COND_READONLY:
+               /*
+-               * The write pointer of read-only zones is invalid. If such a
+-               * zone is found during mount, the file size cannot be retrieved
+-               * so we treat the zone as offline (mount == true case).
+-               * Otherwise, keep the file size as it was when last updated
+-               * so that the user can recover data. In both cases, writes are
+-               * always disabled for the zone.
++               * The write pointer of read-only zones is invalid, so we cannot
++               * determine the zone wpoffset (inode size). We thus keep the
++               * zone wpoffset as is, which leads to an empty file
++               * (wpoffset == 0) on mount. For a runtime error, this keeps
++               * the inode size as it was when last updated so that the user
++               * can recover data.
+                */
+-              if (warn)
+-                      zonefs_warn(inode->i_sb, "inode %lu: read-only zone\n",
+-                                  inode->i_ino);
+-              inode->i_flags |= S_IMMUTABLE;
+-              if (mount) {
+-                      zone->cond = BLK_ZONE_COND_OFFLINE;
+-                      inode->i_mode &= ~0777;
+-                      zone->wp = zone->start;
+-                      zi->i_flags |= ZONEFS_ZONE_OFFLINE;
+-                      return 0;
+-              }
++              zonefs_warn(inode->i_sb, "inode %lu: read-only zone\n",
++                          inode->i_ino);
+               zi->i_flags |= ZONEFS_ZONE_READONLY;
+-              inode->i_mode &= ~0222;
+-              return i_size_read(inode);
++              if (zi->i_ztype == ZONEFS_ZTYPE_CNV)
++                      return zi->i_max_size;
++              return zi->i_wpoffset;
+       case BLK_ZONE_COND_FULL:
+               /* The write pointer of full zones is invalid. */
+               return zi->i_max_size;
+@@ -207,6 +190,30 @@ static loff_t zonefs_check_zone_condition(struct inode *inode,
+       }
+ }
++/*
++ * Check a zone condition and adjust its inode access permissions for
++ * offline and readonly zones.
++ */
++static void zonefs_inode_update_mode(struct inode *inode)
++{
++      struct zonefs_inode_info *zi = ZONEFS_I(inode);
++
++      if (zi->i_flags & ZONEFS_ZONE_OFFLINE) {
++              /* Offline zones cannot be read nor written */
++              inode->i_flags |= S_IMMUTABLE;
++              inode->i_mode &= ~0777;
++      } else if (zi->i_flags & ZONEFS_ZONE_READONLY) {
++              /* Readonly zones cannot be written */
++              inode->i_flags |= S_IMMUTABLE;
++              if (zi->i_flags & ZONEFS_ZONE_INIT_MODE)
++                      inode->i_mode &= ~0777;
++              else
++                      inode->i_mode &= ~0222;
++      }
++
++      zi->i_flags &= ~ZONEFS_ZONE_INIT_MODE;
++}
++
+ struct zonefs_ioerr_data {
+       struct inode    *inode;
+       bool            write;
+@@ -228,10 +235,9 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx,
+        * as there is no inconsistency between the inode size and the amount of
+        * data written in the zone (data_size).
+        */
+-      data_size = zonefs_check_zone_condition(inode, zone, true, false);
++      data_size = zonefs_check_zone_condition(inode, zone);
+       isize = i_size_read(inode);
+-      if (zone->cond != BLK_ZONE_COND_OFFLINE &&
+-          zone->cond != BLK_ZONE_COND_READONLY &&
++      if (!(zi->i_flags & (ZONEFS_ZONE_READONLY | ZONEFS_ZONE_OFFLINE)) &&
+           !err->write && isize == data_size)
+               return 0;
+@@ -264,24 +270,22 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx,
+        * zone condition to read-only and offline respectively, as if the
+        * condition was signaled by the hardware.
+        */
+-      if (zone->cond == BLK_ZONE_COND_OFFLINE ||
+-          sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_ZOL) {
++      if ((zi->i_flags & ZONEFS_ZONE_OFFLINE) ||
++          (sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_ZOL)) {
+               zonefs_warn(sb, "inode %lu: read/write access disabled\n",
+                           inode->i_ino);
+-              if (zone->cond != BLK_ZONE_COND_OFFLINE) {
+-                      zone->cond = BLK_ZONE_COND_OFFLINE;
+-                      data_size = zonefs_check_zone_condition(inode, zone,
+-                                                              false, false);
+-              }
+-      } else if (zone->cond == BLK_ZONE_COND_READONLY ||
+-                 sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_ZRO) {
++              if (!(zi->i_flags & ZONEFS_ZONE_OFFLINE))
++                      zi->i_flags |= ZONEFS_ZONE_OFFLINE;
++              zonefs_inode_update_mode(inode);
++              data_size = 0;
++      } else if ((zi->i_flags & ZONEFS_ZONE_READONLY) ||
++                 (sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_ZRO)) {
+               zonefs_warn(sb, "inode %lu: write access disabled\n",
+                           inode->i_ino);
+-              if (zone->cond != BLK_ZONE_COND_READONLY) {
+-                      zone->cond = BLK_ZONE_COND_READONLY;
+-                      data_size = zonefs_check_zone_condition(inode, zone,
+-                                                              false, false);
+-              }
++              if (!(zi->i_flags & ZONEFS_ZONE_READONLY))
++                      zi->i_flags |= ZONEFS_ZONE_READONLY;
++              zonefs_inode_update_mode(inode);
++              data_size = isize;
+       } else if (sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_RO &&
+                  data_size > isize) {
+               /* Do not expose garbage data */
+@@ -295,8 +299,7 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx,
+        * close of the zone when the inode file is closed.
+        */
+       if ((sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN) &&
+-          (zone->cond == BLK_ZONE_COND_OFFLINE ||
+-           zone->cond == BLK_ZONE_COND_READONLY))
++          (zi->i_flags & (ZONEFS_ZONE_READONLY | ZONEFS_ZONE_OFFLINE)))
+               zi->i_flags &= ~ZONEFS_ZONE_OPEN;
+       /*
+@@ -378,6 +381,7 @@ static struct inode *zonefs_alloc_inode(struct super_block *sb)
+       inode_init_once(&zi->i_vnode);
+       mutex_init(&zi->i_truncate_mutex);
++      zi->i_wpoffset = 0;
+       zi->i_wr_refcnt = 0;
+       zi->i_flags = 0;
+@@ -594,7 +598,7 @@ static int zonefs_init_file_inode(struct inode *inode, struct blk_zone *zone,
+       zi->i_max_size = min_t(loff_t, MAX_LFS_FILESIZE,
+                              zone->capacity << SECTOR_SHIFT);
+-      zi->i_wpoffset = zonefs_check_zone_condition(inode, zone, true, true);
++      zi->i_wpoffset = zonefs_check_zone_condition(inode, zone);
+       inode->i_uid = sbi->s_uid;
+       inode->i_gid = sbi->s_gid;
+@@ -605,6 +609,10 @@ static int zonefs_init_file_inode(struct inode *inode, struct blk_zone *zone,
+       inode->i_fop = &zonefs_file_operations;
+       inode->i_mapping->a_ops = &zonefs_file_aops;
++      /* Update the inode access rights depending on the zone condition */
++      zi->i_flags |= ZONEFS_ZONE_INIT_MODE;
++      zonefs_inode_update_mode(inode);
++
+       sb->s_maxbytes = max(zi->i_max_size, sb->s_maxbytes);
+       sbi->s_blocks += zi->i_max_size >> sb->s_blocksize_bits;
+       sbi->s_used_blocks += zi->i_wpoffset >> sb->s_blocksize_bits;
+diff --git a/fs/zonefs/zonefs.h b/fs/zonefs/zonefs.h
+index 839ebe9afb6c1..439096445ee53 100644
+--- a/fs/zonefs/zonefs.h
++++ b/fs/zonefs/zonefs.h
+@@ -39,10 +39,11 @@ static inline enum zonefs_ztype zonefs_zone_type(struct blk_zone *zone)
+       return ZONEFS_ZTYPE_SEQ;
+ }
+-#define ZONEFS_ZONE_OPEN      (1U << 0)
+-#define ZONEFS_ZONE_ACTIVE    (1U << 1)
+-#define ZONEFS_ZONE_OFFLINE   (1U << 2)
+-#define ZONEFS_ZONE_READONLY  (1U << 3)
++#define ZONEFS_ZONE_INIT_MODE (1U << 0)
++#define ZONEFS_ZONE_OPEN      (1U << 1)
++#define ZONEFS_ZONE_ACTIVE    (1U << 2)
++#define ZONEFS_ZONE_OFFLINE   (1U << 3)
++#define ZONEFS_ZONE_READONLY  (1U << 4)
+ /*
+  * In-memory inode data.
+-- 
+2.39.2
+