git.ipfire.org Git - thirdparty/kernel/linux.git/commitdiff
ceph: add diagnostic timeout loop to wait_caps_flush()
author: Alex Markuze <amarkuze@redhat.com>
Thu, 7 May 2026 08:45:27 +0000 (08:45 +0000)
committer: Ilya Dryomov <idryomov@gmail.com>
Mon, 22 Jun 2026 20:44:56 +0000 (22:44 +0200)
Convert wait_caps_flush() from a silent indefinite wait into a diagnostic
wait loop that periodically dumps pending cap flush state.

The underlying wait semantics remain intact: callers still wait until the
requested cap flushes complete. The difference is that long stalls now
produce actionable diagnostics instead of looking like a silent hang.

CEPH_CAP_FLUSH_MAX_DUMP_ENTRIES limits the number of entries
emitted per diagnostic dump, and CEPH_CAP_FLUSH_MAX_DUMP_ITERS
limits the number of timed diagnostic dumps before the wait
continues silently.  When more entries exist than the per-dump
limit, a truncation count is reported.  When the dump iteration
limit is reached, a final suppression message is emitted so the
transition to silence is explicit.

The diagnostic dump collects flush entry data under cap_dirty_lock into
a bounded on-stack array, then prints after releasing the lock.  This
avoids holding the spinlock across printk calls.

A null cf->ci on the global flush list indicates a bug since all
cap_flush entries are initialized with a valid ci before being added.
Signal this with WARN_ON_ONCE while still printing enough context for
debugging.

READ_ONCE is used for the i_last_cap_flush_ack field, which is read
outside the inode lock domain. Flush tids are monotonically increasing
and acks are processed in order under i_ceph_lock, so the latest ack
tid is always the most recently written value.

Add a ci pointer to struct ceph_cap_flush so that the diagnostic
dump can identify which inode each pending flush belongs to.  The
new i_last_cap_flush_ack field tracks the latest acknowledged flush
tid per inode for diagnostic correlation.

This improves reset-drain observability and is also useful for
existing sync and writeback troubleshooting paths.

Signed-off-by: Alex Markuze <amarkuze@redhat.com>
Reviewed-by: Viacheslav Dubeyko <Slava.Dubeyko@ibm.com>
Signed-off-by: Viacheslav Dubeyko <Slava.Dubeyko@ibm.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
fs/ceph/caps.c
fs/ceph/inode.c
fs/ceph/mds_client.c
fs/ceph/mds_client.h
fs/ceph/super.h

index cb9e78b713d9f19a7cdc8be0bbb612333dadf704..4b37d9ffdf7f55167105dfcaec3be9188c8ee2d0 100644 (file)
@@ -1648,6 +1648,7 @@ static void __ceph_flush_snaps(struct ceph_inode_info *ci,
 
                spin_lock(&mdsc->cap_dirty_lock);
                capsnap->cap_flush.tid = ++mdsc->last_cap_flush_tid;
+               capsnap->cap_flush.ci = ci;
                list_add_tail(&capsnap->cap_flush.g_list,
                              &mdsc->cap_flush_list);
                if (oldest_flush_tid == 0)
@@ -1846,6 +1847,7 @@ struct ceph_cap_flush *ceph_alloc_cap_flush(void)
                return NULL;
 
        cf->is_capsnap = false;
+       cf->ci = NULL;
        return cf;
 }
 
@@ -1931,6 +1933,7 @@ static u64 __mark_caps_flushing(struct inode *inode,
        doutc(cl, "%p %llx.%llx now !dirty\n", inode, ceph_vinop(inode));
 
        swap(cf, ci->i_prealloc_cap_flush);
+       cf->ci = ci;
        cf->caps = flushing;
        cf->wake = wake;
 
@@ -3826,6 +3829,13 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
        bool wake_ci = false;
        bool wake_mdsc = false;
 
+       /*
+        * Flush tids are monotonically increasing and acks arrive in
+        * order under i_ceph_lock, so this is always the latest tid.
+        * Diagnostic readers use READ_ONCE() without holding the lock.
+        */
+       WRITE_ONCE(ci->i_last_cap_flush_ack, flush_tid);
+
        list_for_each_entry_safe(cf, tmp_cf, &ci->i_cap_flush_list, i_list) {
                /* Is this the one that was flushed? */
                if (cf->tid == flush_tid)
index 4871d7ab27309aab7551ed941e94f6750aa8c7aa..61d7c0b8161ff7533858a241d8c3871d2b102008 100644 (file)
@@ -671,6 +671,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
        INIT_LIST_HEAD(&ci->i_cap_snaps);
        ci->i_head_snapc = NULL;
        ci->i_snap_caps = 0;
+       ci->i_last_cap_flush_ack = 0;
 
        ci->i_last_rd = ci->i_last_wr = jiffies - 3600 * HZ;
        for (i = 0; i < CEPH_FILE_MODE_BITS; i++)
index 249419c17d3c469a5293705ea1ec4b641839d46c..eddd3ccf5bba86fab5dcda9ca5fd18b6e785171f 100644 (file)
@@ -2330,19 +2330,112 @@ static int check_caps_flush(struct ceph_mds_client *mdsc,
 }
 
 /*
- * flush all dirty inode data to disk.
+ * Snapshot of a single cap_flush entry for diagnostic dump.
+ * Collected under cap_dirty_lock, printed after releasing it.
+ */
+struct flush_dump_entry {
+       u64 ino;                /* inode number */
+       u64 snap;               /* snap id */
+       int caps;               /* dirty cap bits */
+       u64 tid;                /* flush transaction id */
+       u64 last_ack;           /* most recent ack tid for this inode */
+       bool wake;              /* whether completion was requested */
+       bool is_capsnap;        /* true if this is a cap snap flush */
+       bool ci_null;           /* true if cf->ci was unexpectedly NULL */
+};
+
+/*
+ * Dump pending cap flushes for diagnostic purposes.
  *
- * returns true if we've flushed through want_flush_tid
+ * cf->ci is safe to dereference here: cap_flush entries hold a
+ * reference on the inode (via the cap), and entries are removed from
+ * cap_flush_list under cap_dirty_lock before the cap (and thus the
+ * inode reference) is released.  Holding cap_dirty_lock therefore
+ * guarantees the inode remains valid for the lifetime of the scan.
+ */
+
+static void dump_cap_flushes(struct ceph_mds_client *mdsc, u64 want_tid)
+{
+       struct ceph_client *cl = mdsc->fsc->client;
+       struct flush_dump_entry entries[CEPH_CAP_FLUSH_MAX_DUMP_ENTRIES];
+       struct ceph_cap_flush *cf;
+       int n = 0, remaining = 0;
+       int i;
+
+       spin_lock(&mdsc->cap_dirty_lock);
+       list_for_each_entry(cf, &mdsc->cap_flush_list, g_list) {
+               if (cf->tid > want_tid)
+                       break;
+               if (n < CEPH_CAP_FLUSH_MAX_DUMP_ENTRIES) {
+                       struct flush_dump_entry *e = &entries[n++];
+
+                       e->ci_null = WARN_ON_ONCE(!cf->ci);
+                       if (!e->ci_null) {
+                               e->ino = ceph_ino(&cf->ci->netfs.inode);
+                               e->snap = ceph_snap(&cf->ci->netfs.inode);
+                               e->last_ack = READ_ONCE(cf->ci->i_last_cap_flush_ack);
+                       }
+                       e->caps = cf->caps;
+                       e->tid = cf->tid;
+                       e->wake = cf->wake;
+                       e->is_capsnap = cf->is_capsnap;
+               } else {
+                       remaining++;
+               }
+       }
+       spin_unlock(&mdsc->cap_dirty_lock);
+
+       pr_info_client(cl, "still waiting for cap flushes through %llu:\n",
+                      want_tid);
+       for (i = 0; i < n; i++) {
+               struct flush_dump_entry *e = &entries[i];
+
+               if (e->ci_null)
+                       pr_info_client(cl,
+                                      "  (null ci) %s tid=%llu wake=%d%s\n",
+                                      ceph_cap_string(e->caps), e->tid,
+                                      e->wake,
+                                      e->is_capsnap ? " is_capsnap" : "");
+               else
+                       pr_info_client(cl,
+                                      "  %llx.%llx %s tid=%llu last_ack=%llu wake=%d%s\n",
+                                      e->ino, e->snap,
+                                      ceph_cap_string(e->caps), e->tid,
+                                      e->last_ack, e->wake,
+                                      e->is_capsnap ? " is_capsnap" : "");
+       }
+       if (remaining)
+               pr_info_client(cl, "  ... and %d more pending flushes\n",
+                              remaining);
+}
+
+/*
+ * Wait for all cap flushes through @want_flush_tid to complete.
+ * Periodically dumps pending cap flush state for diagnostics.
  */
 static void wait_caps_flush(struct ceph_mds_client *mdsc,
                            u64 want_flush_tid)
 {
        struct ceph_client *cl = mdsc->fsc->client;
+       int i = 0;
+       long ret;
 
        doutc(cl, "want %llu\n", want_flush_tid);
 
-       wait_event(mdsc->cap_flushing_wq,
-                  check_caps_flush(mdsc, want_flush_tid));
+       do {
+               /* 60 * HZ fits in a long on all supported architectures. */
+               ret = wait_event_timeout(mdsc->cap_flushing_wq,
+                          check_caps_flush(mdsc, want_flush_tid),
+                          CEPH_CAP_FLUSH_WAIT_TIMEOUT_SEC * HZ);
+               if (ret == 0) {
+                       if (i < CEPH_CAP_FLUSH_MAX_DUMP_ITERS)
+                               dump_cap_flushes(mdsc, want_flush_tid);
+                       else if (i == CEPH_CAP_FLUSH_MAX_DUMP_ITERS)
+                               pr_info_client(cl,
+                                              "still waiting for cap flushes; suppressing further dumps\n");
+                       i++;
+               }
+       } while (ret == 0);
 
        doutc(cl, "ok, flushed thru %llu\n", want_flush_tid);
 }
index d873e784b0253e85d93185c89348ab6a544ddad0..8208fdf02efe8728fba6761d2410618659b6e3d0 100644 (file)
@@ -77,6 +77,9 @@ struct ceph_fs_client;
 struct ceph_cap;
 
 #define MDS_AUTH_UID_ANY -1
+#define CEPH_CAP_FLUSH_WAIT_TIMEOUT_SEC 60
+#define CEPH_CAP_FLUSH_MAX_DUMP_ENTRIES 5
+#define CEPH_CAP_FLUSH_MAX_DUMP_ITERS 5
 
 struct ceph_mds_cap_match {
        s64 uid;  /* default to MDS_AUTH_UID_ANY */
index 8afc6f3a10dab4e59cbc600ff3e688a48f73c6ee..a4993644d543de6c0c7e6ee84860a29574c12c95 100644 (file)
@@ -239,6 +239,7 @@ struct ceph_cap_flush {
        bool is_capsnap; /* true means capsnap */
        struct list_head g_list; // global
        struct list_head i_list; // per inode
+       struct ceph_inode_info *ci;
 };
 
 /*
@@ -453,6 +454,11 @@ struct ceph_inode_info {
        struct ceph_snap_context *i_head_snapc;  /* set if wr_buffer_head > 0 or
                                                    dirty|flushing caps */
        unsigned i_snap_caps;           /* cap bits for snapped files */
+       /*
+        * Written under i_ceph_lock, read via READ_ONCE()
+        * from diagnostic paths.
+        */
+       u64 i_last_cap_flush_ack;
 
        unsigned long i_last_rd;
        unsigned long i_last_wr;