From: Alex Markuze
Date: Thu, 7 May 2026 08:45:27 +0000 (+0000)
Subject: ceph: add diagnostic timeout loop to wait_caps_flush()
X-Git-Tag: v7.2-rc1~11^2~2
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=ebbbab66bd74dbd213d51afc3b029dc8b109ee47;p=thirdparty%2Fkernel%2Flinux.git

ceph: add diagnostic timeout loop to wait_caps_flush()

Convert wait_caps_flush() from a silent indefinite wait into a diagnostic wait loop that periodically dumps pending cap flush state. The underlying wait semantics remain intact: callers still wait until the requested cap flushes complete. The difference is that long stalls now produce actionable diagnostics instead of looking like a silent hang.

CEPH_CAP_FLUSH_MAX_DUMP_ENTRIES limits the number of entries emitted per diagnostic dump, and CEPH_CAP_FLUSH_MAX_DUMP_ITERS limits the number of timed diagnostic dumps before the wait continues silently. When more entries exist than the per-dump limit, a truncation count is reported. When the dump iteration limit is reached, a final suppression message is emitted so the transition to silence is explicit.

The diagnostic dump collects flush entry data under cap_dirty_lock into a bounded on-stack array, then prints after releasing the lock. This avoids holding the spinlock across printk calls.

A null cf->ci on the global flush list indicates a bug since all cap_flush entries are initialized with a valid ci before being added. Signal this with WARN_ON_ONCE while still printing enough context for debugging.

READ_ONCE is used for the i_last_cap_flush_ack field, which is read outside the inode lock domain. Flush tids are monotonically increasing and acks are processed in order under i_ceph_lock, so the latest ack tid is always the most recently written value.

Add a ci pointer to struct ceph_cap_flush so that the diagnostic dump can identify which inode each pending flush belongs to.
The new i_last_cap_flush_ack field tracks the latest acknowledged flush tid per inode for diagnostic correlation. This improves reset-drain observability and is also useful for existing sync and writeback troubleshooting paths. Signed-off-by: Alex Markuze Reviewed-by: Viacheslav Dubeyko Signed-off-by: Viacheslav Dubeyko Signed-off-by: Ilya Dryomov --- diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index cb9e78b713d9f..4b37d9ffdf7f5 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1648,6 +1648,7 @@ static void __ceph_flush_snaps(struct ceph_inode_info *ci, spin_lock(&mdsc->cap_dirty_lock); capsnap->cap_flush.tid = ++mdsc->last_cap_flush_tid; + capsnap->cap_flush.ci = ci; list_add_tail(&capsnap->cap_flush.g_list, &mdsc->cap_flush_list); if (oldest_flush_tid == 0) @@ -1846,6 +1847,7 @@ struct ceph_cap_flush *ceph_alloc_cap_flush(void) return NULL; cf->is_capsnap = false; + cf->ci = NULL; return cf; } @@ -1931,6 +1933,7 @@ static u64 __mark_caps_flushing(struct inode *inode, doutc(cl, "%p %llx.%llx now !dirty\n", inode, ceph_vinop(inode)); swap(cf, ci->i_prealloc_cap_flush); + cf->ci = ci; cf->caps = flushing; cf->wake = wake; @@ -3826,6 +3829,13 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid, bool wake_ci = false; bool wake_mdsc = false; + /* + * Flush tids are monotonically increasing and acks arrive in + * order under i_ceph_lock, so this is always the latest tid. + * Diagnostic readers use READ_ONCE() without holding the lock. + */ + WRITE_ONCE(ci->i_last_cap_flush_ack, flush_tid); + list_for_each_entry_safe(cf, tmp_cf, &ci->i_cap_flush_list, i_list) { /* Is this the one that was flushed? 
*/ if (cf->tid == flush_tid) diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 4871d7ab27309..61d7c0b8161ff 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -671,6 +671,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb) INIT_LIST_HEAD(&ci->i_cap_snaps); ci->i_head_snapc = NULL; ci->i_snap_caps = 0; + ci->i_last_cap_flush_ack = 0; ci->i_last_rd = ci->i_last_wr = jiffies - 3600 * HZ; for (i = 0; i < CEPH_FILE_MODE_BITS; i++) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 249419c17d3c4..eddd3ccf5bba8 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -2330,19 +2330,112 @@ static int check_caps_flush(struct ceph_mds_client *mdsc, } /* - * flush all dirty inode data to disk. + * Snapshot of a single cap_flush entry for diagnostic dump. + * Collected under cap_dirty_lock, printed after releasing it. + */ +struct flush_dump_entry { + u64 ino; /* inode number */ + u64 snap; /* snap id */ + int caps; /* dirty cap bits */ + u64 tid; /* flush transaction id */ + u64 last_ack; /* most recent ack tid for this inode */ + bool wake; /* whether completion was requested */ + bool is_capsnap; /* true if this is a cap snap flush */ + bool ci_null; /* true if cf->ci was unexpectedly NULL */ +}; + +/* + * Dump pending cap flushes for diagnostic purposes. * - * returns true if we've flushed through want_flush_tid + * cf->ci is safe to dereference here: cap_flush entries hold a + * reference on the inode (via the cap), and entries are removed from + * cap_flush_list under cap_dirty_lock before the cap (and thus the + * inode reference) is released. Holding cap_dirty_lock therefore + * guarantees the inode remains valid for the lifetime of the scan. 
+ */ + +static void dump_cap_flushes(struct ceph_mds_client *mdsc, u64 want_tid) +{ + struct ceph_client *cl = mdsc->fsc->client; + struct flush_dump_entry entries[CEPH_CAP_FLUSH_MAX_DUMP_ENTRIES]; + struct ceph_cap_flush *cf; + int n = 0, remaining = 0; + int i; + + spin_lock(&mdsc->cap_dirty_lock); + list_for_each_entry(cf, &mdsc->cap_flush_list, g_list) { + if (cf->tid > want_tid) + break; + if (n < CEPH_CAP_FLUSH_MAX_DUMP_ENTRIES) { + struct flush_dump_entry *e = &entries[n++]; + + e->ci_null = WARN_ON_ONCE(!cf->ci); + if (!e->ci_null) { + e->ino = ceph_ino(&cf->ci->netfs.inode); + e->snap = ceph_snap(&cf->ci->netfs.inode); + e->last_ack = READ_ONCE(cf->ci->i_last_cap_flush_ack); + } + e->caps = cf->caps; + e->tid = cf->tid; + e->wake = cf->wake; + e->is_capsnap = cf->is_capsnap; + } else { + remaining++; + } + } + spin_unlock(&mdsc->cap_dirty_lock); + + pr_info_client(cl, "still waiting for cap flushes through %llu:\n", + want_tid); + for (i = 0; i < n; i++) { + struct flush_dump_entry *e = &entries[i]; + + if (e->ci_null) + pr_info_client(cl, + " (null ci) %s tid=%llu wake=%d%s\n", + ceph_cap_string(e->caps), e->tid, + e->wake, + e->is_capsnap ? " is_capsnap" : ""); + else + pr_info_client(cl, + " %llx.%llx %s tid=%llu last_ack=%llu wake=%d%s\n", + e->ino, e->snap, + ceph_cap_string(e->caps), e->tid, + e->last_ack, e->wake, + e->is_capsnap ? " is_capsnap" : ""); + } + if (remaining) + pr_info_client(cl, " ... and %d more pending flushes\n", + remaining); +} + +/* + * Wait for all cap flushes through @want_flush_tid to complete. + * Periodically dumps pending cap flush state for diagnostics. */ static void wait_caps_flush(struct ceph_mds_client *mdsc, u64 want_flush_tid) { struct ceph_client *cl = mdsc->fsc->client; + int i = 0; + long ret; doutc(cl, "want %llu\n", want_flush_tid); - wait_event(mdsc->cap_flushing_wq, - check_caps_flush(mdsc, want_flush_tid)); + do { + /* 60 * HZ fits in a long on all supported architectures. 
*/ + ret = wait_event_timeout(mdsc->cap_flushing_wq, + check_caps_flush(mdsc, want_flush_tid), + CEPH_CAP_FLUSH_WAIT_TIMEOUT_SEC * HZ); + if (ret == 0) { + if (i < CEPH_CAP_FLUSH_MAX_DUMP_ITERS) + dump_cap_flushes(mdsc, want_flush_tid); + else if (i == CEPH_CAP_FLUSH_MAX_DUMP_ITERS) + pr_info_client(cl, + "still waiting for cap flushes; suppressing further dumps\n"); + i++; + } + } while (ret == 0); doutc(cl, "ok, flushed thru %llu\n", want_flush_tid); } diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index d873e784b0253..8208fdf02efe8 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -77,6 +77,9 @@ struct ceph_fs_client; struct ceph_cap; #define MDS_AUTH_UID_ANY -1 +#define CEPH_CAP_FLUSH_WAIT_TIMEOUT_SEC 60 +#define CEPH_CAP_FLUSH_MAX_DUMP_ENTRIES 5 +#define CEPH_CAP_FLUSH_MAX_DUMP_ITERS 5 struct ceph_mds_cap_match { s64 uid; /* default to MDS_AUTH_UID_ANY */ diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 8afc6f3a10dab..a4993644d543d 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -239,6 +239,7 @@ struct ceph_cap_flush { bool is_capsnap; /* true means capsnap */ struct list_head g_list; // global struct list_head i_list; // per inode + struct ceph_inode_info *ci; }; /* @@ -453,6 +454,11 @@ struct ceph_inode_info { struct ceph_snap_context *i_head_snapc; /* set if wr_buffer_head > 0 or dirty|flushing caps */ unsigned i_snap_caps; /* cap bits for snapped files */ + /* + * Written under i_ceph_lock, read via READ_ONCE() + * from diagnostic paths. + */ + u64 i_last_cap_flush_ack; unsigned long i_last_rd; unsigned long i_last_wr;