break;
case CEPH_SESSION_REJECT:
- WARN_ON(session->s_state != CEPH_MDS_SESSION_OPENING);
- pr_info_client(cl, "mds%d rejected session\n",
- session->s_mds);
+ WARN_ON(session->s_state != CEPH_MDS_SESSION_OPENING &&
+ session->s_state != CEPH_MDS_SESSION_RECONNECTING);
+ if (session->s_state == CEPH_MDS_SESSION_RECONNECTING)
+ pr_info_client(cl, "mds%d reconnect rejected\n",
+ session->s_mds);
+ else
+ pr_info_client(cl, "mds%d rejected session\n",
+ session->s_mds);
session->s_state = CEPH_MDS_SESSION_REJECTED;
cleanup_session_requests(mdsc, session);
remove_session_caps(session);
cap->mseq = 0; /* and migrate_seq */
cap->cap_gen = atomic_read(&cap->session->s_cap_gen);
+ /*
+ * Note: CEPH_I_ERROR_FILELOCK is not set during reconnect.
+ * Instead, locks are submitted for best-effort MDS reclaim
+ * via the flock_len field below. If reclaim fails (e.g.,
+ * another client grabbed a conflicting lock), future lock
+ * operations will fail and set the error flag at that point.
+ */
+
/* These are lost when the session goes away */
if (S_ISDIR(inode->i_mode)) {
if (cap->issued & CEPH_CAP_DIR_CREATE) {
*
* This is a relatively heavyweight operation, but it's rare.
*/
-static void send_mds_reconnect(struct ceph_mds_client *mdsc,
- struct ceph_mds_session *session)
+static int send_mds_reconnect(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session)
{
struct ceph_client *cl = mdsc->fsc->client;
struct ceph_msg *reply;
int mds = session->s_mds;
int err = -ENOMEM;
+ int old_state;
struct ceph_reconnect_state recon_state = {
.session = session,
};
LIST_HEAD(dispose);
- pr_info_client(cl, "mds%d reconnect start\n", mds);
-
recon_state.pagelist = ceph_pagelist_alloc(GFP_NOFS);
if (!recon_state.pagelist)
goto fail_nopagelist;
if (!reply)
goto fail_nomsg;
+ mutex_lock(&session->s_mutex);
+
+ /* Serialized by s_mutex against concurrent ceph_get_deleg_ino(). */
xa_destroy(&session->s_delegated_inos);
+ if (session->s_state == CEPH_MDS_SESSION_CLOSED ||
+ session->s_state == CEPH_MDS_SESSION_REJECTED) {
+ pr_info_client(cl, "mds%d skipping reconnect, session %s\n",
+ mds,
+ ceph_session_state_name(session->s_state));
+ mutex_unlock(&session->s_mutex);
+ ceph_msg_put(reply);
+ err = -ESTALE;
+ goto fail_return;
+ }
- mutex_lock(&session->s_mutex);
+ /* s_mutex -> mdsc->mutex matches cleanup_session_requests() order. */
+ mutex_lock(&mdsc->mutex);
+ if (mds >= mdsc->max_sessions || mdsc->sessions[mds] != session) {
+ mutex_unlock(&mdsc->mutex);
+ pr_info_client(cl,
+ "mds%d skipping reconnect, session unregistered\n",
+ mds);
+ mutex_unlock(&session->s_mutex);
+ ceph_msg_put(reply);
+ err = -ENOENT;
+ goto fail_return;
+ }
+ mutex_unlock(&mdsc->mutex);
+
+ pr_info_client(cl, "mds%d reconnect start\n", mds);
+ old_state = session->s_state;
session->s_state = CEPH_MDS_SESSION_RECONNECTING;
session->s_seq = 0;
up_read(&mdsc->snap_rwsem);
ceph_pagelist_release(recon_state.pagelist);
- return;
+ return 0;
fail_clear_cap_reconnect:
spin_lock(&session->s_cap_lock);
fail:
ceph_msg_put(reply);
up_read(&mdsc->snap_rwsem);
+ /*
+ * Restore prior session state so map-driven reconnect logic
+ * (check_new_map) can retry. Without this, a transient build
+ * failure strands the session in RECONNECTING indefinitely.
+ */
+ session->s_state = old_state;
mutex_unlock(&session->s_mutex);
fail_nomsg:
ceph_pagelist_release(recon_state.pagelist);
fail_nopagelist:
pr_err_client(cl, "error %d preparing reconnect for mds%d\n",
err, mds);
- return;
+ return err;
+
+fail_return:
+ /*
+ * Early-exit path for expected concurrent-teardown races
+ * (-ESTALE for closed/rejected sessions, -ENOENT for
+ * unregistered sessions). Skip the pr_err_client diagnostic
+ * since these are not genuine reconnect build failures.
+ */
+ ceph_pagelist_release(recon_state.pagelist);
+ return err;
}
*/
if (s->s_state == CEPH_MDS_SESSION_RESTARTING &&
newstate >= CEPH_MDS_STATE_RECONNECT) {
+ int rc;
+
mutex_unlock(&mdsc->mutex);
clear_bit(i, targets);
- send_mds_reconnect(mdsc, s);
+ rc = send_mds_reconnect(mdsc, s);
+ if (rc)
+ pr_warn_client(cl,
+ "mds%d reconnect failed: %d\n",
+ i, rc);
mutex_lock(&mdsc->mutex);
}
}
doutc(cl, "send reconnect to export target mds.%d\n", i);
mutex_unlock(&mdsc->mutex);
- send_mds_reconnect(mdsc, s);
+ err = send_mds_reconnect(mdsc, s);
+ if (err)
+ pr_warn_client(cl,
+ "mds%d export target reconnect failed: %d\n",
+ i, err);
ceph_put_mds_session(s);
mutex_lock(&mdsc->mutex);
}
{
struct ceph_mds_session *s = con->private;
struct ceph_mds_client *mdsc = s->s_mdsc;
+ int session_state;
pr_warn_client(mdsc->fsc->client, "mds%d closed our session\n",
s->s_mds);
- if (READ_ONCE(mdsc->fsc->mount_state) != CEPH_MOUNT_FENCE_IO &&
- ceph_mdsmap_get_state(mdsc->mdsmap, s->s_mds) >= CEPH_MDS_STATE_RECONNECT)
- send_mds_reconnect(mdsc, s);
+
+ if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_FENCE_IO ||
+ ceph_mdsmap_get_state(mdsc->mdsmap, s->s_mds) < CEPH_MDS_STATE_RECONNECT)
+ return;
+
+ /*
+ * Only reconnect if MDS is in its RECONNECT phase. An MDS past
+ * RECONNECT (REJOIN, CLIENTREPLAY, ACTIVE) will reject reconnect
+ * attempts, so those states fall through to session teardown below.
+ */
+ if (ceph_mdsmap_get_state(mdsc->mdsmap, s->s_mds) == CEPH_MDS_STATE_RECONNECT) {
+ int rc = send_mds_reconnect(mdsc, s);
+
+ if (rc)
+ pr_warn_client(mdsc->fsc->client,
+ "mds%d reconnect failed: %d\n",
+ s->s_mds, rc);
+ return;
+ }
+
+ /*
+ * MDS is active (past RECONNECT). It will not accept a
+ * CLIENT_RECONNECT from us, so tear the session down locally
+ * and let new requests re-open a fresh session.
+ *
+ * Snapshot session state with READ_ONCE, then revalidate under
+ * mdsc->mutex before acting. The subsequent mdsc->mutex
+ * section rechecks s_state to catch concurrent transitions, so
+ * the lockless snapshot here is safe. s->s_mutex is taken
+ * separately for cleanup after unregistration, which avoids
+ * introducing a new s->s_mutex + mdsc->mutex nesting.
+ */
+ session_state = READ_ONCE(s->s_state);
+
+ switch (session_state) {
+ case CEPH_MDS_SESSION_RESTARTING:
+ case CEPH_MDS_SESSION_RECONNECTING:
+ case CEPH_MDS_SESSION_CLOSING:
+ case CEPH_MDS_SESSION_OPEN:
+ case CEPH_MDS_SESSION_HUNG:
+ case CEPH_MDS_SESSION_OPENING:
+ mutex_lock(&mdsc->mutex);
+ if (s->s_mds >= mdsc->max_sessions ||
+ mdsc->sessions[s->s_mds] != s ||
+ s->s_state != session_state) {
+ pr_info_client(mdsc->fsc->client,
+ "mds%d state changed to %s during peer reset\n",
+ s->s_mds,
+ ceph_session_state_name(s->s_state));
+ mutex_unlock(&mdsc->mutex);
+ return;
+ }
+
+ ceph_get_mds_session(s);
+ s->s_state = CEPH_MDS_SESSION_CLOSED;
+ __unregister_session(mdsc, s);
+ __wake_requests(mdsc, &s->s_waiting);
+ mutex_unlock(&mdsc->mutex);
+
+ mutex_lock(&s->s_mutex);
+ cleanup_session_requests(mdsc, s);
+ remove_session_caps(s);
+ mutex_unlock(&s->s_mutex);
+
+ wake_up_all(&mdsc->session_close_wq);
+
+ mutex_lock(&mdsc->mutex);
+ kick_requests(mdsc, s->s_mds);
+ mutex_unlock(&mdsc->mutex);
+
+ ceph_put_mds_session(s);
+ break;
+ case CEPH_MDS_SESSION_CLOSED:
+ case CEPH_MDS_SESSION_REJECTED:
+ break;
+ default:
+ pr_warn_client(mdsc->fsc->client,
+ "mds%d peer reset in unexpected state %s\n",
+ s->s_mds,
+ ceph_session_state_name(session_state));
+ break;
+ }
}
static void mds_dispatch(struct ceph_connection *con, struct ceph_msg *msg)
mutex_lock(&mdsc->mutex);
if (__verify_registered_session(mdsc, s) < 0) {
+ doutc(cl, "dropping tid %llu from unregistered session %d\n",
+ le64_to_cpu(msg->hdr.tid), s->s_mds);
mutex_unlock(&mdsc->mutex);
goto out;
}