From 9b43e6793b0f9dcd2c3935af020cf96cd7b05ec2 Mon Sep 17 00:00:00 2001
From: Michael Paquier <michael@paquier.xyz>
Date: Tue, 21 Apr 2026 09:39:59 +0900
Subject: [PATCH] Fix orphaned processes when startup process fails during
 PM_STARTUP

When the startup process exits with a FATAL error during PM_STARTUP,
the postmaster called ExitPostmaster() directly, assuming that no other
processes are running at this stage.  Since 7ff23c6d277d, this
assumption is not true, as the checkpointer, the background writer, the
IO workers and bgworkers kicking in early would be around.

This commit removes the startup-specific shortcut in
process_pm_child_exit() for a failing startup process during PM_STARTUP,
falling through to the existing exit() flow to signal all the started
children with SIGQUIT, so that there is no risk of creating orphaned
processes.

This required an extra change in HandleFatalError() for v18 and newer
versions, as its assertion for PM_STARTUP could be triggered; that
assertion is now incorrect.  In v17 and older versions,
HandleChildCrash() needs to be changed to handle PM_STARTUP so that
children can be waited on.

While on it, fix a comment at the top of postmaster.c.  It was claiming
that the checkpointer and the background writer were started after
PM_RECOVERY.  That is not the case.

Author: Ayush Tiwari <ayushtiwari.slg01@gmail.com>
Discussion: https://postgr.es/m/CAJTYsWVoD3V9yhhqSae1_wqcnTdpFY-hDT7dPm5005ZFsL_bpA@mail.gmail.com
Backpatch-through: 15
---
 src/backend/postmaster/postmaster.c | 48 +++++++++--------------------
 1 file changed, 15 insertions(+), 33 deletions(-)

diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c
index b6fd332f196..90c7c4528e8 100644
--- a/src/backend/postmaster/postmaster.c
+++ b/src/backend/postmaster/postmaster.c
@@ -304,12 +304,13 @@ static bool FatalError = false; /* T if recovering from backend crash */
  *
  * When the startup process is ready to start archive recovery, it signals the
  * postmaster, and we switch to PM_RECOVERY state. The background writer and
- * checkpointer are launched, while the startup process continues applying WAL.
- * If Hot Standby is enabled, then, after reaching a consistent point in WAL
- * redo, startup process signals us again, and we switch to PM_HOT_STANDBY
- * state and begin accepting connections to perform read-only queries.  When
- * archive recovery is finished, the startup process exits with exit code 0
- * and we switch to PM_RUN state.
+ * checkpointer are already running (as these are launched during PM_STARTUP),
+ * and the startup process continues applying WAL.  If Hot Standby is enabled,
+ * then, after reaching a consistent point in WAL redo, startup process
+ * signals us again, and we switch to PM_HOT_STANDBY state and begin accepting
+ * connections to perform read-only queries.  When archive recovery is
+ * finished, the startup process exits with exit code 0 and we switch to
+ * PM_RUN state.
  *
  * Normal child backends can only be launched when we are in PM_RUN or
  * PM_HOT_STANDBY state.  (connsAllowed can also restrict launching.)
@@ -2305,29 +2306,13 @@ process_pm_child_exit(void)
 			}
 
 			/*
-			 * Unexpected exit of startup process (including FATAL exit)
-			 * during PM_STARTUP is treated as catastrophic. There are no
-			 * other processes running yet, so we can just exit.
-			 */
-			if (pmState == PM_STARTUP &&
-				StartupStatus != STARTUP_SIGNALED &&
-				!EXIT_STATUS_0(exitstatus))
-			{
-				LogChildExit(LOG, _("startup process"),
-							 pid, exitstatus);
-				ereport(LOG,
-						(errmsg("aborting startup due to startup process failure")));
-				ExitPostmaster(1);
-			}
-
-			/*
-			 * After PM_STARTUP, any unexpected exit (including FATAL exit) of
-			 * the startup process is catastrophic, so kill other children,
-			 * and set StartupStatus so we don't try to reinitialize after
-			 * they're gone.  Exception: if StartupStatus is STARTUP_SIGNALED,
-			 * then we previously sent the startup process a SIGQUIT; so
-			 * that's probably the reason it died, and we do want to try to
-			 * restart in that case.
+			 * Any unexpected exit (including FATAL exit) of the startup
+			 * process is catastrophic, so kill other children, and set
+			 * StartupStatus so we don't try to reinitialize after they're
+			 * gone.  Exception: if StartupStatus is STARTUP_SIGNALED, then we
+			 * previously sent the startup process a SIGQUIT; so that's
+			 * probably the reason it died, and we do want to try to restart
+			 * in that case.
 			 *
 			 * This stanza also handles the case where we sent a SIGQUIT
 			 * during PM_STARTUP due to some dead-end child crashing: in that
@@ -2780,12 +2765,9 @@ HandleFatalError(QuitSignalReason reason, bool consider_sigabrt)
 			/* shouldn't have any children */
 			Assert(false);
 			break;
-		case PM_STARTUP:
-			/* should have been handled in process_pm_child_exit */
-			Assert(false);
-			break;
 
 			/* wait for children to die */
+		case PM_STARTUP:
 		case PM_RECOVERY:
 		case PM_HOT_STANDBY:
 		case PM_RUN:
-- 
2.47.3