git.ipfire.org Git - thirdparty/postgresql.git/commitdiff
Perform an immediate shutdown if the postmaster.pid file is removed.
author: Tom Lane <tgl@sss.pgh.pa.us>
Tue, 6 Oct 2015 21:15:27 +0000 (17:15 -0400)
committer: Tom Lane <tgl@sss.pgh.pa.us>
Tue, 6 Oct 2015 21:15:27 +0000 (17:15 -0400)
The postmaster now checks every minute or so (worst case, at most two
minutes) that postmaster.pid is still there and still contains its own PID.
If not, it performs an immediate shutdown, as though it had received
SIGQUIT.

The original goal behind this change was to ensure that failed buildfarm
runs would get fully cleaned up, even if the test scripts had left a
postmaster running, which is not an infrequent occurrence.  When the
buildfarm script removes a test postmaster's $PGDATA directory, its next
check on postmaster.pid will fail and cause it to exit.  Previously, manual
intervention was often needed to get rid of such orphaned postmasters,
since they'd block new test postmasters from obtaining the expected socket
address.

However, by checking postmaster.pid and not something else, we can provide
additional robustness: manual removal of postmaster.pid is a frequent DBA
mistake, and now we can at least limit the damage that will ensue if a new
postmaster is started while the old one is still alive.

Back-patch to all supported branches, since we won't get the desired
improvement in buildfarm reliability otherwise.

src/backend/postmaster/postmaster.c
src/backend/utils/init/miscinit.c
src/include/miscadmin.h

index 94ba381b0f0d99d5783f24bd0785cfe890e690b2..bdc809dd8dfc47b94cb487f307cd583345505e2a 100644 (file)
@@ -1471,9 +1471,10 @@ ServerLoop(void)
        fd_set          readmask;
        int                     nSockets;
        time_t          now,
+                               last_lockfile_recheck_time,
                                last_touch_time;
 
-       last_touch_time = time(NULL);
+       last_lockfile_recheck_time = last_touch_time = time(NULL);
 
        nSockets = initMasks(&readmask);
 
@@ -1614,27 +1615,56 @@ ServerLoop(void)
                                kill(AutoVacPID, SIGUSR2);
                }
 
+#ifdef HAVE_PTHREAD_IS_THREADED_NP
+
+               /*
+                * With assertions enabled, check regularly for appearance of
+                * additional threads.  All builds check at start and exit.
+                */
+               Assert(pthread_is_threaded_np() == 0);
+#endif
+
+               /*
+                * Lastly, check to see if it's time to do some things that we don't
+                * want to do every single time through the loop, because they're a
+                * bit expensive.  Note that there's up to a minute of slop in when
+                * these tasks will be performed, since DetermineSleepTime() will let
+                * us sleep at most that long.
+                */
+               now = time(NULL);
+
                /*
-                * Touch the socket and lock file every 58 minutes, to ensure that
+                * Once a minute, verify that postmaster.pid hasn't been removed or
+                * overwritten.  If it has, we force a shutdown.  This avoids having
+                * postmasters and child processes hanging around after their database
+                * is gone, and maybe causing problems if a new database cluster is
+                * created in the same place.  It also provides some protection
+                * against a DBA foolishly removing postmaster.pid and manually
+                * starting a new postmaster.  Data corruption is likely to ensue from
+                * that anyway, but we can minimize the damage by aborting ASAP.
+                */
+               if (now - last_lockfile_recheck_time >= 1 * SECS_PER_MINUTE)
+               {
+                       if (!RecheckDataDirLockFile())
+                       {
+                               ereport(LOG,
+                                               (errmsg("performing immediate shutdown because data directory lock file is invalid")));
+                               kill(MyProcPid, SIGQUIT);
+                       }
+                       last_lockfile_recheck_time = now;
+               }
+
+               /*
+                * Touch Unix socket and lock file every 58 minutes, to ensure that
                 * they are not removed by overzealous /tmp-cleaning tasks.  We assume
                 * no one runs cleaners with cutoff times of less than an hour ...
                 */
-               now = time(NULL);
                if (now - last_touch_time >= 58 * SECS_PER_MINUTE)
                {
                        TouchSocketFile();
                        TouchSocketLockFile();
                        last_touch_time = now;
                }
-
-#ifdef HAVE_PTHREAD_IS_THREADED_NP
-
-               /*
-                * With assertions enabled, check regularly for appearance of
-                * additional threads.  All builds check at start and exit.
-                */
-               Assert(pthread_is_threaded_np() == 0);
-#endif
        }
 }
 
index fc37bf6d4671b027020b9c7cccc4b6b883cbdafe..d5d7f4d9deefb73ff7d0de36f2b979966ff8882b 100644 (file)
@@ -1116,6 +1116,76 @@ AddToDataDirLockFile(int target_line, const char *str)
 }
 
 
+/*
+ * Recheck that the data directory lock file still exists and still begins
+ * with our own PID.  Return TRUE if the lock file appears OK, FALSE if not.
+ *
+ * We call this periodically in the postmaster.  The idea is that if the
+ * lock file has been removed or replaced by another postmaster, we should
+ * do a panic database shutdown.  Therefore, we should return TRUE if there
+ * is any doubt: we do not want to cause a panic shutdown unnecessarily.
+ * Transient failures like EINTR or ENFILE should not cause us to fail.
+ * (If there really is something wrong, we'll detect it on a future recheck.)
+ */
+bool
+RecheckDataDirLockFile(void)
+{
+       int                     fd;
+       int                     len;            /* result of read(): byte count, or < 0 on error */
+       long            file_pid;       /* number parsed from the start of the file */
+       char            buffer[BLCKSZ];
+       /* No O_CREAT here: a missing lock file must surface as an open() failure */
+       fd = open(DIRECTORY_LOCK_FILE, O_RDWR | PG_BINARY, 0);
+       if (fd < 0)
+       {
+               /*
+                * There are many foreseeable false-positive error conditions.  For
+                * safety, fail only on enumerated clearly-something-is-wrong
+                * conditions.
+                */
+               switch (errno)
+               {
+                       case ENOENT:
+                       case ENOTDIR:
+                               /* disaster: the lock file, or a directory on its path, is gone */
+                               ereport(LOG,
+                                               (errcode_for_file_access(),
+                                                errmsg("could not open file \"%s\": %m",
+                                                               DIRECTORY_LOCK_FILE)));
+                               return false;
+                       default:
+                               /* any other errno is given the benefit of the doubt, at least for now */
+                               ereport(LOG,
+                                               (errcode_for_file_access(),
+                                 errmsg("could not open file \"%s\": %m; continuing anyway",
+                                                DIRECTORY_LOCK_FILE)));
+                               return true;
+               }
+       }
+       len = read(fd, buffer, sizeof(buffer) - 1);     /* keep one byte free for the NUL */
+       if (len < 0)
+       {
+               ereport(LOG,
+                               (errcode_for_file_access(),
+                                errmsg("could not read from file \"%s\": %m",
+                                               DIRECTORY_LOCK_FILE)));
+               close(fd);
+               return true;                    /* treat read failure as nonfatal */
+       }
+       buffer[len] = '\0';                     /* terminate so atol() sees a C string */
+       close(fd);
+       file_pid = atol(buffer);        /* leading digits of the file are taken as the owner's PID */
+       if (file_pid == getpid())
+               return true;                    /* all is well */
+
+       /* Trouble: the PID in the file isn't ours, so someone's overwritten it */
+       ereport(LOG,
+                       (errmsg("lock file \"%s\" contains wrong PID: %ld instead of %ld",
+                                       DIRECTORY_LOCK_FILE, file_pid, (long) getpid())));
+       return false;
+}
+
+
 /*-------------------------------------------------------------------------
  *                             Version checking support
  *-------------------------------------------------------------------------
index 3ca73b78026f0c2bec2f158229e4bfd7cdf178f9..7e723357371c0c6414a080267705d4f2a99ab1cf 100644 (file)
@@ -405,6 +405,7 @@ extern void CreateDataDirLockFile(bool amPostmaster);
 extern void CreateSocketLockFile(const char *socketfile, bool amPostmaster);
 extern void TouchSocketLockFile(void);
 extern void AddToDataDirLockFile(int target_line, const char *str);
+extern bool RecheckDataDirLockFile(void);
 extern void ValidatePgVersion(const char *path);
 extern void process_shared_preload_libraries(void);
 extern void process_local_preload_libraries(void);