implement new logic to define difference limit and limit check with SHM clock (driver28)

author Juergen Perlinger <perlinger@ntp.org>

Fri, 8 Aug 2014 20:47:50 +0000 (22:47 +0200)

committer Juergen Perlinger <perlinger@ntp.org>

Fri, 8 Aug 2014 20:47:50 +0000 (22:47 +0200)
author Juergen Perlinger <perlinger@ntp.org>
Fri, 8 Aug 2014 20:47:50 +0000 (22:47 +0200)
committer Juergen Perlinger <perlinger@ntp.org>
Fri, 8 Aug 2014 20:47:50 +0000 (22:47 +0200)
diff --git a/html/drivers/driver28.html b/html/drivers/driver28.html

index bcca4b9c16bb6ff758361dfcfd72594a876a7d84..8c7fd802e623000caf1ec7dad3bbed348bb159cf 100644 (file)
--- a/html/drivers/driver28.html
+++ b/html/drivers/driver28.html
@@ -12,7 +12,7 @@
      <body>
          <h3>Shared Memory Driver</h3>
  <p>Last update:
-  <!-- #BeginDate format:En2m -->14-Sep-2012  18:48<!-- #EndDate -->
+  <!-- #BeginDate format:En2m -->8-Aug-2014  19:17<!-- #EndDate -->
    UTC</p>
          <hr>
          <h4>Synopsis</h4>
@@ -56,19 +56,33 @@
          <p>If set, the <code>count</code> field of the record is remembered, and the values in the record (clockTimeStampSec, clockTimeStampUSec, receiveTimeStampSec, receiveTimeStampUSec, leap, precision) are read. Then, the remembered <code>count</code> is compared to current value of <code>count</code> now in the record. If both are equal, the values read from the record are passed to ntp. If they differ, another process has modified the record while it was read out (was not able to produce this case), and failure is reported to ntp. The <code>valid</code> flag is cleared and <code>count</code> is bumped.</p>
          <p>If not set, <code>count</code> is bumped</p>
  
+<h4>Mode-independent postprocessing</h4>
+After the time stamps have been successfully plucked from the SHM
+segment, some sanity checks take place:
+<ul>
+  <li>The receive time stamp of the SHM data must be in the last 5
+  seconds before the time the data is processed. This helps in weeding
+  out stale data.
+  <li>If the absolute difference between remote and local clock
+  exceeds the limit (either <i>time2</i> or the default of 4hrs), then
+  the sample is discarded. This check is disabled when <i>flag1</i> is
+  set to 1.
+</ul>
  
  <h4>gpsd</h4>
  
  <a href="http://gpsd.berlios.de/"><i>gpsd</i></a>
  knows how to talk to many GPS devices.
-It works with <i>ntpd</i> through the SHM driver.
+It can work with <i>ntpd</i> through the SHM driver.
  <P>
  The <i>gpsd</i> man page suggests setting minpoll and maxpoll to 4.
  That was an attempt to reduce jitter.
  The SHM driver was fixed (ntp-4.2.5p138) to collect data each second rather than
  once per polling interval so that suggestion is no longer reasonable.
  <P>
-
+  <b>Note:</b> The GPSD client driver (type 46) uses the <i>gpsd</i>
+  client protocol to connect and talk to <i>gpsd</i>; the SHM driver
+  described here is the older way to have <i>gpsd</i> talk to <i>ntpd</i>.
  
  <h4>Clockstats</h4>
  If flag4 is set when the driver is polled, a clockstats record is written.
@@ -103,13 +117,19 @@ Here is a sample showing the GPS reception fading out:
              <dt><tt>time1 <i>time</i></tt>
              <dd>Specifies the time offset calibration factor, in seconds and fraction, with default 0.0.
              <dt><tt>time2 <i>time</i></tt>
-            <dd>Not used by this driver.
+            <dd>Maximum allowed difference between remote and local
+            clock, in seconds. Values &lt;1.0 or &gt;86400.0 are ignored, and the
+            default value of 4hrs (14400s) is used instead. See also flag 1. 
              <dt><tt>stratum <i>number</i></tt>
              <dd>Specifies the driver stratum, in decimal from 0 to 15, with default 0.
              <dt><tt>refid <i>string</i></tt>
              <dd>Specifies the driver reference identifier, an ASCII string from one to four characters, with default <tt>SHM</tt>.
              <dt><tt>flag1 0 | 1</tt>
-            <dd>Not used by this driver.
+            <dd><i>Skip</i> the difference limit check if set. Useful
+            for systems where the RTC backup cannot keep the time over
+            long periods without power and the SHM clock must be able
+            to force long-distance initial jumps. <i>Check</i> the
+            difference limit if cleared (default).
              <dt><tt>flag2 0 | 1</tt>
              <dd>Not used by this driver.
              <dt><tt>flag3 0 | 1</tt>
diff --git a/ntpd/refclock_shm.c b/ntpd/refclock_shm.c

index 1a5eb8ff1ad1e079e05cd3fb6044b34a05c9acea..5c32b81aaed9f9775f854fe0c66c1b3fb3cba3a8 100644 (file)
--- a/ntpd/refclock_shm.c
+++ b/ntpd/refclock_shm.c
@@ -41,9 +41,6 @@
   * This driver supports a reference clock attached thru shared memory
   */
  
-/* Temp hack to simplify testing of the old mode. */
-#define OLDWAY 0
-
  /*
   * SHM interface definitions
   */
@@ -60,8 +57,8 @@ static  int     shm_start       (int unit, struct peer *peer);
  static  void    shm_shutdown    (int unit, struct peer *peer);
  static  void    shm_poll        (int unit, struct peer *peer);
  static  void    shm_timer       (int unit, struct peer *peer);
-       int     shm_peek        (int unit, struct peer *peer);
-       void    shm_clockstats  (int unit, struct peer *peer);
+static void    shm_peek        (int unit, struct peer *peer);
+static void    shm_clockstats  (int unit, struct peer *peer);
  
  /*
   * Transfer vector
@@ -108,6 +105,9 @@ struct shmunit {
         int notready;           /* number of peeks without data ready */
         int bad;                /* number of invalid samples */
         int clash;              /* number of access clashes while reading */
+
+       time_t max_delta;       /* difference limit */
+       time_t max_delay;       /* age/stale limit */
  };
  
  
@@ -214,6 +214,15 @@ shm_start(
                 up->shm->valid = 0;
                 up->shm->nsamples = NSAMPLES;
                 pp->clockdesc = DESCRIPTION;
+
+               up->max_delay = 5;
+               if (pp->sloppyclockflag & CLK_FLAG1)
+                       up->max_delta = 0;
+               else if (pp->fudgetime2 < 1. || pp->fudgetime2 > 86400.)
+                       up->max_delta = 4*3600;
+               else
+                       up->max_delta = (time_t)floor(pp->fudgetime2 + 0.5);
+               
                 return 1;
         } else {
                 free(up);
@@ -255,9 +264,6 @@ shm_shutdown(
  static  void
  shm_timer(int unit, struct peer *peer)
  {
-       if (OLDWAY)
-               return;
-
         shm_peek(unit, peer);
  }
  
@@ -272,52 +278,76 @@ shm_poll(
         )
  {
         struct refclockproc *pp;
-       int ok;
+       struct shmunit *up;
+       int major_error;
  
         pp = peer->procptr;
+       up = pp->unitptr;
  
-       if (OLDWAY) {
-               ok = shm_peek(unit, peer);
-               if (!ok)
-                       return;
-       }
+       pp->polls++;
+
+       /* get dominant reason if we have no samples at all */
+       major_error = max(up->notready, up->bad);
+       major_error = max(major_error, up->clash);
  
          /*
-         * Process median filter samples. If none received, declare a
-         * timeout and keep going.
+         * Process median filter samples. If none received, see what
+         * happened, tell the core and keep going.
           */
-        if (pp->coderecv == pp->codeproc) {
+        if (pp->coderecv != pp->codeproc) {
+               /* have some samples, everything OK */
+               pp->lastref = pp->lastrec;
+               refclock_receive(peer);
+       } else if (NULL == up->shm) { /* is this possible at all? */
+               /* we're out of business without SHM access */
+               refclock_report(peer, CEVNT_FAULT);
+       } else if (major_error == up->clash) {
+               /* too many collisions is like a bad signal */
+                refclock_report(peer, CEVNT_PROP);
+       } else if (major_error == up->bad) {
+               /* too much stale/bad/garbled data */
+                refclock_report(peer, CEVNT_BADREPLY);
+       } else {
+               /* in any other case assume it's just a timeout */
                  refclock_report(peer, CEVNT_TIMEOUT);
-               shm_clockstats(unit, peer);
-                return;
          }
-       pp->lastref = pp->lastrec;
-       refclock_receive(peer);
+       /* shm_clockstats() clears the tallies, so it must be last... */
         shm_clockstats(unit, peer);
  }
  
  /*
   * shm_peek - try to grab a sample
   */
-int shm_peek(
+static void
+shm_peek(
         int unit,
         struct peer *peer
         )
  {
         struct refclockproc *pp;
         struct shmunit *up;
-       struct shmTime *shm;
+
+       /* access order is important for lock-free SHM access; we
+       ** enforce order by treating the whole structure volatile.
+       **
+       ** IMPORTANT NOTE: This does not protect from reordering on CPU
+       ** level, and it does nothing for cache consistency and
+       ** visibility of changes by other cores. We need atomic and/or
+       ** fence instructions for that.
+       */
+       volatile struct shmTime *shm;
  
         struct timespec tvr;
         struct timespec tvt;
+       l_fp tsrcv;
+       l_fp tsref;
         unsigned int c;
-       int ok = 1;
-       unsigned cns_new, rns_new;
+       unsigned int cns_new, rns_new;
         int cnt;
  
         /* for formatting 'a_lastcode': */
         struct calendar cd;
-       time_t tt;
+       time_t tt, now;
         vint64 ts;
  
         /*
@@ -334,12 +364,15 @@ int shm_peek(
         }
         shm = up->shm;
         if (shm == 0) {
-               refclock_report(peer, CEVNT_FAULT);
-               return 0;
+               DPRINTF(1, ("%s: no SHM segment\n",
+                           refnumtoa(&peer->srcadr)));
+               return;
         }
         if ( ! shm->valid) {
+               DPRINTF(1, ("%s: SHM not ready\n",
+                           refnumtoa(&peer->srcadr)));
                 up->notready++;
-               return 0;
+               return;
         }
  
         switch (shm->mode) {
@@ -371,6 +404,8 @@ int shm_peek(
                 ** timestamps, possibly generated by extending the old
                 ** us-level timestamps
                 */
+               DPRINTF(2, ("%s: SHM type 0 sample\n",
+                           refnumtoa(&peer->srcadr)));
                 break;
  
         case 1:
@@ -382,7 +417,13 @@ int shm_peek(
                 tvt.tv_sec      = shm->clockTimeStampSec;
                 tvt.tv_nsec     = shm->clockTimeStampUSec * 1000;
                 cns_new         = shm->clockTimeStampNSec;
-               ok = (cnt == shm->count);
+               if (cnt != shm->count) {
+                       DPRINTF(1, ("%s: type 1 access clash\n",
+                                   refnumtoa(&peer->srcadr)));
+                       msyslog (LOG_NOTICE, "SHM: access clash in shared memory");
+                       up->clash++;
+                       return;
+               }
                 
                 /* See the case above for an explanation of the
                 ** following test.
@@ -396,20 +437,25 @@ int shm_peek(
                 ** timestamps, possibly generated by extending the old
                 ** us-level timestamps
                 */
+               DPRINTF(2, ("%s: SHM type 1 sample\n",
+                           refnumtoa(&peer->srcadr)));
                 break;
  
         default:
+               DPRINTF(1, ("%s: SHM type blooper, mode=%d\n",
+                           refnumtoa(&peer->srcadr), shm->mode));
                 up->bad++;
                 msyslog (LOG_ERR, "SHM: bad mode found in shared memory: %d",
                          shm->mode);
-               return 0;
+               return;
         }
+       shm->valid = 0;
  
         /* format the last time code in human-readable form into
          * 'pp->a_lastcode'. Someone claimed: "NetBSD has incompatible
          * tv_sec". I can't find a base for this claim, but we can work
          * around that potential problem. BTW, simply casting a pointer
-        * is a receipe for desaster on some architectures.
+        * is a recipe for disaster on some architectures.
          */
         tt = (time_t)tvt.tv_sec;
         ts = time_to_vint64(&tt);
@@ -423,46 +469,70 @@ int shm_peek(
                      (long)tvt.tv_nsec);
         pp->lencode = (c < sizeof(pp->a_lastcode)) ? c : 0;
  
-       shm->valid = 0;
-       if (ok) {
-               l_fp tsrcv = tspec_stamp_to_lfp(tvr);
-               l_fp tsref = tspec_stamp_to_lfp(tvt);
-               pp->polls++;
-               pp->leap = shm->leap;
-               peer->precision = shm->precision;
-               refclock_process_offset(pp, tsref, tsrcv, pp->fudgetime1);
-       } else {
-               refclock_report(peer, CEVNT_FAULT);
-               msyslog (LOG_NOTICE, "SHM: access clash in shared memory");
-               up->clash++;
-               return 0;
+       /* check 1: age control of local time stamp */
+       time(&now);
+       tt = now - tvr.tv_sec;
+       if (tt < 0 || tt > up->max_delay) {
+               DPRINTF(1, ("%s:SHM stale/bad receive time, delay=%llds\n",
+                           refnumtoa(&peer->srcadr), (long long)tt));
+               up->bad++;
+               msyslog (LOG_ERR, "SHM: stale/bad receive time, delay=%llds",
+                        (long long)tt);
+               return;
         }
+
+       /* check 2: delta check */
+       tt = tvr.tv_sec - tvt.tv_sec - (tvr.tv_nsec < tvt.tv_nsec);
+       if (tt < 0)
+               tt = -tt;
+       if (up->max_delta > 0 && tt > up->max_delta) {
+               DPRINTF(1, ("%s: SHM diff limit exceeded, delta=%llds\n",
+                           refnumtoa(&peer->srcadr), (long long)tt));
+               up->bad++;
+               msyslog (LOG_ERR, "SHM: difference limit exceeded, delta=%llds\n",
+                        (long long)tt);
+               return;
+       }
+
+       /* if we really made it to this point... we're winners! */
+       DPRINTF(2, ("%s: SHM feeding data\n",
+                   refnumtoa(&peer->srcadr)));
+       tsrcv = tspec_stamp_to_lfp(tvr);
+       tsref = tspec_stamp_to_lfp(tvt);
+       pp->leap = shm->leap;
+       peer->precision = shm->precision;
+       refclock_process_offset(pp, tsref, tsrcv, pp->fudgetime1);
         up->good++;
-       return 1;
  }
  
  /*
   * shm_clockstats - dump and reset counters
   */
-void shm_clockstats(
+static void shm_clockstats(
         int unit,
         struct peer *peer
         )
  {
         struct refclockproc *pp;
         struct shmunit *up;
-       char logbuf[256];
+       char logbuf[64];
+       unsigned int llen;
  
         pp = peer->procptr;
         up = pp->unitptr;
  
-       if (!(pp->sloppyclockflag & CLK_FLAG4))
-               return;
-
-       snprintf(logbuf, sizeof(logbuf), "%3d %3d %3d %3d %3d",
-                up->ticks, up->good, up->notready, up->bad, up->clash);
-       record_clock_stats(&peer->srcadr, logbuf);
-
+       if (pp->sloppyclockflag & CLK_FLAG4) {
+               /* if snprintf() returns a negative value on errors
+               ** (some older ones do) make sure we are NUL
+               ** terminated. Using an unsigned result does the trick.
+               */
+               llen = snprintf(logbuf, sizeof(logbuf),
+                               "%3d %3d %3d %3d %3d",
+                               up->ticks, up->good, up->notready,
+                               up->bad, up->clash);
+               logbuf[min(llen, sizeof(logbuf)-1)] = '\0';
+               record_clock_stats(&peer->srcadr, logbuf);
+       }
         up->ticks = up->good = up->notready = up->bad = up->clash = 0;
  
  }
author	Juergen Perlinger <perlinger@ntp.org>
	Fri, 8 Aug 2014 20:47:50 +0000 (22:47 +0200)
committer	Juergen Perlinger <perlinger@ntp.org>
	Fri, 8 Aug 2014 20:47:50 +0000 (22:47 +0200)
html/drivers/driver28.html		patch \| blob \| blame \| history
ntpd/refclock_shm.c		patch \| blob \| blame \| history