From: Willy Tarreau <w@1wt.eu>
Date: Thu, 23 Nov 2017 13:52:28 +0000 (+0100)
Subject: BUG/MEDIUM: threads/time: maintain a common time reference between all threads
X-Git-Tag: v1.8.0~57
X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=9fefc51c569b692a09229182a76fae78883b1cd8;p=thirdparty%2Fhaproxy.git

BUG/MEDIUM: threads/time: maintain a common time reference between all threads

During high loads it becomes visible that the time drifts between threads,
sometimes showing tens of seconds after several minutes. The root cause is
the per-thread correction which is performed based on a local offset and
local time. But we can't use a unique global time either as we need the
thread-local time to be stable between two poll() calls.

This commit takes a stab at this problem by proceeding this way :

  - a global "global_now" date is monotonic and common to all threads.
  - each thread has its own local <now> which is resynced with <global_now>
    on each invocation of tv_update_date()
  - each thread detects its own drift based on its poll() timeout and its
    local <now>, and recalculates its adjusted local time
  - each thread then ensures its new local time is no older than the current
    global time, otherwise it readjusts its local time to match this one
  - finally, each thread atomically updates the global time to match its
    own local one

This guarantees a monotonic global time and a monotonic+stable local time.

It is still possible by definition for two threads to report a minor time
variation on subsequent events, but that variation will only be caused by the
moment at which each of them read the time, and will be very small. When a common global
time is needed between all threads, global_now could be used as a reference
(with care). The wallclock time used in logs is still <date> anyway.
---

diff --git a/src/time.c b/src/time.c
index bcd6a722a0..789b47c1ab 100644
--- a/src/time.c
+++ b/src/time.c
@@ -28,6 +28,9 @@ struct timeval start_date;      /* the process's start date */
 THREAD_LOCAL struct timeval before_poll;     /* system date before calling poll() */
 THREAD_LOCAL struct timeval after_poll;      /* system date after leaving poll() */
 
+static THREAD_LOCAL struct timeval tv_offset;  /* per-thread time offset relative to global time */
+volatile unsigned long long global_now;        /* common date between all threads (32:32) */
+
 /*
  * adds <ms> ms to <from>, set the result to <tv> and returns a pointer <tv>
  */
@@ -170,55 +173,70 @@ REGPRM2 int _tv_isgt(const struct timeval *tv1, const struct timeval *tv2)
  */
 REGPRM2 void tv_update_date(int max_wait, int interrupted)
 {
-	volatile static long long offset = 0;
-	struct timeval adjusted, deadline;
+	struct timeval adjusted, deadline, tmp_now;
 	unsigned int   curr_sec_ms;     /* millisecond of current second (0..999) */
-	long long new_ofs;
+	unsigned long long old_now;
+	unsigned long long new_now;
 
 	gettimeofday(&date, NULL);
 	if (unlikely(max_wait < 0)) {
-		HA_ATOMIC_STORE(&offset, 0);
+		tv_zero(&tv_offset);
 		adjusted = date;
 		after_poll = date;
 		samp_time = idle_time = 0;
 		idle_pct = 100;
+		global_now = (((unsigned long long)adjusted.tv_sec) << 32) +
+		             (unsigned int)adjusted.tv_usec;
 		goto to_ms;
 	}
 
-	new_ofs = offset;
+	__tv_add(&adjusted, &date, &tv_offset);
 
-	adjusted.tv_sec  = date.tv_sec  + (int)(new_ofs >> 32);
-	adjusted.tv_usec = date.tv_usec + (int)(new_ofs & 0xFFFFFFFFU);
-	if (adjusted.tv_usec > 999999) {
-		adjusted.tv_usec -= 1000000;
-		adjusted.tv_sec  += 1;
-	}
+	/* compute the minimum and maximum local date we may have reached based
+	 * on our past date and the associated timeout.
+	 */
+	_tv_ms_add(&deadline, &now, max_wait + MAX_DELAY_MS);
 
-	if (unlikely(__tv_islt(&adjusted, &now))) {
-		goto fixup; /* jump in the past */
+	if (unlikely(__tv_islt(&adjusted, &now) || __tv_islt(&deadline, &adjusted))) {
+		/* Large jump. If the poll was interrupted, we consider that the
+		 * date has not changed (immediate wake-up), otherwise we add
+		 * the poll time-out to the previous date. The new offset is
+		 * recomputed.
+		 */
+		_tv_ms_add(&adjusted, &now, interrupted ? 0 : max_wait);
 	}
 
-	/* OK we did not jump backwards, let's see if we have jumped too far
-	 * forwards. The poll value was in <max_wait>, we accept that plus
-	 * MAX_DELAY_MS to cover additional time.
-	 */
-	_tv_ms_add(&deadline, &now, max_wait + MAX_DELAY_MS);
-	if (likely(__tv_islt(&adjusted, &deadline)))
-		goto to_ms; /* OK time is within expected range */
- fixup:
-	/* Large jump. If the poll was interrupted, we consider that the date
-	 * has not changed (immediate wake-up), otherwise we add the poll
-	 * time-out to the previous date. The new offset is recomputed.
+	/* now that we have bounded the local time, let's check if it's
+	 * realistic regarding the global date, which only moves forward,
+	 * otherwise catch up.
 	 */
-	_tv_ms_add(&adjusted, &now, interrupted ? 0 : max_wait);
+	old_now = global_now;
 
-	new_ofs = (((long)(adjusted.tv_sec  - date.tv_sec)) << 32) +
-	          (unsigned int)(adjusted.tv_usec - date.tv_usec);
+	do {
+		tmp_now.tv_sec  = (unsigned int)(old_now >> 32);
+		tmp_now.tv_usec = old_now & 0xFFFFFFFFU;
 
-	if ((int)(new_ofs & 0xFFFFFFFFU) < 0)
-		new_ofs = new_ofs + 1000000 - 0x100000000UL;
+		if (__tv_islt(&adjusted, &tmp_now))
+			adjusted = tmp_now;
+
+		/* now <adjusted> is expected to be the most accurate date,
+		 * equal to <global_now> or newer.
+		 */
+		new_now = (((unsigned long long)adjusted.tv_sec) << 32) + (unsigned int)adjusted.tv_usec;
+
+		/* let's try to update the global <now> or loop again */
+	} while (!HA_ATOMIC_CAS(&global_now, &old_now, new_now));
+
+	/* the new global date when we looked was old_now, and the new one is
+	 * new_now == adjusted. We can recompute our local offset.
+	 */
+	tv_offset.tv_sec  = adjusted.tv_sec  - date.tv_sec;
+	tv_offset.tv_usec = adjusted.tv_usec - date.tv_usec;
+	if (tv_offset.tv_usec < 0) {
+		tv_offset.tv_usec += 1000000;
+		tv_offset.tv_sec--;
+	}
 
-	HA_ATOMIC_STORE(&offset, new_ofs);
  to_ms:
 	now = adjusted;
 	curr_sec_ms = now.tv_usec / 1000;            /* ms of current second */