[MEDIUM] scheduler: get rid of the 4 trees and use ebtree v4.1

author Willy Tarreau <w@1wt.eu>

Sat, 21 Mar 2009 09:01:42 +0000 (10:01 +0100)

committer Willy Tarreau <w@1wt.eu>

Sat, 21 Mar 2009 09:25:14 +0000 (10:25 +0100)
author Willy Tarreau <w@1wt.eu>
Sat, 21 Mar 2009 09:01:42 +0000 (10:01 +0100)
committer Willy Tarreau <w@1wt.eu>
Sat, 21 Mar 2009 09:25:14 +0000 (10:25 +0100)
diff --git a/include/common/ticks.h b/include/common/ticks.h

index 4587d56686868f7ba2a66433dd27ee42b00a3d34..de29b315a1480aed17bf199b4753a81d0b021c27 100644 (file)
--- a/include/common/ticks.h
+++ b/include/common/ticks.h
@@ -40,7 +40,7 @@
   * in the past and as much in the future.
   * 
   * We must both support absolute dates (well in fact, dates relative to now+/-
- * 12 days), and intervals (for timeouts). Both types need an "eternity" magic
+ * 24 days), and intervals (for timeouts). Both types need an "eternity" magic
   * value. For optimal code generation, we'll use zero as the magic value
   * indicating that an expiration timer or a timeout is not set. We have to
   * check that we don't return this value when adding timeouts to <now>. If a
@@ -90,6 +90,18 @@ static inline int tick_add_ifset(int now, int timeout)
         return tick_add(now, timeout);
  }
  
+/* return 1 if timer <t1> is before <t2>, judged by the sign of <t1> - <t2>; neither may be infinite. */
+static inline int tick_is_lt(int t1, int t2)
+{
+       return (t1 - t2) < 0;
+}
+
+/* return 1 if timer <t1> is before or equal to <t2>, judged by the sign of <t1> - <t2>; neither may be infinite. */
+static inline int tick_is_le(int t1, int t2)
+{
+       return (t1 - t2) <= 0;
+}
+
  /* return 1 if timer <timer> is expired at date <now>, otherwise zero */
  static inline int tick_is_expired(int timer, int now)
  {
diff --git a/include/proto/task.h b/include/proto/task.h

index b5f2280938c1d35b46c624c938f8948f41cc963c..9d90b1737e8b3e6770fb3e4bf623ca9307e435cf 100644 (file)
--- a/include/proto/task.h
+++ b/include/proto/task.h
@@ -44,52 +44,17 @@
   * cannot use that to store sorted information because that reference changes
   * all the time.
   *
- * So we cut the time in 3 ranges, only one of which <now> can be. The base of
- * the range holding <now> serves as a reference for as long as <now> remains
- * in this range :
- *   - previous : those ones are expired by definition (all before <now>)
- *   - current  : some are expired, some are not (holds <now>)
- *   - next     : none are expired (all after <now>)
+ * We'll use the fact that the time wraps to sort timers. Timers above <now>
+ * are in the future, timers below <now> are in the past. Here, "above" and
+ * "below" are to be considered modulo 2^31.
   *
- * We use the higher two bits of the timers expressed in ticks to determine
- * which range a timer is in, compared to <now> :
- *
- *   now     previous     current      next0     next1
- * [31:30]   [31:30]      [31:30]     [31:30]   [31:30]
- *    00        11           00          01        10
- *    01        00           01          10        11
- *    10        01           10          11        00
- *    11        10           11          00        01
- *
- * By definition, <current> is the range containing <now> as well as all timers
- * which have the same 2 high bits as <now>, <previous> is the range just
- * before, which contains all timers whose high bits equal those of <now> minus
- * 1. Last, <next> is composed of the two remaining ranges.
- *
- * For ease of implementation, the timers will then be stored into 4 queues 0-3
- * determined by the 2 higher bits of the timer. The expiration algorithm is
- * very simple :
- *  - expire everything in <previous>=queue[((now>>30)-1)&3]
- *  - expire from <current>=queue[(now>>30)&3] everything where timer >= now
- *
- * With this algorithm, it's possible to queue tasks meant to expire 24.8 days
- * in the future, and still be able to detect events remaining unprocessed for
- * the last 12.4 days! Note that the principle might be extended to any number
- * of higher bits as long as there is only one range for expired tasks. For
- * instance, using the 8 higher bits to index the range, we would have one past
- * range of 4.6 hours (24 bits in ms), and 254 ranges in the future totalizing
- * 49.3 days. This would eat more memory for very little added benefit though.
- *
- * Also, in order to maintain the ability to perform time comparisons, it is
- * preferable to avoid using the <next1> range above, as values in this range
- * may not easily be compared to <now> outside of these functions as it is the
- * opposite of the <current> range, and <timer>-<now> may randomly be positive
- * or negative. That means we're left with +/- 12.4 days timers.
- *
- * To keep timers ordered, we use 4 ebtrees [0..3]. We could have used instead
- * of ticks, (seconds*1024)+milliseconds, as well as 1024th of seconds, but
- * that makes comparisons with ticks more difficult, so in the end it's better
- * to stick to the ticks.
+ * Timers are stored sorted in an ebtree. We use the new ability for ebtrees to
+ * look up values starting from X to only expire tasks between <now> - 2^31 and
+ * <now>. If the end of the tree is reached while walking over it, we simply
+ * loop back to the beginning. That way, we have no problem keeping sorted
+ * wrapping timers in a tree, between (now - 24 days) and (now + 24 days). The
+ * keys in the tree always reflect their real position, none can be infinite.
+ * This reduces the number of checks to be performed.
   *
   * Another nice optimisation is to allow a timer to stay at an old place in the
   * queue as long as it's not further than the real expiration date. That way,
@@ -102,20 +67,14 @@
   * So, to summarize, we have :
   *   - node->key always defines current position in the wait queue
   *   - timer is the real expiration date (possibly infinite)
- *   - node->key <= timer
+ *   - node->key is always before or equal to timer
   *
   * The run queue works similarly to the wait queue except that the current date
   * is replaced by an insertion counter which can also wrap without any problem.
   */
  
-/* the timers are stored as 32-bit values in the queues */
-#define TIMER_TICK_BITS       32
-#define TIMER_TREE_BITS        2
-#define TIMER_TREES           (1 << TIMER_TREE_BITS)
-#define TIMER_TREE_SHIFT      (TIMER_TICK_BITS - TIMER_TREE_BITS)
-#define TIMER_TREE_MASK       (TIMER_TREES - 1)
-#define TIMER_TICK_MASK       ((1U << (TIMER_TICK_BITS-1)) * 2 - 1)
-#define TIMER_SIGN_BIT        (1 << (TIMER_TICK_BITS - 1))
+/* The farthest we can look back in a timer tree */
+#define TIMER_LOOK_BACK       (1U << 31)
  
  /* a few exported variables */
  extern unsigned int run_queue;    /* run queue size */
@@ -123,31 +82,6 @@ extern unsigned int niced_tasks;  /* number of niced tasks in the run queue */
  extern struct pool_head *pool2_task;
  extern struct task *last_timer;   /* optimization: last queued timer */
  
-/* Convert ticks to timers. Must not be called with TICK_ETERNITY, which is not
- * a problem inside tree scanning functions. Note that ticks are signed while
- * timers are not.
- */
-static inline unsigned int tick_to_timer(int tick)
-{
-       return tick & TIMER_TICK_MASK;
-}
-
-/* Convert timer to ticks. This operation will be correct only as long as
- * timers are stored on a minimum of 32-bit. We take care of not returning zero
- * which would mean "eternity" for a tick. Also note that ticks are signed and
- * timers are not.
- */
-static inline int timer_to_tick(unsigned int timer)
-{
-       return timer ? timer : 1;
-}
-
-/* returns a tree number based on a ticks value */
-static inline unsigned int timer_to_tree(unsigned int timer)
-{
-       return (timer >> TIMER_TREE_SHIFT) & TIMER_TREE_MASK;
-}       
-
  /* return 0 if task is in run queue, otherwise non-zero */
  static inline int task_in_rq(struct task *t)
  {
@@ -263,8 +197,7 @@ static inline void task_queue(struct task *task)
         if (!tick_isset(task->expire))
                 return;
  
-       if (((tick_to_timer(task->expire) - task->wq.key) & TIMER_SIGN_BIT)
-               || !task_in_wq(task))
+       if (!task_in_wq(task) || tick_is_lt(task->expire, task->wq.key))
                 __task_queue(task);
  }
  
diff --git a/src/task.c b/src/task.c

index 6f7b26406626d5816f2bf75d1ad34251cdc238c6..b4e2b97ca53ecd9fa8cc9640e7291400b1bac8b3 100644 (file)
--- a/src/task.c
+++ b/src/task.c
@@ -26,12 +26,12 @@
  struct pool_head *pool2_task;
  
  unsigned int run_queue = 0;
-unsigned int niced_tasks = 0; /* number of niced tasks in the run queue */
-struct task *last_timer = NULL;  /* optimization: last queued timer */
+unsigned int niced_tasks = 0;      /* number of niced tasks in the run queue */
+struct task *last_timer = NULL;    /* optimization: last queued timer */
  
-static struct eb_root timers[TIMER_TREES];  /* trees with MSB 00, 01, 10 and 11 */
-static struct eb_root rqueue[TIMER_TREES];  /* trees constituting the run queue */
-static unsigned int rqueue_ticks;           /* insertion count */
+static struct eb_root timers;      /* sorted timers tree */
+static struct eb_root rqueue;      /* tree constituting the run queue */
+static unsigned int rqueue_ticks;  /* insertion count */
  
  /* Puts the task <t> in run queue at a position depending on t->nice. <t> is
   * returned. The nice value assigns boosts in 32th of the run queue size. A
@@ -60,7 +60,7 @@ struct task *__task_wakeup(struct task *t)
         /* clear state flags at the same time */
         t->state &= ~TASK_WOKEN_ANY;
  
-       eb32_insert(&rqueue[timer_to_tree(t->rq.key)], &t->rq);
+       eb32_insert(&rqueue, &t->rq);
         return t;
  }
  
@@ -70,9 +70,8 @@ struct task *__task_wakeup(struct task *t)
   * Inserts a task into the wait queue at the position given by its expiration
   * date. It does not matter if the task was already in the wait queue or not,
   * as it will be unlinked. The task must not have an infinite expiration timer.
- * Last, tasks must not be queued further than the end of the next tree, which
- * is between <now_ms> and <now_ms> + TIMER_SIGN_BIT ms (now+12days..24days in
- * 32bit).
+ * Last, tasks must not be queued further than the end of the tree, which is
+ * between <now_ms> and <now_ms> + 2^31 ms (now+24days in 32bit).
   *
   * This function should not be used directly, it is meant to be called by the
   * inline version of task_queue() which performs a few cheap preliminary tests
@@ -84,12 +83,9 @@ void __task_queue(struct task *task)
                 __task_unlink_wq(task);
  
         /* the task is not in the queue now */
-       if (unlikely(!tick_isset(task->expire)))
-               return;
-
-       task->wq.key = tick_to_timer(task->expire);
+       task->wq.key = task->expire;
  #ifdef DEBUG_CHECK_INVALID_EXPIRATION_DATES
-       if ((task->wq.key - tick_to_timer(now_ms)) & TIMER_SIGN_BIT)
+       if (tick_is_lt(task->wq.key, now_ms))
                 /* we're queuing too far away or in the past (most likely) */
                 return;
  #endif
@@ -105,67 +101,59 @@ void __task_queue(struct task *task)
                 eb_insert_dup(&last_timer->wq.node, &task->wq.node);
                 return;
         }
-       eb32_insert(&timers[timer_to_tree(task->wq.key)], &task->wq);
+       eb32_insert(&timers, &task->wq);
         if (task->wq.node.bit == -1)
-               last_timer = task; /* we only want dup a tree's root */
+               last_timer = task; /* we only want a dup tree's root */
         return;
  }
  
  /*
   * Extract all expired timers from the timer queue, and wakes up all
- * associated tasks. Returns the date of next event (or eternity).
+ * associated tasks. Returns the date of next event (or eternity) in <next>.
   */
  void wake_expired_tasks(int *next)
  {
         struct task *task;
         struct eb32_node *eb;
-       unsigned int now_tree;
-       unsigned int tree;
  
-       /* In theory, we should :
-        *   - wake all tasks from the <previous> tree
-        *   - wake all expired tasks from the <current> tree
-        *   - scan <next> trees for next expiration date if not found earlier.
-        * But we can do all this more easily : we scan all 3 trees before we
-        * wrap, and wake everything expired from there, then stop on the first
-        * non-expired entry.
-        */
+       eb = eb32_lookup_ge(&timers, now_ms - TIMER_LOOK_BACK);
+       while (1) {
+               if (unlikely(!eb)) {
+                       /* we might have reached the end of the tree, typically because
+                       * <now_ms> is in the first half and we're first scanning the last
+                       * half. Let's loop back to the beginning of the tree now.
+                       */
+                       eb = eb32_first(&timers);
+                       if (likely(!eb))
+                               break;
+               }
  
-       now_tree = timer_to_tree(tick_to_timer(now_ms));
-       tree = (now_tree - 1) & TIMER_TREE_MASK;
-       do {
-               eb = eb32_first(&timers[tree]);
-               while (eb) {
-                       task = eb32_entry(eb, struct task, wq);
-                       if (likely((tick_to_timer(now_ms) - eb->key) & TIMER_SIGN_BIT)) {
-                               /* note that we don't need this check for the <previous>
-                                * tree, but it's cheaper than duplicating the code.
-                                */
-                               *next = timer_to_tick(eb->key);
-                               return;
-                       }
+               if (likely(tick_is_lt(now_ms, eb->key))) {
+                       /* timer not expired yet, revisit it later */
+                       *next = eb->key;
+                       return;
+               }
  
-                       /* detach the task from the queue and add the task to the run queue */
-                       eb = eb32_next(eb);
-                       __task_unlink_wq(task);
+               /* timer looks expired, detach it from the queue */
+               task = eb32_entry(eb, struct task, wq);
+               eb = eb32_next(eb);
+               __task_unlink_wq(task);
  
-                       /* It is possible that this task was left at an earlier place in the
-                        * tree because a recent call to task_queue() has not moved it. This
-                        * happens when the new expiration date is later than the old one.
-                        * Since it is very unlikely that we reach a timeout anyway, it's a
-                        * lot cheaper to proceed like this because we almost never update
-                        * the tree. We may also find disabled expiration dates there. Since
-                        * we have detached the task from the tree, we simply call task_queue
-                        * to take care of this.
-                        */
-                       if (!tick_is_expired(task->expire, now_ms)) {
-                               task_queue(task);
-                               continue;
-                       }
-                       task_wakeup(task, TASK_WOKEN_TIMER);
+               /* It is possible that this task was left at an earlier place in the
+                * tree because a recent call to task_queue() has not moved it. This
+                * happens when the new expiration date is later than the old one.
+                * Since it is very unlikely that we reach a timeout anyway, it's a
+                * lot cheaper to proceed like this because we almost never update
+                * the tree. We may also find disabled expiration dates there. Since
+                * we have detached the task from the tree, we simply call task_queue
+                * to take care of this.
+                */
+               if (!tick_is_expired(task->expire, now_ms)) {
+                       task_queue(task);
+                       continue;
                 }
-               tree = (tree + 1) & TIMER_TREE_MASK;
-       } while (((tree - now_tree) & TIMER_TREE_MASK) < TIMER_TREES/2);
+               task_wakeup(task, TASK_WOKEN_TIMER);
+       }
  
         /* We have found no task to expire in any tree */
         *next = TICK_ETERNITY;
@@ -178,12 +166,7 @@ void wake_expired_tasks(int *next)
   * counter may wrap without a problem, of course. We then limit the number of
   * tasks processed at once to 1/4 of the number of tasks in the queue, and to
   * 200 max in any case, so that general latency remains low and so that task
- * positions have a chance to be considered. It also reduces the number of
- * trees to be evaluated when no task remains.
- *
- * Just like with timers, we start with tree[(current - 1)], which holds past
- * values, and stop when we reach the middle of the list. In practise, we visit
- * 3 out of 4 trees.
+ * positions have a chance to be considered.
   *
   * The function adjusts <next> if a new event is closer.
   */
@@ -191,13 +174,9 @@ void process_runnable_tasks(int *next)
  {
         struct task *t;
         struct eb32_node *eb;
-       unsigned int tree, stop;
         unsigned int max_processed;
         int expire;
  
-       if (!run_queue)
-               return;
-
         max_processed = run_queue;
         if (max_processed > 200)
                 max_processed = 200;
@@ -205,47 +184,46 @@ void process_runnable_tasks(int *next)
         if (likely(niced_tasks))
                 max_processed /= 4;
  
-       tree = timer_to_tree(rqueue_ticks);
-       stop = (tree + TIMER_TREES / 2) & TIMER_TREE_MASK;
-       tree = (tree - 1) & TIMER_TREE_MASK;
-
         expire = *next;
-       do {
-               eb = eb32_first(&rqueue[tree]);
-               while (eb) {
-                       /* Note: this loop is one of the fastest code path in
-                        * the whole program. It should not be re-arranged
-                        * without a good reason.
-                        */
-                       t = eb32_entry(eb, struct task, rq);
+       eb = eb32_lookup_ge(&rqueue, rqueue_ticks - TIMER_LOOK_BACK);
+       while (max_processed--) {
+               /* Note: this loop is one of the fastest code paths in
+                * the whole program. It should not be re-arranged
+                * without a good reason.
+                */
+
+               if (unlikely(!eb)) {
+                       /* we might have reached the end of the tree, typically because
+                       * <rqueue_ticks> is in the first half and we're first scanning
+                       * the last half. Let's loop back to the beginning of the tree now.
+                       */
+                       eb = eb32_first(&rqueue);
+                       if (likely(!eb))
+                               break;
+               }
  
-                       /* detach the task from the queue and add the task to the run queue */
-                       eb = eb32_next(eb);
-                       __task_unlink_rq(t);
+               /* detach the task from the queue */
+               t = eb32_entry(eb, struct task, rq);
+               eb = eb32_next(eb);
+               __task_unlink_rq(t);
  
-                       t->state |= TASK_RUNNING;
-                       /* This is an optimisation to help the processor's branch
-                        * predictor take this most common call.
-                        */
-                       if (likely(t->process == process_session))
-                               t = process_session(t);
-                       else
-                               t = t->process(t);
+               t->state |= TASK_RUNNING;
+               /* This is an optimisation to help the processor's branch
+                * predictor take this most common call.
+                */
+               if (likely(t->process == process_session))
+                       t = process_session(t);
+               else
+                       t = t->process(t);
  
-                       if (likely(t != NULL)) {
-                               t->state &= ~TASK_RUNNING;
-                               if (t->expire) {
-                                       task_queue(t);
-                                       expire = tick_first_2nz(expire, t->expire);
-                               }
+               if (likely(t != NULL)) {
+                       t->state &= ~TASK_RUNNING;
+                       if (t->expire) {
+                               task_queue(t);
+                               expire = tick_first_2nz(expire, t->expire);
                         }
-
-                       if (!--max_processed)
-                               goto out;
                 }
-               tree = (tree + 1) & TIMER_TREE_MASK;
-       } while (tree != stop);
- out:
+       }
         *next = expire;
  }
author	Willy Tarreau <w@1wt.eu>
	Sat, 21 Mar 2009 09:01:42 +0000 (10:01 +0100)
committer	Willy Tarreau <w@1wt.eu>
	Sat, 21 Mar 2009 09:25:14 +0000 (10:25 +0100)
include/common/ticks.h		patch \| blob \| blame \| history
include/proto/task.h		patch \| blob \| blame \| history
src/task.c		patch \| blob \| blame \| history