3.10-stable patches
author Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Tue, 15 Oct 2013 20:50:21 +0000 (13:50 -0700)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Tue, 15 Oct 2013 20:50:21 +0000 (13:50 -0700)
added patches:
ipc-sem.c-always-use-only-one-queue-for-alter-operations.patch
ipc-sem.c-cacheline-align-the-semaphore-structures.patch
ipc-sem.c-rename-try_atomic_semop-to-perform_atomic_semop-docu-update.patch
ipc-sem.c-replace-shared-sem_otime-with-per-semaphore-value.patch
ipc-sem-separate-wait-for-zero-and-alter-tasks-into-seperate-queues.patch
ipc-util.c-ipc_rcu_alloc-cacheline-align-allocation.patch

queue-3.10/ipc-sem-separate-wait-for-zero-and-alter-tasks-into-seperate-queues.patch [new file with mode: 0644]
queue-3.10/ipc-sem.c-always-use-only-one-queue-for-alter-operations.patch [new file with mode: 0644]
queue-3.10/ipc-sem.c-cacheline-align-the-semaphore-structures.patch [new file with mode: 0644]
queue-3.10/ipc-sem.c-rename-try_atomic_semop-to-perform_atomic_semop-docu-update.patch [new file with mode: 0644]
queue-3.10/ipc-sem.c-replace-shared-sem_otime-with-per-semaphore-value.patch [new file with mode: 0644]
queue-3.10/ipc-util.c-ipc_rcu_alloc-cacheline-align-allocation.patch [new file with mode: 0644]
queue-3.10/series

diff --git a/queue-3.10/ipc-sem-separate-wait-for-zero-and-alter-tasks-into-seperate-queues.patch b/queue-3.10/ipc-sem-separate-wait-for-zero-and-alter-tasks-into-seperate-queues.patch
new file mode 100644
index 0000000..a296de4
--- /dev/null
@@ -0,0 +1,408 @@
+From 1a82e9e1d0f1b45f47a97c9e2349020536ff8987 Mon Sep 17 00:00:00 2001
+From: Manfred Spraul <manfred@colorfullife.com>
+Date: Mon, 8 Jul 2013 16:01:23 -0700
+Subject: ipc/sem: separate wait-for-zero and alter tasks into seperate queues
+
+From: Manfred Spraul <manfred@colorfullife.com>
+
+commit 1a82e9e1d0f1b45f47a97c9e2349020536ff8987 upstream.
+
+Introduce separate queues for operations that do not modify the
+semaphore values.  Advantages:
+
+ - Simpler logic in check_restart().
+ - Faster update_queue(): Right now, all wait-for-zero operations are
+   always tested, even if the semaphore value is not 0.
+ - wait-for-zero again gets priority, as in linux <=3.0.9 (see the sketch below)
+
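+A concrete illustration (an assumption-level userspace sketch, not part
+of the patch): a wait-for-zero operation is a semop() call whose sem_op
+is 0, while an alter operation has a non-zero sem_op.
+
+  #include <sys/types.h>
+  #include <sys/ipc.h>
+  #include <sys/sem.h>
+
+  int id = semget(IPC_PRIVATE, 1, IPC_CREAT | 0600);
+  struct sembuf wait_zero = { .sem_num = 0, .sem_op = 0 };
+  struct sembuf decrement = { .sem_num = 0, .sem_op = -1 };
+
+  /* blocks while semval != 0; queued on (per-semaphore) pending_const */
+  semop(id, &wait_zero, 1);
+  /* would drop semval below 0, so it blocks; queued on pending_alter */
+  semop(id, &decrement, 1);
+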
+Signed-off-by: Manfred Spraul <manfred@colorfullife.com>
+Cc: Rik van Riel <riel@redhat.com>
+Cc: Davidlohr Bueso <davidlohr.bueso@hp.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Mike Galbraith <efault@gmx.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ include/linux/sem.h |    5 -
+ ipc/sem.c           |  211 +++++++++++++++++++++++++++++++++++++---------------
+ 2 files changed, 155 insertions(+), 61 deletions(-)
+
+--- a/include/linux/sem.h
++++ b/include/linux/sem.h
+@@ -15,7 +15,10 @@ struct sem_array {
+       time_t                  sem_otime;      /* last semop time */
+       time_t                  sem_ctime;      /* last change time */
+       struct sem              *sem_base;      /* ptr to first semaphore in array */
+-      struct list_head        sem_pending;    /* pending operations to be processed */
++      struct list_head        pending_alter;  /* pending operations */
++                                              /* that alter the array */
++      struct list_head        pending_const;  /* pending complex operations */
++                                              /* that do not alter semvals */
+       struct list_head        list_id;        /* undo requests on this array */
+       int                     sem_nsems;      /* no. of semaphores in array */
+       int                     complex_count;  /* pending complex operations */
+--- a/ipc/sem.c
++++ b/ipc/sem.c
+@@ -95,7 +95,10 @@ struct sem {
+       int     semval;         /* current value */
+       int     sempid;         /* pid of last operation */
+       spinlock_t      lock;   /* spinlock for fine-grained semtimedop */
+-      struct list_head sem_pending; /* pending single-sop operations */
++      struct list_head pending_alter; /* pending single-sop operations */
++                                      /* that alter the semaphore */
++      struct list_head pending_const; /* pending single-sop operations */
++                                      /* that do not alter the semaphore*/
+ } ____cacheline_aligned_in_smp;
+ /* One queue for each sleeping process in the system. */
+@@ -152,7 +155,7 @@ static int sysvipc_sem_proc_show(struct
+ /*
+  * linked list protection:
+  *    sem_undo.id_next,
+- *    sem_array.sem_pending{,last},
++ *    sem_array.pending{_alter,_const},
+  *    sem_array.sem_undo: sem_lock() for read/write
+  *    sem_undo.proc_next: only "current" is allowed to read/write that field.
+  *    
+@@ -337,7 +340,7 @@ static inline void sem_rmid(struct ipc_n
+  * Without the check/retry algorithm a lockless wakeup is possible:
+  * - queue.status is initialized to -EINTR before blocking.
+  * - wakeup is performed by
+- *    * unlinking the queue entry from sma->sem_pending
++ *    * unlinking the queue entry from the pending list
+  *    * setting queue.status to IN_WAKEUP
+  *      This is the notification for the blocked thread that a
+  *      result value is imminent.
+@@ -418,12 +421,14 @@ static int newary(struct ipc_namespace *
+       sma->sem_base = (struct sem *) &sma[1];
+       for (i = 0; i < nsems; i++) {
+-              INIT_LIST_HEAD(&sma->sem_base[i].sem_pending);
++              INIT_LIST_HEAD(&sma->sem_base[i].pending_alter);
++              INIT_LIST_HEAD(&sma->sem_base[i].pending_const);
+               spin_lock_init(&sma->sem_base[i].lock);
+       }
+       sma->complex_count = 0;
+-      INIT_LIST_HEAD(&sma->sem_pending);
++      INIT_LIST_HEAD(&sma->pending_alter);
++      INIT_LIST_HEAD(&sma->pending_const);
+       INIT_LIST_HEAD(&sma->list_id);
+       sma->sem_nsems = nsems;
+       sma->sem_ctime = get_seconds();
+@@ -609,60 +614,132 @@ static void unlink_queue(struct sem_arra
+  * update_queue is O(N^2) when it restarts scanning the whole queue of
+  * waiting operations. Therefore this function checks if the restart is
+  * really necessary. It is called after a previously waiting operation
+- * was completed.
++ * modified the array.
++ * Note that wait-for-zero operations are handled without restart.
+  */
+ static int check_restart(struct sem_array *sma, struct sem_queue *q)
+ {
+-      struct sem *curr;
+-      struct sem_queue *h;
+-
+-      /* if the operation didn't modify the array, then no restart */
+-      if (q->alter == 0)
+-              return 0;
+-
+-      /* pending complex operations are too difficult to analyse */
+-      if (sma->complex_count)
++      /* pending complex alter operations are too difficult to analyse */
++      if (!list_empty(&sma->pending_alter))
+               return 1;
+       /* we were a sleeping complex operation. Too difficult */
+       if (q->nsops > 1)
+               return 1;
+-      curr = sma->sem_base + q->sops[0].sem_num;
++      /* It is impossible that someone waits for the new value:
++       * - complex operations always restart.
++       * - wait-for-zero are handled separately.
++       * - q is a previously sleeping simple operation that
++       *   altered the array. It must be a decrement, because
++       *   simple increments never sleep.
++       * - If there are older (higher priority) decrements
++       *   in the queue, then they have observed the original
++       *   semval value and couldn't proceed. The operation
++       *   decremented to value - thus they won't proceed either.
++       */
++      return 0;
++}
+-      /* No-one waits on this queue */
+-      if (list_empty(&curr->sem_pending))
+-              return 0;
++/**
++ * wake_const_ops(sma, semnum, pt) - Wake up non-alter tasks
++ * @sma: semaphore array.
++ * @semnum: semaphore that was modified.
++ * @pt: list head for the tasks that must be woken up.
++ *
++ * wake_const_ops must be called after a semaphore in a semaphore array
++ * was set to 0. If complex const operations are pending, wake_const_ops must
++ * be called with semnum = -1, as well as with the number of each modified
++ * semaphore.
++ * The tasks that must be woken up are added to @pt. The return code
++ * is stored in q->pid.
++ * The function returns 1 if at least one operation was completed successfully.
++ */
++static int wake_const_ops(struct sem_array *sma, int semnum,
++                              struct list_head *pt)
++{
++      struct sem_queue *q;
++      struct list_head *walk;
++      struct list_head *pending_list;
++      int semop_completed = 0;
++
++      if (semnum == -1)
++              pending_list = &sma->pending_const;
++      else
++              pending_list = &sma->sem_base[semnum].pending_const;
++
++      walk = pending_list->next;
++      while (walk != pending_list) {
++              int error;
++
++              q = container_of(walk, struct sem_queue, list);
++              walk = walk->next;
++
++              error = try_atomic_semop(sma, q->sops, q->nsops,
++                                              q->undo, q->pid);
++
++              if (error <= 0) {
++                      /* operation completed, remove from queue & wakeup */
++
++                      unlink_queue(sma, q);
++
++                      wake_up_sem_queue_prepare(pt, q, error);
++                      if (error == 0)
++                              semop_completed = 1;
++              }
++      }
++      return semop_completed;
++}
+-      /* the new semaphore value */
+-      if (curr->semval) {
+-              /* It is impossible that someone waits for the new value:
+-               * - q is a previously sleeping simple operation that
+-               *   altered the array. It must be a decrement, because
+-               *   simple increments never sleep.
+-               * - The value is not 0, thus wait-for-zero won't proceed.
+-               * - If there are older (higher priority) decrements
+-               *   in the queue, then they have observed the original
+-               *   semval value and couldn't proceed. The operation
+-               *   decremented to value - thus they won't proceed either.
++/**
++ * do_smart_wakeup_zero(sma, sops, nsops, pt) - wakeup all wait for zero tasks
++ * @sma: semaphore array
++ * @sops: operations that were performed
++ * @nsops: number of operations
++ * @pt: list head of the tasks that must be woken up.
++ *
++ * do_smart_wakeup_zero() checks all required queues for wait-for-zero
++ * operations, based on the actual changes that were performed on the
++ * semaphore array.
++ * The function returns 1 if at least one operation was completed successfully.
++ */
++static int do_smart_wakeup_zero(struct sem_array *sma, struct sembuf *sops,
++                                      int nsops, struct list_head *pt)
++{
++      int i;
++      int semop_completed = 0;
++      int got_zero = 0;
++
++      /* first: the per-semaphore queues, if known */
++      if (sops) {
++              for (i = 0; i < nsops; i++) {
++                      int num = sops[i].sem_num;
++
++                      if (sma->sem_base[num].semval == 0) {
++                              got_zero = 1;
++                              semop_completed |= wake_const_ops(sma, num, pt);
++                      }
++              }
++      } else {
++              /*
++               * No sops means modified semaphores not known.
++               * Assume all were changed.
+                */
+-              BUG_ON(q->sops[0].sem_op >= 0);
+-              return 0;
++              for (i = 0; i < sma->sem_nsems; i++) {
++                      if (sma->sem_base[i].semval == 0) {
++                              got_zero = 1;
++                              semop_completed |= wake_const_ops(sma, i, pt);
++                      }
++              }
+       }
+       /*
+-       * semval is 0. Check if there are wait-for-zero semops.
+-       * They must be the first entries in the per-semaphore queue
++       * If one of the modified semaphores got 0,
++       * then check the global queue, too.
+        */
+-      h = list_first_entry(&curr->sem_pending, struct sem_queue, list);
+-      BUG_ON(h->nsops != 1);
+-      BUG_ON(h->sops[0].sem_num != q->sops[0].sem_num);
++      if (got_zero)
++              semop_completed |= wake_const_ops(sma, -1, pt);
+-      /* Yes, there is a wait-for-zero semop. Restart */
+-      if (h->sops[0].sem_op == 0)
+-              return 1;
+-
+-      /* Again - no-one is waiting for the new value. */
+-      return 0;
++      return semop_completed;
+ }
+@@ -678,6 +755,8 @@ static int check_restart(struct sem_arra
+  * semaphore.
+  * The tasks that must be woken up are added to @pt. The return code
+  * is stored in q->pid.
++ * The function internally checks if const operations can now succeed.
++ *
+  * The function return 1 if at least one semop was completed successfully.
+  */
+ static int update_queue(struct sem_array *sma, int semnum, struct list_head *pt)
+@@ -688,9 +767,9 @@ static int update_queue(struct sem_array
+       int semop_completed = 0;
+       if (semnum == -1)
+-              pending_list = &sma->sem_pending;
++              pending_list = &sma->pending_alter;
+       else
+-              pending_list = &sma->sem_base[semnum].sem_pending;
++              pending_list = &sma->sem_base[semnum].pending_alter;
+ again:
+       walk = pending_list->next;
+@@ -702,13 +781,12 @@ again:
+               /* If we are scanning the single sop, per-semaphore list of
+                * one semaphore and that semaphore is 0, then it is not
+-               * necessary to scan the "alter" entries: simple increments
++               * necessary to scan further: simple increments
+                * that affect only one entry succeed immediately and cannot
+                * be in the  per semaphore pending queue, and decrements
+                * cannot be successful if the value is already 0.
+                */
+-              if (semnum != -1 && sma->sem_base[semnum].semval == 0 &&
+-                              q->alter)
++              if (semnum != -1 && sma->sem_base[semnum].semval == 0)
+                       break;
+               error = try_atomic_semop(sma, q->sops, q->nsops,
+@@ -724,6 +802,7 @@ again:
+                       restart = 0;
+               } else {
+                       semop_completed = 1;
++                      do_smart_wakeup_zero(sma, q->sops, q->nsops, pt);
+                       restart = check_restart(sma, q);
+               }
+@@ -742,8 +821,8 @@ again:
+  * @otime: force setting otime
+  * @pt: list head of the tasks that must be woken up.
+  *
+- * do_smart_update() does the required called to update_queue, based on the
+- * actual changes that were performed on the semaphore array.
++ * do_smart_update() does the required calls to update_queue and wakeup_zero,
++ * based on the actual changes that were performed on the semaphore array.
+  * Note that the function does not do the actual wake-up: the caller is
+  * responsible for calling wake_up_sem_queue_do(@pt).
+  * It is safe to perform this call after dropping all locks.
+@@ -754,6 +833,8 @@ static void do_smart_update(struct sem_a
+       int i;
+       int progress;
++      otime |= do_smart_wakeup_zero(sma, sops, nsops, pt);
++
+       progress = 1;
+ retry_global:
+       if (sma->complex_count) {
+@@ -813,14 +894,14 @@ static int count_semncnt (struct sem_arr
+       struct sem_queue * q;
+       semncnt = 0;
+-      list_for_each_entry(q, &sma->sem_base[semnum].sem_pending, list) {
++      list_for_each_entry(q, &sma->sem_base[semnum].pending_alter, list) {
+               struct sembuf * sops = q->sops;
+               BUG_ON(sops->sem_num != semnum);
+               if ((sops->sem_op < 0) && !(sops->sem_flg & IPC_NOWAIT))
+                       semncnt++;
+       }
+-      list_for_each_entry(q, &sma->sem_pending, list) {
++      list_for_each_entry(q, &sma->pending_alter, list) {
+               struct sembuf * sops = q->sops;
+               int nsops = q->nsops;
+               int i;
+@@ -839,14 +920,14 @@ static int count_semzcnt (struct sem_arr
+       struct sem_queue * q;
+       semzcnt = 0;
+-      list_for_each_entry(q, &sma->sem_base[semnum].sem_pending, list) {
++      list_for_each_entry(q, &sma->sem_base[semnum].pending_const, list) {
+               struct sembuf * sops = q->sops;
+               BUG_ON(sops->sem_num != semnum);
+               if ((sops->sem_op == 0) && !(sops->sem_flg & IPC_NOWAIT))
+                       semzcnt++;
+       }
+-      list_for_each_entry(q, &sma->sem_pending, list) {
++      list_for_each_entry(q, &sma->pending_const, list) {
+               struct sembuf * sops = q->sops;
+               int nsops = q->nsops;
+               int i;
+@@ -884,13 +965,22 @@ static void freeary(struct ipc_namespace
+       /* Wake up all pending processes and let them fail with EIDRM. */
+       INIT_LIST_HEAD(&tasks);
+-      list_for_each_entry_safe(q, tq, &sma->sem_pending, list) {
++      list_for_each_entry_safe(q, tq, &sma->pending_const, list) {
++              unlink_queue(sma, q);
++              wake_up_sem_queue_prepare(&tasks, q, -EIDRM);
++      }
++
++      list_for_each_entry_safe(q, tq, &sma->pending_alter, list) {
+               unlink_queue(sma, q);
+               wake_up_sem_queue_prepare(&tasks, q, -EIDRM);
+       }
+       for (i = 0; i < sma->sem_nsems; i++) {
+               struct sem *sem = sma->sem_base + i;
+-              list_for_each_entry_safe(q, tq, &sem->sem_pending, list) {
++              list_for_each_entry_safe(q, tq, &sem->pending_const, list) {
++                      unlink_queue(sma, q);
++                      wake_up_sem_queue_prepare(&tasks, q, -EIDRM);
++              }
++              list_for_each_entry_safe(q, tq, &sem->pending_alter, list) {
+                       unlink_queue(sma, q);
+                       wake_up_sem_queue_prepare(&tasks, q, -EIDRM);
+               }
+@@ -1658,14 +1748,15 @@ SYSCALL_DEFINE4(semtimedop, int, semid,
+               curr = &sma->sem_base[sops->sem_num];
+               if (alter)
+-                      list_add_tail(&queue.list, &curr->sem_pending);
++                      list_add_tail(&queue.list, &curr->pending_alter);
+               else
+-                      list_add(&queue.list, &curr->sem_pending);
++                      list_add_tail(&queue.list, &curr->pending_const);
+       } else {
+               if (alter)
+-                      list_add_tail(&queue.list, &sma->sem_pending);
++                      list_add_tail(&queue.list, &sma->pending_alter);
+               else
+-                      list_add(&queue.list, &sma->sem_pending);
++                      list_add_tail(&queue.list, &sma->pending_const);
++
+               sma->complex_count++;
+       }
diff --git a/queue-3.10/ipc-sem.c-always-use-only-one-queue-for-alter-operations.patch b/queue-3.10/ipc-sem.c-always-use-only-one-queue-for-alter-operations.patch
new file mode 100644
index 0000000..ed1aaa0
--- /dev/null
@@ -0,0 +1,205 @@
+From f269f40ad5aeee229ed70044926f44318abe41ef Mon Sep 17 00:00:00 2001
+From: Manfred Spraul <manfred@colorfullife.com>
+Date: Mon, 8 Jul 2013 16:01:24 -0700
+Subject: ipc/sem.c: always use only one queue for alter operations
+
+From: Manfred Spraul <manfred@colorfullife.com>
+
+commit f269f40ad5aeee229ed70044926f44318abe41ef upstream.
+
+There are two places that can contain alter operations:
+ - the global queue: sma->pending_alter
+ - the per-semaphore queues: sma->sem_base[].pending_alter.
+
+Since one of the queues must be processed first, this causes an odd
+prioritization of the wakeups: complex operations have priority over
+simple ops.
+
+The patch restores the behavior of linux <=3.0.9: The longest waiting
+operation has the highest priority.
+
+This is done by using only one queue:
+ - if there are complex ops, then sma->pending_alter is used.
+ - otherwise, the per-semaphore queues are used.
+
+As a side effect, do_smart_update_queue() becomes much simpler: no more
+goto logic.
+
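+For illustration (not patch code): a "complex" operation here is a
+single semop() call that spans more than one semaphore, e.g.:
+
+  struct sembuf ops[2] = {
+          { .sem_num = 0, .sem_op = -1 },
+          { .sem_num = 1, .sem_op = +1 },
+  };
+  semop(id, ops, 2);    /* nsops > 1: sleeps on sma->pending_alter */
+
+While it sleeps, complex_count is non-zero, so newly arriving simple
+alter operations are queued on the global list as well and the longest
+waiter keeps the highest priority.
+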
+Signed-off-by: Manfred Spraul <manfred@colorfullife.com>
+Cc: Rik van Riel <riel@redhat.com>
+Cc: Davidlohr Bueso <davidlohr.bueso@hp.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Mike Galbraith <efault@gmx.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ ipc/sem.c |  128 ++++++++++++++++++++++++++++++++++++++++++--------------------
+ 1 file changed, 88 insertions(+), 40 deletions(-)
+
+--- a/ipc/sem.c
++++ b/ipc/sem.c
+@@ -192,6 +192,53 @@ void __init sem_init (void)
+                               IPC_SEM_IDS, sysvipc_sem_proc_show);
+ }
++/**
++ * unmerge_queues - unmerge queues, if possible.
++ * @sma: semaphore array
++ *
++ * The function unmerges the wait queues if complex_count is 0.
++ * It must be called prior to dropping the global semaphore array lock.
++ */
++static void unmerge_queues(struct sem_array *sma)
++{
++      struct sem_queue *q, *tq;
++
++      /* complex operations still around? */
++      if (sma->complex_count)
++              return;
++      /*
++       * We will switch back to simple mode.
++       * Move all pending operation back into the per-semaphore
++       * queues.
++       */
++      list_for_each_entry_safe(q, tq, &sma->pending_alter, list) {
++              struct sem *curr;
++              curr = &sma->sem_base[q->sops[0].sem_num];
++
++              list_add_tail(&q->list, &curr->pending_alter);
++      }
++      INIT_LIST_HEAD(&sma->pending_alter);
++}
++
++/**
++ * merge_queues - Merge single semop queues into global queue
++ * @sma: semaphore array
++ *
++ * This function merges all per-semaphore queues into the global queue.
++ * It is necessary to achieve FIFO ordering for the pending single-sop
++ * operations when a multi-semop operation must sleep.
++ * Only the alter operations must be moved, the const operations can stay.
++ */
++static void merge_queues(struct sem_array *sma)
++{
++      int i;
++      for (i = 0; i < sma->sem_nsems; i++) {
++              struct sem *sem = sma->sem_base + i;
++
++              list_splice_init(&sem->pending_alter, &sma->pending_alter);
++      }
++}
++
+ /*
+  * If the request contains only one semaphore operation, and there are
+  * no complex transactions pending, lock only the semaphore involved.
+@@ -262,6 +309,7 @@ static inline int sem_lock(struct sem_ar
+ static inline void sem_unlock(struct sem_array *sma, int locknum)
+ {
+       if (locknum == -1) {
++              unmerge_queues(sma);
+               ipc_unlock_object(&sma->sem_perm);
+       } else {
+               struct sem *sem = sma->sem_base + locknum;
+@@ -831,49 +879,38 @@ static void do_smart_update(struct sem_a
+                       int otime, struct list_head *pt)
+ {
+       int i;
+-      int progress;
+       otime |= do_smart_wakeup_zero(sma, sops, nsops, pt);
+-      progress = 1;
+-retry_global:
+-      if (sma->complex_count) {
+-              if (update_queue(sma, -1, pt)) {
+-                      progress = 1;
+-                      otime = 1;
+-                      sops = NULL;
+-              }
+-      }
+-      if (!progress)
+-              goto done;
+-
+-      if (!sops) {
+-              /* No semops; something special is going on. */
+-              for (i = 0; i < sma->sem_nsems; i++) {
+-                      if (update_queue(sma, i, pt)) {
+-                              otime = 1;
+-                              progress = 1;
++      if (!list_empty(&sma->pending_alter)) {
++              /* semaphore array uses the global queue - just process it. */
++              otime |= update_queue(sma, -1, pt);
++      } else {
++              if (!sops) {
++                      /*
++                       * No sops, thus the modified semaphores are not
++                       * known. Check all.
++                       */
++                      for (i = 0; i < sma->sem_nsems; i++)
++                              otime |= update_queue(sma, i, pt);
++              } else {
++                      /*
++                       * Check the semaphores that were increased:
++                       * - No complex ops, thus all sleeping ops are
++                       *   decrements.
++                       * - if we decreased the value, then any sleeping
++                       *   semaphore ops won't be able to run: if the
++                       *   previous value was too small, then the new
++                       *   value will be too small, too.
++                       */
++                      for (i = 0; i < nsops; i++) {
++                              if (sops[i].sem_op > 0) {
++                                      otime |= update_queue(sma,
++                                                      sops[i].sem_num, pt);
++                              }
+                       }
+               }
+-              goto done_checkretry;
+-      }
+-
+-      /* Check the semaphores that were modified. */
+-      for (i = 0; i < nsops; i++) {
+-              if (sops[i].sem_op > 0 ||
+-                      (sops[i].sem_op < 0 &&
+-                              sma->sem_base[sops[i].sem_num].semval == 0))
+-                      if (update_queue(sma, sops[i].sem_num, pt)) {
+-                              otime = 1;
+-                              progress = 1;
+-                      }
+-      }
+-done_checkretry:
+-      if (progress) {
+-              progress = 0;
+-              goto retry_global;
+       }
+-done:
+       if (otime)
+               sma->sem_otime = get_seconds();
+ }
+@@ -1747,11 +1784,22 @@ SYSCALL_DEFINE4(semtimedop, int, semid,
+               struct sem *curr;
+               curr = &sma->sem_base[sops->sem_num];
+-              if (alter)
+-                      list_add_tail(&queue.list, &curr->pending_alter);
+-              else
++              if (alter) {
++                      if (sma->complex_count) {
++                              list_add_tail(&queue.list,
++                                              &sma->pending_alter);
++                      } else {
++
++                              list_add_tail(&queue.list,
++                                              &curr->pending_alter);
++                      }
++              } else {
+                       list_add_tail(&queue.list, &curr->pending_const);
++              }
+       } else {
++              if (!sma->complex_count)
++                      merge_queues(sma);
++
+               if (alter)
+                       list_add_tail(&queue.list, &sma->pending_alter);
+               else
diff --git a/queue-3.10/ipc-sem.c-cacheline-align-the-semaphore-structures.patch b/queue-3.10/ipc-sem.c-cacheline-align-the-semaphore-structures.patch
new file mode 100644
index 0000000..62b5bd8
--- /dev/null
@@ -0,0 +1,54 @@
+From f5c936c0f267ec58641451cf8b8d39b4c207ee4d Mon Sep 17 00:00:00 2001
+From: Manfred Spraul <manfred@colorfullife.com>
+Date: Mon, 8 Jul 2013 16:01:22 -0700
+Subject: ipc/sem.c: cacheline align the semaphore structures
+
+From: Manfred Spraul <manfred@colorfullife.com>
+
+commit f5c936c0f267ec58641451cf8b8d39b4c207ee4d upstream.
+
+As now each semaphore has its own spinlock and parallel operations are
+possible, give each semaphore its own cacheline.
+
+On a i3 laptop, this gives up to 28% better performance:
+
+  #semscale 10 | grep "interleave 2"
+  - before:
+  Cpus 1, interleave 2 delay 0: 36109234 in 10 secs
+  Cpus 2, interleave 2 delay 0: 55276317 in 10 secs
+  Cpus 3, interleave 2 delay 0: 62411025 in 10 secs
+  Cpus 4, interleave 2 delay 0: 81963928 in 10 secs
+
+  -after:
+  Cpus 1, interleave 2 delay 0: 35527306 in 10 secs
+  Cpus 2, interleave 2 delay 0: 70922909 in 10 secs <<< + 28%
+  Cpus 3, interleave 2 delay 0: 80518538 in 10 secs
+  Cpus 4, interleave 2 delay 0: 89115148 in 10 secs <<< + 8.7%
+
+i3, with 2 cores and with hyperthreading enabled.  Interleave 2 is used in
+order to exercise the full cores first.  HT partially hides the delay from
+cacheline thrashing, thus the improvement is "only" 8.7% if 4 threads are
+running.
+
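+For reference (my gloss, assuming a 64-byte cache line; not patch
+text): on SMP, ____cacheline_aligned_in_smp is an alignment attribute,
+so sizeof(struct sem) is rounded up to a cache-line multiple and no two
+semaphores of the array share a line:
+
+  #define DEMO_CACHELINE_ALIGNED __attribute__((__aligned__(64)))
+
+  struct sem_demo {
+          int semval;             /* current value */
+          int sempid;             /* pid of last operation */
+  } DEMO_CACHELINE_ALIGNED;
+
+  _Static_assert(sizeof(struct sem_demo) % 64 == 0,
+                 "each array element starts on its own cache line");
+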
+Signed-off-by: Manfred Spraul <manfred@colorfullife.com>
+Cc: Rik van Riel <riel@redhat.com>
+Cc: Davidlohr Bueso <davidlohr.bueso@hp.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Mike Galbraith <efault@gmx.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ ipc/sem.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/ipc/sem.c
++++ b/ipc/sem.c
+@@ -96,7 +96,7 @@ struct sem {
+       int     sempid;         /* pid of last operation */
+       spinlock_t      lock;   /* spinlock for fine-grained semtimedop */
+       struct list_head sem_pending; /* pending single-sop operations */
+-};
++} ____cacheline_aligned_in_smp;
+ /* One queue for each sleeping process in the system. */
+ struct sem_queue {
diff --git a/queue-3.10/ipc-sem.c-rename-try_atomic_semop-to-perform_atomic_semop-docu-update.patch b/queue-3.10/ipc-sem.c-rename-try_atomic_semop-to-perform_atomic_semop-docu-update.patch
new file mode 100644
index 0000000..09adc2e
--- /dev/null
@@ -0,0 +1,113 @@
+From 758a6ba39ef6df4cdc615e5edd7bd86eab81a5f7 Mon Sep 17 00:00:00 2001
+From: Manfred Spraul <manfred@colorfullife.com>
+Date: Mon, 8 Jul 2013 16:01:26 -0700
+Subject: ipc/sem.c: rename try_atomic_semop() to perform_atomic_semop(), docu update
+
+From: Manfred Spraul <manfred@colorfullife.com>
+
+commit 758a6ba39ef6df4cdc615e5edd7bd86eab81a5f7 upstream.
+
+Cleanup: Some minor points that I noticed while writing the previous
+patches
+
+1) The name try_atomic_semop() is misleading: The function performs the
+   operation (if it is possible).
+
+2) Some documentation updates.
+
+No real code change, a rename and documentation changes.
+
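+The caller pattern implied by the documented return contract (a sketch
+of the usual semtimedop() flow, not new code in this patch):
+
+  error = perform_atomic_semop(sma, sops, nsops, un, pid);
+  if (error == 0) {
+          /* operation was performed: update queues, wake waiters */
+  } else if (error > 0) {
+          /* operation is currently impossible: enqueue and sleep */
+  } else {
+          /* hard error, e.g. -ERANGE, or -EAGAIN with IPC_NOWAIT */
+  }
+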
+Signed-off-by: Manfred Spraul <manfred@colorfullife.com>
+Cc: Rik van Riel <riel@redhat.com>
+Cc: Davidlohr Bueso <davidlohr.bueso@hp.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Mike Galbraith <efault@gmx.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ ipc/sem.c |   32 +++++++++++++++++++++-----------
+ 1 file changed, 21 insertions(+), 11 deletions(-)
+
+--- a/ipc/sem.c
++++ b/ipc/sem.c
+@@ -154,12 +154,15 @@ static int sysvipc_sem_proc_show(struct
+ #define SEMOPM_FAST   64  /* ~ 372 bytes on stack */
+ /*
+- * linked list protection:
++ * Locking:
+  *    sem_undo.id_next,
++ *    sem_array.complex_count,
+ *    sem_array.pending{_alter,_const},
+- *    sem_array.sem_undo: sem_lock() for read/write
++ *    sem_array.sem_undo: global sem_lock() for read/write
+  *    sem_undo.proc_next: only "current" is allowed to read/write that field.
+  *    
++ *    sem_array.sem_base[i].pending_{const,alter}:
++ *            global or semaphore sem_lock() for read/write
+  */
+ #define sc_semmsl     sem_ctls[0]
+@@ -536,12 +539,19 @@ SYSCALL_DEFINE3(semget, key_t, key, int,
+       return ipcget(ns, &sem_ids(ns), &sem_ops, &sem_params);
+ }
+-/*
+- * Determine whether a sequence of semaphore operations would succeed
+- * all at once. Return 0 if yes, 1 if need to sleep, else return error code.
++/** perform_atomic_semop - Perform (if possible) a semaphore operation
++ * @sma: semaphore array
++ * @sops: array with operations that should be checked
++ * @nsops: number of sops
++ * @un: undo array
++ * @pid: pid that did the change
++ *
++ * Returns 0 if the operation was possible.
++ * Returns 1 if the operation is impossible, the caller must sleep.
++ * Negative values are error codes.
+  */
+-static int try_atomic_semop (struct sem_array * sma, struct sembuf * sops,
++static int perform_atomic_semop(struct sem_array *sma, struct sembuf *sops,
+                            int nsops, struct sem_undo *un, int pid)
+ {
+       int result, sem_op;
+@@ -724,8 +734,8 @@ static int wake_const_ops(struct sem_arr
+               q = container_of(walk, struct sem_queue, list);
+               walk = walk->next;
+-              error = try_atomic_semop(sma, q->sops, q->nsops,
+-                                              q->undo, q->pid);
++              error = perform_atomic_semop(sma, q->sops, q->nsops,
++                                               q->undo, q->pid);
+               if (error <= 0) {
+                       /* operation completed, remove from queue & wakeup */
+@@ -838,7 +848,7 @@ again:
+               if (semnum != -1 && sma->sem_base[semnum].semval == 0)
+                       break;
+-              error = try_atomic_semop(sma, q->sops, q->nsops,
++              error = perform_atomic_semop(sma, q->sops, q->nsops,
+                                        q->undo, q->pid);
+               /* Does q->sleeper still need to sleep? */
+@@ -1686,7 +1696,6 @@ static int get_queue_result(struct sem_q
+       return error;
+ }
+-
+ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
+               unsigned, nsops, const struct timespec __user *, timeout)
+ {
+@@ -1784,7 +1793,8 @@ SYSCALL_DEFINE4(semtimedop, int, semid,
+       if (un && un->semid == -1)
+               goto out_unlock_free;
+-      error = try_atomic_semop (sma, sops, nsops, un, task_tgid_vnr(current));
++      error = perform_atomic_semop(sma, sops, nsops, un,
++                                      task_tgid_vnr(current));
+       if (error <= 0) {
+               if (alter && error == 0)
+                       do_smart_update(sma, sops, nsops, 1, &tasks);
diff --git a/queue-3.10/ipc-sem.c-replace-shared-sem_otime-with-per-semaphore-value.patch b/queue-3.10/ipc-sem.c-replace-shared-sem_otime-with-per-semaphore-value.patch
new file mode 100644
index 0000000..8daa594
--- /dev/null
@@ -0,0 +1,123 @@
+From d12e1e50e47e0900dbbf52237b7e171f4f15ea1e Mon Sep 17 00:00:00 2001
+From: Manfred Spraul <manfred@colorfullife.com>
+Date: Mon, 8 Jul 2013 16:01:25 -0700
+Subject: ipc/sem.c: replace shared sem_otime with per-semaphore value
+
+From: Manfred Spraul <manfred@colorfullife.com>
+
+commit d12e1e50e47e0900dbbf52237b7e171f4f15ea1e upstream.
+
+sem_otime contains the time of the last semaphore operation that
+completed successfully.  Every operation updates this value, thus access
+from multiple cpus can cause thrashing.
+
+Therefore the patch replaces the variable with a per-semaphore variable.
+The per-array sem_otime is only calculated when required.
+
+No performance improvement on a single-socket i3 - only important for
+larger systems.
+
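+Userspace still observes a single sem_otime; it is merely aggregated on
+demand via get_semotime().  Sketch of the visible behaviour for an
+existing set id (union semun must be declared by the caller on Linux):
+
+  union semun { int val; struct semid_ds *buf; unsigned short *array; };
+
+  struct semid_ds ds;
+  union semun arg = { .buf = &ds };
+  semctl(id, 0, IPC_STAT, arg);
+  /* ds.sem_otime == maximum of the per-semaphore sem_otime values */
+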
+Signed-off-by: Manfred Spraul <manfred@colorfullife.com>
+Cc: Rik van Riel <riel@redhat.com>
+Cc: Davidlohr Bueso <davidlohr.bueso@hp.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Mike Galbraith <efault@gmx.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ include/linux/sem.h |    1 -
+ ipc/sem.c           |   37 +++++++++++++++++++++++++++++++------
+ 2 files changed, 31 insertions(+), 7 deletions(-)
+
+--- a/include/linux/sem.h
++++ b/include/linux/sem.h
+@@ -12,7 +12,6 @@ struct task_struct;
+ struct sem_array {
+       struct kern_ipc_perm    ____cacheline_aligned_in_smp
+                               sem_perm;       /* permissions .. see ipc.h */
+-      time_t                  sem_otime;      /* last semop time */
+       time_t                  sem_ctime;      /* last change time */
+       struct sem              *sem_base;      /* ptr to first semaphore in array */
+       struct list_head        pending_alter;  /* pending operations */
+--- a/ipc/sem.c
++++ b/ipc/sem.c
+@@ -99,6 +99,7 @@ struct sem {
+                                       /* that alter the semaphore */
+       struct list_head pending_const; /* pending single-sop operations */
+                                       /* that do not alter the semaphore*/
++      time_t  sem_otime;      /* candidate for sem_otime */
+ } ____cacheline_aligned_in_smp;
+ /* One queue for each sleeping process in the system. */
+@@ -911,8 +912,14 @@ static void do_smart_update(struct sem_a
+                       }
+               }
+       }
+-      if (otime)
+-              sma->sem_otime = get_seconds();
++      if (otime) {
++              if (sops == NULL) {
++                      sma->sem_base[0].sem_otime = get_seconds();
++              } else {
++                      sma->sem_base[sops[0].sem_num].sem_otime =
++                                                              get_seconds();
++              }
++      }
+ }
+@@ -1058,6 +1065,21 @@ static unsigned long copy_semid_to_user(
+       }
+ }
++static time_t get_semotime(struct sem_array *sma)
++{
++      int i;
++      time_t res;
++
++      res = sma->sem_base[0].sem_otime;
++      for (i = 1; i < sma->sem_nsems; i++) {
++              time_t to = sma->sem_base[i].sem_otime;
++
++              if (to > res)
++                      res = to;
++      }
++      return res;
++}
++
+ static int semctl_nolock(struct ipc_namespace *ns, int semid,
+                        int cmd, int version, void __user *p)
+ {
+@@ -1131,9 +1153,9 @@ static int semctl_nolock(struct ipc_name
+                       goto out_unlock;
+               kernel_to_ipc64_perm(&sma->sem_perm, &tbuf.sem_perm);
+-              tbuf.sem_otime  = sma->sem_otime;
+-              tbuf.sem_ctime  = sma->sem_ctime;
+-              tbuf.sem_nsems  = sma->sem_nsems;
++              tbuf.sem_otime = get_semotime(sma);
++              tbuf.sem_ctime = sma->sem_ctime;
++              tbuf.sem_nsems = sma->sem_nsems;
+               rcu_read_unlock();
+               if (copy_semid_to_user(p, &tbuf, version))
+                       return -EFAULT;
+@@ -2025,6 +2047,9 @@ static int sysvipc_sem_proc_show(struct
+ {
+       struct user_namespace *user_ns = seq_user_ns(s);
+       struct sem_array *sma = it;
++      time_t sem_otime;
++
++      sem_otime = get_semotime(sma);
+       return seq_printf(s,
+                         "%10d %10d  %4o %10u %5u %5u %5u %5u %10lu %10lu\n",
+@@ -2036,7 +2061,7 @@ static int sysvipc_sem_proc_show(struct
+                         from_kgid_munged(user_ns, sma->sem_perm.gid),
+                         from_kuid_munged(user_ns, sma->sem_perm.cuid),
+                         from_kgid_munged(user_ns, sma->sem_perm.cgid),
+-                        sma->sem_otime,
++                        sem_otime,
+                         sma->sem_ctime);
+ }
+ #endif
diff --git a/queue-3.10/ipc-util.c-ipc_rcu_alloc-cacheline-align-allocation.patch b/queue-3.10/ipc-util.c-ipc_rcu_alloc-cacheline-align-allocation.patch
new file mode 100644
index 0000000..89598f3
--- /dev/null
@@ -0,0 +1,71 @@
+From 196aa0132fc7261f34b10ae1bfb44abc1bc69b3c Mon Sep 17 00:00:00 2001
+From: Manfred Spraul <manfred@colorfullife.com>
+Date: Mon, 8 Jul 2013 16:01:20 -0700
+Subject: ipc/util.c, ipc_rcu_alloc: cacheline align allocation
+
+From: Manfred Spraul <manfred@colorfullife.com>
+
+commit 196aa0132fc7261f34b10ae1bfb44abc1bc69b3c upstream.
+
+Enforce that ipc_rcu_alloc returns a cacheline aligned pointer on SMP.
+
+Rationale:
+
+The SysV sem code tries to move the main spinlock into a separate
+cacheline (____cacheline_aligned_in_smp).  This works only if
+ipc_rcu_alloc returns cacheline-aligned pointers.  vmalloc and kmalloc
+do return cacheline-aligned pointers, but the implementation of
+ipc_rcu_alloc breaks that.
+
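+Layout sketch (my illustration, simplified to kmalloc; the real code
+goes through ipc_alloc, which picks kmalloc or vmalloc by size): the
+rcu header is padded to a full cache line, so the payload that follows
+it inherits the allocator's cache-line alignment:
+
+  struct ipc_rcu *p = kmalloc(sizeof(*p) + size, GFP_KERNEL);
+  void *payload = p + 1;  /* header size is a cache-line multiple,
+                             so the payload starts on a fresh line */
+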
+[akpm@linux-foundation.org: coding-style fixes]
+Signed-off-by: Manfred Spraul <manfred@colorfullife.com>
+Cc: Rik van Riel <riel@redhat.com>
+Cc: Davidlohr Bueso <davidlohr.bueso@hp.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Mike Galbraith <efault@gmx.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ ipc/util.c |   12 ++++++------
+ 1 file changed, 6 insertions(+), 6 deletions(-)
+
+--- a/ipc/util.c
++++ b/ipc/util.c
+@@ -468,9 +468,7 @@ void ipc_free(void* ptr, int size)
+ struct ipc_rcu {
+       struct rcu_head rcu;
+       atomic_t refcount;
+-      /* "void *" makes sure alignment of following data is sane. */
+-      void *data[0];
+-};
++} ____cacheline_aligned_in_smp;
+ /**
+  *    ipc_rcu_alloc   -       allocate ipc and rcu space 
+@@ -488,12 +486,14 @@ void *ipc_rcu_alloc(int size)
+       if (unlikely(!out))
+               return NULL;
+       atomic_set(&out->refcount, 1);
+-      return out->data;
++      return out + 1;
+ }
+ int ipc_rcu_getref(void *ptr)
+ {
+-      return atomic_inc_not_zero(&container_of(ptr, struct ipc_rcu, data)->refcount);
++      struct ipc_rcu *p = ((struct ipc_rcu *)ptr) - 1;
++
++      return atomic_inc_not_zero(&p->refcount);
+ }
+ /**
+@@ -507,7 +507,7 @@ static void ipc_schedule_free(struct rcu
+ void ipc_rcu_putref(void *ptr)
+ {
+-      struct ipc_rcu *p = container_of(ptr, struct ipc_rcu, data);
++      struct ipc_rcu *p = ((struct ipc_rcu *)ptr) - 1;
+       if (!atomic_dec_and_test(&p->refcount))
+               return;
diff --git a/queue-3.10/series b/queue-3.10/series
index 456597e69cc8b45d30d5317af05e5b9dae929cd5..74a2f032320e01915c0874fd42b4aba79fab61a2 100644
@@ -38,3 +38,9 @@ ipc-msg-make-msgctl_nolock-lockless.patch
 ipc-msg-shorten-critical-region-in-msgsnd.patch
 ipc-msg-shorten-critical-region-in-msgrcv.patch
 ipc-remove-unused-functions.patch
+ipc-util.c-ipc_rcu_alloc-cacheline-align-allocation.patch
+ipc-sem.c-cacheline-align-the-semaphore-structures.patch
+ipc-sem-separate-wait-for-zero-and-alter-tasks-into-seperate-queues.patch
+ipc-sem.c-always-use-only-one-queue-for-alter-operations.patch
+ipc-sem.c-replace-shared-sem_otime-with-per-semaphore-value.patch
+ipc-sem.c-rename-try_atomic_semop-to-perform_atomic_semop-docu-update.patch