--- /dev/null
+From 1a82e9e1d0f1b45f47a97c9e2349020536ff8987 Mon Sep 17 00:00:00 2001
+From: Manfred Spraul <manfred@colorfullife.com>
+Date: Mon, 8 Jul 2013 16:01:23 -0700
+Subject: ipc/sem: separate wait-for-zero and alter tasks into seperate queues
+
+From: Manfred Spraul <manfred@colorfullife.com>
+
+commit 1a82e9e1d0f1b45f47a97c9e2349020536ff8987 upstream.
+
+Introduce separate queues for operations that do not modify the
+semaphore values (a rough model is sketched below). Advantages:
+
+ - Simpler logic in check_restart().
+ - Faster update_queue(): without this patch, all wait-for-zero
+   operations are always tested, even if the semaphore value is not 0.
+ - wait-for-zero operations regain priority, as in Linux <= 3.0.9.
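+
+A rough user-space model of the split (hypothetical waiter/semaphore
+types, not code from this patch): wait-for-zero waiters only have to be
+scanned when a semaphore value actually reaches zero.
+
+    #include <stdio.h>
+
+    struct waiter { int op; struct waiter *next; }; /* op == 0: wait-for-zero */
+
+    struct semaphore {
+        int val;
+        struct waiter *pending_alter;  /* sleeping decrements        */
+        struct waiter *pending_const;  /* sleeping wait-for-zero ops */
+    };
+
+    /* Called after an alter operation changed the value: the const
+     * queue is only walked when the new value is zero. */
+    static void wake_const(struct semaphore *s)
+    {
+        struct waiter *w;
+
+        if (s->val != 0)
+            return;
+        for (w = s->pending_const; w; w = w->next)
+            printf("wake wait-for-zero waiter, op %d\n", w->op);
+    }
+
+    int main(void)
+    {
+        struct waiter z = { 0, NULL };
+        struct semaphore s = { 0, NULL, &z };
+
+        wake_const(&s);  /* value is zero: the waiter is woken */
+        return 0;
+    }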
+
+Signed-off-by: Manfred Spraul <manfred@colorfullife.com>
+Cc: Rik van Riel <riel@redhat.com>
+Cc: Davidlohr Bueso <davidlohr.bueso@hp.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Mike Galbraith <efault@gmx.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ include/linux/sem.h | 5 -
+ ipc/sem.c | 211 +++++++++++++++++++++++++++++++++++++---------------
+ 2 files changed, 155 insertions(+), 61 deletions(-)
+
+--- a/include/linux/sem.h
++++ b/include/linux/sem.h
+@@ -15,7 +15,10 @@ struct sem_array {
+ time_t sem_otime; /* last semop time */
+ time_t sem_ctime; /* last change time */
+ struct sem *sem_base; /* ptr to first semaphore in array */
+- struct list_head sem_pending; /* pending operations to be processed */
++ struct list_head pending_alter; /* pending operations */
++ /* that alter the array */
++ struct list_head pending_const; /* pending complex operations */
++ /* that do not alter semvals */
+ struct list_head list_id; /* undo requests on this array */
+ int sem_nsems; /* no. of semaphores in array */
+ int complex_count; /* pending complex operations */
+--- a/ipc/sem.c
++++ b/ipc/sem.c
+@@ -95,7 +95,10 @@ struct sem {
+ int semval; /* current value */
+ int sempid; /* pid of last operation */
+ spinlock_t lock; /* spinlock for fine-grained semtimedop */
+- struct list_head sem_pending; /* pending single-sop operations */
++ struct list_head pending_alter; /* pending single-sop operations */
++ /* that alter the semaphore */
++ struct list_head pending_const; /* pending single-sop operations */
++ /* that do not alter the semaphore*/
+ } ____cacheline_aligned_in_smp;
+
+ /* One queue for each sleeping process in the system. */
+@@ -152,7 +155,7 @@ static int sysvipc_sem_proc_show(struct
+ /*
+ * linked list protection:
+ * sem_undo.id_next,
+- * sem_array.sem_pending{,last},
++ * sem_array.pending{_alter,_cont},
+ * sem_array.sem_undo: sem_lock() for read/write
+ * sem_undo.proc_next: only "current" is allowed to read/write that field.
+ *
+@@ -337,7 +340,7 @@ static inline void sem_rmid(struct ipc_n
+ * Without the check/retry algorithm a lockless wakeup is possible:
+ * - queue.status is initialized to -EINTR before blocking.
+ * - wakeup is performed by
+- * * unlinking the queue entry from sma->sem_pending
++ * * unlinking the queue entry from the pending list
+ * * setting queue.status to IN_WAKEUP
+ * This is the notification for the blocked thread that a
+ * result value is imminent.
+@@ -418,12 +421,14 @@ static int newary(struct ipc_namespace *
+ sma->sem_base = (struct sem *) &sma[1];
+
+ for (i = 0; i < nsems; i++) {
+- INIT_LIST_HEAD(&sma->sem_base[i].sem_pending);
++ INIT_LIST_HEAD(&sma->sem_base[i].pending_alter);
++ INIT_LIST_HEAD(&sma->sem_base[i].pending_const);
+ spin_lock_init(&sma->sem_base[i].lock);
+ }
+
+ sma->complex_count = 0;
+- INIT_LIST_HEAD(&sma->sem_pending);
++ INIT_LIST_HEAD(&sma->pending_alter);
++ INIT_LIST_HEAD(&sma->pending_const);
+ INIT_LIST_HEAD(&sma->list_id);
+ sma->sem_nsems = nsems;
+ sma->sem_ctime = get_seconds();
+@@ -609,60 +614,132 @@ static void unlink_queue(struct sem_arra
+ * update_queue is O(N^2) when it restarts scanning the whole queue of
+ * waiting operations. Therefore this function checks if the restart is
+ * really necessary. It is called after a previously waiting operation
+- * was completed.
++ * modified the array.
++ * Note that wait-for-zero operations are handled without restart.
+ */
+ static int check_restart(struct sem_array *sma, struct sem_queue *q)
+ {
+- struct sem *curr;
+- struct sem_queue *h;
+-
+- /* if the operation didn't modify the array, then no restart */
+- if (q->alter == 0)
+- return 0;
+-
+- /* pending complex operations are too difficult to analyse */
+- if (sma->complex_count)
++ /* pending complex alter operations are too difficult to analyse */
++ if (!list_empty(&sma->pending_alter))
+ return 1;
+
+ /* we were a sleeping complex operation. Too difficult */
+ if (q->nsops > 1)
+ return 1;
+
+- curr = sma->sem_base + q->sops[0].sem_num;
++ /* It is impossible that someone waits for the new value:
++ * - complex operations always restart.
++ * - wait-for-zero are handled seperately.
++ * - q is a previously sleeping simple operation that
++ * altered the array. It must be a decrement, because
++ * simple increments never sleep.
++ * - If there are older (higher priority) decrements
++ * in the queue, then they have observed the original
++ * semval value and couldn't proceed. The operation
++ * decremented to value - thus they won't proceed either.
++ */
++ return 0;
++}
+
+- /* No-one waits on this queue */
+- if (list_empty(&curr->sem_pending))
+- return 0;
++/**
++ * wake_const_ops(sma, semnum, pt) - Wake up non-alter tasks
++ * @sma: semaphore array.
++ * @semnum: semaphore that was modified.
++ * @pt: list head for the tasks that must be woken up.
++ *
++ * wake_const_ops must be called after a semaphore in a semaphore array
++ * was set to 0. If complex const operations are pending, wake_const_ops must
++ * be called with semnum = -1, as well as with the number of each modified
++ * semaphore.
++ * The tasks that must be woken up are added to @pt. The return code
++ * is stored in q->pid.
++ * The function returns 1 if at least one operation was completed successfully.
++ */
++static int wake_const_ops(struct sem_array *sma, int semnum,
++ struct list_head *pt)
++{
++ struct sem_queue *q;
++ struct list_head *walk;
++ struct list_head *pending_list;
++ int semop_completed = 0;
++
++ if (semnum == -1)
++ pending_list = &sma->pending_const;
++ else
++ pending_list = &sma->sem_base[semnum].pending_const;
++
++ walk = pending_list->next;
++ while (walk != pending_list) {
++ int error;
++
++ q = container_of(walk, struct sem_queue, list);
++ walk = walk->next;
++
++ error = try_atomic_semop(sma, q->sops, q->nsops,
++ q->undo, q->pid);
++
++ if (error <= 0) {
++ /* operation completed, remove from queue & wakeup */
++
++ unlink_queue(sma, q);
++
++ wake_up_sem_queue_prepare(pt, q, error);
++ if (error == 0)
++ semop_completed = 1;
++ }
++ }
++ return semop_completed;
++}
+
+- /* the new semaphore value */
+- if (curr->semval) {
+- /* It is impossible that someone waits for the new value:
+- * - q is a previously sleeping simple operation that
+- * altered the array. It must be a decrement, because
+- * simple increments never sleep.
+- * - The value is not 0, thus wait-for-zero won't proceed.
+- * - If there are older (higher priority) decrements
+- * in the queue, then they have observed the original
+- * semval value and couldn't proceed. The operation
+- * decremented to value - thus they won't proceed either.
++/**
++ * do_smart_wakeup_zero(sma, sops, nsops, pt) - wakeup all wait for zero tasks
++ * @sma: semaphore array
++ * @sops: operations that were performed
++ * @nsops: number of operations
++ * @pt: list head of the tasks that must be woken up.
++ *
++ * do_smart_wakeup_zero() checks all required queue for wait-for-zero
++ * operations, based on the actual changes that were performed on the
++ * semaphore array.
++ * The function returns 1 if at least one operation was completed successfully.
++ */
++static int do_smart_wakeup_zero(struct sem_array *sma, struct sembuf *sops,
++ int nsops, struct list_head *pt)
++{
++ int i;
++ int semop_completed = 0;
++ int got_zero = 0;
++
++ /* first: the per-semaphore queues, if known */
++ if (sops) {
++ for (i = 0; i < nsops; i++) {
++ int num = sops[i].sem_num;
++
++ if (sma->sem_base[num].semval == 0) {
++ got_zero = 1;
++ semop_completed |= wake_const_ops(sma, num, pt);
++ }
++ }
++ } else {
++ /*
++ * No sops means modified semaphores not known.
++ * Assume all were changed.
+ */
+- BUG_ON(q->sops[0].sem_op >= 0);
+- return 0;
++ for (i = 0; i < sma->sem_nsems; i++) {
++ if (sma->sem_base[i].semval == 0) {
++ got_zero = 1;
++ semop_completed |= wake_const_ops(sma, i, pt);
++ }
++ }
+ }
+ /*
+- * semval is 0. Check if there are wait-for-zero semops.
+- * They must be the first entries in the per-semaphore queue
++ * If one of the modified semaphores got 0,
++ * then check the global queue, too.
+ */
+- h = list_first_entry(&curr->sem_pending, struct sem_queue, list);
+- BUG_ON(h->nsops != 1);
+- BUG_ON(h->sops[0].sem_num != q->sops[0].sem_num);
++ if (got_zero)
++ semop_completed |= wake_const_ops(sma, -1, pt);
+
+- /* Yes, there is a wait-for-zero semop. Restart */
+- if (h->sops[0].sem_op == 0)
+- return 1;
+-
+- /* Again - no-one is waiting for the new value. */
+- return 0;
++ return semop_completed;
+ }
+
+
+@@ -678,6 +755,8 @@ static int check_restart(struct sem_arra
+ * semaphore.
+ * The tasks that must be woken up are added to @pt. The return code
+ * is stored in q->pid.
++ * The function internally checks if const operations can now succeed.
++ *
+ * The function return 1 if at least one semop was completed successfully.
+ */
+ static int update_queue(struct sem_array *sma, int semnum, struct list_head *pt)
+@@ -688,9 +767,9 @@ static int update_queue(struct sem_array
+ int semop_completed = 0;
+
+ if (semnum == -1)
+- pending_list = &sma->sem_pending;
++ pending_list = &sma->pending_alter;
+ else
+- pending_list = &sma->sem_base[semnum].sem_pending;
++ pending_list = &sma->sem_base[semnum].pending_alter;
+
+ again:
+ walk = pending_list->next;
+@@ -702,13 +781,12 @@ again:
+
+ /* If we are scanning the single sop, per-semaphore list of
+ * one semaphore and that semaphore is 0, then it is not
+- * necessary to scan the "alter" entries: simple increments
++ * necessary to scan further: simple increments
+ * that affect only one entry succeed immediately and cannot
+ * be in the per semaphore pending queue, and decrements
+ * cannot be successful if the value is already 0.
+ */
+- if (semnum != -1 && sma->sem_base[semnum].semval == 0 &&
+- q->alter)
++ if (semnum != -1 && sma->sem_base[semnum].semval == 0)
+ break;
+
+ error = try_atomic_semop(sma, q->sops, q->nsops,
+@@ -724,6 +802,7 @@ again:
+ restart = 0;
+ } else {
+ semop_completed = 1;
++ do_smart_wakeup_zero(sma, q->sops, q->nsops, pt);
+ restart = check_restart(sma, q);
+ }
+
+@@ -742,8 +821,8 @@ again:
+ * @otime: force setting otime
+ * @pt: list head of the tasks that must be woken up.
+ *
+- * do_smart_update() does the required called to update_queue, based on the
+- * actual changes that were performed on the semaphore array.
++ * do_smart_update() does the required calls to update_queue and wakeup_zero,
++ * based on the actual changes that were performed on the semaphore array.
+ * Note that the function does not do the actual wake-up: the caller is
+ * responsible for calling wake_up_sem_queue_do(@pt).
+ * It is safe to perform this call after dropping all locks.
+@@ -754,6 +833,8 @@ static void do_smart_update(struct sem_a
+ int i;
+ int progress;
+
++ otime |= do_smart_wakeup_zero(sma, sops, nsops, pt);
++
+ progress = 1;
+ retry_global:
+ if (sma->complex_count) {
+@@ -813,14 +894,14 @@ static int count_semncnt (struct sem_arr
+ struct sem_queue * q;
+
+ semncnt = 0;
+- list_for_each_entry(q, &sma->sem_base[semnum].sem_pending, list) {
++ list_for_each_entry(q, &sma->sem_base[semnum].pending_alter, list) {
+ struct sembuf * sops = q->sops;
+ BUG_ON(sops->sem_num != semnum);
+ if ((sops->sem_op < 0) && !(sops->sem_flg & IPC_NOWAIT))
+ semncnt++;
+ }
+
+- list_for_each_entry(q, &sma->sem_pending, list) {
++ list_for_each_entry(q, &sma->pending_alter, list) {
+ struct sembuf * sops = q->sops;
+ int nsops = q->nsops;
+ int i;
+@@ -839,14 +920,14 @@ static int count_semzcnt (struct sem_arr
+ struct sem_queue * q;
+
+ semzcnt = 0;
+- list_for_each_entry(q, &sma->sem_base[semnum].sem_pending, list) {
++ list_for_each_entry(q, &sma->sem_base[semnum].pending_const, list) {
+ struct sembuf * sops = q->sops;
+ BUG_ON(sops->sem_num != semnum);
+ if ((sops->sem_op == 0) && !(sops->sem_flg & IPC_NOWAIT))
+ semzcnt++;
+ }
+
+- list_for_each_entry(q, &sma->sem_pending, list) {
++ list_for_each_entry(q, &sma->pending_const, list) {
+ struct sembuf * sops = q->sops;
+ int nsops = q->nsops;
+ int i;
+@@ -884,13 +965,22 @@ static void freeary(struct ipc_namespace
+
+ /* Wake up all pending processes and let them fail with EIDRM. */
+ INIT_LIST_HEAD(&tasks);
+- list_for_each_entry_safe(q, tq, &sma->sem_pending, list) {
++ list_for_each_entry_safe(q, tq, &sma->pending_const, list) {
++ unlink_queue(sma, q);
++ wake_up_sem_queue_prepare(&tasks, q, -EIDRM);
++ }
++
++ list_for_each_entry_safe(q, tq, &sma->pending_alter, list) {
+ unlink_queue(sma, q);
+ wake_up_sem_queue_prepare(&tasks, q, -EIDRM);
+ }
+ for (i = 0; i < sma->sem_nsems; i++) {
+ struct sem *sem = sma->sem_base + i;
+- list_for_each_entry_safe(q, tq, &sem->sem_pending, list) {
++ list_for_each_entry_safe(q, tq, &sem->pending_const, list) {
++ unlink_queue(sma, q);
++ wake_up_sem_queue_prepare(&tasks, q, -EIDRM);
++ }
++ list_for_each_entry_safe(q, tq, &sem->pending_alter, list) {
+ unlink_queue(sma, q);
+ wake_up_sem_queue_prepare(&tasks, q, -EIDRM);
+ }
+@@ -1658,14 +1748,15 @@ SYSCALL_DEFINE4(semtimedop, int, semid,
+ curr = &sma->sem_base[sops->sem_num];
+
+ if (alter)
+- list_add_tail(&queue.list, &curr->sem_pending);
++ list_add_tail(&queue.list, &curr->pending_alter);
+ else
+- list_add(&queue.list, &curr->sem_pending);
++ list_add_tail(&queue.list, &curr->pending_const);
+ } else {
+ if (alter)
+- list_add_tail(&queue.list, &sma->sem_pending);
++ list_add_tail(&queue.list, &sma->pending_alter);
+ else
+- list_add(&queue.list, &sma->sem_pending);
++ list_add_tail(&queue.list, &sma->pending_const);
++
+ sma->complex_count++;
+ }
+
--- /dev/null
+From f269f40ad5aeee229ed70044926f44318abe41ef Mon Sep 17 00:00:00 2001
+From: Manfred Spraul <manfred@colorfullife.com>
+Date: Mon, 8 Jul 2013 16:01:24 -0700
+Subject: ipc/sem.c: always use only one queue for alter operations
+
+From: Manfred Spraul <manfred@colorfullife.com>
+
+commit f269f40ad5aeee229ed70044926f44318abe41ef upstream.
+
+There are two places that can contain alter operations:
+ - the global queue: sma->pending_alter
+ - the per-semaphore queues: sma->sem_base[].pending_alter.
+
+Since one of the queues must be processed first, this causes an odd
+prioritization of the wakeups: complex operations have priority over
+simple ops.
+
+The patch restores the behavior of Linux <= 3.0.9: the longest waiting
+operation has the highest priority.
+
+This is done by using only one queue:
+ - if there are complex ops, then sma->pending_alter is used.
+ - otherwise, the per-semaphore queues are used.
+
+As a side effect, do_smart_update_queue() becomes much simpler: no more
+goto logic.
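+
+A simplified sketch of the resulting queue selection for a sleeping
+single-sop alter operation (hypothetical helper, not part of the patch;
+the real logic is in semtimedop() and unmerge_queues() in the diff
+below):
+
+    static struct list_head *alter_queue(struct sem_array *sma,
+                                         struct sem *curr)
+    {
+        /* complex ops around: everybody queues up globally, in
+         * arrival order, so FIFO wakeup is preserved */
+        if (sma->complex_count)
+            return &sma->pending_alter;
+        /* otherwise the per-semaphore queue is sufficient */
+        return &curr->pending_alter;
+    }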
+
+Signed-off-by: Manfred Spraul <manfred@colorfullife.com>
+Cc: Rik van Riel <riel@redhat.com>
+Cc: Davidlohr Bueso <davidlohr.bueso@hp.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Mike Galbraith <efault@gmx.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ ipc/sem.c | 128 ++++++++++++++++++++++++++++++++++++++++++--------------------
+ 1 file changed, 88 insertions(+), 40 deletions(-)
+
+--- a/ipc/sem.c
++++ b/ipc/sem.c
+@@ -192,6 +192,53 @@ void __init sem_init (void)
+ IPC_SEM_IDS, sysvipc_sem_proc_show);
+ }
+
++/**
++ * unmerge_queues - unmerge queues, if possible.
++ * @sma: semaphore array
++ *
++ * The function unmerges the wait queues if complex_count is 0.
++ * It must be called prior to dropping the global semaphore array lock.
++ */
++static void unmerge_queues(struct sem_array *sma)
++{
++ struct sem_queue *q, *tq;
++
++ /* complex operations still around? */
++ if (sma->complex_count)
++ return;
++ /*
++ * We will switch back to simple mode.
++ * Move all pending operation back into the per-semaphore
++ * queues.
++ */
++ list_for_each_entry_safe(q, tq, &sma->pending_alter, list) {
++ struct sem *curr;
++ curr = &sma->sem_base[q->sops[0].sem_num];
++
++ list_add_tail(&q->list, &curr->pending_alter);
++ }
++ INIT_LIST_HEAD(&sma->pending_alter);
++}
++
++/**
++ * merge_queues - Merge single semop queues into global queue
++ * @sma: semaphore array
++ *
++ * This function merges all per-semaphore queues into the global queue.
++ * It is necessary to achieve FIFO ordering for the pending single-sop
++ * operations when a multi-semop operation must sleep.
++ * Only the alter operations must be moved, the const operations can stay.
++ */
++static void merge_queues(struct sem_array *sma)
++{
++ int i;
++ for (i = 0; i < sma->sem_nsems; i++) {
++ struct sem *sem = sma->sem_base + i;
++
++ list_splice_init(&sem->pending_alter, &sma->pending_alter);
++ }
++}
++
+ /*
+ * If the request contains only one semaphore operation, and there are
+ * no complex transactions pending, lock only the semaphore involved.
+@@ -262,6 +309,7 @@ static inline int sem_lock(struct sem_ar
+ static inline void sem_unlock(struct sem_array *sma, int locknum)
+ {
+ if (locknum == -1) {
++ unmerge_queues(sma);
+ ipc_unlock_object(&sma->sem_perm);
+ } else {
+ struct sem *sem = sma->sem_base + locknum;
+@@ -831,49 +879,38 @@ static void do_smart_update(struct sem_a
+ int otime, struct list_head *pt)
+ {
+ int i;
+- int progress;
+
+ otime |= do_smart_wakeup_zero(sma, sops, nsops, pt);
+
+- progress = 1;
+-retry_global:
+- if (sma->complex_count) {
+- if (update_queue(sma, -1, pt)) {
+- progress = 1;
+- otime = 1;
+- sops = NULL;
+- }
+- }
+- if (!progress)
+- goto done;
+-
+- if (!sops) {
+- /* No semops; something special is going on. */
+- for (i = 0; i < sma->sem_nsems; i++) {
+- if (update_queue(sma, i, pt)) {
+- otime = 1;
+- progress = 1;
++ if (!list_empty(&sma->pending_alter)) {
++ /* semaphore array uses the global queue - just process it. */
++ otime |= update_queue(sma, -1, pt);
++ } else {
++ if (!sops) {
++ /*
++ * No sops, thus the modified semaphores are not
++ * known. Check all.
++ */
++ for (i = 0; i < sma->sem_nsems; i++)
++ otime |= update_queue(sma, i, pt);
++ } else {
++ /*
++ * Check the semaphores that were increased:
++ * - No complex ops, thus all sleeping ops are
++ * decrease.
++ * - if we decreased the value, then any sleeping
++ * semaphore ops wont be able to run: If the
++ * previous value was too small, then the new
++ * value will be too small, too.
++ */
++ for (i = 0; i < nsops; i++) {
++ if (sops[i].sem_op > 0) {
++ otime |= update_queue(sma,
++ sops[i].sem_num, pt);
++ }
+ }
+ }
+- goto done_checkretry;
+- }
+-
+- /* Check the semaphores that were modified. */
+- for (i = 0; i < nsops; i++) {
+- if (sops[i].sem_op > 0 ||
+- (sops[i].sem_op < 0 &&
+- sma->sem_base[sops[i].sem_num].semval == 0))
+- if (update_queue(sma, sops[i].sem_num, pt)) {
+- otime = 1;
+- progress = 1;
+- }
+- }
+-done_checkretry:
+- if (progress) {
+- progress = 0;
+- goto retry_global;
+ }
+-done:
+ if (otime)
+ sma->sem_otime = get_seconds();
+ }
+@@ -1747,11 +1784,22 @@ SYSCALL_DEFINE4(semtimedop, int, semid,
+ struct sem *curr;
+ curr = &sma->sem_base[sops->sem_num];
+
+- if (alter)
+- list_add_tail(&queue.list, &curr->pending_alter);
+- else
++ if (alter) {
++ if (sma->complex_count) {
++ list_add_tail(&queue.list,
++ &sma->pending_alter);
++ } else {
++
++ list_add_tail(&queue.list,
++ &curr->pending_alter);
++ }
++ } else {
+ list_add_tail(&queue.list, &curr->pending_const);
++ }
+ } else {
++ if (!sma->complex_count)
++ merge_queues(sma);
++
+ if (alter)
+ list_add_tail(&queue.list, &sma->pending_alter);
+ else
--- /dev/null
+From f5c936c0f267ec58641451cf8b8d39b4c207ee4d Mon Sep 17 00:00:00 2001
+From: Manfred Spraul <manfred@colorfullife.com>
+Date: Mon, 8 Jul 2013 16:01:22 -0700
+Subject: ipc/sem.c: cacheline align the semaphore structures
+
+From: Manfred Spraul <manfred@colorfullife.com>
+
+commit f5c936c0f267ec58641451cf8b8d39b4c207ee4d upstream.
+
+As now each semaphore has its own spinlock and parallel operations are
+possible, give each semaphore its own cacheline.
+
+On an i3 laptop, this gives up to 28% better performance:
+
+ #semscale 10 | grep "interleave 2"
+ - before:
+ Cpus 1, interleave 2 delay 0: 36109234 in 10 secs
+ Cpus 2, interleave 2 delay 0: 55276317 in 10 secs
+ Cpus 3, interleave 2 delay 0: 62411025 in 10 secs
+ Cpus 4, interleave 2 delay 0: 81963928 in 10 secs
+
+ - after:
+ Cpus 1, interleave 2 delay 0: 35527306 in 10 secs
+ Cpus 2, interleave 2 delay 0: 70922909 in 10 secs <<< + 28%
+ Cpus 3, interleave 2 delay 0: 80518538 in 10 secs
+ Cpus 4, interleave 2 delay 0: 89115148 in 10 secs <<< + 8.7%
+
+i3, with 2 cores and with hyperthreading enabled. Interleave 2 is used
+in order to use the full cores first. HT partially hides the delay from
+cacheline thrashing, thus the improvement is "only" 8.7% when 4 threads
+are running.
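+
+The effect of ____cacheline_aligned_in_smp can be modelled in user space
+(the 64-byte line size and the struct below are assumptions, not part of
+the patch):
+
+    #include <assert.h>
+    #include <stdio.h>
+
+    /* one cache line per semaphore: the struct is padded to 64 bytes */
+    struct sem_model {
+        int semval;
+        int sempid;
+    } __attribute__((aligned(64)));
+
+    int main(void)
+    {
+        assert(sizeof(struct sem_model) == 64);  /* padded: no false sharing */
+        printf("per-semaphore stride: %zu bytes\n", sizeof(struct sem_model));
+        return 0;
+    }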
+
+Signed-off-by: Manfred Spraul <manfred@colorfullife.com>
+Cc: Rik van Riel <riel@redhat.com>
+Cc: Davidlohr Bueso <davidlohr.bueso@hp.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Mike Galbraith <efault@gmx.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ ipc/sem.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/ipc/sem.c
++++ b/ipc/sem.c
+@@ -96,7 +96,7 @@ struct sem {
+ int sempid; /* pid of last operation */
+ spinlock_t lock; /* spinlock for fine-grained semtimedop */
+ struct list_head sem_pending; /* pending single-sop operations */
+-};
++} ____cacheline_aligned_in_smp;
+
+ /* One queue for each sleeping process in the system. */
+ struct sem_queue {
--- /dev/null
+From 758a6ba39ef6df4cdc615e5edd7bd86eab81a5f7 Mon Sep 17 00:00:00 2001
+From: Manfred Spraul <manfred@colorfullife.com>
+Date: Mon, 8 Jul 2013 16:01:26 -0700
+Subject: ipc/sem.c: rename try_atomic_semop() to perform_atomic_semop(), docu update
+
+From: Manfred Spraul <manfred@colorfullife.com>
+
+commit 758a6ba39ef6df4cdc615e5edd7bd86eab81a5f7 upstream.
+
+Cleanup: Some minor points that I noticed while writing the previous
+patches:
+
+1) The name try_atomic_semop() is misleading: The function performs the
+ operation (if it is possible).
+
+2) Some documentation updates.
+
+No real code change, a rename and documentation changes.
+
+Signed-off-by: Manfred Spraul <manfred@colorfullife.com>
+Cc: Rik van Riel <riel@redhat.com>
+Cc: Davidlohr Bueso <davidlohr.bueso@hp.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Mike Galbraith <efault@gmx.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ ipc/sem.c | 32 +++++++++++++++++++++-----------
+ 1 file changed, 21 insertions(+), 11 deletions(-)
+
+--- a/ipc/sem.c
++++ b/ipc/sem.c
+@@ -154,12 +154,15 @@ static int sysvipc_sem_proc_show(struct
+ #define SEMOPM_FAST 64 /* ~ 372 bytes on stack */
+
+ /*
+- * linked list protection:
++ * Locking:
+ * sem_undo.id_next,
++ * sem_array.complex_count,
+ * sem_array.pending{_alter,_cont},
+- * sem_array.sem_undo: sem_lock() for read/write
++ * sem_array.sem_undo: global sem_lock() for read/write
+ * sem_undo.proc_next: only "current" is allowed to read/write that field.
+ *
++ * sem_array.sem_base[i].pending_{const,alter}:
++ * global or semaphore sem_lock() for read/write
+ */
+
+ #define sc_semmsl sem_ctls[0]
+@@ -536,12 +539,19 @@ SYSCALL_DEFINE3(semget, key_t, key, int,
+ return ipcget(ns, &sem_ids(ns), &sem_ops, &sem_params);
+ }
+
+-/*
+- * Determine whether a sequence of semaphore operations would succeed
+- * all at once. Return 0 if yes, 1 if need to sleep, else return error code.
++/** perform_atomic_semop - Perform (if possible) a semaphore operation
++ * @sma: semaphore array
++ * @sops: array with operations that should be checked
++ * @nsems: number of sops
++ * @un: undo array
++ * @pid: pid that did the change
++ *
++ * Returns 0 if the operation was possible.
++ * Returns 1 if the operation is impossible, the caller must sleep.
++ * Negative values are error codes.
+ */
+
+-static int try_atomic_semop (struct sem_array * sma, struct sembuf * sops,
++static int perform_atomic_semop(struct sem_array *sma, struct sembuf *sops,
+ int nsops, struct sem_undo *un, int pid)
+ {
+ int result, sem_op;
+@@ -724,8 +734,8 @@ static int wake_const_ops(struct sem_arr
+ q = container_of(walk, struct sem_queue, list);
+ walk = walk->next;
+
+- error = try_atomic_semop(sma, q->sops, q->nsops,
+- q->undo, q->pid);
++ error = perform_atomic_semop(sma, q->sops, q->nsops,
++ q->undo, q->pid);
+
+ if (error <= 0) {
+ /* operation completed, remove from queue & wakeup */
+@@ -838,7 +848,7 @@ again:
+ if (semnum != -1 && sma->sem_base[semnum].semval == 0)
+ break;
+
+- error = try_atomic_semop(sma, q->sops, q->nsops,
++ error = perform_atomic_semop(sma, q->sops, q->nsops,
+ q->undo, q->pid);
+
+ /* Does q->sleeper still need to sleep? */
+@@ -1686,7 +1696,6 @@ static int get_queue_result(struct sem_q
+ return error;
+ }
+
+-
+ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
+ unsigned, nsops, const struct timespec __user *, timeout)
+ {
+@@ -1784,7 +1793,8 @@ SYSCALL_DEFINE4(semtimedop, int, semid,
+ if (un && un->semid == -1)
+ goto out_unlock_free;
+
+- error = try_atomic_semop (sma, sops, nsops, un, task_tgid_vnr(current));
++ error = perform_atomic_semop(sma, sops, nsops, un,
++ task_tgid_vnr(current));
+ if (error <= 0) {
+ if (alter && error == 0)
+ do_smart_update(sma, sops, nsops, 1, &tasks);
--- /dev/null
+From d12e1e50e47e0900dbbf52237b7e171f4f15ea1e Mon Sep 17 00:00:00 2001
+From: Manfred Spraul <manfred@colorfullife.com>
+Date: Mon, 8 Jul 2013 16:01:25 -0700
+Subject: ipc/sem.c: replace shared sem_otime with per-semaphore value
+
+From: Manfred Spraul <manfred@colorfullife.com>
+
+commit d12e1e50e47e0900dbbf52237b7e171f4f15ea1e upstream.
+
+sem_otime contains the time of the last semaphore operation that
+completed successfully. Every operation updates this value, thus access
+from multiple cpus can cause thrashing.
+
+Therefore the patch replaces the variable with a per-semaphore variable.
+The per-array sem_otime is only calculated when required.
+
+No performance improvement on a single-socket i3 - only important for
+larger systems.
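+
+The aggregation that replaces the shared field can be sketched as a
+plain maximum over the per-semaphore values (a model of get_semotime()
+in the diff below, using a bare time_t array instead of struct sem):
+
+    #include <time.h>
+
+    static time_t semotime_model(const time_t *per_sem, int nsems)
+    {
+        time_t res = per_sem[0];
+        int i;
+
+        /* the array-wide otime is the newest per-semaphore update */
+        for (i = 1; i < nsems; i++)
+            if (per_sem[i] > res)
+                res = per_sem[i];
+        return res;
+    }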
+
+Signed-off-by: Manfred Spraul <manfred@colorfullife.com>
+Cc: Rik van Riel <riel@redhat.com>
+Cc: Davidlohr Bueso <davidlohr.bueso@hp.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Mike Galbraith <efault@gmx.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ include/linux/sem.h | 1 -
+ ipc/sem.c | 37 +++++++++++++++++++++++++++++++------
+ 2 files changed, 31 insertions(+), 7 deletions(-)
+
+--- a/include/linux/sem.h
++++ b/include/linux/sem.h
+@@ -12,7 +12,6 @@ struct task_struct;
+ struct sem_array {
+ struct kern_ipc_perm ____cacheline_aligned_in_smp
+ sem_perm; /* permissions .. see ipc.h */
+- time_t sem_otime; /* last semop time */
+ time_t sem_ctime; /* last change time */
+ struct sem *sem_base; /* ptr to first semaphore in array */
+ struct list_head pending_alter; /* pending operations */
+--- a/ipc/sem.c
++++ b/ipc/sem.c
+@@ -99,6 +99,7 @@ struct sem {
+ /* that alter the semaphore */
+ struct list_head pending_const; /* pending single-sop operations */
+ /* that do not alter the semaphore*/
++ time_t sem_otime; /* candidate for sem_otime */
+ } ____cacheline_aligned_in_smp;
+
+ /* One queue for each sleeping process in the system. */
+@@ -911,8 +912,14 @@ static void do_smart_update(struct sem_a
+ }
+ }
+ }
+- if (otime)
+- sma->sem_otime = get_seconds();
++ if (otime) {
++ if (sops == NULL) {
++ sma->sem_base[0].sem_otime = get_seconds();
++ } else {
++ sma->sem_base[sops[0].sem_num].sem_otime =
++ get_seconds();
++ }
++ }
+ }
+
+
+@@ -1058,6 +1065,21 @@ static unsigned long copy_semid_to_user(
+ }
+ }
+
++static time_t get_semotime(struct sem_array *sma)
++{
++ int i;
++ time_t res;
++
++ res = sma->sem_base[0].sem_otime;
++ for (i = 1; i < sma->sem_nsems; i++) {
++ time_t to = sma->sem_base[i].sem_otime;
++
++ if (to > res)
++ res = to;
++ }
++ return res;
++}
++
+ static int semctl_nolock(struct ipc_namespace *ns, int semid,
+ int cmd, int version, void __user *p)
+ {
+@@ -1131,9 +1153,9 @@ static int semctl_nolock(struct ipc_name
+ goto out_unlock;
+
+ kernel_to_ipc64_perm(&sma->sem_perm, &tbuf.sem_perm);
+- tbuf.sem_otime = sma->sem_otime;
+- tbuf.sem_ctime = sma->sem_ctime;
+- tbuf.sem_nsems = sma->sem_nsems;
++ tbuf.sem_otime = get_semotime(sma);
++ tbuf.sem_ctime = sma->sem_ctime;
++ tbuf.sem_nsems = sma->sem_nsems;
+ rcu_read_unlock();
+ if (copy_semid_to_user(p, &tbuf, version))
+ return -EFAULT;
+@@ -2025,6 +2047,9 @@ static int sysvipc_sem_proc_show(struct
+ {
+ struct user_namespace *user_ns = seq_user_ns(s);
+ struct sem_array *sma = it;
++ time_t sem_otime;
++
++ sem_otime = get_semotime(sma);
+
+ return seq_printf(s,
+ "%10d %10d %4o %10u %5u %5u %5u %5u %10lu %10lu\n",
+@@ -2036,7 +2061,7 @@ static int sysvipc_sem_proc_show(struct
+ from_kgid_munged(user_ns, sma->sem_perm.gid),
+ from_kuid_munged(user_ns, sma->sem_perm.cuid),
+ from_kgid_munged(user_ns, sma->sem_perm.cgid),
+- sma->sem_otime,
++ sem_otime,
+ sma->sem_ctime);
+ }
+ #endif
--- /dev/null
+From 196aa0132fc7261f34b10ae1bfb44abc1bc69b3c Mon Sep 17 00:00:00 2001
+From: Manfred Spraul <manfred@colorfullife.com>
+Date: Mon, 8 Jul 2013 16:01:20 -0700
+Subject: ipc/util.c, ipc_rcu_alloc: cacheline align allocation
+
+From: Manfred Spraul <manfred@colorfullife.com>
+
+commit 196aa0132fc7261f34b10ae1bfb44abc1bc69b3c upstream.
+
+Enforce that ipc_rcu_alloc returns a cacheline-aligned pointer on SMP.
+
+Rationale:
+
+The SysV sem code tries to move the main spinlock into a separate
+cacheline (____cacheline_aligned_in_smp). This works only if
+ipc_rcu_alloc returns cacheline-aligned pointers. vmalloc and kmalloc
+return cacheline-aligned pointers, but the current implementation of
+ipc_rcu_alloc breaks that.
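+
+A user-space model of the fix (hypothetical names; a 64-byte cache line
+and an aligned allocator are assumed): making the refcount header itself
+cacheline sized keeps the payload that follows it aligned.
+
+    #include <assert.h>
+    #include <stdint.h>
+    #include <stdlib.h>
+
+    struct rcu_hdr {
+        int refcount;
+    } __attribute__((aligned(64)));  /* header occupies a full cache line */
+
+    static void *rcu_alloc_model(size_t size)
+    {
+        /* round up: aligned_alloc() wants a multiple of the alignment */
+        size_t total = (sizeof(struct rcu_hdr) + size + 63) & ~(size_t)63;
+        struct rcu_hdr *out = aligned_alloc(64, total);
+
+        if (!out)
+            return NULL;
+        out->refcount = 1;
+        return out + 1;  /* payload starts on the next cache line */
+    }
+
+    int main(void)
+    {
+        void *p = rcu_alloc_model(128);
+
+        assert(p && ((uintptr_t)p % 64) == 0);  /* caller sees aligned memory */
+        free((struct rcu_hdr *)p - 1);
+        return 0;
+    }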
+
+[akpm@linux-foundation.org: coding-style fixes]
+Signed-off-by: Manfred Spraul <manfred@colorfullife.com>
+Cc: Rik van Riel <riel@redhat.com>
+Cc: Davidlohr Bueso <davidlohr.bueso@hp.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Mike Galbraith <efault@gmx.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ ipc/util.c | 12 ++++++------
+ 1 file changed, 6 insertions(+), 6 deletions(-)
+
+--- a/ipc/util.c
++++ b/ipc/util.c
+@@ -468,9 +468,7 @@ void ipc_free(void* ptr, int size)
+ struct ipc_rcu {
+ struct rcu_head rcu;
+ atomic_t refcount;
+- /* "void *" makes sure alignment of following data is sane. */
+- void *data[0];
+-};
++} ____cacheline_aligned_in_smp;
+
+ /**
+ * ipc_rcu_alloc - allocate ipc and rcu space
+@@ -488,12 +486,14 @@ void *ipc_rcu_alloc(int size)
+ if (unlikely(!out))
+ return NULL;
+ atomic_set(&out->refcount, 1);
+- return out->data;
++ return out + 1;
+ }
+
+ int ipc_rcu_getref(void *ptr)
+ {
+- return atomic_inc_not_zero(&container_of(ptr, struct ipc_rcu, data)->refcount);
++ struct ipc_rcu *p = ((struct ipc_rcu *)ptr) - 1;
++
++ return atomic_inc_not_zero(&p->refcount);
+ }
+
+ /**
+@@ -507,7 +507,7 @@ static void ipc_schedule_free(struct rcu
+
+ void ipc_rcu_putref(void *ptr)
+ {
+- struct ipc_rcu *p = container_of(ptr, struct ipc_rcu, data);
++ struct ipc_rcu *p = ((struct ipc_rcu *)ptr) - 1;
+
+ if (!atomic_dec_and_test(&p->refcount))
+ return;
ipc-msg-shorten-critical-region-in-msgsnd.patch
ipc-msg-shorten-critical-region-in-msgrcv.patch
ipc-remove-unused-functions.patch
+ipc-util.c-ipc_rcu_alloc-cacheline-align-allocation.patch
+ipc-sem.c-cacheline-align-the-semaphore-structures.patch
+ipc-sem-separate-wait-for-zero-and-alter-tasks-into-seperate-queues.patch
+ipc-sem.c-always-use-only-one-queue-for-alter-operations.patch
+ipc-sem.c-replace-shared-sem_otime-with-per-semaphore-value.patch
+ipc-sem.c-rename-try_atomic_semop-to-perform_atomic_semop-docu-update.patch