--- /dev/null
+Subject: [PATCH] powerpc/oprofile: Fix mutex locking for cell spu-oprofile
+From: Carl Love <cel@us.ibm.com>
+References: 422501 - LTC47617
+
+The issue is that the SPU code does not hold the kernel mutex lock
+while adding samples to the kernel buffer.
+
+This patch creates per SPU buffers to hold the data. Data
+is added to the buffers in interrupt context. The data
+is periodically pushed to the kernel buffer via a new OProfile
+function, oprofile_put_buff(). The oprofile_put_buff() function
+is called via a work queue, enabling the function to acquire the
+mutex lock.
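+
+As a rough illustration of the producer/consumer split (a sketch, not
+part of the patch; demo_ring, demo_add() and demo_flush() are
+hypothetical names, and only the oprofile_put_buff() signature comes
+from this patch):
+
+	struct demo_ring {
+		unsigned long *buff;
+		unsigned int head, tail;	/* head: next write index */
+	};
+
+	/* Interrupt context: the producer must not sleep, so it
+	 * stores under a spinlock rather than the buffer mutex.
+	 * (The full-buffer check of spu_buff_add() is omitted here.)
+	 */
+	static void demo_add(struct demo_ring *r, unsigned long val,
+			     unsigned int max)
+	{
+		r->buff[r->head] = val;
+		if (++r->head >= max)
+			r->head = 0;
+	}
+
+	/* Work queue context: the consumer may sleep, so
+	 * oprofile_put_buff() can take the kernel buffer mutex.
+	 */
+	static void demo_flush(struct demo_ring *r, unsigned int max)
+	{
+		unsigned int head = r->head;	/* snapshotted under
+						 * buffer_lock in the
+						 * real code */
+
+		oprofile_put_buff(r->buff, r->tail, head, max);
+		r->tail = head;
+	}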
+
+The existing user controls for adjusting the per CPU buffer
+size are used to control the size of the per SPU buffers.
+Similarly, overflows of the SPU buffers are reported by
+incrementing the per CPU buffer stats. This eliminates the
+need for architecture specific controls for the per SPU
+buffers, which would not be acceptable to the OProfile user
+tool maintainer.
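+
+For illustration, the "full" test in the new spu_buff_add() below
+keeps one slot unused so that head == tail always means empty, never
+full. A minimal standalone sketch of that invariant (ring_entries()
+and ring_full() are hypothetical helper names, not from the patch):
+
+	/* valid entries between tail and head, wrapping at max */
+	static unsigned int ring_entries(unsigned int head,
+					 unsigned int tail,
+					 unsigned int max)
+	{
+		return (head >= tail) ? head - tail : max - tail + head;
+	}
+
+	/* Full at max - 1 entries; one slot is sacrificed so an
+	 * empty ring (head == tail) is distinguishable from a
+	 * full one.
+	 */
+	static int ring_full(unsigned int head, unsigned int tail,
+			     unsigned int max)
+	{
+		return ring_entries(head, tail, max) == max - 1;
+	}
+
+When the ring is full, the new sample is dropped and the per CPU
+sample_lost_overflow statistic is incremented instead.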
+
+The export of the OProfile add_event_entry() function is removed,
+as it is no longer needed with this patch.
+
+Note: this patch does not address the issue of indexing arrays
+by the SPU number. This still needs to be fixed, as the SPU
+numbering is not guaranteed to be 0 to max_num_spus-1.
+
+Signed-off-by: Carl Love <carll@us.ibm.com>
+Signed-off-by: Maynard Johnson <maynardj@us.ibm.com>
+Signed-off-by: Arnd Bergmann <arnd@arndb.de>
+Acked-by: Robert Richter <robert.richter@amd.com>
+Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
+Signed-off-by: Olaf Hering <olh@suse.de>
+
+---
+ arch/powerpc/oprofile/cell/pr_util.h | 13 +
+ arch/powerpc/oprofile/cell/spu_profiler.c | 4
+ arch/powerpc/oprofile/cell/spu_task_sync.c | 236 +++++++++++++++++++++++++----
+ drivers/oprofile/buffer_sync.c | 24 ++
+ drivers/oprofile/cpu_buffer.c | 15 +
+ drivers/oprofile/event_buffer.h | 7
+ include/linux/oprofile.h | 16 +
+ 7 files changed, 279 insertions(+), 36 deletions(-)
+
+--- a/arch/powerpc/oprofile/cell/pr_util.h
++++ b/arch/powerpc/oprofile/cell/pr_util.h
+@@ -24,6 +24,11 @@
+ #define SKIP_GENERIC_SYNC 0
+ #define SYNC_START_ERROR -1
+ #define DO_GENERIC_SYNC 1
++#define SPUS_PER_NODE 8
++#define DEFAULT_TIMER_EXPIRE (HZ / 10)
++
++extern struct delayed_work spu_work;
++extern int spu_prof_running;
+
+ struct spu_overlay_info { /* map of sections within an SPU overlay */
+ unsigned int vma; /* SPU virtual memory address from elf */
+@@ -62,6 +67,14 @@ struct vma_to_fileoffset_map { /* map of
+
+ };
+
++struct spu_buffer {
++ int last_guard_val;
++ int ctx_sw_seen;
++ unsigned long *buff;
++ unsigned int head, tail;
++};
++
++
+ /* The three functions below are for maintaining and accessing
+ * the vma-to-fileoffset map.
+ */
+--- a/arch/powerpc/oprofile/cell/spu_profiler.c
++++ b/arch/powerpc/oprofile/cell/spu_profiler.c
+@@ -24,12 +24,11 @@
+
+ static u32 *samples;
+
+-static int spu_prof_running;
++int spu_prof_running;
+ static unsigned int profiling_interval;
+
+ #define NUM_SPU_BITS_TRBUF 16
+ #define SPUS_PER_TB_ENTRY 4
+-#define SPUS_PER_NODE 8
+
+ #define SPU_PC_MASK 0xFFFF
+
+@@ -209,6 +208,7 @@ int start_spu_profiling(unsigned int cyc
+
+ spu_prof_running = 1;
+ hrtimer_start(&timer, kt, HRTIMER_MODE_REL);
++ schedule_delayed_work(&spu_work, DEFAULT_TIMER_EXPIRE);
+
+ return 0;
+ }
+--- a/arch/powerpc/oprofile/cell/spu_task_sync.c
++++ b/arch/powerpc/oprofile/cell/spu_task_sync.c
+@@ -35,7 +35,102 @@ static DEFINE_SPINLOCK(buffer_lock);
+ static DEFINE_SPINLOCK(cache_lock);
+ static int num_spu_nodes;
+ int spu_prof_num_nodes;
+-int last_guard_val[MAX_NUMNODES * 8];
++
++struct spu_buffer spu_buff[MAX_NUMNODES * SPUS_PER_NODE];
++struct delayed_work spu_work;
++static unsigned max_spu_buff;
++
++static void spu_buff_add(unsigned long int value, int spu)
++{
++ /* spu_buff is a circular buffer. Add entries to the
++ * head. Head is the index where the next value is stored.
++ * The buffer is treated as full when only one free entry
++ * remains, i.e. head is never allowed to catch up to tail.
++ * That way we can tell the difference between the buffer
++ * being full versus empty.
++ *
++ * ASSUMPTION: the caller holds buffer_lock when this
++ * function is called, protecting the buffer, head and tail.
++ */
++ int full = 1;
++
++ if (spu_buff[spu].head >= spu_buff[spu].tail) {
++ if ((spu_buff[spu].head - spu_buff[spu].tail)
++ < (max_spu_buff - 1))
++ full = 0;
++
++ } else if (spu_buff[spu].tail > spu_buff[spu].head) {
++ if ((spu_buff[spu].tail - spu_buff[spu].head)
++ > 1)
++ full = 0;
++ }
++
++ if (!full) {
++ spu_buff[spu].buff[spu_buff[spu].head] = value;
++ spu_buff[spu].head++;
++
++ if (spu_buff[spu].head >= max_spu_buff)
++ spu_buff[spu].head = 0;
++ } else {
++ /* From the user's perspective, make SPU buffer
++ * size management and overflow look like we are
++ * using per cpu buffers: the same per cpu parameter
++ * adjusts the SPU buffer size as well. Increment
++ * sample_lost_overflow to inform the user that the
++ * buffer size needs to be increased.
++ */
++ oprofile_cpu_buffer_inc_smpl_lost();
++ }
++}
++
++/* This function copies the per SPU buffers to the
++ * OProfile kernel buffer.
++ */
++void sync_spu_buff(void)
++{
++ int spu;
++ unsigned long flags;
++ int curr_head;
++
++ for (spu = 0; spu < num_spu_nodes; spu++) {
++ /* In case there was an issue and the buffer didn't
++ * get created, skip it.
++ */
++ if (spu_buff[spu].buff == NULL)
++ continue;
++
++ /* Hold the lock to make sure the head/tail
++ * doesn't change while spu_buff_add() is
++ * deciding if the buffer is full or not.
++ * Being a little paranoid.
++ */
++ spin_lock_irqsave(&buffer_lock, flags);
++ curr_head = spu_buff[spu].head;
++ spin_unlock_irqrestore(&buffer_lock, flags);
++
++ /* Transfer the current contents to the kernel buffer.
++ * Data can still be added to the head of the buffer.
++ */
++ oprofile_put_buff(spu_buff[spu].buff,
++ spu_buff[spu].tail,
++ curr_head, max_spu_buff);
++
++ spin_lock_irqsave(&buffer_lock, flags);
++ spu_buff[spu].tail = curr_head;
++ spin_unlock_irqrestore(&buffer_lock, flags);
++ }
++
++}
++
++static void wq_sync_spu_buff(struct work_struct *work)
++{
++ /* move data from spu buffers to kernel buffer */
++ sync_spu_buff();
++
++ /* only reschedule if profiling is not done */
++ if (spu_prof_running)
++ schedule_delayed_work(&spu_work, DEFAULT_TIMER_EXPIRE);
++}
+
+ /* Container for caching information about an active SPU task. */
+ struct cached_info {
+@@ -305,14 +400,21 @@ static int process_context_switch(struct
+
+ /* Record context info in event buffer */
+ spin_lock_irqsave(&buffer_lock, flags);
+- add_event_entry(ESCAPE_CODE);
+- add_event_entry(SPU_CTX_SWITCH_CODE);
+- add_event_entry(spu->number);
+- add_event_entry(spu->pid);
+- add_event_entry(spu->tgid);
+- add_event_entry(app_dcookie);
+- add_event_entry(spu_cookie);
+- add_event_entry(offset);
++ spu_buff_add(ESCAPE_CODE, spu->number);
++ spu_buff_add(SPU_CTX_SWITCH_CODE, spu->number);
++ spu_buff_add(spu->number, spu->number);
++ spu_buff_add(spu->pid, spu->number);
++ spu_buff_add(spu->tgid, spu->number);
++ spu_buff_add(app_dcookie, spu->number);
++ spu_buff_add(spu_cookie, spu->number);
++ spu_buff_add(offset, spu->number);
++
++ /* Set flag to indicate SPU PC data can now be written out. If
++ * the SPU program counter data is seen before an SPU context
++ * record is seen, the postprocessing will fail.
++ */
++ spu_buff[spu->number].ctx_sw_seen = 1;
++
+ spin_unlock_irqrestore(&buffer_lock, flags);
+ smp_wmb(); /* insure spu event buffer updates are written */
+ /* don't want entries intermingled... */
+@@ -360,6 +462,47 @@ static int number_of_online_nodes(void)
+ return nodes;
+ }
+
++static int oprofile_spu_buff_create(void)
++{
++ int spu;
++
++ max_spu_buff = oprofile_get_cpu_buffer_size();
++
++ for (spu = 0; spu < num_spu_nodes; spu++) {
++ /* create circular buffers to store the data in.
++ * use locks to manage accessing the buffers
++ */
++ spu_buff[spu].head = 0;
++ spu_buff[spu].tail = 0;
++
++ /*
++ * Create a buffer for each SPU. Can't reliably
++ * create a single buffer for all spus due to not
++ * enough contiguous kernel memory.
++ */
++
++ spu_buff[spu].buff = kzalloc((max_spu_buff
++ * sizeof(unsigned long)),
++ GFP_KERNEL);
++
++ if (!spu_buff[spu].buff) {
++ printk(KERN_ERR "SPU_PROF: "
++ "%s, line %d: oprofile_spu_buff_create "
++ "failed to allocate spu buffer %d.\n",
++ __func__, __LINE__, spu);
++
++ /* release the spu buffers that have been allocated */
++ while (spu >= 0) {
++ kfree(spu_buff[spu].buff);
++ spu_buff[spu].buff = NULL;
++ spu--;
++ }
++ return -ENOMEM;
++ }
++ }
++ return 0;
++}
++
+ /* The main purpose of this function is to synchronize
+ * OProfile with SPUFS by registering to be notified of
+ * SPU task switches.
+@@ -372,20 +515,35 @@ static int number_of_online_nodes(void)
+ */
+ int spu_sync_start(void)
+ {
+- int k;
++ int spu;
+ int ret = SKIP_GENERIC_SYNC;
+ int register_ret;
+ unsigned long flags = 0;
+
+ spu_prof_num_nodes = number_of_online_nodes();
+ num_spu_nodes = spu_prof_num_nodes * 8;
++ INIT_DELAYED_WORK(&spu_work, wq_sync_spu_buff);
++
++ /* Create the per SPU buffers used to stage data
++ * before it is put in the kernel buffer.
++ */
++ ret = oprofile_spu_buff_create();
++ if (ret)
++ goto out;
+
+ spin_lock_irqsave(&buffer_lock, flags);
+- add_event_entry(ESCAPE_CODE);
+- add_event_entry(SPU_PROFILING_CODE);
+- add_event_entry(num_spu_nodes);
++ for (spu = 0; spu < num_spu_nodes; spu++) {
++ spu_buff_add(ESCAPE_CODE, spu);
++ spu_buff_add(SPU_PROFILING_CODE, spu);
++ spu_buff_add(num_spu_nodes, spu);
++ }
+ spin_unlock_irqrestore(&buffer_lock, flags);
+
++ for (spu = 0; spu < num_spu_nodes; spu++) {
++ spu_buff[spu].ctx_sw_seen = 0;
++ spu_buff[spu].last_guard_val = 0;
++ }
++
+ /* Register for SPU events */
+ register_ret = spu_switch_event_register(&spu_active);
+ if (register_ret) {
+@@ -393,8 +551,6 @@ int spu_sync_start(void)
+ goto out;
+ }
+
+- for (k = 0; k < (MAX_NUMNODES * 8); k++)
+- last_guard_val[k] = 0;
+ pr_debug("spu_sync_start -- running.\n");
+ out:
+ return ret;
+@@ -446,13 +602,20 @@ void spu_sync_buffer(int spu_num, unsign
+ * use. We need to discard samples taken during the time
+ * period which an overlay occurs (i.e., guard value changes).
+ */
+- if (grd_val && grd_val != last_guard_val[spu_num]) {
+- last_guard_val[spu_num] = grd_val;
++ if (grd_val && grd_val != spu_buff[spu_num].last_guard_val) {
++ spu_buff[spu_num].last_guard_val = grd_val;
+ /* Drop the rest of the samples. */
+ break;
+ }
+
+- add_event_entry(file_offset | spu_num_shifted);
++ /* We must ensure that the SPU context switch has been written
++ * out before samples for the SPU. Otherwise, the SPU context
++ * information is not available and the postprocessing of the
++ * SPU PC will fail with no available anonymous map information.
++ */
++ if (spu_buff[spu_num].ctx_sw_seen)
++ spu_buff_add((file_offset | spu_num_shifted),
++ spu_num);
+ }
+ spin_unlock(&buffer_lock);
+ out:
+@@ -463,20 +626,41 @@ out:
+ int spu_sync_stop(void)
+ {
+ unsigned long flags = 0;
+- int ret = spu_switch_event_unregister(&spu_active);
+- if (ret) {
++ int ret;
++ int k;
++
++ ret = spu_switch_event_unregister(&spu_active);
++
++ if (ret)
+ printk(KERN_ERR "SPU_PROF: "
+- "%s, line %d: spu_switch_event_unregister returned %d\n",
+- __func__, __LINE__, ret);
+- goto out;
+- }
++ "%s, line %d: spu_switch_event_unregister " \
++ "returned %d\n",
++ __func__, __LINE__, ret);
++
++ /* flush any remaining data in the per SPU buffers */
++ sync_spu_buff();
+
+ spin_lock_irqsave(&cache_lock, flags);
+ ret = release_cached_info(RELEASE_ALL);
+ spin_unlock_irqrestore(&cache_lock, flags);
+-out:
++
++ /* remove scheduled work queue item rather than waiting
++ * for every queued entry to execute. Then flush the pending
++ * system wide buffer to the event buffer.
++ */
++ cancel_delayed_work(&spu_work);
++
++ for (k = 0; k < num_spu_nodes; k++) {
++ spu_buff[k].ctx_sw_seen = 0;
++
++ /*
++ * spu_buff[k].buff will be null if there was a problem
++ * allocating the buffer; kfree() of NULL is safe.
++ */
++ kfree(spu_buff[k].buff);
++ spu_buff[k].buff = NULL;
++ }
+ pr_debug("spu_sync_stop -- done.\n");
+ return ret;
+ }
+
+-
+--- a/drivers/oprofile/buffer_sync.c
++++ b/drivers/oprofile/buffer_sync.c
+@@ -551,3 +551,27 @@ void sync_buffer(int cpu)
+
+ mutex_unlock(&buffer_mutex);
+ }
++
++/* This function adds a buffer worth of data directly to
++ * the kernel buffer. The buffer is assumed to be a circular
++ * buffer. Entries are taken from index start up to, but not
++ * including, index stop, wrapping at index max.
++ */
++void oprofile_put_buff(unsigned long *buf, unsigned int start,
++ unsigned int stop, unsigned int max)
++{
++ int i;
++
++ i = start;
++
++ mutex_lock(&buffer_mutex);
++ while (i != stop) {
++ add_event_entry(buf[i++]);
++
++ if (i >= max)
++ i = 0;
++ }
++
++ mutex_unlock(&buffer_mutex);
++}
++
+--- a/drivers/oprofile/cpu_buffer.c
++++ b/drivers/oprofile/cpu_buffer.c
+@@ -37,13 +37,26 @@ static int work_enabled;
+ void free_cpu_buffers(void)
+ {
+ int i;
+-
++
+ for_each_online_cpu(i) {
+ vfree(per_cpu(cpu_buffer, i).buffer);
+ per_cpu(cpu_buffer, i).buffer = NULL;
+ }
+ }
+
++unsigned long oprofile_get_cpu_buffer_size(void)
++{
++ return fs_cpu_buffer_size;
++}
++
++void oprofile_cpu_buffer_inc_smpl_lost(void)
++{
++ struct oprofile_cpu_buffer *cpu_buf
++ = &__get_cpu_var(cpu_buffer);
++
++ cpu_buf->sample_lost_overflow++;
++}
++
+ int alloc_cpu_buffers(void)
+ {
+ int i;
+--- a/drivers/oprofile/event_buffer.h
++++ b/drivers/oprofile/event_buffer.h
+@@ -17,6 +17,13 @@ int alloc_event_buffer(void);
+
+ void free_event_buffer(void);
+
++/**
++ * Add data to the event buffer.
++ * The data passed is free-form, but typically consists of
++ * file offsets, dcookies, context information, and ESCAPE codes.
++ */
++void add_event_entry(unsigned long data);
++
+ /* wake up the process sleeping on the event file */
+ void wake_up_buffer_waiter(void);
+
+--- a/include/linux/oprofile.h
++++ b/include/linux/oprofile.h
+@@ -84,13 +84,6 @@ int oprofile_arch_init(struct oprofile_o
+ void oprofile_arch_exit(void);
+
+ /**
+- * Add data to the event buffer.
+- * The data passed is free-form, but typically consists of
+- * file offsets, dcookies, context information, and ESCAPE codes.
+- */
+-void add_event_entry(unsigned long data);
+-
+-/**
+ * Add a sample. This may be called from any context. Pass
+ * smp_processor_id() as cpu.
+ */
+@@ -160,5 +153,14 @@ int oprofilefs_ulong_from_user(unsigned
+
+ /** lock for read/write safety */
+ extern spinlock_t oprofilefs_lock;
++
++/**
++ * Add the contents of a circular buffer to the event buffer.
++ */
++void oprofile_put_buff(unsigned long *buf, unsigned int start,
++ unsigned int stop, unsigned int max);
++
++unsigned long oprofile_get_cpu_buffer_size(void);
++void oprofile_cpu_buffer_inc_smpl_lost(void);
+
+ #endif /* OPROFILE_H */