+++ /dev/null
-From 203043f579ece44bb30291442cd56332651dd37d Mon Sep 17 00:00:00 2001
-From: Stanislaw Gruszka <sgruszka@redhat.com>
-Date: Tue, 25 Jan 2011 14:08:40 +0100
-Subject: ath9k: fix race conditions when stop device
-
-From: Stanislaw Gruszka <sgruszka@redhat.com>
-
-commit 203043f579ece44bb30291442cd56332651dd37d upstream.
-
-We do not kill any scheduled tasklets when stopping device, that may
-cause usage of resources after free. Moreover we enable interrupts
-in tasklet function, so we could potentially end with interrupts
-enabled when driver is not ready to receive them.
-
-I think patch should fix Ben's kernel crash from:
-http://marc.info/?l=linux-wireless&m=129438358921501&w=2
-
-Signed-off-by: Stanislaw Gruszka <sgruszka@redhat.com>
-Signed-off-by: John W. Linville <linville@tuxdriver.com>
-Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
-
----
- drivers/net/wireless/ath/ath9k/init.c | 5 -----
- drivers/net/wireless/ath/ath9k/main.c | 9 +++++++++
- 2 files changed, 9 insertions(+), 5 deletions(-)
-
---- a/drivers/net/wireless/ath/ath9k/init.c
-+++ b/drivers/net/wireless/ath/ath9k/init.c
-@@ -633,8 +633,6 @@ err_queues:
- err_debug:
- ath9k_hw_deinit(ah);
- err_hw:
-- tasklet_kill(&sc->intr_tq);
-- tasklet_kill(&sc->bcon_tasklet);
-
- kfree(ah);
- sc->sc_ah = NULL;
-@@ -802,9 +800,6 @@ static void ath9k_deinit_softc(struct at
- ath9k_exit_debug(sc->sc_ah);
- ath9k_hw_deinit(sc->sc_ah);
-
-- tasklet_kill(&sc->intr_tq);
-- tasklet_kill(&sc->bcon_tasklet);
--
- kfree(sc->sc_ah);
- sc->sc_ah = NULL;
- }
---- a/drivers/net/wireless/ath/ath9k/main.c
-+++ b/drivers/net/wireless/ath/ath9k/main.c
-@@ -1401,6 +1401,9 @@ static void ath9k_stop(struct ieee80211_
- ath9k_btcoex_timer_pause(sc);
- }
-
-+ /* prevent tasklets to enable interrupts once we disable them */
-+ ah->imask &= ~ATH9K_INT_GLOBAL;
-+
- /* make sure h/w will not generate any interrupt
- * before setting the invalid flag. */
- ath9k_hw_set_interrupts(ah, 0);
-@@ -1901,6 +1904,12 @@ static int ath9k_set_key(struct ieee8021
- ret = -EINVAL;
- }
-
-+ /* we can now sync irq and kill any running tasklets, since we already
-+ * disabled interrupts and not holding a spin lock */
-+ synchronize_irq(sc->irq);
-+ tasklet_kill(&sc->intr_tq);
-+ tasklet_kill(&sc->bcon_tasklet);
-+
- ath9k_ps_restore(sc);
- mutex_unlock(&sc->mutex);
-
--- /dev/null
+From 09e099d4bafea3b15be003d548bdf94b4b6e0e17 Mon Sep 17 00:00:00 2001
+From: Jerome Marchand <jmarchan@redhat.com>
+Date: Wed, 5 Jan 2011 16:57:38 +0100
+Subject: block: fix accounting bug on cross partition merges
+
+From: Jerome Marchand <jmarchan@redhat.com>
+
+commit 09e099d4bafea3b15be003d548bdf94b4b6e0e17 upstream.
+
+/proc/diskstats would display a strange output as follows.
+
+$ cat /proc/diskstats |grep sda
+ 8 0 sda 90524 7579 102154 20464 0 0 0 0 0 14096 20089
+ 8 1 sda1 19085 1352 21841 4209 0 0 0 0 4294967064 15689 4293424691
+ ~~~~~~~~~~
+ 8 2 sda2 71252 3624 74891 15950 0 0 0 0 232 23995 1562390
+ 8 3 sda3 54 487 2188 92 0 0 0 0 0 88 92
+ 8 4 sda4 4 0 8 0 0 0 0 0 0 0 0
+ 8 5 sda5 81 2027 2130 138 0 0 0 0 0 87 137
+
+The reason is the wrong way of accounting hd_struct->in_flight when a bio is
+merged into a request that belongs to a different partition by ELEVATOR_FRONT_MERGE.
+
+The detailed root cause is as follows.
+
+Assuming that there are two partitions, sda1 and sda2.
+
+1. A request for sda2 is in request_queue. Hence sda1's hd_struct->in_flight
+ is 0 and sda2's one is 1.
+
+ | hd_struct->in_flight
+ ---------------------------
+ sda1 | 0
+ sda2 | 1
+ ---------------------------
+
+2. A bio belonging to sda1 is issued and is merged into the request mentioned in
+   step 1 by ELEVATOR_BACK_MERGE. The first sector of the request is changed
+   from the sda2 region to the sda1 region. However, the two partitions'
+   hd_struct->in_flight counters are not changed.
+
+ | hd_struct->in_flight
+ ---------------------------
+ sda1 | 0
+ sda2 | 1
+ ---------------------------
+
+3. The request is finished and blk_account_io_done() is called. In this case,
+   sda2's hd_struct->in_flight, not sda1's, is decremented.
+
+ | hd_struct->in_flight
+ ---------------------------
+ sda1 | -1
+ sda2 | 1
+ ---------------------------
+
+The patch fixes the problem by caching the partition lookup
+inside the request structure, hence making sure that the increment
+and decrement will always happen on the same partition struct. This
+also speeds up IO with accounting enabled, since it cuts down on
+the number of lookups we have to do.
+
+Also add a refcount to struct hd_struct to keep the partition in
+memory as long as users exist. We use kref_test_and_get() to ensure
+we don't add a reference to a partition which is going away.
+
+Signed-off-by: Jerome Marchand <jmarchan@redhat.com>
+Signed-off-by: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
+Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ block/blk-core.c | 26 +++++++++++++++++++++-----
+ block/blk-merge.c | 3 ++-
+ block/genhd.c | 1 +
+ fs/partitions/check.c | 10 +++++++++-
+ include/linux/blkdev.h | 1 +
+ include/linux/genhd.h | 2 ++
+ 6 files changed, 36 insertions(+), 7 deletions(-)
+
+--- a/block/blk-core.c
++++ b/block/blk-core.c
+@@ -64,13 +64,27 @@ static void drive_stat_acct(struct reque
+ return;
+
+ cpu = part_stat_lock();
+- part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
+
+- if (!new_io)
++ if (!new_io) {
++ part = rq->part;
+ part_stat_inc(cpu, part, merges[rw]);
+- else {
++ } else {
++ part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
++ if (!kref_test_and_get(&part->ref)) {
++ /*
++ * The partition is already being removed,
++ * the request will be accounted on the disk only
++ *
++ * We take a reference on disk->part0 although that
++ * partition will never be deleted, so we can treat
++ * it as any other partition.
++ */
++ part = &rq->rq_disk->part0;
++ kref_get(&part->ref);
++ }
+ part_round_stats(cpu, part);
+ part_inc_in_flight(part, rw);
++ rq->part = part;
+ }
+
+ part_stat_unlock();
+@@ -128,6 +142,7 @@ void blk_rq_init(struct request_queue *q
+ rq->ref_count = 1;
+ rq->start_time = jiffies;
+ set_start_time_ns(rq);
++ rq->part = NULL;
+ }
+ EXPORT_SYMBOL(blk_rq_init);
+
+@@ -1776,7 +1791,7 @@ static void blk_account_io_completion(st
+ int cpu;
+
+ cpu = part_stat_lock();
+- part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req));
++ part = req->part;
+ part_stat_add(cpu, part, sectors[rw], bytes >> 9);
+ part_stat_unlock();
+ }
+@@ -1796,13 +1811,14 @@ static void blk_account_io_done(struct r
+ int cpu;
+
+ cpu = part_stat_lock();
+- part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req));
++ part = req->part;
+
+ part_stat_inc(cpu, part, ios[rw]);
+ part_stat_add(cpu, part, ticks[rw], duration);
+ part_round_stats(cpu, part);
+ part_dec_in_flight(part, rw);
+
++ kref_put(&part->ref, __delete_partition);
+ part_stat_unlock();
+ }
+ }
+--- a/block/blk-merge.c
++++ b/block/blk-merge.c
+@@ -351,11 +351,12 @@ static void blk_account_io_merge(struct
+ int cpu;
+
+ cpu = part_stat_lock();
+- part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req));
++ part = req->part;
+
+ part_round_stats(cpu, part);
+ part_dec_in_flight(part, rq_data_dir(req));
+
++ kref_put(&part->ref, __delete_partition);
+ part_stat_unlock();
+ }
+ }
+--- a/block/genhd.c
++++ b/block/genhd.c
+@@ -1192,6 +1192,7 @@ struct gendisk *alloc_disk_node(int mino
+ return NULL;
+ }
+ disk->part_tbl->part[0] = &disk->part0;
++ kref_init(&disk->part0.ref);
+
+ disk->minors = minors;
+ rand_initialize_disk(disk);
+--- a/fs/partitions/check.c
++++ b/fs/partitions/check.c
+@@ -372,6 +372,13 @@ static void delete_partition_rcu_cb(stru
+ put_device(part_to_dev(part));
+ }
+
++void __delete_partition(struct kref *ref)
++{
++ struct hd_struct *part = container_of(ref, struct hd_struct, ref);
++
++ call_rcu(&part->rcu_head, delete_partition_rcu_cb);
++}
++
+ void delete_partition(struct gendisk *disk, int partno)
+ {
+ struct disk_part_tbl *ptbl = disk->part_tbl;
+@@ -390,7 +397,7 @@ void delete_partition(struct gendisk *di
+ kobject_put(part->holder_dir);
+ device_del(part_to_dev(part));
+
+- call_rcu(&part->rcu_head, delete_partition_rcu_cb);
++ kref_put(&part->ref, __delete_partition);
+ }
+
+ static ssize_t whole_disk_show(struct device *dev,
+@@ -489,6 +496,7 @@ struct hd_struct *add_partition(struct g
+ if (!dev_get_uevent_suppress(ddev))
+ kobject_uevent(&pdev->kobj, KOBJ_ADD);
+
++ kref_init(&p->ref);
+ return p;
+
+ out_free_info:
+--- a/include/linux/blkdev.h
++++ b/include/linux/blkdev.h
+@@ -115,6 +115,7 @@ struct request {
+ void *elevator_private3;
+
+ struct gendisk *rq_disk;
++ struct hd_struct *part;
+ unsigned long start_time;
+ #ifdef CONFIG_BLK_CGROUP
+ unsigned long long start_time_ns;
+--- a/include/linux/genhd.h
++++ b/include/linux/genhd.h
+@@ -116,6 +116,7 @@ struct hd_struct {
+ struct disk_stats dkstats;
+ #endif
+ struct rcu_head rcu_head;
++ struct kref ref;
+ };
+
+ #define GENHD_FL_REMOVABLE 1
+@@ -583,6 +584,7 @@ extern struct hd_struct * __must_check a
+ sector_t len, int flags,
+ struct partition_meta_info
+ *info);
++extern void __delete_partition(struct kref *ref);
+ extern void delete_partition(struct gendisk *, int);
+ extern void printk_all_partitions(void);
+
--- /dev/null
+From 2d75af2f2a7a6103a6d539a492fe81deacabde44 Mon Sep 17 00:00:00 2001
+From: Jason Baron <jbaron@redhat.com>
+Date: Fri, 7 Jan 2011 13:36:58 -0500
+Subject: dynamic debug: Fix build issue with older gcc
+
+From: Jason Baron <jbaron@redhat.com>
+
+commit 2d75af2f2a7a6103a6d539a492fe81deacabde44 upstream.
+
+On older gcc (3.3) dynamic debug fails to compile:
+
+include/net/inet_connection_sock.h: In function `inet_csk_reset_xmit_timer':
+include/net/inet_connection_sock.h:236: error: duplicate label declaration `do_printk'
+include/net/inet_connection_sock.h:219: error: this is a previous declaration
+include/net/inet_connection_sock.h:236: error: duplicate label declaration `out'
+include/net/inet_connection_sock.h:219: error: this is a previous declaration
+include/net/inet_connection_sock.h:236: error: duplicate label `do_printk'
+include/net/inet_connection_sock.h:236: error: duplicate label `out'
+
+Fix this by reverting the usage of JUMP_LABEL() in dynamic debug for now.
+
+Reported-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
+Tested-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
+Signed-off-by: Jason Baron <jbaron@redhat.com>
+Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ include/linux/dynamic_debug.h | 18 ++++--------------
+ lib/dynamic_debug.c | 9 ++++-----
+ 2 files changed, 8 insertions(+), 19 deletions(-)
+
+--- a/include/linux/dynamic_debug.h
++++ b/include/linux/dynamic_debug.h
+@@ -44,34 +44,24 @@ int ddebug_add_module(struct _ddebug *ta
+ extern int ddebug_remove_module(const char *mod_name);
+
+ #define dynamic_pr_debug(fmt, ...) do { \
+- __label__ do_printk; \
+- __label__ out; \
+ static struct _ddebug descriptor \
+ __used \
+ __attribute__((section("__verbose"), aligned(8))) = \
+ { KBUILD_MODNAME, __func__, __FILE__, fmt, __LINE__, \
+ _DPRINTK_FLAGS_DEFAULT }; \
+- JUMP_LABEL(&descriptor.enabled, do_printk); \
+- goto out; \
+-do_printk: \
+- printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__); \
+-out: ; \
++ if (unlikely(descriptor.enabled)) \
++ printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__); \
+ } while (0)
+
+
+ #define dynamic_dev_dbg(dev, fmt, ...) do { \
+- __label__ do_printk; \
+- __label__ out; \
+ static struct _ddebug descriptor \
+ __used \
+ __attribute__((section("__verbose"), aligned(8))) = \
+ { KBUILD_MODNAME, __func__, __FILE__, fmt, __LINE__, \
+ _DPRINTK_FLAGS_DEFAULT }; \
+- JUMP_LABEL(&descriptor.enabled, do_printk); \
+- goto out; \
+-do_printk: \
+- dev_printk(KERN_DEBUG, dev, fmt, ##__VA_ARGS__); \
+-out: ; \
++ if (unlikely(descriptor.enabled)) \
++ dev_printk(KERN_DEBUG, dev, fmt, ##__VA_ARGS__); \
+ } while (0)
+
+ #else
+--- a/lib/dynamic_debug.c
++++ b/lib/dynamic_debug.c
+@@ -141,11 +141,10 @@ static void ddebug_change(const struct d
+ else if (!dp->flags)
+ dt->num_enabled++;
+ dp->flags = newflags;
+- if (newflags) {
+- jump_label_enable(&dp->enabled);
+- } else {
+- jump_label_disable(&dp->enabled);
+- }
++ if (newflags)
++ dp->enabled = 1;
++ else
++ dp->enabled = 0;
+ if (verbose)
+ printk(KERN_INFO
+ "ddebug: changed %s:%d [%s]%s %s\n",
--- /dev/null
+From d50bdd5aa55127635fd8a5c74bd2abb256bd34e3 Mon Sep 17 00:00:00 2001
+From: Curt Wohlgemuth <curtw@google.com>
+Date: Mon, 7 Feb 2011 12:46:14 -0500
+Subject: ext4: Fix data corruption with multi-block writepages support
+
+From: Curt Wohlgemuth <curtw@google.com>
+
+commit d50bdd5aa55127635fd8a5c74bd2abb256bd34e3 upstream.
+
+This fixes a corruption problem with the multi-block
+writepages submittal change for ext4, from commit
+bd2d0210cf22f2bd0cef72eb97cf94fc7d31d8cc ("ext4: use bio
+layer instead of buffer layer in mpage_da_submit_io").
+
+(Note that this corruption is not present in 2.6.37 on
+ext4, because the corruption was detected after the
+feature was merged in 2.6.37-rc1, and so it was turned
+off by adding a non-default mount option,
+mblk_io_submit. With this commit, which hopefully
+fixes the last of the bugs with this feature, we'll be
+able to turn on this performance feature by default in
+2.6.38, and remove the mblk_io_submit option.)
+
+The ext4 code path to bundle multiple pages for
+writeback in ext4_bio_write_page() had a bug: we should
+be clearing buffer head dirty flags *before* we submit
+the bio, not in the completion routine.
+
+The patch below was tested on 2.6.37 under KVM with the
+postgresql script which was submitted by Jon Nelson as
+documented in commit 1449032be1.
+
+Without the patch, I'd hit the corruption problem about
+50-70% of the time. With the patch, I executed the
+script > 100 times with no corruption seen.
+
+I also fixed a bug to make sure ext4_end_bio() doesn't
+dereference the bio after the bio_put() call.
+
+Reported-by: Jon Nelson <jnelson@jamponi.net>
+Reported-by: Matthias Bayer <jackdachef@gmail.com>
+Signed-off-by: Curt Wohlgemuth <curtw@google.com>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ fs/ext4/page-io.c | 11 ++++++-----
+ 1 file changed, 6 insertions(+), 5 deletions(-)
+
+--- a/fs/ext4/page-io.c
++++ b/fs/ext4/page-io.c
+@@ -193,6 +193,7 @@ static void ext4_end_bio(struct bio *bio
+ struct inode *inode;
+ unsigned long flags;
+ int i;
++ sector_t bi_sector = bio->bi_sector;
+
+ BUG_ON(!io_end);
+ bio->bi_private = NULL;
+@@ -210,9 +211,7 @@ static void ext4_end_bio(struct bio *bio
+ if (error)
+ SetPageError(page);
+ BUG_ON(!head);
+- if (head->b_size == PAGE_CACHE_SIZE)
+- clear_buffer_dirty(head);
+- else {
++ if (head->b_size != PAGE_CACHE_SIZE) {
+ loff_t offset;
+ loff_t io_end_offset = io_end->offset + io_end->size;
+
+@@ -224,7 +223,6 @@ static void ext4_end_bio(struct bio *bio
+ if (error)
+ buffer_io_error(bh);
+
+- clear_buffer_dirty(bh);
+ }
+ if (buffer_delay(bh))
+ partial_write = 1;
+@@ -260,7 +258,7 @@ static void ext4_end_bio(struct bio *bio
+ (unsigned long long) io_end->offset,
+ (long) io_end->size,
+ (unsigned long long)
+- bio->bi_sector >> (inode->i_blkbits - 9));
++ bi_sector >> (inode->i_blkbits - 9));
+ }
+
+ /* Add the io_end to per-inode completed io list*/
+@@ -383,6 +381,7 @@ int ext4_bio_write_page(struct ext4_io_s
+
+ blocksize = 1 << inode->i_blkbits;
+
++ BUG_ON(!PageLocked(page));
+ BUG_ON(PageWriteback(page));
+ set_page_writeback(page);
+ ClearPageError(page);
+@@ -400,12 +399,14 @@ int ext4_bio_write_page(struct ext4_io_s
+ for (bh = head = page_buffers(page), block_start = 0;
+ bh != head || !block_start;
+ block_start = block_end, bh = bh->b_this_page) {
++
+ block_end = block_start + blocksize;
+ if (block_start >= len) {
+ clear_buffer_dirty(bh);
+ set_buffer_uptodate(bh);
+ continue;
+ }
++ clear_buffer_dirty(bh);
+ ret = io_submit_add_bh(io, io_page, inode, wbc, bh);
+ if (ret) {
+ /*
--- /dev/null
+From 1c5b9e9065567876c2d4a7a16d78f0fed154a5bf Mon Sep 17 00:00:00 2001
+From: Theodore Ts'o <tytso@mit.edu>
+Date: Mon, 10 Jan 2011 12:51:28 -0500
+Subject: ext4: fix memory leak in ext4_free_branches
+
+From: Theodore Ts'o <tytso@mit.edu>
+
+commit 1c5b9e9065567876c2d4a7a16d78f0fed154a5bf upstream.
+
+Commit 40389687 moved a call to ext4_forget() out of
+ext4_free_branches and let ext4_free_blocks() handle calling
+bforget(). But that change unfortunately did not replace the call to
+ext4_forget() with brelse(), which was needed to drop the in-use count
+of the indirect block's buffer head, which led to a memory leak when
+deleting files that used indirect blocks. Fix this.
+
+Thanks to Hugh Dickins for pointing this out.
+
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ fs/ext4/inode.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/fs/ext4/inode.c
++++ b/fs/ext4/inode.c
+@@ -4349,6 +4349,7 @@ static void ext4_free_branches(handle_t
+ (__le32 *) bh->b_data,
+ (__le32 *) bh->b_data + addr_per_block,
+ depth);
++ brelse(bh);
+
+ /*
+ * Everything below this this pointer has been
--- /dev/null
+From 8f1f745331c1b560f53c0d60e55a4f4f43f7cce5 Mon Sep 17 00:00:00 2001
+From: Eric Sandeen <sandeen@redhat.com>
+Date: Thu, 3 Feb 2011 14:33:15 -0500
+Subject: ext4: fix panic on module unload when stopping lazyinit thread
+
+From: Eric Sandeen <sandeen@redhat.com>
+
+commit 8f1f745331c1b560f53c0d60e55a4f4f43f7cce5 upstream.
+
+https://bugzilla.kernel.org/show_bug.cgi?id=27652
+
+If the lazyinit thread is running, the teardown function
+ext4_destroy_lazyinit_thread() has problems:
+
+ ext4_clear_request_list();
+ while (ext4_li_info->li_task) {
+ wake_up(&ext4_li_info->li_wait_daemon);
+ wait_event(ext4_li_info->li_wait_task,
+ ext4_li_info->li_task == NULL);
+ }
+
+Clearing the request list will cause the thread to exit and free
+ext4_li_info, so then we're waiting on something which is getting
+freed.
+
+Fix this up by making the thread respond to kthread_stop, and exit,
+without the need to wait for that exit in some other homegrown way.
+
+Reported-and-Tested-by: Tao Ma <boyu.mt@taobao.com>
+Signed-off-by: Eric Sandeen <sandeen@redhat.com>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ fs/ext4/super.c | 27 ++++++++++++++-------------
+ 1 file changed, 14 insertions(+), 13 deletions(-)
+
+--- a/fs/ext4/super.c
++++ b/fs/ext4/super.c
+@@ -77,6 +77,7 @@ static struct dentry *ext4_mount(struct
+ const char *dev_name, void *data);
+ static void ext4_destroy_lazyinit_thread(void);
+ static void ext4_unregister_li_request(struct super_block *sb);
++static void ext4_clear_request_list(void);
+
+ #if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
+ static struct file_system_type ext3_fs_type = {
+@@ -2704,6 +2705,8 @@ static void ext4_unregister_li_request(s
+ mutex_unlock(&ext4_li_info->li_list_mtx);
+ }
+
++static struct task_struct *ext4_lazyinit_task;
++
+ /*
+ * This is the function where ext4lazyinit thread lives. It walks
+ * through the request list searching for next scheduled filesystem.
+@@ -2772,6 +2775,10 @@ cont_thread:
+ if (time_before(jiffies, next_wakeup))
+ schedule();
+ finish_wait(&eli->li_wait_daemon, &wait);
++ if (kthread_should_stop()) {
++ ext4_clear_request_list();
++ goto exit_thread;
++ }
+ }
+
+ exit_thread:
+@@ -2796,6 +2803,7 @@ exit_thread:
+ wake_up(&eli->li_wait_task);
+
+ kfree(ext4_li_info);
++ ext4_lazyinit_task = NULL;
+ ext4_li_info = NULL;
+ mutex_unlock(&ext4_li_mtx);
+
+@@ -2818,11 +2826,10 @@ static void ext4_clear_request_list(void
+
+ static int ext4_run_lazyinit_thread(void)
+ {
+- struct task_struct *t;
+-
+- t = kthread_run(ext4_lazyinit_thread, ext4_li_info, "ext4lazyinit");
+- if (IS_ERR(t)) {
+- int err = PTR_ERR(t);
++ ext4_lazyinit_task = kthread_run(ext4_lazyinit_thread,
++ ext4_li_info, "ext4lazyinit");
++ if (IS_ERR(ext4_lazyinit_task)) {
++ int err = PTR_ERR(ext4_lazyinit_task);
+ ext4_clear_request_list();
+ del_timer_sync(&ext4_li_info->li_timer);
+ kfree(ext4_li_info);
+@@ -2973,16 +2980,10 @@ static void ext4_destroy_lazyinit_thread
+ * If thread exited earlier
+ * there's nothing to be done.
+ */
+- if (!ext4_li_info)
++ if (!ext4_li_info || !ext4_lazyinit_task)
+ return;
+
+- ext4_clear_request_list();
+-
+- while (ext4_li_info->li_task) {
+- wake_up(&ext4_li_info->li_wait_daemon);
+- wait_event(ext4_li_info->li_wait_task,
+- ext4_li_info->li_task == NULL);
+- }
++ kthread_stop(ext4_lazyinit_task);
+ }
+
+ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
--- /dev/null
+From ca6e909f9bebe709bc65a3ee605ce32969db0452 Mon Sep 17 00:00:00 2001
+From: Jan Kara <jack@suse.cz>
+Date: Mon, 10 Jan 2011 12:30:39 -0500
+Subject: ext4: fix trimming of a single group
+
+From: Jan Kara <jack@suse.cz>
+
+commit ca6e909f9bebe709bc65a3ee605ce32969db0452 upstream.
+
+When ext4_trim_fs() is called to trim a part of a single group, the
+logic will wrongly set the last block of the interval to 'len' instead
+of 'first_block + len'. Thus a shorter interval is possibly trimmed.
+Fix it.
+
+CC: Lukas Czerner <lczerner@redhat.com>
+Signed-off-by: Jan Kara <jack@suse.cz>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ fs/ext4/mballoc.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/fs/ext4/mballoc.c
++++ b/fs/ext4/mballoc.c
+@@ -4851,7 +4851,7 @@ int ext4_trim_fs(struct super_block *sb,
+ if (len >= EXT4_BLOCKS_PER_GROUP(sb))
+ len -= (EXT4_BLOCKS_PER_GROUP(sb) - first_block);
+ else
+- last_block = len;
++ last_block = first_block + len;
+
+ if (e4b.bd_info->bb_free >= minlen) {
+ cnt = ext4_trim_all_free(sb, &e4b, first_block,
--- /dev/null
+From 6c5a6cb998854f3c579ecb2bc1423d302bcb1b76 Mon Sep 17 00:00:00 2001
+From: Andrew Morton <akpm@linux-foundation.org>
+Date: Mon, 10 Jan 2011 12:30:17 -0500
+Subject: ext4: fix uninitialized variable in ext4_register_li_request
+
+From: Andrew Morton <akpm@linux-foundation.org>
+
+commit 6c5a6cb998854f3c579ecb2bc1423d302bcb1b76 upstream.
+
+fs/ext4/super.c: In function 'ext4_register_li_request':
+fs/ext4/super.c:2936: warning: 'ret' may be used uninitialized in this function
+
+It looks buggy to me, too.
+
+Cc: Lukas Czerner <lczerner@redhat.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ fs/ext4/super.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/fs/ext4/super.c
++++ b/fs/ext4/super.c
+@@ -2916,7 +2916,7 @@ static int ext4_register_li_request(stru
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_li_request *elr;
+ ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
+- int ret;
++ int ret = 0;
+
+ if (sbi->s_li_request != NULL)
+ return 0;
--- /dev/null
+From 2892c15ddda6a76dc10b7499e56c0f3b892e5a69 Mon Sep 17 00:00:00 2001
+From: Eric Sandeen <sandeen@redhat.com>
+Date: Sat, 12 Feb 2011 08:12:18 -0500
+Subject: ext4: make grpinfo slab cache names static
+
+From: Eric Sandeen <sandeen@redhat.com>
+
+commit 2892c15ddda6a76dc10b7499e56c0f3b892e5a69 upstream.
+
+In 2.6.37 I was running into oopses with repeated module
+loads & unloads. I tracked this down to:
+
+fb1813f4 ext4: use dedicated slab caches for group_info structures
+
+(this was in addition to the features advert unload problem)
+
+The kstrdup & subsequent kfree of the cache name was causing
+a double free. In slub, at least, if I read it right it allocates
+& frees the name itself, slab seems to do something different...
+so in slub I think we were leaking -our- cachep->name, and double
+freeing the one allocated by slub.
+
+After getting lost in slab/slub/slob a bit, I just looked at other
+sized-caches that get allocated. jbd2, biovec, sgpool all do it
+more or less the way jbd2 does. Below patch follows the jbd2
+method of dynamically allocating a cache at mount time from
+a list of static names.
+
+(This might also possibly fix a race creating the caches with
+parallel mounts running).
+
+[Folded in a fix from Dan Carpenter which fixed an off-by-one error in
+the original patch]
+
+Signed-off-by: Eric Sandeen <sandeen@redhat.com>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ fs/ext4/mballoc.c | 100 ++++++++++++++++++++++++++++++++----------------------
+ 1 file changed, 60 insertions(+), 40 deletions(-)
+
+--- a/fs/ext4/mballoc.c
++++ b/fs/ext4/mballoc.c
+@@ -342,10 +342,15 @@ static struct kmem_cache *ext4_free_ext_
+ /* We create slab caches for groupinfo data structures based on the
+ * superblock block size. There will be one per mounted filesystem for
+ * each unique s_blocksize_bits */
+-#define NR_GRPINFO_CACHES \
+- (EXT4_MAX_BLOCK_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE + 1)
++#define NR_GRPINFO_CACHES 8
+ static struct kmem_cache *ext4_groupinfo_caches[NR_GRPINFO_CACHES];
+
++static const char *ext4_groupinfo_slab_names[NR_GRPINFO_CACHES] = {
++ "ext4_groupinfo_1k", "ext4_groupinfo_2k", "ext4_groupinfo_4k",
++ "ext4_groupinfo_8k", "ext4_groupinfo_16k", "ext4_groupinfo_32k",
++ "ext4_groupinfo_64k", "ext4_groupinfo_128k"
++};
++
+ static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
+ ext4_group_t group);
+ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
+@@ -2414,6 +2419,55 @@ err_freesgi:
+ return -ENOMEM;
+ }
+
++static void ext4_groupinfo_destroy_slabs(void)
++{
++ int i;
++
++ for (i = 0; i < NR_GRPINFO_CACHES; i++) {
++ if (ext4_groupinfo_caches[i])
++ kmem_cache_destroy(ext4_groupinfo_caches[i]);
++ ext4_groupinfo_caches[i] = NULL;
++ }
++}
++
++static int ext4_groupinfo_create_slab(size_t size)
++{
++ static DEFINE_MUTEX(ext4_grpinfo_slab_create_mutex);
++ int slab_size;
++ int blocksize_bits = order_base_2(size);
++ int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE;
++ struct kmem_cache *cachep;
++
++ if (cache_index >= NR_GRPINFO_CACHES)
++ return -EINVAL;
++
++ if (unlikely(cache_index < 0))
++ cache_index = 0;
++
++ mutex_lock(&ext4_grpinfo_slab_create_mutex);
++ if (ext4_groupinfo_caches[cache_index]) {
++ mutex_unlock(&ext4_grpinfo_slab_create_mutex);
++ return 0; /* Already created */
++ }
++
++ slab_size = offsetof(struct ext4_group_info,
++ bb_counters[blocksize_bits + 2]);
++
++ cachep = kmem_cache_create(ext4_groupinfo_slab_names[cache_index],
++ slab_size, 0, SLAB_RECLAIM_ACCOUNT,
++ NULL);
++
++ mutex_unlock(&ext4_grpinfo_slab_create_mutex);
++ if (!cachep) {
++ printk(KERN_EMERG "EXT4: no memory for groupinfo slab cache\n");
++ return -ENOMEM;
++ }
++
++ ext4_groupinfo_caches[cache_index] = cachep;
++
++ return 0;
++}
++
+ int ext4_mb_init(struct super_block *sb, int needs_recovery)
+ {
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+@@ -2421,9 +2475,6 @@ int ext4_mb_init(struct super_block *sb,
+ unsigned offset;
+ unsigned max;
+ int ret;
+- int cache_index;
+- struct kmem_cache *cachep;
+- char *namep = NULL;
+
+ i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_offsets);
+
+@@ -2440,30 +2491,9 @@ int ext4_mb_init(struct super_block *sb,
+ goto out;
+ }
+
+- cache_index = sb->s_blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE;
+- cachep = ext4_groupinfo_caches[cache_index];
+- if (!cachep) {
+- char name[32];
+- int len = offsetof(struct ext4_group_info,
+- bb_counters[sb->s_blocksize_bits + 2]);
+-
+- sprintf(name, "ext4_groupinfo_%d", sb->s_blocksize_bits);
+- namep = kstrdup(name, GFP_KERNEL);
+- if (!namep) {
+- ret = -ENOMEM;
+- goto out;
+- }
+-
+- /* Need to free the kmem_cache_name() when we
+- * destroy the slab */
+- cachep = kmem_cache_create(namep, len, 0,
+- SLAB_RECLAIM_ACCOUNT, NULL);
+- if (!cachep) {
+- ret = -ENOMEM;
+- goto out;
+- }
+- ext4_groupinfo_caches[cache_index] = cachep;
+- }
++ ret = ext4_groupinfo_create_slab(sb->s_blocksize);
++ if (ret < 0)
++ goto out;
+
+ /* order 0 is regular bitmap */
+ sbi->s_mb_maxs[0] = sb->s_blocksize << 3;
+@@ -2520,7 +2550,6 @@ out:
+ if (ret) {
+ kfree(sbi->s_mb_offsets);
+ kfree(sbi->s_mb_maxs);
+- kfree(namep);
+ }
+ return ret;
+ }
+@@ -2734,7 +2763,6 @@ int __init ext4_init_mballoc(void)
+
+ void ext4_exit_mballoc(void)
+ {
+- int i;
+ /*
+ * Wait for completion of call_rcu()'s on ext4_pspace_cachep
+ * before destroying the slab cache.
+@@ -2743,15 +2771,7 @@ void ext4_exit_mballoc(void)
+ kmem_cache_destroy(ext4_pspace_cachep);
+ kmem_cache_destroy(ext4_ac_cachep);
+ kmem_cache_destroy(ext4_free_ext_cachep);
+-
+- for (i = 0; i < NR_GRPINFO_CACHES; i++) {
+- struct kmem_cache *cachep = ext4_groupinfo_caches[i];
+- if (cachep) {
+- char *name = (char *)kmem_cache_name(cachep);
+- kmem_cache_destroy(cachep);
+- kfree(name);
+- }
+- }
++ ext4_groupinfo_destroy_slabs();
+ ext4_remove_debugfs_entry();
+ }
+
--- /dev/null
+From 8f021222c1e2756ea4c9dde93b23e1d2a0a4ec37 Mon Sep 17 00:00:00 2001
+From: Lukas Czerner <lczerner@redhat.com>
+Date: Thu, 3 Feb 2011 14:33:33 -0500
+Subject: ext4: unregister features interface on module unload
+
+From: Lukas Czerner <lczerner@redhat.com>
+
+commit 8f021222c1e2756ea4c9dde93b23e1d2a0a4ec37 upstream.
+
+The ext4 features interface was not properly unregistered, which led to
+problems while unloading/reloading the ext4 module. This commit fixes that by
+adding proper kobject unregistration code into ext4_exit_fs() as well as
+the fail path of ext4_init_fs().
+
+Reported-by: Eric Sandeen <sandeen@redhat.com>
+Signed-off-by: Lukas Czerner <lczerner@redhat.com>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ fs/ext4/super.c | 12 ++++++++++--
+ 1 file changed, 10 insertions(+), 2 deletions(-)
+
+--- a/fs/ext4/super.c
++++ b/fs/ext4/super.c
+@@ -4757,7 +4757,7 @@ static struct file_system_type ext4_fs_t
+ .fs_flags = FS_REQUIRES_DEV,
+ };
+
+-int __init ext4_init_feat_adverts(void)
++static int __init ext4_init_feat_adverts(void)
+ {
+ struct ext4_features *ef;
+ int ret = -ENOMEM;
+@@ -4781,6 +4781,13 @@ out:
+ return ret;
+ }
+
++static void ext4_exit_feat_adverts(void)
++{
++ kobject_put(&ext4_feat->f_kobj);
++ wait_for_completion(&ext4_feat->f_kobj_unregister);
++ kfree(ext4_feat);
++}
++
+ static int __init ext4_init_fs(void)
+ {
+ int err;
+@@ -4827,7 +4834,7 @@ out1:
+ out2:
+ ext4_exit_mballoc();
+ out3:
+- kfree(ext4_feat);
++ ext4_exit_feat_adverts();
+ remove_proc_entry("fs/ext4", NULL);
+ kset_unregister(ext4_kset);
+ out4:
+@@ -4846,6 +4853,7 @@ static void __exit ext4_exit_fs(void)
+ destroy_inodecache();
+ ext4_exit_xattr();
+ ext4_exit_mballoc();
++ ext4_exit_feat_adverts();
+ remove_proc_entry("fs/ext4", NULL);
+ kset_unregister(ext4_kset);
+ ext4_exit_system_zone();
--- /dev/null
+From f1a06390d013244e721372b3f9b66e39b6429c71 Mon Sep 17 00:00:00 2001
+From: Thomas Gleixner <tglx@linutronix.de>
+Date: Fri, 28 Jan 2011 08:47:15 +0100
+Subject: genirq: Prevent irq storm on migration
+
+From: Thomas Gleixner <tglx@linutronix.de>
+
+commit f1a06390d013244e721372b3f9b66e39b6429c71 upstream.
+
+move_native_irq() masks and unmasks the interrupt line
+unconditionally, but the interrupt line might be masked due to a
+threaded oneshot handler in progress. Unmasking the line in that case
+can lead to interrupt storms. Observed on PREEMPT_RT.
+
+Originally-from: Ingo Molnar <mingo@elte.hu>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ kernel/irq/migration.c | 14 +++++++++++---
+ 1 file changed, 11 insertions(+), 3 deletions(-)
+
+--- a/kernel/irq/migration.c
++++ b/kernel/irq/migration.c
+@@ -56,6 +56,7 @@ void move_masked_irq(int irq)
+ void move_native_irq(int irq)
+ {
+ struct irq_desc *desc = irq_to_desc(irq);
++ bool masked;
+
+ if (likely(!(desc->status & IRQ_MOVE_PENDING)))
+ return;
+@@ -63,8 +64,15 @@ void move_native_irq(int irq)
+ if (unlikely(desc->status & IRQ_DISABLED))
+ return;
+
+- desc->irq_data.chip->irq_mask(&desc->irq_data);
++ /*
++ * Be careful vs. already masked interrupts. If this is a
++ * threaded interrupt with ONESHOT set, we can end up with an
++ * interrupt storm.
++ */
++ masked = desc->status & IRQ_MASKED;
++ if (!masked)
++ desc->irq_data.chip->irq_mask(&desc->irq_data);
+ move_masked_irq(irq);
+- desc->irq_data.chip->irq_unmask(&desc->irq_data);
++ if (!masked)
++ desc->irq_data.chip->irq_unmask(&desc->irq_data);
+ }
+-
--- /dev/null
+From e4a683c899cd5a49f8d684a054c95bd115a0c005 Mon Sep 17 00:00:00 2001
+From: Jerome Marchand <jmarchan@redhat.com>
+Date: Wed, 5 Jan 2011 16:57:37 +0100
+Subject: kref: add kref_test_and_get
+
+From: Jerome Marchand <jmarchan@redhat.com>
+
+commit e4a683c899cd5a49f8d684a054c95bd115a0c005 upstream.
+
+Add a kref_test_and_get() function, which atomically adds a reference only if
+the refcount is not zero. This prevents adding a reference to an object that is
+already being removed.
+
+Signed-off-by: Jerome Marchand <jmarchan@redhat.com>
+Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ include/linux/kref.h | 1 +
+ lib/kref.c | 12 ++++++++++++
+ 2 files changed, 13 insertions(+)
+
+--- a/include/linux/kref.h
++++ b/include/linux/kref.h
+@@ -23,6 +23,7 @@ struct kref {
+
+ void kref_init(struct kref *kref);
+ void kref_get(struct kref *kref);
++int kref_test_and_get(struct kref *kref);
+ int kref_put(struct kref *kref, void (*release) (struct kref *kref));
+
+ #endif /* _KREF_H_ */
+--- a/lib/kref.c
++++ b/lib/kref.c
+@@ -37,6 +37,18 @@ void kref_get(struct kref *kref)
+ }
+
+ /**
++ * kref_test_and_get - increment refcount for object only if refcount is not
++ * zero.
++ * @kref: object.
++ *
++ * Return non-zero if the refcount was incremented, 0 otherwise
++ */
++int kref_test_and_get(struct kref *kref)
++{
++ return atomic_inc_not_zero(&kref->refcount);
++}
++
++/**
+ * kref_put - decrement refcount for object.
+ * @kref: object.
+ * @release: pointer to the function that will clean up the object when the
--- /dev/null
+From fd4a4663db293bfd5dc20fb4113977f62895e550 Mon Sep 17 00:00:00 2001
+From: Hugh Dickins <hughd@google.com>
+Date: Thu, 13 Jan 2011 15:47:31 -0800
+Subject: mm: fix hugepage migration
+
+From: Hugh Dickins <hughd@google.com>
+
+commit fd4a4663db293bfd5dc20fb4113977f62895e550 upstream.
+
+2.6.37 added an unmap_and_move_huge_page() for memory failure recovery,
+but its anon_vma handling was still based around the 2.6.35 conventions.
+Update it to use page_lock_anon_vma, get_anon_vma, page_unlock_anon_vma,
+drop_anon_vma in the same way as we're now changing unmap_and_move().
+
+I don't particularly like to propose this for stable when I've not seen
+its problems in practice nor tested the solution: but it's clearly out of
+synch at present.
+
+Signed-off-by: Hugh Dickins <hughd@google.com>
+Cc: Mel Gorman <mel@csn.ul.ie>
+Cc: Rik van Riel <riel@redhat.com>
+Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
+Cc: "Jun'ichi Nomura" <j-nomura@ce.jp.nec.com>
+Cc: Andi Kleen <ak@linux.intel.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ mm/migrate.c | 23 ++++++-----------------
+ 1 file changed, 6 insertions(+), 17 deletions(-)
+
+--- a/mm/migrate.c
++++ b/mm/migrate.c
+@@ -805,7 +805,6 @@ static int unmap_and_move_huge_page(new_
+ int rc = 0;
+ int *result = NULL;
+ struct page *new_hpage = get_new_page(hpage, private, &result);
+- int rcu_locked = 0;
+ struct anon_vma *anon_vma = NULL;
+
+ if (!new_hpage)
+@@ -820,12 +819,10 @@ static int unmap_and_move_huge_page(new_
+ }
+
+ if (PageAnon(hpage)) {
+- rcu_read_lock();
+- rcu_locked = 1;
+-
+- if (page_mapped(hpage)) {
+- anon_vma = page_anon_vma(hpage);
+- atomic_inc(&anon_vma->external_refcount);
++ anon_vma = page_lock_anon_vma(hpage);
++ if (anon_vma) {
++ get_anon_vma(anon_vma);
++ page_unlock_anon_vma(anon_vma);
+ }
+ }
+
+@@ -837,16 +834,8 @@ static int unmap_and_move_huge_page(new_
+ if (rc)
+ remove_migration_ptes(hpage, hpage);
+
+- if (anon_vma && atomic_dec_and_lock(&anon_vma->external_refcount,
+- &anon_vma->lock)) {
+- int empty = list_empty(&anon_vma->head);
+- spin_unlock(&anon_vma->lock);
+- if (empty)
+- anon_vma_free(anon_vma);
+- }
+-
+- if (rcu_locked)
+- rcu_read_unlock();
++ if (anon_vma)
++ drop_anon_vma(anon_vma);
+ out:
+ unlock_page(hpage);
+
--- /dev/null
+From 1ce82b69e96c838d007f316b8347b911fdfa9842 Mon Sep 17 00:00:00 2001
+From: Hugh Dickins <hughd@google.com>
+Date: Thu, 13 Jan 2011 15:47:30 -0800
+Subject: mm: fix migration hangs on anon_vma lock
+
+From: Hugh Dickins <hughd@google.com>
+
+commit 1ce82b69e96c838d007f316b8347b911fdfa9842 upstream.
+
+Increased usage of page migration in mmotm reveals that the anon_vma
+locking in unmap_and_move() has been deficient since 2.6.36 (or even
+earlier). Review at the time of f18194275c39835cb84563500995e0d503a32d9a
+("mm: fix hang on anon_vma->root->lock") missed the issue here: the
+anon_vma to which we get a reference may already have been freed back to
+its slab (it is in use when we check page_mapped, but that can change),
+and so its anon_vma->root may be switched at any moment by reuse in
+anon_vma_prepare.
+
+Perhaps we could fix that with a get_anon_vma_unless_zero(), but let's
+not: just rely on page_lock_anon_vma() to do all the hard thinking for us,
+then we don't need any rcu read locking over here.
+
+In removing the rcu_unlock label: since PageAnon is a bit in
+page->mapping, it's impossible for a !page->mapping page to be anon; but
+insert VM_BUG_ON in case the implementation ever changes.
+
+[akpm@linux-foundation.org: coding-style fixes]
+Signed-off-by: Hugh Dickins <hughd@google.com>
+Reviewed-by: Mel Gorman <mel@csn.ul.ie>
+Reviewed-by: Rik van Riel <riel@redhat.com>
+Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
+Cc: "Jun'ichi Nomura" <j-nomura@ce.jp.nec.com>
+Cc: Andi Kleen <ak@linux.intel.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ mm/migrate.c | 48 +++++++++++++++++++-----------------------------
+ 1 file changed, 19 insertions(+), 29 deletions(-)
+
+--- a/mm/migrate.c
++++ b/mm/migrate.c
+@@ -620,7 +620,6 @@ static int unmap_and_move(new_page_t get
+ int *result = NULL;
+ struct page *newpage = get_new_page(page, private, &result);
+ int remap_swapcache = 1;
+- int rcu_locked = 0;
+ int charge = 0;
+ struct mem_cgroup *mem = NULL;
+ struct anon_vma *anon_vma = NULL;
+@@ -672,20 +671,26 @@ static int unmap_and_move(new_page_t get
+ /*
+ * By try_to_unmap(), page->mapcount goes down to 0 here. In this case,
+ * we cannot notice that anon_vma is freed while we migrates a page.
+- * This rcu_read_lock() delays freeing anon_vma pointer until the end
++ * This get_anon_vma() delays freeing anon_vma pointer until the end
+ * of migration. File cache pages are no problem because of page_lock()
+ * File Caches may use write_page() or lock_page() in migration, then,
+ * just care Anon page here.
+ */
+ if (PageAnon(page)) {
+- rcu_read_lock();
+- rcu_locked = 1;
+-
+- /* Determine how to safely use anon_vma */
+- if (!page_mapped(page)) {
+- if (!PageSwapCache(page))
+- goto rcu_unlock;
+-
++ /*
++ * Only page_lock_anon_vma() understands the subtleties of
++ * getting a hold on an anon_vma from outside one of its mms.
++ */
++ anon_vma = page_lock_anon_vma(page);
++ if (anon_vma) {
++ /*
++ * Take a reference count on the anon_vma if the
++ * page is mapped so that it is guaranteed to
++ * exist when the page is remapped later
++ */
++ get_anon_vma(anon_vma);
++ page_unlock_anon_vma(anon_vma);
++ } else if (PageSwapCache(page)) {
+ /*
+ * We cannot be sure that the anon_vma of an unmapped
+ * swapcache page is safe to use because we don't
+@@ -700,13 +705,7 @@ static int unmap_and_move(new_page_t get
+ */
+ remap_swapcache = 0;
+ } else {
+- /*
+- * Take a reference count on the anon_vma if the
+- * page is mapped so that it is guaranteed to
+- * exist when the page is remapped later
+- */
+- anon_vma = page_anon_vma(page);
+- get_anon_vma(anon_vma);
++ goto uncharge;
+ }
+ }
+
+@@ -723,16 +722,10 @@ static int unmap_and_move(new_page_t get
+ * free the metadata, so the page can be freed.
+ */
+ if (!page->mapping) {
+- if (!PageAnon(page) && page_has_private(page)) {
+- /*
+- * Go direct to try_to_free_buffers() here because
+- * a) that's what try_to_release_page() would do anyway
+- * b) we may be under rcu_read_lock() here, so we can't
+- * use GFP_KERNEL which is what try_to_release_page()
+- * needs to be effective.
+- */
++ VM_BUG_ON(PageAnon(page));
++ if (page_has_private(page)) {
+ try_to_free_buffers(page);
+- goto rcu_unlock;
++ goto uncharge;
+ }
+ goto skip_unmap;
+ }
+@@ -746,14 +739,11 @@ skip_unmap:
+
+ if (rc && remap_swapcache)
+ remove_migration_ptes(page, page);
+-rcu_unlock:
+
+ /* Drop an anon_vma reference if we took one */
+ if (anon_vma)
+ drop_anon_vma(anon_vma);
+
+- if (rcu_locked)
+- rcu_read_unlock();
+ uncharge:
+ if (!charge)
+ mem_cgroup_end_migration(mem, page, newpage);
--- /dev/null
+From 29c1f677d424e8c5683a837fc4f03fc9f19201d7 Mon Sep 17 00:00:00 2001
+From: Mel Gorman <mel@csn.ul.ie>
+Date: Thu, 13 Jan 2011 15:47:21 -0800
+Subject: mm: migration: use rcu_dereference_protected when dereferencing the radix tree slot during file page migration
+
+From: Mel Gorman <mel@csn.ul.ie>
+
+commit 29c1f677d424e8c5683a837fc4f03fc9f19201d7 upstream.
+
+migrate_pages() -> unmap_and_move() only calls rcu_read_lock() for
+anonymous pages, as introduced by git commit
+989f89c57e6361e7d16fbd9572b5da7d313b073d ("fix rcu_read_lock() in page
+migraton"). The point of the RCU protection there is part of getting a
+stable reference to anon_vma and is only held for anon pages as file pages
+are locked which is sufficient protection against freeing.
+
+However, while a file page's mapping is being migrated, the radix tree is
+double checked to ensure it is the expected page. This uses
+radix_tree_deref_slot() -> rcu_dereference() without the RCU lock held,
+triggering the following warning.
+
+[ 173.674290] ===================================================
+[ 173.676016] [ INFO: suspicious rcu_dereference_check() usage. ]
+[ 173.676016] ---------------------------------------------------
+[ 173.676016] include/linux/radix-tree.h:145 invoked rcu_dereference_check() without protection!
+[ 173.676016]
+[ 173.676016] other info that might help us debug this:
+[ 173.676016]
+[ 173.676016]
+[ 173.676016] rcu_scheduler_active = 1, debug_locks = 0
+[ 173.676016] 1 lock held by hugeadm/2899:
+[ 173.676016] #0: (&(&inode->i_data.tree_lock)->rlock){..-.-.}, at: [<c10e3d2b>] migrate_page_move_mapping+0x40/0x1ab
+[ 173.676016]
+[ 173.676016] stack backtrace:
+[ 173.676016] Pid: 2899, comm: hugeadm Not tainted 2.6.37-rc5-autobuild
+[ 173.676016] Call Trace:
+[ 173.676016] [<c128cc01>] ? printk+0x14/0x1b
+[ 173.676016] [<c1063502>] lockdep_rcu_dereference+0x7d/0x86
+[ 173.676016] [<c10e3db5>] migrate_page_move_mapping+0xca/0x1ab
+[ 173.676016] [<c10e41ad>] migrate_page+0x23/0x39
+[ 173.676016] [<c10e491b>] buffer_migrate_page+0x22/0x107
+[ 173.676016] [<c10e48f9>] ? buffer_migrate_page+0x0/0x107
+[ 173.676016] [<c10e425d>] move_to_new_page+0x9a/0x1ae
+[ 173.676016] [<c10e47e6>] migrate_pages+0x1e7/0x2fa
+
+This patch introduces radix_tree_deref_slot_protected() which calls
+rcu_dereference_protected(). Users of it must pass in the
+mapping->tree_lock that is protecting this dereference. Holding the tree
+lock protects against parallel updaters of the radix tree meaning that
+rcu_dereference_protected is allowable.
+
+[akpm@linux-foundation.org: remove unneeded casts]
+Signed-off-by: Mel Gorman <mel@csn.ul.ie>
+Cc: Minchan Kim <minchan.kim@gmail.com>
+Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
+Cc: Milton Miller <miltonm@bga.com>
+Cc: Nick Piggin <nickpiggin@yahoo.com.au>
+Cc: Wu Fengguang <fengguang.wu@intel.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ include/linux/radix-tree.h | 16 ++++++++++++++++
+ mm/migrate.c | 4 ++--
+ 2 files changed, 18 insertions(+), 2 deletions(-)
+
+--- a/include/linux/radix-tree.h
++++ b/include/linux/radix-tree.h
+@@ -146,6 +146,22 @@ static inline void *radix_tree_deref_slo
+ }
+
+ /**
++ * radix_tree_deref_slot_protected - dereference a slot without RCU lock but with tree lock held
++ * @pslot: pointer to slot, returned by radix_tree_lookup_slot
++ * Returns: item that was stored in that slot with any direct pointer flag
++ * removed.
++ *
++ * Similar to radix_tree_deref_slot but only used during migration when a pages
++ * mapping is being moved. The caller does not hold the RCU read lock but it
++ * must hold the tree lock to prevent parallel updates.
++ */
++static inline void *radix_tree_deref_slot_protected(void **pslot,
++ spinlock_t *treelock)
++{
++ return rcu_dereference_protected(*pslot, lockdep_is_held(treelock));
++}
++
++/**
+ * radix_tree_deref_retry - check radix_tree_deref_slot
+ * @arg: pointer returned by radix_tree_deref_slot
+ * Returns: 0 if retry is not required, otherwise retry is required
+--- a/mm/migrate.c
++++ b/mm/migrate.c
+@@ -246,7 +246,7 @@ static int migrate_page_move_mapping(str
+
+ expected_count = 2 + page_has_private(page);
+ if (page_count(page) != expected_count ||
+- (struct page *)radix_tree_deref_slot(pslot) != page) {
++ radix_tree_deref_slot_protected(pslot, &mapping->tree_lock) != page) {
+ spin_unlock_irq(&mapping->tree_lock);
+ return -EAGAIN;
+ }
+@@ -318,7 +318,7 @@ int migrate_huge_page_move_mapping(struc
+
+ expected_count = 2 + page_has_private(page);
+ if (page_count(page) != expected_count ||
+- (struct page *)radix_tree_deref_slot(pslot) != page) {
++ radix_tree_deref_slot_protected(pslot, &mapping->tree_lock) != page) {
+ spin_unlock_irq(&mapping->tree_lock);
+ return -EAGAIN;
+ }
--- /dev/null
+From 6650239a4b01077e80d5a4468562756d77afaa59 Mon Sep 17 00:00:00 2001
+From: Trond Myklebust <Trond.Myklebust@netapp.com>
+Date: Sat, 8 Jan 2011 17:45:38 -0500
+Subject: NFS: Don't use vm_map_ram() in readdir
+
+From: Trond Myklebust <Trond.Myklebust@netapp.com>
+
+commit 6650239a4b01077e80d5a4468562756d77afaa59 upstream.
+
+vm_map_ram() is not available on NOMMU platforms, and causes trouble
+on incoherent architectures such as ARM when we access the page data
+through both the direct and the virtual mapping.
+
+The alternative is to use the direct mapping to access page data
+for the case when we are not crossing a page boundary, but to copy
+the data into a linear scratch buffer when we are accessing data
+that spans page boundaries.
+
+Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
+Tested-by: Marc Kleine-Budde <mkl@pengutronix.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ fs/nfs/dir.c | 44 ++++++------
+ fs/nfs/nfs2xdr.c | 6 -
+ fs/nfs/nfs3xdr.c | 6 -
+ fs/nfs/nfs4xdr.c | 6 -
+ include/linux/sunrpc/xdr.h | 4 -
+ net/sunrpc/xdr.c | 155 ++++++++++++++++++++++++++++++++++++---------
+ 6 files changed, 148 insertions(+), 73 deletions(-)
+
+--- a/fs/nfs/dir.c
++++ b/fs/nfs/dir.c
+@@ -33,7 +33,6 @@
+ #include <linux/namei.h>
+ #include <linux/mount.h>
+ #include <linux/sched.h>
+-#include <linux/vmalloc.h>
+ #include <linux/kmemleak.h>
+
+ #include "delegation.h"
+@@ -459,25 +458,26 @@ out:
+ /* Perform conversion from xdr to cache array */
+ static
+ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *entry,
+- void *xdr_page, struct page *page, unsigned int buflen)
++ struct page **xdr_pages, struct page *page, unsigned int buflen)
+ {
+ struct xdr_stream stream;
+- struct xdr_buf buf;
+- __be32 *ptr = xdr_page;
++ struct xdr_buf buf = {
++ .pages = xdr_pages,
++ .page_len = buflen,
++ .buflen = buflen,
++ .len = buflen,
++ };
++ struct page *scratch;
+ struct nfs_cache_array *array;
+ unsigned int count = 0;
+ int status;
+
+- buf.head->iov_base = xdr_page;
+- buf.head->iov_len = buflen;
+- buf.tail->iov_len = 0;
+- buf.page_base = 0;
+- buf.page_len = 0;
+- buf.buflen = buf.head->iov_len;
+- buf.len = buf.head->iov_len;
+-
+- xdr_init_decode(&stream, &buf, ptr);
++ scratch = alloc_page(GFP_KERNEL);
++ if (scratch == NULL)
++ return -ENOMEM;
+
++ xdr_init_decode(&stream, &buf, NULL);
++ xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
+
+ do {
+ status = xdr_decode(desc, entry, &stream);
+@@ -506,6 +506,8 @@ int nfs_readdir_page_filler(nfs_readdir_
+ } else
+ status = PTR_ERR(array);
+ }
++
++ put_page(scratch);
+ return status;
+ }
+
+@@ -521,7 +523,6 @@ static
+ void nfs_readdir_free_large_page(void *ptr, struct page **pages,
+ unsigned int npages)
+ {
+- vm_unmap_ram(ptr, npages);
+ nfs_readdir_free_pagearray(pages, npages);
+ }
+
+@@ -530,9 +531,8 @@ void nfs_readdir_free_large_page(void *p
+ * to nfs_readdir_free_large_page
+ */
+ static
+-void *nfs_readdir_large_page(struct page **pages, unsigned int npages)
++int nfs_readdir_large_page(struct page **pages, unsigned int npages)
+ {
+- void *ptr;
+ unsigned int i;
+
+ for (i = 0; i < npages; i++) {
+@@ -541,13 +541,11 @@ void *nfs_readdir_large_page(struct page
+ goto out_freepages;
+ pages[i] = page;
+ }
++ return 0;
+
+- ptr = vm_map_ram(pages, npages, 0, PAGE_KERNEL);
+- if (!IS_ERR_OR_NULL(ptr))
+- return ptr;
+ out_freepages:
+ nfs_readdir_free_pagearray(pages, i);
+- return NULL;
++ return -ENOMEM;
+ }
+
+ static
+@@ -577,8 +575,8 @@ int nfs_readdir_xdr_to_array(nfs_readdir
+ memset(array, 0, sizeof(struct nfs_cache_array));
+ array->eof_index = -1;
+
+- pages_ptr = nfs_readdir_large_page(pages, array_size);
+- if (!pages_ptr)
++ status = nfs_readdir_large_page(pages, array_size);
++ if (status < 0)
+ goto out_release_array;
+ do {
+ unsigned int pglen;
+@@ -587,7 +585,7 @@ int nfs_readdir_xdr_to_array(nfs_readdir
+ if (status < 0)
+ break;
+ pglen = status;
+- status = nfs_readdir_page_filler(desc, &entry, pages_ptr, page, pglen);
++ status = nfs_readdir_page_filler(desc, &entry, pages, page, pglen);
+ if (status < 0) {
+ if (status == -ENOSPC)
+ status = 0;
+--- a/fs/nfs/nfs2xdr.c
++++ b/fs/nfs/nfs2xdr.c
+@@ -487,12 +487,6 @@ nfs_decode_dirent(struct xdr_stream *xdr
+
+ entry->d_type = DT_UNKNOWN;
+
+- p = xdr_inline_peek(xdr, 8);
+- if (p != NULL)
+- entry->eof = !p[0] && p[1];
+- else
+- entry->eof = 0;
+-
+ return p;
+
+ out_overflow:
+--- a/fs/nfs/nfs3xdr.c
++++ b/fs/nfs/nfs3xdr.c
+@@ -647,12 +647,6 @@ nfs3_decode_dirent(struct xdr_stream *xd
+ memset((u8*)(entry->fh), 0, sizeof(*entry->fh));
+ }
+
+- p = xdr_inline_peek(xdr, 8);
+- if (p != NULL)
+- entry->eof = !p[0] && p[1];
+- else
+- entry->eof = 0;
+-
+ return p;
+
+ out_overflow:
+--- a/fs/nfs/nfs4xdr.c
++++ b/fs/nfs/nfs4xdr.c
+@@ -6215,12 +6215,6 @@ __be32 *nfs4_decode_dirent(struct xdr_st
+ if (verify_attr_len(xdr, p, len) < 0)
+ goto out_overflow;
+
+- p = xdr_inline_peek(xdr, 8);
+- if (p != NULL)
+- entry->eof = !p[0] && p[1];
+- else
+- entry->eof = 0;
+-
+ return p;
+
+ out_overflow:
+--- a/include/linux/sunrpc/xdr.h
++++ b/include/linux/sunrpc/xdr.h
+@@ -201,6 +201,8 @@ struct xdr_stream {
+
+ __be32 *end; /* end of available buffer space */
+ struct kvec *iov; /* pointer to the current kvec */
++ struct kvec scratch; /* Scratch buffer */
++ struct page **page_ptr; /* pointer to the current page */
+ };
+
+ extern void xdr_init_encode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p);
+@@ -208,7 +210,7 @@ extern __be32 *xdr_reserve_space(struct
+ extern void xdr_write_pages(struct xdr_stream *xdr, struct page **pages,
+ unsigned int base, unsigned int len);
+ extern void xdr_init_decode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p);
+-extern __be32 *xdr_inline_peek(struct xdr_stream *xdr, size_t nbytes);
++extern void xdr_set_scratch_buffer(struct xdr_stream *xdr, void *buf, size_t buflen);
+ extern __be32 *xdr_inline_decode(struct xdr_stream *xdr, size_t nbytes);
+ extern void xdr_read_pages(struct xdr_stream *xdr, unsigned int len);
+ extern void xdr_enter_page(struct xdr_stream *xdr, unsigned int len);
+--- a/net/sunrpc/xdr.c
++++ b/net/sunrpc/xdr.c
+@@ -552,6 +552,74 @@ void xdr_write_pages(struct xdr_stream *
+ }
+ EXPORT_SYMBOL_GPL(xdr_write_pages);
+
++static void xdr_set_iov(struct xdr_stream *xdr, struct kvec *iov,
++ __be32 *p, unsigned int len)
++{
++ if (len > iov->iov_len)
++ len = iov->iov_len;
++ if (p == NULL)
++ p = (__be32*)iov->iov_base;
++ xdr->p = p;
++ xdr->end = (__be32*)(iov->iov_base + len);
++ xdr->iov = iov;
++ xdr->page_ptr = NULL;
++}
++
++static int xdr_set_page_base(struct xdr_stream *xdr,
++ unsigned int base, unsigned int len)
++{
++ unsigned int pgnr;
++ unsigned int maxlen;
++ unsigned int pgoff;
++ unsigned int pgend;
++ void *kaddr;
++
++ maxlen = xdr->buf->page_len;
++ if (base >= maxlen)
++ return -EINVAL;
++ maxlen -= base;
++ if (len > maxlen)
++ len = maxlen;
++
++ base += xdr->buf->page_base;
++
++ pgnr = base >> PAGE_SHIFT;
++ xdr->page_ptr = &xdr->buf->pages[pgnr];
++ kaddr = page_address(*xdr->page_ptr);
++
++ pgoff = base & ~PAGE_MASK;
++ xdr->p = (__be32*)(kaddr + pgoff);
++
++ pgend = pgoff + len;
++ if (pgend > PAGE_SIZE)
++ pgend = PAGE_SIZE;
++ xdr->end = (__be32*)(kaddr + pgend);
++ xdr->iov = NULL;
++ return 0;
++}
++
++static void xdr_set_next_page(struct xdr_stream *xdr)
++{
++ unsigned int newbase;
++
++ newbase = (1 + xdr->page_ptr - xdr->buf->pages) << PAGE_SHIFT;
++ newbase -= xdr->buf->page_base;
++
++ if (xdr_set_page_base(xdr, newbase, PAGE_SIZE) < 0)
++ xdr_set_iov(xdr, xdr->buf->tail, NULL, xdr->buf->len);
++}
++
++static bool xdr_set_next_buffer(struct xdr_stream *xdr)
++{
++ if (xdr->page_ptr != NULL)
++ xdr_set_next_page(xdr);
++ else if (xdr->iov == xdr->buf->head) {
++ if (xdr_set_page_base(xdr, 0, PAGE_SIZE) < 0)
++ xdr_set_iov(xdr, xdr->buf->tail, NULL, xdr->buf->len);
++ }
++ return xdr->p != xdr->end;
++}
++
+ /**
+ * xdr_init_decode - Initialize an xdr_stream for decoding data.
+ * @xdr: pointer to xdr_stream struct
+@@ -560,41 +628,67 @@ EXPORT_SYMBOL_GPL(xdr_write_pages);
+ */
+ void xdr_init_decode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p)
+ {
+- struct kvec *iov = buf->head;
+- unsigned int len = iov->iov_len;
+-
+- if (len > buf->len)
+- len = buf->len;
+ xdr->buf = buf;
+- xdr->iov = iov;
+- xdr->p = p;
+- xdr->end = (__be32 *)((char *)iov->iov_base + len);
++ xdr->scratch.iov_base = NULL;
++ xdr->scratch.iov_len = 0;
++ if (buf->head[0].iov_len != 0)
++ xdr_set_iov(xdr, buf->head, p, buf->len);
++ else if (buf->page_len != 0)
++ xdr_set_page_base(xdr, 0, buf->len);
+ }
+ EXPORT_SYMBOL_GPL(xdr_init_decode);
+
+-/**
+- * xdr_inline_peek - Allow read-ahead in the XDR data stream
+- * @xdr: pointer to xdr_stream struct
+- * @nbytes: number of bytes of data to decode
+- *
+- * Check if the input buffer is long enough to enable us to decode
+- * 'nbytes' more bytes of data starting at the current position.
+- * If so return the current pointer without updating the current
+- * pointer position.
+- */
+-__be32 * xdr_inline_peek(struct xdr_stream *xdr, size_t nbytes)
++static __be32 * __xdr_inline_decode(struct xdr_stream *xdr, size_t nbytes)
+ {
+ __be32 *p = xdr->p;
+ __be32 *q = p + XDR_QUADLEN(nbytes);
+
+ if (unlikely(q > xdr->end || q < p))
+ return NULL;
++ xdr->p = q;
+ return p;
+ }
+-EXPORT_SYMBOL_GPL(xdr_inline_peek);
+
+ /**
+- * xdr_inline_decode - Retrieve non-page XDR data to decode
++ * xdr_set_scratch_buffer - Attach a scratch buffer for decoding data.
++ * @xdr: pointer to xdr_stream struct
++ * @buf: pointer to an empty buffer
++ * @buflen: size of 'buf'
++ *
++ * The scratch buffer is used when decoding from an array of pages.
++ * If an xdr_inline_decode() call spans across page boundaries, then
++ * we copy the data into the scratch buffer in order to allow linear
++ * access.
++ */
++void xdr_set_scratch_buffer(struct xdr_stream *xdr, void *buf, size_t buflen)
++{
++ xdr->scratch.iov_base = buf;
++ xdr->scratch.iov_len = buflen;
++}
++EXPORT_SYMBOL_GPL(xdr_set_scratch_buffer);
++
++static __be32 *xdr_copy_to_scratch(struct xdr_stream *xdr, size_t nbytes)
++{
++ __be32 *p;
++ void *cpdest = xdr->scratch.iov_base;
++ size_t cplen = (char *)xdr->end - (char *)xdr->p;
++
++ if (nbytes > xdr->scratch.iov_len)
++ return NULL;
++ memcpy(cpdest, xdr->p, cplen);
++ cpdest += cplen;
++ nbytes -= cplen;
++ if (!xdr_set_next_buffer(xdr))
++ return NULL;
++ p = __xdr_inline_decode(xdr, nbytes);
++ if (p == NULL)
++ return NULL;
++ memcpy(cpdest, p, nbytes);
++ return xdr->scratch.iov_base;
++}
++
++/**
++ * xdr_inline_decode - Retrieve XDR data to decode
+ * @xdr: pointer to xdr_stream struct
+ * @nbytes: number of bytes of data to decode
+ *
+@@ -605,13 +699,16 @@ EXPORT_SYMBOL_GPL(xdr_inline_peek);
+ */
+ __be32 * xdr_inline_decode(struct xdr_stream *xdr, size_t nbytes)
+ {
+- __be32 *p = xdr->p;
+- __be32 *q = p + XDR_QUADLEN(nbytes);
++ __be32 *p;
+
+- if (unlikely(q > xdr->end || q < p))
++ if (nbytes == 0)
++ return xdr->p;
++ if (xdr->p == xdr->end && !xdr_set_next_buffer(xdr))
+ return NULL;
+- xdr->p = q;
+- return p;
++ p = __xdr_inline_decode(xdr, nbytes);
++ if (p != NULL)
++ return p;
++ return xdr_copy_to_scratch(xdr, nbytes);
+ }
+ EXPORT_SYMBOL_GPL(xdr_inline_decode);
+
+@@ -671,16 +768,12 @@ EXPORT_SYMBOL_GPL(xdr_read_pages);
+ */
+ void xdr_enter_page(struct xdr_stream *xdr, unsigned int len)
+ {
+- char * kaddr = page_address(xdr->buf->pages[0]);
+ xdr_read_pages(xdr, len);
+ /*
+ * Position current pointer at beginning of tail, and
+ * set remaining message length.
+ */
+- if (len > PAGE_CACHE_SIZE - xdr->buf->page_base)
+- len = PAGE_CACHE_SIZE - xdr->buf->page_base;
+- xdr->p = (__be32 *)(kaddr + xdr->buf->page_base);
+- xdr->end = (__be32 *)((char *)xdr->p + len);
++ xdr_set_page_base(xdr, 0, len);
+ }
+ EXPORT_SYMBOL_GPL(xdr_enter_page);
+
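
The scratch-buffer mechanism documented above is easiest to see from the
caller's side. A minimal decoding sketch, illustrative only and not part of
the patch (the opaque-object layout, the destination buffer and the
scratch_page argument are assumptions):

	static int example_decode_opaque(struct xdr_stream *xdr, void *dest,
					 struct page *scratch_page)
	{
		__be32 *p;
		u32 len;

		/* One page of scratch space lets xdr_inline_decode() return a
		 * linear copy of an object that straddles a page boundary. */
		xdr_set_scratch_buffer(xdr, page_address(scratch_page), PAGE_SIZE);

		p = xdr_inline_decode(xdr, 4);		/* length word */
		if (p == NULL)
			return -EIO;
		len = be32_to_cpup(p);

		p = xdr_inline_decode(xdr, len);	/* may point into the scratch page */
		if (p == NULL)
			return -EIO;
		memcpy(dest, p, len);
		return 0;
	}
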
--- /dev/null
+From e00b8a24041f37e56b4b8415ce4eba1cbc238065 Mon Sep 17 00:00:00 2001
+From: Trond Myklebust <Trond.Myklebust@netapp.com>
+Date: Thu, 27 Jan 2011 14:55:39 -0500
+Subject: NFS: Fix an NFS client lockdep issue
+
+From: Trond Myklebust <Trond.Myklebust@netapp.com>
+
+commit e00b8a24041f37e56b4b8415ce4eba1cbc238065 upstream.
+
+There is no reason to be freeing the delegation cred in the rcu callback,
+and doing so results in a lockdep complaint that rpc_credcache_lock
+is being taken from both softirq and non-softirq contexts.
+
+Reported-by: Chuck Lever <chuck.lever@oracle.com>
+Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ fs/nfs/delegation.c | 6 ++++--
+ 1 file changed, 4 insertions(+), 2 deletions(-)
+
+--- a/fs/nfs/delegation.c
++++ b/fs/nfs/delegation.c
+@@ -23,8 +23,6 @@
+
+ static void nfs_do_free_delegation(struct nfs_delegation *delegation)
+ {
+- if (delegation->cred)
+- put_rpccred(delegation->cred);
+ kfree(delegation);
+ }
+
+@@ -37,6 +35,10 @@ static void nfs_free_delegation_callback
+
+ static void nfs_free_delegation(struct nfs_delegation *delegation)
+ {
++ if (delegation->cred) {
++ put_rpccred(delegation->cred);
++ delegation->cred = NULL;
++ }
+ call_rcu(&delegation->rcu, nfs_free_delegation_callback);
+ }
+
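
The rule the fix follows: an RCU callback runs in softirq context, so a
resource whose release may take process-context locks (here the RPC
credential, guarded by rpc_credcache_lock) is dropped before call_rcu(),
leaving only kfree() for the callback. A generic sketch of that shape
(struct example_obj, struct example_cred and put_example_cred() are
hypothetical names, not from the patch):

	struct example_obj {
		struct example_cred *cred;
		struct rcu_head rcu;
	};

	static void example_free_rcu(struct rcu_head *head)
	{
		/* softirq context: only free memory here */
		kfree(container_of(head, struct example_obj, rcu));
	}

	static void example_free(struct example_obj *obj)
	{
		/* process context: safe to take ordinary locks */
		put_example_cred(obj->cred);
		obj->cred = NULL;
		call_rcu(&obj->rcu, example_free_rcu);
	}
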
--- /dev/null
+From 839f7ad6932d95f4d5ae7267b95c574714ff3d5b Mon Sep 17 00:00:00 2001
+From: Chuck Lever <chuck.lever@oracle.com>
+Date: Fri, 21 Jan 2011 15:54:57 +0000
+Subject: NFS: Fix "kernel BUG at fs/aio.c:554!"
+
+From: Chuck Lever <chuck.lever@oracle.com>
+
+commit 839f7ad6932d95f4d5ae7267b95c574714ff3d5b upstream.
+
+Nick Piggin reports:
+
+> I'm getting use after frees in aio code in NFS
+>
+> [ 2703.396766] Call Trace:
+> [ 2703.396858] [<ffffffff8100b057>] ? native_sched_clock+0x27/0x80
+> [ 2703.396959] [<ffffffff8108509e>] ? put_lock_stats+0xe/0x40
+> [ 2703.397058] [<ffffffff81088348>] ? lock_release_holdtime+0xa8/0x140
+> [ 2703.397159] [<ffffffff8108a2a5>] lock_acquire+0x95/0x1b0
+> [ 2703.397260] [<ffffffff811627db>] ? aio_put_req+0x2b/0x60
+> [ 2703.397361] [<ffffffff81039701>] ? get_parent_ip+0x11/0x50
+> [ 2703.397464] [<ffffffff81612a31>] _raw_spin_lock_irq+0x41/0x80
+> [ 2703.397564] [<ffffffff811627db>] ? aio_put_req+0x2b/0x60
+> [ 2703.397662] [<ffffffff811627db>] aio_put_req+0x2b/0x60
+> [ 2703.397761] [<ffffffff811647fe>] do_io_submit+0x2be/0x7c0
+> [ 2703.397895] [<ffffffff81164d0b>] sys_io_submit+0xb/0x10
+> [ 2703.397995] [<ffffffff8100307b>] system_call_fastpath+0x16/0x1b
+>
+> Adding some tracing, it is due to nfs completing the request then
+> returning something other than -EIOCBQUEUED, so aio.c
+> also completes the request.
+
+To address this, prevent the NFS direct I/O engine from completing
+async iocbs when the forward path returns an error without starting
+any I/O.
+
+This fix appears to survive ^C during both "xfstest no. 208" and "fsx
+-Z."
+
+It's likely this bug has existed for a very long while, as we are seeing
+very similar symptoms in OEL 5. Copying stable.
+
+Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
+Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ fs/nfs/direct.c | 34 ++++++++++++++++++++--------------
+ 1 file changed, 20 insertions(+), 14 deletions(-)
+
+--- a/fs/nfs/direct.c
++++ b/fs/nfs/direct.c
+@@ -407,15 +407,18 @@ static ssize_t nfs_direct_read_schedule_
+ pos += vec->iov_len;
+ }
+
++ /*
++ * If no bytes were started, return the error, and let the
++ * generic layer handle the completion.
++ */
++ if (requested_bytes == 0) {
++ nfs_direct_req_release(dreq);
++ return result < 0 ? result : -EIO;
++ }
++
+ if (put_dreq(dreq))
+ nfs_direct_complete(dreq);
+-
+- if (requested_bytes != 0)
+- return 0;
+-
+- if (result < 0)
+- return result;
+- return -EIO;
++ return 0;
+ }
+
+ static ssize_t nfs_direct_read(struct kiocb *iocb, const struct iovec *iov,
+@@ -841,15 +844,18 @@ static ssize_t nfs_direct_write_schedule
+ pos += vec->iov_len;
+ }
+
++ /*
++ * If no bytes were started, return the error, and let the
++ * generic layer handle the completion.
++ */
++ if (requested_bytes == 0) {
++ nfs_direct_req_release(dreq);
++ return result < 0 ? result : -EIO;
++ }
++
+ if (put_dreq(dreq))
+ nfs_direct_write_complete(dreq, dreq->inode);
+-
+- if (requested_bytes != 0)
+- return 0;
+-
+- if (result < 0)
+- return result;
+- return -EIO;
++ return 0;
+ }
+
+ static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov,
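
The contract the two hunks above enforce: an async handler may return
-EIOCBQUEUED only when it has actually started I/O and will complete the iocb
itself; if nothing was started it must return the error and leave completion
to fs/aio.c, and it must release its own request since no completion callback
will ever run. A simplified sketch of that shape (example_req,
example_schedule_io() and example_req_release() are hypothetical names):

	static ssize_t example_aio_rw(struct kiocb *iocb, struct example_req *req)
	{
		ssize_t started, result;

		result = example_schedule_io(iocb, req, &started);
		if (started == 0) {
			/* nothing in flight: no completion callback will fire,
			 * so drop our reference and report the error; fs/aio.c
			 * completes the iocb from this return value */
			example_req_release(req);
			return result < 0 ? result : -EIO;
		}
		/* I/O in flight: completion will call aio_complete() later */
		return -EIOCBQUEUED;
	}
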
--- /dev/null
+From 8a0eebf66e3b1deae036553ba641a9c2bdbae678 Mon Sep 17 00:00:00 2001
+From: Trond Myklebust <Trond.Myklebust@netapp.com>
+Date: Thu, 13 Jan 2011 14:15:50 -0500
+Subject: NFS: Fix NFSv3 exclusive open semantics
+
+From: Trond Myklebust <Trond.Myklebust@netapp.com>
+
+commit 8a0eebf66e3b1deae036553ba641a9c2bdbae678 upstream.
+
+Commit c0204fd2b8fe047b18b67e07e1bf2a03691240cd (NFS: Clean up
+nfs4_proc_create()) broke NFSv3 exclusive open by removing the code
+that passes the O_EXCL flag down to nfs3_proc_create(). This patch
+reverts that offending hunk from the original commit.
+
+Reported-by: Nick Bowler <nbowler@elliptictech.com>
+Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
+Tested-by: Nick Bowler <nbowler@elliptictech.com>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ fs/nfs/dir.c | 6 +++++-
+ 1 file changed, 5 insertions(+), 1 deletion(-)
+
+--- a/fs/nfs/dir.c
++++ b/fs/nfs/dir.c
+@@ -1577,6 +1577,7 @@ static int nfs_create(struct inode *dir,
+ {
+ struct iattr attr;
+ int error;
++ int open_flags = 0;
+
+ dfprintk(VFS, "NFS: create(%s/%ld), %s\n",
+ dir->i_sb->s_id, dir->i_ino, dentry->d_name.name);
+@@ -1584,7 +1585,10 @@ static int nfs_create(struct inode *dir,
+ attr.ia_mode = mode;
+ attr.ia_valid = ATTR_MODE;
+
+- error = NFS_PROTO(dir)->create(dir, dentry, &attr, 0, NULL);
++ if ((nd->flags & LOOKUP_CREATE) != 0)
++ open_flags = nd->intent.open.flags;
++
++ error = NFS_PROTO(dir)->create(dir, dentry, &attr, open_flags, NULL);
+ if (error != 0)
+ goto out_err;
+ return 0;
--- /dev/null
+From f6af99ec1b261e21219d5eba99e3af48fc6c32d4 Mon Sep 17 00:00:00 2001
+From: J. Bruce Fields <bfields@redhat.com>
+Date: Tue, 4 Jan 2011 18:02:15 -0500
+Subject: nfsd4: name->id mapping should fail with BADOWNER not BADNAME
+
+From: J. Bruce Fields <bfields@redhat.com>
+
+commit f6af99ec1b261e21219d5eba99e3af48fc6c32d4 upstream.
+
+According to rfc 3530 BADNAME is for strings that represent paths;
+BADOWNER is for user/group names that don't map.
+
+And the too-long name should probably be BADOWNER as well; it's
+effectively the same as if we couldn't map it.
+
+Reported-by: Trond Myklebust <Trond.Myklebust@netapp.com>
+Reported-by: Simon Kirby <sim@hostway.ca>
+Signed-off-by: J. Bruce Fields <bfields@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ fs/nfsd/nfs4idmap.c | 4 ++--
+ fs/nfsd/nfsd.h | 1 +
+ fs/nfsd/nfsproc.c | 2 +-
+ 3 files changed, 4 insertions(+), 3 deletions(-)
+
+--- a/fs/nfsd/nfs4idmap.c
++++ b/fs/nfsd/nfs4idmap.c
+@@ -524,13 +524,13 @@ idmap_name_to_id(struct svc_rqst *rqstp,
+ int ret;
+
+ if (namelen + 1 > sizeof(key.name))
+- return -EINVAL;
++ return -ESRCH; /* nfserr_badowner */
+ memcpy(key.name, name, namelen);
+ key.name[namelen] = '\0';
+ strlcpy(key.authname, rqst_authname(rqstp), sizeof(key.authname));
+ ret = idmap_lookup(rqstp, nametoid_lookup, &key, &nametoid_cache, &item);
+ if (ret == -ENOENT)
+- ret = -ESRCH; /* nfserr_badname */
++ ret = -ESRCH; /* nfserr_badowner */
+ if (ret)
+ return ret;
+ *id = item->id;
+--- a/fs/nfsd/nfsd.h
++++ b/fs/nfsd/nfsd.h
+@@ -158,6 +158,7 @@ void nfsd_lockd_shutdown(void);
+ #define nfserr_attrnotsupp cpu_to_be32(NFSERR_ATTRNOTSUPP)
+ #define nfserr_bad_xdr cpu_to_be32(NFSERR_BAD_XDR)
+ #define nfserr_openmode cpu_to_be32(NFSERR_OPENMODE)
++#define nfserr_badowner cpu_to_be32(NFSERR_BADOWNER)
+ #define nfserr_locks_held cpu_to_be32(NFSERR_LOCKS_HELD)
+ #define nfserr_op_illegal cpu_to_be32(NFSERR_OP_ILLEGAL)
+ #define nfserr_grace cpu_to_be32(NFSERR_GRACE)
+--- a/fs/nfsd/nfsproc.c
++++ b/fs/nfsd/nfsproc.c
+@@ -737,7 +737,7 @@ nfserrno (int errno)
+ { nfserr_jukebox, -ERESTARTSYS },
+ { nfserr_dropit, -EAGAIN },
+ { nfserr_dropit, -ENOMEM },
+- { nfserr_badname, -ESRCH },
++ { nfserr_badowner, -ESRCH },
+ { nfserr_io, -ETXTBSY },
+ { nfserr_notsupp, -EOPNOTSUPP },
+ { nfserr_toosmall, -ETOOSMALL },
--- /dev/null
+From ceff1a770933e2ca2bf995b453dade4ec47a9878 Mon Sep 17 00:00:00 2001
+From: Dave Anderson <anderson@redhat.com>
+Date: Wed, 12 Jan 2011 17:00:36 -0800
+Subject: /proc/kcore: fix seeking
+
+From: Dave Anderson <anderson@redhat.com>
+
+commit ceff1a770933e2ca2bf995b453dade4ec47a9878 upstream.
+
+Commit 34aacb2920 ("procfs: Use generic_file_llseek in /proc/kcore") broke
+seeking on /proc/kcore. This changes it back to use default_llseek in
+order to restore the original behavior.
+
+The problem with generic_file_llseek is that it only allows seeks up to
+inode->i_sb->s_maxbytes, which is 2GB-1 on procfs, where the memory file
+offset values in the /proc/kcore PT_LOAD segments may exceed or start
+beyond that offset value.
+
+A similar revert was made for /proc/vmcore.
+
+Signed-off-by: Dave Anderson <anderson@redhat.com>
+Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ fs/proc/kcore.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/fs/proc/kcore.c
++++ b/fs/proc/kcore.c
+@@ -558,7 +558,7 @@ static int open_kcore(struct inode *inod
+ static const struct file_operations proc_kcore_operations = {
+ .read = read_kcore,
+ .open = open_kcore,
+- .llseek = generic_file_llseek,
++ .llseek = default_llseek,
+ };
+
+ #ifdef CONFIG_MEMORY_HOTPLUG
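
The behavioural difference the revert restores, reduced to the check that
matters (a simplified sketch, not the exact kernel implementations):

	static loff_t sketch_generic_llseek(struct file *file, loff_t new_pos)
	{
		/* bounded by the filesystem's maximum file size: 2GB-1 on procfs */
		if (new_pos < 0 || new_pos > file->f_mapping->host->i_sb->s_maxbytes)
			return -EINVAL;
		file->f_pos = new_pos;
		return new_pos;
	}

	static loff_t sketch_default_llseek(struct file *file, loff_t new_pos)
	{
		/* only a non-negative offset is required, so the large PT_LOAD
		 * file offsets in /proc/kcore remain reachable */
		if (new_pos < 0)
			return -EINVAL;
		file->f_pos = new_pos;
		return new_pos;
	}
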
--- /dev/null
+From db8b10167126d72829653690f57b9c7ca53c4d54 Mon Sep 17 00:00:00 2001
+From: Steve Wise <swise@opengridcomputing.com>
+Date: Mon, 10 Jan 2011 17:41:43 -0800
+Subject: RDMA/cxgb4: Don't re-init wait object in init/fini paths
+
+From: Steve Wise <swise@opengridcomputing.com>
+
+commit db8b10167126d72829653690f57b9c7ca53c4d54 upstream.
+
+Re-initializing the wait object in rdma_init()/rdma_fini() causes a
+timing window which can lead to a deadlock during close. Once this
+deadlock hits, all RDMA activity over the T4 device will be stuck.
+
+There's no need to re-init the wait object, so remove it.
+
+Signed-off-by: Steve Wise <swise@opengridcomputing.com>
+Signed-off-by: Roland Dreier <rolandd@cisco.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ drivers/infiniband/hw/cxgb4/qp.c | 2 --
+ 1 file changed, 2 deletions(-)
+
+--- a/drivers/infiniband/hw/cxgb4/qp.c
++++ b/drivers/infiniband/hw/cxgb4/qp.c
+@@ -1029,7 +1029,6 @@ static int rdma_fini(struct c4iw_dev *rh
+ wqe->cookie = (unsigned long) &ep->com.wr_wait;
+
+ wqe->u.fini.type = FW_RI_TYPE_FINI;
+- c4iw_init_wr_wait(&ep->com.wr_wait);
+ ret = c4iw_ofld_send(&rhp->rdev, skb);
+ if (ret)
+ goto out;
+@@ -1125,7 +1124,6 @@ static int rdma_init(struct c4iw_dev *rh
+ if (qhp->attr.mpa_attr.initiator)
+ build_rtr_msg(qhp->attr.mpa_attr.p2p_type, &wqe->u.init);
+
+- c4iw_init_wr_wait(&qhp->ep->com.wr_wait);
+ ret = c4iw_ofld_send(&rhp->rdev, skb);
+ if (ret)
+ goto out;
--- /dev/null
+From 6a09a9d6946dd516d243d072bee83fae3c683471 Mon Sep 17 00:00:00 2001
+From: Steve Wise <swise@opengridcomputing.com>
+Date: Fri, 21 Jan 2011 17:00:29 +0000
+Subject: RDMA/cxgb4: Limit MAXBURST EQ context field to 256B
+
+From: Steve Wise <swise@opengridcomputing.com>
+
+commit 6a09a9d6946dd516d243d072bee83fae3c683471 upstream.
+
+MAXBURST cannot exceed 256B for on-chip queues. With a 512B MAXBURST,
+we can lock up the chip.
+
+Signed-off-by: Steve Wise <swise@opengridcomputing.com>
+Signed-off-by: Roland Dreier <roland@purestorage.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ drivers/infiniband/hw/cxgb4/qp.c | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+--- a/drivers/infiniband/hw/cxgb4/qp.c
++++ b/drivers/infiniband/hw/cxgb4/qp.c
+@@ -220,7 +220,7 @@ static int create_qp(struct c4iw_rdev *r
+ V_FW_RI_RES_WR_DCAEN(0) |
+ V_FW_RI_RES_WR_DCACPU(0) |
+ V_FW_RI_RES_WR_FBMIN(2) |
+- V_FW_RI_RES_WR_FBMAX(3) |
++ V_FW_RI_RES_WR_FBMAX(2) |
+ V_FW_RI_RES_WR_CIDXFTHRESHO(0) |
+ V_FW_RI_RES_WR_CIDXFTHRESH(0) |
+ V_FW_RI_RES_WR_EQSIZE(eqsize));
+@@ -243,7 +243,7 @@ static int create_qp(struct c4iw_rdev *r
+ V_FW_RI_RES_WR_DCAEN(0) |
+ V_FW_RI_RES_WR_DCACPU(0) |
+ V_FW_RI_RES_WR_FBMIN(2) |
+- V_FW_RI_RES_WR_FBMAX(3) |
++ V_FW_RI_RES_WR_FBMAX(2) |
+ V_FW_RI_RES_WR_CIDXFTHRESHO(0) |
+ V_FW_RI_RES_WR_CIDXFTHRESH(0) |
+ V_FW_RI_RES_WR_EQSIZE(eqsize));
--- /dev/null
+From 94788657c94169171971968c9d4b6222c5e704aa Mon Sep 17 00:00:00 2001
+From: Steve Wise <swise@opengridcomputing.com>
+Date: Fri, 21 Jan 2011 17:00:34 +0000
+Subject: RDMA/cxgb4: Set the correct device physical function for iWARP connections
+
+From: Steve Wise <swise@opengridcomputing.com>
+
+commit 94788657c94169171971968c9d4b6222c5e704aa upstream.
+
+The PF passed to FW was 0, causing PCI failures in an SR-IOV environment.
+
+Signed-off-by: Steve Wise <swise@opengridcomputing.com>
+Signed-off-by: Roland Dreier <roland@purestorage.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ drivers/infiniband/hw/cxgb4/cm.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/drivers/infiniband/hw/cxgb4/cm.c
++++ b/drivers/infiniband/hw/cxgb4/cm.c
+@@ -380,7 +380,7 @@ static void send_flowc(struct c4iw_ep *e
+ 16)) | FW_WR_FLOWID(ep->hwtid));
+
+ flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_PFNVFN;
+- flowc->mnemval[0].val = cpu_to_be32(0);
++ flowc->mnemval[0].val = cpu_to_be32(PCI_FUNC(ep->com.dev->rdev.lldi.pdev->devfn) << 8);
+ flowc->mnemval[1].mnemonic = FW_FLOWC_MNEM_CH;
+ flowc->mnemval[1].val = cpu_to_be32(ep->tx_chan);
+ flowc->mnemval[2].mnemonic = FW_FLOWC_MNEM_PORT;
--- /dev/null
+From 2fb08e6ca9f00d1aedb3964983e9c8f84b36b807 Mon Sep 17 00:00:00 2001
+From: Paul Fox <pgf@laptop.org>
+Date: Wed, 12 Jan 2011 17:00:07 -0800
+Subject: rtc-cmos: fix suspend/resume
+
+From: Paul Fox <pgf@laptop.org>
+
+commit 2fb08e6ca9f00d1aedb3964983e9c8f84b36b807 upstream.
+
+rtc-cmos was setting suspend/resume hooks at the device_driver level.
+However, the platform bus code (drivers/base/platform.c) only looks for
+resume hooks at the dev_pm_ops level, or within the platform_driver.
+
+Switch rtc_cmos to use dev_pm_ops so that suspend/resume code is executed
+again.
+
+Paul said:
+
+: The user visible symptom in our (XO laptop) case was that rtcwake would
+: fail to wake the laptop. The RTC alarm would expire, but the wakeup
+: wasn't unmasked.
+:
+: As for severity, the impact may have been reduced because if I recall
+: correctly, the bug only affected platforms with CONFIG_PNP disabled.
+
+Signed-off-by: Paul Fox <pgf@laptop.org>
+Signed-off-by: Daniel Drake <dsd@laptop.org>
+Acked-by: Rafael J. Wysocki <rjw@sisk.pl>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ drivers/rtc/rtc-cmos.c | 16 +++++++++-------
+ 1 file changed, 9 insertions(+), 7 deletions(-)
+
+--- a/drivers/rtc/rtc-cmos.c
++++ b/drivers/rtc/rtc-cmos.c
+@@ -36,6 +36,7 @@
+ #include <linux/platform_device.h>
+ #include <linux/mod_devicetable.h>
+ #include <linux/log2.h>
++#include <linux/pm.h>
+
+ /* this is for "generic access to PC-style RTC" using CMOS_READ/CMOS_WRITE */
+ #include <asm-generic/rtc.h>
+@@ -850,7 +851,7 @@ static void __exit cmos_do_remove(struct
+
+ #ifdef CONFIG_PM
+
+-static int cmos_suspend(struct device *dev, pm_message_t mesg)
++static int cmos_suspend(struct device *dev)
+ {
+ struct cmos_rtc *cmos = dev_get_drvdata(dev);
+ unsigned char tmp;
+@@ -898,7 +899,7 @@ static int cmos_suspend(struct device *d
+ */
+ static inline int cmos_poweroff(struct device *dev)
+ {
+- return cmos_suspend(dev, PMSG_HIBERNATE);
++ return cmos_suspend(dev);
+ }
+
+ static int cmos_resume(struct device *dev)
+@@ -945,9 +946,9 @@ static int cmos_resume(struct device *de
+ return 0;
+ }
+
++static SIMPLE_DEV_PM_OPS(cmos_pm_ops, cmos_suspend, cmos_resume);
++
+ #else
+-#define cmos_suspend NULL
+-#define cmos_resume NULL
+
+ static inline int cmos_poweroff(struct device *dev)
+ {
+@@ -1077,7 +1078,7 @@ static void __exit cmos_pnp_remove(struc
+
+ static int cmos_pnp_suspend(struct pnp_dev *pnp, pm_message_t mesg)
+ {
+- return cmos_suspend(&pnp->dev, mesg);
++ return cmos_suspend(&pnp->dev);
+ }
+
+ static int cmos_pnp_resume(struct pnp_dev *pnp)
+@@ -1157,8 +1158,9 @@ static struct platform_driver cmos_platf
+ .shutdown = cmos_platform_shutdown,
+ .driver = {
+ .name = (char *) driver_name,
+- .suspend = cmos_suspend,
+- .resume = cmos_resume,
++#ifdef CONFIG_PM
++ .pm = &cmos_pm_ops,
++#endif
+ }
+ };
+
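
The shape of the fix, generalized: the platform bus only consults dev_pm_ops
(or platform_driver-level hooks), not the device_driver-level .suspend/.resume
members, so the callbacks have to be wired up through a struct dev_pm_ops.
A minimal pattern for a hypothetical platform driver, mirroring the change
above:

	static int foo_suspend(struct device *dev)
	{
		/* quiesce the device, arm wakeups, etc. */
		return 0;
	}

	static int foo_resume(struct device *dev)
	{
		/* restore device state */
		return 0;
	}

	static SIMPLE_DEV_PM_OPS(foo_pm_ops, foo_suspend, foo_resume);

	static struct platform_driver foo_driver = {
		.driver = {
			.name = "foo",
			.pm   = &foo_pm_ops,	/* consulted by the PM core */
		},
	};
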
ath9k_hw-do-pa-offset-calibration-only-on-longcal-interval.patch
ath9k_hw-disabled-paprd-for-ar9003.patch
ath9k_hw-fix-system-hang-when-resuming-from-s3-s4.patch
-ath9k-fix-race-conditions-when-stop-device.patch
ath-missed-to-clear-key4-of-micentry.patch
qdio-use-proper-qebsm-operand-for-siga-r-and-siga-s.patch
zcrypt-fix-check-to-look-for-facility-bits-2-65.patch
asoc-wm8990-msleep-takes-milliseconds-not-jiffies.patch
asoc-blackfin-ac97-fix-build-error-after-multi-component-update.patch
asoc-blackfin-tdm-fix-missed-snd_soc_dai_get_drvdata-update.patch
+nfs-don-t-use-vm_map_ram-in-readdir.patch
+nfs-fix-nfsv3-exclusive-open-semantics.patch
+nfs-fix-an-nfs-client-lockdep-issue.patch
+nfs-fix-kernel-bug-at-fs-aio.c-554.patch
+nfsd4-name-id-mapping-should-fail-with-badowner-not-badname.patch
+dynamic-debug-fix-build-issue-with-older-gcc.patch
+rdma-cxgb4-don-t-re-init-wait-object-in-init-fini-paths.patch
+rdma-cxgb4-set-the-correct-device-physical-function-for-iwarp-connections.patch
+rdma-cxgb4-limit-maxburst-eq-context-field-to-256b.patch
+proc-kcore-fix-seeking.patch
+rtc-cmos-fix-suspend-resume.patch
+kref-add-kref_test_and_get.patch
+block-fix-accounting-bug-on-cross-partition-merges.patch
+mm-migration-use-rcu_dereference_protected-when-dereferencing-the-radix-tree-slot-during-file-page-migration.patch
+mm-fix-migration-hangs-on-anon_vma-lock.patch
+mm-fix-hugepage-migration.patch
+genirq-prevent-irq-storm-on-migration.patch
+writeback-integrated-background-writeback-work.patch
+writeback-stop-background-kupdate-works-from-livelocking-other-works.patch
+writeback-avoid-livelocking-wb_sync_all-writeback.patch
+ext4-fix-uninitialized-variable-in-ext4_register_li_request.patch
+ext4-fix-trimming-of-a-single-group.patch
+ext4-fix-memory-leak-in-ext4_free_branches.patch
+ext4-fix-panic-on-module-unload-when-stopping-lazyinit-thread.patch
+ext4-unregister-features-interface-on-module-unload.patch
+ext4-fix-data-corruption-with-multi-block-writepages-support.patch
+ext4-make-grpinfo-slab-cache-names-static.patch
--- /dev/null
+From b9543dac5bbc4aef0a598965b6b34f6259ab9a9b Mon Sep 17 00:00:00 2001
+From: Jan Kara <jack@suse.cz>
+Date: Thu, 13 Jan 2011 15:45:48 -0800
+Subject: writeback: avoid livelocking WB_SYNC_ALL writeback
+
+From: Jan Kara <jack@suse.cz>
+
+commit b9543dac5bbc4aef0a598965b6b34f6259ab9a9b upstream.
+
+When wb_writeback() is called in WB_SYNC_ALL mode, work->nr_to_write is
+usually set to LONG_MAX. The logic in wb_writeback() then calls
+__writeback_inodes_sb() with nr_to_write == MAX_WRITEBACK_PAGES and we
+easily end up with non-positive nr_to_write after the function returns, if
+the inode has more than MAX_WRITEBACK_PAGES dirty pages at the moment.
+
+When nr_to_write is <= 0 wb_writeback() decides we need another round of
+writeback but this is wrong in some cases! For example when a single
+large file is continuously dirtied, we would never finish syncing it
+because each pass would be able to write MAX_WRITEBACK_PAGES and inode
+dirty timestamp never gets updated (as inode is never completely clean).
+Thus __writeback_inodes_sb() would write the redirtied inode again and
+again.
+
+Fix the issue by setting nr_to_write to LONG_MAX in WB_SYNC_ALL mode. We
+do not need nr_to_write in WB_SYNC_ALL mode anyway since
+write_cache_pages() does livelock avoidance using page tagging in
+WB_SYNC_ALL mode.
+
+This makes wb_writeback() call __writeback_inodes_sb() only once on
+WB_SYNC_ALL. The latter function won't livelock because it works on
+
+- a finite set of files by doing queue_io() once at the beginning
+- a finite set of pages by PAGECACHE_TAG_TOWRITE page tagging
+
+After this patch, program from http://lkml.org/lkml/2010/10/24/154 is no
+longer able to stall sync forever.
+
+[fengguang.wu@intel.com: fix locking comment]
+Signed-off-by: Jan Kara <jack@suse.cz>
+Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
+Cc: Johannes Weiner <hannes@cmpxchg.org>
+Cc: Dave Chinner <david@fromorbit.com>
+Cc: Christoph Hellwig <hch@lst.de>
+Cc: Jan Engelhardt <jengelh@medozas.de>
+Cc: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ fs/fs-writeback.c | 27 +++++++++++++++++++++++----
+ 1 file changed, 23 insertions(+), 4 deletions(-)
+
+--- a/fs/fs-writeback.c
++++ b/fs/fs-writeback.c
+@@ -629,6 +629,7 @@ static long wb_writeback(struct bdi_writ
+ };
+ unsigned long oldest_jif;
+ long wrote = 0;
++ long write_chunk;
+ struct inode *inode;
+
+ if (wbc.for_kupdate) {
+@@ -641,6 +642,24 @@ static long wb_writeback(struct bdi_writ
+ wbc.range_end = LLONG_MAX;
+ }
+
++ /*
++ * WB_SYNC_ALL mode does livelock avoidance by syncing dirty
++ * inodes/pages in one big loop. Setting wbc.nr_to_write=LONG_MAX
++ * here avoids calling into writeback_inodes_wb() more than once.
++ *
++ * The intended call sequence for WB_SYNC_ALL writeback is:
++ *
++ * wb_writeback()
++ * __writeback_inodes_sb() <== called only once
++ * write_cache_pages() <== called once for each inode
++ * (quickly) tag currently dirty pages
++ * (maybe slowly) sync all tagged pages
++ */
++ if (wbc.sync_mode == WB_SYNC_NONE)
++ write_chunk = MAX_WRITEBACK_PAGES;
++ else
++ write_chunk = LONG_MAX;
++
+ wbc.wb_start = jiffies; /* livelock avoidance */
+ for (;;) {
+ /*
+@@ -667,7 +686,7 @@ static long wb_writeback(struct bdi_writ
+ break;
+
+ wbc.more_io = 0;
+- wbc.nr_to_write = MAX_WRITEBACK_PAGES;
++ wbc.nr_to_write = write_chunk;
+ wbc.pages_skipped = 0;
+
+ trace_wbc_writeback_start(&wbc, wb->bdi);
+@@ -677,8 +696,8 @@ static long wb_writeback(struct bdi_writ
+ writeback_inodes_wb(wb, &wbc);
+ trace_wbc_writeback_written(&wbc, wb->bdi);
+
+- work->nr_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
+- wrote += MAX_WRITEBACK_PAGES - wbc.nr_to_write;
++ work->nr_pages -= write_chunk - wbc.nr_to_write;
++ wrote += write_chunk - wbc.nr_to_write;
+
+ /*
+ * If we consumed everything, see if we have more
+@@ -693,7 +712,7 @@ static long wb_writeback(struct bdi_writ
+ /*
+ * Did we write something? Try for more
+ */
+- if (wbc.nr_to_write < MAX_WRITEBACK_PAGES)
++ if (wbc.nr_to_write < write_chunk)
+ continue;
+ /*
+ * Nothing written. Wait for some inode to
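
A concrete walk-through of the livelock the patch removes, using the numbers
implied above (MAX_WRITEBACK_PAGES is 1024 in this kernel; the workload is
assumed):

- sync(1) of a continuously-dirtied large file enters wb_writeback() in
  WB_SYNC_ALL mode with work->nr_pages = LONG_MAX
- each pass sets wbc.nr_to_write = 1024, writes 1024 pages of the file and
  returns with nr_to_write == 0
- "nr_to_write < MAX_WRITEBACK_PAGES" is taken as progress, so another pass
  starts, while the writer keeps redirtying pages and the inode never becomes
  clean - sync never returns

With write_chunk = LONG_MAX there is a single pass: write_cache_pages() tags
the pages that are dirty at that moment and writes only those, so the pass
terminates even though new dirty pages keep appearing.
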
--- /dev/null
+From 6585027a5e8cb490e3a761b2f3f3c3acf722aff2 Mon Sep 17 00:00:00 2001
+From: Jan Kara <jack@suse.cz>
+Date: Thu, 13 Jan 2011 15:45:44 -0800
+Subject: writeback: integrated background writeback work
+
+From: Jan Kara <jack@suse.cz>
+
+commit 6585027a5e8cb490e3a761b2f3f3c3acf722aff2 upstream.
+
+Check whether background writeback is needed after finishing each work.
+
+When the bdi flusher thread finishes doing some work, check whether any kind
+of background writeback needs to be done (either because
+dirty_background_ratio is exceeded or because we need to start flushing
+old inodes). If so, just do background writeback.
+
+This way, bdi_start_background_writeback() just needs to wake up the
+flusher thread. It will do background writeback as soon as there is no
+other work.
+
+This is a preparatory patch for the next patch which stops background
+writeback as soon as there is other work to do.
+
+Signed-off-by: Jan Kara <jack@suse.cz>
+Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
+Cc: Johannes Weiner <hannes@cmpxchg.org>
+Cc: Dave Chinner <david@fromorbit.com>
+Cc: Christoph Hellwig <hch@lst.de>
+Cc: Jan Engelhardt <jengelh@medozas.de>
+Cc: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ fs/fs-writeback.c | 61 ++++++++++++++++++++++++++++++++++++++++--------------
+ 1 file changed, 46 insertions(+), 15 deletions(-)
+
+--- a/fs/fs-writeback.c
++++ b/fs/fs-writeback.c
+@@ -84,13 +84,9 @@ static inline struct inode *wb_inode(str
+ return list_entry(head, struct inode, i_wb_list);
+ }
+
+-static void bdi_queue_work(struct backing_dev_info *bdi,
+- struct wb_writeback_work *work)
++/* Wakeup flusher thread or forker thread to fork it. Requires bdi->wb_lock. */
++static void bdi_wakeup_flusher(struct backing_dev_info *bdi)
+ {
+- trace_writeback_queue(bdi, work);
+-
+- spin_lock_bh(&bdi->wb_lock);
+- list_add_tail(&work->list, &bdi->work_list);
+ if (bdi->wb.task) {
+ wake_up_process(bdi->wb.task);
+ } else {
+@@ -98,15 +94,26 @@ static void bdi_queue_work(struct backin
+ * The bdi thread isn't there, wake up the forker thread which
+ * will create and run it.
+ */
+- trace_writeback_nothread(bdi, work);
+ wake_up_process(default_backing_dev_info.wb.task);
+ }
++}
++
++static void bdi_queue_work(struct backing_dev_info *bdi,
++ struct wb_writeback_work *work)
++{
++ trace_writeback_queue(bdi, work);
++
++ spin_lock_bh(&bdi->wb_lock);
++ list_add_tail(&work->list, &bdi->work_list);
++ if (!bdi->wb.task)
++ trace_writeback_nothread(bdi, work);
++ bdi_wakeup_flusher(bdi);
+ spin_unlock_bh(&bdi->wb_lock);
+ }
+
+ static void
+ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
+- bool range_cyclic, bool for_background)
++ bool range_cyclic)
+ {
+ struct wb_writeback_work *work;
+
+@@ -126,7 +133,6 @@ __bdi_start_writeback(struct backing_dev
+ work->sync_mode = WB_SYNC_NONE;
+ work->nr_pages = nr_pages;
+ work->range_cyclic = range_cyclic;
+- work->for_background = for_background;
+
+ bdi_queue_work(bdi, work);
+ }
+@@ -144,7 +150,7 @@ __bdi_start_writeback(struct backing_dev
+ */
+ void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages)
+ {
+- __bdi_start_writeback(bdi, nr_pages, true, false);
++ __bdi_start_writeback(bdi, nr_pages, true);
+ }
+
+ /**
+@@ -152,13 +158,20 @@ void bdi_start_writeback(struct backing_
+ * @bdi: the backing device to write from
+ *
+ * Description:
+- * This does WB_SYNC_NONE background writeback. The IO is only
+- * started when this function returns, we make no guarentees on
+- * completion. Caller need not hold sb s_umount semaphore.
++ * This makes sure WB_SYNC_NONE background writeback happens. When
++ * this function returns, it is only guaranteed that for given BDI
++ * some IO is happening if we are over background dirty threshold.
++ * Caller need not hold sb s_umount semaphore.
+ */
+ void bdi_start_background_writeback(struct backing_dev_info *bdi)
+ {
+- __bdi_start_writeback(bdi, LONG_MAX, true, true);
++ /*
++ * We just wake up the flusher thread. It will perform background
++ * writeback as soon as there is no other work to do.
++ */
++ spin_lock_bh(&bdi->wb_lock);
++ bdi_wakeup_flusher(bdi);
++ spin_unlock_bh(&bdi->wb_lock);
+ }
+
+ /*
+@@ -718,6 +731,23 @@ static unsigned long get_nr_dirty_pages(
+ get_nr_dirty_inodes();
+ }
+
++static long wb_check_background_flush(struct bdi_writeback *wb)
++{
++ if (over_bground_thresh()) {
++
++ struct wb_writeback_work work = {
++ .nr_pages = LONG_MAX,
++ .sync_mode = WB_SYNC_NONE,
++ .for_background = 1,
++ .range_cyclic = 1,
++ };
++
++ return wb_writeback(wb, &work);
++ }
++
++ return 0;
++}
++
+ static long wb_check_old_data_flush(struct bdi_writeback *wb)
+ {
+ unsigned long expired;
+@@ -787,6 +817,7 @@ long wb_do_writeback(struct bdi_writebac
+ * Check for periodic writeback, kupdated() style
+ */
+ wrote += wb_check_old_data_flush(wb);
++ wrote += wb_check_background_flush(wb);
+ clear_bit(BDI_writeback_running, &wb->bdi->state);
+
+ return wrote;
+@@ -873,7 +904,7 @@ void wakeup_flusher_threads(long nr_page
+ list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
+ if (!bdi_has_dirty_io(bdi))
+ continue;
+- __bdi_start_writeback(bdi, nr_pages, false, false);
++ __bdi_start_writeback(bdi, nr_pages, false);
+ }
+ rcu_read_unlock();
+ }
--- /dev/null
+From aa373cf550994623efb5d49a4d8775bafd10bbc1 Mon Sep 17 00:00:00 2001
+From: Jan Kara <jack@suse.cz>
+Date: Thu, 13 Jan 2011 15:45:47 -0800
+Subject: writeback: stop background/kupdate works from livelocking other works
+
+From: Jan Kara <jack@suse.cz>
+
+commit aa373cf550994623efb5d49a4d8775bafd10bbc1 upstream.
+
+Background writeback is easily livelockable in a loop in wb_writeback() by
+a process continuously re-dirtying pages (or continuously appending to a
+file). This is in fact intended as the target of background writeback is
+to write dirty pages it can find as long as we are over
+dirty_background_threshold.
+
+But the above behavior gets inconvenient at times because no other work
+queued in the flusher thread's queue gets processed. In particular, since
+e.g. sync(1) relies on the flusher thread to do all the IO for it, sync(1)
+can hang forever waiting for the flusher thread to do the work.
+
+Generally, when a flusher thread has some work queued, someone submitted
+the work to achieve a goal more specific than what background writeback
+does. Moreover by working on the specific work, we also reduce amount of
+dirty pages which is exactly the target of background writeout. So it
+makes sense to give specific work a priority over a generic page cleaning.
+
+Thus we interrupt background writeback if there is some other work to do.
+We return to the background writeback after completing all the queued
+work.
+
+This may delay the writeback of expired inodes for a while, however the
+expired inodes will eventually be flushed to disk as long as the other
+works won't livelock.
+
+[fengguang.wu@intel.com: update comment]
+Signed-off-by: Jan Kara <jack@suse.cz>
+Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
+Cc: Johannes Weiner <hannes@cmpxchg.org>
+Cc: Dave Chinner <david@fromorbit.com>
+Cc: Christoph Hellwig <hch@lst.de>
+Cc: Jan Engelhardt <jengelh@medozas.de>
+Cc: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ fs/fs-writeback.c | 10 ++++++++++
+ 1 file changed, 10 insertions(+)
+
+--- a/fs/fs-writeback.c
++++ b/fs/fs-writeback.c
+@@ -650,6 +650,16 @@ static long wb_writeback(struct bdi_writ
+ break;
+
+ /*
++ * Background writeout and kupdate-style writeback may
++ * run forever. Stop them if there is other work to do
++ * so that e.g. sync can proceed. They'll be restarted
++ * after the other works are all done.
++ */
++ if ((work->for_background || work->for_kupdate) &&
++ !list_empty(&wb->bdi->work_list))
++ break;
++
++ /*
+ * For background writeout, stop when we are below the
+ * background dirty threshold
+ */