+++ /dev/null
-From 203043f579ece44bb30291442cd56332651dd37d Mon Sep 17 00:00:00 2001
-From: Stanislaw Gruszka <sgruszka@redhat.com>
-Date: Tue, 25 Jan 2011 14:08:40 +0100
-Subject: ath9k: fix race conditions when stop device
-
-From: Stanislaw Gruszka <sgruszka@redhat.com>
-
-commit 203043f579ece44bb30291442cd56332651dd37d upstream.
-
-We do not kill any scheduled tasklets when stopping device, that may
-cause usage of resources after free. Moreover we enable interrupts
-in tasklet function, so we could potentially end with interrupts
-enabled when driver is not ready to receive them.
-
-I think patch should fix Ben's kernel crash from:
-http://marc.info/?l=linux-wireless&m=129438358921501&w=2
-
-Signed-off-by: Stanislaw Gruszka <sgruszka@redhat.com>
-Signed-off-by: John W. Linville <linville@tuxdriver.com>
-Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
-
----
- drivers/net/wireless/ath/ath9k/init.c | 5 -----
- drivers/net/wireless/ath/ath9k/main.c | 9 +++++++++
- 2 files changed, 9 insertions(+), 5 deletions(-)
-
---- a/drivers/net/wireless/ath/ath9k/init.c
-+++ b/drivers/net/wireless/ath/ath9k/init.c
-@@ -633,8 +633,6 @@ err_queues:
- err_debug:
- ath9k_hw_deinit(ah);
- err_hw:
-- tasklet_kill(&sc->intr_tq);
-- tasklet_kill(&sc->bcon_tasklet);
-
- kfree(ah);
- sc->sc_ah = NULL;
-@@ -802,9 +800,6 @@ static void ath9k_deinit_softc(struct at
- ath9k_exit_debug(sc->sc_ah);
- ath9k_hw_deinit(sc->sc_ah);
-
-- tasklet_kill(&sc->intr_tq);
-- tasklet_kill(&sc->bcon_tasklet);
--
- kfree(sc->sc_ah);
- sc->sc_ah = NULL;
- }
---- a/drivers/net/wireless/ath/ath9k/main.c
-+++ b/drivers/net/wireless/ath/ath9k/main.c
-@@ -1401,6 +1401,9 @@ static void ath9k_stop(struct ieee80211_
- ath9k_btcoex_timer_pause(sc);
- }
-
-+ /* prevent tasklets to enable interrupts once we disable them */
-+ ah->imask &= ~ATH9K_INT_GLOBAL;
-+
- /* make sure h/w will not generate any interrupt
- * before setting the invalid flag. */
- ath9k_hw_set_interrupts(ah, 0);
-@@ -1901,6 +1904,12 @@ static int ath9k_set_key(struct ieee8021
- ret = -EINVAL;
- }
-
-+ /* we can now sync irq and kill any running tasklets, since we already
-+ * disabled interrupts and not holding a spin lock */
-+ synchronize_irq(sc->irq);
-+ tasklet_kill(&sc->intr_tq);
-+ tasklet_kill(&sc->bcon_tasklet);
-+
- ath9k_ps_restore(sc);
- mutex_unlock(&sc->mutex);
-
--- /dev/null
+From 09e099d4bafea3b15be003d548bdf94b4b6e0e17 Mon Sep 17 00:00:00 2001
+From: Jerome Marchand <jmarchan@redhat.com>
+Date: Wed, 5 Jan 2011 16:57:38 +0100
+Subject: block: fix accounting bug on cross partition merges
+
+From: Jerome Marchand <jmarchan@redhat.com>
+
+commit 09e099d4bafea3b15be003d548bdf94b4b6e0e17 upstream.
+
+/proc/diskstats would display a strange output as follows.
+
+$ cat /proc/diskstats |grep sda
+ 8 0 sda 90524 7579 102154 20464 0 0 0 0 0 14096 20089
+ 8 1 sda1 19085 1352 21841 4209 0 0 0 0 4294967064 15689 4293424691
+ ~~~~~~~~~~
+ 8 2 sda2 71252 3624 74891 15950 0 0 0 0 232 23995 1562390
+ 8 3 sda3 54 487 2188 92 0 0 0 0 0 88 92
+ 8 4 sda4 4 0 8 0 0 0 0 0 0 0 0
+ 8 5 sda5 81 2027 2130 138 0 0 0 0 0 87 137
+
+The reason is the wrong way of accounting hd_struct->in_flight when a bio is
+merged into a request that belongs to a different partition by ELEVATOR_FRONT_MERGE.
+
+The detailed root cause is as follows.
+
+Assuming that there are two partitions, sda1 and sda2.
+
+1. A request for sda2 is in request_queue. Hence sda1's hd_struct->in_flight
+ is 0 and sda2's one is 1.
+
+ | hd_struct->in_flight
+ ---------------------------
+ sda1 | 0
+ sda2 | 1
+ ---------------------------
+
+2. A bio belonging to sda1 is issued and is merged into the request mentioned in
+   step 1 by ELEVATOR_BACK_MERGE. The first sector of the request is changed
+   from the sda2 region to the sda1 region. However, the two partitions'
+   hd_struct->in_flight counters are not changed.
+
+ | hd_struct->in_flight
+ ---------------------------
+ sda1 | 0
+ sda2 | 1
+ ---------------------------
+
+3. The request is finished and blk_account_io_done() is called. In this case,
+   sda2's hd_struct->in_flight, not sda1's, is decremented.
+
+ | hd_struct->in_flight
+ ---------------------------
+ sda1 | -1
+ sda2 | 1
+ ---------------------------
+
+The patch fixes the problem by caching the partition lookup
+inside the request structure, hence making sure that the increment
+and decrement will always happen on the same partition struct. This
+also speeds up IO with accounting enabled, since it cuts down on
+the number of lookups we have to do.
+
+Also add a refcount to struct hd_struct to keep the partition in
+memory as long as users exist. We use kref_test_and_get() to ensure
+we don't add a reference to a partition which is going away.
+
+Signed-off-by: Jerome Marchand <jmarchan@redhat.com>
+Signed-off-by: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
+Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ block/blk-core.c | 26 +++++++++++++++++++++-----
+ block/blk-merge.c | 3 ++-
+ block/genhd.c | 1 +
+ fs/partitions/check.c | 10 +++++++++-
+ include/linux/blkdev.h | 1 +
+ include/linux/genhd.h | 2 ++
+ 6 files changed, 36 insertions(+), 7 deletions(-)
+
+--- a/block/blk-core.c
++++ b/block/blk-core.c
+@@ -64,13 +64,27 @@ static void drive_stat_acct(struct reque
+ return;
+
+ cpu = part_stat_lock();
+- part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
+
+- if (!new_io)
++ if (!new_io) {
++ part = rq->part;
+ part_stat_inc(cpu, part, merges[rw]);
+- else {
++ } else {
++ part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
++ if (!kref_test_and_get(&part->ref)) {
++ /*
++ * The partition is already being removed,
++ * the request will be accounted on the disk only
++ *
++ * We take a reference on disk->part0 although that
++ * partition will never be deleted, so we can treat
++ * it as any other partition.
++ */
++ part = &rq->rq_disk->part0;
++ kref_get(&part->ref);
++ }
+ part_round_stats(cpu, part);
+ part_inc_in_flight(part, rw);
++ rq->part = part;
+ }
+
+ part_stat_unlock();
+@@ -128,6 +142,7 @@ void blk_rq_init(struct request_queue *q
+ rq->ref_count = 1;
+ rq->start_time = jiffies;
+ set_start_time_ns(rq);
++ rq->part = NULL;
+ }
+ EXPORT_SYMBOL(blk_rq_init);
+
+@@ -1776,7 +1791,7 @@ static void blk_account_io_completion(st
+ int cpu;
+
+ cpu = part_stat_lock();
+- part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req));
++ part = req->part;
+ part_stat_add(cpu, part, sectors[rw], bytes >> 9);
+ part_stat_unlock();
+ }
+@@ -1796,13 +1811,14 @@ static void blk_account_io_done(struct r
+ int cpu;
+
+ cpu = part_stat_lock();
+- part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req));
++ part = req->part;
+
+ part_stat_inc(cpu, part, ios[rw]);
+ part_stat_add(cpu, part, ticks[rw], duration);
+ part_round_stats(cpu, part);
+ part_dec_in_flight(part, rw);
+
++ kref_put(&part->ref, __delete_partition);
+ part_stat_unlock();
+ }
+ }
+--- a/block/blk-merge.c
++++ b/block/blk-merge.c
+@@ -351,11 +351,12 @@ static void blk_account_io_merge(struct
+ int cpu;
+
+ cpu = part_stat_lock();
+- part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req));
++ part = req->part;
+
+ part_round_stats(cpu, part);
+ part_dec_in_flight(part, rq_data_dir(req));
+
++ kref_put(&part->ref, __delete_partition);
+ part_stat_unlock();
+ }
+ }
+--- a/block/genhd.c
++++ b/block/genhd.c
+@@ -1192,6 +1192,7 @@ struct gendisk *alloc_disk_node(int mino
+ return NULL;
+ }
+ disk->part_tbl->part[0] = &disk->part0;
++ kref_init(&disk->part0.ref);
+
+ disk->minors = minors;
+ rand_initialize_disk(disk);
+--- a/fs/partitions/check.c
++++ b/fs/partitions/check.c
+@@ -372,6 +372,13 @@ static void delete_partition_rcu_cb(stru
+ put_device(part_to_dev(part));
+ }
+
++void __delete_partition(struct kref *ref)
++{
++ struct hd_struct *part = container_of(ref, struct hd_struct, ref);
++
++ call_rcu(&part->rcu_head, delete_partition_rcu_cb);
++}
++
+ void delete_partition(struct gendisk *disk, int partno)
+ {
+ struct disk_part_tbl *ptbl = disk->part_tbl;
+@@ -390,7 +397,7 @@ void delete_partition(struct gendisk *di
+ kobject_put(part->holder_dir);
+ device_del(part_to_dev(part));
+
+- call_rcu(&part->rcu_head, delete_partition_rcu_cb);
++ kref_put(&part->ref, __delete_partition);
+ }
+
+ static ssize_t whole_disk_show(struct device *dev,
+@@ -489,6 +496,7 @@ struct hd_struct *add_partition(struct g
+ if (!dev_get_uevent_suppress(ddev))
+ kobject_uevent(&pdev->kobj, KOBJ_ADD);
+
++ kref_init(&p->ref);
+ return p;
+
+ out_free_info:
+--- a/include/linux/blkdev.h
++++ b/include/linux/blkdev.h
+@@ -115,6 +115,7 @@ struct request {
+ void *elevator_private3;
+
+ struct gendisk *rq_disk;
++ struct hd_struct *part;
+ unsigned long start_time;
+ #ifdef CONFIG_BLK_CGROUP
+ unsigned long long start_time_ns;
+--- a/include/linux/genhd.h
++++ b/include/linux/genhd.h
+@@ -116,6 +116,7 @@ struct hd_struct {
+ struct disk_stats dkstats;
+ #endif
+ struct rcu_head rcu_head;
++ struct kref ref;
+ };
+
+ #define GENHD_FL_REMOVABLE 1
+@@ -583,6 +584,7 @@ extern struct hd_struct * __must_check a
+ sector_t len, int flags,
+ struct partition_meta_info
+ *info);
++extern void __delete_partition(struct kref *ref);
+ extern void delete_partition(struct gendisk *, int);
+ extern void printk_all_partitions(void);
+
--- /dev/null
+From 2d75af2f2a7a6103a6d539a492fe81deacabde44 Mon Sep 17 00:00:00 2001
+From: Jason Baron <jbaron@redhat.com>
+Date: Fri, 7 Jan 2011 13:36:58 -0500
+Subject: dynamic debug: Fix build issue with older gcc
+
+From: Jason Baron <jbaron@redhat.com>
+
+commit 2d75af2f2a7a6103a6d539a492fe81deacabde44 upstream.
+
+On older gcc (3.3) dynamic debug fails to compile:
+
+include/net/inet_connection_sock.h: In function `inet_csk_reset_xmit_timer':
+include/net/inet_connection_sock.h:236: error: duplicate label declaration `do_printk'
+include/net/inet_connection_sock.h:219: error: this is a previous declaration
+include/net/inet_connection_sock.h:236: error: duplicate label declaration `out'
+include/net/inet_connection_sock.h:219: error: this is a previous declaration
+include/net/inet_connection_sock.h:236: error: duplicate label `do_printk'
+include/net/inet_connection_sock.h:236: error: duplicate label `out'
+
+Fix this by reverting the usage of JUMP_LABEL() in dynamic debug for now.
+
+Reported-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
+Tested-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
+Signed-off-by: Jason Baron <jbaron@redhat.com>
+Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ include/linux/dynamic_debug.h | 18 ++++--------------
+ lib/dynamic_debug.c | 9 ++++-----
+ 2 files changed, 8 insertions(+), 19 deletions(-)
+
+--- a/include/linux/dynamic_debug.h
++++ b/include/linux/dynamic_debug.h
+@@ -44,34 +44,24 @@ int ddebug_add_module(struct _ddebug *ta
+ extern int ddebug_remove_module(const char *mod_name);
+
+ #define dynamic_pr_debug(fmt, ...) do { \
+- __label__ do_printk; \
+- __label__ out; \
+ static struct _ddebug descriptor \
+ __used \
+ __attribute__((section("__verbose"), aligned(8))) = \
+ { KBUILD_MODNAME, __func__, __FILE__, fmt, __LINE__, \
+ _DPRINTK_FLAGS_DEFAULT }; \
+- JUMP_LABEL(&descriptor.enabled, do_printk); \
+- goto out; \
+-do_printk: \
+- printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__); \
+-out: ; \
++ if (unlikely(descriptor.enabled)) \
++ printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__); \
+ } while (0)
+
+
+ #define dynamic_dev_dbg(dev, fmt, ...) do { \
+- __label__ do_printk; \
+- __label__ out; \
+ static struct _ddebug descriptor \
+ __used \
+ __attribute__((section("__verbose"), aligned(8))) = \
+ { KBUILD_MODNAME, __func__, __FILE__, fmt, __LINE__, \
+ _DPRINTK_FLAGS_DEFAULT }; \
+- JUMP_LABEL(&descriptor.enabled, do_printk); \
+- goto out; \
+-do_printk: \
+- dev_printk(KERN_DEBUG, dev, fmt, ##__VA_ARGS__); \
+-out: ; \
++ if (unlikely(descriptor.enabled)) \
++ dev_printk(KERN_DEBUG, dev, fmt, ##__VA_ARGS__); \
+ } while (0)
+
+ #else
+--- a/lib/dynamic_debug.c
++++ b/lib/dynamic_debug.c
+@@ -141,11 +141,10 @@ static void ddebug_change(const struct d
+ else if (!dp->flags)
+ dt->num_enabled++;
+ dp->flags = newflags;
+- if (newflags) {
+- jump_label_enable(&dp->enabled);
+- } else {
+- jump_label_disable(&dp->enabled);
+- }
++ if (newflags)
++ dp->enabled = 1;
++ else
++ dp->enabled = 0;
+ if (verbose)
+ printk(KERN_INFO
+ "ddebug: changed %s:%d [%s]%s %s\n",
--- /dev/null
+From d50bdd5aa55127635fd8a5c74bd2abb256bd34e3 Mon Sep 17 00:00:00 2001
+From: Curt Wohlgemuth <curtw@google.com>
+Date: Mon, 7 Feb 2011 12:46:14 -0500
+Subject: ext4: Fix data corruption with multi-block writepages support
+
+From: Curt Wohlgemuth <curtw@google.com>
+
+commit d50bdd5aa55127635fd8a5c74bd2abb256bd34e3 upstream.
+
+This fixes a corruption problem with the multi-block
+writepages submittal change for ext4, from commit
+bd2d0210cf22f2bd0cef72eb97cf94fc7d31d8cc ("ext4: use bio
+layer instead of buffer layer in mpage_da_submit_io").
+
+(Note that this corruption is not present in 2.6.37 on
+ext4, because the corruption was detected after the
+feature was merged in 2.6.37-rc1, and so it was turned
+off by adding a non-default mount option,
+mblk_io_submit. With this commit, which hopefully
+fixes the last of the bugs with this feature, we'll be
+able to turn on this performance feature by default in
+2.6.38, and remove the mblk_io_submit option.)
+
+The ext4 code path to bundle multiple pages for
+writeback in ext4_bio_write_page() had a bug: we should
+be clearing buffer head dirty flags *before* we submit
+the bio, not in the completion routine.
+
+The patch below was tested on 2.6.37 under KVM with the
+postgresql script which was submitted by Jon Nelson as
+documented in commit 1449032be1.
+
+Without the patch, I'd hit the corruption problem about
+50-70% of the time. With the patch, I executed the
+script > 100 times with no corruption seen.
+
+I also fixed a bug to make sure ext4_end_bio() doesn't
+dereference the bio after the bio_put() call.
+
+Reported-by: Jon Nelson <jnelson@jamponi.net>
+Reported-by: Matthias Bayer <jackdachef@gmail.com>
+Signed-off-by: Curt Wohlgemuth <curtw@google.com>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ fs/ext4/page-io.c | 11 ++++++-----
+ 1 file changed, 6 insertions(+), 5 deletions(-)
+
+--- a/fs/ext4/page-io.c
++++ b/fs/ext4/page-io.c
+@@ -193,6 +193,7 @@ static void ext4_end_bio(struct bio *bio
+ struct inode *inode;
+ unsigned long flags;
+ int i;
++ sector_t bi_sector = bio->bi_sector;
+
+ BUG_ON(!io_end);
+ bio->bi_private = NULL;
+@@ -210,9 +211,7 @@ static void ext4_end_bio(struct bio *bio
+ if (error)
+ SetPageError(page);
+ BUG_ON(!head);
+- if (head->b_size == PAGE_CACHE_SIZE)
+- clear_buffer_dirty(head);
+- else {
++ if (head->b_size != PAGE_CACHE_SIZE) {
+ loff_t offset;
+ loff_t io_end_offset = io_end->offset + io_end->size;
+
+@@ -224,7 +223,6 @@ static void ext4_end_bio(struct bio *bio
+ if (error)
+ buffer_io_error(bh);
+
+- clear_buffer_dirty(bh);
+ }
+ if (buffer_delay(bh))
+ partial_write = 1;
+@@ -260,7 +258,7 @@ static void ext4_end_bio(struct bio *bio
+ (unsigned long long) io_end->offset,
+ (long) io_end->size,
+ (unsigned long long)
+- bio->bi_sector >> (inode->i_blkbits - 9));
++ bi_sector >> (inode->i_blkbits - 9));
+ }
+
+ /* Add the io_end to per-inode completed io list*/
+@@ -383,6 +381,7 @@ int ext4_bio_write_page(struct ext4_io_s
+
+ blocksize = 1 << inode->i_blkbits;
+
++ BUG_ON(!PageLocked(page));
+ BUG_ON(PageWriteback(page));
+ set_page_writeback(page);
+ ClearPageError(page);
+@@ -400,12 +399,14 @@ int ext4_bio_write_page(struct ext4_io_s
+ for (bh = head = page_buffers(page), block_start = 0;
+ bh != head || !block_start;
+ block_start = block_end, bh = bh->b_this_page) {
++
+ block_end = block_start + blocksize;
+ if (block_start >= len) {
+ clear_buffer_dirty(bh);
+ set_buffer_uptodate(bh);
+ continue;
+ }
++ clear_buffer_dirty(bh);
+ ret = io_submit_add_bh(io, io_page, inode, wbc, bh);
+ if (ret) {
+ /*
--- /dev/null
+From 1c5b9e9065567876c2d4a7a16d78f0fed154a5bf Mon Sep 17 00:00:00 2001
+From: Theodore Ts'o <tytso@mit.edu>
+Date: Mon, 10 Jan 2011 12:51:28 -0500
+Subject: ext4: fix memory leak in ext4_free_branches
+
+From: Theodore Ts'o <tytso@mit.edu>
+
+commit 1c5b9e9065567876c2d4a7a16d78f0fed154a5bf upstream.
+
+Commit 40389687 moved a call to ext4_forget() out of
+ext4_free_branches and let ext4_free_blocks() handle calling
+bforget(). But that change unfortunately did not replace the call to
+ext4_forget() with brelse(), which was needed to drop the in-use count
+of the indirect block's buffer head, which led to a memory leak when
+deleting files that used indirect blocks. Fix this.
+
+Thanks to Hugh Dickins for pointing this out.
+
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ fs/ext4/inode.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/fs/ext4/inode.c
++++ b/fs/ext4/inode.c
+@@ -4349,6 +4349,7 @@ static void ext4_free_branches(handle_t
+ (__le32 *) bh->b_data,
+ (__le32 *) bh->b_data + addr_per_block,
+ depth);
++ brelse(bh);
+
+ /*
+ * Everything below this this pointer has been
--- /dev/null
+From 8f1f745331c1b560f53c0d60e55a4f4f43f7cce5 Mon Sep 17 00:00:00 2001
+From: Eric Sandeen <sandeen@redhat.com>
+Date: Thu, 3 Feb 2011 14:33:15 -0500
+Subject: ext4: fix panic on module unload when stopping lazyinit thread
+
+From: Eric Sandeen <sandeen@redhat.com>
+
+commit 8f1f745331c1b560f53c0d60e55a4f4f43f7cce5 upstream.
+
+https://bugzilla.kernel.org/show_bug.cgi?id=27652
+
+If the lazyinit thread is running, the teardown function
+ext4_destroy_lazyinit_thread() has problems:
+
+ ext4_clear_request_list();
+ while (ext4_li_info->li_task) {
+ wake_up(&ext4_li_info->li_wait_daemon);
+ wait_event(ext4_li_info->li_wait_task,
+ ext4_li_info->li_task == NULL);
+ }
+
+Clearing the request list will cause the thread to exit and free
+ext4_li_info, so then we're waiting on something which is getting
+freed.
+
+Fix this up by making the thread respond to kthread_stop, and exit,
+without the need to wait for that exit in some other homegrown way.
+
+Reported-and-Tested-by: Tao Ma <boyu.mt@taobao.com>
+Signed-off-by: Eric Sandeen <sandeen@redhat.com>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ fs/ext4/super.c | 27 ++++++++++++++-------------
+ 1 file changed, 14 insertions(+), 13 deletions(-)
+
+--- a/fs/ext4/super.c
++++ b/fs/ext4/super.c
+@@ -77,6 +77,7 @@ static struct dentry *ext4_mount(struct
+ const char *dev_name, void *data);
+ static void ext4_destroy_lazyinit_thread(void);
+ static void ext4_unregister_li_request(struct super_block *sb);
++static void ext4_clear_request_list(void);
+
+ #if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
+ static struct file_system_type ext3_fs_type = {
+@@ -2704,6 +2705,8 @@ static void ext4_unregister_li_request(s
+ mutex_unlock(&ext4_li_info->li_list_mtx);
+ }
+
++static struct task_struct *ext4_lazyinit_task;
++
+ /*
+ * This is the function where ext4lazyinit thread lives. It walks
+ * through the request list searching for next scheduled filesystem.
+@@ -2772,6 +2775,10 @@ cont_thread:
+ if (time_before(jiffies, next_wakeup))
+ schedule();
+ finish_wait(&eli->li_wait_daemon, &wait);
++ if (kthread_should_stop()) {
++ ext4_clear_request_list();
++ goto exit_thread;
++ }
+ }
+
+ exit_thread:
+@@ -2796,6 +2803,7 @@ exit_thread:
+ wake_up(&eli->li_wait_task);
+
+ kfree(ext4_li_info);
++ ext4_lazyinit_task = NULL;
+ ext4_li_info = NULL;
+ mutex_unlock(&ext4_li_mtx);
+
+@@ -2818,11 +2826,10 @@ static void ext4_clear_request_list(void
+
+ static int ext4_run_lazyinit_thread(void)
+ {
+- struct task_struct *t;
+-
+- t = kthread_run(ext4_lazyinit_thread, ext4_li_info, "ext4lazyinit");
+- if (IS_ERR(t)) {
+- int err = PTR_ERR(t);
++ ext4_lazyinit_task = kthread_run(ext4_lazyinit_thread,
++ ext4_li_info, "ext4lazyinit");
++ if (IS_ERR(ext4_lazyinit_task)) {
++ int err = PTR_ERR(ext4_lazyinit_task);
+ ext4_clear_request_list();
+ del_timer_sync(&ext4_li_info->li_timer);
+ kfree(ext4_li_info);
+@@ -2973,16 +2980,10 @@ static void ext4_destroy_lazyinit_thread
+ * If thread exited earlier
+ * there's nothing to be done.
+ */
+- if (!ext4_li_info)
++ if (!ext4_li_info || !ext4_lazyinit_task)
+ return;
+
+- ext4_clear_request_list();
+-
+- while (ext4_li_info->li_task) {
+- wake_up(&ext4_li_info->li_wait_daemon);
+- wait_event(ext4_li_info->li_wait_task,
+- ext4_li_info->li_task == NULL);
+- }
++ kthread_stop(ext4_lazyinit_task);
+ }
+
+ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
--- /dev/null
+From ca6e909f9bebe709bc65a3ee605ce32969db0452 Mon Sep 17 00:00:00 2001
+From: Jan Kara <jack@suse.cz>
+Date: Mon, 10 Jan 2011 12:30:39 -0500
+Subject: ext4: fix trimming of a single group
+
+From: Jan Kara <jack@suse.cz>
+
+commit ca6e909f9bebe709bc65a3ee605ce32969db0452 upstream.
+
+When ext4_trim_fs() is called to trim a part of a single group, the
+logic will wrongly set the last block of the interval to 'len' instead
+of 'first_block + len'. Thus a shorter interval is possibly trimmed.
+Fix it.
+
+CC: Lukas Czerner <lczerner@redhat.com>
+Signed-off-by: Jan Kara <jack@suse.cz>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ fs/ext4/mballoc.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/fs/ext4/mballoc.c
++++ b/fs/ext4/mballoc.c
+@@ -4851,7 +4851,7 @@ int ext4_trim_fs(struct super_block *sb,
+ if (len >= EXT4_BLOCKS_PER_GROUP(sb))
+ len -= (EXT4_BLOCKS_PER_GROUP(sb) - first_block);
+ else
+- last_block = len;
++ last_block = first_block + len;
+
+ if (e4b.bd_info->bb_free >= minlen) {
+ cnt = ext4_trim_all_free(sb, &e4b, first_block,
--- /dev/null
+From 6c5a6cb998854f3c579ecb2bc1423d302bcb1b76 Mon Sep 17 00:00:00 2001
+From: Andrew Morton <akpm@linux-foundation.org>
+Date: Mon, 10 Jan 2011 12:30:17 -0500
+Subject: ext4: fix uninitialized variable in ext4_register_li_request
+
+From: Andrew Morton <akpm@linux-foundation.org>
+
+commit 6c5a6cb998854f3c579ecb2bc1423d302bcb1b76 upstream.
+
+fs/ext4/super.c: In function 'ext4_register_li_request':
+fs/ext4/super.c:2936: warning: 'ret' may be used uninitialized in this function
+
+It looks buggy to me, too.
+
+Cc: Lukas Czerner <lczerner@redhat.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ fs/ext4/super.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/fs/ext4/super.c
++++ b/fs/ext4/super.c
+@@ -2916,7 +2916,7 @@ static int ext4_register_li_request(stru
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_li_request *elr;
+ ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
+- int ret;
++ int ret = 0;
+
+ if (sbi->s_li_request != NULL)
+ return 0;
--- /dev/null
+From 2892c15ddda6a76dc10b7499e56c0f3b892e5a69 Mon Sep 17 00:00:00 2001
+From: Eric Sandeen <sandeen@redhat.com>
+Date: Sat, 12 Feb 2011 08:12:18 -0500
+Subject: ext4: make grpinfo slab cache names static
+
+From: Eric Sandeen <sandeen@redhat.com>
+
+commit 2892c15ddda6a76dc10b7499e56c0f3b892e5a69 upstream.
+
+In 2.6.37 I was running into oopses with repeated module
+loads & unloads. I tracked this down to:
+
+fb1813f4 ext4: use dedicated slab caches for group_info structures
+
+(this was in addition to the features advert unload problem)
+
+The kstrdup & subsequent kfree of the cache name was causing
+a double free. In slub, at least, if I read it right it allocates
+& frees the name itself, slab seems to do something different...
+so in slub I think we were leaking -our- cachep->name, and double
+freeing the one allocated by slub.
+
+After getting lost in slab/slub/slob a bit, I just looked at other
+sized-caches that get allocated. jbd2, biovec, sgpool all do it
+more or less the way jbd2 does. Below patch follows the jbd2
+method of dynamically allocating a cache at mount time from
+a list of static names.
+
+(This might also possibly fix a race creating the caches with
+parallel mounts running).
+
+[Folded in a fix from Dan Carpenter which fixed an off-by-one error in
+the original patch]
+
+Signed-off-by: Eric Sandeen <sandeen@redhat.com>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ fs/ext4/mballoc.c | 100 ++++++++++++++++++++++++++++++++----------------------
+ 1 file changed, 60 insertions(+), 40 deletions(-)
+
+--- a/fs/ext4/mballoc.c
++++ b/fs/ext4/mballoc.c
+@@ -342,10 +342,15 @@ static struct kmem_cache *ext4_free_ext_
+ /* We create slab caches for groupinfo data structures based on the
+ * superblock block size. There will be one per mounted filesystem for
+ * each unique s_blocksize_bits */
+-#define NR_GRPINFO_CACHES \
+- (EXT4_MAX_BLOCK_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE + 1)
++#define NR_GRPINFO_CACHES 8
+ static struct kmem_cache *ext4_groupinfo_caches[NR_GRPINFO_CACHES];
+
++static const char *ext4_groupinfo_slab_names[NR_GRPINFO_CACHES] = {
++ "ext4_groupinfo_1k", "ext4_groupinfo_2k", "ext4_groupinfo_4k",
++ "ext4_groupinfo_8k", "ext4_groupinfo_16k", "ext4_groupinfo_32k",
++ "ext4_groupinfo_64k", "ext4_groupinfo_128k"
++};
++
+ static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
+ ext4_group_t group);
+ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
+@@ -2414,6 +2419,55 @@ err_freesgi:
+ return -ENOMEM;
+ }
+
++static void ext4_groupinfo_destroy_slabs(void)
++{
++ int i;
++
++ for (i = 0; i < NR_GRPINFO_CACHES; i++) {
++ if (ext4_groupinfo_caches[i])
++ kmem_cache_destroy(ext4_groupinfo_caches[i]);
++ ext4_groupinfo_caches[i] = NULL;
++ }
++}
++
++static int ext4_groupinfo_create_slab(size_t size)
++{
++ static DEFINE_MUTEX(ext4_grpinfo_slab_create_mutex);
++ int slab_size;
++ int blocksize_bits = order_base_2(size);
++ int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE;
++ struct kmem_cache *cachep;
++
++ if (cache_index >= NR_GRPINFO_CACHES)
++ return -EINVAL;
++
++ if (unlikely(cache_index < 0))
++ cache_index = 0;
++
++ mutex_lock(&ext4_grpinfo_slab_create_mutex);
++ if (ext4_groupinfo_caches[cache_index]) {
++ mutex_unlock(&ext4_grpinfo_slab_create_mutex);
++ return 0; /* Already created */
++ }
++
++ slab_size = offsetof(struct ext4_group_info,
++ bb_counters[blocksize_bits + 2]);
++
++ cachep = kmem_cache_create(ext4_groupinfo_slab_names[cache_index],
++ slab_size, 0, SLAB_RECLAIM_ACCOUNT,
++ NULL);
++
++ mutex_unlock(&ext4_grpinfo_slab_create_mutex);
++ if (!cachep) {
++ printk(KERN_EMERG "EXT4: no memory for groupinfo slab cache\n");
++ return -ENOMEM;
++ }
++
++ ext4_groupinfo_caches[cache_index] = cachep;
++
++ return 0;
++}
++
+ int ext4_mb_init(struct super_block *sb, int needs_recovery)
+ {
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+@@ -2421,9 +2475,6 @@ int ext4_mb_init(struct super_block *sb,
+ unsigned offset;
+ unsigned max;
+ int ret;
+- int cache_index;
+- struct kmem_cache *cachep;
+- char *namep = NULL;
+
+ i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_offsets);
+
+@@ -2440,30 +2491,9 @@ int ext4_mb_init(struct super_block *sb,
+ goto out;
+ }
+
+- cache_index = sb->s_blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE;
+- cachep = ext4_groupinfo_caches[cache_index];
+- if (!cachep) {
+- char name[32];
+- int len = offsetof(struct ext4_group_info,
+- bb_counters[sb->s_blocksize_bits + 2]);
+-
+- sprintf(name, "ext4_groupinfo_%d", sb->s_blocksize_bits);
+- namep = kstrdup(name, GFP_KERNEL);
+- if (!namep) {
+- ret = -ENOMEM;
+- goto out;
+- }
+-
+- /* Need to free the kmem_cache_name() when we
+- * destroy the slab */
+- cachep = kmem_cache_create(namep, len, 0,
+- SLAB_RECLAIM_ACCOUNT, NULL);
+- if (!cachep) {
+- ret = -ENOMEM;
+- goto out;
+- }
+- ext4_groupinfo_caches[cache_index] = cachep;
+- }
++ ret = ext4_groupinfo_create_slab(sb->s_blocksize);
++ if (ret < 0)
++ goto out;
+
+ /* order 0 is regular bitmap */
+ sbi->s_mb_maxs[0] = sb->s_blocksize << 3;
+@@ -2520,7 +2550,6 @@ out:
+ if (ret) {
+ kfree(sbi->s_mb_offsets);
+ kfree(sbi->s_mb_maxs);
+- kfree(namep);
+ }
+ return ret;
+ }
+@@ -2734,7 +2763,6 @@ int __init ext4_init_mballoc(void)
+
+ void ext4_exit_mballoc(void)
+ {
+- int i;
+ /*
+ * Wait for completion of call_rcu()'s on ext4_pspace_cachep
+ * before destroying the slab cache.
+@@ -2743,15 +2771,7 @@ void ext4_exit_mballoc(void)
+ kmem_cache_destroy(ext4_pspace_cachep);
+ kmem_cache_destroy(ext4_ac_cachep);
+ kmem_cache_destroy(ext4_free_ext_cachep);
+-
+- for (i = 0; i < NR_GRPINFO_CACHES; i++) {
+- struct kmem_cache *cachep = ext4_groupinfo_caches[i];
+- if (cachep) {
+- char *name = (char *)kmem_cache_name(cachep);
+- kmem_cache_destroy(cachep);
+- kfree(name);
+- }
+- }
++ ext4_groupinfo_destroy_slabs();
+ ext4_remove_debugfs_entry();
+ }
+
--- /dev/null
+From 8f021222c1e2756ea4c9dde93b23e1d2a0a4ec37 Mon Sep 17 00:00:00 2001
+From: Lukas Czerner <lczerner@redhat.com>
+Date: Thu, 3 Feb 2011 14:33:33 -0500
+Subject: ext4: unregister features interface on module unload
+
+From: Lukas Czerner <lczerner@redhat.com>
+
+commit 8f021222c1e2756ea4c9dde93b23e1d2a0a4ec37 upstream.
+
+The ext4 features interface was not properly unregistered, which led to
+problems while unloading/reloading the ext4 module. This commit fixes that by
+adding proper kobject unregistration code into ext4_exit_fs() as well as
+the fail path of ext4_init_fs().
+
+Reported-by: Eric Sandeen <sandeen@redhat.com>
+Signed-off-by: Lukas Czerner <lczerner@redhat.com>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ fs/ext4/super.c | 12 ++++++++++--
+ 1 file changed, 10 insertions(+), 2 deletions(-)
+
+--- a/fs/ext4/super.c
++++ b/fs/ext4/super.c
+@@ -4757,7 +4757,7 @@ static struct file_system_type ext4_fs_t
+ .fs_flags = FS_REQUIRES_DEV,
+ };
+
+-int __init ext4_init_feat_adverts(void)
++static int __init ext4_init_feat_adverts(void)
+ {
+ struct ext4_features *ef;
+ int ret = -ENOMEM;
+@@ -4781,6 +4781,13 @@ out:
+ return ret;
+ }
+
++static void ext4_exit_feat_adverts(void)
++{
++ kobject_put(&ext4_feat->f_kobj);
++ wait_for_completion(&ext4_feat->f_kobj_unregister);
++ kfree(ext4_feat);
++}
++
+ static int __init ext4_init_fs(void)
+ {
+ int err;
+@@ -4827,7 +4834,7 @@ out1:
+ out2:
+ ext4_exit_mballoc();
+ out3:
+- kfree(ext4_feat);
++ ext4_exit_feat_adverts();
+ remove_proc_entry("fs/ext4", NULL);
+ kset_unregister(ext4_kset);
+ out4:
+@@ -4846,6 +4853,7 @@ static void __exit ext4_exit_fs(void)
+ destroy_inodecache();
+ ext4_exit_xattr();
+ ext4_exit_mballoc();
++ ext4_exit_feat_adverts();
+ remove_proc_entry("fs/ext4", NULL);
+ kset_unregister(ext4_kset);
+ ext4_exit_system_zone();
--- /dev/null
+From f1a06390d013244e721372b3f9b66e39b6429c71 Mon Sep 17 00:00:00 2001
+From: Thomas Gleixner <tglx@linutronix.de>
+Date: Fri, 28 Jan 2011 08:47:15 +0100
+Subject: genirq: Prevent irq storm on migration
+
+From: Thomas Gleixner <tglx@linutronix.de>
+
+commit f1a06390d013244e721372b3f9b66e39b6429c71 upstream.
+
+move_native_irq() masks and unmasks the interrupt line
+unconditionally, but the interrupt line might be masked due to a
+threaded oneshot handler in progress. Unmasking the line in that case
+can lead to interrupt storms. Observed on PREEMPT_RT.
+
+Originally-from: Ingo Molnar <mingo@elte.hu>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ kernel/irq/migration.c | 14 +++++++++++---
+ 1 file changed, 11 insertions(+), 3 deletions(-)
+
+--- a/kernel/irq/migration.c
++++ b/kernel/irq/migration.c
+@@ -56,6 +56,7 @@ void move_masked_irq(int irq)
+ void move_native_irq(int irq)
+ {
+ struct irq_desc *desc = irq_to_desc(irq);
++ bool masked;
+
+ if (likely(!(desc->status & IRQ_MOVE_PENDING)))
+ return;
+@@ -63,8 +64,15 @@ void move_native_irq(int irq)
+ if (unlikely(desc->status & IRQ_DISABLED))
+ return;
+
+- desc->irq_data.chip->irq_mask(&desc->irq_data);
++ /*
++ * Be careful vs. already masked interrupts. If this is a
++ * threaded interrupt with ONESHOT set, we can end up with an
++ * interrupt storm.
++ */
++ masked = desc->status & IRQ_MASKED;
++ if (!masked)
++ desc->irq_data.chip->irq_mask(&desc->irq_data);
+ move_masked_irq(irq);
+- desc->irq_data.chip->irq_unmask(&desc->irq_data);
++ if (!masked)
++ desc->irq_data.chip->irq_unmask(&desc->irq_data);
+ }
+-
--- /dev/null
+From e4a683c899cd5a49f8d684a054c95bd115a0c005 Mon Sep 17 00:00:00 2001
+From: Jerome Marchand <jmarchan@redhat.com>
+Date: Wed, 5 Jan 2011 16:57:37 +0100
+Subject: kref: add kref_test_and_get
+
+From: Jerome Marchand <jmarchan@redhat.com>
+
+commit e4a683c899cd5a49f8d684a054c95bd115a0c005 upstream.
+
+Add a kref_test_and_get() function, which atomically adds a reference only if
+the refcount is not zero. This prevents adding a reference to an object that is
+already being removed.
+
+Signed-off-by: Jerome Marchand <jmarchan@redhat.com>
+Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ include/linux/kref.h | 1 +
+ lib/kref.c | 12 ++++++++++++
+ 2 files changed, 13 insertions(+)
+
+--- a/include/linux/kref.h
++++ b/include/linux/kref.h
+@@ -23,6 +23,7 @@ struct kref {
+
+ void kref_init(struct kref *kref);
+ void kref_get(struct kref *kref);
++int kref_test_and_get(struct kref *kref);
+ int kref_put(struct kref *kref, void (*release) (struct kref *kref));
+
+ #endif /* _KREF_H_ */
+--- a/lib/kref.c
++++ b/lib/kref.c
+@@ -37,6 +37,18 @@ void kref_get(struct kref *kref)
+ }
+
+ /**
++ * kref_test_and_get - increment refcount for object only if refcount is not
++ * zero.
++ * @kref: object.
++ *
++ * Return non-zero if the refcount was incremented, 0 otherwise
++ */
++int kref_test_and_get(struct kref *kref)
++{
++ return atomic_inc_not_zero(&kref->refcount);
++}
++
++/**
+ * kref_put - decrement refcount for object.
+ * @kref: object.
+ * @release: pointer to the function that will clean up the object when the
--- /dev/null
+From fd4a4663db293bfd5dc20fb4113977f62895e550 Mon Sep 17 00:00:00 2001
+From: Hugh Dickins <hughd@google.com>
+Date: Thu, 13 Jan 2011 15:47:31 -0800
+Subject: mm: fix hugepage migration
+
+From: Hugh Dickins <hughd@google.com>
+
+commit fd4a4663db293bfd5dc20fb4113977f62895e550 upstream.
+
+2.6.37 added an unmap_and_move_huge_page() for memory failure recovery,
+but its anon_vma handling was still based around the 2.6.35 conventions.
+Update it to use page_lock_anon_vma, get_anon_vma, page_unlock_anon_vma,
+drop_anon_vma in the same way as we're now changing unmap_and_move().
+
+I don't particularly like to propose this for stable when I've not seen
+its problems in practice nor tested the solution: but it's clearly out of
+synch at present.
+
+Signed-off-by: Hugh Dickins <hughd@google.com>
+Cc: Mel Gorman <mel@csn.ul.ie>
+Cc: Rik van Riel <riel@redhat.com>
+Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
+Cc: "Jun'ichi Nomura" <j-nomura@ce.jp.nec.com>
+Cc: Andi Kleen <ak@linux.intel.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ mm/migrate.c | 23 ++++++-----------------
+ 1 file changed, 6 insertions(+), 17 deletions(-)
+
+--- a/mm/migrate.c
++++ b/mm/migrate.c
+@@ -805,7 +805,6 @@ static int unmap_and_move_huge_page(new_
+ int rc = 0;
+ int *result = NULL;
+ struct page *new_hpage = get_new_page(hpage, private, &result);
+- int rcu_locked = 0;
+ struct anon_vma *anon_vma = NULL;
+
+ if (!new_hpage)
+@@ -820,12 +819,10 @@ static int unmap_and_move_huge_page(new_
+ }
+
+ if (PageAnon(hpage)) {
+- rcu_read_lock();
+- rcu_locked = 1;
+-
+- if (page_mapped(hpage)) {
+- anon_vma = page_anon_vma(hpage);
+- atomic_inc(&anon_vma->external_refcount);
++ anon_vma = page_lock_anon_vma(hpage);
++ if (anon_vma) {
++ get_anon_vma(anon_vma);
++ page_unlock_anon_vma(anon_vma);
+ }
+ }
+
+@@ -837,16 +834,8 @@ static int unmap_and_move_huge_page(new_
+ if (rc)
+ remove_migration_ptes(hpage, hpage);
+
+- if (anon_vma && atomic_dec_and_lock(&anon_vma->external_refcount,
+- &anon_vma->lock)) {
+- int empty = list_empty(&anon_vma->head);
+- spin_unlock(&anon_vma->lock);
+- if (empty)
+- anon_vma_free(anon_vma);
+- }
+-
+- if (rcu_locked)
+- rcu_read_unlock();
++ if (anon_vma)
++ drop_anon_vma(anon_vma);
+ out:
+ unlock_page(hpage);
+
--- /dev/null
+From 1ce82b69e96c838d007f316b8347b911fdfa9842 Mon Sep 17 00:00:00 2001
+From: Hugh Dickins <hughd@google.com>
+Date: Thu, 13 Jan 2011 15:47:30 -0800
+Subject: mm: fix migration hangs on anon_vma lock
+
+From: Hugh Dickins <hughd@google.com>
+
+commit 1ce82b69e96c838d007f316b8347b911fdfa9842 upstream.
+
+Increased usage of page migration in mmotm reveals that the anon_vma
+locking in unmap_and_move() has been deficient since 2.6.36 (or even
+earlier). Review at the time of f18194275c39835cb84563500995e0d503a32d9a
+("mm: fix hang on anon_vma->root->lock") missed the issue here: the
+anon_vma to which we get a reference may already have been freed back to
+its slab (it is in use when we check page_mapped, but that can change),
+and so its anon_vma->root may be switched at any moment by reuse in
+anon_vma_prepare.
+
+Perhaps we could fix that with a get_anon_vma_unless_zero(), but let's
+not: just rely on page_lock_anon_vma() to do all the hard thinking for us,
+then we don't need any rcu read locking over here.
+
+In removing the rcu_unlock label: since PageAnon is a bit in
+page->mapping, it's impossible for a !page->mapping page to be anon; but
+insert VM_BUG_ON in case the implementation ever changes.
+
+[akpm@linux-foundation.org: coding-style fixes]
+Signed-off-by: Hugh Dickins <hughd@google.com>
+Reviewed-by: Mel Gorman <mel@csn.ul.ie>
+Reviewed-by: Rik van Riel <riel@redhat.com>
+Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
+Cc: "Jun'ichi Nomura" <j-nomura@ce.jp.nec.com>
+Cc: Andi Kleen <ak@linux.intel.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ mm/migrate.c | 48 +++++++++++++++++++-----------------------------
+ 1 file changed, 19 insertions(+), 29 deletions(-)
+
+--- a/mm/migrate.c
++++ b/mm/migrate.c
+@@ -620,7 +620,6 @@ static int unmap_and_move(new_page_t get
+ int *result = NULL;
+ struct page *newpage = get_new_page(page, private, &result);
+ int remap_swapcache = 1;
+- int rcu_locked = 0;
+ int charge = 0;
+ struct mem_cgroup *mem = NULL;
+ struct anon_vma *anon_vma = NULL;
+@@ -672,20 +671,26 @@ static int unmap_and_move(new_page_t get
+ /*
+ * By try_to_unmap(), page->mapcount goes down to 0 here. In this case,
+ * we cannot notice that anon_vma is freed while we migrates a page.
+- * This rcu_read_lock() delays freeing anon_vma pointer until the end
++ * This get_anon_vma() delays freeing anon_vma pointer until the end
+ * of migration. File cache pages are no problem because of page_lock()
+ * File Caches may use write_page() or lock_page() in migration, then,
+ * just care Anon page here.
+ */
+ if (PageAnon(page)) {
+- rcu_read_lock();
+- rcu_locked = 1;
+-
+- /* Determine how to safely use anon_vma */
+- if (!page_mapped(page)) {
+- if (!PageSwapCache(page))
+- goto rcu_unlock;
+-
++ /*
++ * Only page_lock_anon_vma() understands the subtleties of
++ * getting a hold on an anon_vma from outside one of its mms.
++ */
++ anon_vma = page_lock_anon_vma(page);
++ if (anon_vma) {
++ /*
++ * Take a reference count on the anon_vma if the
++ * page is mapped so that it is guaranteed to
++ * exist when the page is remapped later
++ */
++ get_anon_vma(anon_vma);
++ page_unlock_anon_vma(anon_vma);
++ } else if (PageSwapCache(page)) {
+ /*
+ * We cannot be sure that the anon_vma of an unmapped
+ * swapcache page is safe to use because we don't
+@@ -700,13 +705,7 @@ static int unmap_and_move(new_page_t get
+ */
+ remap_swapcache = 0;
+ } else {
+- /*
+- * Take a reference count on the anon_vma if the
+- * page is mapped so that it is guaranteed to
+- * exist when the page is remapped later
+- */
+- anon_vma = page_anon_vma(page);
+- get_anon_vma(anon_vma);
++ goto uncharge;
+ }
+ }
+
+@@ -723,16 +722,10 @@ static int unmap_and_move(new_page_t get
+ * free the metadata, so the page can be freed.
+ */
+ if (!page->mapping) {
+- if (!PageAnon(page) && page_has_private(page)) {
+- /*
+- * Go direct to try_to_free_buffers() here because
+- * a) that's what try_to_release_page() would do anyway
+- * b) we may be under rcu_read_lock() here, so we can't
+- * use GFP_KERNEL which is what try_to_release_page()
+- * needs to be effective.
+- */
++ VM_BUG_ON(PageAnon(page));
++ if (page_has_private(page)) {
+ try_to_free_buffers(page);
+- goto rcu_unlock;
++ goto uncharge;
+ }
+ goto skip_unmap;
+ }
+@@ -746,14 +739,11 @@ skip_unmap:
+
+ if (rc && remap_swapcache)
+ remove_migration_ptes(page, page);
+-rcu_unlock:
+
+ /* Drop an anon_vma reference if we took one */
+ if (anon_vma)
+ drop_anon_vma(anon_vma);
+
+- if (rcu_locked)
+- rcu_read_unlock();
+ uncharge:
+ if (!charge)
+ mem_cgroup_end_migration(mem, page, newpage);
--- /dev/null
+From 29c1f677d424e8c5683a837fc4f03fc9f19201d7 Mon Sep 17 00:00:00 2001
+From: Mel Gorman <mel@csn.ul.ie>
+Date: Thu, 13 Jan 2011 15:47:21 -0800
+Subject: mm: migration: use rcu_dereference_protected when dereferencing the radix tree slot during file page migration
+
+From: Mel Gorman <mel@csn.ul.ie>
+
+commit 29c1f677d424e8c5683a837fc4f03fc9f19201d7 upstream.
+
+migrate_pages() -> unmap_and_move() only calls rcu_read_lock() for
+anonymous pages, as introduced by git commit
+989f89c57e6361e7d16fbd9572b5da7d313b073d ("fix rcu_read_lock() in page
+migraton"). The point of the RCU protection there is part of getting a
+stable reference to anon_vma and is only held for anon pages as file pages
+are locked which is sufficient protection against freeing.
+
+However, while a file page's mapping is being migrated, the radix tree is
+double checked to ensure it is the expected page. This uses
+radix_tree_deref_slot() -> rcu_dereference() without the RCU lock held,
+triggering the following warning.
+
+[ 173.674290] ===================================================
+[ 173.676016] [ INFO: suspicious rcu_dereference_check() usage. ]
+[ 173.676016] ---------------------------------------------------
+[ 173.676016] include/linux/radix-tree.h:145 invoked rcu_dereference_check() without protection!
+[ 173.676016]
+[ 173.676016] other info that might help us debug this:
+[ 173.676016]
+[ 173.676016]
+[ 173.676016] rcu_scheduler_active = 1, debug_locks = 0
+[ 173.676016] 1 lock held by hugeadm/2899:
+[ 173.676016] #0: (&(&inode->i_data.tree_lock)->rlock){..-.-.}, at: [<c10e3d2b>] migrate_page_move_mapping+0x40/0x1ab
+[ 173.676016]
+[ 173.676016] stack backtrace:
+[ 173.676016] Pid: 2899, comm: hugeadm Not tainted 2.6.37-rc5-autobuild
+[ 173.676016] Call Trace:
+[ 173.676016] [<c128cc01>] ? printk+0x14/0x1b
+[ 173.676016] [<c1063502>] lockdep_rcu_dereference+0x7d/0x86
+[ 173.676016] [<c10e3db5>] migrate_page_move_mapping+0xca/0x1ab
+[ 173.676016] [<c10e41ad>] migrate_page+0x23/0x39
+[ 173.676016] [<c10e491b>] buffer_migrate_page+0x22/0x107
+[ 173.676016] [<c10e48f9>] ? buffer_migrate_page+0x0/0x107
+[ 173.676016] [<c10e425d>] move_to_new_page+0x9a/0x1ae
+[ 173.676016] [<c10e47e6>] migrate_pages+0x1e7/0x2fa
+
+This patch introduces radix_tree_deref_slot_protected() which calls
+rcu_dereference_protected(). Users of it must pass in the
+mapping->tree_lock that is protecting this dereference. Holding the tree
+lock protects against parallel updaters of the radix tree meaning that
+rcu_dereference_protected is allowable.
+
+[akpm@linux-foundation.org: remove unneeded casts]
+Signed-off-by: Mel Gorman <mel@csn.ul.ie>
+Cc: Minchan Kim <minchan.kim@gmail.com>
+Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
+Cc: Milton Miller <miltonm@bga.com>
+Cc: Nick Piggin <nickpiggin@yahoo.com.au>
+Cc: Wu Fengguang <fengguang.wu@intel.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ include/linux/radix-tree.h | 16 ++++++++++++++++
+ mm/migrate.c | 4 ++--
+ 2 files changed, 18 insertions(+), 2 deletions(-)
+
+--- a/include/linux/radix-tree.h
++++ b/include/linux/radix-tree.h
+@@ -146,6 +146,22 @@ static inline void *radix_tree_deref_slo
+ }
+
+ /**
++ * radix_tree_deref_slot_protected - dereference a slot without RCU lock but with tree lock held
++ * @pslot: pointer to slot, returned by radix_tree_lookup_slot
++ * Returns: item that was stored in that slot with any direct pointer flag
++ * removed.
++ *
++ * Similar to radix_tree_deref_slot but only used during migration when a pages
++ * mapping is being moved. The caller does not hold the RCU read lock but it
++ * must hold the tree lock to prevent parallel updates.
++ */
++static inline void *radix_tree_deref_slot_protected(void **pslot,
++ spinlock_t *treelock)
++{
++ return rcu_dereference_protected(*pslot, lockdep_is_held(treelock));
++}
++
++/**
+ * radix_tree_deref_retry - check radix_tree_deref_slot
+ * @arg: pointer returned by radix_tree_deref_slot
+ * Returns: 0 if retry is not required, otherwise retry is required
+--- a/mm/migrate.c
++++ b/mm/migrate.c
+@@ -246,7 +246,7 @@ static int migrate_page_move_mapping(str
+
+ expected_count = 2 + page_has_private(page);
+ if (page_count(page) != expected_count ||
+- (struct page *)radix_tree_deref_slot(pslot) != page) {
++ radix_tree_deref_slot_protected(pslot, &mapping->tree_lock) != page) {
+ spin_unlock_irq(&mapping->tree_lock);
+ return -EAGAIN;
+ }
+@@ -318,7 +318,7 @@ int migrate_huge_page_move_mapping(struc
+
+ expected_count = 2 + page_has_private(page);
+ if (page_count(page) != expected_count ||
+- (struct page *)radix_tree_deref_slot(pslot) != page) {
++ radix_tree_deref_slot_protected(pslot, &mapping->tree_lock) != page) {
+ spin_unlock_irq(&mapping->tree_lock);
+ return -EAGAIN;
+ }
--- /dev/null
+From 6650239a4b01077e80d5a4468562756d77afaa59 Mon Sep 17 00:00:00 2001
+From: Trond Myklebust <Trond.Myklebust@netapp.com>
+Date: Sat, 8 Jan 2011 17:45:38 -0500
+Subject: NFS: Don't use vm_map_ram() in readdir
+
+From: Trond Myklebust <Trond.Myklebust@netapp.com>
+
+commit 6650239a4b01077e80d5a4468562756d77afaa59 upstream.
+
+vm_map_ram() is not available on NOMMU platforms, and causes trouble
+on incoherent architectures such as ARM when we access the page data
+through both the direct and the virtual mapping.
+
+The alternative is to use the direct mapping to access page data
+for the case when we are not crossing a page boundary, but to copy
+the data into a linear scratch buffer when we are accessing data
+that spans page boundaries.
+
+Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
+Tested-by: Marc Kleine-Budde <mkl@pengutronix.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ fs/nfs/dir.c | 44 ++++++------
+ fs/nfs/nfs2xdr.c | 6 -
+ fs/nfs/nfs3xdr.c | 6 -
+ fs/nfs/nfs4xdr.c | 6 -
+ include/linux/sunrpc/xdr.h | 4 -
+ net/sunrpc/xdr.c | 155 ++++++++++++++++++++++++++++++++++++---------
+ 6 files changed, 148 insertions(+), 73 deletions(-)
+
+--- a/fs/nfs/dir.c
++++ b/fs/nfs/dir.c
+@@ -33,7 +33,6 @@
+ #include <linux/namei.h>
+ #include <linux/mount.h>
+ #include <linux/sched.h>
+-#include <linux/vmalloc.h>
+ #include <linux/kmemleak.h>
+
+ #include "delegation.h"
+@@ -459,25 +458,26 @@ out:
+ /* Perform conversion from xdr to cache array */
+ static
+ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *entry,
+- void *xdr_page, struct page *page, unsigned int buflen)
++ struct page **xdr_pages, struct page *page, unsigned int buflen)
+ {
+ struct xdr_stream stream;
+- struct xdr_buf buf;
+- __be32 *ptr = xdr_page;
++ struct xdr_buf buf = {
++ .pages = xdr_pages,
++ .page_len = buflen,
++ .buflen = buflen,
++ .len = buflen,
++ };
++ struct page *scratch;
+ struct nfs_cache_array *array;
+ unsigned int count = 0;
+ int status;
+
+- buf.head->iov_base = xdr_page;
+- buf.head->iov_len = buflen;
+- buf.tail->iov_len = 0;
+- buf.page_base = 0;
+- buf.page_len = 0;
+- buf.buflen = buf.head->iov_len;
+- buf.len = buf.head->iov_len;
+-
+- xdr_init_decode(&stream, &buf, ptr);
++ scratch = alloc_page(GFP_KERNEL);
++ if (scratch == NULL)
++ return -ENOMEM;
+
++ xdr_init_decode(&stream, &buf, NULL);
++ xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
+
+ do {
+ status = xdr_decode(desc, entry, &stream);
+@@ -506,6 +506,8 @@ int nfs_readdir_page_filler(nfs_readdir_
+ } else
+ status = PTR_ERR(array);
+ }
++
++ put_page(scratch);
+ return status;
+ }
+
+@@ -521,7 +523,6 @@ static
+ void nfs_readdir_free_large_page(void *ptr, struct page **pages,
+ unsigned int npages)
+ {
+- vm_unmap_ram(ptr, npages);
+ nfs_readdir_free_pagearray(pages, npages);
+ }
+
+@@ -530,9 +531,8 @@ void nfs_readdir_free_large_page(void *p
+ * to nfs_readdir_free_large_page
+ */
+ static
+-void *nfs_readdir_large_page(struct page **pages, unsigned int npages)
++int nfs_readdir_large_page(struct page **pages, unsigned int npages)
+ {
+- void *ptr;
+ unsigned int i;
+
+ for (i = 0; i < npages; i++) {
+@@ -541,13 +541,11 @@ void *nfs_readdir_large_page(struct page
+ goto out_freepages;
+ pages[i] = page;
+ }
++ return 0;
+
+- ptr = vm_map_ram(pages, npages, 0, PAGE_KERNEL);
+- if (!IS_ERR_OR_NULL(ptr))
+- return ptr;
+ out_freepages:
+ nfs_readdir_free_pagearray(pages, i);
+- return NULL;
++ return -ENOMEM;
+ }
+
+ static
+@@ -577,8 +575,8 @@ int nfs_readdir_xdr_to_array(nfs_readdir
+ memset(array, 0, sizeof(struct nfs_cache_array));
+ array->eof_index = -1;
+
+- pages_ptr = nfs_readdir_large_page(pages, array_size);
+- if (!pages_ptr)
++ status = nfs_readdir_large_page(pages, array_size);
++ if (status < 0)
+ goto out_release_array;
+ do {
+ unsigned int pglen;
+@@ -587,7 +585,7 @@ int nfs_readdir_xdr_to_array(nfs_readdir
+ if (status < 0)
+ break;
+ pglen = status;
+- status = nfs_readdir_page_filler(desc, &entry, pages_ptr, page, pglen);
++ status = nfs_readdir_page_filler(desc, &entry, pages, page, pglen);
+ if (status < 0) {
+ if (status == -ENOSPC)
+ status = 0;
+--- a/fs/nfs/nfs2xdr.c
++++ b/fs/nfs/nfs2xdr.c
+@@ -487,12 +487,6 @@ nfs_decode_dirent(struct xdr_stream *xdr
+
+ entry->d_type = DT_UNKNOWN;
+
+- p = xdr_inline_peek(xdr, 8);
+- if (p != NULL)
+- entry->eof = !p[0] && p[1];
+- else
+- entry->eof = 0;
+-
+ return p;
+
+ out_overflow:
+--- a/fs/nfs/nfs3xdr.c
++++ b/fs/nfs/nfs3xdr.c
+@@ -647,12 +647,6 @@ nfs3_decode_dirent(struct xdr_stream *xd
+ memset((u8*)(entry->fh), 0, sizeof(*entry->fh));
+ }
+
+- p = xdr_inline_peek(xdr, 8);
+- if (p != NULL)
+- entry->eof = !p[0] && p[1];
+- else
+- entry->eof = 0;
+-
+ return p;
+
+ out_overflow:
+--- a/fs/nfs/nfs4xdr.c
++++ b/fs/nfs/nfs4xdr.c
+@@ -6215,12 +6215,6 @@ __be32 *nfs4_decode_dirent(struct xdr_st
+ if (verify_attr_len(xdr, p, len) < 0)
+ goto out_overflow;
+
+- p = xdr_inline_peek(xdr, 8);
+- if (p != NULL)
+- entry->eof = !p[0] && p[1];
+- else
+- entry->eof = 0;
+-
+ return p;
+
+ out_overflow:
+--- a/include/linux/sunrpc/xdr.h
++++ b/include/linux/sunrpc/xdr.h
+@@ -201,6 +201,8 @@ struct xdr_stream {
+
+ __be32 *end; /* end of available buffer space */
+ struct kvec *iov; /* pointer to the current kvec */
++ struct kvec scratch; /* Scratch buffer */
++ struct page **page_ptr; /* pointer to the current page */
+ };
+
+ extern void xdr_init_encode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p);
+@@ -208,7 +210,7 @@ extern __be32 *xdr_reserve_space(struct
+ extern void xdr_write_pages(struct xdr_stream *xdr, struct page **pages,
+ unsigned int base, unsigned int len);
+ extern void xdr_init_decode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p);
+-extern __be32 *xdr_inline_peek(struct xdr_stream *xdr, size_t nbytes);
++extern void xdr_set_scratch_buffer(struct xdr_stream *xdr, void *buf, size_t buflen);
+ extern __be32 *xdr_inline_decode(struct xdr_stream *xdr, size_t nbytes);
+ extern void xdr_read_pages(struct xdr_stream *xdr, unsigned int len);
+ extern void xdr_enter_page(struct xdr_stream *xdr, unsigned int len);
+--- a/net/sunrpc/xdr.c
++++ b/net/sunrpc/xdr.c
+@@ -552,6 +552,74 @@ void xdr_write_pages(struct xdr_stream *
+ }
+ EXPORT_SYMBOL_GPL(xdr_write_pages);
+
++static void xdr_set_iov(struct xdr_stream *xdr, struct kvec *iov,
++ __be32 *p, unsigned int len)
++{
++ if (len > iov->iov_len)
++ len = iov->iov_len;
++ if (p == NULL)
++ p = (__be32*)iov->iov_base;
++ xdr->p = p;
++ xdr->end = (__be32*)(iov->iov_base + len);
++ xdr->iov = iov;
++ xdr->page_ptr = NULL;
++}
++
++static int xdr_set_page_base(struct xdr_stream *xdr,
++ unsigned int base, unsigned int len)
++{
++ unsigned int pgnr;
++ unsigned int maxlen;
++ unsigned int pgoff;
++ unsigned int pgend;
++ void *kaddr;
++
++ maxlen = xdr->buf->page_len;
++ if (base >= maxlen)
++ return -EINVAL;
++ maxlen -= base;
++ if (len > maxlen)
++ len = maxlen;
++
++ base += xdr->buf->page_base;
++
++ pgnr = base >> PAGE_SHIFT;
++ xdr->page_ptr = &xdr->buf->pages[pgnr];
++ kaddr = page_address(*xdr->page_ptr);
++
++ pgoff = base & ~PAGE_MASK;
++ xdr->p = (__be32*)(kaddr + pgoff);
++
++ pgend = pgoff + len;
++ if (pgend > PAGE_SIZE)
++ pgend = PAGE_SIZE;
++ xdr->end = (__be32*)(kaddr + pgend);
++ xdr->iov = NULL;
++ return 0;
++}
++
++static void xdr_set_next_page(struct xdr_stream *xdr)
++{
++ unsigned int newbase;
++
++ newbase = (1 + xdr->page_ptr - xdr->buf->pages) << PAGE_SHIFT;
++ newbase -= xdr->buf->page_base;
++
++ if (xdr_set_page_base(xdr, newbase, PAGE_SIZE) < 0)
++ xdr_set_iov(xdr, xdr->buf->tail, NULL, xdr->buf->len);
++}
++
++static bool xdr_set_next_buffer(struct xdr_stream *xdr)
++{
++ if (xdr->page_ptr != NULL)
++ xdr_set_next_page(xdr);
++ else if (xdr->iov == xdr->buf->head) {
++ if (xdr_set_page_base(xdr, 0, PAGE_SIZE) < 0)
++ xdr_set_iov(xdr, xdr->buf->tail, NULL, xdr->buf->len);
++ }
++ return xdr->p != xdr->end;
++}
++
+ /**
+ * xdr_init_decode - Initialize an xdr_stream for decoding data.
+ * @xdr: pointer to xdr_stream struct
+@@ -560,41 +628,67 @@ EXPORT_SYMBOL_GPL(xdr_write_pages);
+ */
+ void xdr_init_decode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p)
+ {
+- struct kvec *iov = buf->head;
+- unsigned int len = iov->iov_len;
+-
+- if (len > buf->len)
+- len = buf->len;
+ xdr->buf = buf;
+- xdr->iov = iov;
+- xdr->p = p;
+- xdr->end = (__be32 *)((char *)iov->iov_base + len);
++ xdr->scratch.iov_base = NULL;
++ xdr->scratch.iov_len = 0;
++ if (buf->head[0].iov_len != 0)
++ xdr_set_iov(xdr, buf->head, p, buf->len);
++ else if (buf->page_len != 0)
++ xdr_set_page_base(xdr, 0, buf->len);
+ }
+ EXPORT_SYMBOL_GPL(xdr_init_decode);
+
+-/**
+- * xdr_inline_peek - Allow read-ahead in the XDR data stream
+- * @xdr: pointer to xdr_stream struct
+- * @nbytes: number of bytes of data to decode
+- *
+- * Check if the input buffer is long enough to enable us to decode
+- * 'nbytes' more bytes of data starting at the current position.
+- * If so return the current pointer without updating the current
+- * pointer position.
+- */
+-__be32 * xdr_inline_peek(struct xdr_stream *xdr, size_t nbytes)
++static __be32 * __xdr_inline_decode(struct xdr_stream *xdr, size_t nbytes)
+ {
+ __be32 *p = xdr->p;
+ __be32 *q = p + XDR_QUADLEN(nbytes);
+
+ if (unlikely(q > xdr->end || q < p))
+ return NULL;
++ xdr->p = q;
+ return p;
+ }
+-EXPORT_SYMBOL_GPL(xdr_inline_peek);
+
+ /**
+- * xdr_inline_decode - Retrieve non-page XDR data to decode
++ * xdr_set_scratch_buffer - Attach a scratch buffer for decoding data.
++ * @xdr: pointer to xdr_stream struct
++ * @buf: pointer to an empty buffer
++ * @buflen: size of 'buf'
++ *
++ * The scratch buffer is used when decoding from an array of pages.
++ * If an xdr_inline_decode() call spans across page boundaries, then
++ * we copy the data into the scratch buffer in order to allow linear
++ * access.
++ */
++void xdr_set_scratch_buffer(struct xdr_stream *xdr, void *buf, size_t buflen)
++{
++ xdr->scratch.iov_base = buf;
++ xdr->scratch.iov_len = buflen;
++}
++EXPORT_SYMBOL_GPL(xdr_set_scratch_buffer);
++
++static __be32 *xdr_copy_to_scratch(struct xdr_stream *xdr, size_t nbytes)
++{
++ __be32 *p;
++ void *cpdest = xdr->scratch.iov_base;
++ size_t cplen = (char *)xdr->end - (char *)xdr->p;
++
++ if (nbytes > xdr->scratch.iov_len)
++ return NULL;
++ memcpy(cpdest, xdr->p, cplen);
++ cpdest += cplen;
++ nbytes -= cplen;
++ if (!xdr_set_next_buffer(xdr))
++ return NULL;
++ p = __xdr_inline_decode(xdr, nbytes);
++ if (p == NULL)
++ return NULL;
++ memcpy(cpdest, p, nbytes);
++ return xdr->scratch.iov_base;
++}
++
++/**
++ * xdr_inline_decode - Retrieve XDR data to decode
+ * @xdr: pointer to xdr_stream struct
+ * @nbytes: number of bytes of data to decode
+ *
+@@ -605,13 +699,16 @@ EXPORT_SYMBOL_GPL(xdr_inline_peek);
+ */
+ __be32 * xdr_inline_decode(struct xdr_stream *xdr, size_t nbytes)
+ {
+- __be32 *p = xdr->p;
+- __be32 *q = p + XDR_QUADLEN(nbytes);
++ __be32 *p;
+
+- if (unlikely(q > xdr->end || q < p))
++ if (nbytes == 0)
++ return xdr->p;
++ if (xdr->p == xdr->end && !xdr_set_next_buffer(xdr))
+ return NULL;
+- xdr->p = q;
+- return p;
++ p = __xdr_inline_decode(xdr, nbytes);
++ if (p != NULL)
++ return p;
++ return xdr_copy_to_scratch(xdr, nbytes);
+ }
+ EXPORT_SYMBOL_GPL(xdr_inline_decode);
+
+@@ -671,16 +768,12 @@ EXPORT_SYMBOL_GPL(xdr_read_pages);
+ */
+ void xdr_enter_page(struct xdr_stream *xdr, unsigned int len)
+ {
+- char * kaddr = page_address(xdr->buf->pages[0]);
+ xdr_read_pages(xdr, len);
+ /*
+ * Position current pointer at beginning of tail, and
+ * set remaining message length.
+ */
+- if (len > PAGE_CACHE_SIZE - xdr->buf->page_base)
+- len = PAGE_CACHE_SIZE - xdr->buf->page_base;
+- xdr->p = (__be32 *)(kaddr + xdr->buf->page_base);
+- xdr->end = (__be32 *)((char *)xdr->p + len);
++ xdr_set_page_base(xdr, 0, len);
+ }
+ EXPORT_SYMBOL_GPL(xdr_enter_page);
+
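
The scratch-buffer mechanism documented above is easiest to see from the
caller's side. A minimal decoding sketch, illustrative only and not part of
the patch (the opaque-object layout, the destination buffer and the
scratch_page argument are assumptions):

	static int example_decode_opaque(struct xdr_stream *xdr, void *dest,
					 struct page *scratch_page)
	{
		__be32 *p;
		u32 len;

		/* One page of scratch space lets xdr_inline_decode() return a
		 * linear copy of an object that straddles a page boundary. */
		xdr_set_scratch_buffer(xdr, page_address(scratch_page), PAGE_SIZE);

		p = xdr_inline_decode(xdr, 4);		/* length word */
		if (p == NULL)
			return -EIO;
		len = be32_to_cpup(p);

		p = xdr_inline_decode(xdr, len);	/* may point into the scratch page */
		if (p == NULL)
			return -EIO;
		memcpy(dest, p, len);
		return 0;
	}
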
--- /dev/null
+From e00b8a24041f37e56b4b8415ce4eba1cbc238065 Mon Sep 17 00:00:00 2001
+From: Trond Myklebust <Trond.Myklebust@netapp.com>
+Date: Thu, 27 Jan 2011 14:55:39 -0500
+Subject: NFS: Fix an NFS client lockdep issue
+
+From: Trond Myklebust <Trond.Myklebust@netapp.com>
+
+commit e00b8a24041f37e56b4b8415ce4eba1cbc238065 upstream.
+
+There is no reason to be freeing the delegation cred in the rcu callback,
+and doing so results in a lockdep complaint that rpc_credcache_lock
+is being taken from both softirq and non-softirq contexts.
+
+Reported-by: Chuck Lever <chuck.lever@oracle.com>
+Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ fs/nfs/delegation.c | 6 ++++--
+ 1 file changed, 4 insertions(+), 2 deletions(-)
+
+--- a/fs/nfs/delegation.c
++++ b/fs/nfs/delegation.c
+@@ -23,8 +23,6 @@
+
+ static void nfs_do_free_delegation(struct nfs_delegation *delegation)
+ {
+- if (delegation->cred)
+- put_rpccred(delegation->cred);
+ kfree(delegation);
+ }
+
+@@ -37,6 +35,10 @@ static void nfs_free_delegation_callback
+
+ static void nfs_free_delegation(struct nfs_delegation *delegation)
+ {
++ if (delegation->cred) {
++ put_rpccred(delegation->cred);
++ delegation->cred = NULL;
++ }
+ call_rcu(&delegation->rcu, nfs_free_delegation_callback);
+ }
+
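
The rule the fix follows: an RCU callback runs in softirq context, so a
resource whose release may take process-context locks (here the RPC
credential, guarded by rpc_credcache_lock) is dropped before call_rcu(),
leaving only kfree() for the callback. A generic sketch of that shape
(struct example_obj, struct example_cred and put_example_cred() are
hypothetical names, not from the patch):

	struct example_obj {
		struct example_cred *cred;
		struct rcu_head rcu;
	};

	static void example_free_rcu(struct rcu_head *head)
	{
		/* softirq context: only free memory here */
		kfree(container_of(head, struct example_obj, rcu));
	}

	static void example_free(struct example_obj *obj)
	{
		/* process context: safe to take ordinary locks */
		put_example_cred(obj->cred);
		obj->cred = NULL;
		call_rcu(&obj->rcu, example_free_rcu);
	}
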
--- /dev/null
+From 839f7ad6932d95f4d5ae7267b95c574714ff3d5b Mon Sep 17 00:00:00 2001
+From: Chuck Lever <chuck.lever@oracle.com>
+Date: Fri, 21 Jan 2011 15:54:57 +0000
+Subject: NFS: Fix "kernel BUG at fs/aio.c:554!"
+
+From: Chuck Lever <chuck.lever@oracle.com>
+
+commit 839f7ad6932d95f4d5ae7267b95c574714ff3d5b upstream.
+
+Nick Piggin reports:
+
+> I'm getting use after frees in aio code in NFS
+>
+> [ 2703.396766] Call Trace:
+> [ 2703.396858] [<ffffffff8100b057>] ? native_sched_clock+0x27/0x80
+> [ 2703.396959] [<ffffffff8108509e>] ? put_lock_stats+0xe/0x40
+> [ 2703.397058] [<ffffffff81088348>] ? lock_release_holdtime+0xa8/0x140
+> [ 2703.397159] [<ffffffff8108a2a5>] lock_acquire+0x95/0x1b0
+> [ 2703.397260] [<ffffffff811627db>] ? aio_put_req+0x2b/0x60
+> [ 2703.397361] [<ffffffff81039701>] ? get_parent_ip+0x11/0x50
+> [ 2703.397464] [<ffffffff81612a31>] _raw_spin_lock_irq+0x41/0x80
+> [ 2703.397564] [<ffffffff811627db>] ? aio_put_req+0x2b/0x60
+> [ 2703.397662] [<ffffffff811627db>] aio_put_req+0x2b/0x60
+> [ 2703.397761] [<ffffffff811647fe>] do_io_submit+0x2be/0x7c0
+> [ 2703.397895] [<ffffffff81164d0b>] sys_io_submit+0xb/0x10
+> [ 2703.397995] [<ffffffff8100307b>] system_call_fastpath+0x16/0x1b
+>
+> Adding some tracing, it is due to nfs completing the request then
+> returning something other than -EIOCBQUEUED, so aio.c
+> also completes the request.
+
+To address this, prevent the NFS direct I/O engine from completing
+async iocbs when the forward path returns an error without starting
+any I/O.
+
+This fix appears to survive ^C during both "xfstest no. 208" and "fsx
+-Z."
+
+It's likely this bug has existed for a very long while, as we are seeing
+very similar symptoms in OEL 5. Copying stable.
+
+Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
+Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ fs/nfs/direct.c | 34 ++++++++++++++++++++--------------
+ 1 file changed, 20 insertions(+), 14 deletions(-)
+
+--- a/fs/nfs/direct.c
++++ b/fs/nfs/direct.c
+@@ -407,15 +407,18 @@ static ssize_t nfs_direct_read_schedule_
+ pos += vec->iov_len;
+ }
+
++ /*
++ * If no bytes were started, return the error, and let the
++ * generic layer handle the completion.
++ */
++ if (requested_bytes == 0) {
++ nfs_direct_req_release(dreq);
++ return result < 0 ? result : -EIO;
++ }
++
+ if (put_dreq(dreq))
+ nfs_direct_complete(dreq);
+-
+- if (requested_bytes != 0)
+- return 0;
+-
+- if (result < 0)
+- return result;
+- return -EIO;
++ return 0;
+ }
+
+ static ssize_t nfs_direct_read(struct kiocb *iocb, const struct iovec *iov,
+@@ -841,15 +844,18 @@ static ssize_t nfs_direct_write_schedule
+ pos += vec->iov_len;
+ }
+
++ /*
++ * If no bytes were started, return the error, and let the
++ * generic layer handle the completion.
++ */
++ if (requested_bytes == 0) {
++ nfs_direct_req_release(dreq);
++ return result < 0 ? result : -EIO;
++ }
++
+ if (put_dreq(dreq))
+ nfs_direct_write_complete(dreq, dreq->inode);
+-
+- if (requested_bytes != 0)
+- return 0;
+-
+- if (result < 0)
+- return result;
+- return -EIO;
++ return 0;
+ }
+
+ static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov,
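
The contract the two hunks above enforce: an async handler may return
-EIOCBQUEUED only when it has actually started I/O and will complete the iocb
itself; if nothing was started it must return the error and leave completion
to fs/aio.c, and it must release its own request since no completion callback
will ever run. A simplified sketch of that shape (example_req,
example_schedule_io() and example_req_release() are hypothetical names):

	static ssize_t example_aio_rw(struct kiocb *iocb, struct example_req *req)
	{
		ssize_t started, result;

		result = example_schedule_io(iocb, req, &started);
		if (started == 0) {
			/* nothing in flight: no completion callback will fire,
			 * so drop our reference and report the error; fs/aio.c
			 * completes the iocb from this return value */
			example_req_release(req);
			return result < 0 ? result : -EIO;
		}
		/* I/O in flight: completion will call aio_complete() later */
		return -EIOCBQUEUED;
	}
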
--- /dev/null
+From 8a0eebf66e3b1deae036553ba641a9c2bdbae678 Mon Sep 17 00:00:00 2001
+From: Trond Myklebust <Trond.Myklebust@netapp.com>
+Date: Thu, 13 Jan 2011 14:15:50 -0500
+Subject: NFS: Fix NFSv3 exclusive open semantics
+
+From: Trond Myklebust <Trond.Myklebust@netapp.com>
+
+commit 8a0eebf66e3b1deae036553ba641a9c2bdbae678 upstream.
+
+Commit c0204fd2b8fe047b18b67e07e1bf2a03691240cd (NFS: Clean up
+nfs4_proc_create()) broke NFSv3 exclusive open by removing the code
+that passes the O_EXCL flag down to nfs3_proc_create(). This patch
+reverts that offending hunk from the original commit.
+
+Reported-by: Nick Bowler <nbowler@elliptictech.com>
+Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
+Tested-by: Nick Bowler <nbowler@elliptictech.com>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ fs/nfs/dir.c | 6 +++++-
+ 1 file changed, 5 insertions(+), 1 deletion(-)
+
+--- a/fs/nfs/dir.c
++++ b/fs/nfs/dir.c
+@@ -1577,6 +1577,7 @@ static int nfs_create(struct inode *dir,
+ {
+ struct iattr attr;
+ int error;
++ int open_flags = 0;
+
+ dfprintk(VFS, "NFS: create(%s/%ld), %s\n",
+ dir->i_sb->s_id, dir->i_ino, dentry->d_name.name);
+@@ -1584,7 +1585,10 @@ static int nfs_create(struct inode *dir,
+ attr.ia_mode = mode;
+ attr.ia_valid = ATTR_MODE;
+
+- error = NFS_PROTO(dir)->create(dir, dentry, &attr, 0, NULL);
++ if ((nd->flags & LOOKUP_CREATE) != 0)
++ open_flags = nd->intent.open.flags;
++
++ error = NFS_PROTO(dir)->create(dir, dentry, &attr, open_flags, NULL);
+ if (error != 0)
+ goto out_err;
+ return 0;
--- /dev/null
+From f6af99ec1b261e21219d5eba99e3af48fc6c32d4 Mon Sep 17 00:00:00 2001
+From: J. Bruce Fields <bfields@redhat.com>
+Date: Tue, 4 Jan 2011 18:02:15 -0500
+Subject: nfsd4: name->id mapping should fail with BADOWNER not BADNAME
+
+From: J. Bruce Fields <bfields@redhat.com>
+
+commit f6af99ec1b261e21219d5eba99e3af48fc6c32d4 upstream.
+
+According to rfc 3530 BADNAME is for strings that represent paths;
+BADOWNER is for user/group names that don't map.
+
+And the too-long name should probably be BADOWNER as well; it's
+effectively the same as if we couldn't map it.
+
+Reported-by: Trond Myklebust <Trond.Myklebust@netapp.com>
+Reported-by: Simon Kirby <sim@hostway.ca>
+Signed-off-by: J. Bruce Fields <bfields@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ fs/nfsd/nfs4idmap.c | 4 ++--
+ fs/nfsd/nfsd.h | 1 +
+ fs/nfsd/nfsproc.c | 2 +-
+ 3 files changed, 4 insertions(+), 3 deletions(-)
+
+--- a/fs/nfsd/nfs4idmap.c
++++ b/fs/nfsd/nfs4idmap.c
+@@ -524,13 +524,13 @@ idmap_name_to_id(struct svc_rqst *rqstp,
+ int ret;
+
+ if (namelen + 1 > sizeof(key.name))
+- return -EINVAL;
++ return -ESRCH; /* nfserr_badowner */
+ memcpy(key.name, name, namelen);
+ key.name[namelen] = '\0';
+ strlcpy(key.authname, rqst_authname(rqstp), sizeof(key.authname));
+ ret = idmap_lookup(rqstp, nametoid_lookup, &key, &nametoid_cache, &item);
+ if (ret == -ENOENT)
+- ret = -ESRCH; /* nfserr_badname */
++ ret = -ESRCH; /* nfserr_badowner */
+ if (ret)
+ return ret;
+ *id = item->id;
+--- a/fs/nfsd/nfsd.h
++++ b/fs/nfsd/nfsd.h
+@@ -158,6 +158,7 @@ void nfsd_lockd_shutdown(void);
+ #define nfserr_attrnotsupp cpu_to_be32(NFSERR_ATTRNOTSUPP)
+ #define nfserr_bad_xdr cpu_to_be32(NFSERR_BAD_XDR)
+ #define nfserr_openmode cpu_to_be32(NFSERR_OPENMODE)
++#define nfserr_badowner cpu_to_be32(NFSERR_BADOWNER)
+ #define nfserr_locks_held cpu_to_be32(NFSERR_LOCKS_HELD)
+ #define nfserr_op_illegal cpu_to_be32(NFSERR_OP_ILLEGAL)
+ #define nfserr_grace cpu_to_be32(NFSERR_GRACE)
+--- a/fs/nfsd/nfsproc.c
++++ b/fs/nfsd/nfsproc.c
+@@ -737,7 +737,7 @@ nfserrno (int errno)
+ { nfserr_jukebox, -ERESTARTSYS },
+ { nfserr_dropit, -EAGAIN },
+ { nfserr_dropit, -ENOMEM },
+- { nfserr_badname, -ESRCH },
++ { nfserr_badowner, -ESRCH },
+ { nfserr_io, -ETXTBSY },
+ { nfserr_notsupp, -EOPNOTSUPP },
+ { nfserr_toosmall, -ETOOSMALL },
--- /dev/null
+From ceff1a770933e2ca2bf995b453dade4ec47a9878 Mon Sep 17 00:00:00 2001
+From: Dave Anderson <anderson@redhat.com>
+Date: Wed, 12 Jan 2011 17:00:36 -0800
+Subject: /proc/kcore: fix seeking
+
+From: Dave Anderson <anderson@redhat.com>
+
+commit ceff1a770933e2ca2bf995b453dade4ec47a9878 upstream.
+
+Commit 34aacb2920 ("procfs: Use generic_file_llseek in /proc/kcore") broke
+seeking on /proc/kcore. This changes it back to use default_llseek in
+order to restore the original behavior.
+
+The problem with generic_file_llseek is that it only allows seeks up to
+inode->i_sb->s_maxbytes, which is 2GB-1 on procfs, where the memory file
+offset values in the /proc/kcore PT_LOAD segments may exceed or start
+beyond that offset value.
+
+A similar revert was made for /proc/vmcore.
+
+Signed-off-by: Dave Anderson <anderson@redhat.com>
+Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ fs/proc/kcore.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/fs/proc/kcore.c
++++ b/fs/proc/kcore.c
+@@ -558,7 +558,7 @@ static int open_kcore(struct inode *inod
+ static const struct file_operations proc_kcore_operations = {
+ .read = read_kcore,
+ .open = open_kcore,
+- .llseek = generic_file_llseek,
++ .llseek = default_llseek,
+ };
+
+ #ifdef CONFIG_MEMORY_HOTPLUG
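
The behavioural difference the revert restores, reduced to the check that
matters (a simplified sketch, not the exact kernel implementations):

	static loff_t sketch_generic_llseek(struct file *file, loff_t new_pos)
	{
		/* bounded by the filesystem's maximum file size: 2GB-1 on procfs */
		if (new_pos < 0 || new_pos > file->f_mapping->host->i_sb->s_maxbytes)
			return -EINVAL;
		file->f_pos = new_pos;
		return new_pos;
	}

	static loff_t sketch_default_llseek(struct file *file, loff_t new_pos)
	{
		/* only a non-negative offset is required, so the large PT_LOAD
		 * file offsets in /proc/kcore remain reachable */
		if (new_pos < 0)
			return -EINVAL;
		file->f_pos = new_pos;
		return new_pos;
	}
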
--- /dev/null
+From db8b10167126d72829653690f57b9c7ca53c4d54 Mon Sep 17 00:00:00 2001
+From: Steve Wise <swise@opengridcomputing.com>
+Date: Mon, 10 Jan 2011 17:41:43 -0800
+Subject: RDMA/cxgb4: Don't re-init wait object in init/fini paths
+
+From: Steve Wise <swise@opengridcomputing.com>
+
+commit db8b10167126d72829653690f57b9c7ca53c4d54 upstream.
+
+Re-initializing the wait object in rdma_init()/rdma_fini() causes a
+timing window which can lead to a deadlock during close. Once this
+deadlock hits, all RDMA activity over the T4 device will be stuck.
+
+There's no need to re-init the wait object, so remove it.
+
+Signed-off-by: Steve Wise <swise@opengridcomputing.com>
+Signed-off-by: Roland Dreier <rolandd@cisco.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ drivers/infiniband/hw/cxgb4/qp.c | 2 --
+ 1 file changed, 2 deletions(-)
+
+--- a/drivers/infiniband/hw/cxgb4/qp.c
++++ b/drivers/infiniband/hw/cxgb4/qp.c
+@@ -1029,7 +1029,6 @@ static int rdma_fini(struct c4iw_dev *rh
+ wqe->cookie = (unsigned long) &ep->com.wr_wait;
+
+ wqe->u.fini.type = FW_RI_TYPE_FINI;
+- c4iw_init_wr_wait(&ep->com.wr_wait);
+ ret = c4iw_ofld_send(&rhp->rdev, skb);
+ if (ret)
+ goto out;
+@@ -1125,7 +1124,6 @@ static int rdma_init(struct c4iw_dev *rh
+ if (qhp->attr.mpa_attr.initiator)
+ build_rtr_msg(qhp->attr.mpa_attr.p2p_type, &wqe->u.init);
+
+- c4iw_init_wr_wait(&qhp->ep->com.wr_wait);
+ ret = c4iw_ofld_send(&rhp->rdev, skb);
+ if (ret)
+ goto out;
--- /dev/null
+From 6a09a9d6946dd516d243d072bee83fae3c683471 Mon Sep 17 00:00:00 2001
+From: Steve Wise <swise@opengridcomputing.com>
+Date: Fri, 21 Jan 2011 17:00:29 +0000
+Subject: RDMA/cxgb4: Limit MAXBURST EQ context field to 256B
+
+From: Steve Wise <swise@opengridcomputing.com>
+
+commit 6a09a9d6946dd516d243d072bee83fae3c683471 upstream.
+
+MAXBURST cannot exceed 256B for on-chip queues. With a 512B MAXBURST,
+we can lock up the chip.
+
+Signed-off-by: Steve Wise <swise@opengridcomputing.com>
+Signed-off-by: Roland Dreier <roland@purestorage.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ drivers/infiniband/hw/cxgb4/qp.c | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+--- a/drivers/infiniband/hw/cxgb4/qp.c
++++ b/drivers/infiniband/hw/cxgb4/qp.c
+@@ -220,7 +220,7 @@ static int create_qp(struct c4iw_rdev *r
+ V_FW_RI_RES_WR_DCAEN(0) |
+ V_FW_RI_RES_WR_DCACPU(0) |
+ V_FW_RI_RES_WR_FBMIN(2) |
+- V_FW_RI_RES_WR_FBMAX(3) |
++ V_FW_RI_RES_WR_FBMAX(2) |
+ V_FW_RI_RES_WR_CIDXFTHRESHO(0) |
+ V_FW_RI_RES_WR_CIDXFTHRESH(0) |
+ V_FW_RI_RES_WR_EQSIZE(eqsize));
+@@ -243,7 +243,7 @@ static int create_qp(struct c4iw_rdev *r
+ V_FW_RI_RES_WR_DCAEN(0) |
+ V_FW_RI_RES_WR_DCACPU(0) |
+ V_FW_RI_RES_WR_FBMIN(2) |
+- V_FW_RI_RES_WR_FBMAX(3) |
++ V_FW_RI_RES_WR_FBMAX(2) |
+ V_FW_RI_RES_WR_CIDXFTHRESHO(0) |
+ V_FW_RI_RES_WR_CIDXFTHRESH(0) |
+ V_FW_RI_RES_WR_EQSIZE(eqsize));
--- /dev/null
+From 94788657c94169171971968c9d4b6222c5e704aa Mon Sep 17 00:00:00 2001
+From: Steve Wise <swise@opengridcomputing.com>
+Date: Fri, 21 Jan 2011 17:00:34 +0000
+Subject: RDMA/cxgb4: Set the correct device physical function for iWARP connections
+
+From: Steve Wise <swise@opengridcomputing.com>
+
+commit 94788657c94169171971968c9d4b6222c5e704aa upstream.
+
+The PF passed to FW was 0, causing PCI failures in an SR-IOV environment.
+
+Signed-off-by: Steve Wise <swise@opengridcomputing.com>
+Signed-off-by: Roland Dreier <roland@purestorage.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ drivers/infiniband/hw/cxgb4/cm.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/drivers/infiniband/hw/cxgb4/cm.c
++++ b/drivers/infiniband/hw/cxgb4/cm.c
+@@ -380,7 +380,7 @@ static void send_flowc(struct c4iw_ep *e
+ 16)) | FW_WR_FLOWID(ep->hwtid));
+
+ flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_PFNVFN;
+- flowc->mnemval[0].val = cpu_to_be32(0);
++ flowc->mnemval[0].val = cpu_to_be32(PCI_FUNC(ep->com.dev->rdev.lldi.pdev->devfn) << 8);
+ flowc->mnemval[1].mnemonic = FW_FLOWC_MNEM_CH;
+ flowc->mnemval[1].val = cpu_to_be32(ep->tx_chan);
+ flowc->mnemval[2].mnemonic = FW_FLOWC_MNEM_PORT;
--- /dev/null
+From 2fb08e6ca9f00d1aedb3964983e9c8f84b36b807 Mon Sep 17 00:00:00 2001
+From: Paul Fox <pgf@laptop.org>
+Date: Wed, 12 Jan 2011 17:00:07 -0800
+Subject: rtc-cmos: fix suspend/resume
+
+From: Paul Fox <pgf@laptop.org>
+
+commit 2fb08e6ca9f00d1aedb3964983e9c8f84b36b807 upstream.
+
+rtc-cmos was setting suspend/resume hooks at the device_driver level.
+However, the platform bus code (drivers/base/platform.c) only looks for
+resume hooks at the dev_pm_ops level, or within the platform_driver.
+
+Switch rtc_cmos to use dev_pm_ops so that suspend/resume code is executed
+again.
+
+Paul said:
+
+: The user visible symptom in our (XO laptop) case was that rtcwake would
+: fail to wake the laptop. The RTC alarm would expire, but the wakeup
+: wasn't unmasked.
+:
+: As for severity, the impact may have been reduced because if I recall
+: correctly, the bug only affected platforms with CONFIG_PNP disabled.
+
+Signed-off-by: Paul Fox <pgf@laptop.org>
+Signed-off-by: Daniel Drake <dsd@laptop.org>
+Acked-by: Rafael J. Wysocki <rjw@sisk.pl>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ drivers/rtc/rtc-cmos.c | 16 +++++++++-------
+ 1 file changed, 9 insertions(+), 7 deletions(-)
+
+--- a/drivers/rtc/rtc-cmos.c
++++ b/drivers/rtc/rtc-cmos.c
+@@ -36,6 +36,7 @@
+ #include <linux/platform_device.h>
+ #include <linux/mod_devicetable.h>
+ #include <linux/log2.h>
++#include <linux/pm.h>
+
+ /* this is for "generic access to PC-style RTC" using CMOS_READ/CMOS_WRITE */
+ #include <asm-generic/rtc.h>
+@@ -850,7 +851,7 @@ static void __exit cmos_do_remove(struct
+
+ #ifdef CONFIG_PM
+
+-static int cmos_suspend(struct device *dev, pm_message_t mesg)
++static int cmos_suspend(struct device *dev)
+ {
+ struct cmos_rtc *cmos = dev_get_drvdata(dev);
+ unsigned char tmp;
+@@ -898,7 +899,7 @@ static int cmos_suspend(struct device *d
+ */
+ static inline int cmos_poweroff(struct device *dev)
+ {
+- return cmos_suspend(dev, PMSG_HIBERNATE);
++ return cmos_suspend(dev);
+ }
+
+ static int cmos_resume(struct device *dev)
+@@ -945,9 +946,9 @@ static int cmos_resume(struct device *de
+ return 0;
+ }
+
++static SIMPLE_DEV_PM_OPS(cmos_pm_ops, cmos_suspend, cmos_resume);
++
+ #else
+-#define cmos_suspend NULL
+-#define cmos_resume NULL
+
+ static inline int cmos_poweroff(struct device *dev)
+ {
+@@ -1077,7 +1078,7 @@ static void __exit cmos_pnp_remove(struc
+
+ static int cmos_pnp_suspend(struct pnp_dev *pnp, pm_message_t mesg)
+ {
+- return cmos_suspend(&pnp->dev, mesg);
++ return cmos_suspend(&pnp->dev);
+ }
+
+ static int cmos_pnp_resume(struct pnp_dev *pnp)
+@@ -1157,8 +1158,9 @@ static struct platform_driver cmos_platf
+ .shutdown = cmos_platform_shutdown,
+ .driver = {
+ .name = (char *) driver_name,
+- .suspend = cmos_suspend,
+- .resume = cmos_resume,
++#ifdef CONFIG_PM
++ .pm = &cmos_pm_ops,
++#endif
+ }
+ };
+
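
The shape of the fix, generalized: the platform bus only consults dev_pm_ops
(or platform_driver-level hooks), not the device_driver-level .suspend/.resume
members, so the callbacks have to be wired up through a struct dev_pm_ops.
A minimal pattern for a hypothetical platform driver, mirroring the change
above:

	static int foo_suspend(struct device *dev)
	{
		/* quiesce the device, arm wakeups, etc. */
		return 0;
	}

	static int foo_resume(struct device *dev)
	{
		/* restore device state */
		return 0;
	}

	static SIMPLE_DEV_PM_OPS(foo_pm_ops, foo_suspend, foo_resume);

	static struct platform_driver foo_driver = {
		.driver = {
			.name = "foo",
			.pm   = &foo_pm_ops,	/* consulted by the PM core */
		},
	};
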
ath9k_hw-do-pa-offset-calibration-only-on-longcal-interval.patch
ath9k_hw-disabled-paprd-for-ar9003.patch
ath9k_hw-fix-system-hang-when-resuming-from-s3-s4.patch
-ath9k-fix-race-conditions-when-stop-device.patch
ath-missed-to-clear-key4-of-micentry.patch
qdio-use-proper-qebsm-operand-for-siga-r-and-siga-s.patch
zcrypt-fix-check-to-look-for-facility-bits-2-65.patch
asoc-wm8990-msleep-takes-milliseconds-not-jiffies.patch
asoc-blackfin-ac97-fix-build-error-after-multi-component-update.patch
asoc-blackfin-tdm-fix-missed-snd_soc_dai_get_drvdata-update.patch
+nfs-don-t-use-vm_map_ram-in-readdir.patch
+nfs-fix-nfsv3-exclusive-open-semantics.patch
+nfs-fix-an-nfs-client-lockdep-issue.patch
+nfs-fix-kernel-bug-at-fs-aio.c-554.patch
+nfsd4-name-id-mapping-should-fail-with-badowner-not-badname.patch
+dynamic-debug-fix-build-issue-with-older-gcc.patch
+rdma-cxgb4-don-t-re-init-wait-object-in-init-fini-paths.patch
+rdma-cxgb4-set-the-correct-device-physical-function-for-iwarp-connections.patch
+rdma-cxgb4-limit-maxburst-eq-context-field-to-256b.patch
+proc-kcore-fix-seeking.patch
+rtc-cmos-fix-suspend-resume.patch
+kref-add-kref_test_and_get.patch
+block-fix-accounting-bug-on-cross-partition-merges.patch
+mm-migration-use-rcu_dereference_protected-when-dereferencing-the-radix-tree-slot-during-file-page-migration.patch
+mm-fix-migration-hangs-on-anon_vma-lock.patch
+mm-fix-hugepage-migration.patch
+genirq-prevent-irq-storm-on-migration.patch
+writeback-integrated-background-writeback-work.patch
+writeback-stop-background-kupdate-works-from-livelocking-other-works.patch
+writeback-avoid-livelocking-wb_sync_all-writeback.patch
+ext4-fix-uninitialized-variable-in-ext4_register_li_request.patch
+ext4-fix-trimming-of-a-single-group.patch
+ext4-fix-memory-leak-in-ext4_free_branches.patch
+ext4-fix-panic-on-module-unload-when-stopping-lazyinit-thread.patch
+ext4-unregister-features-interface-on-module-unload.patch
+ext4-fix-data-corruption-with-multi-block-writepages-support.patch
+ext4-make-grpinfo-slab-cache-names-static.patch
--- /dev/null
+From b9543dac5bbc4aef0a598965b6b34f6259ab9a9b Mon Sep 17 00:00:00 2001
+From: Jan Kara <jack@suse.cz>
+Date: Thu, 13 Jan 2011 15:45:48 -0800
+Subject: writeback: avoid livelocking WB_SYNC_ALL writeback
+
+From: Jan Kara <jack@suse.cz>
+
+commit b9543dac5bbc4aef0a598965b6b34f6259ab9a9b upstream.
+
+When wb_writeback() is called in WB_SYNC_ALL mode, work->nr_to_write is
+usually set to LONG_MAX. The logic in wb_writeback() then calls
+__writeback_inodes_sb() with nr_to_write == MAX_WRITEBACK_PAGES and we
+easily end up with non-positive nr_to_write after the function returns, if
+the inode has more than MAX_WRITEBACK_PAGES dirty pages at the moment.
+
+When nr_to_write is <= 0 wb_writeback() decides we need another round of
+writeback but this is wrong in some cases! For example when a single
+large file is continuously dirtied, we would never finish syncing it
+because each pass would be able to write MAX_WRITEBACK_PAGES and inode
+dirty timestamp never gets updated (as inode is never completely clean).
+Thus __writeback_inodes_sb() would write the redirtied inode again and
+again.
+
+Fix the issue by setting nr_to_write to LONG_MAX in WB_SYNC_ALL mode. We
+do not need nr_to_write in WB_SYNC_ALL mode anyway since
+write_cache_pages() does livelock avoidance using page tagging in
+WB_SYNC_ALL mode.
+
+This makes wb_writeback() call __writeback_inodes_sb() only once on
+WB_SYNC_ALL. The latter function won't livelock because it works on
+
+- a finite set of files by doing queue_io() once at the beginning
+- a finite set of pages by PAGECACHE_TAG_TOWRITE page tagging
+
+After this patch, program from http://lkml.org/lkml/2010/10/24/154 is no
+longer able to stall sync forever.
+
+[fengguang.wu@intel.com: fix locking comment]
+Signed-off-by: Jan Kara <jack@suse.cz>
+Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
+Cc: Johannes Weiner <hannes@cmpxchg.org>
+Cc: Dave Chinner <david@fromorbit.com>
+Cc: Christoph Hellwig <hch@lst.de>
+Cc: Jan Engelhardt <jengelh@medozas.de>
+Cc: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ fs/fs-writeback.c | 27 +++++++++++++++++++++++----
+ 1 file changed, 23 insertions(+), 4 deletions(-)
+
+--- a/fs/fs-writeback.c
++++ b/fs/fs-writeback.c
+@@ -629,6 +629,7 @@ static long wb_writeback(struct bdi_writ
+ };
+ unsigned long oldest_jif;
+ long wrote = 0;
++ long write_chunk;
+ struct inode *inode;
+
+ if (wbc.for_kupdate) {
+@@ -641,6 +642,24 @@ static long wb_writeback(struct bdi_writ
+ wbc.range_end = LLONG_MAX;
+ }
+
++ /*
++ * WB_SYNC_ALL mode does livelock avoidance by syncing dirty
++ * inodes/pages in one big loop. Setting wbc.nr_to_write=LONG_MAX
++ * here avoids calling into writeback_inodes_wb() more than once.
++ *
++ * The intended call sequence for WB_SYNC_ALL writeback is:
++ *
++ * wb_writeback()
++ * __writeback_inodes_sb() <== called only once
++ * write_cache_pages() <== called once for each inode
++ * (quickly) tag currently dirty pages
++ * (maybe slowly) sync all tagged pages
++ */
++ if (wbc.sync_mode == WB_SYNC_NONE)
++ write_chunk = MAX_WRITEBACK_PAGES;
++ else
++ write_chunk = LONG_MAX;
++
+ wbc.wb_start = jiffies; /* livelock avoidance */
+ for (;;) {
+ /*
+@@ -667,7 +686,7 @@ static long wb_writeback(struct bdi_writ
+ break;
+
+ wbc.more_io = 0;
+- wbc.nr_to_write = MAX_WRITEBACK_PAGES;
++ wbc.nr_to_write = write_chunk;
+ wbc.pages_skipped = 0;
+
+ trace_wbc_writeback_start(&wbc, wb->bdi);
+@@ -677,8 +696,8 @@ static long wb_writeback(struct bdi_writ
+ writeback_inodes_wb(wb, &wbc);
+ trace_wbc_writeback_written(&wbc, wb->bdi);
+
+- work->nr_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
+- wrote += MAX_WRITEBACK_PAGES - wbc.nr_to_write;
++ work->nr_pages -= write_chunk - wbc.nr_to_write;
++ wrote += write_chunk - wbc.nr_to_write;
+
+ /*
+ * If we consumed everything, see if we have more
+@@ -693,7 +712,7 @@ static long wb_writeback(struct bdi_writ
+ /*
+ * Did we write something? Try for more
+ */
+- if (wbc.nr_to_write < MAX_WRITEBACK_PAGES)
++ if (wbc.nr_to_write < write_chunk)
+ continue;
+ /*
+ * Nothing written. Wait for some inode to
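
A concrete walk-through of the livelock the patch removes, using the numbers
implied above (MAX_WRITEBACK_PAGES is 1024 in this kernel; the workload is
assumed):

- sync(1) of a continuously-dirtied large file enters wb_writeback() in
  WB_SYNC_ALL mode with work->nr_pages = LONG_MAX
- each pass sets wbc.nr_to_write = 1024, writes 1024 pages of the file and
  returns with nr_to_write == 0
- "nr_to_write < MAX_WRITEBACK_PAGES" is taken as progress, so another pass
  starts, while the writer keeps redirtying pages and the inode never becomes
  clean - sync never returns

With write_chunk = LONG_MAX there is a single pass: write_cache_pages() tags
the pages that are dirty at that moment and writes only those, so the pass
terminates even though new dirty pages keep appearing.
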
--- /dev/null
+From 6585027a5e8cb490e3a761b2f3f3c3acf722aff2 Mon Sep 17 00:00:00 2001
+From: Jan Kara <jack@suse.cz>
+Date: Thu, 13 Jan 2011 15:45:44 -0800
+Subject: writeback: integrated background writeback work
+
+From: Jan Kara <jack@suse.cz>
+
+commit 6585027a5e8cb490e3a761b2f3f3c3acf722aff2 upstream.
+
+Check whether background writeback is needed after finishing each work.
+
+When the bdi flusher thread finishes doing some work, check whether any kind
+of background writeback needs to be done (either because
+dirty_background_ratio is exceeded or because we need to start flushing
+old inodes). If so, just do background writeback.
+
+This way, bdi_start_background_writeback() just needs to wake up the
+flusher thread. It will do background writeback as soon as there is no
+other work.
+
+This is a preparatory patch for the next patch which stops background
+writeback as soon as there is other work to do.
+
+Signed-off-by: Jan Kara <jack@suse.cz>
+Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
+Cc: Johannes Weiner <hannes@cmpxchg.org>
+Cc: Dave Chinner <david@fromorbit.com>
+Cc: Christoph Hellwig <hch@lst.de>
+Cc: Jan Engelhardt <jengelh@medozas.de>
+Cc: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ fs/fs-writeback.c | 61 ++++++++++++++++++++++++++++++++++++++++--------------
+ 1 file changed, 46 insertions(+), 15 deletions(-)
+
+--- a/fs/fs-writeback.c
++++ b/fs/fs-writeback.c
+@@ -84,13 +84,9 @@ static inline struct inode *wb_inode(str
+ return list_entry(head, struct inode, i_wb_list);
+ }
+
+-static void bdi_queue_work(struct backing_dev_info *bdi,
+- struct wb_writeback_work *work)
++/* Wakeup flusher thread or forker thread to fork it. Requires bdi->wb_lock. */
++static void bdi_wakeup_flusher(struct backing_dev_info *bdi)
+ {
+- trace_writeback_queue(bdi, work);
+-
+- spin_lock_bh(&bdi->wb_lock);
+- list_add_tail(&work->list, &bdi->work_list);
+ if (bdi->wb.task) {
+ wake_up_process(bdi->wb.task);
+ } else {
+@@ -98,15 +94,26 @@ static void bdi_queue_work(struct backin
+ * The bdi thread isn't there, wake up the forker thread which
+ * will create and run it.
+ */
+- trace_writeback_nothread(bdi, work);
+ wake_up_process(default_backing_dev_info.wb.task);
+ }
++}
++
++static void bdi_queue_work(struct backing_dev_info *bdi,
++ struct wb_writeback_work *work)
++{
++ trace_writeback_queue(bdi, work);
++
++ spin_lock_bh(&bdi->wb_lock);
++ list_add_tail(&work->list, &bdi->work_list);
++ if (!bdi->wb.task)
++ trace_writeback_nothread(bdi, work);
++ bdi_wakeup_flusher(bdi);
+ spin_unlock_bh(&bdi->wb_lock);
+ }
+
+ static void
+ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
+- bool range_cyclic, bool for_background)
++ bool range_cyclic)
+ {
+ struct wb_writeback_work *work;
+
+@@ -126,7 +133,6 @@ __bdi_start_writeback(struct backing_dev
+ work->sync_mode = WB_SYNC_NONE;
+ work->nr_pages = nr_pages;
+ work->range_cyclic = range_cyclic;
+- work->for_background = for_background;
+
+ bdi_queue_work(bdi, work);
+ }
+@@ -144,7 +150,7 @@ __bdi_start_writeback(struct backing_dev
+ */
+ void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages)
+ {
+- __bdi_start_writeback(bdi, nr_pages, true, false);
++ __bdi_start_writeback(bdi, nr_pages, true);
+ }
+
+ /**
+@@ -152,13 +158,20 @@ void bdi_start_writeback(struct backing_
+ * @bdi: the backing device to write from
+ *
+ * Description:
+- * This does WB_SYNC_NONE background writeback. The IO is only
+- * started when this function returns, we make no guarentees on
+- * completion. Caller need not hold sb s_umount semaphore.
++ * This makes sure WB_SYNC_NONE background writeback happens. When
++ * this function returns, it is only guaranteed that for given BDI
++ * some IO is happening if we are over background dirty threshold.
++ * Caller need not hold sb s_umount semaphore.
+ */
+ void bdi_start_background_writeback(struct backing_dev_info *bdi)
+ {
+- __bdi_start_writeback(bdi, LONG_MAX, true, true);
++ /*
++ * We just wake up the flusher thread. It will perform background
++ * writeback as soon as there is no other work to do.
++ */
++ spin_lock_bh(&bdi->wb_lock);
++ bdi_wakeup_flusher(bdi);
++ spin_unlock_bh(&bdi->wb_lock);
+ }
+
+ /*
+@@ -718,6 +731,23 @@ static unsigned long get_nr_dirty_pages(
+ get_nr_dirty_inodes();
+ }
+
++static long wb_check_background_flush(struct bdi_writeback *wb)
++{
++ if (over_bground_thresh()) {
++
++ struct wb_writeback_work work = {
++ .nr_pages = LONG_MAX,
++ .sync_mode = WB_SYNC_NONE,
++ .for_background = 1,
++ .range_cyclic = 1,
++ };
++
++ return wb_writeback(wb, &work);
++ }
++
++ return 0;
++}
++
+ static long wb_check_old_data_flush(struct bdi_writeback *wb)
+ {
+ unsigned long expired;
+@@ -787,6 +817,7 @@ long wb_do_writeback(struct bdi_writebac
+ * Check for periodic writeback, kupdated() style
+ */
+ wrote += wb_check_old_data_flush(wb);
++ wrote += wb_check_background_flush(wb);
+ clear_bit(BDI_writeback_running, &wb->bdi->state);
+
+ return wrote;
+@@ -873,7 +904,7 @@ void wakeup_flusher_threads(long nr_page
+ list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
+ if (!bdi_has_dirty_io(bdi))
+ continue;
+- __bdi_start_writeback(bdi, nr_pages, false, false);
++ __bdi_start_writeback(bdi, nr_pages, false);
+ }
+ rcu_read_unlock();
+ }
--- /dev/null
+From aa373cf550994623efb5d49a4d8775bafd10bbc1 Mon Sep 17 00:00:00 2001
+From: Jan Kara <jack@suse.cz>
+Date: Thu, 13 Jan 2011 15:45:47 -0800
+Subject: writeback: stop background/kupdate works from livelocking other works
+
+From: Jan Kara <jack@suse.cz>
+
+commit aa373cf550994623efb5d49a4d8775bafd10bbc1 upstream.
+
+Background writeback is easily livelockable in a loop in wb_writeback() by
+a process continuously re-dirtying pages (or continuously appending to a
+file). This is in fact intended as the target of background writeback is
+to write dirty pages it can find as long as we are over
+dirty_background_threshold.
+
+But the above behavior gets inconvenient at times because no other work
+queued in the flusher thread's queue gets processed. In particular, since
+e.g. sync(1) relies on the flusher thread to do all the IO for it, sync(1)
+can hang forever waiting for the flusher thread to do the work.
+
+Generally, when a flusher thread has some work queued, someone submitted
+the work to achieve a goal more specific than what background writeback
+does. Moreover by working on the specific work, we also reduce amount of
+dirty pages which is exactly the target of background writeout. So it
+makes sense to give specific work a priority over a generic page cleaning.
+
+Thus we interrupt background writeback if there is some other work to do.
+We return to the background writeback after completing all the queued
+work.
+
+This may delay the writeback of expired inodes for a while, however the
+expired inodes will eventually be flushed to disk as long as the other
+works won't livelock.
+
+[fengguang.wu@intel.com: update comment]
+Signed-off-by: Jan Kara <jack@suse.cz>
+Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
+Cc: Johannes Weiner <hannes@cmpxchg.org>
+Cc: Dave Chinner <david@fromorbit.com>
+Cc: Christoph Hellwig <hch@lst.de>
+Cc: Jan Engelhardt <jengelh@medozas.de>
+Cc: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ fs/fs-writeback.c | 10 ++++++++++
+ 1 file changed, 10 insertions(+)
+
+--- a/fs/fs-writeback.c
++++ b/fs/fs-writeback.c
+@@ -650,6 +650,16 @@ static long wb_writeback(struct bdi_writ
+ break;
+
+ /*
++ * Background writeout and kupdate-style writeback may
++ * run forever. Stop them if there is other work to do
++ * so that e.g. sync can proceed. They'll be restarted
++ * after the other works are all done.
++ */
++ if ((work->for_background || work->for_kupdate) &&
++ !list_empty(&wb->bdi->work_list))
++ break;
++
++ /*
+ * For background writeout, stop when we are below the
+ * background dirty threshold
+ */