From: Greg Kroah-Hartman Date: Mon, 25 Feb 2013 18:12:13 +0000 (-0800) Subject: 3.8-stable patches X-Git-Tag: v3.7.10~37 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=138b565652b8013c064f49ab8394bb086ac197db;p=thirdparty%2Fkernel%2Fstable-queue.git 3.8-stable patches added patches: drivercore-fix-ordering-between-deferred_probe-and-exiting-initcalls.patch drivers-video-backlight-adp88-0_bl.c-fix-resume.patch fs-block_dev.c-page-cache-wrongly-left-invalidated-after-revalidate_disk.patch futex-revert-futex-mark-get_robust_list-as-deprecated.patch inotify-remove-broken-mask-checks-causing-unmount-to-be-einval.patch keys-revert-one-application-of-fix-unreachable-code-patch.patch kvm-s390-handle-hosts-not-supporting-s390-virtio.patch mm-fadvise.c-drain-all-pagevecs-if-posix_fadv_dontneed-fails-to-discard-all-pages.patch nfsv4.1-don-t-decode-skipped-layoutgets.patch nfsv4.1-fix-an-abba-locking-issue-with-session-and-state-serialisation.patch nfsv4.1-fix-bulk-recall-and-destroy-of-layouts.patch nlm-ensure-that-we-resend-all-pending-blocking-locks-after-a-reclaim.patch ocfs2-unlock-super-lock-if-lockres-refresh-failed.patch s390-kvm-fix-store-status-for-acrs-fprs.patch tmpfs-fix-use-after-free-of-mempolicy-object.patch umount-oops-when-remove-blocklayoutdriver-first.patch --- diff --git a/queue-3.8/drivercore-fix-ordering-between-deferred_probe-and-exiting-initcalls.patch b/queue-3.8/drivercore-fix-ordering-between-deferred_probe-and-exiting-initcalls.patch new file mode 100644 index 00000000000..82d40a0cd4b --- /dev/null +++ b/queue-3.8/drivercore-fix-ordering-between-deferred_probe-and-exiting-initcalls.patch @@ -0,0 +1,47 @@ +From d72cca1eee5b26e313da2a380d4862924e271031 Mon Sep 17 00:00:00 2001 +From: Grant Likely +Date: Thu, 14 Feb 2013 18:14:27 +0000 +Subject: drivercore: Fix ordering between deferred_probe and exiting initcalls + +From: Grant Likely + +commit d72cca1eee5b26e313da2a380d4862924e271031 upstream. 
+ +One of the side effects of deferred probe is that some drivers which +used to be probed before initcalls completed are now happening slightly +later. This causes two problems. +- If a console driver gets deferred, then it may not be ready when + userspace starts. For example, if a uart depends on pinctrl, then the + uart will get deferred and /dev/console will not be available +- __init sections will be discarded before built-in drivers are probed. + Strictly speaking, __init functions should not be called in a drivers + __probe path, but there are a lot of drivers (console stuff again) + that do anyway. In the past it was perfectly safe to do so because all + built-in drivers got probed before the end of initcalls. + +This patch fixes the problem by forcing the first pass of the deferred +list to complete at late_initcall time. This is late enough to catch the +drivers that are known to have the above issues. + +Signed-off-by: Grant Likely +Tested-by: Haojian Zhuang +Cc: Arnd Bergmann +Cc: Russell King +Cc: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/base/dd.c | 2 ++ + 1 file changed, 2 insertions(+) + +--- a/drivers/base/dd.c ++++ b/drivers/base/dd.c +@@ -172,6 +172,8 @@ static int deferred_probe_initcall(void) + + driver_deferred_probe_enable = true; + driver_deferred_probe_trigger(); ++ /* Sort as many dependencies as possible before exiting initcalls */ ++ flush_workqueue(deferred_wq); + return 0; + } + late_initcall(deferred_probe_initcall); diff --git a/queue-3.8/drivers-video-backlight-adp88-0_bl.c-fix-resume.patch b/queue-3.8/drivers-video-backlight-adp88-0_bl.c-fix-resume.patch new file mode 100644 index 00000000000..c02a30926fa --- /dev/null +++ b/queue-3.8/drivers-video-backlight-adp88-0_bl.c-fix-resume.patch @@ -0,0 +1,46 @@ +From 5eb02c01bd1f3ef195989ab05e835e2b0711b5a9 Mon Sep 17 00:00:00 2001 +From: Lars-Peter Clausen +Date: Thu, 21 Feb 2013 16:44:04 -0800 +Subject: drivers/video/backlight/adp88?0_bl.c: fix resume + +From: 
Lars-Peter Clausen + +commit 5eb02c01bd1f3ef195989ab05e835e2b0711b5a9 upstream. + +Clearing the NSTBY bit in the control register also automatically clears +the BLEN bit. So we need to make sure to set it again during resume, +otherwise the backlight will stay off. + +Signed-off-by: Lars-Peter Clausen +Acked-by: Michael Hennerich +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/video/backlight/adp8860_bl.c | 2 +- + drivers/video/backlight/adp8870_bl.c | 2 +- + 2 files changed, 2 insertions(+), 2 deletions(-) + +--- a/drivers/video/backlight/adp8860_bl.c ++++ b/drivers/video/backlight/adp8860_bl.c +@@ -783,7 +783,7 @@ static int adp8860_i2c_suspend(struct i2 + + static int adp8860_i2c_resume(struct i2c_client *client) + { +- adp8860_set_bits(client, ADP8860_MDCR, NSTBY); ++ adp8860_set_bits(client, ADP8860_MDCR, NSTBY | BLEN); + + return 0; + } +--- a/drivers/video/backlight/adp8870_bl.c ++++ b/drivers/video/backlight/adp8870_bl.c +@@ -957,7 +957,7 @@ static int adp8870_i2c_suspend(struct i2 + + static int adp8870_i2c_resume(struct i2c_client *client) + { +- adp8870_set_bits(client, ADP8870_MDCR, NSTBY); ++ adp8870_set_bits(client, ADP8870_MDCR, NSTBY | BLEN); + + return 0; + } diff --git a/queue-3.8/fs-block_dev.c-page-cache-wrongly-left-invalidated-after-revalidate_disk.patch b/queue-3.8/fs-block_dev.c-page-cache-wrongly-left-invalidated-after-revalidate_disk.patch new file mode 100644 index 00000000000..e56257fdaef --- /dev/null +++ b/queue-3.8/fs-block_dev.c-page-cache-wrongly-left-invalidated-after-revalidate_disk.patch @@ -0,0 +1,85 @@ +From 7630b661da330b35dd57b6f5d6d62b386f2dd751 Mon Sep 17 00:00:00 2001 +From: MITSUNARI Shigeo +Date: Thu, 21 Feb 2013 16:42:01 -0800 +Subject: fs/block_dev.c: page cache wrongly left invalidated after revalidate_disk() + +From: MITSUNARI Shigeo + +commit 7630b661da330b35dd57b6f5d6d62b386f2dd751 upstream. 
+ +We found that bdev->bd_invalidated was left set once revalidate_disk() +is called, which results in page cache flush every time that device is +open. + +Specifically, we found this problem in MD block device. Once we resize +a MD device, mdadm --monitor periodically flush all page cache for that +device every 60 or 1000 seconds when it opens the device. + +This bug lies since at least 3.2.0 till the latest kernel(3.6.2). Patch +is attached. + +The following steps will reproduce the problem. + +1. prepare a block device (eg /dev/sdb). + +2. create two partitions: + + sudo parted /dev/sdb + mklabel gpt + mkpart primary 0% 50% + mkpart primary 50% 100% + +3. create a md device. + + sudo mdadm -C /dev/md/hoge -l 1 -n 2 -e 1.2 --assume-clean --auto=md --symlink=no /dev/sdb1 /dev/sdb2 + +4. create file system and mount it + + sudo mkfs.ext3 /dev/md/hoge + sudo mkdir /mnt/test + sudo mount /dev/md/hoge /mnt/test + +5. try to resize the device + + sudo mdadm -G /dev/md/hoge --size=max + +6. create a file to fill file cache. + + sudo dd if=/dev/urandom of=/mnt/test/data bs=1M count=10 + +and verify the current status of file by free command. + +7. mdadm monitor will open the md device every 1000 seconds and you + will find all file cache on the device are cleared. + +The timing can be reduced by the following steps. + +a) kill mdadm and restart it with --delay option + + /sbin/mdadm --monitor --delay=30 --pid-file /var/run/mdadm/monitor.pid --daemonise --scan --syslog + +or open the md device directly.
+ + sudo dd if=/dev/md/hoge of=/dev/null bs=4096 count=1 + +Signed-off-by: MITSUNARI Shigeo +Cc: Al Viro +Cc: Jeff Moyer +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + fs/block_dev.c | 1 + + 1 file changed, 1 insertion(+) + +--- a/fs/block_dev.c ++++ b/fs/block_dev.c +@@ -994,6 +994,7 @@ int revalidate_disk(struct gendisk *disk + + mutex_lock(&bdev->bd_mutex); + check_disk_size_change(disk, bdev); ++ bdev->bd_invalidated = 0; + mutex_unlock(&bdev->bd_mutex); + bdput(bdev); + return ret; diff --git a/queue-3.8/futex-revert-futex-mark-get_robust_list-as-deprecated.patch b/queue-3.8/futex-revert-futex-mark-get_robust_list-as-deprecated.patch new file mode 100644 index 00000000000..c8770d0c75b --- /dev/null +++ b/queue-3.8/futex-revert-futex-mark-get_robust_list-as-deprecated.patch @@ -0,0 +1,52 @@ +From fe2b05f7ca9f906be61dced5489f63b8b4d7c770 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner +Date: Mon, 18 Feb 2013 09:52:08 +0100 +Subject: futex: Revert "futex: Mark get_robust_list as deprecated" + +From: Thomas Gleixner + +commit fe2b05f7ca9f906be61dced5489f63b8b4d7c770 upstream. + +This reverts commit ec0c4274e33c0373e476b73e01995c53128f1257. + +get_robust_list() is in use and a removal would break existing user +space. With the permission checks in place it's no longer a security +hole. Remove the deprecation warnings.
+ +Signed-off-by: Thomas Gleixner +Cc: Cyrill Gorcunov +Cc: Richard Weinberger +Cc: akpm@linux-foundation.org +Cc: paul.gortmaker@windriver.com +Cc: davej@redhat.com +Cc: keescook@chromium.org +Cc: ebiederm@xmission.com +Signed-off-by: Greg Kroah-Hartman + +--- + kernel/futex.c | 2 -- + kernel/futex_compat.c | 2 -- + 2 files changed, 4 deletions(-) + +--- a/kernel/futex.c ++++ b/kernel/futex.c +@@ -2471,8 +2471,6 @@ SYSCALL_DEFINE3(get_robust_list, int, pi + if (!futex_cmpxchg_enabled) + return -ENOSYS; + +- WARN_ONCE(1, "deprecated: get_robust_list will be deleted in 2013.\n"); +- + rcu_read_lock(); + + ret = -ESRCH; +--- a/kernel/futex_compat.c ++++ b/kernel/futex_compat.c +@@ -142,8 +142,6 @@ compat_sys_get_robust_list(int pid, comp + if (!futex_cmpxchg_enabled) + return -ENOSYS; + +- WARN_ONCE(1, "deprecated: get_robust_list will be deleted in 2013.\n"); +- + rcu_read_lock(); + + ret = -ESRCH; diff --git a/queue-3.8/inotify-remove-broken-mask-checks-causing-unmount-to-be-einval.patch b/queue-3.8/inotify-remove-broken-mask-checks-causing-unmount-to-be-einval.patch new file mode 100644 index 00000000000..12588ea89ba --- /dev/null +++ b/queue-3.8/inotify-remove-broken-mask-checks-causing-unmount-to-be-einval.patch @@ -0,0 +1,80 @@ +From 676a0675cf9200ac047fb50825f80867b3bb733b Mon Sep 17 00:00:00 2001 +From: Jim Somerville +Date: Thu, 21 Feb 2013 16:41:59 -0800 +Subject: inotify: remove broken mask checks causing unmount to be EINVAL + +From: Jim Somerville + +commit 676a0675cf9200ac047fb50825f80867b3bb733b upstream. + +Running the command: + + inotifywait -e unmount /mnt/disk + +immediately aborts with a -EINVAL return code. This is however a valid +parameter. This abort occurs only if unmount is the sole event +parameter. If other event parameters are supplied, then the unmount +event wait will work. + +The problem was introduced by commit 44b350fc23e ("inotify: Fix mask +checks"). 
In that commit, it states: + + The mask checks in inotify_update_existing_watch() and + inotify_new_watch() are useless because inotify_arg_to_mask() + sets FS_IN_IGNORED and FS_EVENT_ON_CHILD bits anyway. + +But instead of removing the useless checks, it did this: + + mask = inotify_arg_to_mask(arg); + - if (unlikely(!mask)) + + if (unlikely(!(mask & IN_ALL_EVENTS))) + return -EINVAL; + +The problem is that IN_ALL_EVENTS doesn't include IN_UNMOUNT, and other +parts of the code keep IN_UNMOUNT separate from IN_ALL_EVENTS. So the +check should be: + + if (unlikely(!(mask & (IN_ALL_EVENTS | IN_UNMOUNT)))) + +But inotify_arg_to_mask(arg) always sets the IN_UNMOUNT bit in the mask +anyway, so the check is always going to pass and thus should simply be +removed. Also note that inotify_arg_to_mask completely controls what +mask bits get set from arg, there's no way for invalid bits to get +enabled there. + +Let's fix it by simply removing the useless broken checks. + +Signed-off-by: Jim Somerville +Signed-off-by: Paul Gortmaker +Cc: Jerome Marchand +Cc: John McCutchan +Cc: Robert Love +Cc: Eric Paris +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + fs/notify/inotify/inotify_user.c | 4 ---- + 1 file changed, 4 deletions(-) + +--- a/fs/notify/inotify/inotify_user.c ++++ b/fs/notify/inotify/inotify_user.c +@@ -576,8 +576,6 @@ static int inotify_update_existing_watch + + /* don't allow invalid bits: we don't want flags set */ + mask = inotify_arg_to_mask(arg); +- if (unlikely(!(mask & IN_ALL_EVENTS))) +- return -EINVAL; + + fsn_mark = fsnotify_find_inode_mark(group, inode); + if (!fsn_mark) +@@ -629,8 +627,6 @@ static int inotify_new_watch(struct fsno + + /* don't allow invalid bits: we don't want flags set */ + mask = inotify_arg_to_mask(arg); +- if (unlikely(!(mask & IN_ALL_EVENTS))) +- return -EINVAL; + + tmp_i_mark = kmem_cache_alloc(inotify_inode_mark_cachep, GFP_KERNEL); + if (unlikely(!tmp_i_mark)) diff --git
a/queue-3.8/keys-revert-one-application-of-fix-unreachable-code-patch.patch b/queue-3.8/keys-revert-one-application-of-fix-unreachable-code-patch.patch new file mode 100644 index 00000000000..342770d33b3 --- /dev/null +++ b/queue-3.8/keys-revert-one-application-of-fix-unreachable-code-patch.patch @@ -0,0 +1,48 @@ +From fe9453a1dcb5fb146f9653267e78f4a558066f6f Mon Sep 17 00:00:00 2001 +From: David Howells +Date: Thu, 21 Feb 2013 12:00:25 +0000 +Subject: KEYS: Revert one application of "Fix unreachable code" patch + +From: David Howells + +commit fe9453a1dcb5fb146f9653267e78f4a558066f6f upstream. + +A patch to fix some unreachable code in search_my_process_keyrings() got +applied twice by two different routes upstream as commits e67eab39bee2 +and b010520ab3d2 (both "fix unreachable code"). + +Unfortunately, the second application removed something it shouldn't +have and this wasn't detected by GIT. This is due to the patch not +having sufficient lines of context to distinguish the two places of +application. + +The effect of this is relatively minor: inside the kernel, the keyring +search routines may search multiple keyrings and then prioritise the +errors if no keys or negative keys are found in any of them. With the +extra deletion, the presence of a negative key in the thread keyring +(causing ENOKEY) is incorrectly overridden by an error searching the +process keyring. + +So revert the second application of the patch. 
+ +Signed-off-by: David Howells +Cc: Jiri Kosina +Cc: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + security/keys/process_keys.c | 2 ++ + 1 file changed, 2 insertions(+) + +--- a/security/keys/process_keys.c ++++ b/security/keys/process_keys.c +@@ -367,6 +367,8 @@ key_ref_t search_my_process_keyrings(str + + switch (PTR_ERR(key_ref)) { + case -EAGAIN: /* no key */ ++ if (ret) ++ break; + case -ENOKEY: /* negative key */ + ret = key_ref; + break; diff --git a/queue-3.8/kvm-s390-handle-hosts-not-supporting-s390-virtio.patch b/queue-3.8/kvm-s390-handle-hosts-not-supporting-s390-virtio.patch new file mode 100644 index 00000000000..c6bc8b29220 --- /dev/null +++ b/queue-3.8/kvm-s390-handle-hosts-not-supporting-s390-virtio.patch @@ -0,0 +1,87 @@ +From 55c171a6d90dc0574021f9c836127cfd1a7d2e30 Mon Sep 17 00:00:00 2001 +From: Cornelia Huck +Date: Fri, 14 Dec 2012 17:02:16 +0100 +Subject: KVM: s390: Handle hosts not supporting s390-virtio. + +From: Cornelia Huck + +commit 55c171a6d90dc0574021f9c836127cfd1a7d2e30 upstream. + +Running under a kvm host does not necessarily imply the presence of +a page mapped above the main memory with the virtio information; +however, the code includes a hard coded access to that page. + +Instead, check for the presence of the page and exit gracefully +before we hit an addressing exception if it does not exist. + +Reviewed-by: Marcelo Tosatti +Reviewed-by: Alexander Graf +Signed-off-by: Cornelia Huck +Signed-off-by: Gleb Natapov +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/s390/kvm/kvm_virtio.c | 38 ++++++++++++++++++++++++++++++-------- + 1 file changed, 30 insertions(+), 8 deletions(-) + +--- a/drivers/s390/kvm/kvm_virtio.c ++++ b/drivers/s390/kvm/kvm_virtio.c +@@ -422,6 +422,26 @@ static void kvm_extint_handler(struct ex + } + + /* ++ * For s390-virtio, we expect a page above main storage containing ++ * the virtio configuration. 
Try to actually load from this area ++ * in order to figure out if the host provides this page. ++ */ ++static int __init test_devices_support(unsigned long addr) ++{ ++ int ret = -EIO; ++ ++ asm volatile( ++ "0: lura 0,%1\n" ++ "1: xgr %0,%0\n" ++ "2:\n" ++ EX_TABLE(0b,2b) ++ EX_TABLE(1b,2b) ++ : "+d" (ret) ++ : "a" (addr) ++ : "0", "cc"); ++ return ret; ++} ++/* + * Init function for virtio + * devices are in a single page above top of "normal" mem + */ +@@ -432,21 +452,23 @@ static int __init kvm_devices_init(void) + if (!MACHINE_IS_KVM) + return -ENODEV; + ++ if (test_devices_support(real_memory_size) < 0) ++ return -ENODEV; ++ ++ rc = vmem_add_mapping(real_memory_size, PAGE_SIZE); ++ if (rc) ++ return rc; ++ ++ kvm_devices = (void *) real_memory_size; ++ + kvm_root = root_device_register("kvm_s390"); + if (IS_ERR(kvm_root)) { + rc = PTR_ERR(kvm_root); + printk(KERN_ERR "Could not register kvm_s390 root device"); ++ vmem_remove_mapping(real_memory_size, PAGE_SIZE); + return rc; + } + +- rc = vmem_add_mapping(real_memory_size, PAGE_SIZE); +- if (rc) { +- root_device_unregister(kvm_root); +- return rc; +- } +- +- kvm_devices = (void *) real_memory_size; +- + INIT_WORK(&hotplug_work, hotplug_devices); + + service_subclass_irq_register(); diff --git a/queue-3.8/mm-fadvise.c-drain-all-pagevecs-if-posix_fadv_dontneed-fails-to-discard-all-pages.patch b/queue-3.8/mm-fadvise.c-drain-all-pagevecs-if-posix_fadv_dontneed-fails-to-discard-all-pages.patch new file mode 100644 index 00000000000..248ddce08ff --- /dev/null +++ b/queue-3.8/mm-fadvise.c-drain-all-pagevecs-if-posix_fadv_dontneed-fails-to-discard-all-pages.patch @@ -0,0 +1,198 @@ +From 67d46b296a1ba1477c0df8ff3bc5e0167a0b0732 Mon Sep 17 00:00:00 2001 +From: Mel Gorman +Date: Fri, 22 Feb 2013 16:35:59 -0800 +Subject: mm/fadvise.c: drain all pagevecs if POSIX_FADV_DONTNEED fails to discard all pages + +From: Mel Gorman + +commit 67d46b296a1ba1477c0df8ff3bc5e0167a0b0732 upstream. 
+ +Rob van der Heij reported the following (paraphrased) on private mail. + + The scenario is that I want to avoid backups to fill up the page + cache and purge stuff that is more likely to be used again (this is + with s390x Linux on z/VM, so I don't give it as much memory that + we don't care anymore). So I have something with LD_PRELOAD that + intercepts the close() call (from tar, in this case) and issues + a posix_fadvise() just before closing the file. + + This mostly works, except for small files (less than 14 pages) + that remain in page cache after the fact. + +Unfortunately Rob has not had a chance to test this exact patch but the +test program below should be reproducing the problem he described. + +The issue is the per-cpu pagevecs for LRU additions. If the pages are +added by one CPU but fadvise() is called on another then the pages +remain resident as the invalidate_mapping_pages() only drains the local +pagevecs via its call to pagevec_release(). The user-visible effect is +that a program that uses fadvise() properly is not obeyed. + +A possible fix for this is to put the necessary smarts into +invalidate_mapping_pages() to globally drain the LRU pagevecs if a +pagevec page could not be discarded. The downside with this is that an +inode cache shrink would send a global IPI and memory pressure +potentially causing global IPI storms is very undesirable. + +Instead, this patch adds a check during fadvise(POSIX_FADV_DONTNEED) to +check if invalidate_mapping_pages() discarded all the requested pages. +If a subset of pages are discarded it drains the LRU pagevecs and tries +again. If the second attempt fails, it assumes it is due to the pages +being mapped, locked or dirty and does not care. With this patch, an +application using fadvise() correctly will be obeyed but there is a +downside that a malicious application can force the kernel to send +global IPIs and increase overhead. + +If accepted, I would like this to be considered as a -stable candidate.
+It's not an urgent issue but it's a system call that is not working as +advertised which is weak. + +The following test program demonstrates the problem. It should never +report that pages are still resident but will without this patch. It +assumes that CPU 0 and 1 exist. + +int main() { + int fd; + int pagesize = getpagesize(); + ssize_t written = 0, expected; + char *buf; + unsigned char *vec; + int resident, i; + cpu_set_t set; + + /* Prepare a buffer for writing */ + expected = FILESIZE_PAGES * pagesize; + buf = malloc(expected + 1); + if (buf == NULL) { + printf("ENOMEM\n"); + exit(EXIT_FAILURE); + } + buf[expected] = 0; + memset(buf, 'a', expected); + + /* Prepare the mincore vec */ + vec = malloc(FILESIZE_PAGES); + if (vec == NULL) { + printf("ENOMEM\n"); + exit(EXIT_FAILURE); + } + + /* Bind ourselves to CPU 0 */ + CPU_ZERO(&set); + CPU_SET(0, &set); + if (sched_setaffinity(getpid(), sizeof(set), &set) == -1) { + perror("sched_setaffinity"); + exit(EXIT_FAILURE); + } + + /* open file, unlink and write buffer */ + fd = open("fadvise-test-file", O_CREAT|O_EXCL|O_RDWR); + if (fd == -1) { + perror("open"); + exit(EXIT_FAILURE); + } + unlink("fadvise-test-file"); + while (written < expected) { + ssize_t this_write; + this_write = write(fd, buf + written, expected - written); + + if (this_write == -1) { + perror("write"); + exit(EXIT_FAILURE); + } + + written += this_write; + } + free(buf); + + /* + * Force ourselves to another CPU. 
If fadvise only flushes the local + * CPUs pagevecs then the fadvise will fail to discard all file pages + */ + CPU_ZERO(&set); + CPU_SET(1, &set); + if (sched_setaffinity(getpid(), sizeof(set), &set) == -1) { + perror("sched_setaffinity"); + exit(EXIT_FAILURE); + } + + /* sync and fadvise to discard the page cache */ + fsync(fd); + if (posix_fadvise(fd, 0, expected, POSIX_FADV_DONTNEED) == -1) { + perror("posix_fadvise"); + exit(EXIT_FAILURE); + } + + /* map the file and use mincore to see which parts of it are resident */ + buf = mmap(NULL, expected, PROT_READ, MAP_SHARED, fd, 0); + if (buf == NULL) { + perror("mmap"); + exit(EXIT_FAILURE); + } + if (mincore(buf, expected, vec) == -1) { + perror("mincore"); + exit(EXIT_FAILURE); + } + + /* Check residency */ + for (i = 0, resident = 0; i < FILESIZE_PAGES; i++) { + if (vec[i]) + resident++; + } + if (resident != 0) { + printf("Nr unexpected pages resident: %d\n", resident); + exit(EXIT_FAILURE); + } + + munmap(buf, expected); + close(fd); + free(vec); + exit(EXIT_SUCCESS); +} + +Signed-off-by: Mel Gorman +Reported-by: Rob van der Heij +Tested-by: Rob van der Heij +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + mm/fadvise.c | 18 ++++++++++++++++-- + 1 file changed, 16 insertions(+), 2 deletions(-) + +--- a/mm/fadvise.c ++++ b/mm/fadvise.c +@@ -17,6 +17,7 @@ + #include + #include + #include ++#include + + #include + +@@ -120,9 +121,22 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, lof + start_index = (offset+(PAGE_CACHE_SIZE-1)) >> PAGE_CACHE_SHIFT; + end_index = (endbyte >> PAGE_CACHE_SHIFT); + +- if (end_index >= start_index) +- invalidate_mapping_pages(mapping, start_index, ++ if (end_index >= start_index) { ++ unsigned long count = invalidate_mapping_pages(mapping, ++ start_index, end_index); ++ ++ /* ++ * If fewer pages were invalidated than expected then ++ * it is possible that some of the pages were on ++ * a per-cpu pagevec for a remote CPU. 
Drain all ++ * pagevecs and try again. ++ */ ++ if (count < (end_index - start_index + 1)) { ++ lru_add_drain_all(); ++ invalidate_mapping_pages(mapping, start_index, + end_index); ++ } ++ } + break; + default: + ret = -EINVAL; diff --git a/queue-3.8/nfsv4.1-don-t-decode-skipped-layoutgets.patch b/queue-3.8/nfsv4.1-don-t-decode-skipped-layoutgets.patch new file mode 100644 index 00000000000..8e60a76efb0 --- /dev/null +++ b/queue-3.8/nfsv4.1-don-t-decode-skipped-layoutgets.patch @@ -0,0 +1,71 @@ +From 085b7a45c63d3da5be155faab9249a5cab224561 Mon Sep 17 00:00:00 2001 +From: Weston Andros Adamson +Date: Fri, 15 Feb 2013 16:03:46 -0500 +Subject: NFSv4.1: Don't decode skipped layoutgets + +From: Weston Andros Adamson + +commit 085b7a45c63d3da5be155faab9249a5cab224561 upstream. + +layoutget's prepare hook can call rpc_exit with status = NFS4_OK (0). +Because of this, nfs4_proc_layoutget can't depend on a 0 status to mean +that the RPC was successfully sent, received and parsed. + +To fix this, use the result's len member to see if parsing took place. + +This fixes the following OOPS -- calling xdr_init_decode() with a buffer length +0 doesn't set the stream's 'p' member and ends up using uninitialized memory +in filelayout_decode_layout. 
+ +BUG: unable to handle kernel paging request at 0000000000008050 +IP: [] memcpy+0x18/0x120 +PGD 0 +Oops: 0000 [#1] SMP +last sysfs file: /sys/devices/pci0000:00/0000:00:11.0/0000:02:01.0/irq +CPU 1 +Modules linked in: nfs_layout_nfsv41_files nfs lockd fscache auth_rpcgss nfs_acl autofs4 sunrpc ipt_REJECT nf_conntrack_ipv4 nf_defrag_ipv4 iptable_filter ip_tables ip6t_REJECT nf_conntrack_ipv6 nf_defrag_ipv6 xt_state nf_conntrack ip6table_filter ip6_tables ipv6 dm_mirror dm_region_hash dm_log dm_mod ppdev parport_pc parport snd_ens1371 snd_rawmidi snd_ac97_codec ac97_bus snd_seq snd_seq_device snd_pcm snd_timer snd soundcore snd_page_alloc e1000 microcode vmware_balloon i2c_piix4 i2c_core sg shpchp ext4 mbcache jbd2 sr_mod cdrom sd_mod crc_t10dif pata_acpi ata_generic ata_piix mptspi mptscsih mptbase scsi_transport_spi [last unloaded: speedstep_lib] + +Pid: 1665, comm: flush-0:22 Not tainted 2.6.32-356-test-2 #2 VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform +RIP: 0010:[] [] memcpy+0x18/0x120 +RSP: 0018:ffff88003dfab588 EFLAGS: 00010206 +RAX: ffff88003dc42000 RBX: ffff88003dfab610 RCX: 0000000000000009 +RDX: 000000003f807ff0 RSI: 0000000000008050 RDI: ffff88003dc42000 +RBP: ffff88003dfab5b0 R08: 0000000000000000 R09: 0000000000000000 +R10: 0000000000000000 R11: 0000000000000080 R12: 0000000000000024 +R13: ffff88003dc42000 R14: ffff88003f808030 R15: ffff88003dfab6a0 +FS: 0000000000000000(0000) GS:ffff880003420000(0000) knlGS:0000000000000000 +CS: 0010 DS: 0018 ES: 0018 CR0: 000000008005003b +CR2: 0000000000008050 CR3: 000000003bc92000 CR4: 00000000001407e0 +DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 +DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 +Process flush-0:22 (pid: 1665, threadinfo ffff88003dfaa000, task ffff880037f77540) +Stack: +ffffffffa0398ac1 ffff8800397c5940 ffff88003dfab610 ffff88003dfab6a0 + ffff88003dfab5d0 ffff88003dfab680 ffffffffa01c150b ffffea0000d82e70 + 000000508116713b 
0000000000000000 0000000000000000 0000000000000000 +Call Trace: +[] ? xdr_inline_decode+0xb1/0x120 [sunrpc] +[] filelayout_decode_layout+0xeb/0x350 [nfs_layout_nfsv41_files] +[] filelayout_alloc_lseg+0x8c/0x3c0 [nfs_layout_nfsv41_files] +[] ? __wait_on_bit+0x7e/0x90 + +Signed-off-by: Weston Andros Adamson +Signed-off-by: Trond Myklebust +Signed-off-by: Greg Kroah-Hartman + +--- + fs/nfs/nfs4proc.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +--- a/fs/nfs/nfs4proc.c ++++ b/fs/nfs/nfs4proc.c +@@ -6142,7 +6142,8 @@ nfs4_proc_layoutget(struct nfs4_layoutge + status = nfs4_wait_for_completion_rpc_task(task); + if (status == 0) + status = task->tk_status; +- if (status == 0) ++ /* if layoutp->len is 0, nfs4_layoutget_prepare called rpc_exit */ ++ if (status == 0 && lgp->res.layoutp->len) + lseg = pnfs_layout_process(lgp); + rpc_put_task(task); + dprintk("<-- %s status=%d\n", __func__, status); diff --git a/queue-3.8/nfsv4.1-fix-an-abba-locking-issue-with-session-and-state-serialisation.patch b/queue-3.8/nfsv4.1-fix-an-abba-locking-issue-with-session-and-state-serialisation.patch new file mode 100644 index 00000000000..98be28a9d59 --- /dev/null +++ b/queue-3.8/nfsv4.1-fix-an-abba-locking-issue-with-session-and-state-serialisation.patch @@ -0,0 +1,125 @@ +From c8da19b9866ea84e9ad1c369393ea95d54ee7845 Mon Sep 17 00:00:00 2001 +From: Trond Myklebust +Date: Mon, 11 Feb 2013 19:01:21 -0500 +Subject: NFSv4.1: Fix an ABBA locking issue with session and state serialisation + +From: Trond Myklebust + +commit c8da19b9866ea84e9ad1c369393ea95d54ee7845 upstream. + +Ensure that if nfs_wait_on_sequence() causes our rpc task to wait for +an NFSv4 state serialisation lock, then we also drop the session slot. 
+ +Signed-off-by: Trond Myklebust +Signed-off-by: Greg Kroah-Hartman + +--- + fs/nfs/nfs4proc.c | 32 ++++++++++++++++++++------------ + 1 file changed, 20 insertions(+), 12 deletions(-) + +--- a/fs/nfs/nfs4proc.c ++++ b/fs/nfs/nfs4proc.c +@@ -1463,7 +1463,7 @@ static void nfs4_open_prepare(struct rpc + struct nfs4_state_owner *sp = data->owner; + + if (nfs_wait_on_sequence(data->o_arg.seqid, task) != 0) +- return; ++ goto out_wait; + /* + * Check if we still need to send an OPEN call, or if we can use + * a delegation instead. +@@ -1498,6 +1498,7 @@ unlock_no_action: + rcu_read_unlock(); + out_no_action: + task->tk_action = NULL; ++out_wait: + nfs4_sequence_done(task, &data->o_res.seq_res); + } + +@@ -2150,7 +2151,7 @@ static void nfs4_close_prepare(struct rp + + dprintk("%s: begin!\n", __func__); + if (nfs_wait_on_sequence(calldata->arg.seqid, task) != 0) +- return; ++ goto out_wait; + + task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_DOWNGRADE]; + calldata->arg.fmode = FMODE_READ|FMODE_WRITE; +@@ -2172,16 +2173,14 @@ static void nfs4_close_prepare(struct rp + + if (!call_close) { + /* Note: exit _without_ calling nfs4_close_done */ +- task->tk_action = NULL; +- nfs4_sequence_done(task, &calldata->res.seq_res); +- goto out; ++ goto out_no_action; + } + + if (calldata->arg.fmode == 0) { + task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLOSE]; + if (calldata->roc && + pnfs_roc_drain(inode, &calldata->roc_barrier, task)) +- goto out; ++ goto out_wait; + } + + nfs_fattr_init(calldata->res.fattr); +@@ -2191,8 +2190,12 @@ static void nfs4_close_prepare(struct rp + &calldata->res.seq_res, + task) != 0) + nfs_release_seqid(calldata->arg.seqid); +-out: + dprintk("%s: done!\n", __func__); ++ return; ++out_no_action: ++ task->tk_action = NULL; ++out_wait: ++ nfs4_sequence_done(task, &calldata->res.seq_res); + } + + static const struct rpc_call_ops nfs4_close_ops = { +@@ -4423,12 +4426,10 @@ static void nfs4_locku_prepare(struct rp + struct nfs4_unlockdata 
*calldata = data; + + if (nfs_wait_on_sequence(calldata->arg.seqid, task) != 0) +- return; ++ goto out_wait; + if (test_bit(NFS_LOCK_INITIALIZED, &calldata->lsp->ls_flags) == 0) { + /* Note: exit _without_ running nfs4_locku_done */ +- task->tk_action = NULL; +- nfs4_sequence_done(task, &calldata->res.seq_res); +- return; ++ goto out_no_action; + } + calldata->timestamp = jiffies; + if (nfs4_setup_sequence(calldata->server, +@@ -4436,6 +4437,11 @@ static void nfs4_locku_prepare(struct rp + &calldata->res.seq_res, + task) != 0) + nfs_release_seqid(calldata->arg.seqid); ++ return; ++out_no_action: ++ task->tk_action = NULL; ++out_wait: ++ nfs4_sequence_done(task, &calldata->res.seq_res); + } + + static const struct rpc_call_ops nfs4_locku_ops = { +@@ -4576,7 +4582,7 @@ static void nfs4_lock_prepare(struct rpc + + dprintk("%s: begin!\n", __func__); + if (nfs_wait_on_sequence(data->arg.lock_seqid, task) != 0) +- return; ++ goto out_wait; + /* Do we need to do an open_to_lock_owner? */ + if (!(data->arg.lock_seqid->sequence->flags & NFS_SEQID_CONFIRMED)) { + if (nfs_wait_on_sequence(data->arg.open_seqid, task) != 0) { +@@ -4596,6 +4602,8 @@ static void nfs4_lock_prepare(struct rpc + nfs_release_seqid(data->arg.open_seqid); + out_release_lock_seqid: + nfs_release_seqid(data->arg.lock_seqid); ++out_wait: ++ nfs4_sequence_done(task, &data->res.seq_res); + dprintk("%s: done!, ret = %d\n", __func__, data->rpc_status); + } + diff --git a/queue-3.8/nfsv4.1-fix-bulk-recall-and-destroy-of-layouts.patch b/queue-3.8/nfsv4.1-fix-bulk-recall-and-destroy-of-layouts.patch new file mode 100644 index 00000000000..aa39b44da6a --- /dev/null +++ b/queue-3.8/nfsv4.1-fix-bulk-recall-and-destroy-of-layouts.patch @@ -0,0 +1,304 @@ +From fd9a8d7160937f94aad36ac80d7255b4988740ac Mon Sep 17 00:00:00 2001 +From: Trond Myklebust +Date: Tue, 12 Feb 2013 09:48:42 -0500 +Subject: NFSv4.1: Fix bulk recall and destroy of layouts + +From: Trond Myklebust + +commit fd9a8d7160937f94aad36ac80d7255b4988740ac 
upstream. + +The current code in pnfs_destroy_all_layouts() assumes that removing +the layout from the server->layouts list is sufficient to make it +invisible to other processes. This ignores the fact that most +users access the layout through the nfs_inode->layout... +There is further breakage due to lack of reference counting of the +layouts, meaning that the whole thing Oopses at the drop of a hat. + +The code in initiate_bulk_draining() is almost correct, and can be +used as a model for pnfs_destroy_all_layouts(), so move that +code to pnfs.c, and refactor the code to allow us to choose between +a single filesystem bulk recall, and a recall of all layouts. +Also note that initiate_bulk_draining() currently calls iput() while +holding locks. Fix that too. + +Signed-off-by: Trond Myklebust +Signed-off-by: Greg Kroah-Hartman + +--- + fs/nfs/callback_proc.c | 61 ++----------------- + fs/nfs/pnfs.c | 150 ++++++++++++++++++++++++++++++++++++++++++------- + fs/nfs/pnfs.h | 7 +- + 3 files changed, 144 insertions(+), 74 deletions(-) + +--- a/fs/nfs/callback_proc.c ++++ b/fs/nfs/callback_proc.c +@@ -183,60 +183,15 @@ static u32 initiate_file_draining(struct + static u32 initiate_bulk_draining(struct nfs_client *clp, + struct cb_layoutrecallargs *args) + { +- struct nfs_server *server; +- struct pnfs_layout_hdr *lo; +- struct inode *ino; +- u32 rv = NFS4ERR_NOMATCHING_LAYOUT; +- struct pnfs_layout_hdr *tmp; +- LIST_HEAD(recall_list); +- LIST_HEAD(free_me_list); +- struct pnfs_layout_range range = { +- .iomode = IOMODE_ANY, +- .offset = 0, +- .length = NFS4_MAX_UINT64, +- }; ++ int stat; + +- spin_lock(&clp->cl_lock); +- rcu_read_lock(); +- list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { +- if ((args->cbl_recall_type == RETURN_FSID) && +- memcmp(&server->fsid, &args->cbl_fsid, +- sizeof(struct nfs_fsid))) +- continue; +- +- list_for_each_entry(lo, &server->layouts, plh_layouts) { +- ino = igrab(lo->plh_inode); +- if (!ino) +- continue; +- 
spin_lock(&ino->i_lock); +- /* Is this layout in the process of being freed? */ +- if (NFS_I(ino)->layout != lo) { +- spin_unlock(&ino->i_lock); +- iput(ino); +- continue; +- } +- pnfs_get_layout_hdr(lo); +- spin_unlock(&ino->i_lock); +- list_add(&lo->plh_bulk_recall, &recall_list); +- } +- } +- rcu_read_unlock(); +- spin_unlock(&clp->cl_lock); +- +- list_for_each_entry_safe(lo, tmp, +- &recall_list, plh_bulk_recall) { +- ino = lo->plh_inode; +- spin_lock(&ino->i_lock); +- set_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags); +- if (pnfs_mark_matching_lsegs_invalid(lo, &free_me_list, &range)) +- rv = NFS4ERR_DELAY; +- list_del_init(&lo->plh_bulk_recall); +- spin_unlock(&ino->i_lock); +- pnfs_free_lseg_list(&free_me_list); +- pnfs_put_layout_hdr(lo); +- iput(ino); +- } +- return rv; ++ if (args->cbl_recall_type == RETURN_FSID) ++ stat = pnfs_destroy_layouts_byfsid(clp, &args->cbl_fsid, true); ++ else ++ stat = pnfs_destroy_layouts_byclid(clp, true); ++ if (stat != 0) ++ return NFS4ERR_DELAY; ++ return NFS4ERR_NOMATCHING_LAYOUT; + } + + static u32 do_callback_layoutrecall(struct nfs_client *clp, +--- a/fs/nfs/pnfs.c ++++ b/fs/nfs/pnfs.c +@@ -505,37 +505,147 @@ pnfs_destroy_layout(struct nfs_inode *nf + } + EXPORT_SYMBOL_GPL(pnfs_destroy_layout); + +-/* +- * Called by the state manger to remove all layouts established under an +- * expired lease. 
+- */ +-void +-pnfs_destroy_all_layouts(struct nfs_client *clp) ++static bool ++pnfs_layout_add_bulk_destroy_list(struct inode *inode, ++ struct list_head *layout_list) + { +- struct nfs_server *server; + struct pnfs_layout_hdr *lo; +- LIST_HEAD(tmp_list); ++ bool ret = false; + +- nfs4_deviceid_mark_client_invalid(clp); +- nfs4_deviceid_purge_client(clp); ++ spin_lock(&inode->i_lock); ++ lo = NFS_I(inode)->layout; ++ if (lo != NULL && list_empty(&lo->plh_bulk_destroy)) { ++ pnfs_get_layout_hdr(lo); ++ list_add(&lo->plh_bulk_destroy, layout_list); ++ ret = true; ++ } ++ spin_unlock(&inode->i_lock); ++ return ret; ++} ++ ++/* Caller must hold rcu_read_lock and clp->cl_lock */ ++static int ++pnfs_layout_bulk_destroy_byserver_locked(struct nfs_client *clp, ++ struct nfs_server *server, ++ struct list_head *layout_list) ++{ ++ struct pnfs_layout_hdr *lo, *next; ++ struct inode *inode; ++ ++ list_for_each_entry_safe(lo, next, &server->layouts, plh_layouts) { ++ inode = igrab(lo->plh_inode); ++ if (inode == NULL) ++ continue; ++ list_del_init(&lo->plh_layouts); ++ if (pnfs_layout_add_bulk_destroy_list(inode, layout_list)) ++ continue; ++ rcu_read_unlock(); ++ spin_unlock(&clp->cl_lock); ++ iput(inode); ++ spin_lock(&clp->cl_lock); ++ rcu_read_lock(); ++ return -EAGAIN; ++ } ++ return 0; ++} ++ ++static int ++pnfs_layout_free_bulk_destroy_list(struct list_head *layout_list, ++ bool is_bulk_recall) ++{ ++ struct pnfs_layout_hdr *lo; ++ struct inode *inode; ++ struct pnfs_layout_range range = { ++ .iomode = IOMODE_ANY, ++ .offset = 0, ++ .length = NFS4_MAX_UINT64, ++ }; ++ LIST_HEAD(lseg_list); ++ int ret = 0; ++ ++ while (!list_empty(layout_list)) { ++ lo = list_entry(layout_list->next, struct pnfs_layout_hdr, ++ plh_bulk_destroy); ++ dprintk("%s freeing layout for inode %lu\n", __func__, ++ lo->plh_inode->i_ino); ++ inode = lo->plh_inode; ++ spin_lock(&inode->i_lock); ++ list_del_init(&lo->plh_bulk_destroy); ++ lo->plh_block_lgets++; /* permanently block new LAYOUTGETs */ 
++ if (is_bulk_recall) ++ set_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags); ++ if (pnfs_mark_matching_lsegs_invalid(lo, &lseg_list, &range)) ++ ret = -EAGAIN; ++ spin_unlock(&inode->i_lock); ++ pnfs_free_lseg_list(&lseg_list); ++ pnfs_put_layout_hdr(lo); ++ iput(inode); ++ } ++ return ret; ++} ++ ++int ++pnfs_destroy_layouts_byfsid(struct nfs_client *clp, ++ struct nfs_fsid *fsid, ++ bool is_recall) ++{ ++ struct nfs_server *server; ++ LIST_HEAD(layout_list); + + spin_lock(&clp->cl_lock); + rcu_read_lock(); ++restart: + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { +- if (!list_empty(&server->layouts)) +- list_splice_init(&server->layouts, &tmp_list); ++ if (memcmp(&server->fsid, fsid, sizeof(*fsid)) != 0) ++ continue; ++ if (pnfs_layout_bulk_destroy_byserver_locked(clp, ++ server, ++ &layout_list) != 0) ++ goto restart; + } + rcu_read_unlock(); + spin_unlock(&clp->cl_lock); + +- while (!list_empty(&tmp_list)) { +- lo = list_entry(tmp_list.next, struct pnfs_layout_hdr, +- plh_layouts); +- dprintk("%s freeing layout for inode %lu\n", __func__, +- lo->plh_inode->i_ino); +- list_del_init(&lo->plh_layouts); +- pnfs_destroy_layout(NFS_I(lo->plh_inode)); ++ if (list_empty(&layout_list)) ++ return 0; ++ return pnfs_layout_free_bulk_destroy_list(&layout_list, is_recall); ++} ++ ++int ++pnfs_destroy_layouts_byclid(struct nfs_client *clp, ++ bool is_recall) ++{ ++ struct nfs_server *server; ++ LIST_HEAD(layout_list); ++ ++ spin_lock(&clp->cl_lock); ++ rcu_read_lock(); ++restart: ++ list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { ++ if (pnfs_layout_bulk_destroy_byserver_locked(clp, ++ server, ++ &layout_list) != 0) ++ goto restart; + } ++ rcu_read_unlock(); ++ spin_unlock(&clp->cl_lock); ++ ++ if (list_empty(&layout_list)) ++ return 0; ++ return pnfs_layout_free_bulk_destroy_list(&layout_list, is_recall); ++} ++ ++/* ++ * Called by the state manger to remove all layouts established under an ++ * expired lease. 
++ */ ++void ++pnfs_destroy_all_layouts(struct nfs_client *clp) ++{ ++ nfs4_deviceid_mark_client_invalid(clp); ++ nfs4_deviceid_purge_client(clp); ++ ++ pnfs_destroy_layouts_byclid(clp, false); + } + + /* +@@ -888,7 +998,7 @@ alloc_init_layout_hdr(struct inode *ino, + atomic_set(&lo->plh_refcount, 1); + INIT_LIST_HEAD(&lo->plh_layouts); + INIT_LIST_HEAD(&lo->plh_segs); +- INIT_LIST_HEAD(&lo->plh_bulk_recall); ++ INIT_LIST_HEAD(&lo->plh_bulk_destroy); + lo->plh_inode = ino; + lo->plh_lc_cred = get_rpccred(ctx->state->owner->so_cred); + return lo; +--- a/fs/nfs/pnfs.h ++++ b/fs/nfs/pnfs.h +@@ -132,7 +132,7 @@ struct pnfs_layoutdriver_type { + struct pnfs_layout_hdr { + atomic_t plh_refcount; + struct list_head plh_layouts; /* other client layouts */ +- struct list_head plh_bulk_recall; /* clnt list of bulk recalls */ ++ struct list_head plh_bulk_destroy; + struct list_head plh_segs; /* layout segments list */ + nfs4_stateid plh_stateid; + atomic_t plh_outstanding; /* number of RPCs out */ +@@ -196,6 +196,11 @@ struct pnfs_layout_segment *pnfs_layout_ + void pnfs_free_lseg_list(struct list_head *tmp_list); + void pnfs_destroy_layout(struct nfs_inode *); + void pnfs_destroy_all_layouts(struct nfs_client *); ++int pnfs_destroy_layouts_byfsid(struct nfs_client *clp, ++ struct nfs_fsid *fsid, ++ bool is_recall); ++int pnfs_destroy_layouts_byclid(struct nfs_client *clp, ++ bool is_recall); + void pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo); + void pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, + const nfs4_stateid *new, diff --git a/queue-3.8/nlm-ensure-that-we-resend-all-pending-blocking-locks-after-a-reclaim.patch b/queue-3.8/nlm-ensure-that-we-resend-all-pending-blocking-locks-after-a-reclaim.patch new file mode 100644 index 00000000000..800b28f557a --- /dev/null +++ b/queue-3.8/nlm-ensure-that-we-resend-all-pending-blocking-locks-after-a-reclaim.patch @@ -0,0 +1,36 @@ +From 666b3d803a511fbc9bc5e5ea8ce66010cf03ea13 Mon Sep 17 00:00:00 2001 +From: Trond 
Myklebust +Date: Tue, 19 Feb 2013 12:04:42 -0500 +Subject: NLM: Ensure that we resend all pending blocking locks after a reclaim + +From: Trond Myklebust + +commit 666b3d803a511fbc9bc5e5ea8ce66010cf03ea13 upstream. + +Currently, nlmclnt_lock will break out of the for(;;) loop when +the reclaimer wakes up the blocking lock thread by setting +nlm_lck_denied_grace_period. This causes the lock request to fail +with an ENOLCK error. +The intention was always to ensure that we resend the lock request +after the grace period has expired. + +Reported-by: Wangyuan Zhang +Signed-off-by: Trond Myklebust +Signed-off-by: Greg Kroah-Hartman + +--- + fs/lockd/clntproc.c | 3 +++ + 1 file changed, 3 insertions(+) + +--- a/fs/lockd/clntproc.c ++++ b/fs/lockd/clntproc.c +@@ -550,6 +550,9 @@ again: + status = nlmclnt_block(block, req, NLMCLNT_POLL_TIMEOUT); + if (status < 0) + break; ++ /* Resend the blocking lock request after a server reboot */ ++ if (resp->status == nlm_lck_denied_grace_period) ++ continue; + if (resp->status != nlm_lck_blocked) + break; + } diff --git a/queue-3.8/ocfs2-unlock-super-lock-if-lockres-refresh-failed.patch b/queue-3.8/ocfs2-unlock-super-lock-if-lockres-refresh-failed.patch new file mode 100644 index 00000000000..468458ead10 --- /dev/null +++ b/queue-3.8/ocfs2-unlock-super-lock-if-lockres-refresh-failed.patch @@ -0,0 +1,45 @@ +From 3278bb748d2437eb1464765f36429e5d6aa91c38 Mon Sep 17 00:00:00 2001 +From: Junxiao Bi +Date: Thu, 21 Feb 2013 16:42:45 -0800 +Subject: ocfs2: unlock super lock if lockres refresh failed + +From: Junxiao Bi + +commit 3278bb748d2437eb1464765f36429e5d6aa91c38 upstream. + +If lockres refresh failed, the super lock will never be released which +will cause some processes on other cluster nodes hung forever. 
+ +Signed-off-by: Junxiao Bi +Cc: Joel Becker +Cc: Mark Fasheh +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + fs/ocfs2/dlmglue.c | 5 ++++- + 1 file changed, 4 insertions(+), 1 deletion(-) + +--- a/fs/ocfs2/dlmglue.c ++++ b/fs/ocfs2/dlmglue.c +@@ -2545,6 +2545,7 @@ int ocfs2_super_lock(struct ocfs2_super + * everything is up to the caller :) */ + status = ocfs2_should_refresh_lock_res(lockres); + if (status < 0) { ++ ocfs2_cluster_unlock(osb, lockres, level); + mlog_errno(status); + goto bail; + } +@@ -2553,8 +2554,10 @@ int ocfs2_super_lock(struct ocfs2_super + + ocfs2_complete_lock_res_refresh(lockres, status); + +- if (status < 0) ++ if (status < 0) { ++ ocfs2_cluster_unlock(osb, lockres, level); + mlog_errno(status); ++ } + ocfs2_track_lock_refresh(lockres); + } + bail: diff --git a/queue-3.8/s390-kvm-fix-store-status-for-acrs-fprs.patch b/queue-3.8/s390-kvm-fix-store-status-for-acrs-fprs.patch new file mode 100644 index 00000000000..f6d0bb98c6e --- /dev/null +++ b/queue-3.8/s390-kvm-fix-store-status-for-acrs-fprs.patch @@ -0,0 +1,49 @@ +From 15bc8d8457875f495c59d933b05770ba88d1eacb Mon Sep 17 00:00:00 2001 +From: Christian Borntraeger +Date: Fri, 25 Jan 2013 15:34:15 +0100 +Subject: s390/kvm: Fix store status for ACRS/FPRS + +From: Christian Borntraeger + +commit 15bc8d8457875f495c59d933b05770ba88d1eacb upstream. + +On store status we need to copy the current state of registers +into a save area. Currently we might save stale versions: +The sie state descriptor doesnt have fields for guest ACRS,FPRS, +those registers are simply stored in the host registers. The host +program must copy these away if needed. We do that in vcpu_put/load. + +If we now do a store status in KVM code between vcpu_put/load, the +saved values are not up-to-date. Lets collect the ACRS/FPRS before +saving them. 
+ +This also fixes some strange problems with hotplug and virtio-ccw, +since the low level machine check handler (on hotplug a machine check +will happen) will revalidate all registers with the content of the +save area. + +Signed-off-by: Christian Borntraeger +Signed-off-by: Gleb Natapov +Signed-off-by: Greg Kroah-Hartman + +--- + arch/s390/kvm/kvm-s390.c | 8 ++++++++ + 1 file changed, 8 insertions(+) + +--- a/arch/s390/kvm/kvm-s390.c ++++ b/arch/s390/kvm/kvm-s390.c +@@ -766,6 +766,14 @@ int kvm_s390_vcpu_store_status(struct kv + } else + prefix = 0; + ++ /* ++ * The guest FPRS and ACRS are in the host FPRS/ACRS due to the lazy ++ * copying in vcpu load/put. Lets update our copies before we save ++ * it into the save area ++ */ ++ save_fp_regs(&vcpu->arch.guest_fpregs); ++ save_access_regs(vcpu->run->s.regs.acrs); ++ + if (__guestcopy(vcpu, addr + offsetof(struct save_area, fp_regs), + vcpu->arch.guest_fpregs.fprs, 128, prefix)) + return -EFAULT; diff --git a/queue-3.8/series b/queue-3.8/series index 4cd591083cb..2bba3092c0e 100644 --- a/queue-3.8/series +++ b/queue-3.8/series @@ -30,3 +30,19 @@ pps-fix-a-use-after-free-bug-when-unregistering-a-source.patch zram-fix-deadlock-bug-in-partial-read-write.patch driver-core-treat-unregistered-bus_types-as-having-no-devices.patch mmu_notifier_unregister-null-pointer-deref-and-multiple-release-callouts.patch +kvm-s390-handle-hosts-not-supporting-s390-virtio.patch +s390-kvm-fix-store-status-for-acrs-fprs.patch +futex-revert-futex-mark-get_robust_list-as-deprecated.patch +inotify-remove-broken-mask-checks-causing-unmount-to-be-einval.patch +fs-block_dev.c-page-cache-wrongly-left-invalidated-after-revalidate_disk.patch +ocfs2-unlock-super-lock-if-lockres-refresh-failed.patch +drivers-video-backlight-adp88-0_bl.c-fix-resume.patch +keys-revert-one-application-of-fix-unreachable-code-patch.patch +tmpfs-fix-use-after-free-of-mempolicy-object.patch 
+mm-fadvise.c-drain-all-pagevecs-if-posix_fadv_dontneed-fails-to-discard-all-pages.patch +drivercore-fix-ordering-between-deferred_probe-and-exiting-initcalls.patch +umount-oops-when-remove-blocklayoutdriver-first.patch +nlm-ensure-that-we-resend-all-pending-blocking-locks-after-a-reclaim.patch +nfsv4.1-fix-an-abba-locking-issue-with-session-and-state-serialisation.patch +nfsv4.1-fix-bulk-recall-and-destroy-of-layouts.patch +nfsv4.1-don-t-decode-skipped-layoutgets.patch diff --git a/queue-3.8/tmpfs-fix-use-after-free-of-mempolicy-object.patch b/queue-3.8/tmpfs-fix-use-after-free-of-mempolicy-object.patch new file mode 100644 index 00000000000..62ee4548792 --- /dev/null +++ b/queue-3.8/tmpfs-fix-use-after-free-of-mempolicy-object.patch @@ -0,0 +1,107 @@ +From 5f00110f7273f9ff04ac69a5f85bb535a4fd0987 Mon Sep 17 00:00:00 2001 +From: Greg Thelen +Date: Fri, 22 Feb 2013 16:36:01 -0800 +Subject: tmpfs: fix use-after-free of mempolicy object + +From: Greg Thelen + +commit 5f00110f7273f9ff04ac69a5f85bb535a4fd0987 upstream. + +The tmpfs remount logic preserves filesystem mempolicy if the mpol=M +option is not specified in the remount request. A new policy can be +specified if mpol=M is given. + +Before this patch remounting an mpol bound tmpfs without specifying +mpol= mount option in the remount request would set the filesystem's +mempolicy object to a freed mempolicy object. + +To reproduce the problem boot a DEBUG_PAGEALLOC kernel and run: + # mkdir /tmp/x + + # mount -t tmpfs -o size=100M,mpol=interleave nodev /tmp/x + + # grep /tmp/x /proc/mounts + nodev /tmp/x tmpfs rw,relatime,size=102400k,mpol=interleave:0-3 0 0 + + # mount -o remount,size=200M nodev /tmp/x + + # grep /tmp/x /proc/mounts + nodev /tmp/x tmpfs rw,relatime,size=204800k,mpol=??? 0 0 + # note ? garbage in mpol=... output above + + # dd if=/dev/zero of=/tmp/x/f count=1 + # panic here + +Panic: + BUG: unable to handle kernel NULL pointer dereference at (null) + IP: [< (null)>] (null) + [...] 
+ Oops: 0010 [#1] SMP DEBUG_PAGEALLOC + Call Trace: + mpol_shared_policy_init+0xa5/0x160 + shmem_get_inode+0x209/0x270 + shmem_mknod+0x3e/0xf0 + shmem_create+0x18/0x20 + vfs_create+0xb5/0x130 + do_last+0x9a1/0xea0 + path_openat+0xb3/0x4d0 + do_filp_open+0x42/0xa0 + do_sys_open+0xfe/0x1e0 + compat_sys_open+0x1b/0x20 + cstar_dispatch+0x7/0x1f + +Non-debug kernels will not crash immediately because referencing the +dangling mpol will not cause a fault. Instead the filesystem will +reference a freed mempolicy object, which will cause unpredictable +behavior. + +The problem boils down to a dropped mpol reference below if +shmem_parse_options() does not allocate a new mpol: + + config = *sbinfo + shmem_parse_options(data, &config, true) + mpol_put(sbinfo->mpol) + sbinfo->mpol = config.mpol /* BUG: saves unreferenced mpol */ + +This patch avoids the crash by not releasing the mempolicy if +shmem_parse_options() doesn't create a new mpol. + +How far back does this issue go? I see it in both 2.6.36 and 3.3. I did +not look back further. + +Signed-off-by: Greg Thelen +Acked-by: Hugh Dickins +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + mm/shmem.c | 10 ++++++++-- + 1 file changed, 8 insertions(+), 2 deletions(-) + +--- a/mm/shmem.c ++++ b/mm/shmem.c +@@ -2487,6 +2487,7 @@ static int shmem_remount_fs(struct super + unsigned long inodes; + int error = -EINVAL; + ++ config.mpol = NULL; + if (shmem_parse_options(data, &config, true)) + return error; + +@@ -2511,8 +2512,13 @@ static int shmem_remount_fs(struct super + sbinfo->max_inodes = config.max_inodes; + sbinfo->free_inodes = config.max_inodes - inodes; + +- mpol_put(sbinfo->mpol); +- sbinfo->mpol = config.mpol; /* transfers initial ref */ ++ /* ++ * Preserve previous mempolicy unless mpol remount option was specified. 
++ */ ++ if (config.mpol) { ++ mpol_put(sbinfo->mpol); ++ sbinfo->mpol = config.mpol; /* transfers initial ref */ ++ } + out: + spin_unlock(&sbinfo->stat_lock); + return error; diff --git a/queue-3.8/umount-oops-when-remove-blocklayoutdriver-first.patch b/queue-3.8/umount-oops-when-remove-blocklayoutdriver-first.patch new file mode 100644 index 00000000000..596687e17db --- /dev/null +++ b/queue-3.8/umount-oops-when-remove-blocklayoutdriver-first.patch @@ -0,0 +1,85 @@ +From 5a12cca697aca5dfba42a7d4c3356acc0445a2b0 Mon Sep 17 00:00:00 2001 +From: fanchaoting +Date: Mon, 4 Feb 2013 21:15:02 +0800 +Subject: umount oops when remove blocklayoutdriver first + +From: fanchaoting + +commit 5a12cca697aca5dfba42a7d4c3356acc0445a2b0 upstream. + +now pnfs client uses block layout, maybe we can remove +blocklayoutdriver first. if we umount later, +it can cause oops in unset_pnfs_layoutdriver. +because nfss->pnfs_curr_ld->clear_layoutdriver is invalid. + +reproduce it: + modprobe blocklayoutdriver + mount -t nfs4 -o minorversion=1 pnfsip:/ /mnt/ + rmmod blocklayoutdriver + umount /mnt + +then you can see following + +CPU 0 +Pid: 17023, comm: umount.nfs4 Tainted: GF O 3.7.0-rc6-pnfs #1 VMware, Inc. 
VMware Virtual Platform/440BX Desktop Reference Platform +RIP: 0010:[] [] unset_pnfs_layoutdriver+0x1d/0x70 [nfsv4] +RSP: 0018:ffff8800022d9e48 EFLAGS: 00010286 +RAX: ffffffffa04a1b00 RBX: ffff88000b013800 RCX: 0000000000000001 +RDX: ffffffff81ae8ee0 RSI: ffff880001ee94b8 RDI: ffff88000b013800 +RBP: ffff8800022d9e58 R08: 0000000000000001 R09: 0000000000000000 +R10: 0000000000000000 R11: 0000000000000000 R12: ffff880001ee9400 +R13: ffff8800105978c0 R14: 00007fff25846c08 R15: 0000000001bba550 +FS: 00007f45ae7f0700(0000) GS:ffff880012c00000(0000) knlGS:0000000000000000 +CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b +CR2: ffffffffa04a1b38 CR3: 0000000002c0c000 CR4: 00000000000006f0 +DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 +DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 +Process umount.nfs4 (pid: 17023, threadinfo ffff8800022d8000, task ffff880006e48aa0) +Stack: +ffff8800105978c0 ffff88000b013800 ffff8800022d9e78 ffffffffa04cd0ce +ffff8800022d9e78 ffff88000b013800 ffff8800022d9ea8 ffffffffa04755a7 +ffff8800022d9ea8 ffff880002f96400 ffff88000b013800 ffff880002f96400 +Call Trace: +[] nfs4_destroy_server+0x1e/0x30 [nfsv4] +[] nfs_free_server+0xb7/0x150 [nfs] +[] nfs_kill_super+0x35/0x40 [nfs] +[] deactivate_locked_super+0x45/0x70 +[] deactivate_super+0x4a/0x70 +[] mntput_no_expire+0xd2/0x130 +[] sys_umount+0x72/0xe0 +[] system_call_fastpath+0x16/0x1b +Code: 06 e1 b8 ea ff ff ff eb 9e 0f 1f 44 00 00 55 48 89 e5 53 48 83 ec 08 66 66 66 66 90 48 8b 87 80 03 00 00 48 89 fb 48 85 c0 74 29 <48> 8b 40 38 48 85 c0 74 02 ff d0 48 8b 03 3e ff 48 04 0f 94 c2 +RIP [] unset_pnfs_layoutdriver+0x1d/0x70 [nfsv4] +RSP +CR2: ffffffffa04a1b38 +---[ end trace 29f75aaedda058bf ]--- + +Signed-off-by: fanchaoting +Signed-off-by: Trond Myklebust +Signed-off-by: Greg Kroah-Hartman + +--- + fs/nfs/blocklayout/blocklayout.c | 1 + + fs/nfs/objlayout/objio_osd.c | 1 + + 2 files changed, 2 insertions(+) + +--- a/fs/nfs/blocklayout/blocklayout.c ++++ 
b/fs/nfs/blocklayout/blocklayout.c +@@ -1273,6 +1273,7 @@ static const struct nfs_pageio_ops bl_pg + static struct pnfs_layoutdriver_type blocklayout_type = { + .id = LAYOUT_BLOCK_VOLUME, + .name = "LAYOUT_BLOCK_VOLUME", ++ .owner = THIS_MODULE, + .read_pagelist = bl_read_pagelist, + .write_pagelist = bl_write_pagelist, + .alloc_layout_hdr = bl_alloc_layout_hdr, +--- a/fs/nfs/objlayout/objio_osd.c ++++ b/fs/nfs/objlayout/objio_osd.c +@@ -647,6 +647,7 @@ static struct pnfs_layoutdriver_type obj + .flags = PNFS_LAYOUTRET_ON_SETATTR | + PNFS_LAYOUTRET_ON_ERROR, + ++ .owner = THIS_MODULE, + .alloc_layout_hdr = objlayout_alloc_layout_hdr, + .free_layout_hdr = objlayout_free_layout_hdr, +