From: Greg Kroah-Hartman Date: Wed, 17 May 2017 15:55:51 +0000 (+0200) Subject: 4.11-stable patches X-Git-Tag: v3.18.54~24 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=5032628cfd279b13990141f30dc06e648f208015;p=thirdparty%2Fkernel%2Fstable-queue.git 4.11-stable patches added patches: ceph-fix-memory-leak-in-__ceph_setxattr.patch cifs-add-misssing-sfm-mapping-for-doublequote.patch cifs-fix-cifs_enumerate_snapshots-oops.patch cifs-fix-cifs_ioc_get_mnt_info-oops.patch cifs-fix-leak-in-fsctl_enum_snaps-response-handling.patch cifs-fix-mapping-of-sfm_space-and-sfm_period.patch cifs-fix-oplock-break-deadlocks.patch do-not-return-number-of-bytes-written-for-ioctl-cifs_ioc_copychunk_file.patch ext4-evict-inline-data-when-writing-to-memory-map.patch fix-match_prepath.patch fs-block_dev-always-invalidate-cleancache-in-invalidate_bdev.patch fs-fix-data-invalidation-in-the-cleancache-during-direct-io.patch fs-xattr.c-zero-out-memory-copied-to-userspace-in-getxattr.patch ib-core-fix-kernel-crash-during-fail-to-initialize-device.patch ib-core-fix-sysfs-registration-error-flow.patch ib-core-for-multicast-functions-verify-that-lids-are-multicast-lids.patch ib-hfi1-prevent-kernel-qp-post-send-hard-lockups.patch ib-ipoib-ibx-failed-to-create-mcg-debug-file.patch ib-mlx4-fix-ib-device-initialization-error-flow.patch ib-mlx4-reduce-sriov-multicast-cleanup-warning-message-to-debug-level.patch iov_iter-don-t-revert-iov-buffer-if-csum-error.patch jbd2-fix-dbench4-performance-regression-for-nobarrier-mounts.patch md-raid1-avoid-reusing-a-resync-bio-after-error-handling.patch mm-prevent-potential-recursive-reclaim-due-to-clearing-pf_memalloc.patch mm-vmscan-fix-io-refault-regression-in-cache-workingset-transition.patch orangefs-clean-up-oversize-xattr-validation.patch orangefs-do-not-check-possibly-stale-size-on-truncate.patch orangefs-do-not-set-getattr_time-on-orangefs_lookup.patch orangefs-fix-bounds-check-for-listxattr.patch 
ovl-do-not-set-overlay.opaque-on-non-dir-create.patch padata-free-correct-variable.patch perf-annotate-s390-fix-perf-annotate-error-95-4.10-regression.patch perf-annotate-s390-implement-jump-types-for-perf-annotate.patch perf-auxtrace-fix-no_size-logic-in-addr_filter__resolve_kernel_syms.patch set-unicode-flag-on-cifs-echo-request-to-avoid-mac-error.patch smb3-work-around-mount-failure-when-using-smb3-dialect-to-macs.patch --- diff --git a/queue-4.11/ceph-fix-memory-leak-in-__ceph_setxattr.patch b/queue-4.11/ceph-fix-memory-leak-in-__ceph_setxattr.patch new file mode 100644 index 00000000000..0a83f277cd7 --- /dev/null +++ b/queue-4.11/ceph-fix-memory-leak-in-__ceph_setxattr.patch @@ -0,0 +1,71 @@ +From eeca958dce0a9231d1969f86196653eb50fcc9b3 Mon Sep 17 00:00:00 2001 +From: Luis Henriques +Date: Fri, 28 Apr 2017 11:14:04 +0100 +Subject: ceph: fix memory leak in __ceph_setxattr() + +From: Luis Henriques + +commit eeca958dce0a9231d1969f86196653eb50fcc9b3 upstream. + +The ceph_inode_xattr needs to be released when removing an xattr. Easily +reproducible running the 'generic/020' test from xfstests or simply by +doing: + + attr -s attr0 -V 0 /mnt/test && attr -r attr0 /mnt/test + +While there, also fix the error path. + +Here's the kmemleak splat: + +unreferenced object 0xffff88001f86fbc0 (size 64): + comm "attr", pid 244, jiffies 4294904246 (age 98.464s) + hex dump (first 32 bytes): + 40 fa 86 1f 00 88 ff ff 80 32 38 1f 00 88 ff ff @........28..... + 00 01 00 00 00 00 ad de 00 02 00 00 00 00 ad de ................ 
+ backtrace: + [] kmemleak_alloc+0x49/0xa0 + [] kmem_cache_alloc+0x9b/0xf0 + [] __ceph_setxattr+0x17e/0x820 + [] ceph_set_xattr_handler+0x37/0x40 + [] __vfs_removexattr+0x4b/0x60 + [] vfs_removexattr+0x77/0xd0 + [] removexattr+0x41/0x60 + [] path_removexattr+0x75/0xa0 + [] SyS_lremovexattr+0xb/0x10 + [] entry_SYSCALL_64_fastpath+0x13/0x94 + [] 0xffffffffffffffff + +Signed-off-by: Luis Henriques +Reviewed-by: "Yan, Zheng" +Signed-off-by: Ilya Dryomov +Signed-off-by: Greg Kroah-Hartman + +--- + fs/ceph/xattr.c | 3 +++ + 1 file changed, 3 insertions(+) + +--- a/fs/ceph/xattr.c ++++ b/fs/ceph/xattr.c +@@ -392,6 +392,7 @@ static int __set_xattr(struct ceph_inode + + if (update_xattr) { + int err = 0; ++ + if (xattr && (flags & XATTR_CREATE)) + err = -EEXIST; + else if (!xattr && (flags & XATTR_REPLACE)) +@@ -399,12 +400,14 @@ static int __set_xattr(struct ceph_inode + if (err) { + kfree(name); + kfree(val); ++ kfree(*newxattr); + return err; + } + if (update_xattr < 0) { + if (xattr) + __remove_xattr(ci, xattr); + kfree(name); ++ kfree(*newxattr); + return 0; + } + } diff --git a/queue-4.11/cifs-add-misssing-sfm-mapping-for-doublequote.patch b/queue-4.11/cifs-add-misssing-sfm-mapping-for-doublequote.patch new file mode 100644 index 00000000000..98675b14661 --- /dev/null +++ b/queue-4.11/cifs-add-misssing-sfm-mapping-for-doublequote.patch @@ -0,0 +1,54 @@ +From 85435d7a15294f9f7ef23469e6aaf7c5dfcc54f0 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Bj=C3=B6rn=20Jacke?= +Date: Fri, 5 May 2017 04:36:16 +0200 +Subject: CIFS: add misssing SFM mapping for doublequote + +From: Björn Jacke + +commit 85435d7a15294f9f7ef23469e6aaf7c5dfcc54f0 upstream. 
+ +SFM is mapping doublequote to 0xF020 + +Without this patch creating files with doublequote fails to Windows/Mac + +Signed-off-by: Bjoern Jacke +Signed-off-by: Steve French +Signed-off-by: Greg Kroah-Hartman + +--- + fs/cifs/cifs_unicode.c | 6 ++++++ + fs/cifs/cifs_unicode.h | 1 + + 2 files changed, 7 insertions(+) + +--- a/fs/cifs/cifs_unicode.c ++++ b/fs/cifs/cifs_unicode.c +@@ -83,6 +83,9 @@ convert_sfm_char(const __u16 src_char, c + case SFM_COLON: + *target = ':'; + break; ++ case SFM_DOUBLEQUOTE: ++ *target = '"'; ++ break; + case SFM_ASTERISK: + *target = '*'; + break; +@@ -418,6 +421,9 @@ static __le16 convert_to_sfm_char(char s + case ':': + dest_char = cpu_to_le16(SFM_COLON); + break; ++ case '"': ++ dest_char = cpu_to_le16(SFM_DOUBLEQUOTE); ++ break; + case '*': + dest_char = cpu_to_le16(SFM_ASTERISK); + break; +--- a/fs/cifs/cifs_unicode.h ++++ b/fs/cifs/cifs_unicode.h +@@ -57,6 +57,7 @@ + * not conflict (although almost does) with the mapping above. + */ + ++#define SFM_DOUBLEQUOTE ((__u16) 0xF020) + #define SFM_ASTERISK ((__u16) 0xF021) + #define SFM_QUESTION ((__u16) 0xF025) + #define SFM_COLON ((__u16) 0xF022) diff --git a/queue-4.11/cifs-fix-cifs_enumerate_snapshots-oops.patch b/queue-4.11/cifs-fix-cifs_enumerate_snapshots-oops.patch new file mode 100644 index 00000000000..3b8ab4b9706 --- /dev/null +++ b/queue-4.11/cifs-fix-cifs_enumerate_snapshots-oops.patch @@ -0,0 +1,33 @@ +From 6026685de33b0db5b2b6b0e9b41b3a1a3261033c Mon Sep 17 00:00:00 2001 +From: David Disseldorp +Date: Wed, 3 May 2017 17:39:08 +0200 +Subject: cifs: fix CIFS_ENUMERATE_SNAPSHOTS oops + +From: David Disseldorp + +commit 6026685de33b0db5b2b6b0e9b41b3a1a3261033c upstream. + +As with 618763958b22, an open directory may have a NULL private_data +pointer prior to readdir. CIFS_ENUMERATE_SNAPSHOTS must check for this +before dereference. 
+ +Fixes: 834170c85978 ("Enable previous version support") +Signed-off-by: David Disseldorp +Signed-off-by: Steve French +Signed-off-by: Greg Kroah-Hartman + +--- + fs/cifs/ioctl.c | 2 ++ + 1 file changed, 2 insertions(+) + +--- a/fs/cifs/ioctl.c ++++ b/fs/cifs/ioctl.c +@@ -213,6 +213,8 @@ long cifs_ioctl(struct file *filep, unsi + rc = smb_mnt_get_fsinfo(xid, tcon, (void __user *)arg); + break; + case CIFS_ENUMERATE_SNAPSHOTS: ++ if (pSMBFile == NULL) ++ break; + if (arg == 0) { + rc = -EINVAL; + goto cifs_ioc_exit; diff --git a/queue-4.11/cifs-fix-cifs_ioc_get_mnt_info-oops.patch b/queue-4.11/cifs-fix-cifs_ioc_get_mnt_info-oops.patch new file mode 100644 index 00000000000..5c16e8698b9 --- /dev/null +++ b/queue-4.11/cifs-fix-cifs_ioc_get_mnt_info-oops.patch @@ -0,0 +1,31 @@ +From d8a6e505d6bba2250852fbc1c1c86fe68aaf9af3 Mon Sep 17 00:00:00 2001 +From: David Disseldorp +Date: Thu, 4 May 2017 00:41:13 +0200 +Subject: cifs: fix CIFS_IOC_GET_MNT_INFO oops + +From: David Disseldorp + +commit d8a6e505d6bba2250852fbc1c1c86fe68aaf9af3 upstream. + +An open directory may have a NULL private_data pointer prior to readdir. 
+ +Fixes: 0de1f4c6f6c0 ("Add way to query server fs info for smb3") +Signed-off-by: David Disseldorp +Signed-off-by: Steve French +Signed-off-by: Greg Kroah-Hartman + +--- + fs/cifs/ioctl.c | 2 ++ + 1 file changed, 2 insertions(+) + +--- a/fs/cifs/ioctl.c ++++ b/fs/cifs/ioctl.c +@@ -209,6 +209,8 @@ long cifs_ioctl(struct file *filep, unsi + rc = -EOPNOTSUPP; + break; + case CIFS_IOC_GET_MNT_INFO: ++ if (pSMBFile == NULL) ++ break; + tcon = tlink_tcon(pSMBFile->tlink); + rc = smb_mnt_get_fsinfo(xid, tcon, (void __user *)arg); + break; diff --git a/queue-4.11/cifs-fix-leak-in-fsctl_enum_snaps-response-handling.patch b/queue-4.11/cifs-fix-leak-in-fsctl_enum_snaps-response-handling.patch new file mode 100644 index 00000000000..46347e90c8b --- /dev/null +++ b/queue-4.11/cifs-fix-leak-in-fsctl_enum_snaps-response-handling.patch @@ -0,0 +1,32 @@ +From 0e5c795592930d51fd30d53a2e7b73cba022a29b Mon Sep 17 00:00:00 2001 +From: David Disseldorp +Date: Wed, 3 May 2017 17:39:09 +0200 +Subject: cifs: fix leak in FSCTL_ENUM_SNAPS response handling + +From: David Disseldorp + +commit 0e5c795592930d51fd30d53a2e7b73cba022a29b upstream. + +The server may respond with success, and an output buffer less than +sizeof(struct smb_snapshot_array) in length. Do not leak the output +buffer in this case. 
+ +Fixes: 834170c85978 ("Enable previous version support") +Signed-off-by: David Disseldorp +Signed-off-by: Steve French +Signed-off-by: Greg Kroah-Hartman + +--- + fs/cifs/smb2ops.c | 1 + + 1 file changed, 1 insertion(+) + +--- a/fs/cifs/smb2ops.c ++++ b/fs/cifs/smb2ops.c +@@ -942,6 +942,7 @@ smb3_enum_snapshots(const unsigned int x + } + if (snapshot_in.snapshot_array_size < sizeof(struct smb_snapshot_array)) { + rc = -ERANGE; ++ kfree(retbuf); + return rc; + } + diff --git a/queue-4.11/cifs-fix-mapping-of-sfm_space-and-sfm_period.patch b/queue-4.11/cifs-fix-mapping-of-sfm_space-and-sfm_period.patch new file mode 100644 index 00000000000..d31511315c2 --- /dev/null +++ b/queue-4.11/cifs-fix-mapping-of-sfm_space-and-sfm_period.patch @@ -0,0 +1,36 @@ +From b704e70b7cf48f9b67c07d585168e102dfa30bb4 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Bj=C3=B6rn=20Jacke?= +Date: Wed, 3 May 2017 23:47:44 +0200 +Subject: CIFS: fix mapping of SFM_SPACE and SFM_PERIOD + +From: Björn Jacke + +commit b704e70b7cf48f9b67c07d585168e102dfa30bb4 upstream. + +- trailing space maps to 0xF028 +- trailing period maps to 0xF029 + +This fix corrects the mapping of file names which have a trailing character +that would otherwise be illegal (period or space) but is allowed by POSIX. 
+ +Signed-off-by: Bjoern Jacke +Signed-off-by: Steve French +Signed-off-by: Greg Kroah-Hartman + +--- + fs/cifs/cifs_unicode.h | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +--- a/fs/cifs/cifs_unicode.h ++++ b/fs/cifs/cifs_unicode.h +@@ -64,8 +64,8 @@ + #define SFM_LESSTHAN ((__u16) 0xF023) + #define SFM_PIPE ((__u16) 0xF027) + #define SFM_SLASH ((__u16) 0xF026) +-#define SFM_PERIOD ((__u16) 0xF028) +-#define SFM_SPACE ((__u16) 0xF029) ++#define SFM_SPACE ((__u16) 0xF028) ++#define SFM_PERIOD ((__u16) 0xF029) + + /* + * Mapping mechanism to use when one of the seven reserved characters is diff --git a/queue-4.11/cifs-fix-oplock-break-deadlocks.patch b/queue-4.11/cifs-fix-oplock-break-deadlocks.patch new file mode 100644 index 00000000000..3236b674724 --- /dev/null +++ b/queue-4.11/cifs-fix-oplock-break-deadlocks.patch @@ -0,0 +1,237 @@ +From 3998e6b87d4258a70df358296d6f1c7234012bfe Mon Sep 17 00:00:00 2001 +From: Rabin Vincent +Date: Wed, 3 May 2017 17:54:01 +0200 +Subject: CIFS: fix oplock break deadlocks + +From: Rabin Vincent + +commit 3998e6b87d4258a70df358296d6f1c7234012bfe upstream. 
+ +When the final cifsFileInfo_put() is called from cifsiod and an oplock +break work is queued, lockdep complains loudly: + + ============================================= + [ INFO: possible recursive locking detected ] + 4.11.0+ #21 Not tainted + --------------------------------------------- + kworker/0:2/78 is trying to acquire lock: + ("cifsiod"){++++.+}, at: flush_work+0x215/0x350 + + but task is already holding lock: + ("cifsiod"){++++.+}, at: process_one_work+0x255/0x8e0 + + other info that might help us debug this: + Possible unsafe locking scenario: + + CPU0 + ---- + lock("cifsiod"); + lock("cifsiod"); + + *** DEADLOCK *** + + May be due to missing lock nesting notation + + 2 locks held by kworker/0:2/78: + #0: ("cifsiod"){++++.+}, at: process_one_work+0x255/0x8e0 + #1: ((&wdata->work)){+.+...}, at: process_one_work+0x255/0x8e0 + + stack backtrace: + CPU: 0 PID: 78 Comm: kworker/0:2 Not tainted 4.11.0+ #21 + Workqueue: cifsiod cifs_writev_complete + Call Trace: + dump_stack+0x85/0xc2 + __lock_acquire+0x17dd/0x2260 + ? match_held_lock+0x20/0x2b0 + ? trace_hardirqs_off_caller+0x86/0x130 + ? mark_lock+0xa6/0x920 + lock_acquire+0xcc/0x260 + ? lock_acquire+0xcc/0x260 + ? flush_work+0x215/0x350 + flush_work+0x236/0x350 + ? flush_work+0x215/0x350 + ? destroy_worker+0x170/0x170 + __cancel_work_timer+0x17d/0x210 + ? ___preempt_schedule+0x16/0x18 + cancel_work_sync+0x10/0x20 + cifsFileInfo_put+0x338/0x7f0 + cifs_writedata_release+0x2a/0x40 + ? cifs_writedata_release+0x2a/0x40 + cifs_writev_complete+0x29d/0x850 + ? preempt_count_sub+0x18/0xd0 + process_one_work+0x304/0x8e0 + worker_thread+0x9b/0x6a0 + kthread+0x1b2/0x200 + ? process_one_work+0x8e0/0x8e0 + ? kthread_create_on_node+0x40/0x40 + ret_from_fork+0x31/0x40 + +This is a real warning. Since the oplock is queued on the same +workqueue this can deadlock if there is only one worker thread active +for the workqueue (which will be the case during memory pressure when +the rescuer thread is handling it). 
+ +Furthermore, there is at least one other kind of hang possible due to +the oplock break handling if there is only one worker. (This can be +reproduced without introducing memory pressure by passing 1 for +the max_active parameter of cifsiod.) cifs_oplock_break() can wait +indefinitely in the filemap_fdatawait() while the cifs_writev_complete() +work is blocked: + + sysrq: SysRq : Show Blocked State + task PC stack pid father + kworker/0:1 D 0 16 2 0x00000000 + Workqueue: cifsiod cifs_oplock_break + Call Trace: + __schedule+0x562/0xf40 + ? mark_held_locks+0x4a/0xb0 + schedule+0x57/0xe0 + io_schedule+0x21/0x50 + wait_on_page_bit+0x143/0x190 + ? add_to_page_cache_lru+0x150/0x150 + __filemap_fdatawait_range+0x134/0x190 + ? do_writepages+0x51/0x70 + filemap_fdatawait_range+0x14/0x30 + filemap_fdatawait+0x3b/0x40 + cifs_oplock_break+0x651/0x710 + ? preempt_count_sub+0x18/0xd0 + process_one_work+0x304/0x8e0 + worker_thread+0x9b/0x6a0 + kthread+0x1b2/0x200 + ? process_one_work+0x8e0/0x8e0 + ? kthread_create_on_node+0x40/0x40 + ret_from_fork+0x31/0x40 + dd D 0 683 171 0x00000000 + Call Trace: + __schedule+0x562/0xf40 + ? mark_held_locks+0x29/0xb0 + schedule+0x57/0xe0 + io_schedule+0x21/0x50 + wait_on_page_bit+0x143/0x190 + ? add_to_page_cache_lru+0x150/0x150 + __filemap_fdatawait_range+0x134/0x190 + ? 
do_writepages+0x51/0x70 + filemap_fdatawait_range+0x14/0x30 + filemap_fdatawait+0x3b/0x40 + filemap_write_and_wait+0x4e/0x70 + cifs_flush+0x6a/0xb0 + filp_close+0x52/0xa0 + __close_fd+0xdc/0x150 + SyS_close+0x33/0x60 + entry_SYSCALL_64_fastpath+0x1f/0xbe + + Showing all locks held in the system: + 2 locks held by kworker/0:1/16: + #0: ("cifsiod"){.+.+.+}, at: process_one_work+0x255/0x8e0 + #1: ((&cfile->oplock_break)){+.+.+.}, at: process_one_work+0x255/0x8e0 + + Showing busy workqueues and worker pools: + workqueue cifsiod: flags=0xc + pwq 0: cpus=0 node=0 flags=0x0 nice=0 active=1/1 + in-flight: 16:cifs_oplock_break + delayed: cifs_writev_complete, cifs_echo_request + pool 0: cpus=0 node=0 flags=0x0 nice=0 hung=0s workers=3 idle: 750 3 + +Fix these problems by creating a new workqueue (with a rescuer) for +the oplock break work. + +Signed-off-by: Rabin Vincent +Signed-off-by: Steve French +Signed-off-by: Greg Kroah-Hartman + +--- + fs/cifs/cifsfs.c | 15 +++++++++++++-- + fs/cifs/cifsglob.h | 1 + + fs/cifs/misc.c | 2 +- + fs/cifs/smb2misc.c | 5 +++-- + 4 files changed, 18 insertions(+), 5 deletions(-) + +--- a/fs/cifs/cifsfs.c ++++ b/fs/cifs/cifsfs.c +@@ -87,6 +87,7 @@ extern mempool_t *cifs_req_poolp; + extern mempool_t *cifs_mid_poolp; + + struct workqueue_struct *cifsiod_wq; ++struct workqueue_struct *cifsoplockd_wq; + __u32 cifs_lock_secret; + + /* +@@ -1369,9 +1370,16 @@ init_cifs(void) + goto out_clean_proc; + } + ++ cifsoplockd_wq = alloc_workqueue("cifsoplockd", ++ WQ_FREEZABLE|WQ_MEM_RECLAIM, 0); ++ if (!cifsoplockd_wq) { ++ rc = -ENOMEM; ++ goto out_destroy_cifsiod_wq; ++ } ++ + rc = cifs_fscache_register(); + if (rc) +- goto out_destroy_wq; ++ goto out_destroy_cifsoplockd_wq; + + rc = cifs_init_inodecache(); + if (rc) +@@ -1419,7 +1427,9 @@ out_destroy_inodecache: + cifs_destroy_inodecache(); + out_unreg_fscache: + cifs_fscache_unregister(); +-out_destroy_wq: ++out_destroy_cifsoplockd_wq: ++ destroy_workqueue(cifsoplockd_wq); ++out_destroy_cifsiod_wq: 
+ destroy_workqueue(cifsiod_wq); + out_clean_proc: + cifs_proc_clean(); +@@ -1442,6 +1452,7 @@ exit_cifs(void) + cifs_destroy_mids(); + cifs_destroy_inodecache(); + cifs_fscache_unregister(); ++ destroy_workqueue(cifsoplockd_wq); + destroy_workqueue(cifsiod_wq); + cifs_proc_clean(); + } +--- a/fs/cifs/cifsglob.h ++++ b/fs/cifs/cifsglob.h +@@ -1683,6 +1683,7 @@ void cifs_oplock_break(struct work_struc + + extern const struct slow_work_ops cifs_oplock_break_ops; + extern struct workqueue_struct *cifsiod_wq; ++extern struct workqueue_struct *cifsoplockd_wq; + extern __u32 cifs_lock_secret; + + extern mempool_t *cifs_mid_poolp; +--- a/fs/cifs/misc.c ++++ b/fs/cifs/misc.c +@@ -492,7 +492,7 @@ is_valid_oplock_break(char *buffer, stru + CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, + &pCifsInode->flags); + +- queue_work(cifsiod_wq, ++ queue_work(cifsoplockd_wq, + &netfile->oplock_break); + netfile->oplock_break_cancelled = false; + +--- a/fs/cifs/smb2misc.c ++++ b/fs/cifs/smb2misc.c +@@ -499,7 +499,7 @@ smb2_tcon_has_lease(struct cifs_tcon *tc + else + cfile->oplock_break_cancelled = true; + +- queue_work(cifsiod_wq, &cfile->oplock_break); ++ queue_work(cifsoplockd_wq, &cfile->oplock_break); + kfree(lw); + return true; + } +@@ -643,7 +643,8 @@ smb2_is_valid_oplock_break(char *buffer, + CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, + &cinode->flags); + spin_unlock(&cfile->file_info_lock); +- queue_work(cifsiod_wq, &cfile->oplock_break); ++ queue_work(cifsoplockd_wq, ++ &cfile->oplock_break); + + spin_unlock(&tcon->open_file_lock); + spin_unlock(&cifs_tcp_ses_lock); diff --git a/queue-4.11/do-not-return-number-of-bytes-written-for-ioctl-cifs_ioc_copychunk_file.patch b/queue-4.11/do-not-return-number-of-bytes-written-for-ioctl-cifs_ioc_copychunk_file.patch new file mode 100644 index 00000000000..20a396e1856 --- /dev/null +++ b/queue-4.11/do-not-return-number-of-bytes-written-for-ioctl-cifs_ioc_copychunk_file.patch @@ -0,0 +1,42 @@ +From 7d0c234fd2e1c9ca3fa032696c0c58b1b74a9e0b Mon Sep 17 00:00:00 
2001 +From: Sachin Prabhu +Date: Wed, 26 Apr 2017 17:10:17 +0100 +Subject: Do not return number of bytes written for ioctl CIFS_IOC_COPYCHUNK_FILE + +From: Sachin Prabhu + +commit 7d0c234fd2e1c9ca3fa032696c0c58b1b74a9e0b upstream. + +commit 620d8745b35d ("Introduce cifs_copy_file_range()") changes the +behaviour of the cifs ioctl call CIFS_IOC_COPYCHUNK_FILE. In case of +successful writes, it now returns the number of bytes written. This +return value is treated as an error by the xfstest cifs/001. Depending +on the errno set at that time, this may or may not result in the test +failing. + +The patch fixes this by setting the return value to 0 in case of +successful writes. + +Fixes: commit 620d8745b35d ("Introduce cifs_copy_file_range()") +Reported-by: Eryu Guan +Signed-off-by: Sachin Prabhu +Acked-by: Pavel Shilovsky +Signed-off-by: Steve French +Signed-off-by: Greg Kroah-Hartman + +--- + fs/cifs/ioctl.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +--- a/fs/cifs/ioctl.c ++++ b/fs/cifs/ioctl.c +@@ -74,7 +74,8 @@ static long cifs_ioctl_copychunk(unsigne + + rc = cifs_file_copychunk_range(xid, src_file.file, 0, dst_file, 0, + src_inode->i_size, 0); +- ++ if (rc > 0) ++ rc = 0; + out_fput: + fdput(src_file); + out_drop_write: diff --git a/queue-4.11/ext4-evict-inline-data-when-writing-to-memory-map.patch b/queue-4.11/ext4-evict-inline-data-when-writing-to-memory-map.patch new file mode 100644 index 00000000000..0248c3e9dd1 --- /dev/null +++ b/queue-4.11/ext4-evict-inline-data-when-writing-to-memory-map.patch @@ -0,0 +1,86 @@ +From 7b4cc9787fe35b3ee2dfb1c35e22eafc32e00c33 Mon Sep 17 00:00:00 2001 +From: Eric Biggers +Date: Sun, 30 Apr 2017 00:10:50 -0400 +Subject: ext4: evict inline data when writing to memory map + +From: Eric Biggers + +commit 7b4cc9787fe35b3ee2dfb1c35e22eafc32e00c33 upstream. + +Currently the case of writing via mmap to a file with inline data is not +handled. 
This is maybe a rare case since it requires a writable memory +map of a very small file, but it is trivial to trigger on an +inline_data filesystem, and it causes the +'BUG_ON(ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA));' in +ext4_writepages() to be hit: + + mkfs.ext4 -O inline_data /dev/vdb + mount /dev/vdb /mnt + xfs_io -f /mnt/file \ + -c 'pwrite 0 1' \ + -c 'mmap -w 0 1m' \ + -c 'mwrite 0 1' \ + -c 'fsync' + + kernel BUG at fs/ext4/inode.c:2723! + invalid opcode: 0000 [#1] SMP + CPU: 1 PID: 2532 Comm: xfs_io Not tainted 4.11.0-rc1-xfstests-00301-g071d9acf3d1f #633 + Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-20170228_101828-anatol 04/01/2014 + task: ffff88003d3a8040 task.stack: ffffc90000300000 + RIP: 0010:ext4_writepages+0xc89/0xf8a + RSP: 0018:ffffc90000303ca0 EFLAGS: 00010283 + RAX: 0000028410000000 RBX: ffff8800383fa3b0 RCX: ffffffff812afcdc + RDX: 00000a9d00000246 RSI: ffffffff81e660e0 RDI: 0000000000000246 + RBP: ffffc90000303dc0 R08: 0000000000000002 R09: 869618e8f99b4fa5 + R10: 00000000852287a2 R11: 00000000a03b49f4 R12: ffff88003808e698 + R13: 0000000000000000 R14: 7fffffffffffffff R15: 7fffffffffffffff + FS: 00007fd3e53094c0(0000) GS:ffff88003e400000(0000) knlGS:0000000000000000 + CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 + CR2: 00007fd3e4c51000 CR3: 000000003d554000 CR4: 00000000003406e0 + Call Trace: + ? _raw_spin_unlock+0x27/0x2a + ? kvm_clock_read+0x1e/0x20 + do_writepages+0x23/0x2c + ? do_writepages+0x23/0x2c + __filemap_fdatawrite_range+0x80/0x87 + filemap_write_and_wait_range+0x67/0x8c + ext4_sync_file+0x20e/0x472 + vfs_fsync_range+0x8e/0x9f + ? 
syscall_trace_enter+0x25b/0x2d0 + vfs_fsync+0x1c/0x1e + do_fsync+0x31/0x4a + SyS_fsync+0x10/0x14 + do_syscall_64+0x69/0x131 + entry_SYSCALL64_slow_path+0x25/0x25 + +We could try to be smart and keep the inline data in this case, or at +least support delayed allocation when allocating the block, but these +solutions would be more complicated and don't seem worthwhile given how +rare this case seems to be. So just fix the bug by calling +ext4_convert_inline_data() when we're asked to make a page writable, so +that any inline data gets evicted, with the block allocated immediately. + +Reported-by: Nick Alcock +Reviewed-by: Andreas Dilger +Signed-off-by: Eric Biggers +Signed-off-by: Theodore Ts'o +Signed-off-by: Greg Kroah-Hartman + +--- + fs/ext4/inode.c | 5 +++++ + 1 file changed, 5 insertions(+) + +--- a/fs/ext4/inode.c ++++ b/fs/ext4/inode.c +@@ -5874,6 +5874,11 @@ int ext4_page_mkwrite(struct vm_fault *v + file_update_time(vma->vm_file); + + down_read(&EXT4_I(inode)->i_mmap_sem); ++ ++ ret = ext4_convert_inline_data(inode); ++ if (ret) ++ goto out_ret; ++ + /* Delalloc case is easy... */ + if (test_opt(inode->i_sb, DELALLOC) && + !ext4_should_journal_data(inode) && diff --git a/queue-4.11/fix-match_prepath.patch b/queue-4.11/fix-match_prepath.patch new file mode 100644 index 00000000000..ba549edf737 --- /dev/null +++ b/queue-4.11/fix-match_prepath.patch @@ -0,0 +1,47 @@ +From cd8c42968ee651b69e00f8661caff32b0086e82d Mon Sep 17 00:00:00 2001 +From: Sachin Prabhu +Date: Wed, 26 Apr 2017 14:05:46 +0100 +Subject: Fix match_prepath() + +From: Sachin Prabhu + +commit cd8c42968ee651b69e00f8661caff32b0086e82d upstream. + +Incorrect return value for shares not using the prefix path means that +we will never match superblocks for these shares. 
+ +Fixes: commit c1d8b24d1819 ("Compare prepaths when comparing superblocks") +Signed-off-by: Sachin Prabhu +Reviewed-by: Pavel Shilovsky +Signed-off-by: Steve French +Signed-off-by: Greg Kroah-Hartman + +--- + fs/cifs/connect.c | 14 ++++++-------- + 1 file changed, 6 insertions(+), 8 deletions(-) + +--- a/fs/cifs/connect.c ++++ b/fs/cifs/connect.c +@@ -2912,16 +2912,14 @@ match_prepath(struct super_block *sb, st + { + struct cifs_sb_info *old = CIFS_SB(sb); + struct cifs_sb_info *new = mnt_data->cifs_sb; ++ bool old_set = old->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH; ++ bool new_set = new->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH; + +- if (old->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH) { +- if (!(new->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH)) +- return 0; +- /* The prepath should be null terminated strings */ +- if (strcmp(new->prepath, old->prepath)) +- return 0; +- ++ if (old_set && new_set && !strcmp(new->prepath, old->prepath)) ++ return 1; ++ else if (!old_set && !new_set) + return 1; +- } ++ + return 0; + } + diff --git a/queue-4.11/fs-block_dev-always-invalidate-cleancache-in-invalidate_bdev.patch b/queue-4.11/fs-block_dev-always-invalidate-cleancache-in-invalidate_bdev.patch new file mode 100644 index 00000000000..128963f3542 --- /dev/null +++ b/queue-4.11/fs-block_dev-always-invalidate-cleancache-in-invalidate_bdev.patch @@ -0,0 +1,55 @@ +From a5f6a6a9c72eac38a7fadd1a038532bc8516337c Mon Sep 17 00:00:00 2001 +From: Andrey Ryabinin +Date: Wed, 3 May 2017 14:56:02 -0700 +Subject: fs/block_dev: always invalidate cleancache in invalidate_bdev() + +From: Andrey Ryabinin + +commit a5f6a6a9c72eac38a7fadd1a038532bc8516337c upstream. + +invalidate_bdev() calls cleancache_invalidate_inode() iff ->nrpages != 0 +which doesn't make any sense. + +Make sure that invalidate_bdev() always calls cleancache_invalidate_inode() +regardless of mapping->nrpages value. 
+ +Fixes: c515e1fd361c ("mm/fs: add hooks to support cleancache") +Link: http://lkml.kernel.org/r/20170424164135.22350-3-aryabinin@virtuozzo.com +Signed-off-by: Andrey Ryabinin +Reviewed-by: Jan Kara +Acked-by: Konrad Rzeszutek Wilk +Cc: Alexander Viro +Cc: Ross Zwisler +Cc: Jens Axboe +Cc: Johannes Weiner +Cc: Alexey Kuznetsov +Cc: Christoph Hellwig +Cc: Nikolay Borisov +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + fs/block_dev.c | 11 +++++------ + 1 file changed, 5 insertions(+), 6 deletions(-) + +--- a/fs/block_dev.c ++++ b/fs/block_dev.c +@@ -103,12 +103,11 @@ void invalidate_bdev(struct block_device + { + struct address_space *mapping = bdev->bd_inode->i_mapping; + +- if (mapping->nrpages == 0) +- return; +- +- invalidate_bh_lrus(); +- lru_add_drain_all(); /* make sure all lru add caches are flushed */ +- invalidate_mapping_pages(mapping, 0, -1); ++ if (mapping->nrpages) { ++ invalidate_bh_lrus(); ++ lru_add_drain_all(); /* make sure all lru add caches are flushed */ ++ invalidate_mapping_pages(mapping, 0, -1); ++ } + /* 99% of the time, we don't need to flush the cleancache on the bdev. + * But, for the strange corners, lets be cautious + */ diff --git a/queue-4.11/fs-fix-data-invalidation-in-the-cleancache-during-direct-io.patch b/queue-4.11/fs-fix-data-invalidation-in-the-cleancache-during-direct-io.patch new file mode 100644 index 00000000000..108ead7e31c --- /dev/null +++ b/queue-4.11/fs-fix-data-invalidation-in-the-cleancache-during-direct-io.patch @@ -0,0 +1,148 @@ +From 55635ba76ef91f26b418702ace5e6287eb727f6a Mon Sep 17 00:00:00 2001 +From: Andrey Ryabinin +Date: Wed, 3 May 2017 14:55:59 -0700 +Subject: fs: fix data invalidation in the cleancache during direct IO + +From: Andrey Ryabinin + +commit 55635ba76ef91f26b418702ace5e6287eb727f6a upstream. + +Patch series "Properly invalidate data in the cleancache", v2. 
+ +We've noticed that after direct IO write, buffered read sometimes gets +stale data which is coming from the cleancache. The reason for this is +that some direct write hooks call invalidate_inode_pages2[_range]() +conditionally iff mapping->nrpages is not zero, so we may not invalidate +data in the cleancache. + +Another odd thing is that we check only for ->nrpages and don't check +for ->nrexceptional, but invalidate_inode_pages2[_range] also +invalidates exceptional entries as well. So we invalidate exceptional +entries only if ->nrpages != 0? This doesn't feel right. + + - Patch 1 fixes direct IO writes by removing ->nrpages check. + - Patch 2 fixes similar case in invalidate_bdev(). + Note: I only fixed conditional cleancache_invalidate_inode() here. + Do we also need to add ->nrexceptional check into invalidate_bdev()? + + - Patches 3-4: some optimizations. + +This patch (of 4): + +Some direct IO write fs hooks call invalidate_inode_pages2[_range]() +conditionally iff mapping->nrpages is not zero. This can't be right, +because invalidate_inode_pages2[_range]() also invalidates data in the +cleancache via cleancache_invalidate_inode() call. So if page cache is +empty but there is some data in the cleancache, buffered read after +direct IO write would get stale data from the cleancache. + +Also it doesn't feel right to check only for ->nrpages because +invalidate_inode_pages2[_range] invalidates exceptional entries as well. + +Fix this by calling invalidate_inode_pages2[_range]() regardless of +nrpages state. + +Note: nfs,cifs,9p don't need a similar fix because they never call +cleancache_get_page() (nor directly, nor via mpage_readpage[s]()), so +they are not affected by this bug. 
+ +Fixes: c515e1fd361c ("mm/fs: add hooks to support cleancache") +Link: http://lkml.kernel.org/r/20170424164135.22350-2-aryabinin@virtuozzo.com +Signed-off-by: Andrey Ryabinin +Reviewed-by: Jan Kara +Acked-by: Konrad Rzeszutek Wilk +Cc: Alexander Viro +Cc: Ross Zwisler +Cc: Jens Axboe +Cc: Johannes Weiner +Cc: Alexey Kuznetsov +Cc: Christoph Hellwig +Cc: Nikolay Borisov +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + fs/iomap.c | 20 +++++++++----------- + mm/filemap.c | 26 +++++++++++--------------- + 2 files changed, 20 insertions(+), 26 deletions(-) + +--- a/fs/iomap.c ++++ b/fs/iomap.c +@@ -887,16 +887,14 @@ iomap_dio_rw(struct kiocb *iocb, struct + flags |= IOMAP_WRITE; + } + +- if (mapping->nrpages) { +- ret = filemap_write_and_wait_range(mapping, start, end); +- if (ret) +- goto out_free_dio; +- +- ret = invalidate_inode_pages2_range(mapping, +- start >> PAGE_SHIFT, end >> PAGE_SHIFT); +- WARN_ON_ONCE(ret); +- ret = 0; +- } ++ ret = filemap_write_and_wait_range(mapping, start, end); ++ if (ret) ++ goto out_free_dio; ++ ++ ret = invalidate_inode_pages2_range(mapping, ++ start >> PAGE_SHIFT, end >> PAGE_SHIFT); ++ WARN_ON_ONCE(ret); ++ ret = 0; + + inode_dio_begin(inode); + +@@ -951,7 +949,7 @@ iomap_dio_rw(struct kiocb *iocb, struct + * one is a pretty crazy thing to do, so we don't support it 100%. If + * this invalidation fails, tough, the write still worked... + */ +- if (iov_iter_rw(iter) == WRITE && mapping->nrpages) { ++ if (iov_iter_rw(iter) == WRITE) { + int err = invalidate_inode_pages2_range(mapping, + start >> PAGE_SHIFT, end >> PAGE_SHIFT); + WARN_ON_ONCE(err); +--- a/mm/filemap.c ++++ b/mm/filemap.c +@@ -2719,18 +2719,16 @@ generic_file_direct_write(struct kiocb * + * about to write. We do this *before* the write so that we can return + * without clobbering -EIOCBQUEUED from ->direct_IO(). 
+ */ +- if (mapping->nrpages) { +- written = invalidate_inode_pages2_range(mapping, ++ written = invalidate_inode_pages2_range(mapping, + pos >> PAGE_SHIFT, end); +- /* +- * If a page can not be invalidated, return 0 to fall back +- * to buffered write. +- */ +- if (written) { +- if (written == -EBUSY) +- return 0; +- goto out; +- } ++ /* ++ * If a page can not be invalidated, return 0 to fall back ++ * to buffered write. ++ */ ++ if (written) { ++ if (written == -EBUSY) ++ return 0; ++ goto out; + } + + data = *from; +@@ -2744,10 +2742,8 @@ generic_file_direct_write(struct kiocb * + * so we don't support it 100%. If this invalidation + * fails, tough, the write still worked... + */ +- if (mapping->nrpages) { +- invalidate_inode_pages2_range(mapping, +- pos >> PAGE_SHIFT, end); +- } ++ invalidate_inode_pages2_range(mapping, ++ pos >> PAGE_SHIFT, end); + + if (written > 0) { + pos += written; diff --git a/queue-4.11/fs-xattr.c-zero-out-memory-copied-to-userspace-in-getxattr.patch b/queue-4.11/fs-xattr.c-zero-out-memory-copied-to-userspace-in-getxattr.patch new file mode 100644 index 00000000000..9c01395dc40 --- /dev/null +++ b/queue-4.11/fs-xattr.c-zero-out-memory-copied-to-userspace-in-getxattr.patch @@ -0,0 +1,40 @@ +From 81be3dee96346fbe08c31be5ef74f03f6b63cf68 Mon Sep 17 00:00:00 2001 +From: Michal Hocko +Date: Mon, 8 May 2017 15:57:24 -0700 +Subject: fs/xattr.c: zero out memory copied to userspace in getxattr + +From: Michal Hocko + +commit 81be3dee96346fbe08c31be5ef74f03f6b63cf68 upstream. + +getxattr uses vmalloc to allocate memory if kzalloc fails. This is +filled by vfs_getxattr and then copied to the userspace. vmalloc, +however, doesn't zero out the memory so if the specific implementation +of the xattr handler is sloppy we can theoretically expose a kernel +memory. There is no real sign this is really the case but let's make +sure this will not happen and use vzalloc instead. 
+ +Fixes: 779302e67835 ("fs/xattr.c:getxattr(): improve handling of allocation failures") +Link: http://lkml.kernel.org/r/20170306103327.2766-1-mhocko@kernel.org +Acked-by: Kees Cook +Reported-by: Vlastimil Babka +Signed-off-by: Michal Hocko +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + fs/xattr.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/fs/xattr.c ++++ b/fs/xattr.c +@@ -530,7 +530,7 @@ getxattr(struct dentry *d, const char __ + size = XATTR_SIZE_MAX; + kvalue = kzalloc(size, GFP_KERNEL | __GFP_NOWARN); + if (!kvalue) { +- kvalue = vmalloc(size); ++ kvalue = vzalloc(size); + if (!kvalue) + return -ENOMEM; + } diff --git a/queue-4.11/ib-core-fix-kernel-crash-during-fail-to-initialize-device.patch b/queue-4.11/ib-core-fix-kernel-crash-during-fail-to-initialize-device.patch new file mode 100644 index 00000000000..17f0dbb601b --- /dev/null +++ b/queue-4.11/ib-core-fix-kernel-crash-during-fail-to-initialize-device.patch @@ -0,0 +1,160 @@ +From 4be3a4fa51f432ef045546d16f25c68a1ab525b9 Mon Sep 17 00:00:00 2001 +From: Parav Pandit +Date: Sun, 19 Mar 2017 10:55:55 +0200 +Subject: IB/core: Fix kernel crash during fail to initialize device + +From: Parav Pandit + +commit 4be3a4fa51f432ef045546d16f25c68a1ab525b9 upstream. + +This patch fixes the kernel crash that occurs during ib_dealloc_device() +called due to provider driver fails with an error after +ib_alloc_device() and before it can register using ib_register_device(). + +This crash, seen in the lab as shown below, can occur with any IB device +which fails to perform its device initialization before invoking +ib_register_device(). + +This patch avoids touching cache and port immutable structures if device +is not yet initialized. +It also releases related memory when cache and port immutable data +structure initialization fails during register_device() state. 
+ +[81416.561946] BUG: unable to handle kernel NULL pointer dereference at (null) +[81416.570340] IP: ib_cache_release_one+0x29/0x80 [ib_core] +[81416.576222] PGD 78da66067 +[81416.576223] PUD 7f2d7c067 +[81416.579484] PMD 0 +[81416.582720] +[81416.587242] Oops: 0000 [#1] SMP +[81416.722395] task: ffff8807887515c0 task.stack: ffffc900062c0000 +[81416.729148] RIP: 0010:ib_cache_release_one+0x29/0x80 [ib_core] +[81416.735793] RSP: 0018:ffffc900062c3a90 EFLAGS: 00010202 +[81416.741823] RAX: 0000000000000000 RBX: 0000000000000001 RCX: 0000000000000000 +[81416.749785] RDX: 0000000000000000 RSI: 0000000000000282 RDI: ffff880859fec000 +[81416.757757] RBP: ffffc900062c3aa0 R08: ffff8808536e5ac0 R09: ffff880859fec5b0 +[81416.765708] R10: 00000000536e5c01 R11: ffff8808536e5ac0 R12: ffff880859fec000 +[81416.773672] R13: 0000000000000000 R14: ffff8808536e5ac0 R15: ffff88084ebc0060 +[81416.781621] FS: 00007fd879fab740(0000) GS:ffff88085fac0000(0000) knlGS:0000000000000000 +[81416.790522] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 +[81416.797094] CR2: 0000000000000000 CR3: 00000007eb215000 CR4: 00000000003406e0 +[81416.805051] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 +[81416.812997] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 +[81416.820950] Call Trace: +[81416.824226] ib_device_release+0x1e/0x40 [ib_core] +[81416.829858] device_release+0x32/0xa0 +[81416.834370] kobject_cleanup+0x63/0x170 +[81416.839058] kobject_put+0x25/0x50 +[81416.843319] ib_dealloc_device+0x25/0x40 [ib_core] +[81416.848986] mlx5_ib_add+0x163/0x1990 [mlx5_ib] +[81416.854414] mlx5_add_device+0x5a/0x160 [mlx5_core] +[81416.860191] mlx5_register_interface+0x8d/0xc0 [mlx5_core] +[81416.866587] ? 0xffffffffa09e9000 +[81416.870816] mlx5_ib_init+0x15/0x17 [mlx5_ib] +[81416.876094] do_one_initcall+0x51/0x1b0 +[81416.880861] ? __vunmap+0x85/0xd0 +[81416.885113] ? kmem_cache_alloc_trace+0x14b/0x1b0 +[81416.890768] ? 
vfree+0x2e/0x70 +[81416.894762] do_init_module+0x60/0x1fa +[81416.899441] load_module+0x15f6/0x1af0 +[81416.904114] ? __symbol_put+0x60/0x60 +[81416.908709] ? ima_post_read_file+0x3d/0x80 +[81416.913828] ? security_kernel_post_read_file+0x6b/0x80 +[81416.920006] SYSC_finit_module+0xa6/0xf0 +[81416.924888] SyS_finit_module+0xe/0x10 +[81416.929568] entry_SYSCALL_64_fastpath+0x1a/0xa9 +[81416.935089] RIP: 0033:0x7fd879494949 +[81416.939543] RSP: 002b:00007ffdbc1b4e58 EFLAGS: 00000202 ORIG_RAX: 0000000000000139 +[81416.947982] RAX: ffffffffffffffda RBX: 0000000001b66f00 RCX: 00007fd879494949 +[81416.955965] RDX: 0000000000000000 RSI: 000000000041a13c RDI: 0000000000000003 +[81416.963926] RBP: 0000000000000003 R08: 0000000000000000 R09: 0000000001b652a0 +[81416.971861] R10: 0000000000000003 R11: 0000000000000202 R12: 00007ffdbc1b3e70 +[81416.979763] R13: 00007ffdbc1b3e50 R14: 0000000000000005 R15: 0000000000000000 +[81417.008005] RIP: ib_cache_release_one+0x29/0x80 [ib_core] RSP: ffffc900062c3a90 +[81417.016045] CR2: 0000000000000000 + +Fixes: 55aeed0654 ("IB/core: Make ib_alloc_device init the kobject") +Fixes: 7738613e7c ("IB/core: Add per port immutable struct to ib_device") +Reviewed-by: Daniel Jurgens +Signed-off-by: Parav Pandit +Signed-off-by: Leon Romanovsky +Signed-off-by: Doug Ledford +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/infiniband/core/device.c | 33 ++++++++++++++++++++++----------- + 1 file changed, 22 insertions(+), 11 deletions(-) + +--- a/drivers/infiniband/core/device.c ++++ b/drivers/infiniband/core/device.c +@@ -172,8 +172,16 @@ static void ib_device_release(struct dev + { + struct ib_device *dev = container_of(device, struct ib_device, dev); + +- ib_cache_release_one(dev); +- kfree(dev->port_immutable); ++ WARN_ON(dev->reg_state == IB_DEV_REGISTERED); ++ if (dev->reg_state == IB_DEV_UNREGISTERED) { ++ /* ++ * In IB_DEV_UNINITIALIZED state, cache or port table ++ * is not even created. 
Free cache and port table only when ++ * device reaches UNREGISTERED state. ++ */ ++ ib_cache_release_one(dev); ++ kfree(dev->port_immutable); ++ } + kfree(dev); + } + +@@ -380,32 +388,27 @@ int ib_register_device(struct ib_device + ret = ib_cache_setup_one(device); + if (ret) { + pr_warn("Couldn't set up InfiniBand P_Key/GID cache\n"); +- goto out; ++ goto port_cleanup; + } + + ret = ib_device_register_rdmacg(device); + if (ret) { + pr_warn("Couldn't register device with rdma cgroup\n"); +- ib_cache_cleanup_one(device); +- goto out; ++ goto cache_cleanup; + } + + memset(&device->attrs, 0, sizeof(device->attrs)); + ret = device->query_device(device, &device->attrs, &uhw); + if (ret) { + pr_warn("Couldn't query the device attributes\n"); +- ib_device_unregister_rdmacg(device); +- ib_cache_cleanup_one(device); +- goto out; ++ goto cache_cleanup; + } + + ret = ib_device_register_sysfs(device, port_callback); + if (ret) { + pr_warn("Couldn't register device %s with driver model\n", + device->name); +- ib_device_unregister_rdmacg(device); +- ib_cache_cleanup_one(device); +- goto out; ++ goto cache_cleanup; + } + + device->reg_state = IB_DEV_REGISTERED; +@@ -417,6 +420,14 @@ int ib_register_device(struct ib_device + down_write(&lists_rwsem); + list_add_tail(&device->core_list, &device_list); + up_write(&lists_rwsem); ++ mutex_unlock(&device_mutex); ++ return 0; ++ ++cache_cleanup: ++ ib_cache_cleanup_one(device); ++ ib_cache_release_one(device); ++port_cleanup: ++ kfree(device->port_immutable); + out: + mutex_unlock(&device_mutex); + return ret; diff --git a/queue-4.11/ib-core-fix-sysfs-registration-error-flow.patch b/queue-4.11/ib-core-fix-sysfs-registration-error-flow.patch new file mode 100644 index 00000000000..fbc74d5b133 --- /dev/null +++ b/queue-4.11/ib-core-fix-sysfs-registration-error-flow.patch @@ -0,0 +1,50 @@ +From b312be3d87e4c80872cbea869e569175c5eb0f9a Mon Sep 17 00:00:00 2001 +From: Jack Morgenstein +Date: Sun, 19 Mar 2017 10:55:57 +0200 +Subject: 
IB/core: Fix sysfs registration error flow + +From: Jack Morgenstein + +commit b312be3d87e4c80872cbea869e569175c5eb0f9a upstream. + +The kernel commit cited below restructured ib device management +so that the device kobject is initialized in ib_alloc_device. + +As part of the restructuring, the kobject is now initialized in +procedure ib_alloc_device, and is later added to the device hierarchy +in the ib_register_device call stack, in procedure +ib_device_register_sysfs (which calls device_add). + +However, in the ib_device_register_sysfs error flow, if an error +occurs following the call to device_add, the cleanup procedure +device_unregister is called. This call results in the device object +being deleted -- which results in various use-after-free crashes. + +The correct cleanup call is device_del -- which undoes device_add +without deleting the device object. + +The device object will then (correctly) be deleted in the +ib_register_device caller's error cleanup flow, when the caller invokes +ib_dealloc_device. 
+ +Fixes: 55aeed06544f6 ("IB/core: Make ib_alloc_device init the kobject") +Signed-off-by: Jack Morgenstein +Signed-off-by: Leon Romanovsky +Signed-off-by: Doug Ledford +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/infiniband/core/sysfs.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/drivers/infiniband/core/sysfs.c ++++ b/drivers/infiniband/core/sysfs.c +@@ -1301,7 +1301,7 @@ err_put: + free_port_list_attributes(device); + + err_unregister: +- device_unregister(class_dev); ++ device_del(class_dev); + + err: + return ret; diff --git a/queue-4.11/ib-core-for-multicast-functions-verify-that-lids-are-multicast-lids.patch b/queue-4.11/ib-core-for-multicast-functions-verify-that-lids-are-multicast-lids.patch new file mode 100644 index 00000000000..462b5b6957f --- /dev/null +++ b/queue-4.11/ib-core-for-multicast-functions-verify-that-lids-are-multicast-lids.patch @@ -0,0 +1,53 @@ +From 8561eae60ff9417a50fa1fb2b83ae950dc5c1e21 Mon Sep 17 00:00:00 2001 +From: "Michael J. Ruhl" +Date: Sun, 9 Apr 2017 10:15:51 -0700 +Subject: IB/core: For multicast functions, verify that LIDs are multicast LIDs + +From: Michael J. Ruhl + +commit 8561eae60ff9417a50fa1fb2b83ae950dc5c1e21 upstream. + +The Infiniband spec defines "A multicast address is defined by a +MGID and a MLID" (section 10.5). Currently the MLID value is not +validated. + +Add check to verify that the MLID value is in the correct address +range. + +Fixes: 0c33aeedb2cf ("[IB] Add checks to multicast attach and detach") +Reviewed-by: Ira Weiny +Reviewed-by: Dasaratharaman Chandramouli +Signed-off-by: Michael J. 
Ruhl +Signed-off-by: Dennis Dalessandro +Reviewed-by: Leon Romanovsky +Signed-off-by: Doug Ledford +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/infiniband/core/verbs.c | 8 ++++++-- + 1 file changed, 6 insertions(+), 2 deletions(-) + +--- a/drivers/infiniband/core/verbs.c ++++ b/drivers/infiniband/core/verbs.c +@@ -1519,7 +1519,9 @@ int ib_attach_mcast(struct ib_qp *qp, un + + if (!qp->device->attach_mcast) + return -ENOSYS; +- if (gid->raw[0] != 0xff || qp->qp_type != IB_QPT_UD) ++ if (gid->raw[0] != 0xff || qp->qp_type != IB_QPT_UD || ++ lid < be16_to_cpu(IB_MULTICAST_LID_BASE) || ++ lid == be16_to_cpu(IB_LID_PERMISSIVE)) + return -EINVAL; + + ret = qp->device->attach_mcast(qp, gid, lid); +@@ -1535,7 +1537,9 @@ int ib_detach_mcast(struct ib_qp *qp, un + + if (!qp->device->detach_mcast) + return -ENOSYS; +- if (gid->raw[0] != 0xff || qp->qp_type != IB_QPT_UD) ++ if (gid->raw[0] != 0xff || qp->qp_type != IB_QPT_UD || ++ lid < be16_to_cpu(IB_MULTICAST_LID_BASE) || ++ lid == be16_to_cpu(IB_LID_PERMISSIVE)) + return -EINVAL; + + ret = qp->device->detach_mcast(qp, gid, lid); diff --git a/queue-4.11/ib-hfi1-prevent-kernel-qp-post-send-hard-lockups.patch b/queue-4.11/ib-hfi1-prevent-kernel-qp-post-send-hard-lockups.patch new file mode 100644 index 00000000000..309df4234cc --- /dev/null +++ b/queue-4.11/ib-hfi1-prevent-kernel-qp-post-send-hard-lockups.patch @@ -0,0 +1,142 @@ +From b6eac931b9bb2bce4db7032c35b41e5e34ec22a5 Mon Sep 17 00:00:00 2001 +From: Mike Marciniszyn +Date: Sun, 9 Apr 2017 10:16:35 -0700 +Subject: IB/hfi1: Prevent kernel QP post send hard lockups + +From: Mike Marciniszyn + +commit b6eac931b9bb2bce4db7032c35b41e5e34ec22a5 upstream. + +The driver progress routines can call cond_resched() when +a timeslice is exhausted and irqs are enabled. 
+ +If the ULP had been holding a spin lock without disabling irqs and +the post send directly called the progress routine, the cond_resched() +could yield allowing another thread from the same ULP to deadlock +on that same lock. + +Correct by replacing the current hfi1_do_send() calldown with a unique +one for post send and adding an argument to hfi1_do_send() to indicate +that the send engine is running in a thread. If the routine is not +running in a thread, avoid calling cond_resched(). + +Fixes: Commit 831464ce4b74 ("IB/hfi1: Don't call cond_resched in atomic mode when sending packets") +Reviewed-by: Dennis Dalessandro +Signed-off-by: Mike Marciniszyn +Signed-off-by: Dennis Dalessandro +Signed-off-by: Doug Ledford +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/infiniband/hw/hfi1/ruc.c | 26 ++++++++++++++++---------- + drivers/infiniband/hw/hfi1/verbs.c | 4 ++-- + drivers/infiniband/hw/hfi1/verbs.h | 6 ++++-- + 3 files changed, 22 insertions(+), 14 deletions(-) + +--- a/drivers/infiniband/hw/hfi1/ruc.c ++++ b/drivers/infiniband/hw/hfi1/ruc.c +@@ -1,5 +1,5 @@ + /* +- * Copyright(c) 2015, 2016 Intel Corporation. ++ * Copyright(c) 2015 - 2017 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. 
+@@ -784,23 +784,29 @@ void hfi1_make_ruc_header(struct rvt_qp + /* when sending, force a reschedule every one of these periods */ + #define SEND_RESCHED_TIMEOUT (5 * HZ) /* 5s in jiffies */ + ++void hfi1_do_send_from_rvt(struct rvt_qp *qp) ++{ ++ hfi1_do_send(qp, false); ++} ++ + void _hfi1_do_send(struct work_struct *work) + { + struct iowait *wait = container_of(work, struct iowait, iowork); + struct rvt_qp *qp = iowait_to_qp(wait); + +- hfi1_do_send(qp); ++ hfi1_do_send(qp, true); + } + + /** + * hfi1_do_send - perform a send on a QP + * @work: contains a pointer to the QP ++ * @in_thread: true if in a workqueue thread + * + * Process entries in the send work queue until credit or queue is + * exhausted. Only allow one CPU to send a packet per QP. + * Otherwise, two threads could send packets out of order. + */ +-void hfi1_do_send(struct rvt_qp *qp) ++void hfi1_do_send(struct rvt_qp *qp, bool in_thread) + { + struct hfi1_pkt_state ps; + struct hfi1_qp_priv *priv = qp->priv; +@@ -868,8 +874,10 @@ void hfi1_do_send(struct rvt_qp *qp) + qp->s_hdrwords = 0; + /* allow other tasks to run */ + if (unlikely(time_after(jiffies, timeout))) { +- if (workqueue_congested(cpu, +- ps.ppd->hfi1_wq)) { ++ if (!in_thread || ++ workqueue_congested( ++ cpu, ++ ps.ppd->hfi1_wq)) { + spin_lock_irqsave( + &qp->s_lock, + ps.flags); +@@ -882,11 +890,9 @@ void hfi1_do_send(struct rvt_qp *qp) + *ps.ppd->dd->send_schedule); + return; + } +- if (!irqs_disabled()) { +- cond_resched(); +- this_cpu_inc( +- *ps.ppd->dd->send_schedule); +- } ++ cond_resched(); ++ this_cpu_inc( ++ *ps.ppd->dd->send_schedule); + timeout = jiffies + (timeout_int) / 8; + } + spin_lock_irqsave(&qp->s_lock, ps.flags); +--- a/drivers/infiniband/hw/hfi1/verbs.c ++++ b/drivers/infiniband/hw/hfi1/verbs.c +@@ -1,5 +1,5 @@ + /* +- * Copyright(c) 2015, 2016 Intel Corporation. ++ * Copyright(c) 2015 - 2017 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. 
When using or + * redistributing this file, you may do so under either license. +@@ -1751,7 +1751,7 @@ int hfi1_register_ib_device(struct hfi1_ + dd->verbs_dev.rdi.driver_f.qp_priv_free = qp_priv_free; + dd->verbs_dev.rdi.driver_f.free_all_qps = free_all_qps; + dd->verbs_dev.rdi.driver_f.notify_qp_reset = notify_qp_reset; +- dd->verbs_dev.rdi.driver_f.do_send = hfi1_do_send; ++ dd->verbs_dev.rdi.driver_f.do_send = hfi1_do_send_from_rvt; + dd->verbs_dev.rdi.driver_f.schedule_send = hfi1_schedule_send; + dd->verbs_dev.rdi.driver_f.schedule_send_no_lock = _hfi1_schedule_send; + dd->verbs_dev.rdi.driver_f.get_pmtu_from_attr = get_pmtu_from_attr; +--- a/drivers/infiniband/hw/hfi1/verbs.h ++++ b/drivers/infiniband/hw/hfi1/verbs.h +@@ -1,5 +1,5 @@ + /* +- * Copyright(c) 2015, 2016 Intel Corporation. ++ * Copyright(c) 2015 - 2017 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. +@@ -350,7 +350,9 @@ void hfi1_make_ruc_header(struct rvt_qp + + void _hfi1_do_send(struct work_struct *work); + +-void hfi1_do_send(struct rvt_qp *qp); ++void hfi1_do_send_from_rvt(struct rvt_qp *qp); ++ ++void hfi1_do_send(struct rvt_qp *qp, bool in_thread); + + void hfi1_send_complete(struct rvt_qp *qp, struct rvt_swqe *wqe, + enum ib_wc_status status); diff --git a/queue-4.11/ib-ipoib-ibx-failed-to-create-mcg-debug-file.patch b/queue-4.11/ib-ipoib-ibx-failed-to-create-mcg-debug-file.patch new file mode 100644 index 00000000000..41b050a9068 --- /dev/null +++ b/queue-4.11/ib-ipoib-ibx-failed-to-create-mcg-debug-file.patch @@ -0,0 +1,157 @@ +From 771a52584096c45e4565e8aabb596eece9d73d61 Mon Sep 17 00:00:00 2001 +From: Shamir Rabinovitch +Date: Wed, 29 Mar 2017 06:21:59 -0400 +Subject: IB/IPoIB: ibX: failed to create mcg debug file + +From: Shamir Rabinovitch + +commit 771a52584096c45e4565e8aabb596eece9d73d61 upstream. 
+ +When udev renames the netdev devices, ipoib debugfs entries do not +get renamed. As a result, if a subsequent probe of an ipoib device reuses the +name, then creating a debugfs entry for the new device would fail. + +Also, moved ipoib_create_debug_files and ipoib_delete_debug_files as part +of ipoib event handling in order to avoid any race condition between these. + +Fixes: 1732b0ef3b3a ([IPoIB] add path record information in debugfs) +Signed-off-by: Vijay Kumar +Signed-off-by: Shamir Rabinovitch +Reviewed-by: Mark Bloch +Signed-off-by: Doug Ledford +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/infiniband/ulp/ipoib/ipoib_fs.c | 3 ++ + drivers/infiniband/ulp/ipoib/ipoib_main.c | 44 ++++++++++++++++++++++++++---- + drivers/infiniband/ulp/ipoib/ipoib_vlan.c | 3 -- + 3 files changed, 42 insertions(+), 8 deletions(-) + +--- a/drivers/infiniband/ulp/ipoib/ipoib_fs.c ++++ b/drivers/infiniband/ulp/ipoib/ipoib_fs.c +@@ -281,8 +281,11 @@ void ipoib_delete_debug_files(struct net + { + struct ipoib_dev_priv *priv = netdev_priv(dev); + ++ WARN_ONCE(!priv->mcg_dentry, "null mcg debug file\n"); ++ WARN_ONCE(!priv->path_dentry, "null path debug file\n"); + debugfs_remove(priv->mcg_dentry); + debugfs_remove(priv->path_dentry); ++ priv->mcg_dentry = priv->path_dentry = NULL; + } + + int ipoib_register_debugfs(void) +--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c ++++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c +@@ -108,6 +108,33 @@ static struct ib_client ipoib_client = { + .get_net_dev_by_params = ipoib_get_net_dev_by_params, + }; + ++#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG ++static int ipoib_netdev_event(struct notifier_block *this, ++ unsigned long event, void *ptr) ++{ ++ struct netdev_notifier_info *ni = ptr; ++ struct net_device *dev = ni->dev; ++ ++ if (dev->netdev_ops->ndo_open != ipoib_open) ++ return NOTIFY_DONE; ++ ++ switch (event) { ++ case NETDEV_REGISTER: ++ ipoib_create_debug_files(dev); ++ break; ++ case NETDEV_CHANGENAME: ++ ipoib_delete_debug_files(dev); ++ 
ipoib_create_debug_files(dev); ++ break; ++ case NETDEV_UNREGISTER: ++ ipoib_delete_debug_files(dev); ++ break; ++ } ++ ++ return NOTIFY_DONE; ++} ++#endif ++ + int ipoib_open(struct net_device *dev) + { + struct ipoib_dev_priv *priv = netdev_priv(dev); +@@ -1674,8 +1701,6 @@ void ipoib_dev_cleanup(struct net_device + + ASSERT_RTNL(); + +- ipoib_delete_debug_files(dev); +- + /* Delete any child interfaces first */ + list_for_each_entry_safe(cpriv, tcpriv, &priv->child_intfs, list) { + /* Stop GC on child */ +@@ -2090,8 +2115,6 @@ static struct net_device *ipoib_add_port + goto register_failed; + } + +- ipoib_create_debug_files(priv->dev); +- + if (ipoib_cm_add_mode_attr(priv->dev)) + goto sysfs_failed; + if (ipoib_add_pkey_attr(priv->dev)) +@@ -2106,7 +2129,6 @@ static struct net_device *ipoib_add_port + return priv->dev; + + sysfs_failed: +- ipoib_delete_debug_files(priv->dev); + unregister_netdev(priv->dev); + + register_failed: +@@ -2191,6 +2213,12 @@ static void ipoib_remove_one(struct ib_d + kfree(dev_list); + } + ++#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG ++static struct notifier_block ipoib_netdev_notifier = { ++ .notifier_call = ipoib_netdev_event, ++}; ++#endif ++ + static int __init ipoib_init_module(void) + { + int ret; +@@ -2243,6 +2271,9 @@ static int __init ipoib_init_module(void + if (ret) + goto err_client; + ++#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG ++ register_netdevice_notifier(&ipoib_netdev_notifier); ++#endif + return 0; + + err_client: +@@ -2260,6 +2291,9 @@ err_fs: + + static void __exit ipoib_cleanup_module(void) + { ++#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG ++ unregister_netdevice_notifier(&ipoib_netdev_notifier); ++#endif + ipoib_netlink_fini(); + ib_unregister_client(&ipoib_client); + ib_sa_unregister_client(&ipoib_sa_client); +--- a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c ++++ b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c +@@ -86,8 +86,6 @@ int __ipoib_vlan_add(struct ipoib_dev_pr + goto register_failed; + } + +- 
ipoib_create_debug_files(priv->dev); +- + /* RTNL childs don't need proprietary sysfs entries */ + if (type == IPOIB_LEGACY_CHILD) { + if (ipoib_cm_add_mode_attr(priv->dev)) +@@ -108,7 +106,6 @@ int __ipoib_vlan_add(struct ipoib_dev_pr + + sysfs_failed: + result = -ENOMEM; +- ipoib_delete_debug_files(priv->dev); + unregister_netdevice(priv->dev); + + register_failed: diff --git a/queue-4.11/ib-mlx4-fix-ib-device-initialization-error-flow.patch b/queue-4.11/ib-mlx4-fix-ib-device-initialization-error-flow.patch new file mode 100644 index 00000000000..c5c7e32cf5c --- /dev/null +++ b/queue-4.11/ib-mlx4-fix-ib-device-initialization-error-flow.patch @@ -0,0 +1,34 @@ +From 99e68909d5aba1861897fe7afc3306c3c81b6de0 Mon Sep 17 00:00:00 2001 +From: Jack Morgenstein +Date: Tue, 21 Mar 2017 12:57:05 +0200 +Subject: IB/mlx4: Fix ib device initialization error flow + +From: Jack Morgenstein + +commit 99e68909d5aba1861897fe7afc3306c3c81b6de0 upstream. + +In mlx4_ib_add, procedure mlx4_ib_alloc_eqs is called to allocate EQs. + +However, in the mlx4_ib_add error flow, procedure mlx4_ib_free_eqs is not +called to free the allocated EQs. 
+ +Fixes: e605b743f33d ("IB/mlx4: Increase the number of vectors (EQs) available for ULPs") +Signed-off-by: Jack Morgenstein +Signed-off-by: Leon Romanovsky +Signed-off-by: Doug Ledford +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/infiniband/hw/mlx4/main.c | 1 + + 1 file changed, 1 insertion(+) + +--- a/drivers/infiniband/hw/mlx4/main.c ++++ b/drivers/infiniband/hw/mlx4/main.c +@@ -2941,6 +2941,7 @@ err_counter: + mlx4_ib_delete_counters_table(ibdev, &ibdev->counters_table[i]); + + err_map: ++ mlx4_ib_free_eqs(dev, ibdev); + iounmap(ibdev->uar_map); + + err_uar: diff --git a/queue-4.11/ib-mlx4-reduce-sriov-multicast-cleanup-warning-message-to-debug-level.patch b/queue-4.11/ib-mlx4-reduce-sriov-multicast-cleanup-warning-message-to-debug-level.patch new file mode 100644 index 00000000000..90b2cd0c5a5 --- /dev/null +++ b/queue-4.11/ib-mlx4-reduce-sriov-multicast-cleanup-warning-message-to-debug-level.patch @@ -0,0 +1,97 @@ +From fb7a91746af18b2ebf596778b38a709cdbc488d3 Mon Sep 17 00:00:00 2001 +From: Jack Morgenstein +Date: Tue, 21 Mar 2017 12:57:06 +0200 +Subject: IB/mlx4: Reduce SRIOV multicast cleanup warning message to debug level + +From: Jack Morgenstein + +commit fb7a91746af18b2ebf596778b38a709cdbc488d3 upstream. + +A warning message during SRIOV multicast cleanup should have actually been +a debug level message. The condition generating the warning does no harm +and can fill the message log. + +In some cases, during testing, some tests were so intense as to swamp the +message log with these warning messages, causing a stall in the console +message log output task. This stall caused an NMI to be sent to all CPUs +(so that they all dumped their stacks into the message log). +Aside from the message flood causing an NMI, the tests all passed. + +Once the message flood which caused the NMI is removed (by reducing the +warning message to debug level), the NMI no longer occurs. 
+ +Sample message log (console log) output illustrating the flood and +resultant NMI (snippets with comments and modified with ... instead +of hex digits, to satisfy checkpatch.pl): + + _mlx4_ib_mcg_port_cleanup: ... WARNING: group refcount 1!!!... + *** About 4000 almost identical lines in less than one second *** + _mlx4_ib_mcg_port_cleanup: ... WARNING: group refcount 1!!!... + INFO: rcu_sched detected stalls on CPUs/tasks: { 17} (...) + *** { 17} above indicates that CPU 17 was the one that stalled *** + sending NMI to all CPUs: + ... + NMI backtrace for cpu 17 + CPU: 17 PID: 45909 Comm: kworker/17:2 + Hardware name: HP ProLiant DL360p Gen8, BIOS P71 09/08/2013 + Workqueue: events fb_flashcursor + task: ffff880478...... ti: ffff88064e...... task.ti: ffff88064e...... + RIP: 0010:[ffffffff81......] [ffffffff81......] io_serial_in+0x15/0x20 + RSP: 0018:ffff88064e257cb0 EFLAGS: 00000002 + RAX: 0000000000...... RBX: ffffffff81...... RCX: 0000000000...... + RDX: 0000000000...... RSI: 0000000000...... RDI: ffffffff81...... + RBP: ffff88064e...... R08: ffffffff81...... R09: 0000000000...... + R10: 0000000000...... R11: ffff88064e...... R12: 0000000000...... + R13: 0000000000...... R14: ffffffff81...... R15: 0000000000...... + FS: 0000000000......(0000) GS:ffff8804af......(0000) knlGS:000000000000 + CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080...... + CR2: 00007f2a2f...... CR3: 0000000001...... CR4: 0000000000...... + DR0: 0000000000...... DR1: 0000000000...... DR2: 0000000000...... + DR3: 0000000000...... DR6: 00000000ff...... DR7: 0000000000...... + Stack: + ffff88064e...... ffffffff81...... ffffffff81...... 0000000000...... + ffffffff81...... ffff88064e...... ffffffff81...... ffffffff81...... + ffffffff81...... ffff88064e...... ffffffff81...... 0000000000...... + Call Trace: +[] wait_for_xmitr+0x3b/0xa0 +[] serial8250_console_putchar+0x1c/0x30 +[] ? 
serial8250_console_write+0x140/0x140 +[] uart_console_write+0x3a/0x80 +[] serial8250_console_write+0xae/0x140 +[] call_console_drivers.constprop.15+0x91/0xf0 +[] console_unlock+0x3bf/0x400 +[] fb_flashcursor+0x5d/0x140 +[] ? bit_clear+0x120/0x120 +[] process_one_work+0x17b/0x470 +[] worker_thread+0x11b/0x400 +[] ? rescuer_thread+0x400/0x400 +[] kthread+0xcf/0xe0 +[] ? kthread_create_on_node+0x140/0x140 +[] ret_from_fork+0x58/0x90 +[] ? kthread_create_on_node+0x140/0x140 +Code: 48 89 e5 d3 e6 48 63 f6 48 03 77 10 8b 06 5d c3 66 0f 1f 44 00 00 66 66 66 6 + +As indicated in the stack trace above, the console output task got swamped. + +Fixes: b9c5d6a64358 ("IB/mlx4: Add multicast group (MCG) paravirtualization for SR-IOV") +Signed-off-by: Jack Morgenstein +Signed-off-by: Leon Romanovsky +Signed-off-by: Doug Ledford +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/infiniband/hw/mlx4/mcg.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +--- a/drivers/infiniband/hw/mlx4/mcg.c ++++ b/drivers/infiniband/hw/mlx4/mcg.c +@@ -1102,7 +1102,8 @@ static void _mlx4_ib_mcg_port_cleanup(st + while ((p = rb_first(&ctx->mcg_table)) != NULL) { + group = rb_entry(p, struct mcast_group, node); + if (atomic_read(&group->refcount)) +- mcg_warn_group(group, "group refcount %d!!! (pointer %p)\n", atomic_read(&group->refcount), group); ++ mcg_debug_group(group, "group refcount %d!!! 
(pointer %p)\n", ++ atomic_read(&group->refcount), group); + + force_clean_group(group); + } diff --git a/queue-4.11/iov_iter-don-t-revert-iov-buffer-if-csum-error.patch b/queue-4.11/iov_iter-don-t-revert-iov-buffer-if-csum-error.patch new file mode 100644 index 00000000000..f2554e3016d --- /dev/null +++ b/queue-4.11/iov_iter-don-t-revert-iov-buffer-if-csum-error.patch @@ -0,0 +1,61 @@ +From a6a5993243550b09f620941dea741b7421fdf79c Mon Sep 17 00:00:00 2001 +From: Ding Tianhong +Date: Sat, 29 Apr 2017 10:38:48 +0800 +Subject: iov_iter: don't revert iov buffer if csum error + +From: Ding Tianhong + +commit a6a5993243550b09f620941dea741b7421fdf79c upstream. + +The patch 327868212381 (make skb_copy_datagram_msg() et.al. preserve +->msg_iter on error) will revert the iov buffer if copy to iter +failed, but it didn't copy any datagram if the skb_checksum_complete +error, so no need to revert any data at this place. + +v2: Sabrina notice that return -EFAULT when checksum error is not correct + here, it would confuse the caller about the return value, so fix it. + +Fixes: 327868212381 ("make skb_copy_datagram_msg() et.al. 
preserve->msg_iter on error") +Signed-off-by: Ding Tianhong +Acked-by: Al Viro +Signed-off-by: Wei Yongjun +Signed-off-by: Al Viro +Signed-off-by: Greg Kroah-Hartman + +--- + net/core/datagram.c | 13 +++++++------ + 1 file changed, 7 insertions(+), 6 deletions(-) + +--- a/net/core/datagram.c ++++ b/net/core/datagram.c +@@ -760,7 +760,7 @@ int skb_copy_and_csum_datagram_msg(struc + + if (msg_data_left(msg) < chunk) { + if (__skb_checksum_complete(skb)) +- goto csum_error; ++ return -EINVAL; + if (skb_copy_datagram_msg(skb, hlen, msg, chunk)) + goto fault; + } else { +@@ -768,15 +768,16 @@ int skb_copy_and_csum_datagram_msg(struc + if (skb_copy_and_csum_datagram(skb, hlen, &msg->msg_iter, + chunk, &csum)) + goto fault; +- if (csum_fold(csum)) +- goto csum_error; ++ ++ if (csum_fold(csum)) { ++ iov_iter_revert(&msg->msg_iter, chunk); ++ return -EINVAL; ++ } ++ + if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE)) + netdev_rx_csum_fault(skb->dev); + } + return 0; +-csum_error: +- iov_iter_revert(&msg->msg_iter, chunk); +- return -EINVAL; + fault: + return -EFAULT; + } diff --git a/queue-4.11/jbd2-fix-dbench4-performance-regression-for-nobarrier-mounts.patch b/queue-4.11/jbd2-fix-dbench4-performance-regression-for-nobarrier-mounts.patch new file mode 100644 index 00000000000..afcfb87040d --- /dev/null +++ b/queue-4.11/jbd2-fix-dbench4-performance-regression-for-nobarrier-mounts.patch @@ -0,0 +1,42 @@ +From 5052b069acf73866d00077d8bc49983c3ee903e5 Mon Sep 17 00:00:00 2001 +From: Jan Kara +Date: Sat, 29 Apr 2017 21:07:30 -0400 +Subject: jbd2: fix dbench4 performance regression for 'nobarrier' mounts + +From: Jan Kara + +commit 5052b069acf73866d00077d8bc49983c3ee903e5 upstream. + +Commit b685d3d65ac7 "block: treat REQ_FUA and REQ_PREFLUSH as +synchronous" removed REQ_SYNC flag from WRITE_FUA implementation. 
Since +JBD2 strips REQ_FUA and REQ_FLUSH flags from submitted IO when the +filesystem is mounted with nobarrier mount option, journal superblock +writes ended up being async writes after this patch and that caused +heavy performance regression for dbench4 benchmark with high number of +processes. In my test setup with HP RAID array with non-volatile write +cache and 32 GB ram, dbench4 runs with 8 processes regressed by ~25%. + +Fix the problem by making sure journal superblock writes are always +treated as synchronous since they generally block progress of the +journalling machinery and thus the whole filesystem. + +Fixes: b685d3d65ac791406e0dfd8779cc9b3707fea5a3 +Signed-off-by: Jan Kara +Signed-off-by: Theodore Ts'o +Signed-off-by: Greg Kroah-Hartman + +--- + fs/jbd2/journal.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/fs/jbd2/journal.c ++++ b/fs/jbd2/journal.c +@@ -1348,7 +1348,7 @@ static int jbd2_write_superblock(journal + jbd2_superblock_csum_set(journal, sb); + get_bh(bh); + bh->b_end_io = end_buffer_write_sync; +- ret = submit_bh(REQ_OP_WRITE, write_flags, bh); ++ ret = submit_bh(REQ_OP_WRITE, write_flags | REQ_SYNC, bh); + wait_on_buffer(bh); + if (buffer_write_io_error(bh)) { + clear_buffer_write_io_error(bh); diff --git a/queue-4.11/md-raid1-avoid-reusing-a-resync-bio-after-error-handling.patch b/queue-4.11/md-raid1-avoid-reusing-a-resync-bio-after-error-handling.patch new file mode 100644 index 00000000000..cc14f40c5fa --- /dev/null +++ b/queue-4.11/md-raid1-avoid-reusing-a-resync-bio-after-error-handling.patch @@ -0,0 +1,49 @@ +From 0c9d5b127f695818c2c5a3868c1f28ca2969e905 Mon Sep 17 00:00:00 2001 +From: NeilBrown +Date: Thu, 6 Apr 2017 12:06:37 +1000 +Subject: md/raid1: avoid reusing a resync bio after error handling. + +From: NeilBrown + +commit 0c9d5b127f695818c2c5a3868c1f28ca2969e905 upstream. + +fix_sync_read_error() modifies a bio on a newly faulty +device by setting bi_end_io to end_sync_write. 
+This ensures that put_buf() will still call rdev_dec_pending() +as required, but makes sure that subsequent code in +fix_sync_read_error() doesn't try to read from the device. + +Unfortunately this interacts badly with sync_request_write() +which assumes that any bio with bi_end_io set to non-NULL +other than end_sync_read is safe to write to. + +As the device is now faulty it doesn't make sense to write. +As the bio was recently used for a read, it is "dirty" +and not suitable for immediate submission. +In particular, ->bi_next might be non-NULL, which will cause +generic_make_request() to complain. + +Break this interaction by refusing to write to devices +which are marked as Faulty. + +Reported-and-tested-by: Michael Wang +Fixes: 2e52d449bcec ("md/raid1: add failfast handling for reads.") +Signed-off-by: NeilBrown +Signed-off-by: Shaohua Li +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/md/raid1.c | 2 ++ + 1 file changed, 2 insertions(+) + +--- a/drivers/md/raid1.c ++++ b/drivers/md/raid1.c +@@ -2222,6 +2222,8 @@ static void sync_request_write(struct md + (i == r1_bio->read_disk || + !test_bit(MD_RECOVERY_SYNC, &mddev->recovery)))) + continue; ++ if (test_bit(Faulty, &conf->mirrors[i].rdev->flags)) ++ continue; + + bio_set_op_attrs(wbio, REQ_OP_WRITE, 0); + if (test_bit(FailFast, &conf->mirrors[i].rdev->flags)) diff --git a/queue-4.11/mm-prevent-potential-recursive-reclaim-due-to-clearing-pf_memalloc.patch b/queue-4.11/mm-prevent-potential-recursive-reclaim-due-to-clearing-pf_memalloc.patch new file mode 100644 index 00000000000..49b1998273d --- /dev/null +++ b/queue-4.11/mm-prevent-potential-recursive-reclaim-due-to-clearing-pf_memalloc.patch @@ -0,0 +1,82 @@ +From 62be1511b1db8066220b18b7d4da2e6b9fdc69fb Mon Sep 17 00:00:00 2001 +From: Vlastimil Babka +Date: Mon, 8 May 2017 15:59:46 -0700 +Subject: mm: prevent potential recursive reclaim due to clearing PF_MEMALLOC + +From: Vlastimil Babka + +commit 62be1511b1db8066220b18b7d4da2e6b9fdc69fb upstream.
+ +Patch series "more robust PF_MEMALLOC handling" + +This series aims to unify the setting and clearing of PF_MEMALLOC, which +prevents recursive reclaim. There are some places that clear the flag +unconditionally from current->flags, which may result in clearing a +pre-existing flag. This already resulted in a bug report that Patch 1 +fixes (without the new helpers, to make backporting easier). Patch 2 +introduces the new helpers, modelled after existing memalloc_noio_* and +memalloc_nofs_* helpers, and converts mm core to use them. Patches 3 +and 4 convert non-mm code. + +This patch (of 4): + +__alloc_pages_direct_compact() sets PF_MEMALLOC to prevent deadlock +during page migration by lock_page() (see the comment in +__unmap_and_move()). Then it unconditionally clears the flag, which can +clear a pre-existing PF_MEMALLOC flag and result in recursive reclaim. +This was not a problem until commit a8161d1ed609 ("mm, page_alloc: +restructure direct compaction handling in slowpath"), because direct +compaction was called only after direct reclaim, which was skipped when +PF_MEMALLOC flag was set. + +Even now it's only a theoretical issue, as the new callsite of +__alloc_pages_direct_compact() is reached only for costly orders and +when gfp_pfmemalloc_allowed() is true, which means either +__GFP_NOMEMALLOC is in gfp_flags or in_interrupt() is true. There is no +such known context, but let's play it safe and make +__alloc_pages_direct_compact() robust for cases where PF_MEMALLOC is +already set. + +Fixes: a8161d1ed609 ("mm, page_alloc: restructure direct compaction handling in slowpath") +Link: http://lkml.kernel.org/r/20170405074700.29871-2-vbabka@suse.cz +Signed-off-by: Vlastimil Babka +Reported-by: Andrey Ryabinin +Acked-by: Michal Hocko +Acked-by: Hillf Danton +Cc: Mel Gorman +Cc: Johannes Weiner +Cc: Boris Brezillon +Cc: Chris Leech +Cc: "David S.
Miller" +Cc: Eric Dumazet +Cc: Josef Bacik +Cc: Lee Duncan +Cc: Michal Hocko +Cc: Richard Weinberger +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + mm/page_alloc.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -3245,6 +3245,7 @@ __alloc_pages_direct_compact(gfp_t gfp_m + enum compact_priority prio, enum compact_result *compact_result) + { + struct page *page; ++ unsigned int noreclaim_flag = current->flags & PF_MEMALLOC; + + if (!order) + return NULL; +@@ -3252,7 +3253,7 @@ __alloc_pages_direct_compact(gfp_t gfp_m + current->flags |= PF_MEMALLOC; + *compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac, + prio); +- current->flags &= ~PF_MEMALLOC; ++ current->flags = (current->flags & ~PF_MEMALLOC) | noreclaim_flag; + + if (*compact_result <= COMPACT_INACTIVE) + return NULL; diff --git a/queue-4.11/mm-vmscan-fix-io-refault-regression-in-cache-workingset-transition.patch b/queue-4.11/mm-vmscan-fix-io-refault-regression-in-cache-workingset-transition.patch new file mode 100644 index 00000000000..1f355eaddc5 --- /dev/null +++ b/queue-4.11/mm-vmscan-fix-io-refault-regression-in-cache-workingset-transition.patch @@ -0,0 +1,422 @@ +From 2a2e48854d704214dac7546e87ae0e4daa0e61a0 Mon Sep 17 00:00:00 2001 +From: Johannes Weiner +Date: Wed, 3 May 2017 14:55:03 -0700 +Subject: mm: vmscan: fix IO/refault regression in cache workingset transition + +From: Johannes Weiner + +commit 2a2e48854d704214dac7546e87ae0e4daa0e61a0 upstream. + +Since commit 59dc76b0d4df ("mm: vmscan: reduce size of inactive file +list") we noticed bigger IO spikes during changes in cache access +patterns. + +The patch in question shrunk the inactive list size to leave more room +for the current workingset in the presence of streaming IO. 
However, +workingset transitions that previously happened on the inactive list are +now pushed out of memory and incur more refaults to complete. + +This patch disables active list protection when refaults are being +observed. This accelerates workingset transitions, and allows more of +the new set to establish itself from memory, without eating into the +ability to protect the established workingset during stable periods. + +The workloads that were measurably affected for us were hit pretty bad +by it, with refault/majfault rates doubling and tripling during cache +transitions, and the machines sustaining half-hour periods of 100% IO +utilization, where they'd previously have sub-minute peaks at 60-90%. + +Stateful services that handle user data tend to be more conservative +with kernel upgrades. As a result we hit most page cache issues with +some delay, as was the case here. + +The severity seemed to warrant a stable tag. + +Fixes: 59dc76b0d4df ("mm: vmscan: reduce size of inactive file list") +Link: http://lkml.kernel.org/r/20170404220052.27593-1-hannes@cmpxchg.org +Signed-off-by: Johannes Weiner +Cc: Rik van Riel +Cc: Mel Gorman +Cc: Michal Hocko +Cc: Vladimir Davydov +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + include/linux/memcontrol.h | 64 +++++++++++++++++++++++++++++- + include/linux/mmzone.h | 2 + mm/memcontrol.c | 24 +++-------- + mm/vmscan.c | 94 +++++++++++++++++++++++++++++++++++---------- + mm/workingset.c | 7 ++- + 5 files changed, 150 insertions(+), 41 deletions(-) + +--- a/include/linux/memcontrol.h ++++ b/include/linux/memcontrol.h +@@ -56,6 +56,9 @@ enum mem_cgroup_stat_index { + MEMCG_SLAB_RECLAIMABLE, + MEMCG_SLAB_UNRECLAIMABLE, + MEMCG_SOCK, ++ MEMCG_WORKINGSET_REFAULT, ++ MEMCG_WORKINGSET_ACTIVATE, ++ MEMCG_WORKINGSET_NODERECLAIM, + MEMCG_NR_STAT, + }; + +@@ -494,6 +497,40 @@ extern int do_swap_account; + void lock_page_memcg(struct page *page); + void unlock_page_memcg(struct 
page *page); + ++static inline unsigned long mem_cgroup_read_stat(struct mem_cgroup *memcg, ++ enum mem_cgroup_stat_index idx) ++{ ++ long val = 0; ++ int cpu; ++ ++ for_each_possible_cpu(cpu) ++ val += per_cpu(memcg->stat->count[idx], cpu); ++ ++ if (val < 0) ++ val = 0; ++ ++ return val; ++} ++ ++static inline void mem_cgroup_update_stat(struct mem_cgroup *memcg, ++ enum mem_cgroup_stat_index idx, int val) ++{ ++ if (!mem_cgroup_disabled()) ++ this_cpu_add(memcg->stat->count[idx], val); ++} ++ ++static inline void mem_cgroup_inc_stat(struct mem_cgroup *memcg, ++ enum mem_cgroup_stat_index idx) ++{ ++ mem_cgroup_update_stat(memcg, idx, 1); ++} ++ ++static inline void mem_cgroup_dec_stat(struct mem_cgroup *memcg, ++ enum mem_cgroup_stat_index idx) ++{ ++ mem_cgroup_update_stat(memcg, idx, -1); ++} ++ + /** + * mem_cgroup_update_page_stat - update page state statistics + * @page: the page +@@ -508,14 +545,14 @@ void unlock_page_memcg(struct page *page + * if (TestClearPageState(page)) + * mem_cgroup_update_page_stat(page, state, -1); + * unlock_page(page) or unlock_page_memcg(page) ++ * ++ * Kernel pages are an exception to this, since they'll never move. 
+ */ + static inline void mem_cgroup_update_page_stat(struct page *page, + enum mem_cgroup_stat_index idx, int val) + { +- VM_BUG_ON(!(rcu_read_lock_held() || PageLocked(page))); +- + if (page->mem_cgroup) +- this_cpu_add(page->mem_cgroup->stat->count[idx], val); ++ mem_cgroup_update_stat(page->mem_cgroup, idx, val); + } + + static inline void mem_cgroup_inc_page_stat(struct page *page, +@@ -740,6 +777,27 @@ static inline bool mem_cgroup_oom_synchr + return false; + } + ++static inline unsigned long mem_cgroup_read_stat(struct mem_cgroup *memcg, ++ enum mem_cgroup_stat_index idx) ++{ ++ return 0; ++} ++ ++static inline void mem_cgroup_update_stat(struct mem_cgroup *memcg, ++ enum mem_cgroup_stat_index idx, int val) ++{ ++} ++ ++static inline void mem_cgroup_inc_stat(struct mem_cgroup *memcg, ++ enum mem_cgroup_stat_index idx) ++{ ++} ++ ++static inline void mem_cgroup_dec_stat(struct mem_cgroup *memcg, ++ enum mem_cgroup_stat_index idx) ++{ ++} ++ + static inline void mem_cgroup_update_page_stat(struct page *page, + enum mem_cgroup_stat_index idx, + int nr) +--- a/include/linux/mmzone.h ++++ b/include/linux/mmzone.h +@@ -226,6 +226,8 @@ struct lruvec { + struct zone_reclaim_stat reclaim_stat; + /* Evictions & activations on the inactive file list */ + atomic_long_t inactive_age; ++ /* Refaults at the time of last reclaim cycle */ ++ unsigned long refaults; + #ifdef CONFIG_MEMCG + struct pglist_data *pgdat; + #endif +--- a/mm/memcontrol.c ++++ b/mm/memcontrol.c +@@ -568,23 +568,6 @@ mem_cgroup_largest_soft_limit_node(struc + * common workload, threshold and synchronization as vmstat[] should be + * implemented. + */ +-static unsigned long +-mem_cgroup_read_stat(struct mem_cgroup *memcg, enum mem_cgroup_stat_index idx) +-{ +- long val = 0; +- int cpu; +- +- /* Per-cpu values can be negative, use a signed accumulator */ +- for_each_possible_cpu(cpu) +- val += per_cpu(memcg->stat->count[idx], cpu); +- /* +- * Summing races with updates, so val may be negative. 
Avoid exposing +- * transient negative values. +- */ +- if (val < 0) +- val = 0; +- return val; +-} + + static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg, + enum mem_cgroup_events_index idx) +@@ -5237,6 +5220,13 @@ static int memory_stat_show(struct seq_f + seq_printf(m, "pgmajfault %lu\n", + events[MEM_CGROUP_EVENTS_PGMAJFAULT]); + ++ seq_printf(m, "workingset_refault %lu\n", ++ stat[MEMCG_WORKINGSET_REFAULT]); ++ seq_printf(m, "workingset_activate %lu\n", ++ stat[MEMCG_WORKINGSET_ACTIVATE]); ++ seq_printf(m, "workingset_nodereclaim %lu\n", ++ stat[MEMCG_WORKINGSET_NODERECLAIM]); ++ + return 0; + } + +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -2033,6 +2033,8 @@ static void shrink_active_list(unsigned + * Both inactive lists should also be large enough that each inactive + * page has a chance to be referenced again before it is reclaimed. + * ++ * If that fails and refaulting is observed, the inactive list grows. ++ * + * The inactive_ratio is the target ratio of ACTIVE to INACTIVE pages + * on this LRU, maintained by the pageout code. A zone->inactive_ratio + * of 3 means 3:1 or 25% of the pages are kept on the inactive list. 
+@@ -2049,12 +2051,15 @@ static void shrink_active_list(unsigned + * 10TB 320 32GB + */ + static bool inactive_list_is_low(struct lruvec *lruvec, bool file, +- struct scan_control *sc, bool trace) ++ struct mem_cgroup *memcg, ++ struct scan_control *sc, bool actual_reclaim) + { +- unsigned long inactive_ratio; +- unsigned long inactive, active; +- enum lru_list inactive_lru = file * LRU_FILE; + enum lru_list active_lru = file * LRU_FILE + LRU_ACTIVE; ++ struct pglist_data *pgdat = lruvec_pgdat(lruvec); ++ enum lru_list inactive_lru = file * LRU_FILE; ++ unsigned long inactive, active; ++ unsigned long inactive_ratio; ++ unsigned long refaults; + unsigned long gb; + + /* +@@ -2067,27 +2072,43 @@ static bool inactive_list_is_low(struct + inactive = lruvec_lru_size(lruvec, inactive_lru, sc->reclaim_idx); + active = lruvec_lru_size(lruvec, active_lru, sc->reclaim_idx); + +- gb = (inactive + active) >> (30 - PAGE_SHIFT); +- if (gb) +- inactive_ratio = int_sqrt(10 * gb); ++ if (memcg) ++ refaults = mem_cgroup_read_stat(memcg, ++ MEMCG_WORKINGSET_ACTIVATE); + else +- inactive_ratio = 1; ++ refaults = node_page_state(pgdat, WORKINGSET_ACTIVATE); + +- if (trace) +- trace_mm_vmscan_inactive_list_is_low(lruvec_pgdat(lruvec)->node_id, +- sc->reclaim_idx, +- lruvec_lru_size(lruvec, inactive_lru, MAX_NR_ZONES), inactive, +- lruvec_lru_size(lruvec, active_lru, MAX_NR_ZONES), active, +- inactive_ratio, file); ++ /* ++ * When refaults are being observed, it means a new workingset ++ * is being established. Disable active list protection to get ++ * rid of the stale workingset quickly. 
++ */ ++ if (file && actual_reclaim && lruvec->refaults != refaults) { ++ inactive_ratio = 0; ++ } else { ++ gb = (inactive + active) >> (30 - PAGE_SHIFT); ++ if (gb) ++ inactive_ratio = int_sqrt(10 * gb); ++ else ++ inactive_ratio = 1; ++ } ++ ++ if (actual_reclaim) ++ trace_mm_vmscan_inactive_list_is_low(pgdat->node_id, sc->reclaim_idx, ++ lruvec_lru_size(lruvec, inactive_lru, MAX_NR_ZONES), inactive, ++ lruvec_lru_size(lruvec, active_lru, MAX_NR_ZONES), active, ++ inactive_ratio, file); + + return inactive * inactive_ratio < active; + } + + static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, +- struct lruvec *lruvec, struct scan_control *sc) ++ struct lruvec *lruvec, struct mem_cgroup *memcg, ++ struct scan_control *sc) + { + if (is_active_lru(lru)) { +- if (inactive_list_is_low(lruvec, is_file_lru(lru), sc, true)) ++ if (inactive_list_is_low(lruvec, is_file_lru(lru), ++ memcg, sc, true)) + shrink_active_list(nr_to_scan, lruvec, sc, lru); + return 0; + } +@@ -2218,7 +2239,7 @@ static void get_scan_count(struct lruvec + * lruvec even if it has plenty of old anonymous pages unless the + * system is under heavy pressure. + */ +- if (!inactive_list_is_low(lruvec, true, sc, false) && ++ if (!inactive_list_is_low(lruvec, true, memcg, sc, false) && + lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, sc->reclaim_idx) >> sc->priority) { + scan_balance = SCAN_FILE; + goto out; +@@ -2376,7 +2397,7 @@ static void shrink_node_memcg(struct pgl + nr[lru] -= nr_to_scan; + + nr_reclaimed += shrink_list(lru, nr_to_scan, +- lruvec, sc); ++ lruvec, memcg, sc); + } + } + +@@ -2443,7 +2464,7 @@ static void shrink_node_memcg(struct pgl + * Even if we did not try to evict anon pages at all, we want to + * rebalance the anon lru active/inactive ratio. 
+ */ +- if (inactive_list_is_low(lruvec, false, sc, true)) ++ if (inactive_list_is_low(lruvec, false, memcg, sc, true)) + shrink_active_list(SWAP_CLUSTER_MAX, lruvec, + sc, LRU_ACTIVE_ANON); + } +@@ -2752,6 +2773,26 @@ static void shrink_zones(struct zonelist + sc->gfp_mask = orig_mask; + } + ++static void snapshot_refaults(struct mem_cgroup *root_memcg, pg_data_t *pgdat) ++{ ++ struct mem_cgroup *memcg; ++ ++ memcg = mem_cgroup_iter(root_memcg, NULL, NULL); ++ do { ++ unsigned long refaults; ++ struct lruvec *lruvec; ++ ++ if (memcg) ++ refaults = mem_cgroup_read_stat(memcg, ++ MEMCG_WORKINGSET_ACTIVATE); ++ else ++ refaults = node_page_state(pgdat, WORKINGSET_ACTIVATE); ++ ++ lruvec = mem_cgroup_lruvec(pgdat, memcg); ++ lruvec->refaults = refaults; ++ } while ((memcg = mem_cgroup_iter(root_memcg, memcg, NULL))); ++} ++ + /* + * This is the main entry point to direct page reclaim. + * +@@ -2772,6 +2813,9 @@ static unsigned long do_try_to_free_page + struct scan_control *sc) + { + int initial_priority = sc->priority; ++ pg_data_t *last_pgdat; ++ struct zoneref *z; ++ struct zone *zone; + retry: + delayacct_freepages_start(); + +@@ -2798,6 +2842,15 @@ retry: + sc->may_writepage = 1; + } while (--sc->priority >= 0); + ++ last_pgdat = NULL; ++ for_each_zone_zonelist_nodemask(zone, z, zonelist, sc->reclaim_idx, ++ sc->nodemask) { ++ if (zone->zone_pgdat == last_pgdat) ++ continue; ++ last_pgdat = zone->zone_pgdat; ++ snapshot_refaults(sc->target_mem_cgroup, zone->zone_pgdat); ++ } ++ + delayacct_freepages_end(); + + if (sc->nr_reclaimed) +@@ -3076,7 +3129,7 @@ static void age_active_anon(struct pglis + do { + struct lruvec *lruvec = mem_cgroup_lruvec(pgdat, memcg); + +- if (inactive_list_is_low(lruvec, false, sc, true)) ++ if (inactive_list_is_low(lruvec, false, memcg, sc, true)) + shrink_active_list(SWAP_CLUSTER_MAX, lruvec, + sc, LRU_ACTIVE_ANON); + +@@ -3311,6 +3364,7 @@ static int balance_pgdat(pg_data_t *pgda + } while (sc.priority >= 1); + + out: ++ 
snapshot_refaults(NULL, pgdat); + /* + * Return the order kswapd stopped reclaiming at as + * prepare_kswapd_sleep() takes it into account. If another caller +--- a/mm/workingset.c ++++ b/mm/workingset.c +@@ -269,7 +269,6 @@ bool workingset_refault(void *shadow) + lruvec = mem_cgroup_lruvec(pgdat, memcg); + refault = atomic_long_read(&lruvec->inactive_age); + active_file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE, MAX_NR_ZONES); +- rcu_read_unlock(); + + /* + * The unsigned subtraction here gives an accurate distance +@@ -290,11 +289,15 @@ bool workingset_refault(void *shadow) + refault_distance = (refault - eviction) & EVICTION_MASK; + + inc_node_state(pgdat, WORKINGSET_REFAULT); ++ mem_cgroup_inc_stat(memcg, MEMCG_WORKINGSET_REFAULT); + + if (refault_distance <= active_file) { + inc_node_state(pgdat, WORKINGSET_ACTIVATE); ++ mem_cgroup_inc_stat(memcg, MEMCG_WORKINGSET_ACTIVATE); ++ rcu_read_unlock(); + return true; + } ++ rcu_read_unlock(); + return false; + } + +@@ -472,6 +475,8 @@ static enum lru_status shadow_lru_isolat + if (WARN_ON_ONCE(node->exceptional)) + goto out_invalid; + inc_node_state(page_pgdat(virt_to_page(node)), WORKINGSET_NODERECLAIM); ++ mem_cgroup_inc_page_stat(virt_to_page(node), ++ MEMCG_WORKINGSET_NODERECLAIM); + __radix_tree_delete_node(&mapping->page_tree, node, + workingset_update_node, mapping); + diff --git a/queue-4.11/orangefs-clean-up-oversize-xattr-validation.patch b/queue-4.11/orangefs-clean-up-oversize-xattr-validation.patch new file mode 100644 index 00000000000..f694df0422a --- /dev/null +++ b/queue-4.11/orangefs-clean-up-oversize-xattr-validation.patch @@ -0,0 +1,78 @@ +From e675c5ec51fe2554719a7b6bcdbef0a770f2c19b Mon Sep 17 00:00:00 2001 +From: Martin Brandenburg +Date: Tue, 25 Apr 2017 15:37:57 -0400 +Subject: orangefs: clean up oversize xattr validation + +From: Martin Brandenburg + +commit e675c5ec51fe2554719a7b6bcdbef0a770f2c19b upstream. + +Also don't check flags as this has been validated by the VFS already. 
+ +Fix an off-by-one error in the max size checking. + +Stop logging just because userspace wants to write attributes which do +not fit. + +This and the previous commit fix xfstests generic/020. + +Signed-off-by: Martin Brandenburg +Signed-off-by: Mike Marshall +Signed-off-by: Greg Kroah-Hartman + +--- + fs/orangefs/xattr.c | 24 +++++++----------------- + 1 file changed, 7 insertions(+), 17 deletions(-) + +--- a/fs/orangefs/xattr.c ++++ b/fs/orangefs/xattr.c +@@ -76,11 +76,8 @@ ssize_t orangefs_inode_getxattr(struct i + if (S_ISLNK(inode->i_mode)) + return -EOPNOTSUPP; + +- if (strlen(name) >= ORANGEFS_MAX_XATTR_NAMELEN) { +- gossip_err("Invalid key length (%d)\n", +- (int)strlen(name)); ++ if (strlen(name) > ORANGEFS_MAX_XATTR_NAMELEN) + return -EINVAL; +- } + + fsuid = from_kuid(&init_user_ns, current_fsuid()); + fsgid = from_kgid(&init_user_ns, current_fsgid()); +@@ -172,6 +169,9 @@ static int orangefs_inode_removexattr(st + struct orangefs_kernel_op_s *new_op = NULL; + int ret = -ENOMEM; + ++ if (strlen(name) > ORANGEFS_MAX_XATTR_NAMELEN) ++ return -EINVAL; ++ + down_write(&orangefs_inode->xattr_sem); + new_op = op_alloc(ORANGEFS_VFS_OP_REMOVEXATTR); + if (!new_op) +@@ -231,23 +231,13 @@ int orangefs_inode_setxattr(struct inode + "%s: name %s, buffer_size %zd\n", + __func__, name, size); + +- if (size >= ORANGEFS_MAX_XATTR_VALUELEN || +- flags < 0) { +- gossip_err("orangefs_inode_setxattr: bogus values of size(%d), flags(%d)\n", +- (int)size, +- flags); ++ if (size > ORANGEFS_MAX_XATTR_VALUELEN) ++ return -EINVAL; ++ if (strlen(name) > ORANGEFS_MAX_XATTR_NAMELEN) + return -EINVAL; +- } + + internal_flag = convert_to_internal_xattr_flags(flags); + +- if (strlen(name) >= ORANGEFS_MAX_XATTR_NAMELEN) { +- gossip_err +- ("orangefs_inode_setxattr: bogus key size (%d)\n", +- (int)(strlen(name))); +- return -EINVAL; +- } +- + /* This is equivalent to a removexattr */ + if (size == 0 && value == NULL) { + gossip_debug(GOSSIP_XATTR_DEBUG, diff --git 
a/queue-4.11/orangefs-do-not-check-possibly-stale-size-on-truncate.patch b/queue-4.11/orangefs-do-not-check-possibly-stale-size-on-truncate.patch new file mode 100644 index 00000000000..4f9781d596e --- /dev/null +++ b/queue-4.11/orangefs-do-not-check-possibly-stale-size-on-truncate.patch @@ -0,0 +1,43 @@ +From 53950ef541675df48c219a8d665111a0e68dfc2f Mon Sep 17 00:00:00 2001 +From: Martin Brandenburg +Date: Tue, 25 Apr 2017 15:38:04 -0400 +Subject: orangefs: do not check possibly stale size on truncate + +From: Martin Brandenburg + +commit 53950ef541675df48c219a8d665111a0e68dfc2f upstream. + +Let the server figure this out because our size might be out of date or +not present. + +The bug was that + + xfs_io -f -t -c "pread -v 0 100" /mnt/foo + echo "Test" > /mnt/foo + xfs_io -f -t -c "pread -v 0 100" /mnt/foo + +fails because the second truncate did not happen if nothing had +requested the size after the write in echo. Thus i_size was zero (not +present) and the orangefs_setattr thought i_size was zero and there was +nothing to do.
+ +Signed-off-by: Martin Brandenburg +Signed-off-by: Mike Marshall +Signed-off-by: Greg Kroah-Hartman + +--- + fs/orangefs/inode.c | 3 +-- + 1 file changed, 1 insertion(+), 2 deletions(-) + +--- a/fs/orangefs/inode.c ++++ b/fs/orangefs/inode.c +@@ -218,8 +218,7 @@ int orangefs_setattr(struct dentry *dent + if (ret) + goto out; + +- if ((iattr->ia_valid & ATTR_SIZE) && +- iattr->ia_size != i_size_read(inode)) { ++ if (iattr->ia_valid & ATTR_SIZE) { + ret = orangefs_setattr_size(inode, iattr); + if (ret) + goto out; diff --git a/queue-4.11/orangefs-do-not-set-getattr_time-on-orangefs_lookup.patch b/queue-4.11/orangefs-do-not-set-getattr_time-on-orangefs_lookup.patch new file mode 100644 index 00000000000..2299f4ab3e6 --- /dev/null +++ b/queue-4.11/orangefs-do-not-set-getattr_time-on-orangefs_lookup.patch @@ -0,0 +1,31 @@ +From 17930b252cd6f31163c259eaa99dd8aa630fb9ba Mon Sep 17 00:00:00 2001 +From: Martin Brandenburg +Date: Tue, 25 Apr 2017 15:37:58 -0400 +Subject: orangefs: do not set getattr_time on orangefs_lookup + +From: Martin Brandenburg + +commit 17930b252cd6f31163c259eaa99dd8aa630fb9ba upstream. + +Since orangefs_lookup calls orangefs_iget which calls +orangefs_inode_getattr, getattr_time will get set. 
+ +Signed-off-by: Martin Brandenburg +Signed-off-by: Mike Marshall +Signed-off-by: Greg Kroah-Hartman + +--- + fs/orangefs/namei.c | 2 -- + 1 file changed, 2 deletions(-) + +--- a/fs/orangefs/namei.c ++++ b/fs/orangefs/namei.c +@@ -193,8 +193,6 @@ static struct dentry *orangefs_lookup(st + goto out; + } + +- ORANGEFS_I(inode)->getattr_time = jiffies - 1; +- + gossip_debug(GOSSIP_NAME_DEBUG, + "%s:%s:%d " + "Found good inode [%lu] with count [%d]\n", diff --git a/queue-4.11/orangefs-fix-bounds-check-for-listxattr.patch b/queue-4.11/orangefs-fix-bounds-check-for-listxattr.patch new file mode 100644 index 00000000000..ca1b4e6e569 --- /dev/null +++ b/queue-4.11/orangefs-fix-bounds-check-for-listxattr.patch @@ -0,0 +1,28 @@ +From a956af337b9ff25822d9ce1a59c6ed0c09fc14b9 Mon Sep 17 00:00:00 2001 +From: Martin Brandenburg +Date: Tue, 25 Apr 2017 15:37:56 -0400 +Subject: orangefs: fix bounds check for listxattr + +From: Martin Brandenburg + +commit a956af337b9ff25822d9ce1a59c6ed0c09fc14b9 upstream. 
+ +Signed-off-by: Martin Brandenburg +Signed-off-by: Mike Marshall +Signed-off-by: Greg Kroah-Hartman + +--- + fs/orangefs/xattr.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/fs/orangefs/xattr.c ++++ b/fs/orangefs/xattr.c +@@ -358,7 +358,7 @@ try_again: + + returned_count = new_op->downcall.resp.listxattr.returned_count; + if (returned_count < 0 || +- returned_count >= ORANGEFS_MAX_XATTR_LISTLEN) { ++ returned_count > ORANGEFS_MAX_XATTR_LISTLEN) { + gossip_err("%s: impossible value for returned_count:%d:\n", + __func__, + returned_count); diff --git a/queue-4.11/ovl-do-not-set-overlay.opaque-on-non-dir-create.patch b/queue-4.11/ovl-do-not-set-overlay.opaque-on-non-dir-create.patch new file mode 100644 index 00000000000..3c3a34e1664 --- /dev/null +++ b/queue-4.11/ovl-do-not-set-overlay.opaque-on-non-dir-create.patch @@ -0,0 +1,32 @@ +From 4a99f3c83dc493c8ea84693d78cd792839c8aa64 Mon Sep 17 00:00:00 2001 +From: Amir Goldstein +Date: Mon, 24 Apr 2017 22:26:40 +0300 +Subject: ovl: do not set overlay.opaque on non-dir create + +From: Amir Goldstein + +commit 4a99f3c83dc493c8ea84693d78cd792839c8aa64 upstream. + +The optimization for opaque dir create was wrongly being applied +also to non-dir create. 
+ +Fixes: 97c684cc9110 ("ovl: create directories inside merged parent opaque") +Signed-off-by: Amir Goldstein +Signed-off-by: Miklos Szeredi +Signed-off-by: Greg Kroah-Hartman + +--- + fs/overlayfs/dir.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/fs/overlayfs/dir.c ++++ b/fs/overlayfs/dir.c +@@ -210,7 +210,7 @@ static int ovl_create_upper(struct dentr + if (err) + goto out_dput; + +- if (ovl_type_merge(dentry->d_parent)) { ++ if (ovl_type_merge(dentry->d_parent) && d_is_dir(newdentry)) { + /* Setting opaque here is just an optimization, allow to fail */ + ovl_set_opaque(dentry, newdentry); + } diff --git a/queue-4.11/padata-free-correct-variable.patch b/queue-4.11/padata-free-correct-variable.patch new file mode 100644 index 00000000000..5683b2c4397 --- /dev/null +++ b/queue-4.11/padata-free-correct-variable.patch @@ -0,0 +1,32 @@ +From 07a77929ba672d93642a56dc2255dd21e6e2290b Mon Sep 17 00:00:00 2001 +From: "Jason A. Donenfeld" +Date: Fri, 7 Apr 2017 02:33:30 +0200 +Subject: padata: free correct variable + +From: Jason A. Donenfeld + +commit 07a77929ba672d93642a56dc2255dd21e6e2290b upstream. + +The author meant to free the variable that was just allocated, instead +of the one that failed to be allocated, but made a simple typo. This +patch rectifies that. + +Signed-off-by: Jason A. 
Donenfeld +Signed-off-by: Herbert Xu +Signed-off-by: Greg Kroah-Hartman + +--- + kernel/padata.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/kernel/padata.c ++++ b/kernel/padata.c +@@ -354,7 +354,7 @@ static int padata_setup_cpumasks(struct + + cpumask_and(pd->cpumask.pcpu, pcpumask, cpu_online_mask); + if (!alloc_cpumask_var(&pd->cpumask.cbcpu, GFP_KERNEL)) { +- free_cpumask_var(pd->cpumask.cbcpu); ++ free_cpumask_var(pd->cpumask.pcpu); + return -ENOMEM; + } + diff --git a/queue-4.11/perf-annotate-s390-fix-perf-annotate-error-95-4.10-regression.patch b/queue-4.11/perf-annotate-s390-fix-perf-annotate-error-95-4.10-regression.patch new file mode 100644 index 00000000000..2d6b6a29598 --- /dev/null +++ b/queue-4.11/perf-annotate-s390-fix-perf-annotate-error-95-4.10-regression.patch @@ -0,0 +1,47 @@ +From e77852b32d6d4430c68c38aaf73efe5650fa25af Mon Sep 17 00:00:00 2001 +From: Christian Borntraeger +Date: Thu, 6 Apr 2017 09:51:51 +0200 +Subject: perf annotate s390: Fix perf annotate error -95 (4.10 regression) + +From: Christian Borntraeger + +commit e77852b32d6d4430c68c38aaf73efe5650fa25af upstream. + +Since 4.10 perf annotate exits on s390 with an "unknown error -95". +Turns out that commit 786c1b51844d ("perf annotate: Start supporting +cross arch annotation") added a hard requirement for architecture +support when objdump is used but only provided x86 and arm support. +Meanwhile power was added so let's add s390 as well. + +While at it make sure to implement the branch and jump types.
+ +Signed-off-by: Christian Borntraeger +Cc: Andreas Krebbel +Cc: Hendrik Brueckner +Cc: Martin Schwidefsky +Cc: Peter Zijlstra +Cc: linux-s390 +Fixes: 786c1b51844 "perf annotate: Start supporting cross arch annotation" +Link: http://lkml.kernel.org/r/1491465112-45819-2-git-send-email-borntraeger@de.ibm.com +Signed-off-by: Arnaldo Carvalho de Melo +Signed-off-by: Greg Kroah-Hartman + +--- + tools/perf/util/annotate.c | 6 ++++++ + 1 file changed, 6 insertions(+) + +--- a/tools/perf/util/annotate.c ++++ b/tools/perf/util/annotate.c +@@ -136,6 +136,12 @@ static struct arch architectures[] = { + .comment_char = '#', + }, + }, ++ { ++ .name = "s390", ++ .objdump = { ++ .comment_char = '#', ++ }, ++ }, + }; + + static void ins__delete(struct ins_operands *ops) diff --git a/queue-4.11/perf-annotate-s390-implement-jump-types-for-perf-annotate.patch b/queue-4.11/perf-annotate-s390-implement-jump-types-for-perf-annotate.patch new file mode 100644 index 00000000000..7d3c4d5b825 --- /dev/null +++ b/queue-4.11/perf-annotate-s390-implement-jump-types-for-perf-annotate.patch @@ -0,0 +1,77 @@ +From d9f8dfa9baf9b6ae1f2f84f887176558ecde5268 Mon Sep 17 00:00:00 2001 +From: Christian Borntraeger +Date: Thu, 6 Apr 2017 09:51:52 +0200 +Subject: perf annotate s390: Implement jump types for perf annotate + +From: Christian Borntraeger + +commit d9f8dfa9baf9b6ae1f2f84f887176558ecde5268 upstream. + +Implement simple detection for all kind of jumps and branches. 
+ +Signed-off-by: Christian Borntraeger +Cc: Andreas Krebbel +Cc: Hendrik Brueckner +Cc: Martin Schwidefsky +Cc: Peter Zijlstra +Cc: linux-s390 +Link: http://lkml.kernel.org/r/1491465112-45819-3-git-send-email-borntraeger@de.ibm.com +Signed-off-by: Arnaldo Carvalho de Melo +Signed-off-by: Greg Kroah-Hartman + +--- + tools/perf/arch/s390/annotate/instructions.c | 30 +++++++++++++++++++++++++++ + tools/perf/util/annotate.c | 2 + + 2 files changed, 32 insertions(+) + +--- /dev/null ++++ b/tools/perf/arch/s390/annotate/instructions.c +@@ -0,0 +1,30 @@ ++static struct ins_ops *s390__associate_ins_ops(struct arch *arch, const char *name) ++{ ++ struct ins_ops *ops = NULL; ++ ++ /* catch all kind of jumps */ ++ if (strchr(name, 'j') || ++ !strncmp(name, "bct", 3) || ++ !strncmp(name, "br", 2)) ++ ops = &jump_ops; ++ /* override call/returns */ ++ if (!strcmp(name, "bras") || ++ !strcmp(name, "brasl") || ++ !strcmp(name, "basr")) ++ ops = &call_ops; ++ if (!strcmp(name, "br")) ++ ops = &ret_ops; ++ ++ arch__associate_ins_ops(arch, name, ops); ++ return ops; ++} ++ ++static int s390__annotate_init(struct arch *arch) ++{ ++ if (!arch->initialized) { ++ arch->initialized = true; ++ arch->associate_instruction_ops = s390__associate_ins_ops; ++ } ++ ++ return 0; ++} +--- a/tools/perf/util/annotate.c ++++ b/tools/perf/util/annotate.c +@@ -108,6 +108,7 @@ static int arch__associate_ins_ops(struc + #include "arch/arm64/annotate/instructions.c" + #include "arch/x86/annotate/instructions.c" + #include "arch/powerpc/annotate/instructions.c" ++#include "arch/s390/annotate/instructions.c" + + static struct arch architectures[] = { + { +@@ -132,6 +133,7 @@ static struct arch architectures[] = { + }, + { + .name = "s390", ++ .init = s390__annotate_init, + .objdump = { + .comment_char = '#', + }, diff --git a/queue-4.11/perf-auxtrace-fix-no_size-logic-in-addr_filter__resolve_kernel_syms.patch b/queue-4.11/perf-auxtrace-fix-no_size-logic-in-addr_filter__resolve_kernel_syms.patch new file 
mode 100644 index 00000000000..b10d92a84bd --- /dev/null +++ b/queue-4.11/perf-auxtrace-fix-no_size-logic-in-addr_filter__resolve_kernel_syms.patch @@ -0,0 +1,43 @@ +From c3a0bbc7ad7598dec5a204868bdf8a2b1b51df14 Mon Sep 17 00:00:00 2001 +From: Adrian Hunter +Date: Fri, 24 Mar 2017 14:15:52 +0200 +Subject: perf auxtrace: Fix no_size logic in addr_filter__resolve_kernel_syms() + +From: Adrian Hunter + +commit c3a0bbc7ad7598dec5a204868bdf8a2b1b51df14 upstream. + +Address filtering with kernel symbols incorrectly resulted in the error +"Cannot determine size of symbol" because the no_size logic was the wrong +way around. + +Signed-off-by: Adrian Hunter +Tested-by: Andi Kleen +Link: http://lkml.kernel.org/r/1490357752-27942-1-git-send-email-adrian.hunter@intel.com +Signed-off-by: Arnaldo Carvalho de Melo +Signed-off-by: Greg Kroah-Hartman + +--- + tools/perf/util/auxtrace.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +--- a/tools/perf/util/auxtrace.c ++++ b/tools/perf/util/auxtrace.c +@@ -1826,7 +1826,7 @@ static int addr_filter__resolve_kernel_s + filt->addr = start; + if (filt->range && !filt->size && !filt->sym_to) { + filt->size = size; +- no_size = !!size; ++ no_size = !size; + } + } + +@@ -1840,7 +1840,7 @@ static int addr_filter__resolve_kernel_s + if (err) + return err; + filt->size = start + size - filt->addr; +- no_size = !!size; ++ no_size = !size; + } + + /* The very last symbol in kallsyms does not imply a particular size */ diff --git a/queue-4.11/series b/queue-4.11/series index c212fb48adb..c33aa1749b6 100644 --- a/queue-4.11/series +++ b/queue-4.11/series @@ -41,3 +41,39 @@ dm-era-save-spacemap-metadata-root-after-the-pre-commit.patch dm-rq-check-blk_mq_register_dev-return-value-in-dm_mq_init_request_queue.patch dm-thin-fix-a-memory-leak-when-passing-discard-bio-down.patch vfio-type1-remove-locked-page-accounting-workqueue.patch +iov_iter-don-t-revert-iov-buffer-if-csum-error.patch +ib-core-fix-sysfs-registration-error-flow.patch 
+ib-core-fix-kernel-crash-during-fail-to-initialize-device.patch +ib-core-for-multicast-functions-verify-that-lids-are-multicast-lids.patch +ib-ipoib-ibx-failed-to-create-mcg-debug-file.patch +ib-mlx4-fix-ib-device-initialization-error-flow.patch +ib-mlx4-reduce-sriov-multicast-cleanup-warning-message-to-debug-level.patch +ib-hfi1-prevent-kernel-qp-post-send-hard-lockups.patch +perf-auxtrace-fix-no_size-logic-in-addr_filter__resolve_kernel_syms.patch +perf-annotate-s390-fix-perf-annotate-error-95-4.10-regression.patch +perf-annotate-s390-implement-jump-types-for-perf-annotate.patch +jbd2-fix-dbench4-performance-regression-for-nobarrier-mounts.patch +ext4-evict-inline-data-when-writing-to-memory-map.patch +orangefs-fix-bounds-check-for-listxattr.patch +orangefs-clean-up-oversize-xattr-validation.patch +orangefs-do-not-set-getattr_time-on-orangefs_lookup.patch +orangefs-do-not-check-possibly-stale-size-on-truncate.patch +fs-xattr.c-zero-out-memory-copied-to-userspace-in-getxattr.patch +ceph-fix-memory-leak-in-__ceph_setxattr.patch +fs-fix-data-invalidation-in-the-cleancache-during-direct-io.patch +fs-block_dev-always-invalidate-cleancache-in-invalidate_bdev.patch +mm-vmscan-fix-io-refault-regression-in-cache-workingset-transition.patch +mm-prevent-potential-recursive-reclaim-due-to-clearing-pf_memalloc.patch +fix-match_prepath.patch +do-not-return-number-of-bytes-written-for-ioctl-cifs_ioc_copychunk_file.patch +set-unicode-flag-on-cifs-echo-request-to-avoid-mac-error.patch +smb3-work-around-mount-failure-when-using-smb3-dialect-to-macs.patch +cifs-fix-mapping-of-sfm_space-and-sfm_period.patch +cifs-fix-leak-in-fsctl_enum_snaps-response-handling.patch +cifs-fix-cifs_enumerate_snapshots-oops.patch +cifs-fix-oplock-break-deadlocks.patch +cifs-fix-cifs_ioc_get_mnt_info-oops.patch +cifs-add-misssing-sfm-mapping-for-doublequote.patch +ovl-do-not-set-overlay.opaque-on-non-dir-create.patch +padata-free-correct-variable.patch 
+md-raid1-avoid-reusing-a-resync-bio-after-error-handling.patch diff --git a/queue-4.11/set-unicode-flag-on-cifs-echo-request-to-avoid-mac-error.patch b/queue-4.11/set-unicode-flag-on-cifs-echo-request-to-avoid-mac-error.patch new file mode 100644 index 00000000000..d6ac2bee5be --- /dev/null +++ b/queue-4.11/set-unicode-flag-on-cifs-echo-request-to-avoid-mac-error.patch @@ -0,0 +1,34 @@ +From 26c9cb668c7fbf9830516b75d8bee70b699ed449 Mon Sep 17 00:00:00 2001 +From: Steve French +Date: Tue, 2 May 2017 13:35:20 -0500 +Subject: Set unicode flag on cifs echo request to avoid Mac error + +From: Steve French + +commit 26c9cb668c7fbf9830516b75d8bee70b699ed449 upstream. + +Mac requires the unicode flag to be set for cifs, even for the smb +echo request (which doesn't have strings). + +Without this Mac rejects the periodic echo requests (when mounting +with cifs) that we use to check if server is down + +Signed-off-by: Steve French +Signed-off-by: Greg Kroah-Hartman + +--- + fs/cifs/cifssmb.c | 3 +++ + 1 file changed, 3 insertions(+) + +--- a/fs/cifs/cifssmb.c ++++ b/fs/cifs/cifssmb.c +@@ -718,6 +718,9 @@ CIFSSMBEcho(struct TCP_Server_Info *serv + if (rc) + return rc; + ++ if (server->capabilities & CAP_UNICODE) ++ smb->hdr.Flags2 |= SMBFLG2_UNICODE; ++ + /* set up echo request */ + smb->hdr.Tid = 0xffff; + smb->hdr.WordCount = 1; diff --git a/queue-4.11/smb3-work-around-mount-failure-when-using-smb3-dialect-to-macs.patch b/queue-4.11/smb3-work-around-mount-failure-when-using-smb3-dialect-to-macs.patch new file mode 100644 index 00000000000..fa66636bbe5 --- /dev/null +++ b/queue-4.11/smb3-work-around-mount-failure-when-using-smb3-dialect-to-macs.patch @@ -0,0 +1,55 @@ +From 7db0a6efdc3e990cdfd4b24820d010e9eb7890ad Mon Sep 17 00:00:00 2001 +From: Steve French +Date: Wed, 3 May 2017 21:12:20 -0500 +Subject: SMB3: Work around mount failure when using SMB3 dialect to Macs + +From: Steve French + +commit 7db0a6efdc3e990cdfd4b24820d010e9eb7890ad upstream. 
+ +Macs send the maximum buffer size in response on ioctl to validate +negotiate security information, which causes us to fail the mount +as the response buffer is larger than the expected response. + +Changed ioctl response processing to allow for padding of validate +negotiate ioctl response and limit the maximum response size to +maximum buffer size. + +Signed-off-by: Steve French +Signed-off-by: Greg Kroah-Hartman + +--- + fs/cifs/smb2pdu.c | 14 +++++++++++--- + 1 file changed, 11 insertions(+), 3 deletions(-) + +--- a/fs/cifs/smb2pdu.c ++++ b/fs/cifs/smb2pdu.c +@@ -632,8 +632,12 @@ int smb3_validate_negotiate(const unsign + } + + if (rsplen != sizeof(struct validate_negotiate_info_rsp)) { +- cifs_dbg(VFS, "invalid size of protocol negotiate response\n"); +- return -EIO; ++ cifs_dbg(VFS, "invalid protocol negotiate response size: %d\n", ++ rsplen); ++ ++ /* relax check since Mac returns max bufsize allowed on ioctl */ ++ if (rsplen > CIFSMaxBufSize) ++ return -EIO; + } + + /* check validate negotiate info response matches what we got earlier */ +@@ -1853,8 +1857,12 @@ SMB2_ioctl(const unsigned int xid, struc + * than one credit. Windows typically sets this smaller, but for some + * ioctls it may be useful to allow server to send more. No point + * limiting what the server can send as long as fits in one credit ++ * Unfortunately - we can not handle more than CIFS_MAX_MSG_SIZE ++ * (by default, note that it can be overridden to make max larger) ++ * in responses (except for read responses which can be bigger. ++ * We may want to bump this limit up + */ +- req->MaxOutputResponse = cpu_to_le32(0xFF00); /* < 64K uses 1 credit */ ++ req->MaxOutputResponse = cpu_to_le32(CIFSMaxBufSize); + + if (is_fsctl) + req->Flags = cpu_to_le32(SMB2_0_IOCTL_IS_FSCTL);