From 2dda57915876b8311a41770dd324840532e4e05e Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Fri, 2 Oct 2009 10:41:39 -0700 Subject: [PATCH] more .27 patches after the review cycle started --- review-2.6.27/mm-fix-anonymous-dirtying.patch | 43 +++++++++ ...n_vma-lock-acquisition-in-vma_adjust.patch | 89 +++++++++++++++++++ review-2.6.27/series | 2 + 3 files changed, 134 insertions(+) create mode 100644 review-2.6.27/mm-fix-anonymous-dirtying.patch create mode 100644 review-2.6.27/mmap-avoid-unnecessary-anon_vma-lock-acquisition-in-vma_adjust.patch diff --git a/review-2.6.27/mm-fix-anonymous-dirtying.patch b/review-2.6.27/mm-fix-anonymous-dirtying.patch new file mode 100644 index 00000000000..e9f3475f74b --- /dev/null +++ b/review-2.6.27/mm-fix-anonymous-dirtying.patch @@ -0,0 +1,43 @@ +From 1ac0cb5d0e22d5e483f56b2bc12172dec1cf7536 Mon Sep 17 00:00:00 2001 +From: Hugh Dickins +Date: Mon, 21 Sep 2009 17:03:29 -0700 +Subject: mm: fix anonymous dirtying + +From: Hugh Dickins + +commit 1ac0cb5d0e22d5e483f56b2bc12172dec1cf7536 upstream. + +do_anonymous_page() has been wrong to dirty the pte regardless. +If it's not going to mark the pte writable, then it won't help +to mark it dirty here, and clogs up memory with pages which will +need swap instead of being thrown away. Especially wrong if no +overcommit is chosen, and this vma is not yet VM_ACCOUNTed - +we could exceed the limit and OOM despite no overcommit. 
+ +Signed-off-by: Hugh Dickins +Acked-by: Rik van Riel +Cc: KAMEZAWA Hiroyuki +Cc: KOSAKI Motohiro +Cc: Nick Piggin +Cc: Mel Gorman +Cc: Minchan Kim +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + mm/memory.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +--- a/mm/memory.c ++++ b/mm/memory.c +@@ -2422,7 +2422,8 @@ static int do_anonymous_page(struct mm_s + goto oom_free_page; + + entry = mk_pte(page, vma->vm_page_prot); +- entry = maybe_mkwrite(pte_mkdirty(entry), vma); ++ if (vma->vm_flags & VM_WRITE) ++ entry = pte_mkwrite(pte_mkdirty(entry)); + + page_table = pte_offset_map_lock(mm, pmd, address, &ptl); + if (!pte_none(*page_table)) diff --git a/review-2.6.27/mmap-avoid-unnecessary-anon_vma-lock-acquisition-in-vma_adjust.patch b/review-2.6.27/mmap-avoid-unnecessary-anon_vma-lock-acquisition-in-vma_adjust.patch new file mode 100644 index 00000000000..6827309168a --- /dev/null +++ b/review-2.6.27/mmap-avoid-unnecessary-anon_vma-lock-acquisition-in-vma_adjust.patch @@ -0,0 +1,89 @@ +From 252c5f94d944487e9f50ece7942b0fbf659c5c31 Mon Sep 17 00:00:00 2001 +From: Lee Schermerhorn +Date: Mon, 21 Sep 2009 17:03:40 -0700 +Subject: mmap: avoid unnecessary anon_vma lock acquisition in vma_adjust() + +From: Lee Schermerhorn + +commit 252c5f94d944487e9f50ece7942b0fbf659c5c31 upstream. + +We noticed very erratic behavior [throughput] with the AIM7 shared +workload running on recent distro [SLES11] and mainline kernels on an +8-socket, 32-core, 256GB x86_64 platform. On the SLES11 kernel +[2.6.27.19+] with Barcelona processors, as we increased the load [10s of +thousands of tasks], the throughput would vary between two "plateaus"--one +at ~65K jobs per minute and one at ~130K jpm. The simple patch below +causes the results to smooth out at the ~130k plateau. + +But wait, there's more: + +We do not see this behavior on smaller platforms--e.g., 4 socket/8 core. 
+This could be the result of the larger number of cpus on the larger +platform--a scalability issue--or it could be the result of the larger +number of interconnect "hops" between some nodes in this platform and how +the tasks for a given load end up distributed over the nodes' cpus and +memories--a stochastic NUMA effect. + +The variability in the results is less pronounced [on the same platform] +with Shanghai processors and with mainline kernels. With 31-rc6 on +Shanghai processors and 288 file systems on 288 fibre attached storage +volumes, the curves [jpm vs load] are both quite flat with the patched +kernel consistently producing ~3.9% better throughput [~80K jpm vs ~77K +jpm] than the unpatched kernel. + +Profiling indicated that the "slow" runs were incurring high[er] +contention on an anon_vma lock in vma_adjust(), apparently called from the +sbrk() system call. + +The patch: + +A comment in mm/mmap.c:vma_adjust() suggests that we don't really need the +anon_vma lock when we're only adjusting the end of a vma, as is the case +for brk(). The comment questions whether it's worth while to optimize for +this case. Apparently, on the newer, larger x86_64 platforms, with +interesting NUMA topologies, it is worth while--especially considering +that the patch [if correct!] is quite simple. + +We can detect this condition--no overlap with next vma--by noting a NULL +"importer". The anon_vma pointer will also be NULL in this case, so +simply avoid loading vma->anon_vma to avoid the lock. + +However, we DO need to take the anon_vma lock when we're inserting a vma +['insert' non-NULL] even when we have no overlap [NULL "importer"], so we +need to check for 'insert', as well. And Hugh points out that we should +also take it when adjusting vm_start (so that rmap.c can rely upon +vma_address() while it holds the anon_vma lock).
+ +akpm: Zhang Yanmin reports a 150% throughput improvement with aim7, so it +might be -stable material even though this isn't a regression: "this +issue is not clear on dual socket Nehalem machine (2*4*2 cpu), but is +severe on large machine (4*8*2 cpu)" + +[hugh.dickins@tiscali.co.uk: test vma start too] +Signed-off-by: Lee Schermerhorn +Signed-off-by: Hugh Dickins +Cc: Nick Piggin +Cc: Eric Whitney +Tested-by: "Zhang, Yanmin" +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + mm/mmap.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +--- a/mm/mmap.c ++++ b/mm/mmap.c +@@ -575,9 +575,9 @@ again: remove_next = 1 + (end > next-> + + /* + * When changing only vma->vm_end, we don't really need +- * anon_vma lock: but is that case worth optimizing out? ++ * anon_vma lock. + */ +- if (vma->anon_vma) ++ if (vma->anon_vma && (insert || importer || start != vma->vm_start)) + anon_vma = vma->anon_vma; + if (anon_vma) { + spin_lock(&anon_vma->lock); diff --git a/review-2.6.27/series b/review-2.6.27/series index 9386a587c2b..c1eda22d465 100644 --- a/review-2.6.27/series +++ b/review-2.6.27/series @@ -7,3 +7,5 @@ net-ax25-fix-signed-comparison-in-the-sockopt-handler.patch net-make-the-copy-length-in-af_packet-sockopt-handler-unsigned.patch netfilter-bridge-refcount-fix.patch hugetlb-restore-interleaving-of-bootmem-huge-pages.patch +mm-fix-anonymous-dirtying.patch +mmap-avoid-unnecessary-anon_vma-lock-acquisition-in-vma_adjust.patch -- 2.47.2