--- /dev/null
+From 42007efd569f1cf3bfb9a61da60ef6c2179508ca Mon Sep 17 00:00:00 2001
+From: Eric Sandeen <sandeen@redhat.com>
+Date: Sun, 16 May 2010 01:00:00 -0400
+Subject: ext4: check s_log_groups_per_flex in online resize code
+
+From: Eric Sandeen <sandeen@redhat.com>
+
+commit 42007efd569f1cf3bfb9a61da60ef6c2179508ca upstream.
+
+If groups_per_flex < 2, sbi->s_flex_groups[] doesn't get filled out,
+and every other access to this first tests s_log_groups_per_flex;
+same thing needs to happen in resize or we'll wander off into
+a null pointer when doing an online resize of the file system.
+
+Thanks to Christoph Biedl, who came up with the trivial testcase:
+
+# truncate --size 128M fsfile
+# mkfs.ext3 -F fsfile
+# tune2fs -O extents,uninit_bg,dir_index,flex_bg,huge_file,dir_nlink,extra_isize fsfile
+# e2fsck -yDf -C0 fsfile
+# truncate --size 132M fsfile
+# losetup /dev/loop0 fsfile
+# mount /dev/loop0 mnt
+# resize2fs -p /dev/loop0
+
+ https://bugzilla.kernel.org/show_bug.cgi?id=13549
+
+Reported-by: Alessandro Polverini <alex@nibbles.it>
+Test-case-by: Christoph Biedl <bugzilla.kernel.bpeb@manchmal.in-ulm.de>
+Signed-off-by: Eric Sandeen <sandeen@redhat.com>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ fs/ext4/resize.c | 3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+--- a/fs/ext4/resize.c
++++ b/fs/ext4/resize.c
+@@ -935,7 +935,8 @@ int ext4_group_add(struct super_block *s
+ percpu_counter_add(&sbi->s_freeinodes_counter,
+ EXT4_INODES_PER_GROUP(sb));
+
+- if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) {
++ if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG) &&
++ sbi->s_log_groups_per_flex) {
+ ext4_group_t flex_group;
+ flex_group = ext4_flex_group(sbi, input->group);
+ sbi->s_flex_groups[flex_group].free_blocks +=
--- /dev/null
+From dev@jaysonking.com Fri Jun 25 15:33:09 2010
+From: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+Date: Fri, 28 May 2010 14:26:57 -0500
+Subject: ext4: Fix file fragmentation during large file write.
+Cc: "Jayson R. King" <dev@jaysonking.com>, Theodore Ts'o <tytso@mit.edu>, "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>, Dave Chinner <david@fromorbit.com>, Ext4 Developers List <linux-ext4@vger.kernel.org>, Kay Diederichs <Kay.Diederichs@uni-konstanz.de>
+Message-ID: <4C001901.1070207@jaysonking.com>
+
+
+From: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+
+commit 22208dedbd7626e5fc4339c417f8d24cc21f79d7 upstream.
+
+The range_cyclic writeback mode uses the address_space writeback_index
+as the start index for writeback. With delayed allocation we were
+updating writeback_index wrongly resulting in highly fragmented file.
+This patch reduces the number of extents from 4000 to 27 for a
+3GB file.
+
+Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+Signed-off-by: Theodore Ts'o <tytso@mit.edu>
+[dev@jaysonking.com: Some changed lines from the original version of this patch were dropped, since they were rolled up with another cherry-picked patch applied to 2.6.27.y earlier.]
+[dev@jaysonking.com: Use of wbc->no_nrwrite_index_update was dropped, since write_cache_pages_da() implies it.]
+Signed-off-by: Jayson R. King <dev@jaysonking.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ fs/ext4/inode.c | 79 ++++++++++++++++++++++++++++++++------------------------
+ 1 file changed, 46 insertions(+), 33 deletions(-)
+
+--- a/fs/ext4/inode.c
++++ b/fs/ext4/inode.c
+@@ -1721,7 +1721,11 @@ static int mpage_da_submit_io(struct mpa
+
+ pages_skipped = mpd->wbc->pages_skipped;
+ err = mapping->a_ops->writepage(page, mpd->wbc);
+- if (!err)
++ if (!err && (pages_skipped == mpd->wbc->pages_skipped))
++ /*
++ * have successfully written the page
++ * without skipping the same
++ */
+ mpd->pages_written++;
+ /*
+ * In error case, we have to continue because
+@@ -2295,7 +2299,6 @@ static int mpage_da_writepages(struct ad
+ struct writeback_control *wbc,
+ struct mpage_da_data *mpd)
+ {
+- long to_write;
+ int ret;
+
+ if (!mpd->get_block)
+@@ -2310,19 +2313,18 @@ static int mpage_da_writepages(struct ad
+ mpd->pages_written = 0;
+ mpd->retval = 0;
+
+- to_write = wbc->nr_to_write;
+-
+ ret = write_cache_pages_da(mapping, wbc, mpd);
+-
+ /*
+ * Handle last extent of pages
+ */
+ if (!mpd->io_done && mpd->next_page != mpd->first_page) {
+ if (mpage_da_map_blocks(mpd) == 0)
+ mpage_da_submit_io(mpd);
+- }
+
+- wbc->nr_to_write = to_write - mpd->pages_written;
++ mpd->io_done = 1;
++ ret = MPAGE_DA_EXTENT_TAIL;
++ }
++ wbc->nr_to_write -= mpd->pages_written;
+ return ret;
+ }
+
+@@ -2567,11 +2569,13 @@ static int ext4_da_writepages_trans_bloc
+ static int ext4_da_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
+ {
++ pgoff_t index;
++ int range_whole = 0;
+ handle_t *handle = NULL;
+ struct mpage_da_data mpd;
+ struct inode *inode = mapping->host;
++ long pages_written = 0, pages_skipped;
+ int needed_blocks, ret = 0, nr_to_writebump = 0;
+- long to_write, pages_skipped = 0;
+ struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
+
+ /*
+@@ -2605,16 +2609,20 @@ static int ext4_da_writepages(struct add
+ nr_to_writebump = sbi->s_mb_stream_request - wbc->nr_to_write;
+ wbc->nr_to_write = sbi->s_mb_stream_request;
+ }
++ if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
++ range_whole = 1;
+
+-
+- pages_skipped = wbc->pages_skipped;
++ if (wbc->range_cyclic)
++ index = mapping->writeback_index;
++ else
++ index = wbc->range_start >> PAGE_CACHE_SHIFT;
+
+ mpd.wbc = wbc;
+ mpd.inode = mapping->host;
+
+-restart_loop:
+- to_write = wbc->nr_to_write;
+- while (!ret && to_write > 0) {
++ pages_skipped = wbc->pages_skipped;
++
++ while (!ret && wbc->nr_to_write > 0) {
+
+ /*
+ * we insert one extent at a time. So we need
+@@ -2647,46 +2655,51 @@ restart_loop:
+ goto out_writepages;
+ }
+ }
+- to_write -= wbc->nr_to_write;
+-
+ mpd.get_block = ext4_da_get_block_write;
+ ret = mpage_da_writepages(mapping, wbc, &mpd);
+
+ ext4_journal_stop(handle);
+
+- if (mpd.retval == -ENOSPC)
++ if (mpd.retval == -ENOSPC) {
++ /* commit the transaction which would
++ * free blocks released in the transaction
++ * and try again
++ */
+ jbd2_journal_force_commit_nested(sbi->s_journal);
+-
+- /* reset the retry count */
+- if (ret == MPAGE_DA_EXTENT_TAIL) {
++ wbc->pages_skipped = pages_skipped;
++ ret = 0;
++ } else if (ret == MPAGE_DA_EXTENT_TAIL) {
+ /*
+ * got one extent now try with
+ * rest of the pages
+ */
+- to_write += wbc->nr_to_write;
++ pages_written += mpd.pages_written;
++ wbc->pages_skipped = pages_skipped;
+ ret = 0;
+- } else if (wbc->nr_to_write) {
++ } else if (wbc->nr_to_write)
+ /*
+ * There is no more writeout needed
+ * or we requested for a noblocking writeout
+ * and we found the device congested
+ */
+- to_write += wbc->nr_to_write;
+ break;
+- }
+- wbc->nr_to_write = to_write;
+- }
+-
+- if (!wbc->range_cyclic && (pages_skipped != wbc->pages_skipped)) {
+- /* We skipped pages in this loop */
+- wbc->nr_to_write = to_write +
+- wbc->pages_skipped - pages_skipped;
+- wbc->pages_skipped = pages_skipped;
+- goto restart_loop;
+ }
++ if (pages_skipped != wbc->pages_skipped)
++ printk(KERN_EMERG "This should not happen leaving %s "
++ "with nr_to_write = %ld ret = %d\n",
++ __func__, wbc->nr_to_write, ret);
++
++ /* Update index */
++ index += pages_written;
++ if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
++ /*
++ * set the writeback_index so that range_cyclic
++ * mode will write it back later
++ */
++ mapping->writeback_index = index;
+
+ out_writepages:
+- wbc->nr_to_write = to_write - nr_to_writebump;
++ wbc->nr_to_write -= nr_to_writebump;
+ return ret;
+ }
+
--- /dev/null
+From dev@jaysonking.com Fri Jun 25 15:33:41 2010
+From: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+Date: Fri, 28 May 2010 14:27:23 -0500
+Subject: ext4: Implement range_cyclic in ext4_da_writepages instead of write_cache_pages
+To: Stable team <stable@kernel.org>, LKML <linux-kernel@vger.kernel.org>, Greg Kroah-Hartman <gregkh@suse.de>
+Cc: "Theodore Ts'o" <tytso@mit.edu>, Dave Chinner <david@fromorbit.com>, "Jayson R. King" <dev@jaysonking.com>, Kay Diederichs <Kay.Diederichs@uni-konstanz.de>, Ext4 Developers List <linux-ext4@vger.kernel.org>, "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
+Message-ID: <4C00191B.3030702@jaysonking.com>
+
+
+From: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+
+commit 2acf2c261b823d9d9ed954f348b97620297a36b5 upstream.
+
+With delayed allocation we lock the page in write_cache_pages() and
+try to build an in memory extent of contiguous blocks. This is needed
+so that we can get large contiguous blocks request. If range_cyclic
+mode is enabled, write_cache_pages() will loop back to the 0 index if
+no I/O has been done yet, and try to start writing from the beginning
+of the range. That causes an attempt to take the page lock of lower
+index page while holding the page lock of higher index page, which can
+cause a dead lock with another writeback thread.
+
+The solution is to implement the range_cyclic behavior in
+ext4_da_writepages() instead.
+
+http://bugzilla.kernel.org/show_bug.cgi?id=12579
+
+Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Jayson R. King <dev@jaysonking.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ fs/ext4/inode.c | 21 +++++++++++++++++++--
+ 1 file changed, 19 insertions(+), 2 deletions(-)
+
+--- a/fs/ext4/inode.c
++++ b/fs/ext4/inode.c
+@@ -2575,6 +2575,7 @@ static int ext4_da_writepages(struct add
+ struct mpage_da_data mpd;
+ struct inode *inode = mapping->host;
+ long pages_written = 0, pages_skipped;
++ int range_cyclic, cycled = 1, io_done = 0;
+ int needed_blocks, ret = 0, nr_to_writebump = 0;
+ struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
+
+@@ -2612,9 +2613,15 @@ static int ext4_da_writepages(struct add
+ if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
+ range_whole = 1;
+
+- if (wbc->range_cyclic)
++ range_cyclic = wbc->range_cyclic;
++ if (wbc->range_cyclic) {
+ index = mapping->writeback_index;
+- else
++ if (index)
++ cycled = 0;
++ wbc->range_start = index << PAGE_CACHE_SHIFT;
++ wbc->range_end = LLONG_MAX;
++ wbc->range_cyclic = 0;
++ } else
+ index = wbc->range_start >> PAGE_CACHE_SHIFT;
+
+ mpd.wbc = wbc;
+@@ -2622,6 +2629,7 @@ static int ext4_da_writepages(struct add
+
+ pages_skipped = wbc->pages_skipped;
+
++retry:
+ while (!ret && wbc->nr_to_write > 0) {
+
+ /*
+@@ -2676,6 +2684,7 @@ static int ext4_da_writepages(struct add
+ pages_written += mpd.pages_written;
+ wbc->pages_skipped = pages_skipped;
+ ret = 0;
++ io_done = 1;
+ } else if (wbc->nr_to_write)
+ /*
+ * There is no more writeout needed
+@@ -2684,6 +2693,13 @@ static int ext4_da_writepages(struct add
+ */
+ break;
+ }
++ if (!io_done && !cycled) {
++ cycled = 1;
++ index = 0;
++ wbc->range_start = index << PAGE_CACHE_SHIFT;
++ wbc->range_end = mapping->writeback_index - 1;
++ goto retry;
++ }
+ if (pages_skipped != wbc->pages_skipped)
+ printk(KERN_EMERG "This should not happen leaving %s "
+ "with nr_to_write = %ld ret = %d\n",
+@@ -2691,6 +2707,7 @@ static int ext4_da_writepages(struct add
+
+ /* Update index */
+ index += pages_written;
++ wbc->range_cyclic = range_cyclic;
+ if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
+ /*
+ * set the writeback_index so that range_cyclic
--- /dev/null
+From dev@jaysonking.com Fri Jun 25 15:32:26 2010
+From: Theodore Ts'o <tytso@mit.edu>
+Date: Fri, 28 May 2010 14:26:25 -0500
+Subject: ext4: Use our own write_cache_pages()
+Cc: "Theodore Ts'o" <tytso@mit.edu>, Dave Chinner <david@fromorbit.com>, "Jayson R. King" <dev@jaysonking.com>, Kay Diederichs <Kay.Diederichs@uni-konstanz.de>, Ext4 Developers List <linux-ext4@vger.kernel.org>, "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
+Message-ID: <4C0018E1.5060007@jaysonking.com>
+
+
+From: Theodore Ts'o <tytso@mit.edu>
+
+commit 8e48dcfbd7c0892b4cfd064d682cc4c95a29df32 upstream.
+
+Make a copy of write_cache_pages() for the benefit of
+ext4_da_writepages(). This allows us to simplify the code some, and
+will allow us to further customize the code in future patches.
+
+There are some nasty hacks in write_cache_pages(), which Linus has
+(correctly) characterized as vile. I've just copied it into
+write_cache_pages_da(), without trying to clean those bits up lest I
+break something in the ext4's delalloc implementation, which is a bit
+fragile right now. This will allow Dave Chinner to clean up
+write_cache_pages() in mm/page-writeback.c, without worrying about
+breaking ext4. Eventually write_cache_pages_da() will go away when I
+rewrite ext4's delayed allocation and create a general
+ext4_writepages() which is used for all of ext4's writeback. Until
+now this is the lowest risk way to clean up the core
+write_cache_pages() function.
+
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Cc: Dave Chinner <david@fromorbit.com>
+[dev@jaysonking.com: Dropped the hunks which reverted the use of no_nrwrite_index_update, since those lines weren't ever created on 2.6.27.y]
+[dev@jaysonking.com: Copied from 2.6.27.y's version of write_cache_pages(), plus the changes to it from patch "vfs: Add no_nrwrite_index_update writeback control flag"]
+Signed-off-by: Jayson R. King <dev@jaysonking.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ fs/ext4/inode.c | 144 +++++++++++++++++++++++++++++++++++++++++++++++++++-----
+ 1 file changed, 132 insertions(+), 12 deletions(-)
+
+--- a/fs/ext4/inode.c
++++ b/fs/ext4/inode.c
+@@ -2059,17 +2059,6 @@ static int __mpage_da_writepage(struct p
+ struct buffer_head *bh, *head, fake;
+ sector_t logical;
+
+- if (mpd->io_done) {
+- /*
+- * Rest of the page in the page_vec
+- * redirty then and skip then. We will
+- * try to to write them again after
+- * starting a new transaction
+- */
+- redirty_page_for_writepage(wbc, page);
+- unlock_page(page);
+- return MPAGE_DA_EXTENT_TAIL;
+- }
+ /*
+ * Can we merge this page to current extent?
+ */
+@@ -2160,6 +2149,137 @@ static int __mpage_da_writepage(struct p
+ }
+
+ /*
++ * write_cache_pages_da - walk the list of dirty pages of the given
++ * address space and call the callback function (which usually writes
++ * the pages).
++ *
++ * This is a forked version of write_cache_pages(). Differences:
++ * Range cyclic is ignored.
++ * no_nrwrite_index_update is always presumed true
++ */
++static int write_cache_pages_da(struct address_space *mapping,
++ struct writeback_control *wbc,
++ struct mpage_da_data *mpd)
++{
++ struct backing_dev_info *bdi = mapping->backing_dev_info;
++ int ret = 0;
++ int done = 0;
++ struct pagevec pvec;
++ int nr_pages;
++ pgoff_t index;
++ pgoff_t end; /* Inclusive */
++ long nr_to_write = wbc->nr_to_write;
++
++ if (wbc->nonblocking && bdi_write_congested(bdi)) {
++ wbc->encountered_congestion = 1;
++ return 0;
++ }
++
++ pagevec_init(&pvec, 0);
++ index = wbc->range_start >> PAGE_CACHE_SHIFT;
++ end = wbc->range_end >> PAGE_CACHE_SHIFT;
++
++ while (!done && (index <= end)) {
++ int i;
++
++ nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
++ PAGECACHE_TAG_DIRTY,
++ min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
++ if (nr_pages == 0)
++ break;
++
++ for (i = 0; i < nr_pages; i++) {
++ struct page *page = pvec.pages[i];
++
++ /*
++ * At this point, the page may be truncated or
++ * invalidated (changing page->mapping to NULL), or
++ * even swizzled back from swapper_space to tmpfs file
++ * mapping. However, page->index will not change
++ * because we have a reference on the page.
++ */
++ if (page->index > end) {
++ done = 1;
++ break;
++ }
++
++ lock_page(page);
++
++ /*
++ * Page truncated or invalidated. We can freely skip it
++ * then, even for data integrity operations: the page
++ * has disappeared concurrently, so there could be no
++ * real expectation of this data integrity operation
++ * even if there is now a new, dirty page at the same
++ * pagecache address.
++ */
++ if (unlikely(page->mapping != mapping)) {
++continue_unlock:
++ unlock_page(page);
++ continue;
++ }
++
++ if (!PageDirty(page)) {
++ /* someone wrote it for us */
++ goto continue_unlock;
++ }
++
++ if (PageWriteback(page)) {
++ if (wbc->sync_mode != WB_SYNC_NONE)
++ wait_on_page_writeback(page);
++ else
++ goto continue_unlock;
++ }
++
++ BUG_ON(PageWriteback(page));
++ if (!clear_page_dirty_for_io(page))
++ goto continue_unlock;
++
++ ret = __mpage_da_writepage(page, wbc, mpd);
++
++ if (unlikely(ret)) {
++ if (ret == AOP_WRITEPAGE_ACTIVATE) {
++ unlock_page(page);
++ ret = 0;
++ } else {
++ done = 1;
++ break;
++ }
++ }
++
++ if (nr_to_write > 0) {
++ nr_to_write--;
++ if (nr_to_write == 0 &&
++ wbc->sync_mode == WB_SYNC_NONE) {
++ /*
++ * We stop writing back only if we are
++ * not doing integrity sync. In case of
++ * integrity sync we have to keep going
++ * because someone may be concurrently
++ * dirtying pages, and we might have
++ * synced a lot of newly appeared dirty
++ * pages, but have not synced all of the
++ * old dirty pages.
++ */
++ done = 1;
++ break;
++ }
++ }
++
++ if (wbc->nonblocking && bdi_write_congested(bdi)) {
++ wbc->encountered_congestion = 1;
++ done = 1;
++ break;
++ }
++ }
++ pagevec_release(&pvec);
++ cond_resched();
++ }
++ return ret;
++}
++
++
++/*
+ * mpage_da_writepages - walk the list of dirty pages of the given
+ * address space, allocates non-allocated blocks, maps newly-allocated
+ * blocks to existing bhs and issue IO them
+@@ -2192,7 +2312,7 @@ static int mpage_da_writepages(struct ad
+
+ to_write = wbc->nr_to_write;
+
+- ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, mpd);
++ ret = write_cache_pages_da(mapping, wbc, mpd);
+
+ /*
+ * Handle last extent of pages
--- /dev/null
+From cea7daa3589d6b550546a8c8963599f7c1a3ae5c Mon Sep 17 00:00:00 2001
+From: Toshiyuki Okajima <toshi.okajima@jp.fujitsu.com>
+Date: Fri, 30 Apr 2010 14:32:13 +0100
+Subject: KEYS: find_keyring_by_name() can gain access to a freed keyring
+
+From: Toshiyuki Okajima <toshi.okajima@jp.fujitsu.com>
+
+commit cea7daa3589d6b550546a8c8963599f7c1a3ae5c upstream.
+
+find_keyring_by_name() can gain access to a keyring that has had its reference
+count reduced to zero, and is thus ready to be freed. This then allows the
+dead keyring to be brought back into use whilst it is being destroyed.
+
+The following timeline illustrates the process:
+
+|(cleaner) (user)
+|
+| free_user(user) sys_keyctl()
+| | |
+| key_put(user->session_keyring) keyctl_get_keyring_ID()
+| || //=> keyring->usage = 0 |
+| |schedule_work(&key_cleanup_task) lookup_user_key()
+| || |
+| kmem_cache_free(,user) |
+| . |[KEY_SPEC_USER_KEYRING]
+| . install_user_keyrings()
+| . ||
+| key_cleanup() [<= worker_thread()] ||
+| | ||
+| [spin_lock(&key_serial_lock)] |[mutex_lock(&key_user_keyr..mutex)]
+| | ||
+| atomic_read() == 0 ||
+| |{ rb_erase(&key->serial_node,) } ||
+| | ||
+| [spin_unlock(&key_serial_lock)] |find_keyring_by_name()
+| | |||
+| keyring_destroy(keyring) ||[read_lock(&keyring_name_lock)]
+| || |||
+| |[write_lock(&keyring_name_lock)] ||atomic_inc(&keyring->usage)
+| |. ||| *** GET freeing keyring ***
+| |. ||[read_unlock(&keyring_name_lock)]
+| || ||
+| |list_del() |[mutex_unlock(&key_user_k..mutex)]
+| || |
+| |[write_unlock(&keyring_name_lock)] ** INVALID keyring is returned **
+| | .
+| kmem_cache_free(,keyring) .
+| .
+| atomic_dec(&keyring->usage)
+v *** DESTROYED ***
+TIME
+
+If CONFIG_SLUB_DEBUG=y then we may see the following message generated:
+
+ =============================================================================
+ BUG key_jar: Poison overwritten
+ -----------------------------------------------------------------------------
+
+ INFO: 0xffff880197a7e200-0xffff880197a7e200. First byte 0x6a instead of 0x6b
+ INFO: Allocated in key_alloc+0x10b/0x35f age=25 cpu=1 pid=5086
+ INFO: Freed in key_cleanup+0xd0/0xd5 age=12 cpu=1 pid=10
+ INFO: Slab 0xffffea000592cb90 objects=16 used=2 fp=0xffff880197a7e200 flags=0x200000000000c3
+ INFO: Object 0xffff880197a7e200 @offset=512 fp=0xffff880197a7e300
+
+ Bytes b4 0xffff880197a7e1f0: 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a ZZZZZZZZZZZZZZZZ
+ Object 0xffff880197a7e200: 6a 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b jkkkkkkkkkkkkkkk
+
+Alternatively, we may see a system panic happen, such as:
+
+ BUG: unable to handle kernel NULL pointer dereference at 0000000000000001
+ IP: [<ffffffff810e61a3>] kmem_cache_alloc+0x5b/0xe9
+ PGD 6b2b4067 PUD 6a80d067 PMD 0
+ Oops: 0000 [#1] SMP
+ last sysfs file: /sys/kernel/kexec_crash_loaded
+ CPU 1
+ ...
+ Pid: 31245, comm: su Not tainted 2.6.34-rc5-nofixed-nodebug #2 D2089/PRIMERGY
+ RIP: 0010:[<ffffffff810e61a3>] [<ffffffff810e61a3>] kmem_cache_alloc+0x5b/0xe9
+ RSP: 0018:ffff88006af3bd98 EFLAGS: 00010002
+ RAX: 0000000000000000 RBX: 0000000000000001 RCX: ffff88007d19900b
+ RDX: 0000000100000000 RSI: 00000000000080d0 RDI: ffffffff81828430
+ RBP: ffffffff81828430 R08: ffff88000a293750 R09: 0000000000000000
+ R10: 0000000000000001 R11: 0000000000100000 R12: 00000000000080d0
+ R13: 00000000000080d0 R14: 0000000000000296 R15: ffffffff810f20ce
+ FS: 00007f97116bc700(0000) GS:ffff88000a280000(0000) knlGS:0000000000000000
+ CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+ CR2: 0000000000000001 CR3: 000000006a91c000 CR4: 00000000000006e0
+ DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
+ DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
+ Process su (pid: 31245, threadinfo ffff88006af3a000, task ffff8800374414c0)
+ Stack:
+ 0000000512e0958e 0000000000008000 ffff880037f8d180 0000000000000001
+ 0000000000000000 0000000000008001 ffff88007d199000 ffffffff810f20ce
+ 0000000000008000 ffff88006af3be48 0000000000000024 ffffffff810face3
+ Call Trace:
+ [<ffffffff810f20ce>] ? get_empty_filp+0x70/0x12f
+ [<ffffffff810face3>] ? do_filp_open+0x145/0x590
+ [<ffffffff810ce208>] ? tlb_finish_mmu+0x2a/0x33
+ [<ffffffff810ce43c>] ? unmap_region+0xd3/0xe2
+ [<ffffffff810e4393>] ? virt_to_head_page+0x9/0x2d
+ [<ffffffff81103916>] ? alloc_fd+0x69/0x10e
+ [<ffffffff810ef4ed>] ? do_sys_open+0x56/0xfc
+ [<ffffffff81008a02>] ? system_call_fastpath+0x16/0x1b
+ Code: 0f 1f 44 00 00 49 89 c6 fa 66 0f 1f 44 00 00 65 4c 8b 04 25 60 e8 00 00 48 8b 45 00 49 01 c0 49 8b 18 48 85 db 74 0d 48 63 45 18 <48> 8b 04 03 49 89 00 eb 14 4c 89 f9 83 ca ff 44 89 e6 48 89 ef
+ RIP [<ffffffff810e61a3>] kmem_cache_alloc+0x5b/0xe9
+
+The problem is that find_keyring_by_name does not confirm that the keyring is
+valid before accepting it.
+
+Skipping keyrings that have been reduced to a zero count seems the way to go.
+To this end, use atomic_inc_not_zero() to increment the usage count and skip
+the candidate keyring if that returns false.
+
+The following script _may_ cause the bug to happen, but there's no guarantee
+as the window of opportunity is small:
+
+ #!/bin/sh
+ LOOP=100000
+ USER=dummy_user
+ /bin/su -c "exit;" $USER || { /usr/sbin/adduser -m $USER; add=1; }
+ for ((i=0; i<LOOP; i++))
+ do
+ /bin/su -c "echo '$i' > /dev/null" $USER
+ done
+ (( add == 1 )) && /usr/sbin/userdel -r $USER
+ exit
+
+Note that the nominated user must not be in use.
+
+An alternative way of testing this may be:
+
+ for ((i=0; i<100000; i++))
+ do
+ keyctl session foo /bin/true || break
+ done >&/dev/null
+
+as that uses a keyring named "foo" rather than relying on the user and
+user-session named keyrings.
+
+Reported-by: Toshiyuki Okajima <toshi.okajima@jp.fujitsu.com>
+Signed-off-by: David Howells <dhowells@redhat.com>
+Tested-by: Toshiyuki Okajima <toshi.okajima@jp.fujitsu.com>
+Acked-by: Serge Hallyn <serue@us.ibm.com>
+Signed-off-by: James Morris <jmorris@namei.org>
+Cc: Ben Hutchings <ben@decadent.org.uk>
+Cc: Chuck Ebbert <cebbert@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ security/keys/keyring.c | 18 +++++++++---------
+ 1 file changed, 9 insertions(+), 9 deletions(-)
+
+--- a/security/keys/keyring.c
++++ b/security/keys/keyring.c
+@@ -523,9 +523,8 @@ struct key *find_keyring_by_name(const c
+ struct key *keyring;
+ int bucket;
+
+- keyring = ERR_PTR(-EINVAL);
+ if (!name)
+- goto error;
++ return ERR_PTR(-EINVAL);
+
+ bucket = keyring_hash(name);
+
+@@ -549,17 +548,18 @@ struct key *find_keyring_by_name(const c
+ KEY_SEARCH) < 0)
+ continue;
+
+- /* we've got a match */
+- atomic_inc(&keyring->usage);
+- read_unlock(&keyring_name_lock);
+- goto error;
++ /* we've got a match but we might end up racing with
++ * key_cleanup() if the keyring is currently 'dead'
++ * (ie. it has a zero usage count) */
++ if (!atomic_inc_not_zero(&keyring->usage))
++ continue;
++ goto out;
+ }
+ }
+
+- read_unlock(&keyring_name_lock);
+ keyring = ERR_PTR(-ENOKEY);
+-
+- error:
++out:
++ read_unlock(&keyring_name_lock);
+ return keyring;
+
+ } /* end find_keyring_by_name() */
--- /dev/null
+From 4d09ec0f705cf88a12add029c058b53f288cfaa2 Mon Sep 17 00:00:00 2001
+From: Dan Carpenter <error27@gmail.com>
+Date: Mon, 17 May 2010 14:42:35 +0100
+Subject: KEYS: Return more accurate error codes
+
+From: Dan Carpenter <error27@gmail.com>
+
+commit 4d09ec0f705cf88a12add029c058b53f288cfaa2 upstream.
+
+We were using the wrong variable here so the error codes weren't being returned
+properly. The original code returns -ENOKEY.
+
+Signed-off-by: Dan Carpenter <error27@gmail.com>
+Signed-off-by: David Howells <dhowells@redhat.com>
+Signed-off-by: James Morris <jmorris@namei.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+--- a/security/keys/process_keys.c
++++ b/security/keys/process_keys.c
+@@ -508,7 +508,7 @@ try_again:
+
+ ret = install_thread_keyring();
+ if (ret < 0) {
+- key = ERR_PTR(ret);
++ key_ref = ERR_PTR(ret);
+ goto error;
+ }
+ goto reget_creds;
+@@ -526,7 +526,7 @@ try_again:
+
+ ret = install_process_keyring();
+ if (ret < 0) {
+- key = ERR_PTR(ret);
++ key_ref = ERR_PTR(ret);
+ goto error;
+ }
+ goto reget_creds;
+@@ -585,7 +585,7 @@ try_again:
+
+ case KEY_SPEC_GROUP_KEYRING:
+ /* group keyrings are not yet supported */
+- key = ERR_PTR(-EINVAL);
++ key_ref = ERR_PTR(-EINVAL);
+ goto error;
+
+ case KEY_SPEC_REQKEY_AUTH_KEY:
--- /dev/null
+From 550f0d922286556c7ea43974bb7921effb5a5278 Mon Sep 17 00:00:00 2001
+From: Helge Deller <deller@gmx.de>
+Date: Mon, 3 May 2010 20:44:21 +0000
+Subject: parisc: clear floating point exception flag on SIGFPE signal
+
+From: Helge Deller <deller@gmx.de>
+
+commit 550f0d922286556c7ea43974bb7921effb5a5278 upstream.
+
+Clear the floating point exception flag before returning to
+user space. This is needed, else the libc trampoline handler
+may hit the same SIGFPE again while building up a trampoline
+to a signal handler.
+
+Fixes debian bug #559406.
+
+Signed-off-by: Helge Deller <deller@gmx.de>
+Signed-off-by: Kyle McMartin <kyle@mcmartin.ca>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ arch/parisc/math-emu/decode_exc.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/arch/parisc/math-emu/decode_exc.c
++++ b/arch/parisc/math-emu/decode_exc.c
+@@ -342,6 +342,7 @@ decode_fpu(unsigned int Fpu_register[],
+ return SIGNALCODE(SIGFPE, FPE_FLTINV);
+ case DIVISIONBYZEROEXCEPTION:
+ update_trap_counts(Fpu_register, aflags, bflags, trap_counts);
++ Clear_excp_register(exception_index);
+ return SIGNALCODE(SIGFPE, FPE_FLTDIV);
+ case INEXACTEXCEPTION:
+ update_trap_counts(Fpu_register, aflags, bflags, trap_counts);
--- /dev/null
+From 5fa782c2f5ef6c2e4f04d3e228412c9b4a4c8809 Mon Sep 17 00:00:00 2001
+From: Neil Horman <nhorman@tuxdriver.com>
+Date: Wed, 28 Apr 2010 10:30:59 +0000
+Subject: sctp: Fix skb_over_panic resulting from multiple invalid parameter errors (CVE-2010-1173) (v4)
+
+From: Neil Horman <nhorman@tuxdriver.com>
+
+commit 5fa782c2f5ef6c2e4f04d3e228412c9b4a4c8809 upstream.
+
+Ok, version 4
+
+Change Notes:
+1) Minor cleanups, from Vlads notes
+
+Summary:
+
+Hey-
+ Recently, it was reported to me that the kernel could oops in the
+following way:
+
+<5> kernel BUG at net/core/skbuff.c:91!
+<5> invalid operand: 0000 [#1]
+<5> Modules linked in: sctp netconsole nls_utf8 autofs4 sunrpc iptable_filter
+ip_tables cpufreq_powersave parport_pc lp parport vmblock(U) vsock(U) vmci(U)
+vmxnet(U) vmmemctl(U) vmhgfs(U) acpiphp dm_mirror dm_mod button battery ac md5
+ipv6 uhci_hcd ehci_hcd snd_ens1371 snd_rawmidi snd_seq_device snd_pcm_oss
+snd_mixer_oss snd_pcm snd_timer snd_page_alloc snd_ac97_codec snd soundcore
+pcnet32 mii floppy ext3 jbd ata_piix libata mptscsih mptsas mptspi mptscsi
+mptbase sd_mod scsi_mod
+<5> CPU: 0
+<5> EIP: 0060:[<c02bff27>] Not tainted VLI
+<5> EFLAGS: 00010216 (2.6.9-89.0.25.EL)
+<5> EIP is at skb_over_panic+0x1f/0x2d
+<5> eax: 0000002c ebx: c033f461 ecx: c0357d96 edx: c040fd44
+<5> esi: c033f461 edi: df653280 ebp: 00000000 esp: c040fd40
+<5> ds: 007b es: 007b ss: 0068
+<5> Process swapper (pid: 0, threadinfo=c040f000 task=c0370be0)
+<5> Stack: c0357d96 e0c29478 00000084 00000004 c033f461 df653280 d7883180
+e0c2947d
+<5> 00000000 00000080 df653490 00000004 de4f1ac0 de4f1ac0 00000004
+df653490
+<5> 00000001 e0c2877a 08000800 de4f1ac0 df653490 00000000 e0c29d2e
+00000004
+<5> Call Trace:
+<5> [<e0c29478>] sctp_addto_chunk+0xb0/0x128 [sctp]
+<5> [<e0c2947d>] sctp_addto_chunk+0xb5/0x128 [sctp]
+<5> [<e0c2877a>] sctp_init_cause+0x3f/0x47 [sctp]
+<5> [<e0c29d2e>] sctp_process_unk_param+0xac/0xb8 [sctp]
+<5> [<e0c29e90>] sctp_verify_init+0xcc/0x134 [sctp]
+<5> [<e0c20322>] sctp_sf_do_5_1B_init+0x83/0x28e [sctp]
+<5> [<e0c25333>] sctp_do_sm+0x41/0x77 [sctp]
+<5> [<c01555a4>] cache_grow+0x140/0x233
+<5> [<e0c26ba1>] sctp_endpoint_bh_rcv+0xc5/0x108 [sctp]
+<5> [<e0c2b863>] sctp_inq_push+0xe/0x10 [sctp]
+<5> [<e0c34600>] sctp_rcv+0x454/0x509 [sctp]
+<5> [<e084e017>] ipt_hook+0x17/0x1c [iptable_filter]
+<5> [<c02d005e>] nf_iterate+0x40/0x81
+<5> [<c02e0bb9>] ip_local_deliver_finish+0x0/0x151
+<5> [<c02e0c7f>] ip_local_deliver_finish+0xc6/0x151
+<5> [<c02d0362>] nf_hook_slow+0x83/0xb5
+<5> [<c02e0bb2>] ip_local_deliver+0x1a2/0x1a9
+<5> [<c02e0bb9>] ip_local_deliver_finish+0x0/0x151
+<5> [<c02e103e>] ip_rcv+0x334/0x3b4
+<5> [<c02c66fd>] netif_receive_skb+0x320/0x35b
+<5> [<e0a0928b>] init_stall_timer+0x67/0x6a [uhci_hcd]
+<5> [<c02c67a4>] process_backlog+0x6c/0xd9
+<5> [<c02c690f>] net_rx_action+0xfe/0x1f8
+<5> [<c012a7b1>] __do_softirq+0x35/0x79
+<5> [<c0107efb>] handle_IRQ_event+0x0/0x4f
+<5> [<c01094de>] do_softirq+0x46/0x4d
+
+Its an skb_over_panic BUG halt that results from processing an init chunk in
+which too many of its variable length parameters are in some way malformed.
+
+The problem is in sctp_process_unk_param:
+if (NULL == *errp)
+ *errp = sctp_make_op_error_space(asoc, chunk,
+ ntohs(chunk->chunk_hdr->length));
+
+ if (*errp) {
+ sctp_init_cause(*errp, SCTP_ERROR_UNKNOWN_PARAM,
+ WORD_ROUND(ntohs(param.p->length)));
+ sctp_addto_chunk(*errp,
+ WORD_ROUND(ntohs(param.p->length)),
+ param.v);
+
+When we allocate an error chunk, we assume that the worst case scenario requires
+that we have chunk_hdr->length data allocated, which would be correct nominally,
+given that we call sctp_addto_chunk for the violating parameter. Unfortunately,
+we also, in sctp_init_cause insert a sctp_errhdr_t structure into the error
+chunk, so the worst case situation in which all parameters are in violation
+requires chunk_hdr->length+(sizeof(sctp_errhdr_t)*param_count) bytes of data.
+
+The result of this error is that a deliberately malformed packet sent to a
+listening host can cause a remote DOS, described in CVE-2010-1173:
+http://cve.mitre.org/cgi-bin/cvename.cgi?name=2010-1173
+
+I've tested the below fix and confirmed that it fixes the issue. We move to a
+strategy whereby we allocate a fixed size error chunk and ignore errors we don't
+have space to report. Tested by me successfully
+
+Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
+Acked-by: Vlad Yasevich <vladislav.yasevich@hp.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ include/net/sctp/structs.h | 1
+ net/sctp/sm_make_chunk.c | 62 +++++++++++++++++++++++++++++++++++++++++----
+ 2 files changed, 58 insertions(+), 5 deletions(-)
+
+--- a/include/net/sctp/structs.h
++++ b/include/net/sctp/structs.h
+@@ -753,6 +753,7 @@ int sctp_user_addto_chunk(struct sctp_ch
+ struct iovec *data);
+ void sctp_chunk_free(struct sctp_chunk *);
+ void *sctp_addto_chunk(struct sctp_chunk *, int len, const void *data);
++void *sctp_addto_chunk_fixed(struct sctp_chunk *, int len, const void *data);
+ struct sctp_chunk *sctp_chunkify(struct sk_buff *,
+ const struct sctp_association *,
+ struct sock *);
+--- a/net/sctp/sm_make_chunk.c
++++ b/net/sctp/sm_make_chunk.c
+@@ -107,7 +107,7 @@ static const struct sctp_paramhdr prsctp
+ __constant_htons(sizeof(struct sctp_paramhdr)),
+ };
+
+-/* A helper to initialize to initialize an op error inside a
++/* A helper to initialize an op error inside a
+ * provided chunk, as most cause codes will be embedded inside an
+ * abort chunk.
+ */
+@@ -124,6 +124,29 @@ void sctp_init_cause(struct sctp_chunk
+ chunk->subh.err_hdr = sctp_addto_chunk(chunk, sizeof(sctp_errhdr_t), &err);
+ }
+
++/* A helper to initialize an op error inside a
++ * provided chunk, as most cause codes will be embedded inside an
++ * abort chunk. Differs from sctp_init_cause in that it won't oops
++ * if there isn't enough space in the op error chunk
++ */
++int sctp_init_cause_fixed(struct sctp_chunk *chunk, __be16 cause_code,
++ size_t paylen)
++{
++ sctp_errhdr_t err;
++ __u16 len;
++
++ /* Cause code constants are now defined in network order. */
++ err.cause = cause_code;
++ len = sizeof(sctp_errhdr_t) + paylen;
++ err.length = htons(len);
++
++ if (skb_tailroom(chunk->skb) > len)
++ return -ENOSPC;
++ chunk->subh.err_hdr = sctp_addto_chunk_fixed(chunk,
++ sizeof(sctp_errhdr_t),
++ &err);
++ return 0;
++}
+ /* 3.3.2 Initiation (INIT) (1)
+ *
+ * This chunk is used to initiate a SCTP association between two
+@@ -1114,6 +1137,24 @@ nodata:
+ return retval;
+ }
+
++/* Create an Operation Error chunk of a fixed size,
++ * specifically, max(asoc->pathmtu, SCTP_DEFAULT_MAXSEGMENT)
++ * This is a helper function to allocate an error chunk for
++ * for those invalid parameter codes in which we may not want
++ * to report all the errors, if the incomming chunk is large
++ */
++static inline struct sctp_chunk *sctp_make_op_error_fixed(
++ const struct sctp_association *asoc,
++ const struct sctp_chunk *chunk)
++{
++ size_t size = asoc ? asoc->pathmtu : 0;
++
++ if (!size)
++ size = SCTP_DEFAULT_MAXSEGMENT;
++
++ return sctp_make_op_error_space(asoc, chunk, size);
++}
++
+ /* Create an Operation Error chunk. */
+ struct sctp_chunk *sctp_make_op_error(const struct sctp_association *asoc,
+ const struct sctp_chunk *chunk,
+@@ -1354,6 +1395,18 @@ void *sctp_addto_chunk(struct sctp_chunk
+ return target;
+ }
+
++/* Append bytes to the end of a chunk. Returns NULL if there isn't sufficient
++ * space in the chunk
++ */
++void *sctp_addto_chunk_fixed(struct sctp_chunk *chunk,
++ int len, const void *data)
++{
++ if (skb_tailroom(chunk->skb) > len)
++ return sctp_addto_chunk(chunk, len, data);
++ else
++ return NULL;
++}
++
+ /* Append bytes from user space to the end of a chunk. Will panic if
+ * chunk is not big enough.
+ * Returns a kernel err value.
+@@ -1957,13 +2010,12 @@ static sctp_ierror_t sctp_process_unk_pa
+ * returning multiple unknown parameters.
+ */
+ if (NULL == *errp)
+- *errp = sctp_make_op_error_space(asoc, chunk,
+- ntohs(chunk->chunk_hdr->length));
++ *errp = sctp_make_op_error_fixed(asoc, chunk);
+
+ if (*errp) {
+- sctp_init_cause(*errp, SCTP_ERROR_UNKNOWN_PARAM,
++ sctp_init_cause_fixed(*errp, SCTP_ERROR_UNKNOWN_PARAM,
+ WORD_ROUND(ntohs(param.p->length)));
+- sctp_addto_chunk(*errp,
++ sctp_addto_chunk_fixed(*errp,
+ WORD_ROUND(ntohs(param.p->length)),
+ param.v);
+ } else {
do_generic_file_read-clear-page-errors-when-issuing-a-fresh-read-of-the-page.patch
ipmi-handle-run_to_completion-properly-in-deliver_recv_msg.patch
gconfig-fix-build-failure-on-fedora-13.patch
+ext4-check-s_log_groups_per_flex-in-online-resize-code.patch
+ext4-use-our-own-write_cache_pages.patch
+ext4-fix-file-fragmentation-during-large-file-write.patch
+ext4-implement-range_cyclic-in-ext4_da_writepages-instead-of-write_cache_pages.patch
+sctp-fix-skb_over_panic-resulting-from-multiple-invalid-parameter-errors-cve-2010-1173-v4.patch
+vfs-add-nofollow-flag-to-umount-2.patch
+tipc-fix-oops-on-send-prior-to-entering-networked-mode-v3.patch
+parisc-clear-floating-point-exception-flag-on-sigfpe-signal.patch
+keys-return-more-accurate-error-codes.patch
+keys-find_keyring_by_name-can-gain-access-to-a-freed-keyring.patch
--- /dev/null
+From d0021b252eaf65ca07ed14f0d66425dd9ccab9a6 Mon Sep 17 00:00:00 2001
+From: Neil Horman <nhorman@tuxdriver.com>
+Date: Wed, 3 Mar 2010 08:31:23 +0000
+Subject: tipc: Fix oops on send prior to entering networked mode (v3)
+
+From: Neil Horman <nhorman@tuxdriver.com>
+
+commit d0021b252eaf65ca07ed14f0d66425dd9ccab9a6 upstream.
+
+Fix TIPC to disallow sending to remote addresses prior to entering NET_MODE
+
+user programs can oops the kernel by sending datagrams via AF_TIPC prior to
+entering networked mode. The following backtrace has been observed:
+
+ID: 13459 TASK: ffff810014640040 CPU: 0 COMMAND: "tipc-client"
+[exception RIP: tipc_node_select_next_hop+90]
+RIP: ffffffff8869d3c3 RSP: ffff81002d9a5ab8 RFLAGS: 00010202
+RAX: 0000000000000001 RBX: 0000000000000001 RCX: 0000000000000001
+RDX: 0000000000000000 RSI: 0000000000000001 RDI: 0000000001001001
+RBP: 0000000001001001 R8: 0074736575716552 R9: 0000000000000000
+R10: ffff81003fbd0680 R11: 00000000000000c8 R12: 0000000000000008
+R13: 0000000000000001 R14: 0000000000000001 R15: ffff810015c6ca00
+ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018
+RIP: 0000003cbd8d49a3 RSP: 00007fffc84e0be8 RFLAGS: 00010206
+RAX: 000000000000002c RBX: ffffffff8005d116 RCX: 0000000000000000
+RDX: 0000000000000008 RSI: 00007fffc84e0c00 RDI: 0000000000000003
+RBP: 0000000000000000 R8: 00007fffc84e0c10 R9: 0000000000000010
+R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000
+R13: 00007fffc84e0d10 R14: 0000000000000000 R15: 00007fffc84e0c30
+ORIG_RAX: 000000000000002c CS: 0033 SS: 002b
+
+What happens is that, when the tipc module in inserted it enters a standalone
+node mode in which communication to its own address is allowed <0.0.0> but not
+to other addresses, since the appropriate data structures have not been
+allocated yet (specifically the tipc_net pointer). There is nothing stopping a
+client from trying to send such a message however, and if that happens, we
+attempt to dereference tipc_net.zones while the pointer is still NULL, and
+explode. The fix is pretty straightforward. Since these oopses all arise from
+the dereference of global pointers prior to their assignment to allocated
+values, and since these allocations are small (about 2k total), lets convert
+these pointers to static arrays of the appropriate size. All the accesses to
+these bits consider 0/NULL to be a non match when searching, so all the lookups
+still work properly, and there is no longer a chance of a bad dereference
+anywhere. As a bonus, this lets us eliminate the setup/teardown routines for
+those pointers, and eliminates the need to perform any locking around them to
+prevent access while they're being allocated/freed.
+
+I've updated the tipc_net structure to behave this way to fix the exact reported
+problem, and also fixed up the tipc_bearers and media_list arrays to fix an
+obvious similar problem that arises from issuing tipc-config commands to
+manipulate bearers/links prior to entering networked mode
+
+I've tested this for a few hours by running the sanity tests and stress test
+with the tipcutils suite, and nothing has fallen over. There have been a few
+lockdep warnings, but those were there before, and can be addressed later, as
+they didn't actually result in any deadlock.
+
+Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
+CC: Allan Stephens <allan.stephens@windriver.com>
+CC: David S. Miller <davem@davemloft.net>
+CC: tipc-discussion@lists.sourceforge.net
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ net/tipc/bearer.c | 37 ++++++-------------------------------
+ net/tipc/bearer.h | 2 +-
+ net/tipc/net.c | 25 ++++---------------------
+ 3 files changed, 11 insertions(+), 53 deletions(-)
+
+--- a/net/tipc/bearer.c
++++ b/net/tipc/bearer.c
+@@ -45,10 +45,10 @@
+
+ #define MAX_ADDR_STR 32
+
+-static struct media *media_list = NULL;
++static struct media media_list[MAX_MEDIA];
+ static u32 media_count = 0;
+
+-struct bearer *tipc_bearers = NULL;
++struct bearer tipc_bearers[MAX_BEARERS];
+
+ /**
+ * media_name_valid - validate media name
+@@ -108,9 +108,11 @@ int tipc_register_media(u32 media_type,
+ int res = -EINVAL;
+
+ write_lock_bh(&tipc_net_lock);
+- if (!media_list)
+- goto exit;
+
++ if (tipc_mode != TIPC_NET_MODE) {
++ warn("Media <%s> rejected, not in networked mode yet\n", name);
++ goto exit;
++ }
+ if (!media_name_valid(name)) {
+ warn("Media <%s> rejected, illegal name\n", name);
+ goto exit;
+@@ -660,33 +662,10 @@ int tipc_disable_bearer(const char *name
+
+
+
+-int tipc_bearer_init(void)
+-{
+- int res;
+-
+- write_lock_bh(&tipc_net_lock);
+- tipc_bearers = kcalloc(MAX_BEARERS, sizeof(struct bearer), GFP_ATOMIC);
+- media_list = kcalloc(MAX_MEDIA, sizeof(struct media), GFP_ATOMIC);
+- if (tipc_bearers && media_list) {
+- res = 0;
+- } else {
+- kfree(tipc_bearers);
+- kfree(media_list);
+- tipc_bearers = NULL;
+- media_list = NULL;
+- res = -ENOMEM;
+- }
+- write_unlock_bh(&tipc_net_lock);
+- return res;
+-}
+-
+ void tipc_bearer_stop(void)
+ {
+ u32 i;
+
+- if (!tipc_bearers)
+- return;
+-
+ for (i = 0; i < MAX_BEARERS; i++) {
+ if (tipc_bearers[i].active)
+ tipc_bearers[i].publ.blocked = 1;
+@@ -695,10 +674,6 @@ void tipc_bearer_stop(void)
+ if (tipc_bearers[i].active)
+ bearer_disable(tipc_bearers[i].publ.name);
+ }
+- kfree(tipc_bearers);
+- kfree(media_list);
+- tipc_bearers = NULL;
+- media_list = NULL;
+ media_count = 0;
+ }
+
+--- a/net/tipc/bearer.h
++++ b/net/tipc/bearer.h
+@@ -114,7 +114,7 @@ struct bearer_name {
+
+ struct link;
+
+-extern struct bearer *tipc_bearers;
++extern struct bearer tipc_bearers[];
+
+ void tipc_media_addr_printf(struct print_buf *pb, struct tipc_media_addr *a);
+ struct sk_buff *tipc_media_get_names(void);
+--- a/net/tipc/net.c
++++ b/net/tipc/net.c
+@@ -116,7 +116,8 @@
+ */
+
+ DEFINE_RWLOCK(tipc_net_lock);
+-struct network tipc_net = { NULL };
++struct _zone *tipc_zones[256] = { NULL, };
++struct network tipc_net = { tipc_zones };
+
+ struct tipc_node *tipc_net_select_remote_node(u32 addr, u32 ref)
+ {
+@@ -158,28 +159,12 @@ void tipc_net_send_external_routes(u32 d
+ }
+ }
+
+-static int net_init(void)
+-{
+- memset(&tipc_net, 0, sizeof(tipc_net));
+- tipc_net.zones = kcalloc(tipc_max_zones + 1, sizeof(struct _zone *), GFP_ATOMIC);
+- if (!tipc_net.zones) {
+- return -ENOMEM;
+- }
+- return 0;
+-}
+-
+ static void net_stop(void)
+ {
+ u32 z_num;
+
+- if (!tipc_net.zones)
+- return;
+-
+- for (z_num = 1; z_num <= tipc_max_zones; z_num++) {
++ for (z_num = 1; z_num <= tipc_max_zones; z_num++)
+ tipc_zone_delete(tipc_net.zones[z_num]);
+- }
+- kfree(tipc_net.zones);
+- tipc_net.zones = NULL;
+ }
+
+ static void net_route_named_msg(struct sk_buff *buf)
+@@ -282,9 +267,7 @@ int tipc_net_start(u32 addr)
+ tipc_named_reinit();
+ tipc_port_reinit();
+
+- if ((res = tipc_bearer_init()) ||
+- (res = net_init()) ||
+- (res = tipc_cltr_init()) ||
++ if ((res = tipc_cltr_init()) ||
+ (res = tipc_bclink_init())) {
+ return res;
+ }
--- /dev/null
+From db1f05bb85d7966b9176e293f3ceead1cb8b5d79 Mon Sep 17 00:00:00 2001
+From: Miklos Szeredi <mszeredi@suse.cz>
+Date: Wed, 10 Feb 2010 12:15:53 +0100
+Subject: vfs: add NOFOLLOW flag to umount(2)
+
+From: Miklos Szeredi <mszeredi@suse.cz>
+
+commit db1f05bb85d7966b9176e293f3ceead1cb8b5d79 upstream.
+
+Add a new UMOUNT_NOFOLLOW flag to umount(2). This is needed to prevent
+symlink attacks in unprivileged unmounts (fuse, samba, ncpfs).
+
+Additionally, return -EINVAL if an unknown flag is used (and specify
+an explicitly unused flag: UMOUNT_UNUSED). This makes it possible for
+the caller to determine if a flag is supported or not.
+
+CC: Eugene Teo <eugene@redhat.com>
+CC: Michael Kerrisk <mtk.manpages@gmail.com>
+Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
+Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ fs/namespace.c | 9 ++++++++-
+ include/linux/fs.h | 2 ++
+ 2 files changed, 10 insertions(+), 1 deletion(-)
+
+--- a/fs/namespace.c
++++ b/fs/namespace.c
+@@ -1132,8 +1132,15 @@ SYSCALL_DEFINE2(umount, char __user *, n
+ {
+ struct path path;
+ int retval;
++ int lookup_flags = 0;
+
+- retval = user_path(name, &path);
++ if (flags & ~(MNT_FORCE | MNT_DETACH | MNT_EXPIRE | UMOUNT_NOFOLLOW))
++ return -EINVAL;
++
++ if (!(flags & UMOUNT_NOFOLLOW))
++ lookup_flags |= LOOKUP_FOLLOW;
++
++ retval = user_path_at(AT_FDCWD, name, lookup_flags, &path);
+ if (retval)
+ goto out;
+ retval = -EINVAL;
+--- a/include/linux/fs.h
++++ b/include/linux/fs.h
+@@ -1061,6 +1061,8 @@ extern int send_sigurg(struct fown_struc
+ #define MNT_FORCE 0x00000001 /* Attempt to forcibily umount */
+ #define MNT_DETACH 0x00000002 /* Just detach from the tree */
+ #define MNT_EXPIRE 0x00000004 /* Mark for expiry */
++#define UMOUNT_NOFOLLOW 0x00000008 /* Don't follow symlink on umount */
++#define UMOUNT_UNUSED 0x80000000 /* Flag guaranteed to be unused */
+
+ extern struct list_head super_blocks;
+ extern spinlock_t sb_lock;