2.6.32 patches
author    Greg Kroah-Hartman <gregkh@suse.de>
          Wed, 11 Aug 2010 23:00:30 +0000 (16:00 -0700)
committer Greg Kroah-Hartman <gregkh@suse.de>
          Wed, 11 Aug 2010 23:00:30 +0000 (16:00 -0700)
69 files changed:
queue-2.6.32/acpi-fix-regression-where-_ppc-is-not-read-at-boot-even-when-ignore_ppc-0.patch [new file with mode: 0644]
queue-2.6.32/aic79xx-check-for-non-null-scb-in-ahd_handle_nonpkt_busfree.patch [new file with mode: 0644]
queue-2.6.32/btrfs-add-btrfs_duplicate_item.patch [new file with mode: 0644]
queue-2.6.32/btrfs-add-delayed-iput.patch [new file with mode: 0644]
queue-2.6.32/btrfs-align-offsets-for-btrfs_ordered_update_i_size.patch [new file with mode: 0644]
queue-2.6.32/btrfs-apply-updated-fallocate-i_size-fix.patch [new file with mode: 0644]
queue-2.6.32/btrfs-avoid-orphan-inodes-cleanup-during-committing-transaction.patch [new file with mode: 0644]
queue-2.6.32/btrfs-avoid-orphan-inodes-cleanup-while-replaying-log.patch [new file with mode: 0644]
queue-2.6.32/btrfs-avoid-superfluous-tree-log-writeout.patch [new file with mode: 0644]
queue-2.6.32/btrfs-btrfs_mark_extent_written-uses-the-wrong-slot.patch [new file with mode: 0644]
queue-2.6.32/btrfs-check-return-value-of-open_bdev_exclusive-properly.patch [new file with mode: 0644]
queue-2.6.32/btrfs-check-total-number-of-devices-when-removing-missing.patch [new file with mode: 0644]
queue-2.6.32/btrfs-deal-with-null-acl-sent-to-btrfs_set_acl.patch [new file with mode: 0644]
queue-2.6.32/btrfs-deny-sys_link-across-subvolumes.patch [new file with mode: 0644]
queue-2.6.32/btrfs-do-not-mark-the-chunk-as-readonly-if-in-degraded-mode.patch [new file with mode: 0644]
queue-2.6.32/btrfs-do-not-try-and-lookup-the-file-extent-when-finishing-ordered-io.patch [new file with mode: 0644]
queue-2.6.32/btrfs-don-t-add-extent-0-to-the-free-space-cache-v2.patch [new file with mode: 0644]
queue-2.6.32/btrfs-fail-mount-on-bad-mount-options.patch [new file with mode: 0644]
queue-2.6.32/btrfs-fix-a-memory-leak-in-btrfs_init_acl.patch [new file with mode: 0644]
queue-2.6.32/btrfs-fix-btrfs_drop_extent_cache-for-skip-pinned-case.patch [new file with mode: 0644]
queue-2.6.32/btrfs-fix-disk_i_size-update-corner-case.patch [new file with mode: 0644]
queue-2.6.32/btrfs-fix-memory-leaks-in-error-paths.patch [new file with mode: 0644]
queue-2.6.32/btrfs-fix-missing-last-entry-in-readdir-3.patch [new file with mode: 0644]
queue-2.6.32/btrfs-fix-oopsen-when-dropping-empty-tree.patch [new file with mode: 0644]
queue-2.6.32/btrfs-fix-per-root-used-space-accounting.patch [new file with mode: 0644]
queue-2.6.32/btrfs-fix-possible-panic-on-unmount.patch [new file with mode: 0644]
queue-2.6.32/btrfs-fix-race-between-allocate-and-release-extent-buffer.patch [new file with mode: 0644]
queue-2.6.32/btrfs-fix-race-in-btrfs_mark_extent_written.patch [new file with mode: 0644]
queue-2.6.32/btrfs-fix-regression-in-orphan-cleanup.patch [new file with mode: 0644]
queue-2.6.32/btrfs-kfree-correct-pointer-during-mount-option-parsing.patch [new file with mode: 0644]
queue-2.6.32/btrfs-make-error-return-negative-in-btrfs_sync_file.patch [new file with mode: 0644]
queue-2.6.32/btrfs-make-fallocate-2-more-enospc-friendly.patch [new file with mode: 0644]
queue-2.6.32/btrfs-make-metadata-chunks-smaller.patch [new file with mode: 0644]
queue-2.6.32/btrfs-make-sure-fallocate-properly-starts-a-transaction.patch [new file with mode: 0644]
queue-2.6.32/btrfs-make-truncate-2-more-enospc-friendly.patch [new file with mode: 0644]
queue-2.6.32/btrfs-pass-transaction-handle-to-security-and-acl-initialization-functions.patch [new file with mode: 0644]
queue-2.6.32/btrfs-remove-bug_on-due-to-mounting-bad-filesystem.patch [new file with mode: 0644]
queue-2.6.32/btrfs-rewrite-btrfs_drop_extents.patch [new file with mode: 0644]
queue-2.6.32/btrfs-run-orphan-cleanup-on-default-fs-root.patch [new file with mode: 0644]
queue-2.6.32/btrfs-show-discard-option-in-proc-mounts.patch [new file with mode: 0644]
queue-2.6.32/btrfs-use-correct-values-when-updating-inode-i_size-on-fallocate.patch [new file with mode: 0644]
queue-2.6.32/crypto-testmgr-fix-complain-about-lack-test-for-internal-used-algorithm.patch [new file with mode: 0644]
queue-2.6.32/dlm-always-use-gfp_nofs.patch [new file with mode: 0644]
queue-2.6.32/dlm-fix-ordering-of-bast-and-cast.patch [new file with mode: 0644]
queue-2.6.32/dlm-send-reply-before-bast.patch [new file with mode: 0644]
queue-2.6.32/ext4-fix-optional-arg-mount-options.patch [new file with mode: 0644]
queue-2.6.32/ext4-make-sure-the-move_ext-ioctl-can-t-overwrite-append-only-files.patch [new file with mode: 0644]
queue-2.6.32/fix-sba-iommu-to-handle-allocation-failure-properly.patch [new file with mode: 0644]
queue-2.6.32/hwpoison-abort-on-failed-unmap.patch [new file with mode: 0644]
queue-2.6.32/hwpoison-remove-the-anonymous-entry.patch [new file with mode: 0644]
queue-2.6.32/ibmvfc-fix-command-completion-handling.patch [new file with mode: 0644]
queue-2.6.32/ibmvfc-reduce-error-recovery-timeout.patch [new file with mode: 0644]
queue-2.6.32/loop-update-mtime-when-writing-using-aops.patch [new file with mode: 0644]
queue-2.6.32/md-raid1-delay-reads-that-could-overtake-behind-writes.patch [new file with mode: 0644]
queue-2.6.32/memory-hotplug-fix-a-bug-on-dev-mem-for-64-bit-kernels.patch [new file with mode: 0644]
queue-2.6.32/mutex-don-t-spin-when-the-owner-cpu-is-offline-or-other-weird-cases.patch [new file with mode: 0644]
queue-2.6.32/nohz-introduce-arch_needs_cpu.patch [new file with mode: 0644]
queue-2.6.32/nohz-reuse-ktime-in-sub-functions-of-tick_check_idle.patch [new file with mode: 0644]
queue-2.6.32/ocfs2-find-proper-end-cpos-for-a-leaf-refcount-block.patch [new file with mode: 0644]
queue-2.6.32/ocfs2-set-ms_posixacl-on-remount.patch [new file with mode: 0644]
queue-2.6.32/powerpc-eeh-fix-a-bug-when-pci-structure-is-null.patch [new file with mode: 0644]
queue-2.6.32/reiserfs-fix-oops-while-creating-privroot-with-selinux-enabled.patch [new file with mode: 0644]
queue-2.6.32/reiserfs-properly-honor-read-only-devices.patch [new file with mode: 0644]
queue-2.6.32/sched-cputime-introduce-thread_group_times.patch [new file with mode: 0644]
queue-2.6.32/sched-fix-granularity-of-task_u-stime.patch [new file with mode: 0644]
queue-2.6.32/series
queue-2.6.32/skip-check-for-mandatory-locks-when-unlocking.patch [new file with mode: 0644]
queue-2.6.32/timekeeping-fix-clock_gettime-vsyscall-time-warp.patch [new file with mode: 0644]
queue-2.6.32/x86-fix-out-of-order-of-gsi.patch [new file with mode: 0644]

diff --git a/queue-2.6.32/acpi-fix-regression-where-_ppc-is-not-read-at-boot-even-when-ignore_ppc-0.patch b/queue-2.6.32/acpi-fix-regression-where-_ppc-is-not-read-at-boot-even-when-ignore_ppc-0.patch
new file mode 100644 (file)
index 0000000..f81b88f
--- /dev/null
@@ -0,0 +1,43 @@
+From 455c0d71d46e86b0b7ff2c9dcfc19bc162302ee9 Mon Sep 17 00:00:00 2001
+From: Darrick J. Wong <djwong@us.ibm.com>
+Date: Thu, 18 Feb 2010 10:28:20 -0800
+Subject: ACPI: Fix regression where _PPC is not read at boot even when ignore_ppc=0
+
+From: Darrick J. Wong <djwong@us.ibm.com>
+
+commit 455c0d71d46e86b0b7ff2c9dcfc19bc162302ee9 upstream.
+
+Earlier, Ingo Molnar posted a patch to make it so that the kernel would avoid
+reading _PPC on his broken T60.  Unfortunately, it seems that with Thomas
+Renninger's patch last July to eliminate _PPC evaluations when the processor
+driver loads, the kernel never actually reads _PPC at all!  This is problematic
+if you happen to boot your non-T60 computer in a state where the BIOS _wants_
+_PPC to be something other than zero.
+
+So, put the _PPC evaluation back into acpi_processor_get_performance_info if
+ignore_ppc isn't 1.
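+
+In outline, the restored check (a sketch of the hunk below;
+ignore_ppc is the driver's existing module parameter, where 1
+means "never read _PPC"):
+
+	/* re-evaluate _PPC once cpufreq starts, unless disabled */
+	if (ignore_ppc != 1)
+		result = acpi_processor_get_platform_limit(pr);
+	return result;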
+
+Signed-off-by: Darrick J. Wong <djwong@us.ibm.com>
+Signed-off-by: Len Brown <len.brown@intel.com>
+Acked-by: Jeff Mahoney <jeffm@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ drivers/acpi/processor_perflib.c |    6 +++++-
+ 1 file changed, 5 insertions(+), 1 deletion(-)
+
+--- a/drivers/acpi/processor_perflib.c
++++ b/drivers/acpi/processor_perflib.c
+@@ -356,7 +356,11 @@ static int acpi_processor_get_performanc
+       if (result)
+               goto update_bios;
+-      return 0;
++      /* We need to call _PPC once when cpufreq starts */
++      if (ignore_ppc != 1)
++              result = acpi_processor_get_platform_limit(pr);
++
++      return result;
+       /*
+        * Having _PPC but missing frequencies (_PSS, _PCT) is a very good hint that
diff --git a/queue-2.6.32/aic79xx-check-for-non-null-scb-in-ahd_handle_nonpkt_busfree.patch b/queue-2.6.32/aic79xx-check-for-non-null-scb-in-ahd_handle_nonpkt_busfree.patch
new file mode 100644 (file)
index 0000000..8060837
--- /dev/null
@@ -0,0 +1,105 @@
+From 534ef056db8a8fb6b9d50188d88ed5d1fbc66673 Mon Sep 17 00:00:00 2001
+From: Hannes Reinecke <hare@suse.de>
+Date: Fri, 15 Jan 2010 13:07:34 +0100
+Subject: [SCSI] aic79xx: check for non-NULL scb in ahd_handle_nonpkt_busfree
+
+From: Hannes Reinecke <hare@suse.de>
+
+commit 534ef056db8a8fb6b9d50188d88ed5d1fbc66673 upstream.
+
+When removing several devices, aic79xx will occasionally Oops
+in ahd_handle_nonpkt_busfree during rescan. Looking at the
+code, I found that we're indeed not checking whether the scb in
+question is NULL. So check for it before accessing it.
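+
+The guard repeated at each busfree path below, in outline (a
+sketch of the shared pattern, not additional code):
+
+	if (scb != NULL) {
+		/* only requeue waiting SCBs for a valid SCB */
+		ahd_freeze_devq(ahd, scb);
+		ahd_qinfifo_requeue_tail(ahd, scb);
+	}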
+
+Signed-off-by: Hannes Reinecke <hare@suse.de>
+Signed-off-by: James Bottomley <James.Bottomley@suse.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ drivers/scsi/aic7xxx/aic79xx_core.c |   53 +++++++++++++++++++++---------------
+ 1 file changed, 31 insertions(+), 22 deletions(-)
+
+--- a/drivers/scsi/aic7xxx/aic79xx_core.c
++++ b/drivers/scsi/aic7xxx/aic79xx_core.c
+@@ -3171,13 +3171,16 @@ ahd_handle_nonpkt_busfree(struct ahd_sof
+                               tinfo->curr.transport_version = 2;
+                               tinfo->goal.transport_version = 2;
+                               tinfo->goal.ppr_options = 0;
+-                              /*
+-                               * Remove any SCBs in the waiting for selection
+-                               * queue that may also be for this target so
+-                               * that command ordering is preserved.
+-                               */
+-                              ahd_freeze_devq(ahd, scb);
+-                              ahd_qinfifo_requeue_tail(ahd, scb);
++                              if (scb != NULL) {
++                                      /*
++                                       * Remove any SCBs in the waiting
++                                       * for selection queue that may
++                                       * also be for this target so that
++                                       * command ordering is preserved.
++                                       */
++                                      ahd_freeze_devq(ahd, scb);
++                                      ahd_qinfifo_requeue_tail(ahd, scb);
++                              }
+                               printerror = 0;
+                       }
+               } else if (ahd_sent_msg(ahd, AHDMSG_EXT, MSG_EXT_WDTR, FALSE)
+@@ -3194,13 +3197,16 @@ ahd_handle_nonpkt_busfree(struct ahd_sof
+                                     MSG_EXT_WDTR_BUS_8_BIT,
+                                     AHD_TRANS_CUR|AHD_TRANS_GOAL,
+                                     /*paused*/TRUE);
+-                      /*
+-                       * Remove any SCBs in the waiting for selection
+-                       * queue that may also be for this target so that
+-                       * command ordering is preserved.
+-                       */
+-                      ahd_freeze_devq(ahd, scb);
+-                      ahd_qinfifo_requeue_tail(ahd, scb);
++                      if (scb != NULL) {
++                              /*
++                               * Remove any SCBs in the waiting for
++                               * selection queue that may also be for
++                               * this target so that command ordering
++                               * is preserved.
++                               */
++                              ahd_freeze_devq(ahd, scb);
++                              ahd_qinfifo_requeue_tail(ahd, scb);
++                      }
+                       printerror = 0;
+               } else if (ahd_sent_msg(ahd, AHDMSG_EXT, MSG_EXT_SDTR, FALSE)
+                       && ppr_busfree == 0) {
+@@ -3217,13 +3223,16 @@ ahd_handle_nonpkt_busfree(struct ahd_sof
+                                       /*ppr_options*/0,
+                                       AHD_TRANS_CUR|AHD_TRANS_GOAL,
+                                       /*paused*/TRUE);
+-                      /*
+-                       * Remove any SCBs in the waiting for selection
+-                       * queue that may also be for this target so that
+-                       * command ordering is preserved.
+-                       */
+-                      ahd_freeze_devq(ahd, scb);
+-                      ahd_qinfifo_requeue_tail(ahd, scb);
++                      if (scb != NULL) {
++                              /*
++                               * Remove any SCBs in the waiting for
++                               * selection queue that may also be for
++                               * this target so that command ordering
++                               * is preserved.
++                               */
++                              ahd_freeze_devq(ahd, scb);
++                              ahd_qinfifo_requeue_tail(ahd, scb);
++                      }
+                       printerror = 0;
+               } else if ((ahd->msg_flags & MSG_FLAG_EXPECT_IDE_BUSFREE) != 0
+                       && ahd_sent_msg(ahd, AHDMSG_1B,
+@@ -3251,7 +3260,7 @@ ahd_handle_nonpkt_busfree(struct ahd_sof
+        * the message phases.  We check it last in case we
+        * had to send some other message that caused a busfree.
+        */
+-      if (printerror != 0
++      if (scb != NULL && printerror != 0
+        && (lastphase == P_MESGIN || lastphase == P_MESGOUT)
+        && ((ahd->msg_flags & MSG_FLAG_EXPECT_PPR_BUSFREE) != 0)) {
diff --git a/queue-2.6.32/btrfs-add-btrfs_duplicate_item.patch b/queue-2.6.32/btrfs-add-btrfs_duplicate_item.patch
new file mode 100644 (file)
index 0000000..0e2a71a
--- /dev/null
@@ -0,0 +1,299 @@
+From ad48fd754676bfae4139be1a897b1ea58f9aaf21 Mon Sep 17 00:00:00 2001
+From: Yan, Zheng <zheng.yan@oracle.com>
+Date: Thu, 12 Nov 2009 09:33:58 +0000
+Subject: Btrfs: Add btrfs_duplicate_item
+
+From: Yan, Zheng <zheng.yan@oracle.com>
+
+commit ad48fd754676bfae4139be1a897b1ea58f9aaf21 upstream.
+
+btrfs_duplicate_item duplicates an item with a new key, guaranteeing
+that the source item and the new item are in the same tree leaf and
+contiguous. It allows us to split a file extent in place, without
+using lock_extent to prevent a bookend extent race.
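+
+A hypothetical caller, to illustrate the API (found_key and
+split_pos are placeholders for this sketch, not from the patch):
+
+	struct btrfs_key new_key = found_key;	/* same objectid/type */
+	new_key.offset = split_pos;
+
+	ret = btrfs_duplicate_item(trans, root, path, &new_key);
+	if (ret == -EAGAIN)
+		goto search_again;	/* item changed, redo the search */
+	/* on success, path->slots[0] points at the contiguous copy */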
+
+Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
+Signed-off-by: Chris Mason <chris.mason@oracle.com>
+Acked-by: Jeff Mahoney <jeffm@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+
+---
+ fs/btrfs/ctree.c |  198 ++++++++++++++++++++++++++++++++++++++-----------------
+ fs/btrfs/ctree.h |    4 +
+ 2 files changed, 143 insertions(+), 59 deletions(-)
+
+--- a/fs/btrfs/ctree.c
++++ b/fs/btrfs/ctree.c
+@@ -37,6 +37,11 @@ static int balance_node_right(struct btr
+                             struct extent_buffer *src_buf);
+ static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+                  struct btrfs_path *path, int level, int slot);
++static int setup_items_for_insert(struct btrfs_trans_handle *trans,
++                      struct btrfs_root *root, struct btrfs_path *path,
++                      struct btrfs_key *cpu_key, u32 *data_size,
++                      u32 total_data, u32 total_size, int nr);
++
+ struct btrfs_path *btrfs_alloc_path(void)
+ {
+@@ -2997,75 +3002,85 @@ again:
+       return ret;
+ }
+-/*
+- * This function splits a single item into two items,
+- * giving 'new_key' to the new item and splitting the
+- * old one at split_offset (from the start of the item).
+- *
+- * The path may be released by this operation.  After
+- * the split, the path is pointing to the old item.  The
+- * new item is going to be in the same node as the old one.
+- *
+- * Note, the item being split must be smaller enough to live alone on
+- * a tree block with room for one extra struct btrfs_item
+- *
+- * This allows us to split the item in place, keeping a lock on the
+- * leaf the entire time.
+- */
+-int btrfs_split_item(struct btrfs_trans_handle *trans,
+-                   struct btrfs_root *root,
+-                   struct btrfs_path *path,
+-                   struct btrfs_key *new_key,
+-                   unsigned long split_offset)
++static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
++                                       struct btrfs_root *root,
++                                       struct btrfs_path *path, int ins_len)
+ {
+-      u32 item_size;
++      struct btrfs_key key;
+       struct extent_buffer *leaf;
+-      struct btrfs_key orig_key;
+-      struct btrfs_item *item;
+-      struct btrfs_item *new_item;
+-      int ret = 0;
+-      int slot;
+-      u32 nritems;
+-      u32 orig_offset;
+-      struct btrfs_disk_key disk_key;
+-      char *buf;
++      struct btrfs_file_extent_item *fi;
++      u64 extent_len = 0;
++      u32 item_size;
++      int ret;
+       leaf = path->nodes[0];
+-      btrfs_item_key_to_cpu(leaf, &orig_key, path->slots[0]);
+-      if (btrfs_leaf_free_space(root, leaf) >= sizeof(struct btrfs_item))
+-              goto split;
++      btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
++
++      BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY &&
++             key.type != BTRFS_EXTENT_CSUM_KEY);
++
++      if (btrfs_leaf_free_space(root, leaf) >= ins_len)
++              return 0;
+       item_size = btrfs_item_size_nr(leaf, path->slots[0]);
++      if (key.type == BTRFS_EXTENT_DATA_KEY) {
++              fi = btrfs_item_ptr(leaf, path->slots[0],
++                                  struct btrfs_file_extent_item);
++              extent_len = btrfs_file_extent_num_bytes(leaf, fi);
++      }
+       btrfs_release_path(root, path);
+-      path->search_for_split = 1;
+       path->keep_locks = 1;
+-
+-      ret = btrfs_search_slot(trans, root, &orig_key, path, 0, 1);
++      path->search_for_split = 1;
++      ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
+       path->search_for_split = 0;
++      if (ret < 0)
++              goto err;
++      ret = -EAGAIN;
++      leaf = path->nodes[0];
+       /* if our item isn't there or got smaller, return now */
+-      if (ret != 0 || item_size != btrfs_item_size_nr(path->nodes[0],
+-                                                      path->slots[0])) {
+-              path->keep_locks = 0;
+-              return -EAGAIN;
++      if (ret > 0 || item_size != btrfs_item_size_nr(leaf, path->slots[0]))
++              goto err;
++
++      if (key.type == BTRFS_EXTENT_DATA_KEY) {
++              fi = btrfs_item_ptr(leaf, path->slots[0],
++                                  struct btrfs_file_extent_item);
++              if (extent_len != btrfs_file_extent_num_bytes(leaf, fi))
++                      goto err;
+       }
+       btrfs_set_path_blocking(path);
+-      ret = split_leaf(trans, root, &orig_key, path,
+-                       sizeof(struct btrfs_item), 1);
+-      path->keep_locks = 0;
++      ret = split_leaf(trans, root, &key, path, ins_len, 1);
+       BUG_ON(ret);
++      path->keep_locks = 0;
+       btrfs_unlock_up_safe(path, 1);
++      return 0;
++err:
++      path->keep_locks = 0;
++      return ret;
++}
++
++static noinline int split_item(struct btrfs_trans_handle *trans,
++                             struct btrfs_root *root,
++                             struct btrfs_path *path,
++                             struct btrfs_key *new_key,
++                             unsigned long split_offset)
++{
++      struct extent_buffer *leaf;
++      struct btrfs_item *item;
++      struct btrfs_item *new_item;
++      int slot;
++      char *buf;
++      u32 nritems;
++      u32 item_size;
++      u32 orig_offset;
++      struct btrfs_disk_key disk_key;
++
+       leaf = path->nodes[0];
+       BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item));
+-split:
+-      /*
+-       * make sure any changes to the path from split_leaf leave it
+-       * in a blocking state
+-       */
+       btrfs_set_path_blocking(path);
+       item = btrfs_item_nr(leaf, path->slots[0]);
+@@ -3073,19 +3088,19 @@ split:
+       item_size = btrfs_item_size(leaf, item);
+       buf = kmalloc(item_size, GFP_NOFS);
++      if (!buf)
++              return -ENOMEM;
++
+       read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
+                           path->slots[0]), item_size);
+-      slot = path->slots[0] + 1;
+-      leaf = path->nodes[0];
++      slot = path->slots[0] + 1;
+       nritems = btrfs_header_nritems(leaf);
+-
+       if (slot != nritems) {
+               /* shift the items */
+               memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1),
+-                            btrfs_item_nr_offset(slot),
+-                            (nritems - slot) * sizeof(struct btrfs_item));
+-
++                              btrfs_item_nr_offset(slot),
++                              (nritems - slot) * sizeof(struct btrfs_item));
+       }
+       btrfs_cpu_key_to_disk(&disk_key, new_key);
+@@ -3113,16 +3128,81 @@ split:
+                           item_size - split_offset);
+       btrfs_mark_buffer_dirty(leaf);
+-      ret = 0;
+-      if (btrfs_leaf_free_space(root, leaf) < 0) {
+-              btrfs_print_leaf(root, leaf);
+-              BUG();
+-      }
++      BUG_ON(btrfs_leaf_free_space(root, leaf) < 0);
+       kfree(buf);
++      return 0;
++}
++
++/*
++ * This function splits a single item into two items,
++ * giving 'new_key' to the new item and splitting the
++ * old one at split_offset (from the start of the item).
++ *
++ * The path may be released by this operation.  After
++ * the split, the path is pointing to the old item.  The
++ * new item is going to be in the same node as the old one.
++ *
++ * Note, the item being split must be small enough to live alone on
++ * a tree block with room for one extra struct btrfs_item
++ *
++ * This allows us to split the item in place, keeping a lock on the
++ * leaf the entire time.
++ */
++int btrfs_split_item(struct btrfs_trans_handle *trans,
++                   struct btrfs_root *root,
++                   struct btrfs_path *path,
++                   struct btrfs_key *new_key,
++                   unsigned long split_offset)
++{
++      int ret;
++      ret = setup_leaf_for_split(trans, root, path,
++                                 sizeof(struct btrfs_item));
++      if (ret)
++              return ret;
++
++      ret = split_item(trans, root, path, new_key, split_offset);
+       return ret;
+ }
+ /*
++ * This function duplicates an item, giving 'new_key' to the new item.
++ * It guarantees both items live in the same tree leaf and the new item
++ * is contiguous with the original item.
++ *
++ * This allows us to split a file extent in place, keeping a lock on the
++ * leaf the entire time.
++ */
++int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
++                       struct btrfs_root *root,
++                       struct btrfs_path *path,
++                       struct btrfs_key *new_key)
++{
++      struct extent_buffer *leaf;
++      int ret;
++      u32 item_size;
++
++      leaf = path->nodes[0];
++      item_size = btrfs_item_size_nr(leaf, path->slots[0]);
++      ret = setup_leaf_for_split(trans, root, path,
++                                 item_size + sizeof(struct btrfs_item));
++      if (ret)
++              return ret;
++
++      path->slots[0]++;
++      ret = setup_items_for_insert(trans, root, path, new_key, &item_size,
++                                   item_size, item_size +
++                                   sizeof(struct btrfs_item), 1);
++      BUG_ON(ret);
++
++      leaf = path->nodes[0];
++      memcpy_extent_buffer(leaf,
++                           btrfs_item_ptr_offset(leaf, path->slots[0]),
++                           btrfs_item_ptr_offset(leaf, path->slots[0] - 1),
++                           item_size);
++      return 0;
++}
++
++/*
+  * make the item pointed to by the path smaller.  new_size indicates
+  * how small to make it, and from_end tells us if we just chop bytes
+  * off the end of the item or if we shift the item to chop bytes off
+--- a/fs/btrfs/ctree.h
++++ b/fs/btrfs/ctree.h
+@@ -2089,6 +2089,10 @@ int btrfs_split_item(struct btrfs_trans_
+                    struct btrfs_path *path,
+                    struct btrfs_key *new_key,
+                    unsigned long split_offset);
++int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
++                       struct btrfs_root *root,
++                       struct btrfs_path *path,
++                       struct btrfs_key *new_key);
+ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
+                     *root, struct btrfs_key *key, struct btrfs_path *p, int
+                     ins_len, int cow);
diff --git a/queue-2.6.32/btrfs-add-delayed-iput.patch b/queue-2.6.32/btrfs-add-delayed-iput.patch
new file mode 100644 (file)
index 0000000..25855f0
--- /dev/null
@@ -0,0 +1,309 @@
+From 24bbcf0442ee04660a5a030efdbb6d03f1c275cb Mon Sep 17 00:00:00 2001
+From: Yan, Zheng <zheng.yan@oracle.com>
+Date: Thu, 12 Nov 2009 09:36:34 +0000
+Subject: Btrfs: Add delayed iput
+
+From: Yan, Zheng <zheng.yan@oracle.com>
+
+commit 24bbcf0442ee04660a5a030efdbb6d03f1c275cb upstream.
+
+iput() can trigger new transactions if we are dropping the
+final reference, so calling it in btrfs_commit_transaction
+may end up deadlocking. This patch adds a delayed iput to
+avoid the issue.
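+
+In outline, the pattern this introduces (a sketch, using only the
+two helpers added below):
+
+	/* instead of iput(inode) while a transaction is running: */
+	btrfs_add_delayed_iput(inode);
+
+	/* later, from the cleaner or after the commit completes: */
+	btrfs_run_delayed_iputs(root);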
+
+Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
+Signed-off-by: Chris Mason <chris.mason@oracle.com>
+Acked-by: Jeff Mahoney <jeffm@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ fs/btrfs/ctree.h        |    7 +++++-
+ fs/btrfs/disk-io.c      |    4 +++
+ fs/btrfs/extent-tree.c  |    8 +++---
+ fs/btrfs/inode.c        |   55 ++++++++++++++++++++++++++++++++++++++++++++++--
+ fs/btrfs/ordered-data.c |   10 ++++++--
+ fs/btrfs/ordered-data.h |    3 +-
+ fs/btrfs/relocation.c   |    4 +--
+ fs/btrfs/super.c        |    4 +--
+ fs/btrfs/transaction.c  |   13 ++++++++---
+ 9 files changed, 90 insertions(+), 18 deletions(-)
+
+--- a/fs/btrfs/ctree.h
++++ b/fs/btrfs/ctree.h
+@@ -872,6 +872,9 @@ struct btrfs_fs_info {
+       struct list_head dead_roots;
+       struct list_head caching_block_groups;
++      spinlock_t delayed_iput_lock;
++      struct list_head delayed_iputs;
++
+       atomic_t nr_async_submits;
+       atomic_t async_submit_draining;
+       atomic_t nr_async_bios;
+@@ -2301,7 +2304,7 @@ int btrfs_truncate_inode_items(struct bt
+                              struct inode *inode, u64 new_size,
+                              u32 min_type);
+-int btrfs_start_delalloc_inodes(struct btrfs_root *root);
++int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput);
+ int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end);
+ int btrfs_writepages(struct address_space *mapping,
+                    struct writeback_control *wbc);
+@@ -2341,6 +2344,8 @@ int btrfs_orphan_del(struct btrfs_trans_
+ void btrfs_orphan_cleanup(struct btrfs_root *root);
+ int btrfs_cont_expand(struct inode *inode, loff_t size);
+ int btrfs_invalidate_inodes(struct btrfs_root *root);
++void btrfs_add_delayed_iput(struct inode *inode);
++void btrfs_run_delayed_iputs(struct btrfs_root *root);
+ extern const struct dentry_operations btrfs_dentry_operations;
+ /* ioctl.c */
+--- a/fs/btrfs/disk-io.c
++++ b/fs/btrfs/disk-io.c
+@@ -1476,6 +1476,7 @@ static int cleaner_kthread(void *arg)
+               if (!(root->fs_info->sb->s_flags & MS_RDONLY) &&
+                   mutex_trylock(&root->fs_info->cleaner_mutex)) {
++                      btrfs_run_delayed_iputs(root);
+                       btrfs_clean_old_snapshots(root);
+                       mutex_unlock(&root->fs_info->cleaner_mutex);
+               }
+@@ -1605,6 +1606,7 @@ struct btrfs_root *open_ctree(struct sup
+       INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
+       INIT_LIST_HEAD(&fs_info->trans_list);
+       INIT_LIST_HEAD(&fs_info->dead_roots);
++      INIT_LIST_HEAD(&fs_info->delayed_iputs);
+       INIT_LIST_HEAD(&fs_info->hashers);
+       INIT_LIST_HEAD(&fs_info->delalloc_inodes);
+       INIT_LIST_HEAD(&fs_info->ordered_operations);
+@@ -1613,6 +1615,7 @@ struct btrfs_root *open_ctree(struct sup
+       spin_lock_init(&fs_info->new_trans_lock);
+       spin_lock_init(&fs_info->ref_cache_lock);
+       spin_lock_init(&fs_info->fs_roots_radix_lock);
++      spin_lock_init(&fs_info->delayed_iput_lock);
+       init_completion(&fs_info->kobj_unregister);
+       fs_info->tree_root = tree_root;
+@@ -2386,6 +2389,7 @@ int btrfs_commit_super(struct btrfs_root
+       int ret;
+       mutex_lock(&root->fs_info->cleaner_mutex);
++      btrfs_run_delayed_iputs(root);
+       btrfs_clean_old_snapshots(root);
+       mutex_unlock(&root->fs_info->cleaner_mutex);
+--- a/fs/btrfs/extent-tree.c
++++ b/fs/btrfs/extent-tree.c
+@@ -2880,9 +2880,9 @@ static noinline void flush_delalloc_asyn
+       root = async->root;
+       info = async->info;
+-      btrfs_start_delalloc_inodes(root);
++      btrfs_start_delalloc_inodes(root, 0);
+       wake_up(&info->flush_wait);
+-      btrfs_wait_ordered_extents(root, 0);
++      btrfs_wait_ordered_extents(root, 0, 0);
+       spin_lock(&info->lock);
+       info->flushing = 0;
+@@ -2956,8 +2956,8 @@ static void flush_delalloc(struct btrfs_
+       return;
+ flush:
+-      btrfs_start_delalloc_inodes(root);
+-      btrfs_wait_ordered_extents(root, 0);
++      btrfs_start_delalloc_inodes(root, 0);
++      btrfs_wait_ordered_extents(root, 0, 0);
+       spin_lock(&info->lock);
+       info->flushing = 0;
+--- a/fs/btrfs/inode.c
++++ b/fs/btrfs/inode.c
+@@ -2022,6 +2022,54 @@ zeroit:
+       return -EIO;
+ }
++struct delayed_iput {
++      struct list_head list;
++      struct inode *inode;
++};
++
++void btrfs_add_delayed_iput(struct inode *inode)
++{
++      struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
++      struct delayed_iput *delayed;
++
++      if (atomic_add_unless(&inode->i_count, -1, 1))
++              return;
++
++      delayed = kmalloc(sizeof(*delayed), GFP_NOFS | __GFP_NOFAIL);
++      delayed->inode = inode;
++
++      spin_lock(&fs_info->delayed_iput_lock);
++      list_add_tail(&delayed->list, &fs_info->delayed_iputs);
++      spin_unlock(&fs_info->delayed_iput_lock);
++}
++
++void btrfs_run_delayed_iputs(struct btrfs_root *root)
++{
++      LIST_HEAD(list);
++      struct btrfs_fs_info *fs_info = root->fs_info;
++      struct delayed_iput *delayed;
++      int empty;
++
++      spin_lock(&fs_info->delayed_iput_lock);
++      empty = list_empty(&fs_info->delayed_iputs);
++      spin_unlock(&fs_info->delayed_iput_lock);
++      if (empty)
++              return;
++
++      down_read(&root->fs_info->cleanup_work_sem);
++      spin_lock(&fs_info->delayed_iput_lock);
++      list_splice_init(&fs_info->delayed_iputs, &list);
++      spin_unlock(&fs_info->delayed_iput_lock);
++
++      while (!list_empty(&list)) {
++              delayed = list_entry(list.next, struct delayed_iput, list);
++              list_del(&delayed->list);
++              iput(delayed->inode);
++              kfree(delayed);
++      }
++      up_read(&root->fs_info->cleanup_work_sem);
++}
++
+ /*
+  * This creates an orphan entry for the given inode in case something goes
+  * wrong in the middle of an unlink/truncate.
+@@ -5568,7 +5616,7 @@ out_fail:
+  * some fairly slow code that needs optimization. This walks the list
+  * of all the inodes with pending delalloc and forces them to disk.
+  */
+-int btrfs_start_delalloc_inodes(struct btrfs_root *root)
++int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
+ {
+       struct list_head *head = &root->fs_info->delalloc_inodes;
+       struct btrfs_inode *binode;
+@@ -5587,7 +5635,10 @@ int btrfs_start_delalloc_inodes(struct b
+               spin_unlock(&root->fs_info->delalloc_lock);
+               if (inode) {
+                       filemap_flush(inode->i_mapping);
+-                      iput(inode);
++                      if (delay_iput)
++                              btrfs_add_delayed_iput(inode);
++                      else
++                              iput(inode);
+               }
+               cond_resched();
+               spin_lock(&root->fs_info->delalloc_lock);
+--- a/fs/btrfs/ordered-data.c
++++ b/fs/btrfs/ordered-data.c
+@@ -352,7 +352,8 @@ int btrfs_remove_ordered_extent(struct i
+  * wait for all the ordered extents in a root.  This is done when balancing
+  * space between drives.
+  */
+-int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only)
++int btrfs_wait_ordered_extents(struct btrfs_root *root,
++                             int nocow_only, int delay_iput)
+ {
+       struct list_head splice;
+       struct list_head *cur;
+@@ -389,7 +390,10 @@ int btrfs_wait_ordered_extents(struct bt
+               if (inode) {
+                       btrfs_start_ordered_extent(inode, ordered, 1);
+                       btrfs_put_ordered_extent(ordered);
+-                      iput(inode);
++                      if (delay_iput)
++                              btrfs_add_delayed_iput(inode);
++                      else
++                              iput(inode);
+               } else {
+                       btrfs_put_ordered_extent(ordered);
+               }
+@@ -447,7 +451,7 @@ again:
+                               btrfs_wait_ordered_range(inode, 0, (u64)-1);
+                       else
+                               filemap_flush(inode->i_mapping);
+-                      iput(inode);
++                      btrfs_add_delayed_iput(inode);
+               }
+               cond_resched();
+--- a/fs/btrfs/ordered-data.h
++++ b/fs/btrfs/ordered-data.h
+@@ -153,9 +153,10 @@ btrfs_lookup_first_ordered_extent(struct
+ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
+                               struct btrfs_ordered_extent *ordered);
+ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum);
+-int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only);
+ int btrfs_run_ordered_operations(struct btrfs_root *root, int wait);
+ int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
+                               struct btrfs_root *root,
+                               struct inode *inode);
++int btrfs_wait_ordered_extents(struct btrfs_root *root,
++                             int nocow_only, int delay_iput);
+ #endif
+--- a/fs/btrfs/relocation.c
++++ b/fs/btrfs/relocation.c
+@@ -3541,8 +3541,8 @@ int btrfs_relocate_block_group(struct bt
+              (unsigned long long)rc->block_group->key.objectid,
+              (unsigned long long)rc->block_group->flags);
+-      btrfs_start_delalloc_inodes(fs_info->tree_root);
+-      btrfs_wait_ordered_extents(fs_info->tree_root, 0);
++      btrfs_start_delalloc_inodes(fs_info->tree_root, 0);
++      btrfs_wait_ordered_extents(fs_info->tree_root, 0, 0);
+       while (1) {
+               rc->extents_found = 0;
+--- a/fs/btrfs/super.c
++++ b/fs/btrfs/super.c
+@@ -405,8 +405,8 @@ int btrfs_sync_fs(struct super_block *sb
+               return 0;
+       }
+-      btrfs_start_delalloc_inodes(root);
+-      btrfs_wait_ordered_extents(root, 0);
++      btrfs_start_delalloc_inodes(root, 0);
++      btrfs_wait_ordered_extents(root, 0, 0);
+       trans = btrfs_start_transaction(root, 1);
+       ret = btrfs_commit_transaction(trans, root);
+--- a/fs/btrfs/transaction.c
++++ b/fs/btrfs/transaction.c
+@@ -333,6 +333,9 @@ static int __btrfs_end_transaction(struc
+       memset(trans, 0, sizeof(*trans));
+       kmem_cache_free(btrfs_trans_handle_cachep, trans);
++      if (throttle)
++              btrfs_run_delayed_iputs(root);
++
+       return 0;
+ }
+@@ -991,11 +994,11 @@ int btrfs_commit_transaction(struct btrf
+               mutex_unlock(&root->fs_info->trans_mutex);
+               if (flush_on_commit) {
+-                      btrfs_start_delalloc_inodes(root);
+-                      ret = btrfs_wait_ordered_extents(root, 0);
++                      btrfs_start_delalloc_inodes(root, 1);
++                      ret = btrfs_wait_ordered_extents(root, 0, 1);
+                       BUG_ON(ret);
+               } else if (snap_pending) {
+-                      ret = btrfs_wait_ordered_extents(root, 1);
++                      ret = btrfs_wait_ordered_extents(root, 0, 1);
+                       BUG_ON(ret);
+               }
+@@ -1113,6 +1116,10 @@ int btrfs_commit_transaction(struct btrf
+               current->journal_info = NULL;
+       kmem_cache_free(btrfs_trans_handle_cachep, trans);
++
++      if (current != root->fs_info->transaction_kthread)
++              btrfs_run_delayed_iputs(root);
++
+       return ret;
+ }
diff --git a/queue-2.6.32/btrfs-align-offsets-for-btrfs_ordered_update_i_size.patch b/queue-2.6.32/btrfs-align-offsets-for-btrfs_ordered_update_i_size.patch
new file mode 100644 (file)
index 0000000..e92d4f6
--- /dev/null
@@ -0,0 +1,34 @@
+From a038fab0cb873c75d6675e2bcffce8a3935bdce7 Mon Sep 17 00:00:00 2001
+From: Yan, Zheng <yanzheng@21cn.com>
+Date: Mon, 28 Dec 2009 05:01:58 +0000
+Subject: Btrfs: align offsets for btrfs_ordered_update_i_size
+
+From: Yan, Zheng <yanzheng@21cn.com>
+
+commit a038fab0cb873c75d6675e2bcffce8a3935bdce7 upstream.
+
+Some callers of btrfs_ordered_update_i_size can now pass in
+a NULL for the ordered extent to update against.  This makes
+sure we properly align the offset they pass in when deciding
+how much to bump the on disk i_size.
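+
+A worked example of the new alignment (illustrative values,
+assuming a 4K sectorsize):
+
+	offset = ALIGN(9000, 4096);	/* rounds up to 12288 */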
+
+Signed-off-by: Chris Mason <chris.mason@oracle.com>
+Acked-by: Jeff Mahoney <jeffm@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+
+---
+ fs/btrfs/ordered-data.c |    2 ++
+ 1 file changed, 2 insertions(+)
+
+--- a/fs/btrfs/ordered-data.c
++++ b/fs/btrfs/ordered-data.c
+@@ -626,6 +626,8 @@ int btrfs_ordered_update_i_size(struct i
+       if (ordered)
+               offset = entry_end(ordered);
++      else
++              offset = ALIGN(offset, BTRFS_I(inode)->root->sectorsize);
+       mutex_lock(&tree->mutex);
+       disk_i_size = BTRFS_I(inode)->disk_i_size;
diff --git a/queue-2.6.32/btrfs-apply-updated-fallocate-i_size-fix.patch b/queue-2.6.32/btrfs-apply-updated-fallocate-i_size-fix.patch
new file mode 100644 (file)
index 0000000..146fda5
--- /dev/null
@@ -0,0 +1,34 @@
+From 23b5c50945f2294add0137799400329c0ebba290 Mon Sep 17 00:00:00 2001
+From: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+Date: Thu, 4 Feb 2010 11:33:03 -0500
+Subject: Btrfs: apply updated fallocate i_size fix
+
+From: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+
+commit 23b5c50945f2294add0137799400329c0ebba290 upstream.
+
+This version of the i_size fix for fallocate makes sure we only update
+the i_size when the current fallocate is really operating outside of
+i_size.
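+
+In effect (a sketch of the condition and update below; KEEP_SIZE
+still suppresses any update):
+
+	if (actual_len > inode->i_size && cur_offset > inode->i_size)
+		i_size = min(cur_offset, actual_len);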
+
+Signed-off-by: Chris Mason <chris.mason@oracle.com>
+Acked-by: Jeff Mahoney <jeffm@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ fs/btrfs/inode.c |    4 +++-
+ 1 file changed, 3 insertions(+), 1 deletion(-)
+
+--- a/fs/btrfs/inode.c
++++ b/fs/btrfs/inode.c
+@@ -5798,7 +5798,9 @@ static int prealloc_file_range(struct in
+               inode->i_ctime = CURRENT_TIME;
+               BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC;
+               if (!(mode & FALLOC_FL_KEEP_SIZE) &&
+-                  cur_offset > inode->i_size) {
++                      (actual_len > inode->i_size) &&
++                      (cur_offset > inode->i_size)) {
++
+                       if (cur_offset > actual_len)
+                               i_size  = actual_len;
+                       else
diff --git a/queue-2.6.32/btrfs-avoid-orphan-inodes-cleanup-during-committing-transaction.patch b/queue-2.6.32/btrfs-avoid-orphan-inodes-cleanup-during-committing-transaction.patch
new file mode 100644 (file)
index 0000000..ab9c40b
--- /dev/null
@@ -0,0 +1,133 @@
+From 2e4bfab97055aa6acdd0637913bd705c2d6506d6 Mon Sep 17 00:00:00 2001
+From: Yan, Zheng <zheng.yan@oracle.com>
+Date: Thu, 12 Nov 2009 09:37:02 +0000
+Subject: Btrfs: Avoid orphan inodes cleanup during committing transaction
+
+From: Yan, Zheng <zheng.yan@oracle.com>
+
+commit 2e4bfab97055aa6acdd0637913bd705c2d6506d6 upstream.
+
+btrfs_lookup_dentry may trigger orphan cleanup, so it's not good
+to call it while committing a transaction.
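+
+The snapshot path is therefore reordered so the dentry is
+instantiated only after the commit returns; in outline:
+
+	ret = btrfs_commit_transaction(trans, root);
+	/* only now is it safe to look up the new root */
+	inode = btrfs_lookup_dentry(dentry->d_parent->d_inode, dentry);
+	d_instantiate(dentry, inode);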
+
+Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
+Signed-off-by: Chris Mason <chris.mason@oracle.com>
+Acked-by: Jeff Mahoney <jeffm@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ fs/btrfs/ioctl.c       |   29 +++++++++++++++++------------
+ fs/btrfs/transaction.c |    4 ----
+ 2 files changed, 17 insertions(+), 16 deletions(-)
+
+--- a/fs/btrfs/ioctl.c
++++ b/fs/btrfs/ioctl.c
+@@ -237,7 +237,6 @@ static noinline int create_subvol(struct
+       u64 objectid;
+       u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID;
+       u64 index = 0;
+-      unsigned long nr = 1;
+       /*
+        * 1 - inode item
+@@ -342,24 +341,21 @@ static noinline int create_subvol(struct
+       d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry));
+ fail:
+-      nr = trans->blocks_used;
+       err = btrfs_commit_transaction(trans, root);
+       if (err && !ret)
+               ret = err;
+       btrfs_unreserve_metadata_space(root, 6);
+-      btrfs_btree_balance_dirty(root, nr);
+       return ret;
+ }
+ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
+                          char *name, int namelen)
+ {
++      struct inode *inode;
+       struct btrfs_pending_snapshot *pending_snapshot;
+       struct btrfs_trans_handle *trans;
+-      int ret = 0;
+-      int err;
+-      unsigned long nr = 0;
++      int ret;
+       if (!root->ref_cows)
+               return -EINVAL;
+@@ -372,20 +368,20 @@ static int create_snapshot(struct btrfs_
+        */
+       ret = btrfs_reserve_metadata_space(root, 6);
+       if (ret)
+-              goto fail_unlock;
++              goto fail;
+       pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS);
+       if (!pending_snapshot) {
+               ret = -ENOMEM;
+               btrfs_unreserve_metadata_space(root, 6);
+-              goto fail_unlock;
++              goto fail;
+       }
+       pending_snapshot->name = kmalloc(namelen + 1, GFP_NOFS);
+       if (!pending_snapshot->name) {
+               ret = -ENOMEM;
+               kfree(pending_snapshot);
+               btrfs_unreserve_metadata_space(root, 6);
+-              goto fail_unlock;
++              goto fail;
+       }
+       memcpy(pending_snapshot->name, name, namelen);
+       pending_snapshot->name[namelen] = '\0';
+@@ -395,10 +391,19 @@ static int create_snapshot(struct btrfs_
+       pending_snapshot->root = root;
+       list_add(&pending_snapshot->list,
+                &trans->transaction->pending_snapshots);
+-      err = btrfs_commit_transaction(trans, root);
++      ret = btrfs_commit_transaction(trans, root);
++      BUG_ON(ret);
++      btrfs_unreserve_metadata_space(root, 6);
+-fail_unlock:
+-      btrfs_btree_balance_dirty(root, nr);
++      inode = btrfs_lookup_dentry(dentry->d_parent->d_inode, dentry);
++      if (IS_ERR(inode)) {
++              ret = PTR_ERR(inode);
++              goto fail;
++      }
++      BUG_ON(!inode);
++      d_instantiate(dentry, inode);
++      ret = 0;
++fail:
+       return ret;
+ }
+--- a/fs/btrfs/transaction.c
++++ b/fs/btrfs/transaction.c
+@@ -796,7 +796,6 @@ static noinline int create_pending_snaps
+       memcpy(&pending->root_key, &key, sizeof(key));
+ fail:
+       kfree(new_root_item);
+-      btrfs_unreserve_metadata_space(root, 6);
+       return ret;
+ }
+@@ -808,7 +807,6 @@ static noinline int finish_pending_snaps
+       u64 index = 0;
+       struct btrfs_trans_handle *trans;
+       struct inode *parent_inode;
+-      struct inode *inode;
+       struct btrfs_root *parent_root;
+       parent_inode = pending->dentry->d_parent->d_inode;
+@@ -840,8 +838,6 @@ static noinline int finish_pending_snaps
+       BUG_ON(ret);
+-      inode = btrfs_lookup_dentry(parent_inode, pending->dentry);
+-      d_instantiate(pending->dentry, inode);
+ fail:
+       btrfs_end_transaction(trans, fs_info->fs_root);
+       return ret;
diff --git a/queue-2.6.32/btrfs-avoid-orphan-inodes-cleanup-while-replaying-log.patch b/queue-2.6.32/btrfs-avoid-orphan-inodes-cleanup-while-replaying-log.patch
new file mode 100644 (file)
index 0000000..9ad0c2a
--- /dev/null
@@ -0,0 +1,259 @@
+From c71bf099abddf3e0fdc27f251ba76fca1461d49a Mon Sep 17 00:00:00 2001
+From: Yan, Zheng <zheng.yan@oracle.com>
+Date: Thu, 12 Nov 2009 09:34:40 +0000
+Subject: Btrfs: Avoid orphan inodes cleanup while replaying log
+
+From: Yan, Zheng <zheng.yan@oracle.com>
+
+commit c71bf099abddf3e0fdc27f251ba76fca1461d49a upstream.
+
+We do log replay in a single transaction, so it's not good to do unbound
+operations there. This patch moves orphan inode cleanup to after the log
+has been replayed. It also avoids doing other unbound operations, such as
+truncating a file, while replaying the log; they are postponed to the
+orphan inode cleanup stage.
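+
+During replay the unbound truncate is replaced by recording an
+orphan item; in outline (a sketch of the tree-log hunk below):
+
+	if (S_ISREG(mode))
+		/* past-EOF extents are trimmed later by orphan cleanup */
+		ret = insert_orphan_item(wc->trans, root, key.objectid);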
+
+Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
+Signed-off-by: Chris Mason <chris.mason@oracle.com>
+Acked-by: Jeff Mahoney <jeffm@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ fs/btrfs/ctree.h      |    5 +++--
+ fs/btrfs/disk-io.c    |   17 +++++++++++------
+ fs/btrfs/inode.c      |   19 ++++++++++++++++---
+ fs/btrfs/relocation.c |    1 +
+ fs/btrfs/tree-log.c   |   49 ++++++++++++++++++++++++-------------------------
+ 5 files changed, 55 insertions(+), 36 deletions(-)
+
+--- a/fs/btrfs/ctree.h
++++ b/fs/btrfs/ctree.h
+@@ -859,8 +859,9 @@ struct btrfs_fs_info {
+       struct mutex ordered_operations_mutex;
+       struct rw_semaphore extent_commit_sem;
+-      struct rw_semaphore subvol_sem;
++      struct rw_semaphore cleanup_work_sem;
++      struct rw_semaphore subvol_sem;
+       struct srcu_struct subvol_srcu;
+       struct list_head trans_list;
+@@ -1034,12 +1035,12 @@ struct btrfs_root {
+       int ref_cows;
+       int track_dirty;
+       int in_radix;
++      int clean_orphans;
+       u64 defrag_trans_start;
+       struct btrfs_key defrag_progress;
+       struct btrfs_key defrag_max;
+       int defrag_running;
+-      int defrag_level;
+       char *name;
+       int in_sysfs;
+--- a/fs/btrfs/disk-io.c
++++ b/fs/btrfs/disk-io.c
+@@ -892,6 +892,8 @@ static int __setup_root(u32 nodesize, u3
+       root->stripesize = stripesize;
+       root->ref_cows = 0;
+       root->track_dirty = 0;
++      root->in_radix = 0;
++      root->clean_orphans = 0;
+       root->fs_info = fs_info;
+       root->objectid = objectid;
+@@ -928,7 +930,6 @@ static int __setup_root(u32 nodesize, u3
+       root->defrag_trans_start = fs_info->generation;
+       init_completion(&root->kobj_unregister);
+       root->defrag_running = 0;
+-      root->defrag_level = 0;
+       root->root_key.objectid = objectid;
+       root->anon_super.s_root = NULL;
+       root->anon_super.s_dev = 0;
+@@ -1210,8 +1211,10 @@ again:
+       ret = radix_tree_insert(&fs_info->fs_roots_radix,
+                               (unsigned long)root->root_key.objectid,
+                               root);
+-      if (ret == 0)
++      if (ret == 0) {
+               root->in_radix = 1;
++              root->clean_orphans = 1;
++      }
+       spin_unlock(&fs_info->fs_roots_radix_lock);
+       radix_tree_preload_end();
+       if (ret) {
+@@ -1225,10 +1228,6 @@ again:
+       ret = btrfs_find_dead_roots(fs_info->tree_root,
+                                   root->root_key.objectid);
+       WARN_ON(ret);
+-
+-      if (!(fs_info->sb->s_flags & MS_RDONLY))
+-              btrfs_orphan_cleanup(root);
+-
+       return root;
+ fail:
+       free_fs_root(root);
+@@ -1689,6 +1688,7 @@ struct btrfs_root *open_ctree(struct sup
+       mutex_init(&fs_info->cleaner_mutex);
+       mutex_init(&fs_info->volume_mutex);
+       init_rwsem(&fs_info->extent_commit_sem);
++      init_rwsem(&fs_info->cleanup_work_sem);
+       init_rwsem(&fs_info->subvol_sem);
+       btrfs_init_free_cluster(&fs_info->meta_alloc_cluster);
+@@ -2388,6 +2388,11 @@ int btrfs_commit_super(struct btrfs_root
+       mutex_lock(&root->fs_info->cleaner_mutex);
+       btrfs_clean_old_snapshots(root);
+       mutex_unlock(&root->fs_info->cleaner_mutex);
++
++      /* wait until ongoing cleanup work is done */
++      down_write(&root->fs_info->cleanup_work_sem);
++      up_write(&root->fs_info->cleanup_work_sem);
++
+       trans = btrfs_start_transaction(root, 1);
+       ret = btrfs_commit_transaction(trans, root);
+       BUG_ON(ret);
+--- a/fs/btrfs/inode.c
++++ b/fs/btrfs/inode.c
+@@ -2093,16 +2093,17 @@ void btrfs_orphan_cleanup(struct btrfs_r
+       struct inode *inode;
+       int ret = 0, nr_unlink = 0, nr_truncate = 0;
+-      path = btrfs_alloc_path();
+-      if (!path)
++      if (!xchg(&root->clean_orphans, 0))
+               return;
++
++      path = btrfs_alloc_path();
++      BUG_ON(!path);
+       path->reada = -1;
+       key.objectid = BTRFS_ORPHAN_OBJECTID;
+       btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
+       key.offset = (u64)-1;
+-
+       while (1) {
+               ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+               if (ret < 0) {
+@@ -3298,6 +3299,11 @@ void btrfs_delete_inode(struct inode *in
+       }
+       btrfs_wait_ordered_range(inode, 0, (u64)-1);
++      if (root->fs_info->log_root_recovering) {
++              BUG_ON(!list_empty(&BTRFS_I(inode)->i_orphan));
++              goto no_delete;
++      }
++
+       if (inode->i_nlink > 0) {
+               BUG_ON(btrfs_root_refs(&root->root_item) != 0);
+               goto no_delete;
+@@ -3705,6 +3711,13 @@ struct inode *btrfs_lookup_dentry(struct
+       }
+       srcu_read_unlock(&root->fs_info->subvol_srcu, index);
++      if (root != sub_root) {
++              down_read(&root->fs_info->cleanup_work_sem);
++              if (!(inode->i_sb->s_flags & MS_RDONLY))
++                      btrfs_orphan_cleanup(sub_root);
++              up_read(&root->fs_info->cleanup_work_sem);
++      }
++
+       return inode;
+ }
+--- a/fs/btrfs/relocation.c
++++ b/fs/btrfs/relocation.c
+@@ -3755,6 +3755,7 @@ out:
+                                      BTRFS_DATA_RELOC_TREE_OBJECTID);
+               if (IS_ERR(fs_root))
+                       err = PTR_ERR(fs_root);
++              btrfs_orphan_cleanup(fs_root);
+       }
+       return err;
+ }
+--- a/fs/btrfs/tree-log.c
++++ b/fs/btrfs/tree-log.c
+@@ -930,6 +930,17 @@ out_nowrite:
+       return 0;
+ }
++static int insert_orphan_item(struct btrfs_trans_handle *trans,
++                            struct btrfs_root *root, u64 offset)
++{
++      int ret;
++      ret = btrfs_find_orphan_item(root, offset);
++      if (ret > 0)
++              ret = btrfs_insert_orphan_item(trans, root, offset);
++      return ret;
++}
++
++
+ /*
+  * There are a few corners where the link count of the file can't
+  * be properly maintained during replay.  So, instead of adding
+@@ -997,9 +1008,13 @@ static noinline int fixup_inode_link_cou
+       }
+       BTRFS_I(inode)->index_cnt = (u64)-1;
+-      if (inode->i_nlink == 0 && S_ISDIR(inode->i_mode)) {
+-              ret = replay_dir_deletes(trans, root, NULL, path,
+-                                       inode->i_ino, 1);
++      if (inode->i_nlink == 0) {
++              if (S_ISDIR(inode->i_mode)) {
++                      ret = replay_dir_deletes(trans, root, NULL, path,
++                                               inode->i_ino, 1);
++                      BUG_ON(ret);
++              }
++              ret = insert_orphan_item(trans, root, inode->i_ino);
+               BUG_ON(ret);
+       }
+       btrfs_free_path(path);
+@@ -1587,7 +1602,6 @@ static int replay_one_buffer(struct btrf
+               /* inode keys are done during the first stage */
+               if (key.type == BTRFS_INODE_ITEM_KEY &&
+                   wc->stage == LOG_WALK_REPLAY_INODES) {
+-                      struct inode *inode;
+                       struct btrfs_inode_item *inode_item;
+                       u32 mode;
+@@ -1603,31 +1617,16 @@ static int replay_one_buffer(struct btrf
+                                            eb, i, &key);
+                       BUG_ON(ret);
+-                      /* for regular files, truncate away
+-                       * extents past the new EOF
++                      /* for regular files, make sure a corresponding
++                       * orphan item exists. extents past the new EOF
++                       * will be truncated later by orphan cleanup.
+                        */
+                       if (S_ISREG(mode)) {
+-                              inode = read_one_inode(root,
+-                                                     key.objectid);
+-                              BUG_ON(!inode);
+-
+-                              ret = btrfs_truncate_inode_items(wc->trans,
+-                                      root, inode, inode->i_size,
+-                                      BTRFS_EXTENT_DATA_KEY);
++                              ret = insert_orphan_item(wc->trans, root,
++                                                       key.objectid);
+                               BUG_ON(ret);
+-
+-                              /* if the nlink count is zero here, the iput
+-                               * will free the inode.  We bump it to make
+-                               * sure it doesn't get freed until the link
+-                               * count fixup is done
+-                               */
+-                              if (inode->i_nlink == 0) {
+-                                      btrfs_inc_nlink(inode);
+-                                      btrfs_update_inode(wc->trans,
+-                                                         root, inode);
+-                              }
+-                              iput(inode);
+                       }
++
+                       ret = link_to_fixup_dir(wc->trans, root,
+                                               path, key.objectid);
+                       BUG_ON(ret);
diff --git a/queue-2.6.32/btrfs-avoid-superfluous-tree-log-writeout.patch b/queue-2.6.32/btrfs-avoid-superfluous-tree-log-writeout.patch
new file mode 100644 (file)
index 0000000..c36c136
--- /dev/null
@@ -0,0 +1,251 @@
+From 8cef4e160d74920ad1725f58c89fd75ec4c4ac38 Mon Sep 17 00:00:00 2001
+From: Yan, Zheng <zheng.yan@oracle.com>
+Date: Thu, 12 Nov 2009 09:33:26 +0000
+Subject: Btrfs: Avoid superfluous tree-log writeout
+
+From: Yan, Zheng <zheng.yan@oracle.com>
+
+commit 8cef4e160d74920ad1725f58c89fd75ec4c4ac38 upstream.
+
+We allow two log transactions at a time, but use the same flag
+to mark dirty tree-log btree blocks, so we may flush dirty
+blocks belonging to a newer log transaction when committing an
+older one. This patch fixes the issue by using two flags to
+mark dirty tree-log btree blocks.
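+
+The two in-flight log transactions are told apart by the parity of
+log_transid; in outline (a sketch of the extent-tree hunk below):
+
+	if (root->log_transid % 2 == 0)
+		set_extent_dirty(&root->dirty_log_pages, start, end, GFP_NOFS);
+	else
+		set_extent_new(&root->dirty_log_pages, start, end, GFP_NOFS);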
+
+Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
+Signed-off-by: Chris Mason <chris.mason@oracle.com>
+Acked-by: Jeff Mahoney <jeffm@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ fs/btrfs/disk-io.c     |    6 +++---
+ fs/btrfs/extent-tree.c |   12 ++++++++++--
+ fs/btrfs/transaction.c |   21 +++++++++++----------
+ fs/btrfs/transaction.h |    6 +++---
+ fs/btrfs/tree-log.c    |   33 ++++++++++++++++++++-------------
+ 5 files changed, 47 insertions(+), 31 deletions(-)
+
+--- a/fs/btrfs/disk-io.c
++++ b/fs/btrfs/disk-io.c
+@@ -980,12 +980,12 @@ int btrfs_free_log_root_tree(struct btrf
+       while (1) {
+               ret = find_first_extent_bit(&log_root_tree->dirty_log_pages,
+-                                  0, &start, &end, EXTENT_DIRTY);
++                              0, &start, &end, EXTENT_DIRTY | EXTENT_NEW);
+               if (ret)
+                       break;
+-              clear_extent_dirty(&log_root_tree->dirty_log_pages,
+-                                 start, end, GFP_NOFS);
++              clear_extent_bits(&log_root_tree->dirty_log_pages, start, end,
++                                EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS);
+       }
+       eb = fs_info->log_root_tree->node;
+--- a/fs/btrfs/extent-tree.c
++++ b/fs/btrfs/extent-tree.c
+@@ -4919,8 +4919,16 @@ struct extent_buffer *btrfs_init_new_buf
+       btrfs_set_buffer_uptodate(buf);
+       if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
+-              set_extent_dirty(&root->dirty_log_pages, buf->start,
+-                       buf->start + buf->len - 1, GFP_NOFS);
++              /*
++               * we allow two log transactions at a time, use a different
++               * EXTENT bit to differentiate dirty pages.
++               */
++              if (root->log_transid % 2 == 0)
++                      set_extent_dirty(&root->dirty_log_pages, buf->start,
++                                      buf->start + buf->len - 1, GFP_NOFS);
++              else
++                      set_extent_new(&root->dirty_log_pages, buf->start,
++                                      buf->start + buf->len - 1, GFP_NOFS);
+       } else {
+               set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
+                        buf->start + buf->len - 1, GFP_NOFS);
+--- a/fs/btrfs/transaction.c
++++ b/fs/btrfs/transaction.c
+@@ -354,7 +354,7 @@ int btrfs_end_transaction_throttle(struc
+  * those extents are sent to disk but does not wait on them
+  */
+ int btrfs_write_marked_extents(struct btrfs_root *root,
+-                             struct extent_io_tree *dirty_pages)
++                             struct extent_io_tree *dirty_pages, int mark)
+ {
+       int ret;
+       int err = 0;
+@@ -367,7 +367,7 @@ int btrfs_write_marked_extents(struct bt
+       while (1) {
+               ret = find_first_extent_bit(dirty_pages, start, &start, &end,
+-                                          EXTENT_DIRTY);
++                                          mark);
+               if (ret)
+                       break;
+               while (start <= end) {
+@@ -413,7 +413,7 @@ int btrfs_write_marked_extents(struct bt
+  * on all the pages and clear them from the dirty pages state tree
+  */
+ int btrfs_wait_marked_extents(struct btrfs_root *root,
+-                            struct extent_io_tree *dirty_pages)
++                            struct extent_io_tree *dirty_pages, int mark)
+ {
+       int ret;
+       int err = 0;
+@@ -425,12 +425,12 @@ int btrfs_wait_marked_extents(struct btr
+       unsigned long index;
+       while (1) {
+-              ret = find_first_extent_bit(dirty_pages, 0, &start, &end,
+-                                          EXTENT_DIRTY);
++              ret = find_first_extent_bit(dirty_pages, start, &start, &end,
++                                          mark);
+               if (ret)
+                       break;
+-              clear_extent_dirty(dirty_pages, start, end, GFP_NOFS);
++              clear_extent_bits(dirty_pages, start, end, mark, GFP_NOFS);
+               while (start <= end) {
+                       index = start >> PAGE_CACHE_SHIFT;
+                       start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
+@@ -460,13 +460,13 @@ int btrfs_wait_marked_extents(struct btr
+  * those extents are on disk for transaction or log commit
+  */
+ int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
+-                                      struct extent_io_tree *dirty_pages)
++                              struct extent_io_tree *dirty_pages, int mark)
+ {
+       int ret;
+       int ret2;
+-      ret = btrfs_write_marked_extents(root, dirty_pages);
+-      ret2 = btrfs_wait_marked_extents(root, dirty_pages);
++      ret = btrfs_write_marked_extents(root, dirty_pages, mark);
++      ret2 = btrfs_wait_marked_extents(root, dirty_pages, mark);
+       return ret || ret2;
+ }
+@@ -479,7 +479,8 @@ int btrfs_write_and_wait_transaction(str
+               return filemap_write_and_wait(btree_inode->i_mapping);
+       }
+       return btrfs_write_and_wait_marked_extents(root,
+-                                         &trans->transaction->dirty_pages);
++                                         &trans->transaction->dirty_pages,
++                                         EXTENT_DIRTY);
+ }
+ /*
+--- a/fs/btrfs/transaction.h
++++ b/fs/btrfs/transaction.h
+@@ -107,10 +107,10 @@ void btrfs_throttle(struct btrfs_root *r
+ int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
+                               struct btrfs_root *root);
+ int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
+-                                      struct extent_io_tree *dirty_pages);
++                              struct extent_io_tree *dirty_pages, int mark);
+ int btrfs_write_marked_extents(struct btrfs_root *root,
+-                                      struct extent_io_tree *dirty_pages);
++                              struct extent_io_tree *dirty_pages, int mark);
+ int btrfs_wait_marked_extents(struct btrfs_root *root,
+-                                      struct extent_io_tree *dirty_pages);
++                              struct extent_io_tree *dirty_pages, int mark);
+ int btrfs_transaction_in_commit(struct btrfs_fs_info *info);
+ #endif
+--- a/fs/btrfs/tree-log.c
++++ b/fs/btrfs/tree-log.c
+@@ -1977,10 +1977,11 @@ int btrfs_sync_log(struct btrfs_trans_ha
+ {
+       int index1;
+       int index2;
++      int mark;
+       int ret;
+       struct btrfs_root *log = root->log_root;
+       struct btrfs_root *log_root_tree = root->fs_info->log_root_tree;
+-      u64 log_transid = 0;
++      unsigned long log_transid = 0;
+       mutex_lock(&root->log_mutex);
+       index1 = root->log_transid % 2;
+@@ -2014,24 +2015,29 @@ int btrfs_sync_log(struct btrfs_trans_ha
+               goto out;
+       }
++      log_transid = root->log_transid;
++      if (log_transid % 2 == 0)
++              mark = EXTENT_DIRTY;
++      else
++              mark = EXTENT_NEW;
++
+       /* we start IO on  all the marked extents here, but we don't actually
+        * wait for them until later.
+        */
+-      ret = btrfs_write_marked_extents(log, &log->dirty_log_pages);
++      ret = btrfs_write_marked_extents(log, &log->dirty_log_pages, mark);
+       BUG_ON(ret);
+       btrfs_set_root_node(&log->root_item, log->node);
+       root->log_batch = 0;
+-      log_transid = root->log_transid;
+       root->log_transid++;
+       log->log_transid = root->log_transid;
+       root->log_start_pid = 0;
+       smp_mb();
+       /*
+-       * log tree has been flushed to disk, new modifications of
+-       * the log will be written to new positions. so it's safe to
+-       * allow log writers to go in.
++       * IO has been started, blocks of the log tree have WRITTEN flag set
++       * in their headers. new modifications of the log will be written to
++       * new positions. so it's safe to allow log writers to go in.
+        */
+       mutex_unlock(&root->log_mutex);
+@@ -2052,7 +2058,7 @@ int btrfs_sync_log(struct btrfs_trans_ha
+       index2 = log_root_tree->log_transid % 2;
+       if (atomic_read(&log_root_tree->log_commit[index2])) {
+-              btrfs_wait_marked_extents(log, &log->dirty_log_pages);
++              btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
+               wait_log_commit(trans, log_root_tree,
+                               log_root_tree->log_transid);
+               mutex_unlock(&log_root_tree->log_mutex);
+@@ -2072,16 +2078,17 @@ int btrfs_sync_log(struct btrfs_trans_ha
+        * check the full commit flag again
+        */
+       if (root->fs_info->last_trans_log_full_commit == trans->transid) {
+-              btrfs_wait_marked_extents(log, &log->dirty_log_pages);
++              btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
+               mutex_unlock(&log_root_tree->log_mutex);
+               ret = -EAGAIN;
+               goto out_wake_log_root;
+       }
+       ret = btrfs_write_and_wait_marked_extents(log_root_tree,
+-                              &log_root_tree->dirty_log_pages);
++                              &log_root_tree->dirty_log_pages,
++                              EXTENT_DIRTY | EXTENT_NEW);
+       BUG_ON(ret);
+-      btrfs_wait_marked_extents(log, &log->dirty_log_pages);
++      btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
+       btrfs_set_super_log_root(&root->fs_info->super_for_commit,
+                               log_root_tree->node->start);
+@@ -2147,12 +2154,12 @@ int btrfs_free_log(struct btrfs_trans_ha
+       while (1) {
+               ret = find_first_extent_bit(&log->dirty_log_pages,
+-                                  0, &start, &end, EXTENT_DIRTY);
++                              0, &start, &end, EXTENT_DIRTY | EXTENT_NEW);
+               if (ret)
+                       break;
+-              clear_extent_dirty(&log->dirty_log_pages,
+-                                 start, end, GFP_NOFS);
++              clear_extent_bits(&log->dirty_log_pages, start, end,
++                                EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS);
+       }
+       if (log->log_transid > 0) {
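
The core of this fix is visible across the hunks: with two log transactions
allowed in flight, each tags its dirty tree-log blocks with a different
extent bit, chosen by the parity of log_transid, and every writeout or
cleanup then operates on the matching bit only. A minimal sketch of the
selection logic (the helper name is hypothetical, not part of the patch):

    /* hypothetical helper: pick the dirty bit for a log transaction.
     * even transids use EXTENT_DIRTY, odd ones EXTENT_NEW, so committing
     * one log transaction never flushes blocks of the other. */
    static int log_dirty_mark(unsigned long log_transid)
    {
            return (log_transid % 2 == 0) ? EXTENT_DIRTY : EXTENT_NEW;
    }

btrfs_sync_log() passes the computed mark to btrfs_write_marked_extents()
and btrfs_wait_marked_extents(), while the teardown paths in
btrfs_free_log() and btrfs_free_log_root_tree() clear both bits at once
(EXTENT_DIRTY | EXTENT_NEW), since they discard everything.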
diff --git a/queue-2.6.32/btrfs-btrfs_mark_extent_written-uses-the-wrong-slot.patch b/queue-2.6.32/btrfs-btrfs_mark_extent_written-uses-the-wrong-slot.patch
new file mode 100644 (file)
index 0000000..8f6b44e
--- /dev/null
@@ -0,0 +1,48 @@
+From 3f6fae9559225741c91f1320090b285da1413290 Mon Sep 17 00:00:00 2001
+From: Shaohua Li <shaohua.li@intel.com>
+Date: Thu, 11 Feb 2010 07:43:00 +0000
+Subject: Btrfs: btrfs_mark_extent_written uses the wrong slot
+
+From: Shaohua Li <shaohua.li@intel.com>
+
+commit 3f6fae9559225741c91f1320090b285da1413290 upstream.
+
+My test does: fallocate a big file and then write to it. The file is
+512M, but after the write is done btrfs-debug-tree shows:
+item 6 key (257 EXTENT_DATA 0) itemoff 3516 itemsize 53
+                extent data disk byte 1103101952 nr 536870912
+                extent data offset 0 nr 399634432 ram 536870912
+                extent compression 0
+Looks like a regression introduced by
+6c7d54ac87f338c479d9729e8392eca3f76e11e1, where we set the wrong slot.
+
+Signed-off-by: Shaohua Li <shaohua.li@intel.com>
+Acked-by: Yan Zheng <zheng.yan@oracle.com>
+Signed-off-by: Chris Mason <chris.mason@oracle.com>
+Acked-by: Jeff Mahoney <jeffm@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ fs/btrfs/file.c |    6 ++++--
+ 1 file changed, 4 insertions(+), 2 deletions(-)
+
+--- a/fs/btrfs/file.c
++++ b/fs/btrfs/file.c
+@@ -720,13 +720,15 @@ again:
+                                       inode->i_ino, orig_offset);
+               BUG_ON(ret);
+       }
+-      fi = btrfs_item_ptr(leaf, path->slots[0],
+-                         struct btrfs_file_extent_item);
+       if (del_nr == 0) {
++              fi = btrfs_item_ptr(leaf, path->slots[0],
++                         struct btrfs_file_extent_item);
+               btrfs_set_file_extent_type(leaf, fi,
+                                          BTRFS_FILE_EXTENT_REG);
+               btrfs_mark_buffer_dirty(leaf);
+       } else {
++              fi = btrfs_item_ptr(leaf, del_slot - 1,
++                         struct btrfs_file_extent_item);
+               btrfs_set_file_extent_type(leaf, fi,
+                                          BTRFS_FILE_EXTENT_REG);
+               btrfs_set_file_extent_num_bytes(leaf, fi,
diff --git a/queue-2.6.32/btrfs-check-return-value-of-open_bdev_exclusive-properly.patch b/queue-2.6.32/btrfs-check-return-value-of-open_bdev_exclusive-properly.patch
new file mode 100644 (file)
index 0000000..b45a988
--- /dev/null
@@ -0,0 +1,37 @@
+From 7f59203abeaf18bf3497b308891f95a4489810ad Mon Sep 17 00:00:00 2001
+From: Josef Bacik <josef@redhat.com>
+Date: Wed, 27 Jan 2010 02:09:00 +0000
+Subject: Btrfs: check return value of open_bdev_exclusive properly
+
+From: Josef Bacik <josef@redhat.com>
+
+commit 7f59203abeaf18bf3497b308891f95a4489810ad upstream.
+
+Hit this problem while testing RAID1 failure stuff.  open_bdev_exclusive
+returns ERR_PTR(), not NULL, so check the return value properly.  This
+is important: if you accidentally specify a device that doesn't exist
+when trying to add a new device to an array, you will panic the box
+dereferencing bdev.
+
+Signed-off-by: Josef Bacik <josef@redhat.com>
+Signed-off-by: Chris Mason <chris.mason@oracle.com>
+Acked-by: Jeff Mahoney <jeffm@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ fs/btrfs/volumes.c |    4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+--- a/fs/btrfs/volumes.c
++++ b/fs/btrfs/volumes.c
+@@ -1434,8 +1434,8 @@ int btrfs_init_new_device(struct btrfs_r
+               return -EINVAL;
+       bdev = open_bdev_exclusive(device_path, 0, root->fs_info->bdev_holder);
+-      if (!bdev)
+-              return -EIO;
++      if (IS_ERR(bdev))
++              return PTR_ERR(bdev);
+       if (root->fs_info->fs_devices->seeding) {
+               seeding_dev = 1;
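
What the fix relies on is the kernel's ERR_PTR convention: on failure,
open_bdev_exclusive() never returns NULL but encodes the errno in the
pointer value itself, so callers must test with IS_ERR() and decode with
PTR_ERR(). A minimal sketch of the corrected calling pattern, assuming
only <linux/err.h> (the wrapper function below is illustrative, not from
the patch):

    #include <linux/err.h>

    /* illustrative caller: failure comes back as ERR_PTR(-errno) */
    static int grab_device(const char *device_path, void *holder)
    {
            struct block_device *bdev;

            bdev = open_bdev_exclusive(device_path, 0, holder);
            if (IS_ERR(bdev))
                    return PTR_ERR(bdev);  /* e.g. -ENOENT for a bad path */
            /* ... use bdev, later close_bdev_exclusive(bdev, 0) ... */
            return 0;
    }

Testing against NULL, as the old code did, lets the error pointer through
and the first dereference of bdev oopses.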
diff --git a/queue-2.6.32/btrfs-check-total-number-of-devices-when-removing-missing.patch b/queue-2.6.32/btrfs-check-total-number-of-devices-when-removing-missing.patch
new file mode 100644 (file)
index 0000000..cde0ebd
--- /dev/null
@@ -0,0 +1,51 @@
+From 035fe03a7ad56982b30ab3a522b7b08d58feccd0 Mon Sep 17 00:00:00 2001
+From: Josef Bacik <josef@redhat.com>
+Date: Wed, 27 Jan 2010 02:09:38 +0000
+Subject: Btrfs: check total number of devices when removing missing
+
+From: Josef Bacik <josef@redhat.com>
+
+commit 035fe03a7ad56982b30ab3a522b7b08d58feccd0 upstream.
+
+If you have a disk failure in RAID1 and then add a new disk to the
+array, and then try to remove the missing volume, it will fail.  The
+reason is that the sanity check only looks at the number of rw devices,
+which is just 2 because we have 2 good disks and 1 bad one.  Instead,
+check the total number of devices in the array to make sure we can
+actually remove the device.  Tested this with a failed-disk setup; with
+this fix we can now run
+
+btrfs-vol -r missing /mount/point
+
+and it works fine.
+
+Signed-off-by: Josef Bacik <josef@redhat.com>
+Signed-off-by: Chris Mason <chris.mason@oracle.com>
+Acked-by: Jeff Mahoney <jeffm@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+
+---
+ fs/btrfs/volumes.c |    4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+--- a/fs/btrfs/volumes.c
++++ b/fs/btrfs/volumes.c
+@@ -1135,7 +1135,7 @@ int btrfs_rm_device(struct btrfs_root *r
+               root->fs_info->avail_metadata_alloc_bits;
+       if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) &&
+-          root->fs_info->fs_devices->rw_devices <= 4) {
++          root->fs_info->fs_devices->num_devices <= 4) {
+               printk(KERN_ERR "btrfs: unable to go below four devices "
+                      "on raid10\n");
+               ret = -EINVAL;
+@@ -1143,7 +1143,7 @@ int btrfs_rm_device(struct btrfs_root *r
+       }
+       if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) &&
+-          root->fs_info->fs_devices->rw_devices <= 2) {
++          root->fs_info->fs_devices->num_devices <= 2) {
+               printk(KERN_ERR "btrfs: unable to go below two "
+                      "devices on raid1\n");
+               ret = -EINVAL;
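
To make the arithmetic concrete: after replacing one failed disk in a
two-disk RAID1, the array holds 3 devices of which only 2 are rw (the
missing one is not writeable). The old guard compared the rw count
against the RAID1 minimum and refused the removal; counting every
device, missing ones included, lets the dead device go. A sketch of the
corrected guard, condensed from the hunk above:

    /* RAID1 must keep at least 2 devices, so count all devices
     * (rw and missing), not just the writable ones */
    if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) &&
        root->fs_info->fs_devices->num_devices <= 2)
            return -EINVAL;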
diff --git a/queue-2.6.32/btrfs-deal-with-null-acl-sent-to-btrfs_set_acl.patch b/queue-2.6.32/btrfs-deal-with-null-acl-sent-to-btrfs_set_acl.patch
new file mode 100644 (file)
index 0000000..83b7d75
--- /dev/null
@@ -0,0 +1,43 @@
+From a9cc71a60c29a09174bee2fcef8f924c529fd4b7 Mon Sep 17 00:00:00 2001
+From: Chris Mason <chris.mason@oracle.com>
+Date: Sun, 17 Jan 2010 20:36:18 -0500
+Subject: Btrfs: deal with NULL acl sent to btrfs_set_acl
+
+From: Chris Mason <chris.mason@oracle.com>
+
+commit a9cc71a60c29a09174bee2fcef8f924c529fd4b7 upstream.
+
+It is legal for btrfs_set_acl to be sent a NULL acl.  This
+makes sure we don't dereference it.  A similar patch was sent by
+Johannes Hirte <johannes.hirte@fem.tu-ilmenau.de>
+
+Signed-off-by: Chris Mason <chris.mason@oracle.com>
+Acked-by: Jeff Mahoney <jeffm@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ fs/btrfs/acl.c |   12 +++++++-----
+ 1 file changed, 7 insertions(+), 5 deletions(-)
+
+--- a/fs/btrfs/acl.c
++++ b/fs/btrfs/acl.c
+@@ -112,12 +112,14 @@ static int btrfs_set_acl(struct btrfs_tr
+       switch (type) {
+       case ACL_TYPE_ACCESS:
+               mode = inode->i_mode;
+-              ret = posix_acl_equiv_mode(acl, &mode);
+-              if (ret < 0)
+-                      return ret;
+-              ret = 0;
+-              inode->i_mode = mode;
+               name = POSIX_ACL_XATTR_ACCESS;
++              if (acl) {
++                      ret = posix_acl_equiv_mode(acl, &mode);
++                      if (ret < 0)
++                              return ret;
++                      inode->i_mode = mode;
++              }
++              ret = 0;
+               break;
+       case ACL_TYPE_DEFAULT:
+               if (!S_ISDIR(inode->i_mode))
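
A NULL acl here typically means "remove the ACL" (for example when the
ACL xattr is deleted), so only a non-NULL acl can be folded into the
mode bits. A sketch of the guarded shape after the fix, with the
surrounding switch elided:

    if (acl) {
            /* translate the ACL into i_mode bits only when one is given */
            ret = posix_acl_equiv_mode(acl, &mode);
            if (ret < 0)
                    return ret;
            inode->i_mode = mode;
    }
    ret = 0;  /* a NULL acl falls through to removing the xattr */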
diff --git a/queue-2.6.32/btrfs-deny-sys_link-across-subvolumes.patch b/queue-2.6.32/btrfs-deny-sys_link-across-subvolumes.patch
new file mode 100644 (file)
index 0000000..2dda52a
--- /dev/null
@@ -0,0 +1,42 @@
+From 4a8be425a8fb8fbb5d881eb55fa6634c3463b9c9 Mon Sep 17 00:00:00 2001
+From: TARUISI Hiroaki <taruishi.hiroak@jp.fujitsu.com>
+Date: Thu, 12 Nov 2009 07:14:26 +0000
+Subject: Btrfs: deny sys_link across subvolumes.
+
+From: TARUISI Hiroaki <taruishi.hiroak@jp.fujitsu.com>
+
+commit 4a8be425a8fb8fbb5d881eb55fa6634c3463b9c9 upstream.
+
+I rebased Christian Parpart's patch to deny hard links across
+subvolumes. The original patch also modifies btrfs_rename, but
+I excluded that part because we can now move across subvolumes,
+so it is not a problem there.
+-----------------
+
+Hard links across subvolumes should not be allowed in Btrfs.
+btrfs_link checks that the root of the 'to' directory is the same as
+the root of the 'from' file. If they differ, btrfs_link returns -EPERM.
+
+Signed-off-by: TARUISI Hiroaki <taruishi.hiroak@jp.fujitsu.com>
+Signed-off-by: Chris Mason <chris.mason@oracle.com>
+Acked-by: Jeff Mahoney <jeffm@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+
+---
+ fs/btrfs/inode.c |    4 ++++
+ 1 file changed, 4 insertions(+)
+
+--- a/fs/btrfs/inode.c
++++ b/fs/btrfs/inode.c
+@@ -4462,6 +4462,10 @@ static int btrfs_link(struct dentry *old
+       if (inode->i_nlink == 0)
+               return -ENOENT;
++      /* do not allow sys_link's with other subvols of the same device */
++      if (root->objectid != BTRFS_I(inode)->root->objectid)
++              return -EPERM;
++
+       /*
+        * 1 item for inode ref
+        * 2 items for dir items
diff --git a/queue-2.6.32/btrfs-do-not-mark-the-chunk-as-readonly-if-in-degraded-mode.patch b/queue-2.6.32/btrfs-do-not-mark-the-chunk-as-readonly-if-in-degraded-mode.patch
new file mode 100644 (file)
index 0000000..c0c829f
--- /dev/null
@@ -0,0 +1,41 @@
+From f48b90756bd834dda852ff514f2690d3175b1f44 Mon Sep 17 00:00:00 2001
+From: Josef Bacik <josef@redhat.com>
+Date: Wed, 27 Jan 2010 02:07:59 +0000
+Subject: Btrfs: do not mark the chunk as readonly if in degraded mode
+
+From: Josef Bacik <josef@redhat.com>
+
+commit f48b90756bd834dda852ff514f2690d3175b1f44 upstream.
+
+If a RAID setup has chunks that span multiple disks, and one of those
+disks has failed, btrfs_chunk_readonly will return 1 since one of the
+disks in that chunk's stripes is dead and therefore not writeable.  So
+instead, if we are in degraded mode, return 0 so we can go ahead and
+allocate stuff.  Without this patch all of the block groups in a RAID1
+setup will end up read-only, which will mean we can't add new disks to
+the array since we won't be able to make allocations.
+
+Signed-off-by: Josef Bacik <josef@redhat.com>
+Signed-off-by: Chris Mason <chris.mason@oracle.com>
+Acked-by: Jeff Mahoney <jeffm@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+
+---
+ fs/btrfs/volumes.c |    5 +++++
+ 1 file changed, 5 insertions(+)
+
+--- a/fs/btrfs/volumes.c
++++ b/fs/btrfs/volumes.c
+@@ -2538,6 +2538,11 @@ int btrfs_chunk_readonly(struct btrfs_ro
+       if (!em)
+               return 1;
++      if (btrfs_test_opt(root, DEGRADED)) {
++              free_extent_map(em);
++              return 0;
++      }
++
+       map = (struct map_lookup *)em->bdev;
+       for (i = 0; i < map->num_stripes; i++) {
+               if (!map->stripes[i].dev->writeable) {
diff --git a/queue-2.6.32/btrfs-do-not-try-and-lookup-the-file-extent-when-finishing-ordered-io.patch b/queue-2.6.32/btrfs-do-not-try-and-lookup-the-file-extent-when-finishing-ordered-io.patch
new file mode 100644 (file)
index 0000000..dca70e7
--- /dev/null
@@ -0,0 +1,105 @@
+From efd049fb26a162c3830fd3cb1001fdc09b147f3b Mon Sep 17 00:00:00 2001
+From: Josef Bacik <josef@redhat.com>
+Date: Tue, 2 Feb 2010 20:50:10 +0000
+Subject: Btrfs: do not try and lookup the file extent when finishing ordered io
+
+From: Josef Bacik <josef@redhat.com>
+
+commit efd049fb26a162c3830fd3cb1001fdc09b147f3b upstream.
+
+When running the following fio job
+
+[torrent]
+filename=torrent-test
+rw=randwrite
+size=4g
+filesize=4g
+bs=4k
+ioengine=sync
+
+you would see long stalls where no work was being done.  That is because we were
+doing all this extra work to read in the file extent outside of the transaction;
+however, in the random io case this ends up hurting us because the file extents
+are not there to begin with.  So axe this logic, since we end up reading in the
+file extent when we go to update it anyway.  This took the fio job from 11 mb/s
+with several ~10 second stalls to 24 mb/s with a couple of 1-2 second stalls.
+
+Signed-off-by: Josef Bacik <josef@redhat.com>
+Signed-off-by: Chris Mason <chris.mason@oracle.com>
+Acked-by: Jeff Mahoney <jeffm@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ fs/btrfs/inode.c |   46 ++--------------------------------------------
+ 1 file changed, 2 insertions(+), 44 deletions(-)
+
+--- a/fs/btrfs/inode.c
++++ b/fs/btrfs/inode.c
+@@ -1680,24 +1680,6 @@ static int insert_reserved_file_extent(s
+  * before we start the transaction.  It limits the amount of btree
+  * reads required while inside the transaction.
+  */
+-static noinline void reada_csum(struct btrfs_root *root,
+-                              struct btrfs_path *path,
+-                              struct btrfs_ordered_extent *ordered_extent)
+-{
+-      struct btrfs_ordered_sum *sum;
+-      u64 bytenr;
+-
+-      sum = list_entry(ordered_extent->list.next, struct btrfs_ordered_sum,
+-                       list);
+-      bytenr = sum->sums[0].bytenr;
+-
+-      /*
+-       * we don't care about the results, the point of this search is
+-       * just to get the btree leaves into ram
+-       */
+-      btrfs_lookup_csum(NULL, root->fs_info->csum_root, path, bytenr, 0);
+-}
+-
+ /* as ordered data IO finishes, this gets called so we can finish
+  * an ordered extent if the range of bytes in the file it covers are
+  * fully written.
+@@ -1708,7 +1690,6 @@ static int btrfs_finish_ordered_io(struc
+       struct btrfs_trans_handle *trans;
+       struct btrfs_ordered_extent *ordered_extent = NULL;
+       struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+-      struct btrfs_path *path;
+       int compressed = 0;
+       int ret;
+@@ -1716,32 +1697,9 @@ static int btrfs_finish_ordered_io(struc
+       if (!ret)
+               return 0;
+-      /*
+-       * before we join the transaction, try to do some of our IO.
+-       * This will limit the amount of IO that we have to do with
+-       * the transaction running.  We're unlikely to need to do any
+-       * IO if the file extents are new, the disk_i_size checks
+-       * covers the most common case.
+-       */
+-      if (start < BTRFS_I(inode)->disk_i_size) {
+-              path = btrfs_alloc_path();
+-              if (path) {
+-                      ret = btrfs_lookup_file_extent(NULL, root, path,
+-                                                     inode->i_ino,
+-                                                     start, 0);
+-                      ordered_extent = btrfs_lookup_ordered_extent(inode,
+-                                                                   start);
+-                      if (!list_empty(&ordered_extent->list)) {
+-                              btrfs_release_path(root, path);
+-                              reada_csum(root, path, ordered_extent);
+-                      }
+-                      btrfs_free_path(path);
+-              }
+-      }
+-
+-      if (!ordered_extent)
+-              ordered_extent = btrfs_lookup_ordered_extent(inode, start);
++      ordered_extent = btrfs_lookup_ordered_extent(inode, start);
+       BUG_ON(!ordered_extent);
++
+       if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
+               BUG_ON(!list_empty(&ordered_extent->list));
+               ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent);
diff --git a/queue-2.6.32/btrfs-don-t-add-extent-0-to-the-free-space-cache-v2.patch b/queue-2.6.32/btrfs-don-t-add-extent-0-to-the-free-space-cache-v2.patch
new file mode 100644 (file)
index 0000000..a2dda34
--- /dev/null
@@ -0,0 +1,48 @@
+From 06b2331f8333ec6edf41662757ce8882cc1747d5 Mon Sep 17 00:00:00 2001
+From: Yan, Zheng <zheng.yan@oracle.com>
+Date: Thu, 26 Nov 2009 09:31:11 +0000
+Subject: Btrfs: don't add extent 0 to the free space cache v2
+
+From: Yan, Zheng <zheng.yan@oracle.com>
+
+commit 06b2331f8333ec6edf41662757ce8882cc1747d5 upstream.
+
+If block group 0 is completely free, btrfs_read_block_groups will
+add extent [0, BTRFS_SUPER_INFO_OFFSET) to the free space cache.
+
+Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
+Signed-off-by: Chris Mason <chris.mason@oracle.com>
+Acked-by: Jeff Mahoney <jeffm@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+
+---
+ fs/btrfs/extent-tree.c |   10 +++++++++-
+ 1 file changed, 9 insertions(+), 1 deletion(-)
+
+--- a/fs/btrfs/extent-tree.c
++++ b/fs/btrfs/extent-tree.c
+@@ -195,6 +195,14 @@ static int exclude_super_stripes(struct
+       int stripe_len;
+       int i, nr, ret;
++      if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) {
++              stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid;
++              cache->bytes_super += stripe_len;
++              ret = add_excluded_extent(root, cache->key.objectid,
++                                        stripe_len);
++              BUG_ON(ret);
++      }
++
+       for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
+               bytenr = btrfs_sb_offset(i);
+               ret = btrfs_rmap_block(&root->fs_info->mapping_tree,
+@@ -255,7 +263,7 @@ static u64 add_new_free_space(struct btr
+               if (ret)
+                       break;
+-              if (extent_start == start) {
++              if (extent_start <= start) {
+                       start = extent_end + 1;
+               } else if (extent_start > start && extent_start < end) {
+                       size = extent_start - start;
diff --git a/queue-2.6.32/btrfs-fail-mount-on-bad-mount-options.patch b/queue-2.6.32/btrfs-fail-mount-on-bad-mount-options.patch
new file mode 100644 (file)
index 0000000..9d66a71
--- /dev/null
@@ -0,0 +1,50 @@
+From a7a3f7cadd9bdee569243f7ead9550aa16b60e07 Mon Sep 17 00:00:00 2001
+From: Sage Weil <sage@newdream.net>
+Date: Sat, 7 Nov 2009 06:19:16 +0000
+Subject: Btrfs: fail mount on bad mount options
+
+From: Sage Weil <sage@newdream.net>
+
+commit a7a3f7cadd9bdee569243f7ead9550aa16b60e07 upstream.
+
+We shouldn't silently ignore unrecognized options.
+
+Signed-off-by: Sage Weil <sage@newdream.net>
+Signed-off-by: Chris Mason <chris.mason@oracle.com>
+Acked-by: Jeff Mahoney <jeffm@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ fs/btrfs/super.c |    9 ++++++++-
+ 1 file changed, 8 insertions(+), 1 deletion(-)
+
+--- a/fs/btrfs/super.c
++++ b/fs/btrfs/super.c
+@@ -128,6 +128,7 @@ int btrfs_parse_options(struct btrfs_roo
+       substring_t args[MAX_OPT_ARGS];
+       char *p, *num;
+       int intarg;
++      int ret = 0;
+       if (!options)
+               return 0;
+@@ -262,12 +263,18 @@ int btrfs_parse_options(struct btrfs_roo
+               case Opt_discard:
+                       btrfs_set_opt(info->mount_opt, DISCARD);
+                       break;
++              case Opt_err:
++                      printk(KERN_INFO "btrfs: unrecognized mount option "
++                             "'%s'\n", p);
++                      ret = -EINVAL;
++                      goto out;
+               default:
+                       break;
+               }
+       }
++out:
+       kfree(options);
+-      return 0;
++      return ret;
+ }
+ /*
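
The loop being fixed is the standard match_token() pattern from
<linux/parser.h>: the token table ends in an Opt_err catch-all, which
match_token() returns for any unrecognized string, and before this patch
the switch silently fell through for it. A condensed sketch of the loop
after the fix (local names as in the hunks):

    token = match_token(p, tokens, args);  /* unknown strings -> Opt_err */
    switch (token) {
    case Opt_discard:
            btrfs_set_opt(info->mount_opt, DISCARD);
            break;
    case Opt_err:
            printk(KERN_INFO "btrfs: unrecognized mount option '%s'\n", p);
            ret = -EINVAL;
            goto out;  /* fail the mount instead of ignoring it */
    default:
            break;
    }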
diff --git a/queue-2.6.32/btrfs-fix-a-memory-leak-in-btrfs_init_acl.patch b/queue-2.6.32/btrfs-fix-a-memory-leak-in-btrfs_init_acl.patch
new file mode 100644 (file)
index 0000000..f692439
--- /dev/null
@@ -0,0 +1,31 @@
+From f858153c367a397235d3e81136741e40e44faf1d Mon Sep 17 00:00:00 2001
+From: Yang Hongyang <yanghy@cn.fujitsu.com>
+Date: Tue, 26 Jan 2010 00:48:23 +0000
+Subject: Btrfs: fix a memory leak in btrfs_init_acl
+
+From: Yang Hongyang <yanghy@cn.fujitsu.com>
+
+commit f858153c367a397235d3e81136741e40e44faf1d upstream.
+
+In btrfs_init_acl() the cloned acl is not released.
+
+Signed-off-by: Yang Hongyang <yanghy@cn.fujitsu.com>
+Signed-off-by: Chris Mason <chris.mason@oracle.com>
+Acked-by: Jeff Mahoney <jeffm@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+
+---
+ fs/btrfs/acl.c |    1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/fs/btrfs/acl.c
++++ b/fs/btrfs/acl.c
+@@ -272,6 +272,7 @@ int btrfs_init_acl(struct btrfs_trans_ha
+                                                   ACL_TYPE_ACCESS);
+                       }
+               }
++              posix_acl_release(clone);
+       }
+ failed:
+       posix_acl_release(acl);
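
The leak follows the posix_acl refcount rules: posix_acl_clone() hands
back a new reference that must be dropped with posix_acl_release() on
every path, success included. A sketch of the balanced shape inside
btrfs_init_acl(), with error handling elided (2.6.32-era API assumed):

    clone = posix_acl_clone(acl, GFP_NOFS);
    if (clone) {
            ret = posix_acl_create_masq(clone, &mode);
            if (ret > 0)  /* the masked ACL still carries information */
                    ret = btrfs_set_acl(trans, inode, clone,
                                        ACL_TYPE_ACCESS);
            posix_acl_release(clone);  /* the reference the bug leaked */
    }
    posix_acl_release(acl);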
diff --git a/queue-2.6.32/btrfs-fix-btrfs_drop_extent_cache-for-skip-pinned-case.patch b/queue-2.6.32/btrfs-fix-btrfs_drop_extent_cache-for-skip-pinned-case.patch
new file mode 100644 (file)
index 0000000..9ecc378
--- /dev/null
@@ -0,0 +1,45 @@
+From 55ef68990029fcd8d04d42fc184aa7fb18cf309e Mon Sep 17 00:00:00 2001
+From: Yan, Zheng <zheng.yan@oracle.com>
+Date: Thu, 12 Nov 2009 09:36:44 +0000
+Subject: Btrfs: Fix btrfs_drop_extent_cache for skip pinned case
+
+From: Yan, Zheng <zheng.yan@oracle.com>
+
+commit 55ef68990029fcd8d04d42fc184aa7fb18cf309e upstream.
+
+The check for the skip-pinned case is wrong; it may break the
+while loop too soon.
+
+Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
+Signed-off-by: Chris Mason <chris.mason@oracle.com>
+Acked-by: Jeff Mahoney <jeffm@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ fs/btrfs/file.c |   10 +++-------
+ 1 file changed, 3 insertions(+), 7 deletions(-)
+
+--- a/fs/btrfs/file.c
++++ b/fs/btrfs/file.c
+@@ -179,18 +179,14 @@ int btrfs_drop_extent_cache(struct inode
+               }
+               flags = em->flags;
+               if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) {
+-                      if (em->start <= start &&
+-                          (!testend || em->start + em->len >= start + len)) {
++                      if (testend && em->start + em->len >= start + len) {
+                               free_extent_map(em);
+                               write_unlock(&em_tree->lock);
+                               break;
+                       }
+-                      if (start < em->start) {
+-                              len = em->start - start;
+-                      } else {
++                      start = em->start + em->len;
++                      if (testend)
+                               len = start + len - (em->start + em->len);
+-                              start = em->start + em->len;
+-                      }
+                       free_extent_map(em);
+                       write_unlock(&em_tree->lock);
+                       continue;
diff --git a/queue-2.6.32/btrfs-fix-disk_i_size-update-corner-case.patch b/queue-2.6.32/btrfs-fix-disk_i_size-update-corner-case.patch
new file mode 100644 (file)
index 0000000..c742fd4
--- /dev/null
@@ -0,0 +1,448 @@
+From c216775458a2ee345d9412a2770c2916acfb5d30 Mon Sep 17 00:00:00 2001
+From: Yan, Zheng <zheng.yan@oracle.com>
+Date: Thu, 12 Nov 2009 09:34:21 +0000
+Subject: Btrfs: Fix disk_i_size update corner case
+
+From: Yan, Zheng <zheng.yan@oracle.com>
+
+commit c216775458a2ee345d9412a2770c2916acfb5d30 upstream.
+
+There are some cases where file extents are inserted without involving
+an ordered struct. In these cases, we update disk_i_size directly,
+without checking for pending ordered extents or the DELALLOC bit. This
+patch extends btrfs_ordered_update_i_size() to handle these cases.
+
+Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
+Signed-off-by: Chris Mason <chris.mason@oracle.com>
+Acked-by: Jeff Mahoney <jeffm@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ fs/btrfs/btrfs_inode.h  |    5 --
+ fs/btrfs/inode.c        |   71 ++++++++++++++++++++------------
+ fs/btrfs/ordered-data.c |  105 +++++++++++++++++++++++++++++++++++++-----------
+ fs/btrfs/ordered-data.h |    2 
+ 4 files changed, 127 insertions(+), 56 deletions(-)
+
+--- a/fs/btrfs/btrfs_inode.h
++++ b/fs/btrfs/btrfs_inode.h
+@@ -44,9 +44,6 @@ struct btrfs_inode {
+        */
+       struct extent_io_tree io_failure_tree;
+-      /* held while inesrting or deleting extents from files */
+-      struct mutex extent_mutex;
+-
+       /* held while logging the inode in tree-log.c */
+       struct mutex log_mutex;
+@@ -166,7 +163,7 @@ static inline struct btrfs_inode *BTRFS_
+ static inline void btrfs_i_size_write(struct inode *inode, u64 size)
+ {
+-      inode->i_size = size;
++      i_size_write(inode, size);
+       BTRFS_I(inode)->disk_i_size = size;
+ }
+--- a/fs/btrfs/inode.c
++++ b/fs/btrfs/inode.c
+@@ -188,8 +188,18 @@ static noinline int insert_inline_extent
+       btrfs_mark_buffer_dirty(leaf);
+       btrfs_free_path(path);
++      /*
++       * we're an inline extent, so nobody can
++       * extend the file past i_size without locking
++       * a page we already have locked.
++       *
++       * We must do any isize and inode updates
++       * before we unlock the pages.  Otherwise we
++       * could end up racing with unlink.
++       */
+       BTRFS_I(inode)->disk_i_size = inode->i_size;
+       btrfs_update_inode(trans, root, inode);
++
+       return 0;
+ fail:
+       btrfs_free_path(path);
+@@ -415,7 +425,6 @@ again:
+                                                   start, end,
+                                                   total_compressed, pages);
+               }
+-              btrfs_end_transaction(trans, root);
+               if (ret == 0) {
+                       /*
+                        * inline extent creation worked, we don't need
+@@ -429,9 +438,11 @@ again:
+                            EXTENT_CLEAR_DELALLOC |
+                            EXTENT_CLEAR_ACCOUNTING |
+                            EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK);
+-                      ret = 0;
++
++                      btrfs_end_transaction(trans, root);
+                       goto free_pages_out;
+               }
++              btrfs_end_transaction(trans, root);
+       }
+       if (will_compress) {
+@@ -542,7 +553,6 @@ static noinline int submit_compressed_ex
+       if (list_empty(&async_cow->extents))
+               return 0;
+-      trans = btrfs_join_transaction(root, 1);
+       while (!list_empty(&async_cow->extents)) {
+               async_extent = list_entry(async_cow->extents.next,
+@@ -589,19 +599,15 @@ retry:
+               lock_extent(io_tree, async_extent->start,
+                           async_extent->start + async_extent->ram_size - 1,
+                           GFP_NOFS);
+-              /*
+-               * here we're doing allocation and writeback of the
+-               * compressed pages
+-               */
+-              btrfs_drop_extent_cache(inode, async_extent->start,
+-                                      async_extent->start +
+-                                      async_extent->ram_size - 1, 0);
++              trans = btrfs_join_transaction(root, 1);
+               ret = btrfs_reserve_extent(trans, root,
+                                          async_extent->compressed_size,
+                                          async_extent->compressed_size,
+                                          0, alloc_hint,
+                                          (u64)-1, &ins, 1);
++              btrfs_end_transaction(trans, root);
++
+               if (ret) {
+                       int i;
+                       for (i = 0; i < async_extent->nr_pages; i++) {
+@@ -617,6 +623,14 @@ retry:
+                       goto retry;
+               }
++              /*
++               * here we're doing allocation and writeback of the
++               * compressed pages
++               */
++              btrfs_drop_extent_cache(inode, async_extent->start,
++                                      async_extent->start +
++                                      async_extent->ram_size - 1, 0);
++
+               em = alloc_extent_map(GFP_NOFS);
+               em->start = async_extent->start;
+               em->len = async_extent->ram_size;
+@@ -648,8 +662,6 @@ retry:
+                                              BTRFS_ORDERED_COMPRESSED);
+               BUG_ON(ret);
+-              btrfs_end_transaction(trans, root);
+-
+               /*
+                * clear dirty, set writeback and unlock the pages.
+                */
+@@ -671,13 +683,11 @@ retry:
+                                   async_extent->nr_pages);
+               BUG_ON(ret);
+-              trans = btrfs_join_transaction(root, 1);
+               alloc_hint = ins.objectid + ins.offset;
+               kfree(async_extent);
+               cond_resched();
+       }
+-      btrfs_end_transaction(trans, root);
+       return 0;
+ }
+@@ -741,6 +751,7 @@ static noinline int cow_file_range(struc
+                                    EXTENT_CLEAR_DIRTY |
+                                    EXTENT_SET_WRITEBACK |
+                                    EXTENT_END_WRITEBACK);
++
+                       *nr_written = *nr_written +
+                            (end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE;
+                       *page_started = 1;
+@@ -1727,18 +1738,27 @@ static int btrfs_finish_ordered_io(struc
+               }
+       }
+-      trans = btrfs_join_transaction(root, 1);
+-
+       if (!ordered_extent)
+               ordered_extent = btrfs_lookup_ordered_extent(inode, start);
+       BUG_ON(!ordered_extent);
+-      if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags))
+-              goto nocow;
++      if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
++              BUG_ON(!list_empty(&ordered_extent->list));
++              ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent);
++              if (!ret) {
++                      trans = btrfs_join_transaction(root, 1);
++                      ret = btrfs_update_inode(trans, root, inode);
++                      BUG_ON(ret);
++                      btrfs_end_transaction(trans, root);
++              }
++              goto out;
++      }
+       lock_extent(io_tree, ordered_extent->file_offset,
+                   ordered_extent->file_offset + ordered_extent->len - 1,
+                   GFP_NOFS);
++      trans = btrfs_join_transaction(root, 1);
++
+       if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
+               compressed = 1;
+       if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
+@@ -1765,22 +1785,20 @@ static int btrfs_finish_ordered_io(struc
+       unlock_extent(io_tree, ordered_extent->file_offset,
+                   ordered_extent->file_offset + ordered_extent->len - 1,
+                   GFP_NOFS);
+-nocow:
+       add_pending_csums(trans, inode, ordered_extent->file_offset,
+                         &ordered_extent->list);
+-      mutex_lock(&BTRFS_I(inode)->extent_mutex);
+-      btrfs_ordered_update_i_size(inode, ordered_extent);
+-      btrfs_update_inode(trans, root, inode);
+-      btrfs_remove_ordered_extent(inode, ordered_extent);
+-      mutex_unlock(&BTRFS_I(inode)->extent_mutex);
+-
++      /* this also removes the ordered extent from the tree */
++      btrfs_ordered_update_i_size(inode, 0, ordered_extent);
++      ret = btrfs_update_inode(trans, root, inode);
++      BUG_ON(ret);
++      btrfs_end_transaction(trans, root);
++out:
+       /* once for us */
+       btrfs_put_ordered_extent(ordered_extent);
+       /* once for the tree */
+       btrfs_put_ordered_extent(ordered_extent);
+-      btrfs_end_transaction(trans, root);
+       return 0;
+ }
+@@ -3562,7 +3580,6 @@ static noinline void init_btrfs_i(struct
+       INIT_LIST_HEAD(&BTRFS_I(inode)->ordered_operations);
+       RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
+       btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree);
+-      mutex_init(&BTRFS_I(inode)->extent_mutex);
+       mutex_init(&BTRFS_I(inode)->log_mutex);
+ }
+--- a/fs/btrfs/ordered-data.c
++++ b/fs/btrfs/ordered-data.c
+@@ -291,16 +291,16 @@ int btrfs_put_ordered_extent(struct btrf
+ /*
+  * remove an ordered extent from the tree.  No references are dropped
+- * but, anyone waiting on this extent is woken up.
++ * and you must wake_up entry->wait.  You must hold the tree mutex
++ * while you call this function.
+  */
+-int btrfs_remove_ordered_extent(struct inode *inode,
++static int __btrfs_remove_ordered_extent(struct inode *inode,
+                               struct btrfs_ordered_extent *entry)
+ {
+       struct btrfs_ordered_inode_tree *tree;
+       struct rb_node *node;
+       tree = &BTRFS_I(inode)->ordered_tree;
+-      mutex_lock(&tree->mutex);
+       node = &entry->rb_node;
+       rb_erase(node, &tree->tree);
+       tree->last = NULL;
+@@ -326,9 +326,26 @@ int btrfs_remove_ordered_extent(struct i
+       }
+       spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
++      return 0;
++}
++
++/*
++ * remove an ordered extent from the tree.  No references are dropped
++ * but any waiters are woken.
++ */
++int btrfs_remove_ordered_extent(struct inode *inode,
++                              struct btrfs_ordered_extent *entry)
++{
++      struct btrfs_ordered_inode_tree *tree;
++      int ret;
++
++      tree = &BTRFS_I(inode)->ordered_tree;
++      mutex_lock(&tree->mutex);
++      ret = __btrfs_remove_ordered_extent(inode, entry);
+       mutex_unlock(&tree->mutex);
+       wake_up(&entry->wait);
+-      return 0;
++
++      return ret;
+ }
+ /*
+@@ -589,7 +606,7 @@ out:
+  * After an extent is done, call this to conditionally update the on disk
+  * i_size.  i_size is updated to cover any fully written part of the file.
+  */
+-int btrfs_ordered_update_i_size(struct inode *inode,
++int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
+                               struct btrfs_ordered_extent *ordered)
+ {
+       struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
+@@ -597,18 +614,30 @@ int btrfs_ordered_update_i_size(struct i
+       u64 disk_i_size;
+       u64 new_i_size;
+       u64 i_size_test;
++      u64 i_size = i_size_read(inode);
+       struct rb_node *node;
++      struct rb_node *prev = NULL;
+       struct btrfs_ordered_extent *test;
++      int ret = 1;
++
++      if (ordered)
++              offset = entry_end(ordered);
+       mutex_lock(&tree->mutex);
+       disk_i_size = BTRFS_I(inode)->disk_i_size;
++      /* truncate file */
++      if (disk_i_size > i_size) {
++              BTRFS_I(inode)->disk_i_size = i_size;
++              ret = 0;
++              goto out;
++      }
++
+       /*
+        * if the disk i_size is already at the inode->i_size, or
+        * this ordered extent is inside the disk i_size, we're done
+        */
+-      if (disk_i_size >= inode->i_size ||
+-          ordered->file_offset + ordered->len <= disk_i_size) {
++      if (disk_i_size == i_size || offset <= disk_i_size) {
+               goto out;
+       }
+@@ -616,8 +645,7 @@ int btrfs_ordered_update_i_size(struct i
+        * we can't update the disk_isize if there are delalloc bytes
+        * between disk_i_size and  this ordered extent
+        */
+-      if (test_range_bit(io_tree, disk_i_size,
+-                         ordered->file_offset + ordered->len - 1,
++      if (test_range_bit(io_tree, disk_i_size, offset - 1,
+                          EXTENT_DELALLOC, 0, NULL)) {
+               goto out;
+       }
+@@ -626,20 +654,32 @@ int btrfs_ordered_update_i_size(struct i
+        * if we find an ordered extent then we can't update disk i_size
+        * yet
+        */
+-      node = &ordered->rb_node;
+-      while (1) {
+-              node = rb_prev(node);
+-              if (!node)
+-                      break;
++      if (ordered) {
++              node = rb_prev(&ordered->rb_node);
++      } else {
++              prev = tree_search(tree, offset);
++              /*
++               * we insert file extents without involving ordered struct,
++               * so there should be no ordered struct covering this offset
++               */
++              if (prev) {
++                      test = rb_entry(prev, struct btrfs_ordered_extent,
++                                      rb_node);
++                      BUG_ON(offset_in_entry(test, offset));
++              }
++              node = prev;
++      }
++      while (node) {
+               test = rb_entry(node, struct btrfs_ordered_extent, rb_node);
+               if (test->file_offset + test->len <= disk_i_size)
+                       break;
+-              if (test->file_offset >= inode->i_size)
++              if (test->file_offset >= i_size)
+                       break;
+               if (test->file_offset >= disk_i_size)
+                       goto out;
++              node = rb_prev(node);
+       }
+-      new_i_size = min_t(u64, entry_end(ordered), i_size_read(inode));
++      new_i_size = min_t(u64, offset, i_size);
+       /*
+        * at this point, we know we can safely update i_size to at least
+@@ -647,7 +687,14 @@ int btrfs_ordered_update_i_size(struct i
+        * walk forward and see if ios from higher up in the file have
+        * finished.
+        */
+-      node = rb_next(&ordered->rb_node);
++      if (ordered) {
++              node = rb_next(&ordered->rb_node);
++      } else {
++              if (prev)
++                      node = rb_next(prev);
++              else
++                      node = rb_first(&tree->tree);
++      }
+       i_size_test = 0;
+       if (node) {
+               /*
+@@ -655,10 +702,10 @@ int btrfs_ordered_update_i_size(struct i
+                * between our ordered extent and the next one.
+                */
+               test = rb_entry(node, struct btrfs_ordered_extent, rb_node);
+-              if (test->file_offset > entry_end(ordered))
++              if (test->file_offset > offset)
+                       i_size_test = test->file_offset;
+       } else {
+-              i_size_test = i_size_read(inode);
++              i_size_test = i_size;
+       }
+       /*
+@@ -667,15 +714,25 @@ int btrfs_ordered_update_i_size(struct i
+        * are no delalloc bytes in this area, it is safe to update
+        * disk_i_size to the end of the region.
+        */
+-      if (i_size_test > entry_end(ordered) &&
+-          !test_range_bit(io_tree, entry_end(ordered), i_size_test - 1,
+-                         EXTENT_DELALLOC, 0, NULL)) {
+-              new_i_size = min_t(u64, i_size_test, i_size_read(inode));
++      if (i_size_test > offset &&
++          !test_range_bit(io_tree, offset, i_size_test - 1,
++                          EXTENT_DELALLOC, 0, NULL)) {
++              new_i_size = min_t(u64, i_size_test, i_size);
+       }
+       BTRFS_I(inode)->disk_i_size = new_i_size;
++      ret = 0;
+ out:
++      /*
++       * we need to remove the ordered extent with the tree lock held
++       * so that other people calling this function don't find our fully
++       * processed ordered entry and skip updating the i_size
++       */
++      if (ordered)
++              __btrfs_remove_ordered_extent(inode, ordered);
+       mutex_unlock(&tree->mutex);
+-      return 0;
++      if (ordered)
++              wake_up(&ordered->wait);
++      return ret;
+ }
+ /*
+--- a/fs/btrfs/ordered-data.h
++++ b/fs/btrfs/ordered-data.h
+@@ -150,7 +150,7 @@ void btrfs_start_ordered_extent(struct i
+ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len);
+ struct btrfs_ordered_extent *
+ btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset);
+-int btrfs_ordered_update_i_size(struct inode *inode,
++int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
+                               struct btrfs_ordered_extent *ordered);
+ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum);
+ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only);
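
With this change btrfs_ordered_update_i_size() has two calling
conventions, both visible in the hunks: when an ordered extent is
passed, the offset argument is ignored and entry_end(ordered) is used;
when a file extent was inserted directly, the caller passes the end
offset of the written range and a NULL ordered. In sketch form
(end_offset is an illustrative name):

    /* finishing ordered IO: the offset comes from the extent itself */
    ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent);

    /* file extent inserted without an ordered struct: pass its end */
    ret = btrfs_ordered_update_i_size(inode, end_offset, NULL);

The return value changes meaning as well: 0 says disk_i_size was
advanced (so the caller should call btrfs_update_inode()), nonzero says
it could not be moved yet.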
diff --git a/queue-2.6.32/btrfs-fix-memory-leaks-in-error-paths.patch b/queue-2.6.32/btrfs-fix-memory-leaks-in-error-paths.patch
new file mode 100644 (file)
index 0000000..0ba1012
--- /dev/null
@@ -0,0 +1,53 @@
+From 2423fdfb96e3f9ff3baeb6c4c78d74145547891d Mon Sep 17 00:00:00 2001
+From: Jiri Slaby <jslaby@suse.cz>
+Date: Wed, 6 Jan 2010 16:57:22 +0000
+Subject: Btrfs, fix memory leaks in error paths
+
+From: Jiri Slaby <jslaby@suse.cz>
+
+commit 2423fdfb96e3f9ff3baeb6c4c78d74145547891d upstream.
+
+Stanse found 2 memory leaks in relocate_block_group and
+__btrfs_map_block. cluster and multi are not freed/assigned on all
+paths. Fix that.
+
+Signed-off-by: Jiri Slaby <jslaby@suse.cz>
+Cc: linux-btrfs@vger.kernel.org
+Signed-off-by: Chris Mason <chris.mason@oracle.com>
+Acked-by: Jeff Mahoney <jeffm@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+
+---
+ fs/btrfs/relocation.c |    4 +++-
+ fs/btrfs/volumes.c    |    4 +++-
+ 2 files changed, 6 insertions(+), 2 deletions(-)
+
+--- a/fs/btrfs/relocation.c
++++ b/fs/btrfs/relocation.c
+@@ -3281,8 +3281,10 @@ static noinline_for_stack int relocate_b
+               return -ENOMEM;
+       path = btrfs_alloc_path();
+-      if (!path)
++      if (!path) {
++              kfree(cluster);
+               return -ENOMEM;
++      }
+       rc->extents_found = 0;
+       rc->extents_skipped = 0;
+--- a/fs/btrfs/volumes.c
++++ b/fs/btrfs/volumes.c
+@@ -2649,8 +2649,10 @@ again:
+       em = lookup_extent_mapping(em_tree, logical, *length);
+       read_unlock(&em_tree->lock);
+-      if (!em && unplug_page)
++      if (!em && unplug_page) {
++              kfree(multi);
+               return 0;
++      }
+       if (!em) {
+               printk(KERN_CRIT "unable to find logical %llu len %llu\n",
diff --git a/queue-2.6.32/btrfs-fix-missing-last-entry-in-readdir-3.patch b/queue-2.6.32/btrfs-fix-missing-last-entry-in-readdir-3.patch
new file mode 100644 (file)
index 0000000..49a506b
--- /dev/null
@@ -0,0 +1,46 @@
+From 406266ab9ac8ed8b085c58aacd9e3161480dc5d5 Mon Sep 17 00:00:00 2001
+From: Jan Engelhardt <jengelh@medozas.de>
+Date: Wed, 9 Dec 2009 22:00:38 +0000
+Subject: btrfs: fix missing last-entry in readdir(3)
+
+From: Jan Engelhardt <jengelh@medozas.de>
+
+commit 406266ab9ac8ed8b085c58aacd9e3161480dc5d5 upstream.
+
+
+When one does a 32-bit readdir(3), the last entry of a directory is
+missing. This is, however, not due to passing a large value to filldir,
+but it seems to have to do with glibc doing telldir or something
+quirky. In any case, this patch fixes it in practice.
+
+Signed-off-by: Jan Engelhardt <jengelh@medozas.de>
+Signed-off-by: Chris Mason <chris.mason@oracle.com>
+Acked-by: Jeff Mahoney <jeffm@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+
+---
+ fs/btrfs/inode.c |    6 +++++-
+ 1 file changed, 5 insertions(+), 1 deletion(-)
+
+--- a/fs/btrfs/inode.c
++++ b/fs/btrfs/inode.c
+@@ -3995,7 +3995,11 @@ skip:
+       /* Reached end of directory/root. Bump pos past the last item. */
+       if (key_type == BTRFS_DIR_INDEX_KEY)
+-              filp->f_pos = INT_LIMIT(off_t);
++              /*
++               * 32-bit glibc will use getdents64, but then strtol -
++               * so the last number we can serve is this.
++               */
++              filp->f_pos = 0x7fffffff;
+       else
+               filp->f_pos++;
+ nopos:
diff --git a/queue-2.6.32/btrfs-fix-oopsen-when-dropping-empty-tree.patch b/queue-2.6.32/btrfs-fix-oopsen-when-dropping-empty-tree.patch
new file mode 100644 (file)
index 0000000..dc184fc
--- /dev/null
@@ -0,0 +1,46 @@
+From 7a7965f83e89f0be506a96769938a721e4e5ae50 Mon Sep 17 00:00:00 2001
+From: Yan, Zheng <zheng.yan@oracle.com>
+Date: Mon, 1 Feb 2010 02:41:17 +0000
+Subject: Btrfs: Fix oopsen when dropping empty tree.
+
+From: Yan, Zheng <zheng.yan@oracle.com>
+
+commit 7a7965f83e89f0be506a96769938a721e4e5ae50 upstream.
+
+When dropping an empty tree, walk_down_tree() skips checking
+extent information for the tree root. This will trigger a
+BUG_ON in walk_up_proc().
+
+Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
+Signed-off-by: Chris Mason <chris.mason@oracle.com>
+Acked-by: Jeff Mahoney <jeffm@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ fs/btrfs/extent-tree.c |    8 ++++----
+ 1 file changed, 4 insertions(+), 4 deletions(-)
+
+--- a/fs/btrfs/extent-tree.c
++++ b/fs/btrfs/extent-tree.c
+@@ -5402,10 +5402,6 @@ static noinline int walk_down_tree(struc
+       int ret;
+       while (level >= 0) {
+-              if (path->slots[level] >=
+-                  btrfs_header_nritems(path->nodes[level]))
+-                      break;
+-
+               ret = walk_down_proc(trans, root, path, wc, lookup_info);
+               if (ret > 0)
+                       break;
+@@ -5413,6 +5409,10 @@ static noinline int walk_down_tree(struc
+               if (level == 0)
+                       break;
++              if (path->slots[level] >=
++                  btrfs_header_nritems(path->nodes[level]))
++                      break;
++
+               ret = do_walk_down(trans, root, path, wc, &lookup_info);
+               if (ret > 0) {
+                       path->slots[level]++;
diff --git a/queue-2.6.32/btrfs-fix-per-root-used-space-accounting.patch b/queue-2.6.32/btrfs-fix-per-root-used-space-accounting.patch
new file mode 100644 (file)
index 0000000..da62b76
--- /dev/null
@@ -0,0 +1,208 @@
+From 86b9f2eca5e0984145e3c7698a7cd6dd65c2a93f Mon Sep 17 00:00:00 2001
+From: Yan, Zheng <zheng.yan@oracle.com>
+Date: Thu, 12 Nov 2009 09:36:50 +0000
+Subject: Btrfs: Fix per root used space accounting
+
+From: Yan, Zheng <zheng.yan@oracle.com>
+
+commit 86b9f2eca5e0984145e3c7698a7cd6dd65c2a93f upstream.
+
+The bytes_used field in the root item was originally planned to
+track the amount of used data and tree blocks. But it never
+worked right, since we can't track freeing of data accurately.
+This patch changes it to track only the amount of tree blocks.
+
+Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
+Signed-off-by: Chris Mason <chris.mason@oracle.com>
+Acked-by: Jeff Mahoney <jeffm@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ fs/btrfs/ctree.c       |   31 ++++++++++++++-----------------
+ fs/btrfs/ctree.h       |    4 ++++
+ fs/btrfs/extent-tree.c |   31 +++++++++++++++++++++++--------
+ fs/btrfs/ioctl.c       |    2 +-
+ fs/btrfs/transaction.c |    6 +++++-
+ 5 files changed, 47 insertions(+), 27 deletions(-)
+
+--- a/fs/btrfs/ctree.c
++++ b/fs/btrfs/ctree.c
+@@ -456,9 +456,8 @@ static noinline int __btrfs_cow_block(st
+               extent_buffer_get(cow);
+               spin_unlock(&root->node_lock);
+-              btrfs_free_extent(trans, root, buf->start, buf->len,
+-                                parent_start, root->root_key.objectid,
+-                                level, 0);
++              btrfs_free_tree_block(trans, root, buf->start, buf->len,
++                              parent_start, root->root_key.objectid, level);
+               free_extent_buffer(buf);
+               add_root_to_dirty_list(root);
+       } else {
+@@ -473,9 +472,8 @@ static noinline int __btrfs_cow_block(st
+               btrfs_set_node_ptr_generation(parent, parent_slot,
+                                             trans->transid);
+               btrfs_mark_buffer_dirty(parent);
+-              btrfs_free_extent(trans, root, buf->start, buf->len,
+-                                parent_start, root->root_key.objectid,
+-                                level, 0);
++              btrfs_free_tree_block(trans, root, buf->start, buf->len,
++                              parent_start, root->root_key.objectid, level);
+       }
+       if (unlock_orig)
+               btrfs_tree_unlock(buf);
+@@ -1035,8 +1033,8 @@ static noinline int balance_level(struct
+               btrfs_tree_unlock(mid);
+               /* once for the path */
+               free_extent_buffer(mid);
+-              ret = btrfs_free_extent(trans, root, mid->start, mid->len,
+-                                      0, root->root_key.objectid, level, 1);
++              ret = btrfs_free_tree_block(trans, root, mid->start, mid->len,
++                                          0, root->root_key.objectid, level);
+               /* once for the root ptr */
+               free_extent_buffer(mid);
+               return ret;
+@@ -1100,10 +1098,10 @@ static noinline int balance_level(struct
+                                      1);
+                       if (wret)
+                               ret = wret;
+-                      wret = btrfs_free_extent(trans, root, bytenr,
+-                                               blocksize, 0,
+-                                               root->root_key.objectid,
+-                                               level, 0);
++                      wret = btrfs_free_tree_block(trans, root,
++                                                   bytenr, blocksize, 0,
++                                                   root->root_key.objectid,
++                                                   level);
+                       if (wret)
+                               ret = wret;
+               } else {
+@@ -1148,9 +1146,8 @@ static noinline int balance_level(struct
+               wret = del_ptr(trans, root, path, level + 1, pslot);
+               if (wret)
+                       ret = wret;
+-              wret = btrfs_free_extent(trans, root, bytenr, blocksize,
+-                                       0, root->root_key.objectid,
+-                                       level, 0);
++              wret = btrfs_free_tree_block(trans, root, bytenr, blocksize,
++                                       0, root->root_key.objectid, level);
+               if (wret)
+                       ret = wret;
+       } else {
+@@ -3794,8 +3791,8 @@ static noinline int btrfs_del_leaf(struc
+        */
+       btrfs_unlock_up_safe(path, 0);
+-      ret = btrfs_free_extent(trans, root, leaf->start, leaf->len,
+-                              0, root->root_key.objectid, 0, 0);
++      ret = btrfs_free_tree_block(trans, root, leaf->start, leaf->len,
++                                  0, root->root_key.objectid, 0);
+       return ret;
+ }
+ /*
+--- a/fs/btrfs/ctree.h
++++ b/fs/btrfs/ctree.h
+@@ -1982,6 +1982,10 @@ struct extent_buffer *btrfs_alloc_free_b
+                                       u64 parent, u64 root_objectid,
+                                       struct btrfs_disk_key *key, int level,
+                                       u64 hint, u64 empty_size);
++int btrfs_free_tree_block(struct btrfs_trans_handle *trans,
++                        struct btrfs_root *root,
++                        u64 bytenr, u32 blocksize,
++                        u64 parent, u64 root_objectid, int level);
+ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
+                                           struct btrfs_root *root,
+                                           u64 bytenr, u32 blocksize,
+--- a/fs/btrfs/extent-tree.c
++++ b/fs/btrfs/extent-tree.c
+@@ -3454,14 +3454,6 @@ static int update_block_group(struct btr
+       else
+               old_val -= num_bytes;
+       btrfs_set_super_bytes_used(&info->super_copy, old_val);
+-
+-      /* block accounting for root item */
+-      old_val = btrfs_root_used(&root->root_item);
+-      if (alloc)
+-              old_val += num_bytes;
+-      else
+-              old_val -= num_bytes;
+-      btrfs_set_root_used(&root->root_item, old_val);
+       spin_unlock(&info->delalloc_lock);
+       while (total) {
+@@ -4049,6 +4041,21 @@ int btrfs_free_extent(struct btrfs_trans
+       return ret;
+ }
++int btrfs_free_tree_block(struct btrfs_trans_handle *trans,
++                        struct btrfs_root *root,
++                        u64 bytenr, u32 blocksize,
++                        u64 parent, u64 root_objectid, int level)
++{
++      u64 used;
++      spin_lock(&root->node_lock);
++      used = btrfs_root_used(&root->root_item) - blocksize;
++      btrfs_set_root_used(&root->root_item, used);
++      spin_unlock(&root->node_lock);
++
++      return btrfs_free_extent(trans, root, bytenr, blocksize,
++                               parent, root_objectid, level, 0);
++}
++
+ static u64 stripe_align(struct btrfs_root *root, u64 val)
+ {
+       u64 mask = ((u64)root->stripesize - 1);
+@@ -4897,6 +4904,14 @@ static int alloc_tree_block(struct btrfs
+                                       extent_op);
+               BUG_ON(ret);
+       }
++
++      if (root_objectid == root->root_key.objectid) {
++              u64 used;
++              spin_lock(&root->node_lock);
++              used = btrfs_root_used(&root->root_item) + num_bytes;
++              btrfs_set_root_used(&root->root_item, used);
++              spin_unlock(&root->node_lock);
++      }
+       return ret;
+ }
+--- a/fs/btrfs/ioctl.c
++++ b/fs/btrfs/ioctl.c
+@@ -289,7 +289,7 @@ static noinline int create_subvol(struct
+       btrfs_set_root_generation(&root_item, trans->transid);
+       btrfs_set_root_level(&root_item, 0);
+       btrfs_set_root_refs(&root_item, 1);
+-      btrfs_set_root_used(&root_item, 0);
++      btrfs_set_root_used(&root_item, leaf->len);
+       btrfs_set_root_last_snapshot(&root_item, 0);
+       memset(&root_item.drop_progress, 0, sizeof(root_item.drop_progress));
+--- a/fs/btrfs/transaction.c
++++ b/fs/btrfs/transaction.c
+@@ -501,13 +501,16 @@ static int update_cowonly_root(struct bt
+ {
+       int ret;
+       u64 old_root_bytenr;
++      u64 old_root_used;
+       struct btrfs_root *tree_root = root->fs_info->tree_root;
++      old_root_used = btrfs_root_used(&root->root_item);
+       btrfs_write_dirty_block_groups(trans, root);
+       while (1) {
+               old_root_bytenr = btrfs_root_bytenr(&root->root_item);
+-              if (old_root_bytenr == root->node->start)
++              if (old_root_bytenr == root->node->start &&
++                  old_root_used == btrfs_root_used(&root->root_item))
+                       break;
+               btrfs_set_root_node(&root->root_item, root->node);
+@@ -516,6 +519,7 @@ static int update_cowonly_root(struct bt
+                                       &root->root_item);
+               BUG_ON(ret);
++              old_root_used = btrfs_root_used(&root->root_item);
+               ret = btrfs_write_dirty_block_groups(trans, root);
+               BUG_ON(ret);
+       }
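
A note on the accounting model the patch above adopts: bytes_used becomes a
per-root counter of tree blocks only, adjusted under root->node_lock exactly
where blocks are allocated and freed.  A minimal userspace sketch of that
pattern (illustrative names, not the btrfs API):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t node_lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned long long root_used;	/* bytes of tree blocks in use */

/* called wherever a tree block is allocated for this root */
static void account_alloc(unsigned int blocksize)
{
	pthread_mutex_lock(&node_lock);
	root_used += blocksize;
	pthread_mutex_unlock(&node_lock);
}

/* called wherever a tree block of this root is freed */
static void account_free(unsigned int blocksize)
{
	pthread_mutex_lock(&node_lock);
	root_used -= blocksize;
	pthread_mutex_unlock(&node_lock);
}

int main(void)
{
	account_alloc(4096);	/* COW writes a new block... */
	account_free(4096);	/* ...and frees the old copy */
	printf("root_used: %llu\n", root_used);
	return 0;
}
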
diff --git a/queue-2.6.32/btrfs-fix-possible-panic-on-unmount.patch b/queue-2.6.32/btrfs-fix-possible-panic-on-unmount.patch
new file mode 100644 (file)
index 0000000..0fbc3d9
--- /dev/null
@@ -0,0 +1,123 @@
+From 11dfe35a0108097f2df1f042c485fa7f758c2cdf Mon Sep 17 00:00:00 2001
+From: Josef Bacik <josef@redhat.com>
+Date: Fri, 13 Nov 2009 20:12:59 +0000
+Subject: Btrfs: fix possible panic on unmount
+
+From: Josef Bacik <josef@redhat.com>
+
+commit 11dfe35a0108097f2df1f042c485fa7f758c2cdf upstream.
+
+We can race with the unmount of an fs and the stopping of a kthread, freeing
+the block group before we're done using it.  This happens because we do not
+hold a reference on the block group while it is caching, since the allocator
+drops its reference once it exits or moves on to the next block group.  This
+patch fixes the problem by taking a reference to the block group before we
+start caching and dropping it when we're done, making sure all accesses to
+the block group are safe.  Thanks,
+
+Signed-off-by: Josef Bacik <josef@redhat.com>
+Signed-off-by: Chris Mason <chris.mason@oracle.com>
+Acked-by: Jeff Mahoney <jeffm@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ fs/btrfs/extent-tree.c |   32 +++++++++++++++++++-------------
+ 1 file changed, 19 insertions(+), 13 deletions(-)
+
+--- a/fs/btrfs/extent-tree.c
++++ b/fs/btrfs/extent-tree.c
+@@ -83,6 +83,17 @@ static int block_group_bits(struct btrfs
+       return (cache->flags & bits) == bits;
+ }
++void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
++{
++      atomic_inc(&cache->count);
++}
++
++void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
++{
++      if (atomic_dec_and_test(&cache->count))
++              kfree(cache);
++}
++
+ /*
+  * this adds the block group to the fs_info rb tree for the block group
+  * cache
+@@ -156,7 +167,7 @@ block_group_cache_tree_search(struct btr
+               }
+       }
+       if (ret)
+-              atomic_inc(&ret->count);
++              btrfs_get_block_group(ret);
+       spin_unlock(&info->block_group_cache_lock);
+       return ret;
+@@ -407,6 +418,8 @@ err:
+       put_caching_control(caching_ctl);
+       atomic_dec(&block_group->space_info->caching_threads);
++      btrfs_put_block_group(block_group);
++
+       return 0;
+ }
+@@ -447,6 +460,7 @@ static int cache_block_group(struct btrf
+       up_write(&fs_info->extent_commit_sem);
+       atomic_inc(&cache->space_info->caching_threads);
++      btrfs_get_block_group(cache);
+       tsk = kthread_run(caching_kthread, cache, "btrfs-cache-%llu\n",
+                         cache->key.objectid);
+@@ -486,12 +500,6 @@ struct btrfs_block_group_cache *btrfs_lo
+       return cache;
+ }
+-void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
+-{
+-      if (atomic_dec_and_test(&cache->count))
+-              kfree(cache);
+-}
+-
+ static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
+                                                 u64 flags)
+ {
+@@ -2582,7 +2590,7 @@ next_block_group(struct btrfs_root *root
+       if (node) {
+               cache = rb_entry(node, struct btrfs_block_group_cache,
+                                cache_node);
+-              atomic_inc(&cache->count);
++              btrfs_get_block_group(cache);
+       } else
+               cache = NULL;
+       spin_unlock(&root->fs_info->block_group_cache_lock);
+@@ -4227,7 +4235,7 @@ search:
+               u64 offset;
+               int cached;
+-              atomic_inc(&block_group->count);
++              btrfs_get_block_group(block_group);
+               search_start = block_group->key.objectid;
+ have_block_group:
+@@ -4315,7 +4323,7 @@ have_block_group:
+                               btrfs_put_block_group(block_group);
+                               block_group = last_ptr->block_group;
+-                              atomic_inc(&block_group->count);
++                              btrfs_get_block_group(block_group);
+                               spin_unlock(&last_ptr->lock);
+                               spin_unlock(&last_ptr->refill_lock);
+@@ -7395,9 +7403,7 @@ int btrfs_free_block_groups(struct btrfs
+                       wait_block_group_cache_done(block_group);
+               btrfs_remove_free_space_cache(block_group);
+-
+-              WARN_ON(atomic_read(&block_group->count) != 1);
+-              kfree(block_group);
++              btrfs_put_block_group(block_group);
+               spin_lock(&info->block_group_cache_lock);
+       }
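
The btrfs_get_block_group()/btrfs_put_block_group() helpers above are the
standard get/put refcounting idiom: every user that keeps a pointer holds a
reference, and the last put frees the object, so the caching kthread can no
longer outlive the block group.  A small userspace sketch of the idiom (C11
atomics standing in for the kernel's atomic_t; illustrative, not the btrfs
API):

#include <stdatomic.h>
#include <stdlib.h>

struct block_group {
	atomic_int count;
	/* ... payload ... */
};

static void bg_get(struct block_group *bg)
{
	atomic_fetch_add(&bg->count, 1);
}

static void bg_put(struct block_group *bg)
{
	/* fetch_sub returns the old value; 1 means this was the
	 * last reference and the object can be freed */
	if (atomic_fetch_sub(&bg->count, 1) == 1)
		free(bg);
}

int main(void)
{
	struct block_group *bg = calloc(1, sizeof(*bg));

	atomic_init(&bg->count, 1);	/* creator's reference */
	bg_get(bg);			/* e.g. a caching thread starts */
	bg_put(bg);			/* caching thread finishes */
	bg_put(bg);			/* creator done: frees bg */
	return 0;
}
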
diff --git a/queue-2.6.32/btrfs-fix-race-between-allocate-and-release-extent-buffer.patch b/queue-2.6.32/btrfs-fix-race-between-allocate-and-release-extent-buffer.patch
new file mode 100644 (file)
index 0000000..da43bdf
--- /dev/null
@@ -0,0 +1,36 @@
+From f044ba7835b84e69c68b620ca8fa27e5ef67759d Mon Sep 17 00:00:00 2001
+From: Yan, Zheng <zheng.yan@oracle.com>
+Date: Thu, 4 Feb 2010 08:46:56 +0000
+Subject: Btrfs: fix race between allocate and release extent buffer.
+
+From: Yan, Zheng <zheng.yan@oracle.com>
+
+commit f044ba7835b84e69c68b620ca8fa27e5ef67759d upstream.
+
+Increase the extent buffer's reference count while still holding the
+lock. Otherwise the lookup can race with try_release_extent_buffer.
+
+Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
+Signed-off-by: Chris Mason <chris.mason@oracle.com>
+Acked-by: Jeff Mahoney <jeffm@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+
+---
+ fs/btrfs/extent_io.c |    3 +--
+ 1 file changed, 1 insertion(+), 2 deletions(-)
+
+--- a/fs/btrfs/extent_io.c
++++ b/fs/btrfs/extent_io.c
+@@ -3165,10 +3165,9 @@ struct extent_buffer *alloc_extent_buffe
+               spin_unlock(&tree->buffer_lock);
+               goto free_eb;
+       }
+-      spin_unlock(&tree->buffer_lock);
+-
+       /* add one reference for the tree */
+       atomic_inc(&eb->refs);
++      spin_unlock(&tree->buffer_lock);
+       return eb;
+ free_eb:
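
The two-line move above closes a classic use-after-free window: if the
reference is taken only after the lock is dropped, try_release_extent_buffer()
can free the buffer in between.  A toy illustration of the safe ordering,
with a pthread mutex standing in for the tree's buffer_lock (hypothetical
names, not the extent_io API):

#include <pthread.h>

struct eb {
	int refs;
	/* ... */
};

static pthread_mutex_t tree_lock = PTHREAD_MUTEX_INITIALIZER;
static struct eb *buffer;	/* protected by tree_lock */

/* A releaser must also take tree_lock before freeing 'buffer', so
 * bumping refs while still holding the lock guarantees the buffer
 * cannot vanish between lookup and reference. */
static struct eb *lookup_buffer(void)
{
	struct eb *e;

	pthread_mutex_lock(&tree_lock);
	e = buffer;
	if (e)
		e->refs++;	/* before the fix this ran after unlock */
	pthread_mutex_unlock(&tree_lock);
	return e;
}

int main(void)
{
	struct eb b = { .refs = 1 };

	buffer = &b;
	lookup_buffer();
	return b.refs == 2 ? 0 : 1;
}
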
diff --git a/queue-2.6.32/btrfs-fix-race-in-btrfs_mark_extent_written.patch b/queue-2.6.32/btrfs-fix-race-in-btrfs_mark_extent_written.patch
new file mode 100644 (file)
index 0000000..9705b0b
--- /dev/null
@@ -0,0 +1,195 @@
+From 6c7d54ac87f338c479d9729e8392eca3f76e11e1 Mon Sep 17 00:00:00 2001
+From: Yan, Zheng <zheng.yan@oracle.com>
+Date: Fri, 15 Jan 2010 08:43:09 +0000
+Subject: Btrfs: Fix race in btrfs_mark_extent_written
+
+From: Yan, Zheng <zheng.yan@oracle.com>
+
+commit 6c7d54ac87f338c479d9729e8392eca3f76e11e1 upstream.
+
+Fix a bug reported by Johannes Hirte. The cause of the bug is
+that btrfs_del_items is called after btrfs_duplicate_item, and
+btrfs_del_items can trigger a tree rebalance. The fix is to check
+for that case and call btrfs_search_slot again when needed.
+
+Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
+Signed-off-by: Chris Mason <chris.mason@oracle.com>
+Acked-by: Jeff Mahoney <jeffm@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ fs/btrfs/file.c |  100 ++++++++++++++++++++++++++++++++++++++++++++------------
+ 1 file changed, 80 insertions(+), 20 deletions(-)
+
+--- a/fs/btrfs/file.c
++++ b/fs/btrfs/file.c
+@@ -506,7 +506,8 @@ next_slot:
+ }
+ static int extent_mergeable(struct extent_buffer *leaf, int slot,
+-                          u64 objectid, u64 bytenr, u64 *start, u64 *end)
++                          u64 objectid, u64 bytenr, u64 orig_offset,
++                          u64 *start, u64 *end)
+ {
+       struct btrfs_file_extent_item *fi;
+       struct btrfs_key key;
+@@ -522,6 +523,7 @@ static int extent_mergeable(struct exten
+       fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
+       if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG ||
+           btrfs_file_extent_disk_bytenr(leaf, fi) != bytenr ||
++          btrfs_file_extent_offset(leaf, fi) != key.offset - orig_offset ||
+           btrfs_file_extent_compression(leaf, fi) ||
+           btrfs_file_extent_encryption(leaf, fi) ||
+           btrfs_file_extent_other_encoding(leaf, fi))
+@@ -561,6 +563,7 @@ int btrfs_mark_extent_written(struct btr
+       u64 split;
+       int del_nr = 0;
+       int del_slot = 0;
++      int recow;
+       int ret;
+       btrfs_drop_extent_cache(inode, start, end - 1, 0);
+@@ -568,6 +571,7 @@ int btrfs_mark_extent_written(struct btr
+       path = btrfs_alloc_path();
+       BUG_ON(!path);
+ again:
++      recow = 0;
+       split = start;
+       key.objectid = inode->i_ino;
+       key.type = BTRFS_EXTENT_DATA_KEY;
+@@ -591,12 +595,60 @@ again:
+       bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
+       num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
+       orig_offset = key.offset - btrfs_file_extent_offset(leaf, fi);
++      memcpy(&new_key, &key, sizeof(new_key));
++
++      if (start == key.offset && end < extent_end) {
++              other_start = 0;
++              other_end = start;
++              if (extent_mergeable(leaf, path->slots[0] - 1,
++                                   inode->i_ino, bytenr, orig_offset,
++                                   &other_start, &other_end)) {
++                      new_key.offset = end;
++                      btrfs_set_item_key_safe(trans, root, path, &new_key);
++                      fi = btrfs_item_ptr(leaf, path->slots[0],
++                                          struct btrfs_file_extent_item);
++                      btrfs_set_file_extent_num_bytes(leaf, fi,
++                                                      extent_end - end);
++                      btrfs_set_file_extent_offset(leaf, fi,
++                                                   end - orig_offset);
++                      fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
++                                          struct btrfs_file_extent_item);
++                      btrfs_set_file_extent_num_bytes(leaf, fi,
++                                                      end - other_start);
++                      btrfs_mark_buffer_dirty(leaf);
++                      goto out;
++              }
++      }
++
++      if (start > key.offset && end == extent_end) {
++              other_start = end;
++              other_end = 0;
++              if (extent_mergeable(leaf, path->slots[0] + 1,
++                                   inode->i_ino, bytenr, orig_offset,
++                                   &other_start, &other_end)) {
++                      fi = btrfs_item_ptr(leaf, path->slots[0],
++                                          struct btrfs_file_extent_item);
++                      btrfs_set_file_extent_num_bytes(leaf, fi,
++                                                      start - key.offset);
++                      path->slots[0]++;
++                      new_key.offset = start;
++                      btrfs_set_item_key_safe(trans, root, path, &new_key);
++
++                      fi = btrfs_item_ptr(leaf, path->slots[0],
++                                          struct btrfs_file_extent_item);
++                      btrfs_set_file_extent_num_bytes(leaf, fi,
++                                                      other_end - start);
++                      btrfs_set_file_extent_offset(leaf, fi,
++                                                   start - orig_offset);
++                      btrfs_mark_buffer_dirty(leaf);
++                      goto out;
++              }
++      }
+       while (start > key.offset || end < extent_end) {
+               if (key.offset == start)
+                       split = end;
+-              memcpy(&new_key, &key, sizeof(new_key));
+               new_key.offset = split;
+               ret = btrfs_duplicate_item(trans, root, path, &new_key);
+               if (ret == -EAGAIN) {
+@@ -631,15 +683,18 @@ again:
+                       path->slots[0]--;
+                       extent_end = end;
+               }
++              recow = 1;
+       }
+-      fi = btrfs_item_ptr(leaf, path->slots[0],
+-                          struct btrfs_file_extent_item);
+-
+       other_start = end;
+       other_end = 0;
+-      if (extent_mergeable(leaf, path->slots[0] + 1, inode->i_ino,
+-                           bytenr, &other_start, &other_end)) {
++      if (extent_mergeable(leaf, path->slots[0] + 1,
++                           inode->i_ino, bytenr, orig_offset,
++                           &other_start, &other_end)) {
++              if (recow) {
++                      btrfs_release_path(root, path);
++                      goto again;
++              }
+               extent_end = other_end;
+               del_slot = path->slots[0] + 1;
+               del_nr++;
+@@ -650,8 +705,13 @@ again:
+       }
+       other_start = 0;
+       other_end = start;
+-      if (extent_mergeable(leaf, path->slots[0] - 1, inode->i_ino,
+-                           bytenr, &other_start, &other_end)) {
++      if (extent_mergeable(leaf, path->slots[0] - 1,
++                           inode->i_ino, bytenr, orig_offset,
++                           &other_start, &other_end)) {
++              if (recow) {
++                      btrfs_release_path(root, path);
++                      goto again;
++              }
+               key.offset = other_start;
+               del_slot = path->slots[0];
+               del_nr++;
+@@ -660,22 +720,22 @@ again:
+                                       inode->i_ino, orig_offset);
+               BUG_ON(ret);
+       }
++      fi = btrfs_item_ptr(leaf, path->slots[0],
++                         struct btrfs_file_extent_item);
+       if (del_nr == 0) {
+               btrfs_set_file_extent_type(leaf, fi,
+                                          BTRFS_FILE_EXTENT_REG);
+               btrfs_mark_buffer_dirty(leaf);
+-              goto out;
+-      }
+-
+-      fi = btrfs_item_ptr(leaf, del_slot - 1,
+-                          struct btrfs_file_extent_item);
+-      btrfs_set_file_extent_type(leaf, fi, BTRFS_FILE_EXTENT_REG);
+-      btrfs_set_file_extent_num_bytes(leaf, fi,
+-                                      extent_end - key.offset);
+-      btrfs_mark_buffer_dirty(leaf);
++      } else {
++              btrfs_set_file_extent_type(leaf, fi,
++                                         BTRFS_FILE_EXTENT_REG);
++              btrfs_set_file_extent_num_bytes(leaf, fi,
++                                              extent_end - key.offset);
++              btrfs_mark_buffer_dirty(leaf);
+-      ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
+-      BUG_ON(ret);
++              ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
++              BUG_ON(ret);
++      }
+ out:
+       btrfs_free_path(path);
+       return 0;
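
The recow flag introduced above captures a general btree pattern: once
btrfs_duplicate_item() or btrfs_del_items() may have rebalanced the tree, the
saved path is stale, so the code releases it and searches again from the top
rather than touching possibly-moved slots.  Its control flow, reduced to a
standalone runnable sketch:

#include <stdio.h>

int main(void)
{
	int recow, passes = 0;

again:
	recow = 0;
	passes++;

	/* ... split or duplicate items; any step that can rebalance
	 * the tree sets recow ... */
	if (passes == 1)
		recow = 1;	/* simulate one rebalance */

	if (recow) {
		/* release the path: the cached slots are stale */
		goto again;
	}

	printf("merged after %d pass(es)\n", passes);
	return 0;
}
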
diff --git a/queue-2.6.32/btrfs-fix-regression-in-orphan-cleanup.patch b/queue-2.6.32/btrfs-fix-regression-in-orphan-cleanup.patch
new file mode 100644 (file)
index 0000000..a7fef37
--- /dev/null
@@ -0,0 +1,95 @@
+From 6c090a11e1c403b727a6a8eff0b97d5fb9e95cb5 Mon Sep 17 00:00:00 2001
+From: Josef Bacik <josef@redhat.com>
+Date: Fri, 15 Jan 2010 20:08:22 +0000
+Subject: Btrfs: fix regression in orphan cleanup
+
+From: Josef Bacik <josef@redhat.com>
+
+commit 6c090a11e1c403b727a6a8eff0b97d5fb9e95cb5 upstream.
+
+Currently orphan cleanup only ever gets triggered if we cross subvolumes during
+a lookup, which means that if we just mount a plain jane fs that has orphans in
+it, they will never get cleaned up.  This results in panics like this one
+
+http://www.kerneloops.org/oops.php?number=1109085
+
+where adding an orphan entry returns -EEXIST and we panic.  To fix this, we
+check on lookup whether our root has already had its orphan cleanup done, and
+if not, go ahead and do it.  This is easily reproducible by running this
+testcase
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <string.h>
+#include <unistd.h>
+#include <stdio.h>
+
+int main(int argc, char **argv)
+{
+       char data[4096];
+       char newdata[4096];
+       int fd1, fd2;
+
+       memset(data, 'a', 4096);
+       memset(newdata, 'b', 4096);
+
+       while (1) {
+               int i;
+
+               fd1 = creat("file1", 0666);
+               if (fd1 < 0)
+                       break;
+
+               for (i = 0; i < 512; i++)
+                       write(fd1, data, 4096);
+
+               fsync(fd1);
+               close(fd1);
+
+               fd2 = creat("file2", 0666);
+               if (fd2 < 0)
+                       break;
+
+               ftruncate(fd2, 4096 * 512);
+
+               for (i = 0; i < 512; i++)
+                       write(fd2, newdata, 4096);
+               close(fd2);
+
+               i = rename("file2", "file1");
+               unlink("file1");
+       }
+
+       return 0;
+}
+
+and then pulling the power on the box, then running that test again when the
+box comes back up.  I've tested this locally and it fixes the problem.
+Thanks to Tomas Carnecky for helping me track this down initially.
+
+Signed-off-by: Josef Bacik <josef@redhat.com>
+Signed-off-by: Chris Mason <chris.mason@oracle.com>
+Acked-by: Jeff Mahoney <jeffm@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+
+---
+ fs/btrfs/inode.c |    6 ++++++
+ 1 file changed, 6 insertions(+)
+
+--- a/fs/btrfs/inode.c
++++ b/fs/btrfs/inode.c
+@@ -3796,6 +3796,12 @@ struct inode *btrfs_lookup_dentry(struct
+       if (location.type == BTRFS_INODE_ITEM_KEY) {
+               inode = btrfs_iget(dir->i_sb, &location, root);
++              if (unlikely(root->clean_orphans) &&
++                  !(inode->i_sb->s_flags & MS_RDONLY)) {
++                      down_read(&root->fs_info->cleanup_work_sem);
++                      btrfs_orphan_cleanup(root);
++                      up_read(&root->fs_info->cleanup_work_sem);
++              }
+               return inode;
+       }
diff --git a/queue-2.6.32/btrfs-kfree-correct-pointer-during-mount-option-parsing.patch b/queue-2.6.32/btrfs-kfree-correct-pointer-during-mount-option-parsing.patch
new file mode 100644 (file)
index 0000000..f02572b
--- /dev/null
@@ -0,0 +1,50 @@
+From da495ecc0fb096b383754952a1c152147bc95b52 Mon Sep 17 00:00:00 2001
+From: Josef Bacik <josef@redhat.com>
+Date: Thu, 25 Feb 2010 20:38:35 +0000
+Subject: Btrfs: kfree correct pointer during mount option parsing
+
+From: Josef Bacik <josef@redhat.com>
+
+commit da495ecc0fb096b383754952a1c152147bc95b52 upstream.
+
+We kstrdup() the options string, but strsep() then advances that pointer
+as it parses, so the kfree() at the end is not passed the pointer we
+originally allocated.
+
+Tested-by: Andy Lutomirski <luto@mit.edu>
+
+Signed-off-by: Chris Mason <chris.mason@oracle.com>
+Acked-by: Jeff Mahoney <jeffm@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ fs/btrfs/super.c |    5 +++--
+ 1 file changed, 3 insertions(+), 2 deletions(-)
+
+--- a/fs/btrfs/super.c
++++ b/fs/btrfs/super.c
+@@ -126,7 +126,7 @@ int btrfs_parse_options(struct btrfs_roo
+ {
+       struct btrfs_fs_info *info = root->fs_info;
+       substring_t args[MAX_OPT_ARGS];
+-      char *p, *num;
++      char *p, *num, *orig;
+       int intarg;
+       int ret = 0;
+@@ -141,6 +141,7 @@ int btrfs_parse_options(struct btrfs_roo
+       if (!options)
+               return -ENOMEM;
++      orig = options;
+       while ((p = strsep(&options, ",")) != NULL) {
+               int token;
+@@ -273,7 +274,7 @@ int btrfs_parse_options(struct btrfs_roo
+               }
+       }
+ out:
+-      kfree(options);
++      kfree(orig);
+       return ret;
+ }
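
The pitfall is easy to reproduce outside the kernel: strsep(3) advances the
pointer it is handed, so after parsing (or on an early exit mid-loop) that
pointer is NULL or points into the middle of the allocation, and only the
saved original may be freed.  A standalone sketch with a made-up option
string:

#define _DEFAULT_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
	char *options = strdup("noatime,compress,ssd");
	char *orig = options;	/* the only pointer free() may see */
	char *p;

	while ((p = strsep(&options, ",")) != NULL)
		printf("token: %s\n", p);

	/* here 'options' is NULL; on a break out of the loop it
	 * would point inside the allocation -- freeing it instead
	 * of 'orig' is wrong either way */
	free(orig);
	return 0;
}
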
diff --git a/queue-2.6.32/btrfs-make-error-return-negative-in-btrfs_sync_file.patch b/queue-2.6.32/btrfs-make-error-return-negative-in-btrfs_sync_file.patch
new file mode 100644 (file)
index 0000000..5b7f0c0
--- /dev/null
@@ -0,0 +1,31 @@
+From 014e4ac4f7d9c981750491fa40ea35efadc9ed49 Mon Sep 17 00:00:00 2001
+From: Roel Kluin <roel.kluin@gmail.com>
+Date: Fri, 29 Jan 2010 10:42:11 +0000
+Subject: Btrfs: make error return negative in btrfs_sync_file()
+
+From: Roel Kluin <roel.kluin@gmail.com>
+
+commit 014e4ac4f7d9c981750491fa40ea35efadc9ed49 upstream.
+
+The error return should be negative (-EIO rather than EIO), per the
+kernel convention that failures are reported as negative errno values.
+
+Signed-off-by: Roel Kluin <roel.kluin@gmail.com>
+Signed-off-by: Chris Mason <chris.mason@oracle.com>
+Acked-by: Jeff Mahoney <jeffm@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ fs/btrfs/file.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/fs/btrfs/file.c
++++ b/fs/btrfs/file.c
+@@ -1133,7 +1133,7 @@ int btrfs_sync_file(struct file *file, s
+       }
+       mutex_lock(&dentry->d_inode->i_mutex);
+ out:
+-      return ret > 0 ? EIO : ret;
++      return ret > 0 ? -EIO : ret;
+ }
+ static const struct vm_operations_struct btrfs_file_vm_ops = {
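
The sign matters because of the kernel-wide convention that functions report
failure as a negative errno value; a positive EIO would slip past every
"if (ret < 0)" check in the callers.  A toy userspace illustration:

#include <errno.h>
#include <stdio.h>

/* stand-in for an in-kernel sync routine */
static int do_sync(int io_failed)
{
	if (io_failed)
		return -EIO;	/* a positive EIO would look like
				 * success to the caller below */
	return 0;
}

int main(void)
{
	int ret = do_sync(1);

	if (ret < 0)
		fprintf(stderr, "sync failed: %d\n", ret);
	return 0;
}
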
diff --git a/queue-2.6.32/btrfs-make-fallocate-2-more-enospc-friendly.patch b/queue-2.6.32/btrfs-make-fallocate-2-more-enospc-friendly.patch
new file mode 100644 (file)
index 0000000..ad8c47c
--- /dev/null
@@ -0,0 +1,170 @@
+From 5a303d5d4b8055d2e5a03e92d04745bfc5881a22 Mon Sep 17 00:00:00 2001
+From: Yan, Zheng <zheng.yan@oracle.com>
+Date: Thu, 12 Nov 2009 09:34:52 +0000
+Subject: Btrfs: Make fallocate(2) more ENOSPC friendly
+
+From: Yan, Zheng <zheng.yan@oracle.com>
+
+commit 5a303d5d4b8055d2e5a03e92d04745bfc5881a22 upstream.
+
+fallocate(2) may allocate a large number of file extents, so it's not
+good to do it all in a single transaction. This patch makes fallocate(2)
+start a new transaction for each file extent it allocates.
+
+Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
+Signed-off-by: Chris Mason <chris.mason@oracle.com>
+Acked-by: Jeff Mahoney <jeffm@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+
+---
+ fs/btrfs/inode.c |   65 +++++++++++++++++++++++++++----------------------------
+ 1 file changed, 32 insertions(+), 33 deletions(-)
+
+--- a/fs/btrfs/inode.c
++++ b/fs/btrfs/inode.c
+@@ -5664,10 +5664,10 @@ out_fail:
+       return err;
+ }
+-static int prealloc_file_range(struct btrfs_trans_handle *trans,
+-                             struct inode *inode, u64 start, u64 end,
++static int prealloc_file_range(struct inode *inode, u64 start, u64 end,
+                              u64 alloc_hint, int mode)
+ {
++      struct btrfs_trans_handle *trans;
+       struct btrfs_root *root = BTRFS_I(inode)->root;
+       struct btrfs_key ins;
+       u64 alloc_size;
+@@ -5678,17 +5678,23 @@ static int prealloc_file_range(struct bt
+       while (num_bytes > 0) {
+               alloc_size = min(num_bytes, root->fs_info->max_extent);
+-              ret = btrfs_reserve_metadata_space(root, 1);
+-              if (ret)
+-                      goto out;
+-
+               ret = btrfs_reserve_extent(trans, root, alloc_size,
+                                          root->sectorsize, 0, alloc_hint,
+                                          (u64)-1, &ins, 1);
+               if (ret) {
+                       WARN_ON(1);
+-                      goto out;
++                      break;
+               }
++
++              ret = btrfs_reserve_metadata_space(root, 3);
++              if (ret) {
++                      btrfs_free_reserved_extent(root, ins.objectid,
++                                                 ins.offset);
++                      break;
++              }
++
++              trans = btrfs_start_transaction(root, 1);
++
+               ret = insert_reserved_file_extent(trans, inode,
+                                                 cur_offset, ins.objectid,
+                                                 ins.offset, ins.offset,
+@@ -5697,22 +5703,25 @@ static int prealloc_file_range(struct bt
+               BUG_ON(ret);
+               btrfs_drop_extent_cache(inode, cur_offset,
+                                       cur_offset + ins.offset -1, 0);
++
+               num_bytes -= ins.offset;
+               cur_offset += ins.offset;
+               alloc_hint = ins.objectid + ins.offset;
+-              btrfs_unreserve_metadata_space(root, 1);
+-      }
+-out:
+-      if (cur_offset > start) {
++
+               inode->i_ctime = CURRENT_TIME;
+               BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC;
+               if (!(mode & FALLOC_FL_KEEP_SIZE) &&
+-                  cur_offset > i_size_read(inode))
+-                      btrfs_i_size_write(inode, cur_offset);
++                  cur_offset > inode->i_size) {
++                      i_size_write(inode, cur_offset);
++                      btrfs_ordered_update_i_size(inode, cur_offset, NULL);
++              }
++
+               ret = btrfs_update_inode(trans, root, inode);
+               BUG_ON(ret);
+-      }
++              btrfs_end_transaction(trans, root);
++              btrfs_unreserve_metadata_space(root, 3);
++      }
+       return ret;
+ }
+@@ -5727,8 +5736,6 @@ static long btrfs_fallocate(struct inode
+       u64 locked_end;
+       u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
+       struct extent_map *em;
+-      struct btrfs_trans_handle *trans;
+-      struct btrfs_root *root;
+       int ret;
+       alloc_start = offset & ~mask;
+@@ -5747,9 +5754,7 @@ static long btrfs_fallocate(struct inode
+                       goto out;
+       }
+-      root = BTRFS_I(inode)->root;
+-
+-      ret = btrfs_check_data_free_space(root, inode,
++      ret = btrfs_check_data_free_space(BTRFS_I(inode)->root, inode,
+                                         alloc_end - alloc_start);
+       if (ret)
+               goto out;
+@@ -5758,12 +5763,6 @@ static long btrfs_fallocate(struct inode
+       while (1) {
+               struct btrfs_ordered_extent *ordered;
+-              trans = btrfs_start_transaction(BTRFS_I(inode)->root, 1);
+-              if (!trans) {
+-                      ret = -EIO;
+-                      goto out_free;
+-              }
+-
+               /* the extent lock is ordered inside the running
+                * transaction
+                */
+@@ -5777,8 +5776,6 @@ static long btrfs_fallocate(struct inode
+                       btrfs_put_ordered_extent(ordered);
+                       unlock_extent(&BTRFS_I(inode)->io_tree,
+                                     alloc_start, locked_end, GFP_NOFS);
+-                      btrfs_end_transaction(trans, BTRFS_I(inode)->root);
+-
+                       /*
+                        * we can't wait on the range with the transaction
+                        * running or with the extent lock held
+@@ -5799,9 +5796,12 @@ static long btrfs_fallocate(struct inode
+               BUG_ON(IS_ERR(em) || !em);
+               last_byte = min(extent_map_end(em), alloc_end);
+               last_byte = (last_byte + mask) & ~mask;
+-              if (em->block_start == EXTENT_MAP_HOLE) {
+-                      ret = prealloc_file_range(trans, inode, cur_offset,
+-                                              last_byte, alloc_hint, mode);
++              if (em->block_start == EXTENT_MAP_HOLE ||
++                  (cur_offset >= inode->i_size &&
++                   !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
++                      ret = prealloc_file_range(inode,
++                                                cur_offset, last_byte,
++                                                alloc_hint, mode);
+                       if (ret < 0) {
+                               free_extent_map(em);
+                               break;
+@@ -5820,9 +5820,8 @@ static long btrfs_fallocate(struct inode
+       unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
+                     GFP_NOFS);
+-      btrfs_end_transaction(trans, BTRFS_I(inode)->root);
+-out_free:
+-      btrfs_free_reserved_data_space(root, inode, alloc_end - alloc_start);
++      btrfs_free_reserved_data_space(BTRFS_I(inode)->root, inode,
++                                     alloc_end - alloc_start);
+ out:
+       mutex_unlock(&inode->i_mutex);
+       return ret;
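
The restructuring above replaces one long-lived transaction spanning the
whole fallocate() range with a short transaction per allocated extent, so no
single transaction grows without bound.  The shape of the loop, reduced to a
runnable toy (begin_txn/commit_txn are stand-ins, not btrfs calls):

#include <stdio.h>

static void begin_txn(void)  { printf("begin transaction\n"); }
static void commit_txn(void) { printf("commit transaction\n"); }

int main(void)
{
	unsigned long long num_bytes = 10ULL * 1024 * 1024;
	const unsigned long long max_extent = 4ULL * 1024 * 1024;

	while (num_bytes > 0) {
		unsigned long long alloc =
			num_bytes < max_extent ? num_bytes : max_extent;

		/* one short transaction per extent keeps the work any
		 * single transaction pins down small */
		begin_txn();
		printf("preallocate %llu bytes\n", alloc);
		commit_txn();

		num_bytes -= alloc;
	}
	return 0;
}
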
diff --git a/queue-2.6.32/btrfs-make-metadata-chunks-smaller.patch b/queue-2.6.32/btrfs-make-metadata-chunks-smaller.patch
new file mode 100644 (file)
index 0000000..5bc3768
--- /dev/null
@@ -0,0 +1,67 @@
+From 83d3c9696fed237a3d96fce18299e2fcf112109f Mon Sep 17 00:00:00 2001
+From: Josef Bacik <josef@redhat.com>
+Date: Mon, 7 Dec 2009 21:45:59 +0000
+Subject: Btrfs: make metadata chunks smaller
+
+From: Josef Bacik <josef@redhat.com>
+
+commit 83d3c9696fed237a3d96fce18299e2fcf112109f upstream.
+
+This patch makes us a bit less zealous about making sure we have enough free
+metadata space by paring down the size of new metadata chunks to 256MB instead
+of 1GB.  Also, we used to try to allocate metadata chunks when allocating data,
+but that sort of thing is done elsewhere now, so we can just remove it.  With my
+-ENOSPC test I used to have 3GB reserved for metadata out of 75GB; now I have
+1.7GB.  Thanks,
+
+Signed-off-by: Josef Bacik <josef@redhat.com>
+Signed-off-by: Chris Mason <chris.mason@oracle.com>
+Acked-by: Jeff Mahoney <jeffm@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+
+---
+ fs/btrfs/extent-tree.c |   11 +----------
+ fs/btrfs/volumes.c     |    2 +-
+ 2 files changed, 2 insertions(+), 11 deletions(-)
+
+--- a/fs/btrfs/extent-tree.c
++++ b/fs/btrfs/extent-tree.c
+@@ -4593,7 +4593,6 @@ int btrfs_reserve_extent(struct btrfs_tr
+ {
+       int ret;
+       u64 search_start = 0;
+-      struct btrfs_fs_info *info = root->fs_info;
+       data = btrfs_get_alloc_profile(root, data);
+ again:
+@@ -4601,17 +4600,9 @@ again:
+        * the only place that sets empty_size is btrfs_realloc_node, which
+        * is not called recursively on allocations
+        */
+-      if (empty_size || root->ref_cows) {
+-              if (!(data & BTRFS_BLOCK_GROUP_METADATA)) {
+-                      ret = do_chunk_alloc(trans, root->fs_info->extent_root,
+-                                   2 * 1024 * 1024,
+-                                   BTRFS_BLOCK_GROUP_METADATA |
+-                                   (info->metadata_alloc_profile &
+-                                    info->avail_metadata_alloc_bits), 0);
+-              }
++      if (empty_size || root->ref_cows)
+               ret = do_chunk_alloc(trans, root->fs_info->extent_root,
+                                    num_bytes + 2 * 1024 * 1024, data, 0);
+-      }
+       WARN_ON(num_bytes < root->sectorsize);
+       ret = find_free_extent(trans, root, num_bytes, empty_size,
+--- a/fs/btrfs/volumes.c
++++ b/fs/btrfs/volumes.c
+@@ -2209,7 +2209,7 @@ static int __btrfs_alloc_chunk(struct bt
+               max_chunk_size = 10 * calc_size;
+               min_stripe_size = 64 * 1024 * 1024;
+       } else if (type & BTRFS_BLOCK_GROUP_METADATA) {
+-              max_chunk_size = 4 * calc_size;
++              max_chunk_size = 256 * 1024 * 1024;
+               min_stripe_size = 32 * 1024 * 1024;
+       } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
+               calc_size = 8 * 1024 * 1024;
diff --git a/queue-2.6.32/btrfs-make-sure-fallocate-properly-starts-a-transaction.patch b/queue-2.6.32/btrfs-make-sure-fallocate-properly-starts-a-transaction.patch
new file mode 100644 (file)
index 0000000..130476d
--- /dev/null
@@ -0,0 +1,64 @@
+From 3a1abec9f6880cf406593c392636199ea1c6c917 Mon Sep 17 00:00:00 2001
+From: Chris Mason <chris.mason@oracle.com>
+Date: Thu, 17 Dec 2009 15:47:17 -0500
+Subject: Btrfs: make sure fallocate properly starts a transaction
+
+From: Chris Mason <chris.mason@oracle.com>
+
+commit 3a1abec9f6880cf406593c392636199ea1c6c917 upstream.
+
+The recent patch to make fallocate ENOSPC friendly would send
+down a NULL trans handle to the allocator.  This patch moves the
+transaction start so the allocator always gets a valid handle.
+
+Signed-off-by: Chris Mason <chris.mason@oracle.com>
+Acked-by: Jeff Mahoney <jeffm@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+
+---
+ fs/btrfs/inode.c |   13 +++++++++----
+ 1 file changed, 9 insertions(+), 4 deletions(-)
+
+--- a/fs/btrfs/inode.c
++++ b/fs/btrfs/inode.c
+@@ -5802,23 +5802,23 @@ static int prealloc_file_range(struct in
+       while (num_bytes > 0) {
+               alloc_size = min(num_bytes, root->fs_info->max_extent);
++              trans = btrfs_start_transaction(root, 1);
++
+               ret = btrfs_reserve_extent(trans, root, alloc_size,
+                                          root->sectorsize, 0, alloc_hint,
+                                          (u64)-1, &ins, 1);
+               if (ret) {
+                       WARN_ON(1);
+-                      break;
++                      goto stop_trans;
+               }
+               ret = btrfs_reserve_metadata_space(root, 3);
+               if (ret) {
+                       btrfs_free_reserved_extent(root, ins.objectid,
+                                                  ins.offset);
+-                      break;
++                      goto stop_trans;
+               }
+-              trans = btrfs_start_transaction(root, 1);
+-
+               ret = insert_reserved_file_extent(trans, inode,
+                                                 cur_offset, ins.objectid,
+                                                 ins.offset, ins.offset,
+@@ -5847,6 +5847,11 @@ static int prealloc_file_range(struct in
+               btrfs_unreserve_metadata_space(root, 3);
+       }
+       return ret;
++
++stop_trans:
++      btrfs_end_transaction(trans, root);
++      return ret;
++
+ }
+ static long btrfs_fallocate(struct inode *inode, int mode,
diff --git a/queue-2.6.32/btrfs-make-truncate-2-more-enospc-friendly.patch b/queue-2.6.32/btrfs-make-truncate-2-more-enospc-friendly.patch
new file mode 100644 (file)
index 0000000..4fb7e9e
--- /dev/null
@@ -0,0 +1,568 @@
+From 8082510e7124cc50d728f1b875639cb4e22312cc Mon Sep 17 00:00:00 2001
+From: Yan, Zheng <zheng.yan@oracle.com>
+Date: Thu, 12 Nov 2009 09:35:36 +0000
+Subject: Btrfs: Make truncate(2) more ENOSPC friendly
+
+From: Yan, Zheng <zheng.yan@oracle.com>
+
+commit 8082510e7124cc50d728f1b875639cb4e22312cc upstream.
+
+Truncating and deleting regular files are unbounded operations,
+so it's not good to do them in a single transaction. This
+patch makes btrfs_truncate and btrfs_delete_inode start a
+new transaction after all the items in a tree leaf are deleted.
+
+Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
+Signed-off-by: Chris Mason <chris.mason@oracle.com>
+Acked-by: Jeff Mahoney <jeffm@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+
+---
+ fs/btrfs/inode.c      |  316 ++++++++++++++++++++++++++++++--------------------
+ fs/btrfs/relocation.c |   33 +++--
+ 2 files changed, 212 insertions(+), 137 deletions(-)
+
+--- a/fs/btrfs/inode.c
++++ b/fs/btrfs/inode.c
+@@ -2848,37 +2848,40 @@ out:
+  * min_type is the minimum key type to truncate down to.  If set to 0, this
+  * will kill all the items on this inode, including the INODE_ITEM_KEY.
+  */
+-noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
+-                                      struct btrfs_root *root,
+-                                      struct inode *inode,
+-                                      u64 new_size, u32 min_type)
++int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
++                             struct btrfs_root *root,
++                             struct inode *inode,
++                             u64 new_size, u32 min_type)
+ {
+-      int ret;
+       struct btrfs_path *path;
+-      struct btrfs_key key;
+-      struct btrfs_key found_key;
+-      u32 found_type = (u8)-1;
+       struct extent_buffer *leaf;
+       struct btrfs_file_extent_item *fi;
++      struct btrfs_key key;
++      struct btrfs_key found_key;
+       u64 extent_start = 0;
+       u64 extent_num_bytes = 0;
+       u64 extent_offset = 0;
+       u64 item_end = 0;
++      u64 mask = root->sectorsize - 1;
++      u32 found_type = (u8)-1;
+       int found_extent;
+       int del_item;
+       int pending_del_nr = 0;
+       int pending_del_slot = 0;
+       int extent_type = -1;
+       int encoding;
+-      u64 mask = root->sectorsize - 1;
++      int ret;
++      int err = 0;
++
++      BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY);
+       if (root->ref_cows)
+               btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0);
++
+       path = btrfs_alloc_path();
+       BUG_ON(!path);
+       path->reada = -1;
+-      /* FIXME, add redo link to tree so we don't leak on crash */
+       key.objectid = inode->i_ino;
+       key.offset = (u64)-1;
+       key.type = (u8)-1;
+@@ -2886,17 +2889,17 @@ noinline int btrfs_truncate_inode_items(
+ search_again:
+       path->leave_spinning = 1;
+       ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+-      if (ret < 0)
+-              goto error;
++      if (ret < 0) {
++              err = ret;
++              goto out;
++      }
+       if (ret > 0) {
+               /* there are no items in the tree for us to truncate, we're
+                * done
+                */
+-              if (path->slots[0] == 0) {
+-                      ret = 0;
+-                      goto error;
+-              }
++              if (path->slots[0] == 0)
++                      goto out;
+               path->slots[0]--;
+       }
+@@ -2931,28 +2934,17 @@ search_again:
+                       }
+                       item_end--;
+               }
+-              if (item_end < new_size) {
+-                      if (found_type == BTRFS_DIR_ITEM_KEY)
+-                              found_type = BTRFS_INODE_ITEM_KEY;
+-                      else if (found_type == BTRFS_EXTENT_ITEM_KEY)
+-                              found_type = BTRFS_EXTENT_DATA_KEY;
+-                      else if (found_type == BTRFS_EXTENT_DATA_KEY)
+-                              found_type = BTRFS_XATTR_ITEM_KEY;
+-                      else if (found_type == BTRFS_XATTR_ITEM_KEY)
+-                              found_type = BTRFS_INODE_REF_KEY;
+-                      else if (found_type)
+-                              found_type--;
+-                      else
++              if (found_type > min_type) {
++                      del_item = 1;
++              } else {
++                      if (item_end < new_size)
+                               break;
+-                      btrfs_set_key_type(&key, found_type);
+-                      goto next;
++                      if (found_key.offset >= new_size)
++                              del_item = 1;
++                      else
++                              del_item = 0;
+               }
+-              if (found_key.offset >= new_size)
+-                      del_item = 1;
+-              else
+-                      del_item = 0;
+               found_extent = 0;
+-
+               /* FIXME, shrink the extent if the ref count is only 1 */
+               if (found_type != BTRFS_EXTENT_DATA_KEY)
+                       goto delete;
+@@ -3039,42 +3031,36 @@ delete:
+                                               inode->i_ino, extent_offset);
+                       BUG_ON(ret);
+               }
+-next:
+-              if (path->slots[0] == 0) {
+-                      if (pending_del_nr)
+-                              goto del_pending;
+-                      btrfs_release_path(root, path);
+-                      if (found_type == BTRFS_INODE_ITEM_KEY)
+-                              break;
+-                      goto search_again;
+-              }
+-              path->slots[0]--;
+-              if (pending_del_nr &&
+-                  path->slots[0] + 1 != pending_del_slot) {
+-                      struct btrfs_key debug;
+-del_pending:
+-                      btrfs_item_key_to_cpu(path->nodes[0], &debug,
+-                                            pending_del_slot);
+-                      ret = btrfs_del_items(trans, root, path,
+-                                            pending_del_slot,
+-                                            pending_del_nr);
+-                      BUG_ON(ret);
+-                      pending_del_nr = 0;
++              if (found_type == BTRFS_INODE_ITEM_KEY)
++                      break;
++
++              if (path->slots[0] == 0 ||
++                  path->slots[0] != pending_del_slot) {
++                      if (root->ref_cows) {
++                              err = -EAGAIN;
++                              goto out;
++                      }
++                      if (pending_del_nr) {
++                              ret = btrfs_del_items(trans, root, path,
++                                              pending_del_slot,
++                                              pending_del_nr);
++                              BUG_ON(ret);
++                              pending_del_nr = 0;
++                      }
+                       btrfs_release_path(root, path);
+-                      if (found_type == BTRFS_INODE_ITEM_KEY)
+-                              break;
+                       goto search_again;
++              } else {
++                      path->slots[0]--;
+               }
+       }
+-      ret = 0;
+-error:
++out:
+       if (pending_del_nr) {
+               ret = btrfs_del_items(trans, root, path, pending_del_slot,
+                                     pending_del_nr);
+       }
+       btrfs_free_path(path);
+-      return ret;
++      return err;
+ }
+ /*
+@@ -3194,10 +3180,6 @@ int btrfs_cont_expand(struct inode *inod
+       if (size <= hole_start)
+               return 0;
+-      err = btrfs_truncate_page(inode->i_mapping, inode->i_size);
+-      if (err)
+-              return err;
+-
+       while (1) {
+               struct btrfs_ordered_extent *ordered;
+               btrfs_wait_ordered_range(inode, hole_start,
+@@ -3210,9 +3192,6 @@ int btrfs_cont_expand(struct inode *inod
+               btrfs_put_ordered_extent(ordered);
+       }
+-      trans = btrfs_start_transaction(root, 1);
+-      btrfs_set_trans_block_group(trans, inode);
+-
+       cur_offset = hole_start;
+       while (1) {
+               em = btrfs_get_extent(inode, NULL, 0, cur_offset,
+@@ -3220,38 +3199,120 @@ int btrfs_cont_expand(struct inode *inod
+               BUG_ON(IS_ERR(em) || !em);
+               last_byte = min(extent_map_end(em), block_end);
+               last_byte = (last_byte + mask) & ~mask;
+-              if (test_bit(EXTENT_FLAG_VACANCY, &em->flags)) {
++              if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
+                       u64 hint_byte = 0;
+                       hole_size = last_byte - cur_offset;
+-                      err = btrfs_drop_extents(trans, inode, cur_offset,
+-                                               cur_offset + hole_size,
+-                                               &hint_byte, 1);
+-                      if (err)
+-                              break;
+-                      err = btrfs_reserve_metadata_space(root, 1);
++                      err = btrfs_reserve_metadata_space(root, 2);
+                       if (err)
+                               break;
++                      trans = btrfs_start_transaction(root, 1);
++                      btrfs_set_trans_block_group(trans, inode);
++
++                      err = btrfs_drop_extents(trans, inode, cur_offset,
++                                               cur_offset + hole_size,
++                                               &hint_byte, 1);
++                      BUG_ON(err);
++
+                       err = btrfs_insert_file_extent(trans, root,
+                                       inode->i_ino, cur_offset, 0,
+                                       0, hole_size, 0, hole_size,
+                                       0, 0, 0);
++                      BUG_ON(err);
++
+                       btrfs_drop_extent_cache(inode, hole_start,
+                                       last_byte - 1, 0);
+-                      btrfs_unreserve_metadata_space(root, 1);
++
++                      btrfs_end_transaction(trans, root);
++                      btrfs_unreserve_metadata_space(root, 2);
+               }
+               free_extent_map(em);
+               cur_offset = last_byte;
+-              if (err || cur_offset >= block_end)
++              if (cur_offset >= block_end)
+                       break;
+       }
+-      btrfs_end_transaction(trans, root);
+       unlock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS);
+       return err;
+ }
++static int btrfs_setattr_size(struct inode *inode, struct iattr *attr)
++{
++      struct btrfs_root *root = BTRFS_I(inode)->root;
++      struct btrfs_trans_handle *trans;
++      unsigned long nr;
++      int ret;
++
++      if (attr->ia_size == inode->i_size)
++              return 0;
++
++      if (attr->ia_size > inode->i_size) {
++              unsigned long limit;
++              limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
++              if (attr->ia_size > inode->i_sb->s_maxbytes)
++                      return -EFBIG;
++              if (limit != RLIM_INFINITY && attr->ia_size > limit) {
++                      send_sig(SIGXFSZ, current, 0);
++                      return -EFBIG;
++              }
++      }
++
++      ret = btrfs_reserve_metadata_space(root, 1);
++      if (ret)
++              return ret;
++
++      trans = btrfs_start_transaction(root, 1);
++      btrfs_set_trans_block_group(trans, inode);
++
++      ret = btrfs_orphan_add(trans, inode);
++      BUG_ON(ret);
++
++      nr = trans->blocks_used;
++      btrfs_end_transaction(trans, root);
++      btrfs_unreserve_metadata_space(root, 1);
++      btrfs_btree_balance_dirty(root, nr);
++
++      if (attr->ia_size > inode->i_size) {
++              ret = btrfs_cont_expand(inode, attr->ia_size);
++              if (ret) {
++                      btrfs_truncate(inode);
++                      return ret;
++              }
++
++              i_size_write(inode, attr->ia_size);
++              btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
++
++              trans = btrfs_start_transaction(root, 1);
++              btrfs_set_trans_block_group(trans, inode);
++
++              ret = btrfs_update_inode(trans, root, inode);
++              BUG_ON(ret);
++              if (inode->i_nlink > 0) {
++                      ret = btrfs_orphan_del(trans, inode);
++                      BUG_ON(ret);
++              }
++              nr = trans->blocks_used;
++              btrfs_end_transaction(trans, root);
++              btrfs_btree_balance_dirty(root, nr);
++              return 0;
++      }
++
++      /*
++       * We're truncating a file that used to have good data down to
++       * zero. Make sure it gets into the ordered flush list so that
++       * any new writes get down to disk quickly.
++       */
++      if (attr->ia_size == 0)
++              BTRFS_I(inode)->ordered_data_close = 1;
++
++      /* we don't support swapfiles, so vmtruncate shouldn't fail */
++      ret = vmtruncate(inode, attr->ia_size);
++      BUG_ON(ret);
++
++      return 0;
++}
++
+ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
+ {
+       struct inode *inode = dentry->d_inode;
+@@ -3262,23 +3323,14 @@ static int btrfs_setattr(struct dentry *
+               return err;
+       if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
+-              if (attr->ia_size > inode->i_size) {
+-                      err = btrfs_cont_expand(inode, attr->ia_size);
+-                      if (err)
+-                              return err;
+-              } else if (inode->i_size > 0 &&
+-                         attr->ia_size == 0) {
+-
+-                      /* we're truncating a file that used to have good
+-                       * data down to zero.  Make sure it gets into
+-                       * the ordered flush list so that any new writes
+-                       * get down to disk quickly.
+-                       */
+-                      BTRFS_I(inode)->ordered_data_close = 1;
+-              }
++              err = btrfs_setattr_size(inode, attr);
++              if (err)
++                      return err;
+       }
++      attr->ia_valid &= ~ATTR_SIZE;
+-      err = inode_setattr(inode, attr);
++      if (attr->ia_valid)
++              err = inode_setattr(inode, attr);
+       if (!err && ((attr->ia_valid & ATTR_MODE)))
+               err = btrfs_acl_chmod(inode);
+@@ -3310,30 +3362,32 @@ void btrfs_delete_inode(struct inode *in
+       }
+       btrfs_i_size_write(inode, 0);
+-      trans = btrfs_join_transaction(root, 1);
+-      btrfs_set_trans_block_group(trans, inode);
+-      ret = btrfs_truncate_inode_items(trans, root, inode, inode->i_size, 0);
+-      if (ret) {
+-              btrfs_orphan_del(NULL, inode);
+-              goto no_delete_lock;
+-      }
++      while (1) {
++              trans = btrfs_start_transaction(root, 1);
++              btrfs_set_trans_block_group(trans, inode);
++              ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0);
+-      btrfs_orphan_del(trans, inode);
++              if (ret != -EAGAIN)
++                      break;
+-      nr = trans->blocks_used;
+-      clear_inode(inode);
++              nr = trans->blocks_used;
++              btrfs_end_transaction(trans, root);
++              trans = NULL;
++              btrfs_btree_balance_dirty(root, nr);
++      }
+-      btrfs_end_transaction(trans, root);
+-      btrfs_btree_balance_dirty(root, nr);
+-      return;
++      if (ret == 0) {
++              ret = btrfs_orphan_del(trans, inode);
++              BUG_ON(ret);
++      }
+-no_delete_lock:
+       nr = trans->blocks_used;
+       btrfs_end_transaction(trans, root);
+       btrfs_btree_balance_dirty(root, nr);
+ no_delete:
+       clear_inode(inode);
++      return;
+ }
+ /*
+@@ -5097,17 +5151,20 @@ static void btrfs_truncate(struct inode
+       unsigned long nr;
+       u64 mask = root->sectorsize - 1;
+-      if (!S_ISREG(inode->i_mode))
+-              return;
+-      if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
++      if (!S_ISREG(inode->i_mode)) {
++              WARN_ON(1);
+               return;
++      }
+       ret = btrfs_truncate_page(inode->i_mapping, inode->i_size);
+       if (ret)
+               return;
++
+       btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
++      btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
+       trans = btrfs_start_transaction(root, 1);
++      btrfs_set_trans_block_group(trans, inode);
+       /*
+        * setattr is responsible for setting the ordered_data_close flag,
+@@ -5129,21 +5186,32 @@ static void btrfs_truncate(struct inode
+       if (inode->i_size == 0 && BTRFS_I(inode)->ordered_data_close)
+               btrfs_add_ordered_operation(trans, root, inode);
+-      btrfs_set_trans_block_group(trans, inode);
+-      btrfs_i_size_write(inode, inode->i_size);
++      while (1) {
++              ret = btrfs_truncate_inode_items(trans, root, inode,
++                                               inode->i_size,
++                                               BTRFS_EXTENT_DATA_KEY);
++              if (ret != -EAGAIN)
++                      break;
+-      ret = btrfs_orphan_add(trans, inode);
+-      if (ret)
+-              goto out;
+-      /* FIXME, add redo link to tree so we don't leak on crash */
+-      ret = btrfs_truncate_inode_items(trans, root, inode, inode->i_size,
+-                                    BTRFS_EXTENT_DATA_KEY);
+-      btrfs_update_inode(trans, root, inode);
++              ret = btrfs_update_inode(trans, root, inode);
++              BUG_ON(ret);
++
++              nr = trans->blocks_used;
++              btrfs_end_transaction(trans, root);
++              btrfs_btree_balance_dirty(root, nr);
++
++              trans = btrfs_start_transaction(root, 1);
++              btrfs_set_trans_block_group(trans, inode);
++      }
+-      ret = btrfs_orphan_del(trans, inode);
++      if (ret == 0 && inode->i_nlink > 0) {
++              ret = btrfs_orphan_del(trans, inode);
++              BUG_ON(ret);
++      }
++
++      ret = btrfs_update_inode(trans, root, inode);
+       BUG_ON(ret);
+-out:
+       nr = trans->blocks_used;
+       ret = btrfs_end_transaction_throttle(trans, root);
+       BUG_ON(ret);
+@@ -5240,9 +5308,9 @@ void btrfs_destroy_inode(struct inode *i
+       spin_lock(&root->list_lock);
+       if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
+-              printk(KERN_ERR "BTRFS: inode %lu: inode still on the orphan"
+-                     " list\n", inode->i_ino);
+-              dump_stack();
++              printk(KERN_INFO "BTRFS: inode %lu still on the orphan list\n",
++                     inode->i_ino);
++              list_del_init(&BTRFS_I(inode)->i_orphan);
+       }
+       spin_unlock(&root->list_lock);
+--- a/fs/btrfs/relocation.c
++++ b/fs/btrfs/relocation.c
+@@ -1561,6 +1561,20 @@ static int invalidate_extent_cache(struc
+       return 0;
+ }
++static void put_inodes(struct list_head *list)
++{
++      struct inodevec *ivec;
++      while (!list_empty(list)) {
++              ivec = list_entry(list->next, struct inodevec, list);
++              list_del(&ivec->list);
++              while (ivec->nr > 0) {
++                      ivec->nr--;
++                      iput(ivec->inode[ivec->nr]);
++              }
++              kfree(ivec);
++      }
++}
++
+ static int find_next_key(struct btrfs_path *path, int level,
+                        struct btrfs_key *key)
+@@ -1723,6 +1737,11 @@ static noinline_for_stack int merge_relo
+               btrfs_btree_balance_dirty(root, nr);
++              /*
++               * put inodes outside transaction, otherwise we may deadlock.
++               */
++              put_inodes(&inode_list);
++
+               if (replaced && rc->stage == UPDATE_DATA_PTRS)
+                       invalidate_extent_cache(root, &key, &next_key);
+       }
+@@ -1752,19 +1771,7 @@ out:
+       btrfs_btree_balance_dirty(root, nr);
+-      /*
+-       * put inodes while we aren't holding the tree locks
+-       */
+-      while (!list_empty(&inode_list)) {
+-              struct inodevec *ivec;
+-              ivec = list_entry(inode_list.next, struct inodevec, list);
+-              list_del(&ivec->list);
+-              while (ivec->nr > 0) {
+-                      ivec->nr--;
+-                      iput(ivec->inode[ivec->nr]);
+-              }
+-              kfree(ivec);
+-      }
++      put_inodes(&inode_list);
+       if (replaced && rc->stage == UPDATE_DATA_PTRS)
+               invalidate_extent_cache(root, &key, &next_key);
diff --git a/queue-2.6.32/btrfs-pass-transaction-handle-to-security-and-acl-initialization-functions.patch b/queue-2.6.32/btrfs-pass-transaction-handle-to-security-and-acl-initialization-functions.patch
new file mode 100644 (file)
index 0000000..6b7389e
--- /dev/null
@@ -0,0 +1,419 @@
+From f34f57a3ab4e73304d78c125682f1a53cd3975f2 Mon Sep 17 00:00:00 2001
+From: Yan, Zheng <zheng.yan@oracle.com>
+Date: Thu, 12 Nov 2009 09:35:27 +0000
+Subject: Btrfs: Pass transaction handle to security and ACL initialization functions
+
+From: Yan, Zheng <zheng.yan@oracle.com>
+
+commit f34f57a3ab4e73304d78c125682f1a53cd3975f2 upstream.
+
+Pass the transaction handle down to the security and ACL
+initialization functions so we can avoid starting nested
+transactions.
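+
+A minimal sketch of the resulting calling convention (illustrative
+fragment with simplified arguments, not a verbatim hunk):
+
+    /* caller already holds a transaction: reuse the handle */
+    err = btrfs_init_acl(trans, inode, dir);
+
+    /* no transaction in hand: pass NULL and __btrfs_setxattr
+     * starts and ends its own transaction internally */
+    err = __btrfs_setxattr(NULL, inode, name, value, size, flags);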
+
+Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
+Signed-off-by: Chris Mason <chris.mason@oracle.com>
+Acked-by: Jeff Mahoney <jeffm@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+
+---
+ fs/btrfs/acl.c      |   23 ++++++++------
+ fs/btrfs/ctree.h    |   13 +++++---
+ fs/btrfs/dir-item.c |   19 ++++--------
+ fs/btrfs/inode.c    |   15 +++++----
+ fs/btrfs/xattr.c    |   80 +++++++++++++++++++++++++++++++++++-----------------
+ fs/btrfs/xattr.h    |    9 +++--
+ 6 files changed, 96 insertions(+), 63 deletions(-)
+
+--- a/fs/btrfs/acl.c
++++ b/fs/btrfs/acl.c
+@@ -94,7 +94,8 @@ static int btrfs_xattr_get_acl(struct in
+ /*
+  * Needs to be called with fs_mutex held
+  */
+-static int btrfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
++static int btrfs_set_acl(struct btrfs_trans_handle *trans,
++                       struct inode *inode, struct posix_acl *acl, int type)
+ {
+       int ret, size = 0;
+       const char *name;
+@@ -140,8 +141,7 @@ static int btrfs_set_acl(struct inode *i
+                       goto out;
+       }
+-      ret = __btrfs_setxattr(inode, name, value, size, 0);
+-
++      ret = __btrfs_setxattr(trans, inode, name, value, size, 0);
+ out:
+       kfree(value);
+@@ -154,7 +154,7 @@ out:
+ static int btrfs_xattr_set_acl(struct inode *inode, int type,
+                              const void *value, size_t size)
+ {
+-      int ret = 0;
++      int ret;
+       struct posix_acl *acl = NULL;
+       if (!is_owner_or_cap(inode))
+@@ -170,7 +170,7 @@ static int btrfs_xattr_set_acl(struct in
+               }
+       }
+-      ret = btrfs_set_acl(inode, acl, type);
++      ret = btrfs_set_acl(NULL, inode, acl, type);
+       posix_acl_release(acl);
+@@ -224,7 +224,8 @@ int btrfs_check_acl(struct inode *inode,
+  * stuff has been fixed to work with that.  If the locking stuff changes, we
+  * need to re-evaluate the acl locking stuff.
+  */
+-int btrfs_init_acl(struct inode *inode, struct inode *dir)
++int btrfs_init_acl(struct btrfs_trans_handle *trans,
++                 struct inode *inode, struct inode *dir)
+ {
+       struct posix_acl *acl = NULL;
+       int ret = 0;
+@@ -249,7 +250,8 @@ int btrfs_init_acl(struct inode *inode,
+               mode_t mode;
+               if (S_ISDIR(inode->i_mode)) {
+-                      ret = btrfs_set_acl(inode, acl, ACL_TYPE_DEFAULT);
++                      ret = btrfs_set_acl(trans, inode, acl,
++                                          ACL_TYPE_DEFAULT);
+                       if (ret)
+                               goto failed;
+               }
+@@ -264,7 +266,7 @@ int btrfs_init_acl(struct inode *inode,
+                       inode->i_mode = mode;
+                       if (ret > 0) {
+                               /* we need an acl */
+-                              ret = btrfs_set_acl(inode, clone,
++                              ret = btrfs_set_acl(trans, inode, clone,
+                                                   ACL_TYPE_ACCESS);
+                       }
+               }
+@@ -297,7 +299,7 @@ int btrfs_acl_chmod(struct inode *inode)
+       ret = posix_acl_chmod_masq(clone, inode->i_mode);
+       if (!ret)
+-              ret = btrfs_set_acl(inode, clone, ACL_TYPE_ACCESS);
++              ret = btrfs_set_acl(NULL, inode, clone, ACL_TYPE_ACCESS);
+       posix_acl_release(clone);
+@@ -323,7 +325,8 @@ int btrfs_acl_chmod(struct inode *inode)
+       return 0;
+ }
+-int btrfs_init_acl(struct inode *inode, struct inode *dir)
++int btrfs_init_acl(struct btrfs_trans_handle *trans,
++                 struct inode *inode, struct inode *dir)
+ {
+       return 0;
+ }
+--- a/fs/btrfs/ctree.h
++++ b/fs/btrfs/ctree.h
+@@ -310,6 +310,9 @@ struct btrfs_header {
+ #define BTRFS_MAX_INLINE_DATA_SIZE(r) (BTRFS_LEAF_DATA_SIZE(r) - \
+                                       sizeof(struct btrfs_item) - \
+                                       sizeof(struct btrfs_file_extent_item))
++#define BTRFS_MAX_XATTR_SIZE(r)       (BTRFS_LEAF_DATA_SIZE(r) - \
++                               sizeof(struct btrfs_item) -\
++                               sizeof(struct btrfs_dir_item))
+ /*
+@@ -2201,9 +2204,10 @@ int btrfs_delete_one_dir_name(struct btr
+                             struct btrfs_path *path,
+                             struct btrfs_dir_item *di);
+ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans,
+-                          struct btrfs_root *root, const char *name,
+-                          u16 name_len, const void *data, u16 data_len,
+-                          u64 dir);
++                          struct btrfs_root *root,
++                          struct btrfs_path *path, u64 objectid,
++                          const char *name, u16 name_len,
++                          const void *data, u16 data_len);
+ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
+                                         struct btrfs_root *root,
+                                         struct btrfs_path *path, u64 dir,
+@@ -2382,7 +2386,8 @@ int btrfs_check_acl(struct inode *inode,
+ #else
+ #define btrfs_check_acl NULL
+ #endif
+-int btrfs_init_acl(struct inode *inode, struct inode *dir);
++int btrfs_init_acl(struct btrfs_trans_handle *trans,
++                 struct inode *inode, struct inode *dir);
+ int btrfs_acl_chmod(struct inode *inode);
+ /* relocation.c */
+--- a/fs/btrfs/dir-item.c
++++ b/fs/btrfs/dir-item.c
+@@ -68,12 +68,12 @@ static struct btrfs_dir_item *insert_wit
+  * into the tree
+  */
+ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans,
+-                          struct btrfs_root *root, const char *name,
+-                          u16 name_len, const void *data, u16 data_len,
+-                          u64 dir)
++                          struct btrfs_root *root,
++                          struct btrfs_path *path, u64 objectid,
++                          const char *name, u16 name_len,
++                          const void *data, u16 data_len)
+ {
+       int ret = 0;
+-      struct btrfs_path *path;
+       struct btrfs_dir_item *dir_item;
+       unsigned long name_ptr, data_ptr;
+       struct btrfs_key key, location;
+@@ -81,15 +81,11 @@ int btrfs_insert_xattr_item(struct btrfs
+       struct extent_buffer *leaf;
+       u32 data_size;
+-      key.objectid = dir;
++      BUG_ON(name_len + data_len > BTRFS_MAX_XATTR_SIZE(root));
++
++      key.objectid = objectid;
+       btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY);
+       key.offset = btrfs_name_hash(name, name_len);
+-      path = btrfs_alloc_path();
+-      if (!path)
+-              return -ENOMEM;
+-      if (name_len + data_len + sizeof(struct btrfs_dir_item) >
+-          BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item))
+-              return -ENOSPC;
+       data_size = sizeof(*dir_item) + name_len + data_len;
+       dir_item = insert_with_overflow(trans, root, path, &key, data_size,
+@@ -117,7 +113,6 @@ int btrfs_insert_xattr_item(struct btrfs
+       write_extent_buffer(leaf, data, data_ptr, data_len);
+       btrfs_mark_buffer_dirty(path->nodes[0]);
+-      btrfs_free_path(path);
+       return ret;
+ }
+--- a/fs/btrfs/inode.c
++++ b/fs/btrfs/inode.c
+@@ -88,13 +88,14 @@ static noinline int cow_file_range(struc
+                                  u64 start, u64 end, int *page_started,
+                                  unsigned long *nr_written, int unlock);
+-static int btrfs_init_inode_security(struct inode *inode,  struct inode *dir)
++static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
++                                   struct inode *inode,  struct inode *dir)
+ {
+       int err;
+-      err = btrfs_init_acl(inode, dir);
++      err = btrfs_init_acl(trans, inode, dir);
+       if (!err)
+-              err = btrfs_xattr_security_init(inode, dir);
++              err = btrfs_xattr_security_init(trans, inode, dir);
+       return err;
+ }
+@@ -4296,7 +4297,7 @@ static int btrfs_mknod(struct inode *dir
+       if (IS_ERR(inode))
+               goto out_unlock;
+-      err = btrfs_init_inode_security(inode, dir);
++      err = btrfs_init_inode_security(trans, inode, dir);
+       if (err) {
+               drop_inode = 1;
+               goto out_unlock;
+@@ -4367,7 +4368,7 @@ static int btrfs_create(struct inode *di
+       if (IS_ERR(inode))
+               goto out_unlock;
+-      err = btrfs_init_inode_security(inode, dir);
++      err = btrfs_init_inode_security(trans, inode, dir);
+       if (err) {
+               drop_inode = 1;
+               goto out_unlock;
+@@ -4500,7 +4501,7 @@ static int btrfs_mkdir(struct inode *dir
+       drop_on_err = 1;
+-      err = btrfs_init_inode_security(inode, dir);
++      err = btrfs_init_inode_security(trans, inode, dir);
+       if (err)
+               goto out_fail;
+@@ -5660,7 +5661,7 @@ static int btrfs_symlink(struct inode *d
+       if (IS_ERR(inode))
+               goto out_unlock;
+-      err = btrfs_init_inode_security(inode, dir);
++      err = btrfs_init_inode_security(trans, inode, dir);
+       if (err) {
+               drop_inode = 1;
+               goto out_unlock;
+--- a/fs/btrfs/xattr.c
++++ b/fs/btrfs/xattr.c
+@@ -85,22 +85,23 @@ out:
+       return ret;
+ }
+-int __btrfs_setxattr(struct inode *inode, const char *name,
+-                          const void *value, size_t size, int flags)
++static int do_setxattr(struct btrfs_trans_handle *trans,
++                     struct inode *inode, const char *name,
++                     const void *value, size_t size, int flags)
+ {
+       struct btrfs_dir_item *di;
+       struct btrfs_root *root = BTRFS_I(inode)->root;
+-      struct btrfs_trans_handle *trans;
+       struct btrfs_path *path;
+-      int ret = 0, mod = 0;
++      size_t name_len = strlen(name);
++      int ret = 0;
++
++      if (name_len + size > BTRFS_MAX_XATTR_SIZE(root))
++              return -ENOSPC;
+       path = btrfs_alloc_path();
+       if (!path)
+               return -ENOMEM;
+-      trans = btrfs_join_transaction(root, 1);
+-      btrfs_set_trans_block_group(trans, inode);
+-
+       /* first lets see if we already have this xattr */
+       di = btrfs_lookup_xattr(trans, root, path, inode->i_ino, name,
+                               strlen(name), -1);
+@@ -118,15 +119,12 @@ int __btrfs_setxattr(struct inode *inode
+               }
+               ret = btrfs_delete_one_dir_name(trans, root, path, di);
+-              if (ret)
+-                      goto out;
++              BUG_ON(ret);
+               btrfs_release_path(root, path);
+               /* if we don't have a value then we are removing the xattr */
+-              if (!value) {
+-                      mod = 1;
++              if (!value)
+                       goto out;
+-              }
+       } else {
+               btrfs_release_path(root, path);
+@@ -138,20 +136,45 @@ int __btrfs_setxattr(struct inode *inode
+       }
+       /* ok we have to create a completely new xattr */
+-      ret = btrfs_insert_xattr_item(trans, root, name, strlen(name),
+-                                    value, size, inode->i_ino);
++      ret = btrfs_insert_xattr_item(trans, root, path, inode->i_ino,
++                                    name, name_len, value, size);
++      BUG_ON(ret);
++out:
++      btrfs_free_path(path);
++      return ret;
++}
++
++int __btrfs_setxattr(struct btrfs_trans_handle *trans,
++                   struct inode *inode, const char *name,
++                   const void *value, size_t size, int flags)
++{
++      struct btrfs_root *root = BTRFS_I(inode)->root;
++      int ret;
++
++      if (trans)
++              return do_setxattr(trans, inode, name, value, size, flags);
++
++      ret = btrfs_reserve_metadata_space(root, 2);
+       if (ret)
+-              goto out;
+-      mod = 1;
++              return ret;
+-out:
+-      if (mod) {
+-              inode->i_ctime = CURRENT_TIME;
+-              ret = btrfs_update_inode(trans, root, inode);
++      trans = btrfs_start_transaction(root, 1);
++      if (!trans) {
++              ret = -ENOMEM;
++              goto out;
+       }
++      btrfs_set_trans_block_group(trans, inode);
+-      btrfs_end_transaction(trans, root);
+-      btrfs_free_path(path);
++      ret = do_setxattr(trans, inode, name, value, size, flags);
++      if (ret)
++              goto out;
++
++      inode->i_ctime = CURRENT_TIME;
++      ret = btrfs_update_inode(trans, root, inode);
++      BUG_ON(ret);
++out:
++      btrfs_end_transaction_throttle(trans, root);
++      btrfs_unreserve_metadata_space(root, 2);
+       return ret;
+ }
+@@ -314,7 +337,9 @@ int btrfs_setxattr(struct dentry *dentry
+       if (size == 0)
+               value = "";  /* empty EA, do not remove */
+-      return __btrfs_setxattr(dentry->d_inode, name, value, size, flags);
++
++      return __btrfs_setxattr(NULL, dentry->d_inode, name, value, size,
++                              flags);
+ }
+ int btrfs_removexattr(struct dentry *dentry, const char *name)
+@@ -329,10 +354,13 @@ int btrfs_removexattr(struct dentry *den
+       if (!btrfs_is_valid_xattr(name))
+               return -EOPNOTSUPP;
+-      return __btrfs_setxattr(dentry->d_inode, name, NULL, 0, XATTR_REPLACE);
++
++      return __btrfs_setxattr(NULL, dentry->d_inode, name, NULL, 0,
++                              XATTR_REPLACE);
+ }
+-int btrfs_xattr_security_init(struct inode *inode, struct inode *dir)
++int btrfs_xattr_security_init(struct btrfs_trans_handle *trans,
++                            struct inode *inode, struct inode *dir)
+ {
+       int err;
+       size_t len;
+@@ -354,7 +382,7 @@ int btrfs_xattr_security_init(struct ino
+       } else {
+               strcpy(name, XATTR_SECURITY_PREFIX);
+               strcpy(name + XATTR_SECURITY_PREFIX_LEN, suffix);
+-              err = __btrfs_setxattr(inode, name, value, len, 0);
++              err = __btrfs_setxattr(trans, inode, name, value, len, 0);
+               kfree(name);
+       }
+--- a/fs/btrfs/xattr.h
++++ b/fs/btrfs/xattr.h
+@@ -27,15 +27,16 @@ extern struct xattr_handler *btrfs_xattr
+ extern ssize_t __btrfs_getxattr(struct inode *inode, const char *name,
+               void *buffer, size_t size);
+-extern int __btrfs_setxattr(struct inode *inode, const char *name,
+-              const void *value, size_t size, int flags);
+-
++extern int __btrfs_setxattr(struct btrfs_trans_handle *trans,
++                          struct inode *inode, const char *name,
++                          const void *value, size_t size, int flags);
+ extern ssize_t btrfs_getxattr(struct dentry *dentry, const char *name,
+               void *buffer, size_t size);
+ extern int btrfs_setxattr(struct dentry *dentry, const char *name,
+               const void *value, size_t size, int flags);
+ extern int btrfs_removexattr(struct dentry *dentry, const char *name);
+-extern int btrfs_xattr_security_init(struct inode *inode, struct inode *dir);
++extern int btrfs_xattr_security_init(struct btrfs_trans_handle *trans,
++                                   struct inode *inode, struct inode *dir);
+ #endif /* __XATTR__ */
diff --git a/queue-2.6.32/btrfs-remove-bug_on-due-to-mounting-bad-filesystem.patch b/queue-2.6.32/btrfs-remove-bug_on-due-to-mounting-bad-filesystem.patch
new file mode 100644 (file)
index 0000000..d26e559
--- /dev/null
@@ -0,0 +1,63 @@
+From d7ce5843bb28ada6845ab2ae8510ba3f12d33154 Mon Sep 17 00:00:00 2001
+From: Miao Xie <miaox@cn.fujitsu.com>
+Date: Tue, 2 Feb 2010 08:46:44 +0000
+Subject: Btrfs: remove BUG_ON() due to mounting bad filesystem
+
+From: Miao Xie <miaox@cn.fujitsu.com>
+
+commit d7ce5843bb28ada6845ab2ae8510ba3f12d33154 upstream.
+
+Mounting a bad filesystem caused a BUG_ON(). The following steps
+reproduce it:
+ # mkfs.btrfs /dev/sda2
+ # mount /dev/sda2 /mnt
+ # mkfs.btrfs /dev/sda1 /dev/sda2
+ (the program says that /dev/sda2 was mounted, and then exits.)
+ # umount /mnt
+ # mount /dev/sda1 /mnt
+
+At the third step, mkfs.btrfs saw that /dev/sda2 was mounted and
+exited partway through making the filesystem, so the filesystem on
+/dev/sda1 was never fully initialized. Mounting that half-made
+filesystem then hit a BUG_ON(). A BUG_ON() should flag broken kernel
+code, not a user's operation, so this is a btrfs bug.
+
+This patch fixes it.
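+
+The fix follows the usual pattern of reserving BUG_ON() for internal
+invariants and returning an error for conditions a user can trigger.
+In sketch form (condensed from the disk-io.c hunk below):
+
+    ret = btrfs_recover_relocation(tree_root);
+    if (ret < 0) {
+        /* bad fs left behind by an interrupted mkfs: fail the
+         * mount instead of oopsing */
+        err = -EINVAL;
+        goto fail_trans_kthread;
+    }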
+
+Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
+Signed-off-by: Chris Mason <chris.mason@oracle.com>
+Acked-by: Jeff Mahoney <jeffm@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ fs/btrfs/disk-io.c    |    7 ++++++-
+ fs/btrfs/relocation.c |    3 ++-
+ 2 files changed, 8 insertions(+), 2 deletions(-)
+
+--- a/fs/btrfs/disk-io.c
++++ b/fs/btrfs/disk-io.c
+@@ -1982,7 +1982,12 @@ struct btrfs_root *open_ctree(struct sup
+       if (!(sb->s_flags & MS_RDONLY)) {
+               ret = btrfs_recover_relocation(tree_root);
+-              BUG_ON(ret);
++              if (ret < 0) {
++                      printk(KERN_WARNING
++                             "btrfs: failed to recover relocation\n");
++                      err = -EINVAL;
++                      goto fail_trans_kthread;
++              }
+       }
+       location.objectid = BTRFS_FS_TREE_OBJECTID;
+--- a/fs/btrfs/relocation.c
++++ b/fs/btrfs/relocation.c
+@@ -3764,7 +3764,8 @@ out:
+                                      BTRFS_DATA_RELOC_TREE_OBJECTID);
+               if (IS_ERR(fs_root))
+                       err = PTR_ERR(fs_root);
+-              btrfs_orphan_cleanup(fs_root);
++              else
++                      btrfs_orphan_cleanup(fs_root);
+       }
+       return err;
+ }
diff --git a/queue-2.6.32/btrfs-rewrite-btrfs_drop_extents.patch b/queue-2.6.32/btrfs-rewrite-btrfs_drop_extents.patch
new file mode 100644 (file)
index 0000000..a0f4de3
--- /dev/null
@@ -0,0 +1,943 @@
+From 920bbbfb05c9fce22e088d20eb9dcb8f96342de9 Mon Sep 17 00:00:00 2001
+From: Yan, Zheng <zheng.yan@oracle.com>
+Date: Thu, 12 Nov 2009 09:34:08 +0000
+Subject: Btrfs: Rewrite btrfs_drop_extents
+
+From: Yan, Zheng <zheng.yan@oracle.com>
+
+commit 920bbbfb05c9fce22e088d20eb9dcb8f96342de9 upstream.
+
+Rewrite btrfs_drop_extents using btrfs_duplicate_item, so we can
+avoid calling lock_extent within a transaction.
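+
+The rewritten loop walks each file extent item overlapping the range
+[start, end) and handles four overlap cases. A sketch of the case
+analysis (the diagrams in the hunks below show the same thing):
+
+    if (start > key.offset && end < extent_end) {
+        /* range inside extent: duplicate the item, trim the
+         * first copy to end at start and the second to begin
+         * at start */
+    } else if (start <= key.offset && end < extent_end) {
+        /* range covers the front: move the item key and the
+         * extent offset forward to end */
+    } else if (start > key.offset && end >= extent_end) {
+        /* range covers the tail: shorten the extent to
+         * start - key.offset bytes */
+    } else {
+        /* range covers the whole extent: batch the item for
+         * deletion with btrfs_del_items() */
+    }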
+
+Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
+Signed-off-by: Chris Mason <chris.mason@oracle.com>
+Acked-by: Jeff Mahoney <jeffm@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ fs/btrfs/ctree.h    |    7 
+ fs/btrfs/file.c     |  661 ++++++++++++++++++++--------------------------------
+ fs/btrfs/inode.c    |   27 --
+ fs/btrfs/ioctl.c    |    3 
+ fs/btrfs/tree-log.c |    4 
+ 5 files changed, 278 insertions(+), 424 deletions(-)
+
+--- a/fs/btrfs/ctree.h
++++ b/fs/btrfs/ctree.h
+@@ -2349,12 +2349,9 @@ int btrfs_drop_extent_cache(struct inode
+                           int skip_pinned);
+ int btrfs_check_file(struct btrfs_root *root, struct inode *inode);
+ extern const struct file_operations btrfs_file_operations;
+-int btrfs_drop_extents(struct btrfs_trans_handle *trans,
+-                     struct btrfs_root *root, struct inode *inode,
+-                     u64 start, u64 end, u64 locked_end,
+-                     u64 inline_limit, u64 *hint_block, int drop_cache);
++int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode,
++                     u64 start, u64 end, u64 *hint_byte, int drop_cache);
+ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
+-                            struct btrfs_root *root,
+                             struct inode *inode, u64 start, u64 end);
+ int btrfs_release_file(struct inode *inode, struct file *file);
+--- a/fs/btrfs/file.c
++++ b/fs/btrfs/file.c
+@@ -265,319 +265,247 @@ int btrfs_drop_extent_cache(struct inode
+  * If an extent intersects the range but is not entirely inside the range
+  * it is either truncated or split.  Anything entirely inside the range
+  * is deleted from the tree.
+- *
+- * inline_limit is used to tell this code which offsets in the file to keep
+- * if they contain inline extents.
+  */
+-noinline int btrfs_drop_extents(struct btrfs_trans_handle *trans,
+-                     struct btrfs_root *root, struct inode *inode,
+-                     u64 start, u64 end, u64 locked_end,
+-                     u64 inline_limit, u64 *hint_byte, int drop_cache)
++int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode,
++                     u64 start, u64 end, u64 *hint_byte, int drop_cache)
+ {
+-      u64 extent_end = 0;
+-      u64 search_start = start;
+-      u64 ram_bytes = 0;
+-      u64 disk_bytenr = 0;
+-      u64 orig_locked_end = locked_end;
+-      u8 compression;
+-      u8 encryption;
+-      u16 other_encoding = 0;
++      struct btrfs_root *root = BTRFS_I(inode)->root;
+       struct extent_buffer *leaf;
+-      struct btrfs_file_extent_item *extent;
++      struct btrfs_file_extent_item *fi;
+       struct btrfs_path *path;
+       struct btrfs_key key;
+-      struct btrfs_file_extent_item old;
+-      int keep;
+-      int slot;
+-      int bookend;
+-      int found_type = 0;
+-      int found_extent;
+-      int found_inline;
++      struct btrfs_key new_key;
++      u64 search_start = start;
++      u64 disk_bytenr = 0;
++      u64 num_bytes = 0;
++      u64 extent_offset = 0;
++      u64 extent_end = 0;
++      int del_nr = 0;
++      int del_slot = 0;
++      int extent_type;
+       int recow;
+       int ret;
+-      inline_limit = 0;
+       if (drop_cache)
+               btrfs_drop_extent_cache(inode, start, end - 1, 0);
+       path = btrfs_alloc_path();
+       if (!path)
+               return -ENOMEM;
++
+       while (1) {
+               recow = 0;
+-              btrfs_release_path(root, path);
+               ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
+                                              search_start, -1);
+               if (ret < 0)
+-                      goto out;
+-              if (ret > 0) {
+-                      if (path->slots[0] == 0) {
+-                              ret = 0;
+-                              goto out;
+-                      }
+-                      path->slots[0]--;
++                      break;
++              if (ret > 0 && path->slots[0] > 0 && search_start == start) {
++                      leaf = path->nodes[0];
++                      btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1);
++                      if (key.objectid == inode->i_ino &&
++                          key.type == BTRFS_EXTENT_DATA_KEY)
++                              path->slots[0]--;
+               }
++              ret = 0;
+ next_slot:
+-              keep = 0;
+-              bookend = 0;
+-              found_extent = 0;
+-              found_inline = 0;
+-              compression = 0;
+-              encryption = 0;
+-              extent = NULL;
+               leaf = path->nodes[0];
+-              slot = path->slots[0];
+-              ret = 0;
+-              btrfs_item_key_to_cpu(leaf, &key, slot);
+-              if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY &&
+-                  key.offset >= end) {
+-                      goto out;
+-              }
+-              if (btrfs_key_type(&key) > BTRFS_EXTENT_DATA_KEY ||
+-                  key.objectid != inode->i_ino) {
+-                      goto out;
+-              }
+-              if (recow) {
+-                      search_start = max(key.offset, start);
+-                      continue;
+-              }
+-              if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY) {
+-                      extent = btrfs_item_ptr(leaf, slot,
+-                                              struct btrfs_file_extent_item);
+-                      found_type = btrfs_file_extent_type(leaf, extent);
+-                      compression = btrfs_file_extent_compression(leaf,
+-                                                                  extent);
+-                      encryption = btrfs_file_extent_encryption(leaf,
+-                                                                extent);
+-                      other_encoding = btrfs_file_extent_other_encoding(leaf,
+-                                                                extent);
+-                      if (found_type == BTRFS_FILE_EXTENT_REG ||
+-                          found_type == BTRFS_FILE_EXTENT_PREALLOC) {
+-                              extent_end =
+-                                   btrfs_file_extent_disk_bytenr(leaf,
+-                                                                 extent);
+-                              if (extent_end)
+-                                      *hint_byte = extent_end;
+-
+-                              extent_end = key.offset +
+-                                   btrfs_file_extent_num_bytes(leaf, extent);
+-                              ram_bytes = btrfs_file_extent_ram_bytes(leaf,
+-                                                              extent);
+-                              found_extent = 1;
+-                      } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
+-                              found_inline = 1;
+-                              extent_end = key.offset +
+-                                   btrfs_file_extent_inline_len(leaf, extent);
++              if (path->slots[0] >= btrfs_header_nritems(leaf)) {
++                      BUG_ON(del_nr > 0);
++                      ret = btrfs_next_leaf(root, path);
++                      if (ret < 0)
++                              break;
++                      if (ret > 0) {
++                              ret = 0;
++                              break;
+                       }
++                      leaf = path->nodes[0];
++                      recow = 1;
++              }
++
++              btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
++              if (key.objectid > inode->i_ino ||
++                  key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= end)
++                      break;
++
++              fi = btrfs_item_ptr(leaf, path->slots[0],
++                                  struct btrfs_file_extent_item);
++              extent_type = btrfs_file_extent_type(leaf, fi);
++
++              if (extent_type == BTRFS_FILE_EXTENT_REG ||
++                  extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
++                      disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
++                      num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
++                      extent_offset = btrfs_file_extent_offset(leaf, fi);
++                      extent_end = key.offset +
++                              btrfs_file_extent_num_bytes(leaf, fi);
++              } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
++                      extent_end = key.offset +
++                              btrfs_file_extent_inline_len(leaf, fi);
+               } else {
++                      WARN_ON(1);
+                       extent_end = search_start;
+               }
+-              /* we found nothing we can drop */
+-              if ((!found_extent && !found_inline) ||
+-                  search_start >= extent_end) {
+-                      int nextret;
+-                      u32 nritems;
+-                      nritems = btrfs_header_nritems(leaf);
+-                      if (slot >= nritems - 1) {
+-                              nextret = btrfs_next_leaf(root, path);
+-                              if (nextret)
+-                                      goto out;
+-                              recow = 1;
+-                      } else {
+-                              path->slots[0]++;
+-                      }
++              if (extent_end <= search_start) {
++                      path->slots[0]++;
+                       goto next_slot;
+               }
+-              if (end <= extent_end && start >= key.offset && found_inline)
+-                      *hint_byte = EXTENT_MAP_INLINE;
++              search_start = max(key.offset, start);
++              if (recow) {
++                      btrfs_release_path(root, path);
++                      continue;
++              }
+-              if (found_extent) {
+-                      read_extent_buffer(leaf, &old, (unsigned long)extent,
+-                                         sizeof(old));
+-              }
+-
+-              if (end < extent_end && end >= key.offset) {
+-                      bookend = 1;
+-                      if (found_inline && start <= key.offset)
+-                              keep = 1;
+-              }
+-
+-              if (bookend && found_extent) {
+-                      if (locked_end < extent_end) {
+-                              ret = try_lock_extent(&BTRFS_I(inode)->io_tree,
+-                                              locked_end, extent_end - 1,
+-                                              GFP_NOFS);
+-                              if (!ret) {
+-                                      btrfs_release_path(root, path);
+-                                      lock_extent(&BTRFS_I(inode)->io_tree,
+-                                              locked_end, extent_end - 1,
+-                                              GFP_NOFS);
+-                                      locked_end = extent_end;
+-                                      continue;
+-                              }
+-                              locked_end = extent_end;
++              /*
++               *     | - range to drop - |
++               *  | -------- extent -------- |
++               */
++              if (start > key.offset && end < extent_end) {
++                      BUG_ON(del_nr > 0);
++                      BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE);
++
++                      memcpy(&new_key, &key, sizeof(new_key));
++                      new_key.offset = start;
++                      ret = btrfs_duplicate_item(trans, root, path,
++                                                 &new_key);
++                      if (ret == -EAGAIN) {
++                              btrfs_release_path(root, path);
++                              continue;
+                       }
+-                      disk_bytenr = le64_to_cpu(old.disk_bytenr);
+-                      if (disk_bytenr != 0) {
++                      if (ret < 0)
++                              break;
++
++                      leaf = path->nodes[0];
++                      fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
++                                          struct btrfs_file_extent_item);
++                      btrfs_set_file_extent_num_bytes(leaf, fi,
++                                                      start - key.offset);
++
++                      fi = btrfs_item_ptr(leaf, path->slots[0],
++                                          struct btrfs_file_extent_item);
++
++                      extent_offset += start - key.offset;
++                      btrfs_set_file_extent_offset(leaf, fi, extent_offset);
++                      btrfs_set_file_extent_num_bytes(leaf, fi,
++                                                      extent_end - start);
++                      btrfs_mark_buffer_dirty(leaf);
++
++                      if (disk_bytenr > 0) {
+                               ret = btrfs_inc_extent_ref(trans, root,
+-                                         disk_bytenr,
+-                                         le64_to_cpu(old.disk_num_bytes), 0,
+-                                         root->root_key.objectid,
+-                                         key.objectid, key.offset -
+-                                         le64_to_cpu(old.offset));
++                                              disk_bytenr, num_bytes, 0,
++                                              root->root_key.objectid,
++                                              new_key.objectid,
++                                              start - extent_offset);
+                               BUG_ON(ret);
++                              *hint_byte = disk_bytenr;
+                       }
++                      key.offset = start;
+               }
++              /*
++               *  | ---- range to drop ----- |
++               *      | -------- extent -------- |
++               */
++              if (start <= key.offset && end < extent_end) {
++                      BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE);
++
++                      memcpy(&new_key, &key, sizeof(new_key));
++                      new_key.offset = end;
++                      btrfs_set_item_key_safe(trans, root, path, &new_key);
+-              if (found_inline) {
+-                      u64 mask = root->sectorsize - 1;
+-                      search_start = (extent_end + mask) & ~mask;
+-              } else
+-                      search_start = extent_end;
+-
+-              /* truncate existing extent */
+-              if (start > key.offset) {
+-                      u64 new_num;
+-                      u64 old_num;
+-                      keep = 1;
+-                      WARN_ON(start & (root->sectorsize - 1));
+-                      if (found_extent) {
+-                              new_num = start - key.offset;
+-                              old_num = btrfs_file_extent_num_bytes(leaf,
+-                                                                    extent);
+-                              *hint_byte =
+-                                      btrfs_file_extent_disk_bytenr(leaf,
+-                                                                    extent);
+-                              if (btrfs_file_extent_disk_bytenr(leaf,
+-                                                                extent)) {
+-                                      inode_sub_bytes(inode, old_num -
+-                                                      new_num);
+-                              }
+-                              btrfs_set_file_extent_num_bytes(leaf,
+-                                                      extent, new_num);
+-                              btrfs_mark_buffer_dirty(leaf);
+-                      } else if (key.offset < inline_limit &&
+-                                 (end > extent_end) &&
+-                                 (inline_limit < extent_end)) {
+-                              u32 new_size;
+-                              new_size = btrfs_file_extent_calc_inline_size(
+-                                                 inline_limit - key.offset);
+-                              inode_sub_bytes(inode, extent_end -
+-                                              inline_limit);
+-                              btrfs_set_file_extent_ram_bytes(leaf, extent,
+-                                                      new_size);
+-                              if (!compression && !encryption) {
+-                                      btrfs_truncate_item(trans, root, path,
+-                                                          new_size, 1);
+-                              }
++                      extent_offset += end - key.offset;
++                      btrfs_set_file_extent_offset(leaf, fi, extent_offset);
++                      btrfs_set_file_extent_num_bytes(leaf, fi,
++                                                      extent_end - end);
++                      btrfs_mark_buffer_dirty(leaf);
++                      if (disk_bytenr > 0) {
++                              inode_sub_bytes(inode, end - key.offset);
++                              *hint_byte = disk_bytenr;
+                       }
++                      break;
+               }
+-              /* delete the entire extent */
+-              if (!keep) {
+-                      if (found_inline)
+-                              inode_sub_bytes(inode, extent_end -
+-                                              key.offset);
+-                      ret = btrfs_del_item(trans, root, path);
+-                      /* TODO update progress marker and return */
+-                      BUG_ON(ret);
+-                      extent = NULL;
+-                      btrfs_release_path(root, path);
+-                      /* the extent will be freed later */
+-              }
+-              if (bookend && found_inline && start <= key.offset) {
+-                      u32 new_size;
+-                      new_size = btrfs_file_extent_calc_inline_size(
+-                                                 extent_end - end);
+-                      inode_sub_bytes(inode, end - key.offset);
+-                      btrfs_set_file_extent_ram_bytes(leaf, extent,
+-                                                      new_size);
+-                      if (!compression && !encryption)
+-                              ret = btrfs_truncate_item(trans, root, path,
+-                                                        new_size, 0);
+-                      BUG_ON(ret);
+-              }
+-              /* create bookend, splitting the extent in two */
+-              if (bookend && found_extent) {
+-                      struct btrfs_key ins;
+-                      ins.objectid = inode->i_ino;
+-                      ins.offset = end;
+-                      btrfs_set_key_type(&ins, BTRFS_EXTENT_DATA_KEY);
+-
+-                      btrfs_release_path(root, path);
+-                      path->leave_spinning = 1;
+-                      ret = btrfs_insert_empty_item(trans, root, path, &ins,
+-                                                    sizeof(*extent));
+-                      BUG_ON(ret);
+-                      leaf = path->nodes[0];
+-                      extent = btrfs_item_ptr(leaf, path->slots[0],
+-                                              struct btrfs_file_extent_item);
+-                      write_extent_buffer(leaf, &old,
+-                                          (unsigned long)extent, sizeof(old));
+-
+-                      btrfs_set_file_extent_compression(leaf, extent,
+-                                                        compression);
+-                      btrfs_set_file_extent_encryption(leaf, extent,
+-                                                       encryption);
+-                      btrfs_set_file_extent_other_encoding(leaf, extent,
+-                                                           other_encoding);
+-                      btrfs_set_file_extent_offset(leaf, extent,
+-                                  le64_to_cpu(old.offset) + end - key.offset);
+-                      WARN_ON(le64_to_cpu(old.num_bytes) <
+-                              (extent_end - end));
+-                      btrfs_set_file_extent_num_bytes(leaf, extent,
+-                                                      extent_end - end);
++              search_start = extent_end;
++              /*
++               *       | ---- range to drop ----- |
++               *  | -------- extent -------- |
++               */
++              if (start > key.offset && end >= extent_end) {
++                      BUG_ON(del_nr > 0);
++                      BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE);
+-                      /*
+-                       * set the ram bytes to the size of the full extent
+-                       * before splitting.  This is a worst case flag,
+-                       * but its the best we can do because we don't know
+-                       * how splitting affects compression
+-                       */
+-                      btrfs_set_file_extent_ram_bytes(leaf, extent,
+-                                                      ram_bytes);
+-                      btrfs_set_file_extent_type(leaf, extent, found_type);
+-
+-                      btrfs_unlock_up_safe(path, 1);
+-                      btrfs_mark_buffer_dirty(path->nodes[0]);
+-                      btrfs_set_lock_blocking(path->nodes[0]);
++                      btrfs_set_file_extent_num_bytes(leaf, fi,
++                                                      start - key.offset);
++                      btrfs_mark_buffer_dirty(leaf);
++                      if (disk_bytenr > 0) {
++                              inode_sub_bytes(inode, extent_end - start);
++                              *hint_byte = disk_bytenr;
++                      }
++                      if (end == extent_end)
++                              break;
+-                      path->leave_spinning = 0;
+-                      btrfs_release_path(root, path);
+-                      if (disk_bytenr != 0)
+-                              inode_add_bytes(inode, extent_end - end);
++                      path->slots[0]++;
++                      goto next_slot;
+               }
+-              if (found_extent && !keep) {
+-                      u64 old_disk_bytenr = le64_to_cpu(old.disk_bytenr);
++              /*
++               *  | ---- range to drop ----- |
++               *    | ------ extent ------ |
++               */
++              if (start <= key.offset && end >= extent_end) {
++                      if (del_nr == 0) {
++                              del_slot = path->slots[0];
++                              del_nr = 1;
++                      } else {
++                              BUG_ON(del_slot + del_nr != path->slots[0]);
++                              del_nr++;
++                      }
+-                      if (old_disk_bytenr != 0) {
++                      if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
+                               inode_sub_bytes(inode,
+-                                              le64_to_cpu(old.num_bytes));
++                                              extent_end - key.offset);
++                              extent_end = ALIGN(extent_end,
++                                                 root->sectorsize);
++                      } else if (disk_bytenr > 0) {
+                               ret = btrfs_free_extent(trans, root,
+-                                              old_disk_bytenr,
+-                                              le64_to_cpu(old.disk_num_bytes),
+-                                              0, root->root_key.objectid,
++                                              disk_bytenr, num_bytes, 0,
++                                              root->root_key.objectid,
+                                               key.objectid, key.offset -
+-                                              le64_to_cpu(old.offset));
++                                              extent_offset);
+                               BUG_ON(ret);
+-                              *hint_byte = old_disk_bytenr;
++                              inode_sub_bytes(inode,
++                                              extent_end - key.offset);
++                              *hint_byte = disk_bytenr;
+                       }
+-              }
+-              if (search_start >= end) {
+-                      ret = 0;
+-                      goto out;
++                      if (end == extent_end)
++                              break;
++
++                      if (path->slots[0] + 1 < btrfs_header_nritems(leaf)) {
++                              path->slots[0]++;
++                              goto next_slot;
++                      }
++
++                      ret = btrfs_del_items(trans, root, path, del_slot,
++                                            del_nr);
++                      BUG_ON(ret);
++
++                      del_nr = 0;
++                      del_slot = 0;
++
++                      btrfs_release_path(root, path);
++                      continue;
+               }
++
++              BUG_ON(1);
+       }
+-out:
+-      btrfs_free_path(path);
+-      if (locked_end > orig_locked_end) {
+-              unlock_extent(&BTRFS_I(inode)->io_tree, orig_locked_end,
+-                            locked_end - 1, GFP_NOFS);
++
++      if (del_nr > 0) {
++              ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
++              BUG_ON(ret);
+       }
++
++      btrfs_free_path(path);
+       return ret;
+ }
+@@ -620,23 +548,23 @@ static int extent_mergeable(struct exten
+  * two or three.
+  */
+ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
+-                            struct btrfs_root *root,
+                             struct inode *inode, u64 start, u64 end)
+ {
++      struct btrfs_root *root = BTRFS_I(inode)->root;
+       struct extent_buffer *leaf;
+       struct btrfs_path *path;
+       struct btrfs_file_extent_item *fi;
+       struct btrfs_key key;
++      struct btrfs_key new_key;
+       u64 bytenr;
+       u64 num_bytes;
+       u64 extent_end;
+       u64 orig_offset;
+       u64 other_start;
+       u64 other_end;
+-      u64 split = start;
+-      u64 locked_end = end;
+-      int extent_type;
+-      int split_end = 1;
++      u64 split;
++      int del_nr = 0;
++      int del_slot = 0;
+       int ret;
+       btrfs_drop_extent_cache(inode, start, end - 1, 0);
+@@ -644,12 +572,10 @@ int btrfs_mark_extent_written(struct btr
+       path = btrfs_alloc_path();
+       BUG_ON(!path);
+ again:
++      split = start;
+       key.objectid = inode->i_ino;
+       key.type = BTRFS_EXTENT_DATA_KEY;
+-      if (split == start)
+-              key.offset = split;
+-      else
+-              key.offset = split - 1;
++      key.offset = split;
+       ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+       if (ret > 0 && path->slots[0] > 0)
+@@ -661,8 +587,8 @@ again:
+              key.type != BTRFS_EXTENT_DATA_KEY);
+       fi = btrfs_item_ptr(leaf, path->slots[0],
+                           struct btrfs_file_extent_item);
+-      extent_type = btrfs_file_extent_type(leaf, fi);
+-      BUG_ON(extent_type != BTRFS_FILE_EXTENT_PREALLOC);
++      BUG_ON(btrfs_file_extent_type(leaf, fi) !=
++             BTRFS_FILE_EXTENT_PREALLOC);
+       extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
+       BUG_ON(key.offset > start || extent_end < end);
+@@ -670,150 +596,91 @@ again:
+       num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
+       orig_offset = key.offset - btrfs_file_extent_offset(leaf, fi);
+-      if (key.offset == start)
+-              split = end;
+-
+-      if (key.offset == start && extent_end == end) {
+-              int del_nr = 0;
+-              int del_slot = 0;
+-              other_start = end;
+-              other_end = 0;
+-              if (extent_mergeable(leaf, path->slots[0] + 1, inode->i_ino,
+-                                   bytenr, &other_start, &other_end)) {
+-                      extent_end = other_end;
+-                      del_slot = path->slots[0] + 1;
+-                      del_nr++;
+-                      ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
+-                                              0, root->root_key.objectid,
+-                                              inode->i_ino, orig_offset);
+-                      BUG_ON(ret);
+-              }
+-              other_start = 0;
+-              other_end = start;
+-              if (extent_mergeable(leaf, path->slots[0] - 1, inode->i_ino,
+-                                   bytenr, &other_start, &other_end)) {
+-                      key.offset = other_start;
+-                      del_slot = path->slots[0];
+-                      del_nr++;
+-                      ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
+-                                              0, root->root_key.objectid,
+-                                              inode->i_ino, orig_offset);
+-                      BUG_ON(ret);
+-              }
+-              split_end = 0;
+-              if (del_nr == 0) {
+-                      btrfs_set_file_extent_type(leaf, fi,
+-                                                 BTRFS_FILE_EXTENT_REG);
+-                      goto done;
++      while (start > key.offset || end < extent_end) {
++              if (key.offset == start)
++                      split = end;
++
++              memcpy(&new_key, &key, sizeof(new_key));
++              new_key.offset = split;
++              ret = btrfs_duplicate_item(trans, root, path, &new_key);
++              if (ret == -EAGAIN) {
++                      btrfs_release_path(root, path);
++                      goto again;
+               }
++              BUG_ON(ret < 0);
+-              fi = btrfs_item_ptr(leaf, del_slot - 1,
++              leaf = path->nodes[0];
++              fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
+                                   struct btrfs_file_extent_item);
+-              btrfs_set_file_extent_type(leaf, fi, BTRFS_FILE_EXTENT_REG);
+               btrfs_set_file_extent_num_bytes(leaf, fi,
+-                                              extent_end - key.offset);
++                                              split - key.offset);
++
++              fi = btrfs_item_ptr(leaf, path->slots[0],
++                                  struct btrfs_file_extent_item);
++
++              btrfs_set_file_extent_offset(leaf, fi, split - orig_offset);
++              btrfs_set_file_extent_num_bytes(leaf, fi,
++                                              extent_end - split);
+               btrfs_mark_buffer_dirty(leaf);
+-              ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
++              ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0,
++                                         root->root_key.objectid,
++                                         inode->i_ino, orig_offset);
+               BUG_ON(ret);
+-              goto release;
+-      } else if (split == start) {
+-              if (locked_end < extent_end) {
+-                      ret = try_lock_extent(&BTRFS_I(inode)->io_tree,
+-                                      locked_end, extent_end - 1, GFP_NOFS);
+-                      if (!ret) {
+-                              btrfs_release_path(root, path);
+-                              lock_extent(&BTRFS_I(inode)->io_tree,
+-                                      locked_end, extent_end - 1, GFP_NOFS);
+-                              locked_end = extent_end;
+-                              goto again;
+-                      }
+-                      locked_end = extent_end;
+-              }
+-              btrfs_set_file_extent_num_bytes(leaf, fi, split - key.offset);
+-      } else  {
+-              BUG_ON(key.offset != start);
+-              key.offset = split;
+-              btrfs_set_file_extent_offset(leaf, fi, key.offset -
+-                                           orig_offset);
+-              btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - split);
+-              btrfs_set_item_key_safe(trans, root, path, &key);
+-              extent_end = split;
+-      }
+-      if (extent_end == end) {
+-              split_end = 0;
+-              extent_type = BTRFS_FILE_EXTENT_REG;
+-      }
+-      if (extent_end == end && split == start) {
+-              other_start = end;
+-              other_end = 0;
+-              if (extent_mergeable(leaf, path->slots[0] + 1, inode->i_ino,
+-                                   bytenr, &other_start, &other_end)) {
+-                      path->slots[0]++;
+-                      fi = btrfs_item_ptr(leaf, path->slots[0],
+-                                          struct btrfs_file_extent_item);
+-                      key.offset = split;
+-                      btrfs_set_item_key_safe(trans, root, path, &key);
+-                      btrfs_set_file_extent_offset(leaf, fi, key.offset -
+-                                                   orig_offset);
+-                      btrfs_set_file_extent_num_bytes(leaf, fi,
+-                                                      other_end - split);
+-                      goto done;
+-              }
+-      }
+-      if (extent_end == end && split == end) {
+-              other_start = 0;
+-              other_end = start;
+-              if (extent_mergeable(leaf, path->slots[0] - 1 , inode->i_ino,
+-                                   bytenr, &other_start, &other_end)) {
++              if (split == start) {
++                      key.offset = start;
++              } else {
++                      BUG_ON(start != key.offset);
+                       path->slots[0]--;
+-                      fi = btrfs_item_ptr(leaf, path->slots[0],
+-                                          struct btrfs_file_extent_item);
+-                      btrfs_set_file_extent_num_bytes(leaf, fi, extent_end -
+-                                                      other_start);
+-                      goto done;
++                      extent_end = end;
+               }
+       }
+-      btrfs_mark_buffer_dirty(leaf);
+-
+-      ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0,
+-                                 root->root_key.objectid,
+-                                 inode->i_ino, orig_offset);
+-      BUG_ON(ret);
+-      btrfs_release_path(root, path);
+-
+-      key.offset = start;
+-      ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*fi));
+-      BUG_ON(ret);
+-
+-      leaf = path->nodes[0];
+       fi = btrfs_item_ptr(leaf, path->slots[0],
+                           struct btrfs_file_extent_item);
+-      btrfs_set_file_extent_generation(leaf, fi, trans->transid);
+-      btrfs_set_file_extent_type(leaf, fi, extent_type);
+-      btrfs_set_file_extent_disk_bytenr(leaf, fi, bytenr);
+-      btrfs_set_file_extent_disk_num_bytes(leaf, fi, num_bytes);
+-      btrfs_set_file_extent_offset(leaf, fi, key.offset - orig_offset);
+-      btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - key.offset);
+-      btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
+-      btrfs_set_file_extent_compression(leaf, fi, 0);
+-      btrfs_set_file_extent_encryption(leaf, fi, 0);
+-      btrfs_set_file_extent_other_encoding(leaf, fi, 0);
+-done:
+-      btrfs_mark_buffer_dirty(leaf);
+-release:
+-      btrfs_release_path(root, path);
+-      if (split_end && split == start) {
+-              split = end;
+-              goto again;
++      other_start = end;
++      other_end = 0;
++      if (extent_mergeable(leaf, path->slots[0] + 1, inode->i_ino,
++                           bytenr, &other_start, &other_end)) {
++              extent_end = other_end;
++              del_slot = path->slots[0] + 1;
++              del_nr++;
++              ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
++                                      0, root->root_key.objectid,
++                                      inode->i_ino, orig_offset);
++              BUG_ON(ret);
++      }
++      other_start = 0;
++      other_end = start;
++      if (extent_mergeable(leaf, path->slots[0] - 1, inode->i_ino,
++                           bytenr, &other_start, &other_end)) {
++              key.offset = other_start;
++              del_slot = path->slots[0];
++              del_nr++;
++              ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
++                                      0, root->root_key.objectid,
++                                      inode->i_ino, orig_offset);
++              BUG_ON(ret);
+       }
+-      if (locked_end > end) {
+-              unlock_extent(&BTRFS_I(inode)->io_tree, end, locked_end - 1,
+-                            GFP_NOFS);
++      if (del_nr == 0) {
++              btrfs_set_file_extent_type(leaf, fi,
++                                         BTRFS_FILE_EXTENT_REG);
++              btrfs_mark_buffer_dirty(leaf);
++              goto out;
+       }
++
++      fi = btrfs_item_ptr(leaf, del_slot - 1,
++                          struct btrfs_file_extent_item);
++      btrfs_set_file_extent_type(leaf, fi, BTRFS_FILE_EXTENT_REG);
++      btrfs_set_file_extent_num_bytes(leaf, fi,
++                                      extent_end - key.offset);
++      btrfs_mark_buffer_dirty(leaf);
++
++      ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
++      BUG_ON(ret);
++out:
+       btrfs_free_path(path);
+       return 0;
+ }
+--- a/fs/btrfs/inode.c
++++ b/fs/btrfs/inode.c
+@@ -230,8 +230,7 @@ static noinline int cow_file_range_inlin
+               return 1;
+       }
+-      ret = btrfs_drop_extents(trans, root, inode, start,
+-                               aligned_end, aligned_end, start,
++      ret = btrfs_drop_extents(trans, inode, start, aligned_end,
+                                &hint_byte, 1);
+       BUG_ON(ret);
+@@ -1596,7 +1595,6 @@ static int insert_reserved_file_extent(s
+                                      struct inode *inode, u64 file_pos,
+                                      u64 disk_bytenr, u64 disk_num_bytes,
+                                      u64 num_bytes, u64 ram_bytes,
+-                                     u64 locked_end,
+                                      u8 compression, u8 encryption,
+                                      u16 other_encoding, int extent_type)
+ {
+@@ -1622,9 +1620,8 @@ static int insert_reserved_file_extent(s
+        * the caller is expected to unpin it and allow it to be merged
+        * with the others.
+        */
+-      ret = btrfs_drop_extents(trans, root, inode, file_pos,
+-                               file_pos + num_bytes, locked_end,
+-                               file_pos, &hint, 0);
++      ret = btrfs_drop_extents(trans, inode, file_pos, file_pos + num_bytes,
++                               &hint, 0);
+       BUG_ON(ret);
+       ins.objectid = inode->i_ino;
+@@ -1746,7 +1743,7 @@ static int btrfs_finish_ordered_io(struc
+               compressed = 1;
+       if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
+               BUG_ON(compressed);
+-              ret = btrfs_mark_extent_written(trans, root, inode,
++              ret = btrfs_mark_extent_written(trans, inode,
+                                               ordered_extent->file_offset,
+                                               ordered_extent->file_offset +
+                                               ordered_extent->len);
+@@ -1758,8 +1755,6 @@ static int btrfs_finish_ordered_io(struc
+                                               ordered_extent->disk_len,
+                                               ordered_extent->len,
+                                               ordered_extent->len,
+-                                              ordered_extent->file_offset +
+-                                              ordered_extent->len,
+                                               compressed, 0, 0,
+                                               BTRFS_FILE_EXTENT_REG);
+               unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
+@@ -3209,11 +3204,9 @@ int btrfs_cont_expand(struct inode *inod
+               if (test_bit(EXTENT_FLAG_VACANCY, &em->flags)) {
+                       u64 hint_byte = 0;
+                       hole_size = last_byte - cur_offset;
+-                      err = btrfs_drop_extents(trans, root, inode,
+-                                               cur_offset,
++                      err = btrfs_drop_extents(trans, inode, cur_offset,
+                                                cur_offset + hole_size,
+-                                               block_end,
+-                                               cur_offset, &hint_byte, 1);
++                                               &hint_byte, 1);
+                       if (err)
+                               break;
+@@ -5643,7 +5636,7 @@ out_fail:
+ static int prealloc_file_range(struct btrfs_trans_handle *trans,
+                              struct inode *inode, u64 start, u64 end,
+-                             u64 locked_end, u64 alloc_hint, int mode)
++                             u64 alloc_hint, int mode)
+ {
+       struct btrfs_root *root = BTRFS_I(inode)->root;
+       struct btrfs_key ins;
+@@ -5669,8 +5662,7 @@ static int prealloc_file_range(struct bt
+               ret = insert_reserved_file_extent(trans, inode,
+                                                 cur_offset, ins.objectid,
+                                                 ins.offset, ins.offset,
+-                                                ins.offset, locked_end,
+-                                                0, 0, 0,
++                                                ins.offset, 0, 0, 0,
+                                                 BTRFS_FILE_EXTENT_PREALLOC);
+               BUG_ON(ret);
+               btrfs_drop_extent_cache(inode, cur_offset,
+@@ -5779,8 +5771,7 @@ static long btrfs_fallocate(struct inode
+               last_byte = (last_byte + mask) & ~mask;
+               if (em->block_start == EXTENT_MAP_HOLE) {
+                       ret = prealloc_file_range(trans, inode, cur_offset,
+-                                      last_byte, locked_end + 1,
+-                                      alloc_hint, mode);
++                                              last_byte, alloc_hint, mode);
+                       if (ret < 0) {
+                               free_extent_map(em);
+                               break;
+--- a/fs/btrfs/ioctl.c
++++ b/fs/btrfs/ioctl.c
+@@ -1032,8 +1032,7 @@ static noinline long btrfs_ioctl_clone(s
+       BUG_ON(!trans);
+       /* punch hole in destination first */
+-      btrfs_drop_extents(trans, root, inode, off, off + len,
+-                         off + len, 0, &hint_byte, 1);
++      btrfs_drop_extents(trans, inode, off, off + len, &hint_byte, 1);
+       /* clone data */
+       key.objectid = src->i_ino;
+--- a/fs/btrfs/tree-log.c
++++ b/fs/btrfs/tree-log.c
+@@ -542,8 +542,8 @@ static noinline int replay_one_extent(st
+       saved_nbytes = inode_get_bytes(inode);
+       /* drop any overlapping extents */
+-      ret = btrfs_drop_extents(trans, root, inode,
+-                       start, extent_end, extent_end, start, &alloc_hint, 1);
++      ret = btrfs_drop_extents(trans, inode, start, extent_end,
++                               &alloc_hint, 1);
+       BUG_ON(ret);
+       if (found_type == BTRFS_FILE_EXTENT_REG ||
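All call sites above converge on a shorter btrfs_drop_extents() signature,
with the locked_end and inline-limit arguments gone.  A minimal sketch of the
resulting convention, inferred from the converted callers (parameter names
beyond what the call sites show, such as drop_cache, are an assumption; the
authoritative prototype lives in fs/btrfs/ctree.h of the patched tree):

    /* Drop all file extent items of @inode overlapping [start, end).
     * @hint_byte returns a disk-offset hint for the next allocation;
     * @drop_cache selects whether to also invalidate the extent cache. */
    int btrfs_drop_extents(struct btrfs_trans_handle *trans,
                           struct inode *inode, u64 start, u64 end,
                           u64 *hint_byte, int drop_cache);

    /* Typical converted caller, as in replay_one_extent() above: */
    ret = btrfs_drop_extents(trans, inode, start, extent_end,
                             &alloc_hint, 1);
    BUG_ON(ret);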
diff --git a/queue-2.6.32/btrfs-run-orphan-cleanup-on-default-fs-root.patch b/queue-2.6.32/btrfs-run-orphan-cleanup-on-default-fs-root.patch
new file mode 100644 (file)
index 0000000..f9d7ca2
--- /dev/null
@@ -0,0 +1,61 @@
+From e3acc2a6850efff647f1c5458524eb3a8bcba20a Mon Sep 17 00:00:00 2001
+From: Josef Bacik <josef@redhat.com>
+Date: Tue, 26 Jan 2010 14:30:53 +0000
+Subject: Btrfs: run orphan cleanup on default fs root
+
+From: Josef Bacik <josef@redhat.com>
+
+commit e3acc2a6850efff647f1c5458524eb3a8bcba20a upstream.
+
+This patch reverts commit
+
+6c090a11e1c403b727a6a8eff0b97d5fb9e95cb5
+
+since it introduces a problem where we can run orphan cleanup on a
+volume that can have orphan entries re-added.  Instead of my original
+fix, Yan Zheng pointed out that we can just revert my original fix and
+then run the orphan cleanup in open_ctree after we look up the fs_root.
+I have tested this with all the tests that gave me problems and this
+patch fixes both problems.  Thanks,
+
+Signed-off-by: Josef Bacik <josef@redhat.com>
+Signed-off-by: Chris Mason <chris.mason@oracle.com>
+Acked-by: Jeff Mahoney <jeffm@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+
+---
+ fs/btrfs/disk-io.c |    6 ++++++
+ fs/btrfs/inode.c   |    6 ------
+ 2 files changed, 6 insertions(+), 6 deletions(-)
+
+--- a/fs/btrfs/disk-io.c
++++ b/fs/btrfs/disk-io.c
+@@ -1993,6 +1993,12 @@ struct btrfs_root *open_ctree(struct sup
+       if (!fs_info->fs_root)
+               goto fail_trans_kthread;
++      if (!(sb->s_flags & MS_RDONLY)) {
++              down_read(&fs_info->cleanup_work_sem);
++              btrfs_orphan_cleanup(fs_info->fs_root);
++              up_read(&fs_info->cleanup_work_sem);
++      }
++
+       return tree_root;
+ fail_trans_kthread:
+--- a/fs/btrfs/inode.c
++++ b/fs/btrfs/inode.c
+@@ -3796,12 +3796,6 @@ struct inode *btrfs_lookup_dentry(struct
+       if (location.type == BTRFS_INODE_ITEM_KEY) {
+               inode = btrfs_iget(dir->i_sb, &location, root);
+-              if (unlikely(root->clean_orphans) &&
+-                  !(inode->i_sb->s_flags & MS_RDONLY)) {
+-                      down_read(&root->fs_info->cleanup_work_sem);
+-                      btrfs_orphan_cleanup(root);
+-                      up_read(&root->fs_info->cleanup_work_sem);
+-              }
+               return inode;
+       }
diff --git a/queue-2.6.32/btrfs-show-discard-option-in-proc-mounts.patch b/queue-2.6.32/btrfs-show-discard-option-in-proc-mounts.patch
new file mode 100644 (file)
index 0000000..9299cc2
--- /dev/null
@@ -0,0 +1,34 @@
+From 20a5239a5d0f340e29827a6a2d28a138001c44b8 Mon Sep 17 00:00:00 2001
+From: Matthew Wilcox <matthew@wil.cx>
+Date: Mon, 14 Dec 2009 22:01:12 +0000
+Subject: Btrfs: Show discard option in /proc/mounts
+
+From: Matthew Wilcox <matthew@wil.cx>
+
+commit 20a5239a5d0f340e29827a6a2d28a138001c44b8 upstream.
+
+Christoph's patch e244a0aeb6a599c19a7c802cda6e2d67c847b154 doesn't display
+the discard option in /proc/mounts, leading to some confusion for me.
+Here's the missing bit.
+
+Signed-off-by: Matthew Wilcox <willy@linux.intel.com>
+Signed-off-by: Chris Mason <chris.mason@oracle.com>
+Acked-by: Jeff Mahoney <jeffm@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+
+---
+ fs/btrfs/super.c |    2 ++
+ 1 file changed, 2 insertions(+)
+
+--- a/fs/btrfs/super.c
++++ b/fs/btrfs/super.c
+@@ -457,6 +457,8 @@ static int btrfs_show_options(struct seq
+               seq_puts(seq, ",notreelog");
+       if (btrfs_test_opt(root, FLUSHONCOMMIT))
+               seq_puts(seq, ",flushoncommit");
++      if (btrfs_test_opt(root, DISCARD))
++              seq_puts(seq, ",discard");
+       if (!(root->fs_info->sb->s_flags & MS_POSIXACL))
+               seq_puts(seq, ",noacl");
+       return 0;
diff --git a/queue-2.6.32/btrfs-use-correct-values-when-updating-inode-i_size-on-fallocate.patch b/queue-2.6.32/btrfs-use-correct-values-when-updating-inode-i_size-on-fallocate.patch
new file mode 100644 (file)
index 0000000..81cfe44
--- /dev/null
@@ -0,0 +1,71 @@
+From d1ea6a61454e7d7ff0873d0ad1ae27d5807da0d3 Mon Sep 17 00:00:00 2001
+From: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+Date: Wed, 20 Jan 2010 07:28:54 +0000
+Subject: Btrfs: Use correct values when updating inode i_size on fallocate
+
+From: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+
+commit d1ea6a61454e7d7ff0873d0ad1ae27d5807da0d3 upstream.
+
+commit f2bc9dd07e3424c4ec5f3949961fe053d47bc825
+Author: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+Date:   Wed Jan 20 12:57:53 2010 +0530
+
+    Btrfs: Use correct values when updating inode i_size on fallocate
+
+    Even though we allocate more, we should be updating inode i_size
+    as per the arguments passed
+
+    Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+
+Signed-off-by: Chris Mason <chris.mason@oracle.com>
+Acked-by: Jeff Mahoney <jeffm@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ fs/btrfs/inode.c |   13 +++++++++----
+ 1 file changed, 9 insertions(+), 4 deletions(-)
+
+--- a/fs/btrfs/inode.c
++++ b/fs/btrfs/inode.c
+@@ -5799,7 +5799,7 @@ out_fail:
+ }
+ static int prealloc_file_range(struct inode *inode, u64 start, u64 end,
+-                             u64 alloc_hint, int mode)
++                      u64 alloc_hint, int mode, loff_t actual_len)
+ {
+       struct btrfs_trans_handle *trans;
+       struct btrfs_root *root = BTRFS_I(inode)->root;
+@@ -5808,6 +5808,7 @@ static int prealloc_file_range(struct in
+       u64 cur_offset = start;
+       u64 num_bytes = end - start;
+       int ret = 0;
++      u64 i_size;
+       while (num_bytes > 0) {
+               alloc_size = min(num_bytes, root->fs_info->max_extent);
+@@ -5846,8 +5847,12 @@ static int prealloc_file_range(struct in
+               BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC;
+               if (!(mode & FALLOC_FL_KEEP_SIZE) &&
+                   cur_offset > inode->i_size) {
+-                      i_size_write(inode, cur_offset);
+-                      btrfs_ordered_update_i_size(inode, cur_offset, NULL);
++                      if (cur_offset > actual_len)
++                              i_size  = actual_len;
++                      else
++                              i_size = cur_offset;
++                      i_size_write(inode, i_size);
++                      btrfs_ordered_update_i_size(inode, i_size, NULL);
+               }
+               ret = btrfs_update_inode(trans, root, inode);
+@@ -5940,7 +5945,7 @@ static long btrfs_fallocate(struct inode
+                    !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
+                       ret = prealloc_file_range(inode,
+                                                 cur_offset, last_byte,
+-                                                alloc_hint, mode);
++                                              alloc_hint, mode, offset+len);
+                       if (ret < 0) {
+                               free_extent_map(em);
+                               break;
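The clamp is easiest to see with concrete numbers.  A self-contained
user-space sketch (illustrative only; clamped_i_size is a hypothetical
helper mirroring the hunk above):

    #include <stdio.h>

    typedef unsigned long long u64;

    /* Never grow i_size past the length fallocate() actually asked for
     * (actual_len), even when the allocator rounds cur_offset up to the
     * next extent boundary. */
    static u64 clamped_i_size(u64 cur_offset, u64 actual_len)
    {
            return cur_offset > actual_len ? actual_len : cur_offset;
    }

    int main(void)
    {
            /* fallocate() of 5000 bytes with 4 KiB extents: cur_offset
             * reaches 8192, but i_size must stay at 5000. */
            printf("%llu\n", clamped_i_size(8192, 5000)); /* prints 5000 */
            return 0;
    }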
diff --git a/queue-2.6.32/crypto-testmgr-fix-complain-about-lack-test-for-internal-used-algorithm.patch b/queue-2.6.32/crypto-testmgr-fix-complain-about-lack-test-for-internal-used-algorithm.patch
new file mode 100644 (file)
index 0000000..6937869
--- /dev/null
@@ -0,0 +1,142 @@
+From 863b557a88f8c033f7419fabafef4712a5055f85 Mon Sep 17 00:00:00 2001
+From: Youquan, Song <youquan.song@intel.com>
+Date: Wed, 23 Dec 2009 19:45:20 +0800
+Subject: crypto: testmgr - Fix complain about lack test for internal used algorithm
+
+From: Youquan, Song <youquan.song@intel.com>
+
+commit 863b557a88f8c033f7419fabafef4712a5055f85 upstream.
+
+When loading the aesni-intel and ghash_clmulni-intel drivers, the kernel
+will complain that there is no test for some internally used algorithms.
+The messages look like the following:
+
+alg: No test for __aes-aesni (__driver-aes-aesni)
+alg: No test for __ecb-aes-aesni (__driver-ecb-aes-aesni)
+alg: No test for __cbc-aes-aesni (__driver-cbc-aes-aesni)
+alg: No test for __ecb-aes-aesni (cryptd(__driver-ecb-aes-aesni)
+alg: No test for __ghash (__ghash-pclmulqdqni)
+alg: No test for __ghash (cryptd(__ghash-pclmulqdqni))
+
+This patch adds NULL test entries for these algorithms and drivers.
+
+Signed-off-by: Youquan, Song <youquan.song@intel.com>
+Signed-off-by: Ying, Huang <ying.huang@intel.com>
+Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
+Acked-by: Jiri Kosina <jkosina@suse.cz>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ crypto/testmgr.c |   84 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
+ 1 file changed, 84 insertions(+)
+
+--- a/crypto/testmgr.c
++++ b/crypto/testmgr.c
+@@ -1477,9 +1477,54 @@ static int alg_test_cprng(const struct a
+       return err;
+ }
++static int alg_test_null(const struct alg_test_desc *desc,
++                           const char *driver, u32 type, u32 mask)
++{
++      return 0;
++}
++
+ /* Please keep this list sorted by algorithm name. */
+ static const struct alg_test_desc alg_test_descs[] = {
+       {
++              .alg = "__driver-cbc-aes-aesni",
++              .test = alg_test_null,
++              .suite = {
++                      .cipher = {
++                              .enc = {
++                                      .vecs = NULL,
++                                      .count = 0
++                              },
++                              .dec = {
++                                      .vecs = NULL,
++                                      .count = 0
++                              }
++                      }
++              }
++      }, {
++              .alg = "__driver-ecb-aes-aesni",
++              .test = alg_test_null,
++              .suite = {
++                      .cipher = {
++                              .enc = {
++                                      .vecs = NULL,
++                                      .count = 0
++                              },
++                              .dec = {
++                                      .vecs = NULL,
++                                      .count = 0
++                              }
++                      }
++              }
++      }, {
++              .alg = "__ghash-pclmulqdqni",
++              .test = alg_test_null,
++              .suite = {
++                      .hash = {
++                              .vecs = NULL,
++                              .count = 0
++                      }
++              }
++      }, {
+               .alg = "ansi_cprng",
+               .test = alg_test_cprng,
+               .fips_allowed = 1,
+@@ -1623,6 +1668,30 @@ static const struct alg_test_desc alg_te
+                       }
+               }
+       }, {
++              .alg = "cryptd(__driver-ecb-aes-aesni)",
++              .test = alg_test_null,
++              .suite = {
++                      .cipher = {
++                              .enc = {
++                                      .vecs = NULL,
++                                      .count = 0
++                              },
++                              .dec = {
++                                      .vecs = NULL,
++                                      .count = 0
++                              }
++                      }
++              }
++      }, {
++              .alg = "cryptd(__ghash-pclmulqdqni)",
++              .test = alg_test_null,
++              .suite = {
++                      .hash = {
++                              .vecs = NULL,
++                              .count = 0
++                      }
++              }
++      }, {
+               .alg = "ctr(aes)",
+               .test = alg_test_skcipher,
+               .fips_allowed = 1,
+@@ -1668,6 +1737,21 @@ static const struct alg_test_desc alg_te
+                               }
+                       }
+               }
++      }, {
++              .alg = "ecb(__aes-aesni)",
++              .test = alg_test_null,
++              .suite = {
++                      .cipher = {
++                              .enc = {
++                                      .vecs = NULL,
++                                      .count = 0
++                              },
++                              .dec = {
++                                      .vecs = NULL,
++                                      .count = 0
++                              }
++                      }
++              }
+       }, {
+               .alg = "ecb(aes)",
+               .test = alg_test_skcipher,
diff --git a/queue-2.6.32/dlm-always-use-gfp_nofs.patch b/queue-2.6.32/dlm-always-use-gfp_nofs.patch
new file mode 100644 (file)
index 0000000..e4169bf
--- /dev/null
@@ -0,0 +1,467 @@
+From 573c24c4af6664ffcd9aa7ba617a35fde2b95534 Mon Sep 17 00:00:00 2001
+From: David Teigland <teigland@redhat.com>
+Date: Mon, 30 Nov 2009 16:34:43 -0600
+Subject: dlm: always use GFP_NOFS
+
+From: David Teigland <teigland@redhat.com>
+
+commit 573c24c4af6664ffcd9aa7ba617a35fde2b95534 upstream.
+
+Replace all GFP_KERNEL and ls_allocation with GFP_NOFS.
+ls_allocation would be GFP_KERNEL for userland lockspaces
+and GFP_NOFS for file system lockspaces.
+
+It was discovered that any lockspace on the system can
+affect all the others by triggering memory reclaim in the
+file system, which could in turn call back into the dlm
+to acquire locks, deadlocking dlm threads that are
+shared by all lockspaces, such as dlm_recv.
+
+Signed-off-by: David Teigland <teigland@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ fs/dlm/config.c       |   24 ++++++++++++------------
+ fs/dlm/debug_fs.c     |    2 +-
+ fs/dlm/dir.c          |    7 +++----
+ fs/dlm/dlm_internal.h |    1 -
+ fs/dlm/lock.c         |    6 +++---
+ fs/dlm/lockspace.c    |   15 +++++----------
+ fs/dlm/lowcomms.c     |    6 +++---
+ fs/dlm/member.c       |    8 ++++----
+ fs/dlm/memory.c       |    6 +++---
+ fs/dlm/netlink.c      |    2 +-
+ fs/dlm/plock.c        |    6 +++---
+ fs/dlm/rcom.c         |    2 +-
+ fs/dlm/requestqueue.c |    2 +-
+ fs/dlm/user.c         |   12 ++++++------
+ 14 files changed, 46 insertions(+), 53 deletions(-)
+
+--- a/fs/dlm/config.c
++++ b/fs/dlm/config.c
+@@ -410,10 +410,10 @@ static struct config_group *make_cluster
+       struct dlm_comms *cms = NULL;
+       void *gps = NULL;
+-      cl = kzalloc(sizeof(struct dlm_cluster), GFP_KERNEL);
+-      gps = kcalloc(3, sizeof(struct config_group *), GFP_KERNEL);
+-      sps = kzalloc(sizeof(struct dlm_spaces), GFP_KERNEL);
+-      cms = kzalloc(sizeof(struct dlm_comms), GFP_KERNEL);
++      cl = kzalloc(sizeof(struct dlm_cluster), GFP_NOFS);
++      gps = kcalloc(3, sizeof(struct config_group *), GFP_NOFS);
++      sps = kzalloc(sizeof(struct dlm_spaces), GFP_NOFS);
++      cms = kzalloc(sizeof(struct dlm_comms), GFP_NOFS);
+       if (!cl || !gps || !sps || !cms)
+               goto fail;
+@@ -482,9 +482,9 @@ static struct config_group *make_space(s
+       struct dlm_nodes *nds = NULL;
+       void *gps = NULL;
+-      sp = kzalloc(sizeof(struct dlm_space), GFP_KERNEL);
+-      gps = kcalloc(2, sizeof(struct config_group *), GFP_KERNEL);
+-      nds = kzalloc(sizeof(struct dlm_nodes), GFP_KERNEL);
++      sp = kzalloc(sizeof(struct dlm_space), GFP_NOFS);
++      gps = kcalloc(2, sizeof(struct config_group *), GFP_NOFS);
++      nds = kzalloc(sizeof(struct dlm_nodes), GFP_NOFS);
+       if (!sp || !gps || !nds)
+               goto fail;
+@@ -536,7 +536,7 @@ static struct config_item *make_comm(str
+ {
+       struct dlm_comm *cm;
+-      cm = kzalloc(sizeof(struct dlm_comm), GFP_KERNEL);
++      cm = kzalloc(sizeof(struct dlm_comm), GFP_NOFS);
+       if (!cm)
+               return ERR_PTR(-ENOMEM);
+@@ -569,7 +569,7 @@ static struct config_item *make_node(str
+       struct dlm_space *sp = config_item_to_space(g->cg_item.ci_parent);
+       struct dlm_node *nd;
+-      nd = kzalloc(sizeof(struct dlm_node), GFP_KERNEL);
++      nd = kzalloc(sizeof(struct dlm_node), GFP_NOFS);
+       if (!nd)
+               return ERR_PTR(-ENOMEM);
+@@ -705,7 +705,7 @@ static ssize_t comm_addr_write(struct dl
+       if (cm->addr_count >= DLM_MAX_ADDR_COUNT)
+               return -ENOSPC;
+-      addr = kzalloc(sizeof(*addr), GFP_KERNEL);
++      addr = kzalloc(sizeof(*addr), GFP_NOFS);
+       if (!addr)
+               return -ENOMEM;
+@@ -868,7 +868,7 @@ int dlm_nodeid_list(char *lsname, int **
+       ids_count = sp->members_count;
+-      ids = kcalloc(ids_count, sizeof(int), GFP_KERNEL);
++      ids = kcalloc(ids_count, sizeof(int), GFP_NOFS);
+       if (!ids) {
+               rv = -ENOMEM;
+               goto out;
+@@ -886,7 +886,7 @@ int dlm_nodeid_list(char *lsname, int **
+       if (!new_count)
+               goto out_ids;
+-      new = kcalloc(new_count, sizeof(int), GFP_KERNEL);
++      new = kcalloc(new_count, sizeof(int), GFP_NOFS);
+       if (!new) {
+               kfree(ids);
+               rv = -ENOMEM;
+--- a/fs/dlm/debug_fs.c
++++ b/fs/dlm/debug_fs.c
+@@ -404,7 +404,7 @@ static void *table_seq_start(struct seq_
+       if (bucket >= ls->ls_rsbtbl_size)
+               return NULL;
+-      ri = kzalloc(sizeof(struct rsbtbl_iter), GFP_KERNEL);
++      ri = kzalloc(sizeof(struct rsbtbl_iter), GFP_NOFS);
+       if (!ri)
+               return NULL;
+       if (n == 0)
+--- a/fs/dlm/dir.c
++++ b/fs/dlm/dir.c
+@@ -49,8 +49,7 @@ static struct dlm_direntry *get_free_de(
+       spin_unlock(&ls->ls_recover_list_lock);
+       if (!found)
+-              de = kzalloc(sizeof(struct dlm_direntry) + len,
+-                           ls->ls_allocation);
++              de = kzalloc(sizeof(struct dlm_direntry) + len, GFP_NOFS);
+       return de;
+ }
+@@ -212,7 +211,7 @@ int dlm_recover_directory(struct dlm_ls
+       dlm_dir_clear(ls);
+-      last_name = kmalloc(DLM_RESNAME_MAXLEN, ls->ls_allocation);
++      last_name = kmalloc(DLM_RESNAME_MAXLEN, GFP_NOFS);
+       if (!last_name)
+               goto out;
+@@ -323,7 +322,7 @@ static int get_entry(struct dlm_ls *ls,
+       if (namelen > DLM_RESNAME_MAXLEN)
+               return -EINVAL;
+-      de = kzalloc(sizeof(struct dlm_direntry) + namelen, ls->ls_allocation);
++      de = kzalloc(sizeof(struct dlm_direntry) + namelen, GFP_NOFS);
+       if (!de)
+               return -ENOMEM;
+--- a/fs/dlm/dlm_internal.h
++++ b/fs/dlm/dlm_internal.h
+@@ -473,7 +473,6 @@ struct dlm_ls {
+       int                     ls_low_nodeid;
+       int                     ls_total_weight;
+       int                     *ls_node_array;
+-      gfp_t                   ls_allocation;
+       struct dlm_rsb          ls_stub_rsb;    /* for returning errors */
+       struct dlm_lkb          ls_stub_lkb;    /* for returning errors */
+--- a/fs/dlm/lock.c
++++ b/fs/dlm/lock.c
+@@ -2689,7 +2689,7 @@ static int _create_message(struct dlm_ls
+          pass into lowcomms_commit and a message buffer (mb) that we
+          write our data into */
+-      mh = dlm_lowcomms_get_buffer(to_nodeid, mb_len, ls->ls_allocation, &mb);
++      mh = dlm_lowcomms_get_buffer(to_nodeid, mb_len, GFP_NOFS, &mb);
+       if (!mh)
+               return -ENOBUFS;
+@@ -4512,7 +4512,7 @@ int dlm_user_request(struct dlm_ls *ls,
+       }
+       if (flags & DLM_LKF_VALBLK) {
+-              ua->lksb.sb_lvbptr = kzalloc(DLM_USER_LVB_LEN, GFP_KERNEL);
++              ua->lksb.sb_lvbptr = kzalloc(DLM_USER_LVB_LEN, GFP_NOFS);
+               if (!ua->lksb.sb_lvbptr) {
+                       kfree(ua);
+                       __put_lkb(ls, lkb);
+@@ -4582,7 +4582,7 @@ int dlm_user_convert(struct dlm_ls *ls,
+       ua = lkb->lkb_ua;
+       if (flags & DLM_LKF_VALBLK && !ua->lksb.sb_lvbptr) {
+-              ua->lksb.sb_lvbptr = kzalloc(DLM_USER_LVB_LEN, GFP_KERNEL);
++              ua->lksb.sb_lvbptr = kzalloc(DLM_USER_LVB_LEN, GFP_NOFS);
+               if (!ua->lksb.sb_lvbptr) {
+                       error = -ENOMEM;
+                       goto out_put;
+--- a/fs/dlm/lockspace.c
++++ b/fs/dlm/lockspace.c
+@@ -430,7 +430,7 @@ static int new_lockspace(const char *nam
+       error = -ENOMEM;
+-      ls = kzalloc(sizeof(struct dlm_ls) + namelen, GFP_KERNEL);
++      ls = kzalloc(sizeof(struct dlm_ls) + namelen, GFP_NOFS);
+       if (!ls)
+               goto out;
+       memcpy(ls->ls_name, name, namelen);
+@@ -443,11 +443,6 @@ static int new_lockspace(const char *nam
+       if (flags & DLM_LSFL_TIMEWARN)
+               set_bit(LSFL_TIMEWARN, &ls->ls_flags);
+-      if (flags & DLM_LSFL_FS)
+-              ls->ls_allocation = GFP_NOFS;
+-      else
+-              ls->ls_allocation = GFP_KERNEL;
+-
+       /* ls_exflags are forced to match among nodes, and we don't
+          need to require all nodes to have some flags set */
+       ls->ls_exflags = (flags & ~(DLM_LSFL_TIMEWARN | DLM_LSFL_FS |
+@@ -456,7 +451,7 @@ static int new_lockspace(const char *nam
+       size = dlm_config.ci_rsbtbl_size;
+       ls->ls_rsbtbl_size = size;
+-      ls->ls_rsbtbl = kmalloc(sizeof(struct dlm_rsbtable) * size, GFP_KERNEL);
++      ls->ls_rsbtbl = kmalloc(sizeof(struct dlm_rsbtable) * size, GFP_NOFS);
+       if (!ls->ls_rsbtbl)
+               goto out_lsfree;
+       for (i = 0; i < size; i++) {
+@@ -468,7 +463,7 @@ static int new_lockspace(const char *nam
+       size = dlm_config.ci_lkbtbl_size;
+       ls->ls_lkbtbl_size = size;
+-      ls->ls_lkbtbl = kmalloc(sizeof(struct dlm_lkbtable) * size, GFP_KERNEL);
++      ls->ls_lkbtbl = kmalloc(sizeof(struct dlm_lkbtable) * size, GFP_NOFS);
+       if (!ls->ls_lkbtbl)
+               goto out_rsbfree;
+       for (i = 0; i < size; i++) {
+@@ -480,7 +475,7 @@ static int new_lockspace(const char *nam
+       size = dlm_config.ci_dirtbl_size;
+       ls->ls_dirtbl_size = size;
+-      ls->ls_dirtbl = kmalloc(sizeof(struct dlm_dirtable) * size, GFP_KERNEL);
++      ls->ls_dirtbl = kmalloc(sizeof(struct dlm_dirtable) * size, GFP_NOFS);
+       if (!ls->ls_dirtbl)
+               goto out_lkbfree;
+       for (i = 0; i < size; i++) {
+@@ -527,7 +522,7 @@ static int new_lockspace(const char *nam
+       mutex_init(&ls->ls_requestqueue_mutex);
+       mutex_init(&ls->ls_clear_proc_locks);
+-      ls->ls_recover_buf = kmalloc(dlm_config.ci_buffer_size, GFP_KERNEL);
++      ls->ls_recover_buf = kmalloc(dlm_config.ci_buffer_size, GFP_NOFS);
+       if (!ls->ls_recover_buf)
+               goto out_dirfree;
+--- a/fs/dlm/lowcomms.c
++++ b/fs/dlm/lowcomms.c
+@@ -1060,7 +1060,7 @@ static void init_local(void)
+               if (dlm_our_addr(&sas, i))
+                       break;
+-              addr = kmalloc(sizeof(*addr), GFP_KERNEL);
++              addr = kmalloc(sizeof(*addr), GFP_NOFS);
+               if (!addr)
+                       break;
+               memcpy(addr, &sas, sizeof(*addr));
+@@ -1099,7 +1099,7 @@ static int sctp_listen_for_all(void)
+       struct sockaddr_storage localaddr;
+       struct sctp_event_subscribe subscribe;
+       int result = -EINVAL, num = 1, i, addr_len;
+-      struct connection *con = nodeid2con(0, GFP_KERNEL);
++      struct connection *con = nodeid2con(0, GFP_NOFS);
+       int bufsize = NEEDED_RMEM;
+       if (!con)
+@@ -1171,7 +1171,7 @@ out:
+ static int tcp_listen_for_all(void)
+ {
+       struct socket *sock = NULL;
+-      struct connection *con = nodeid2con(0, GFP_KERNEL);
++      struct connection *con = nodeid2con(0, GFP_NOFS);
+       int result = -EINVAL;
+       if (!con)
+--- a/fs/dlm/member.c
++++ b/fs/dlm/member.c
+@@ -48,7 +48,7 @@ static int dlm_add_member(struct dlm_ls
+       struct dlm_member *memb;
+       int w, error;
+-      memb = kzalloc(sizeof(struct dlm_member), ls->ls_allocation);
++      memb = kzalloc(sizeof(struct dlm_member), GFP_NOFS);
+       if (!memb)
+               return -ENOMEM;
+@@ -143,7 +143,7 @@ static void make_member_array(struct dlm
+       ls->ls_total_weight = total;
+-      array = kmalloc(sizeof(int) * total, ls->ls_allocation);
++      array = kmalloc(sizeof(int) * total, GFP_NOFS);
+       if (!array)
+               return;
+@@ -226,7 +226,7 @@ int dlm_recover_members(struct dlm_ls *l
+                       continue;
+               log_debug(ls, "new nodeid %d is a re-added member", rv->new[i]);
+-              memb = kzalloc(sizeof(struct dlm_member), ls->ls_allocation);
++              memb = kzalloc(sizeof(struct dlm_member), GFP_NOFS);
+               if (!memb)
+                       return -ENOMEM;
+               memb->nodeid = rv->new[i];
+@@ -341,7 +341,7 @@ int dlm_ls_start(struct dlm_ls *ls)
+       int *ids = NULL, *new = NULL;
+       int error, ids_count = 0, new_count = 0;
+-      rv = kzalloc(sizeof(struct dlm_recover), ls->ls_allocation);
++      rv = kzalloc(sizeof(struct dlm_recover), GFP_NOFS);
+       if (!rv)
+               return -ENOMEM;
+--- a/fs/dlm/memory.c
++++ b/fs/dlm/memory.c
+@@ -39,7 +39,7 @@ char *dlm_allocate_lvb(struct dlm_ls *ls
+ {
+       char *p;
+-      p = kzalloc(ls->ls_lvblen, ls->ls_allocation);
++      p = kzalloc(ls->ls_lvblen, GFP_NOFS);
+       return p;
+ }
+@@ -57,7 +57,7 @@ struct dlm_rsb *dlm_allocate_rsb(struct
+       DLM_ASSERT(namelen <= DLM_RESNAME_MAXLEN,);
+-      r = kzalloc(sizeof(*r) + namelen, ls->ls_allocation);
++      r = kzalloc(sizeof(*r) + namelen, GFP_NOFS);
+       return r;
+ }
+@@ -72,7 +72,7 @@ struct dlm_lkb *dlm_allocate_lkb(struct
+ {
+       struct dlm_lkb *lkb;
+-      lkb = kmem_cache_zalloc(lkb_cache, ls->ls_allocation);
++      lkb = kmem_cache_zalloc(lkb_cache, GFP_NOFS);
+       return lkb;
+ }
+--- a/fs/dlm/netlink.c
++++ b/fs/dlm/netlink.c
+@@ -26,7 +26,7 @@ static int prepare_data(u8 cmd, struct s
+       struct sk_buff *skb;
+       void *data;
+-      skb = genlmsg_new(size, GFP_KERNEL);
++      skb = genlmsg_new(size, GFP_NOFS);
+       if (!skb)
+               return -ENOMEM;
+--- a/fs/dlm/plock.c
++++ b/fs/dlm/plock.c
+@@ -82,7 +82,7 @@ int dlm_posix_lock(dlm_lockspace_t *lock
+       if (!ls)
+               return -EINVAL;
+-      xop = kzalloc(sizeof(*xop), GFP_KERNEL);
++      xop = kzalloc(sizeof(*xop), GFP_NOFS);
+       if (!xop) {
+               rv = -ENOMEM;
+               goto out;
+@@ -211,7 +211,7 @@ int dlm_posix_unlock(dlm_lockspace_t *lo
+       if (!ls)
+               return -EINVAL;
+-      op = kzalloc(sizeof(*op), GFP_KERNEL);
++      op = kzalloc(sizeof(*op), GFP_NOFS);
+       if (!op) {
+               rv = -ENOMEM;
+               goto out;
+@@ -266,7 +266,7 @@ int dlm_posix_get(dlm_lockspace_t *locks
+       if (!ls)
+               return -EINVAL;
+-      op = kzalloc(sizeof(*op), GFP_KERNEL);
++      op = kzalloc(sizeof(*op), GFP_NOFS);
+       if (!op) {
+               rv = -ENOMEM;
+               goto out;
+--- a/fs/dlm/rcom.c
++++ b/fs/dlm/rcom.c
+@@ -38,7 +38,7 @@ static int create_rcom(struct dlm_ls *ls
+       char *mb;
+       int mb_len = sizeof(struct dlm_rcom) + len;
+-      mh = dlm_lowcomms_get_buffer(to_nodeid, mb_len, ls->ls_allocation, &mb);
++      mh = dlm_lowcomms_get_buffer(to_nodeid, mb_len, GFP_NOFS, &mb);
+       if (!mh) {
+               log_print("create_rcom to %d type %d len %d ENOBUFS",
+                         to_nodeid, type, len);
+--- a/fs/dlm/requestqueue.c
++++ b/fs/dlm/requestqueue.c
+@@ -35,7 +35,7 @@ void dlm_add_requestqueue(struct dlm_ls
+       struct rq_entry *e;
+       int length = ms->m_header.h_length - sizeof(struct dlm_message);
+-      e = kmalloc(sizeof(struct rq_entry) + length, ls->ls_allocation);
++      e = kmalloc(sizeof(struct rq_entry) + length, GFP_NOFS);
+       if (!e) {
+               log_print("dlm_add_requestqueue: out of memory len %d", length);
+               return;
+--- a/fs/dlm/user.c
++++ b/fs/dlm/user.c
+@@ -267,7 +267,7 @@ static int device_user_lock(struct dlm_u
+               goto out;
+       }
+-      ua = kzalloc(sizeof(struct dlm_user_args), GFP_KERNEL);
++      ua = kzalloc(sizeof(struct dlm_user_args), GFP_NOFS);
+       if (!ua)
+               goto out;
+       ua->proc = proc;
+@@ -307,7 +307,7 @@ static int device_user_unlock(struct dlm
+       if (!ls)
+               return -ENOENT;
+-      ua = kzalloc(sizeof(struct dlm_user_args), GFP_KERNEL);
++      ua = kzalloc(sizeof(struct dlm_user_args), GFP_NOFS);
+       if (!ua)
+               goto out;
+       ua->proc = proc;
+@@ -352,7 +352,7 @@ static int dlm_device_register(struct dl
+       error = -ENOMEM;
+       len = strlen(name) + strlen(name_prefix) + 2;
+-      ls->ls_device.name = kzalloc(len, GFP_KERNEL);
++      ls->ls_device.name = kzalloc(len, GFP_NOFS);
+       if (!ls->ls_device.name)
+               goto fail;
+@@ -520,7 +520,7 @@ static ssize_t device_write(struct file
+ #endif
+               return -EINVAL;
+-      kbuf = kzalloc(count + 1, GFP_KERNEL);
++      kbuf = kzalloc(count + 1, GFP_NOFS);
+       if (!kbuf)
+               return -ENOMEM;
+@@ -546,7 +546,7 @@ static ssize_t device_write(struct file
+               /* add 1 after namelen so that the name string is terminated */
+               kbuf = kzalloc(sizeof(struct dlm_write_request) + namelen + 1,
+-                             GFP_KERNEL);
++                             GFP_NOFS);
+               if (!kbuf) {
+                       kfree(k32buf);
+                       return -ENOMEM;
+@@ -648,7 +648,7 @@ static int device_open(struct inode *ino
+       if (!ls)
+               return -ENOENT;
+-      proc = kzalloc(sizeof(struct dlm_user_proc), GFP_KERNEL);
++      proc = kzalloc(sizeof(struct dlm_user_proc), GFP_NOFS);
+       if (!proc) {
+               dlm_put_lockspace(ls);
+               return -ENOMEM;
diff --git a/queue-2.6.32/dlm-fix-ordering-of-bast-and-cast.patch b/queue-2.6.32/dlm-fix-ordering-of-bast-and-cast.patch
new file mode 100644 (file)
index 0000000..4988b2b
--- /dev/null
@@ -0,0 +1,272 @@
+From 7fe2b3190b8b299409f13cf3a6f85c2bd371f8bb Mon Sep 17 00:00:00 2001
+From: David Teigland <teigland@redhat.com>
+Date: Wed, 24 Feb 2010 11:08:18 -0600
+Subject: dlm: fix ordering of bast and cast
+
+From: David Teigland <teigland@redhat.com>
+
+commit 7fe2b3190b8b299409f13cf3a6f85c2bd371f8bb upstream.
+
+When both blocking and completion callbacks are queued for a lock,
+the dlm would always deliver the completion callback (cast) first.
+In some cases the blocking callback (bast) is queued before the
+cast, though, and should be delivered first.  This patch keeps
+track of the order in which they were queued and delivers them
+in that order.
+
+This patch also keeps track of the granted mode in the last cast
+and eliminates the following bast if the bast mode is compatible
+with the preceding cast mode.  This happens when a remotely mastered
+lock is demoted, e.g. EX->NL, in which case the local node queues
+a cast immediately after sending the demote message.  In this way
+a cast can be queued for a mode, e.g. NL, that makes an in-transit
+bast extraneous.
+
+Signed-off-by: David Teigland <teigland@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ fs/dlm/ast.c          |   72 +++++++++++++++++++++++++++++++++++++++-----------
+ fs/dlm/ast.h          |    4 +-
+ fs/dlm/dlm_internal.h |   10 +++++-
+ fs/dlm/lock.c         |    4 +-
+ fs/dlm/user.c         |   10 ++++--
+ fs/dlm/user.h         |    4 +-
+ 6 files changed, 77 insertions(+), 27 deletions(-)
+
+--- a/fs/dlm/ast.c
++++ b/fs/dlm/ast.c
+@@ -2,7 +2,7 @@
+ *******************************************************************************
+ **
+ **  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+-**  Copyright (C) 2004-2008 Red Hat, Inc.  All rights reserved.
++**  Copyright (C) 2004-2010 Red Hat, Inc.  All rights reserved.
+ **
+ **  This copyrighted material is made available to anyone wishing to use,
+ **  modify, copy, or redistribute it subject to the terms and conditions
+@@ -33,10 +33,10 @@ void dlm_del_ast(struct dlm_lkb *lkb)
+       spin_unlock(&ast_queue_lock);
+ }
+-void dlm_add_ast(struct dlm_lkb *lkb, int type, int bastmode)
++void dlm_add_ast(struct dlm_lkb *lkb, int type, int mode)
+ {
+       if (lkb->lkb_flags & DLM_IFL_USER) {
+-              dlm_user_add_ast(lkb, type, bastmode);
++              dlm_user_add_ast(lkb, type, mode);
+               return;
+       }
+@@ -44,10 +44,21 @@ void dlm_add_ast(struct dlm_lkb *lkb, in
+       if (!(lkb->lkb_ast_type & (AST_COMP | AST_BAST))) {
+               kref_get(&lkb->lkb_ref);
+               list_add_tail(&lkb->lkb_astqueue, &ast_queue);
++              lkb->lkb_ast_first = type;
+       }
++
++      /* sanity check, this should not happen */
++
++      if ((type == AST_COMP) && (lkb->lkb_ast_type & AST_COMP))
++              log_print("repeat cast %d castmode %d lock %x %s",
++                        mode, lkb->lkb_castmode,
++                        lkb->lkb_id, lkb->lkb_resource->res_name);
++
+       lkb->lkb_ast_type |= type;
+-      if (bastmode)
+-              lkb->lkb_bastmode = bastmode;
++      if (type == AST_BAST)
++              lkb->lkb_bastmode = mode;
++      else
++              lkb->lkb_castmode = mode;
+       spin_unlock(&ast_queue_lock);
+       set_bit(WAKE_ASTS, &astd_wakeflags);
+@@ -59,9 +70,9 @@ static void process_asts(void)
+       struct dlm_ls *ls = NULL;
+       struct dlm_rsb *r = NULL;
+       struct dlm_lkb *lkb;
+-      void (*cast) (void *astparam);
+-      void (*bast) (void *astparam, int mode);
+-      int type = 0, bastmode;
++      void (*castfn) (void *astparam);
++      void (*bastfn) (void *astparam, int mode);
++      int type, first, bastmode, castmode, do_bast, do_cast, last_castmode;
+ repeat:
+       spin_lock(&ast_queue_lock);
+@@ -75,17 +86,48 @@ repeat:
+               list_del(&lkb->lkb_astqueue);
+               type = lkb->lkb_ast_type;
+               lkb->lkb_ast_type = 0;
++              first = lkb->lkb_ast_first;
++              lkb->lkb_ast_first = 0;
+               bastmode = lkb->lkb_bastmode;
+-
++              castmode = lkb->lkb_castmode;
++              castfn = lkb->lkb_astfn;
++              bastfn = lkb->lkb_bastfn;
+               spin_unlock(&ast_queue_lock);
+-              cast = lkb->lkb_astfn;
+-              bast = lkb->lkb_bastfn;
+-              if ((type & AST_COMP) && cast)
+-                      cast(lkb->lkb_astparam);
++              do_cast = (type & AST_COMP) && castfn;
++              do_bast = (type & AST_BAST) && bastfn;
++
++              /* Skip a bast if its blocking mode is compatible with the
++                 granted mode of the preceding cast. */
+-              if ((type & AST_BAST) && bast)
+-                      bast(lkb->lkb_astparam, bastmode);
++              if (do_bast) {
++                      if (first == AST_COMP)
++                              last_castmode = castmode;
++                      else
++                              last_castmode = lkb->lkb_castmode_done;
++                      if (dlm_modes_compat(bastmode, last_castmode))
++                              do_bast = 0;
++              }
++
++              if (first == AST_COMP) {
++                      if (do_cast)
++                              castfn(lkb->lkb_astparam);
++                      if (do_bast)
++                              bastfn(lkb->lkb_astparam, bastmode);
++              } else if (first == AST_BAST) {
++                      if (do_bast)
++                              bastfn(lkb->lkb_astparam, bastmode);
++                      if (do_cast)
++                              castfn(lkb->lkb_astparam);
++              } else {
++                      log_error(ls, "bad ast_first %d ast_type %d",
++                                first, type);
++              }
++
++              if (do_cast)
++                      lkb->lkb_castmode_done = castmode;
++              if (do_bast)
++                      lkb->lkb_bastmode_done = bastmode;
+               /* this removes the reference added by dlm_add_ast
+                  and may result in the lkb being freed */
+--- a/fs/dlm/ast.h
++++ b/fs/dlm/ast.h
+@@ -1,7 +1,7 @@
+ /******************************************************************************
+ *******************************************************************************
+ **
+-**  Copyright (C) 2005-2008 Red Hat, Inc.  All rights reserved.
++**  Copyright (C) 2005-2010 Red Hat, Inc.  All rights reserved.
+ **
+ **  This copyrighted material is made available to anyone wishing to use,
+ **  modify, copy, or redistribute it subject to the terms and conditions
+@@ -13,7 +13,7 @@
+ #ifndef __ASTD_DOT_H__
+ #define __ASTD_DOT_H__
+-void dlm_add_ast(struct dlm_lkb *lkb, int type, int bastmode);
++void dlm_add_ast(struct dlm_lkb *lkb, int type, int mode);
+ void dlm_del_ast(struct dlm_lkb *lkb);
+ void dlm_astd_wake(void);
+--- a/fs/dlm/dlm_internal.h
++++ b/fs/dlm/dlm_internal.h
+@@ -2,7 +2,7 @@
+ *******************************************************************************
+ **
+ **  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
+-**  Copyright (C) 2004-2008 Red Hat, Inc.  All rights reserved.
++**  Copyright (C) 2004-2010 Red Hat, Inc.  All rights reserved.
+ **
+ **  This copyrighted material is made available to anyone wishing to use,
+ **  modify, copy, or redistribute it subject to the terms and conditions
+@@ -232,11 +232,17 @@ struct dlm_lkb {
+       int8_t                  lkb_status;     /* granted, waiting, convert */
+       int8_t                  lkb_rqmode;     /* requested lock mode */
+       int8_t                  lkb_grmode;     /* granted lock mode */
+-      int8_t                  lkb_bastmode;   /* requested mode */
+       int8_t                  lkb_highbast;   /* highest mode bast sent for */
++
+       int8_t                  lkb_wait_type;  /* type of reply waiting for */
+       int8_t                  lkb_wait_count;
+       int8_t                  lkb_ast_type;   /* type of ast queued for */
++      int8_t                  lkb_ast_first;  /* type of first ast queued */
++
++      int8_t                  lkb_bastmode;   /* req mode of queued bast */
++      int8_t                  lkb_castmode;   /* gr mode of queued cast */
++      int8_t                  lkb_bastmode_done; /* last delivered bastmode */
++      int8_t                  lkb_castmode_done; /* last delivered castmode */
+       struct list_head        lkb_idtbl_list; /* lockspace lkbtbl */
+       struct list_head        lkb_statequeue; /* rsb g/c/w list */
+--- a/fs/dlm/lock.c
++++ b/fs/dlm/lock.c
+@@ -1,7 +1,7 @@
+ /******************************************************************************
+ *******************************************************************************
+ **
+-**  Copyright (C) 2005-2008 Red Hat, Inc.  All rights reserved.
++**  Copyright (C) 2005-2010 Red Hat, Inc.  All rights reserved.
+ **
+ **  This copyrighted material is made available to anyone wishing to use,
+ **  modify, copy, or redistribute it subject to the terms and conditions
+@@ -307,7 +307,7 @@ static void queue_cast(struct dlm_rsb *r
+       lkb->lkb_lksb->sb_status = rv;
+       lkb->lkb_lksb->sb_flags = lkb->lkb_sbflags;
+-      dlm_add_ast(lkb, AST_COMP, 0);
++      dlm_add_ast(lkb, AST_COMP, lkb->lkb_grmode);
+ }
+ static inline void queue_cast_overlap(struct dlm_rsb *r, struct dlm_lkb *lkb)
+--- a/fs/dlm/user.c
++++ b/fs/dlm/user.c
+@@ -1,5 +1,5 @@
+ /*
+- * Copyright (C) 2006-2009 Red Hat, Inc.  All rights reserved.
++ * Copyright (C) 2006-2010 Red Hat, Inc.  All rights reserved.
+  *
+  * This copyrighted material is made available to anyone wishing to use,
+  * modify, copy, or redistribute it subject to the terms and conditions
+@@ -173,7 +173,7 @@ static int lkb_is_endoflife(struct dlm_l
+ /* we could possibly check if the cancel of an orphan has resulted in the lkb
+    being removed and then remove that lkb from the orphans list and free it */
+-void dlm_user_add_ast(struct dlm_lkb *lkb, int type, int bastmode)
++void dlm_user_add_ast(struct dlm_lkb *lkb, int type, int mode)
+ {
+       struct dlm_ls *ls;
+       struct dlm_user_args *ua;
+@@ -206,8 +206,10 @@ void dlm_user_add_ast(struct dlm_lkb *lk
+       ast_type = lkb->lkb_ast_type;
+       lkb->lkb_ast_type |= type;
+-      if (bastmode)
+-              lkb->lkb_bastmode = bastmode;
++      if (type == AST_BAST)
++              lkb->lkb_bastmode = mode;
++      else
++              lkb->lkb_castmode = mode;
+       if (!ast_type) {
+               kref_get(&lkb->lkb_ref);
+--- a/fs/dlm/user.h
++++ b/fs/dlm/user.h
+@@ -1,5 +1,5 @@
+ /*
+- * Copyright (C) 2006-2008 Red Hat, Inc.  All rights reserved.
++ * Copyright (C) 2006-2010 Red Hat, Inc.  All rights reserved.
+  *
+  * This copyrighted material is made available to anyone wishing to use,
+  * modify, copy, or redistribute it subject to the terms and conditions
+@@ -9,7 +9,7 @@
+ #ifndef __USER_DOT_H__
+ #define __USER_DOT_H__
+-void dlm_user_add_ast(struct dlm_lkb *lkb, int type, int bastmode);
++void dlm_user_add_ast(struct dlm_lkb *lkb, int type, int mode);
+ int dlm_user_init(void);
+ void dlm_user_exit(void);
+ int dlm_device_deregister(struct dlm_ls *ls);
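Walking through the EX->NL case from the commit message: the demote queues a
cast carrying granted mode NL, and since NL is compatible with every mode,
any bast still in transit for that lock carries no information and can be
dropped.  A sketch of the test (bast_is_redundant is a hypothetical wrapper;
the real check is the dlm_modes_compat() call in process_asts() above):

    /* A queued bast is redundant when its blocking mode is already
     * compatible with the mode granted by the preceding cast. */
    static int bast_is_redundant(int bastmode, int last_castmode)
    {
            return dlm_modes_compat(bastmode, last_castmode);
    }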
diff --git a/queue-2.6.32/dlm-send-reply-before-bast.patch b/queue-2.6.32/dlm-send-reply-before-bast.patch
new file mode 100644 (file)
index 0000000..8cefd14
--- /dev/null
@@ -0,0 +1,285 @@
+From cf6620acc0f6fac57968aafef79ab372bdcf6157 Mon Sep 17 00:00:00 2001
+From: David Teigland <teigland@redhat.com>
+Date: Wed, 24 Feb 2010 11:59:23 -0600
+Subject: dlm: send reply before bast
+
+From: David Teigland <teigland@redhat.com>
+
+commit cf6620acc0f6fac57968aafef79ab372bdcf6157 upstream.
+
+When the lock master processes a successful operation (request,
+convert, cancel, or unlock), it will process the effects of the
+change before sending the reply for the operation.  The "effects"
+of the operation are:
+
+- blocking callbacks (basts) for any newly granted locks
+- waiting or converting locks that can now be granted
+
+The cast is queued on the local node when the reply from the lock
+master is received.  This means that a lock holder can receive a
+bast for a lock mode that it doesn't yet know has been granted.
+
+Signed-off-by: David Teigland <teigland@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ fs/dlm/lock.c |  110 ++++++++++++++++++++++++++++++++++++++++++++--------------
+ 1 file changed, 84 insertions(+), 26 deletions(-)
+
+--- a/fs/dlm/lock.c
++++ b/fs/dlm/lock.c
+@@ -2280,20 +2280,30 @@ static int do_request(struct dlm_rsb *r,
+       if (can_be_queued(lkb)) {
+               error = -EINPROGRESS;
+               add_lkb(r, lkb, DLM_LKSTS_WAITING);
+-              send_blocking_asts(r, lkb);
+               add_timeout(lkb);
+               goto out;
+       }
+       error = -EAGAIN;
+-      if (force_blocking_asts(lkb))
+-              send_blocking_asts_all(r, lkb);
+       queue_cast(r, lkb, -EAGAIN);
+-
+  out:
+       return error;
+ }
++static void do_request_effects(struct dlm_rsb *r, struct dlm_lkb *lkb,
++                             int error)
++{
++      switch (error) {
++      case -EAGAIN:
++              if (force_blocking_asts(lkb))
++                      send_blocking_asts_all(r, lkb);
++              break;
++      case -EINPROGRESS:
++              send_blocking_asts(r, lkb);
++              break;
++      }
++}
++
+ static int do_convert(struct dlm_rsb *r, struct dlm_lkb *lkb)
+ {
+       int error = 0;
+@@ -2304,7 +2314,6 @@ static int do_convert(struct dlm_rsb *r,
+       if (can_be_granted(r, lkb, 1, &deadlk)) {
+               grant_lock(r, lkb);
+               queue_cast(r, lkb, 0);
+-              grant_pending_locks(r);
+               goto out;
+       }
+@@ -2334,7 +2343,6 @@ static int do_convert(struct dlm_rsb *r,
+               if (_can_be_granted(r, lkb, 1)) {
+                       grant_lock(r, lkb);
+                       queue_cast(r, lkb, 0);
+-                      grant_pending_locks(r);
+                       goto out;
+               }
+               /* else fall through and move to convert queue */
+@@ -2344,28 +2352,47 @@ static int do_convert(struct dlm_rsb *r,
+               error = -EINPROGRESS;
+               del_lkb(r, lkb);
+               add_lkb(r, lkb, DLM_LKSTS_CONVERT);
+-              send_blocking_asts(r, lkb);
+               add_timeout(lkb);
+               goto out;
+       }
+       error = -EAGAIN;
+-      if (force_blocking_asts(lkb))
+-              send_blocking_asts_all(r, lkb);
+       queue_cast(r, lkb, -EAGAIN);
+-
+  out:
+       return error;
+ }
++static void do_convert_effects(struct dlm_rsb *r, struct dlm_lkb *lkb,
++                             int error)
++{
++      switch (error) {
++      case 0:
++              grant_pending_locks(r);
++              /* grant_pending_locks also sends basts */
++              break;
++      case -EAGAIN:
++              if (force_blocking_asts(lkb))
++                      send_blocking_asts_all(r, lkb);
++              break;
++      case -EINPROGRESS:
++              send_blocking_asts(r, lkb);
++              break;
++      }
++}
++
+ static int do_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb)
+ {
+       remove_lock(r, lkb);
+       queue_cast(r, lkb, -DLM_EUNLOCK);
+-      grant_pending_locks(r);
+       return -DLM_EUNLOCK;
+ }
++static void do_unlock_effects(struct dlm_rsb *r, struct dlm_lkb *lkb,
++                            int error)
++{
++      grant_pending_locks(r);
++}
++
+ /* returns: 0 did nothing, -DLM_ECANCEL canceled lock */
+  
+ static int do_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb)
+@@ -2375,12 +2402,18 @@ static int do_cancel(struct dlm_rsb *r,
+       error = revert_lock(r, lkb);
+       if (error) {
+               queue_cast(r, lkb, -DLM_ECANCEL);
+-              grant_pending_locks(r);
+               return -DLM_ECANCEL;
+       }
+       return 0;
+ }
++static void do_cancel_effects(struct dlm_rsb *r, struct dlm_lkb *lkb,
++                            int error)
++{
++      if (error)
++              grant_pending_locks(r);
++}
++
+ /*
+  * Four stage 3 varieties:
+  * _request_lock(), _convert_lock(), _unlock_lock(), _cancel_lock()
+@@ -2402,11 +2435,15 @@ static int _request_lock(struct dlm_rsb
+               goto out;
+       }
+-      if (is_remote(r))
++      if (is_remote(r)) {
+               /* receive_request() calls do_request() on remote node */
+               error = send_request(r, lkb);
+-      else
++      } else {
+               error = do_request(r, lkb);
++              /* for remote locks the request_reply is sent
++                 between do_request and do_request_effects */
++              do_request_effects(r, lkb, error);
++      }
+  out:
+       return error;
+ }
+@@ -2417,11 +2454,15 @@ static int _convert_lock(struct dlm_rsb
+ {
+       int error;
+-      if (is_remote(r))
++      if (is_remote(r)) {
+               /* receive_convert() calls do_convert() on remote node */
+               error = send_convert(r, lkb);
+-      else
++      } else {
+               error = do_convert(r, lkb);
++              /* for remote locks the convert_reply is sent
++                 between do_convert and do_convert_effects */
++              do_convert_effects(r, lkb, error);
++      }
+       return error;
+ }
+@@ -2432,11 +2473,15 @@ static int _unlock_lock(struct dlm_rsb *
+ {
+       int error;
+-      if (is_remote(r))
++      if (is_remote(r)) {
+               /* receive_unlock() calls do_unlock() on remote node */
+               error = send_unlock(r, lkb);
+-      else
++      } else {
+               error = do_unlock(r, lkb);
++              /* for remote locks the unlock_reply is sent
++                 between do_unlock and do_unlock_effects */
++              do_unlock_effects(r, lkb, error);
++      }
+       return error;
+ }
+@@ -2447,11 +2492,15 @@ static int _cancel_lock(struct dlm_rsb *
+ {
+       int error;
+-      if (is_remote(r))
++      if (is_remote(r)) {
+               /* receive_cancel() calls do_cancel() on remote node */
+               error = send_cancel(r, lkb);
+-      else
++      } else {
+               error = do_cancel(r, lkb);
++              /* for remote locks the cancel_reply is sent
++                 between do_cancel and do_cancel_effects */
++              do_cancel_effects(r, lkb, error);
++      }
+       return error;
+ }
+@@ -3191,6 +3240,7 @@ static void receive_request(struct dlm_l
+       attach_lkb(r, lkb);
+       error = do_request(r, lkb);
+       send_request_reply(r, lkb, error);
++      do_request_effects(r, lkb, error);
+       unlock_rsb(r);
+       put_rsb(r);
+@@ -3226,15 +3276,19 @@ static void receive_convert(struct dlm_l
+               goto out;
+       receive_flags(lkb, ms);
++
+       error = receive_convert_args(ls, lkb, ms);
+-      if (error)
+-              goto out_reply;
++      if (error) {
++              send_convert_reply(r, lkb, error);
++              goto out;
++      }
++
+       reply = !down_conversion(lkb);
+       error = do_convert(r, lkb);
+- out_reply:
+       if (reply)
+               send_convert_reply(r, lkb, error);
++      do_convert_effects(r, lkb, error);
+  out:
+       unlock_rsb(r);
+       put_rsb(r);
+@@ -3266,13 +3320,16 @@ static void receive_unlock(struct dlm_ls
+               goto out;
+       receive_flags(lkb, ms);
++
+       error = receive_unlock_args(ls, lkb, ms);
+-      if (error)
+-              goto out_reply;
++      if (error) {
++              send_unlock_reply(r, lkb, error);
++              goto out;
++      }
+       error = do_unlock(r, lkb);
+- out_reply:
+       send_unlock_reply(r, lkb, error);
++      do_unlock_effects(r, lkb, error);
+  out:
+       unlock_rsb(r);
+       put_rsb(r);
+@@ -3307,6 +3364,7 @@ static void receive_cancel(struct dlm_ls
+       error = do_cancel(r, lkb);
+       send_cancel_reply(r, lkb, error);
++      do_cancel_effects(r, lkb, error);
+  out:
+       unlock_rsb(r);
+       put_rsb(r);
diff --git a/queue-2.6.32/ext4-fix-optional-arg-mount-options.patch b/queue-2.6.32/ext4-fix-optional-arg-mount-options.patch
new file mode 100644 (file)
index 0000000..31e76ff
--- /dev/null
@@ -0,0 +1,77 @@
+From 15121c18a22ae483279f76dc9e554334b800d0f7 Mon Sep 17 00:00:00 2001
+From: Eric Sandeen <sandeen@redhat.com>
+Date: Mon, 15 Feb 2010 20:17:55 -0500
+Subject: ext4: Fix optional-arg mount options
+
+From: Eric Sandeen <sandeen@redhat.com>
+
+commit 15121c18a22ae483279f76dc9e554334b800d0f7 upstream.
+
+We have 2 mount options, "barrier" and "auto_da_alloc", which may or
+may not take a 1/0 argument.  When no argument is given, the ext4
+superblock mount code subtracts uninitialized pointers and passes the
+result to kmalloc, which results in very noisy failures.
+
+Per Ted's suggestion, initialize the args struct so that
+we know whether match_token() found an argument for the
+option, and skip match_int() if not.
+
+Also, return error (0) from parse_options if we thought
+we found an argument, but match_int() fails.
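+
+As a reference for the pattern, a minimal userspace sketch of the
+fixed logic (simplified stand-ins for the kernel's substring_t and
+match_token(); all names below are illustrative, not the ext4 code):
+
+#include <stdio.h>
+#include <string.h>
+
+struct substr { const char *from, *to; };  /* stand-in for substring_t */
+
+/* mimics match_token(): fills *arg only when "name=<value>" has a value */
+static int match_opt(const char *p, const char *name, struct substr *arg)
+{
+        size_t n = strlen(name);
+
+        if (strncmp(p, name, n) != 0 || (p[n] != '\0' && p[n] != '='))
+                return 0;
+        if (p[n] == '=') {
+                arg->from = p + n + 1;
+                arg->to = p + strlen(p);
+        }
+        return 1;
+}
+
+static int parse_barrier(const char *p, int *barrier)
+{
+        struct substr arg;
+        int option;
+
+        /* the fix: clear the arg before matching, so "no argument" is
+         * distinguishable from stale pointers left by a prior token */
+        arg.from = arg.to = NULL;
+        if (!match_opt(p, "barrier", &arg))
+                return -1;
+        if (arg.from) {
+                if (sscanf(arg.from, "%d", &option) != 1)
+                        return -1;      /* had an argument, but not an int */
+        } else {
+                option = 1;             /* no argument, default to 1 */
+        }
+        *barrier = !!option;
+        return 0;
+}
+
+int main(void)
+{
+        const char *tests[] = { "barrier", "barrier=0", "barrier=1" };
+        int i, b;
+
+        for (i = 0; i < 3; i++)
+                if (parse_barrier(tests[i], &b) == 0)
+                        printf("%-10s -> barrier=%d\n", tests[i], b);
+        return 0;
+}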
+
+Reported-by: Michael S. Tsirkin <mst@redhat.com>
+Signed-off-by: Eric Sandeen <sandeen@redhat.com>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Acked-by: Jeff Mahoney <jeffm@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ fs/ext4/super.c |   23 +++++++++++++++--------
+ 1 file changed, 15 insertions(+), 8 deletions(-)
+
+--- a/fs/ext4/super.c
++++ b/fs/ext4/super.c
+@@ -1218,6 +1218,11 @@ static int parse_options(char *options,
+               if (!*p)
+                       continue;
++              /*
++               * Initialize args struct so we know whether arg was
++               * found; some options take optional arguments.
++               */
++              args[0].to = args[0].from = 0;
+               token = match_token(p, tokens, args);
+               switch (token) {
+               case Opt_bsd_df:
+@@ -1503,10 +1508,11 @@ set_qf_format:
+                       clear_opt(sbi->s_mount_opt, BARRIER);
+                       break;
+               case Opt_barrier:
+-                      if (match_int(&args[0], &option)) {
+-                              set_opt(sbi->s_mount_opt, BARRIER);
+-                              break;
+-                      }
++                      if (args[0].from) {
++                              if (match_int(&args[0], &option))
++                                      return 0;
++                      } else
++                              option = 1;     /* No argument, default to 1 */
+                       if (option)
+                               set_opt(sbi->s_mount_opt, BARRIER);
+                       else
+@@ -1579,10 +1585,11 @@ set_qf_format:
+                       set_opt(sbi->s_mount_opt,NO_AUTO_DA_ALLOC);
+                       break;
+               case Opt_auto_da_alloc:
+-                      if (match_int(&args[0], &option)) {
+-                              clear_opt(sbi->s_mount_opt, NO_AUTO_DA_ALLOC);
+-                              break;
+-                      }
++                      if (args[0].from) {
++                              if (match_int(&args[0], &option))
++                                      return 0;
++                      } else
++                              option = 1;     /* No argument, default to 1 */
+                       if (option)
+                               clear_opt(sbi->s_mount_opt, NO_AUTO_DA_ALLOC);
+                       else
diff --git a/queue-2.6.32/ext4-make-sure-the-move_ext-ioctl-can-t-overwrite-append-only-files.patch b/queue-2.6.32/ext4-make-sure-the-move_ext-ioctl-can-t-overwrite-append-only-files.patch
new file mode 100644 (file)
index 0000000..394ea66
--- /dev/null
@@ -0,0 +1,34 @@
+From 1f5a81e41f8b1a782c68d3843e9ec1bfaadf7d72 Mon Sep 17 00:00:00 2001
+From: Theodore Ts'o <tytso@mit.edu>
+Date: Wed, 2 Jun 2010 22:04:39 -0400
+Subject: ext4: Make sure the MOVE_EXT ioctl can't overwrite append-only files
+
+From: Theodore Ts'o <tytso@mit.edu>
+
+commit 1f5a81e41f8b1a782c68d3843e9ec1bfaadf7d72 upstream.
+
+Dan Rosenberg has reported a problem with the MOVE_EXT ioctl.  If the
+donor file is an append-only file, we should not allow the operation
+to proceed, lest we end up overwriting the contents of an append-only
+file.
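+
+For illustration only, the equivalent condition can be probed from
+userspace through the inode attribute flags; a small standalone sketch
+of the check the kernel now applies to the donor inode:
+
+#include <fcntl.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/ioctl.h>
+#include <linux/fs.h>
+
+/* mirrors the IS_IMMUTABLE() || IS_APPEND() test the patch adds:
+ * 1 = usable as donor, 0 = refused, -1 = flags not readable */
+static int donor_ok(int fd)
+{
+        long flags = 0;
+
+        if (ioctl(fd, FS_IOC_GETFLAGS, &flags) != 0)
+                return -1;      /* fs may not support attribute flags */
+        if (flags & (FS_IMMUTABLE_FL | FS_APPEND_FL))
+                return 0;       /* the kernel check returns -EPERM here */
+        return 1;
+}
+
+int main(int argc, char **argv)
+{
+        int fd, ok;
+
+        if (argc < 2)
+                return 1;
+        fd = open(argv[1], O_RDONLY);
+        if (fd < 0)
+                return 1;
+        ok = donor_ok(fd);
+        printf("%s: donor_ok=%d\n", argv[1], ok);
+        close(fd);
+        return 0;
+}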
+
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Cc: Dan Rosenberg <dan.j.rosenberg@gmail.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ fs/ext4/move_extent.c |    3 +++
+ 1 file changed, 3 insertions(+)
+
+--- a/fs/ext4/move_extent.c
++++ b/fs/ext4/move_extent.c
+@@ -958,6 +958,9 @@ mext_check_arguments(struct inode *orig_
+               return -EINVAL;
+       }
++      if (IS_IMMUTABLE(donor_inode) || IS_APPEND(donor_inode))
++              return -EPERM;
++
+       /* Ext4 move extent does not support swapfile */
+       if (IS_SWAPFILE(orig_inode) || IS_SWAPFILE(donor_inode)) {
+               ext4_debug("ext4 move extent: The argument files should "
diff --git a/queue-2.6.32/fix-sba-iommu-to-handle-allocation-failure-properly.patch b/queue-2.6.32/fix-sba-iommu-to-handle-allocation-failure-properly.patch
new file mode 100644 (file)
index 0000000..c235c0a
--- /dev/null
@@ -0,0 +1,110 @@
+From e2a465675dc089e9a56ba2fa2a5fbd9bd8844d18 Mon Sep 17 00:00:00 2001
+From: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
+Date: Tue, 17 Nov 2009 14:44:35 -0800
+Subject: [IA64] fix SBA IOMMU to handle allocation failure properly
+
+From: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
+
+commit e2a465675dc089e9a56ba2fa2a5fbd9bd8844d18 upstream.
+
+It's possible that the SBA IOMMU might fail to find I/O space under heavy
+I/O load.  The SBA IOMMU panics on allocation failure but it shouldn't;
+drivers can handle the failure.  The majority of other IOMMU drivers don't
+panic on allocation failure.
+
+This patch fixes the SBA IOMMU path to handle allocation failure properly.
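+
+The shape of the fix is plain error propagation; a toy standalone
+sketch (not the sba_iommu code) of an allocator that reports
+exhaustion to its caller instead of panicking:
+
+#include <stdio.h>
+
+/* toy bitmap allocator: returns a free slot, or -1 when exhausted,
+ * which is the behaviour the patch gives sba_alloc_range() in place
+ * of panic() */
+static int alloc_range(unsigned char *map, int nbits)
+{
+        int i;
+
+        for (i = 0; i < nbits; i++)
+                if (!map[i]) {
+                        map[i] = 1;
+                        return i;
+                }
+        return -1;              /* caller must cope, not crash */
+}
+
+int main(void)
+{
+        unsigned char map[4] = { 1, 1, 1, 1 };  /* already exhausted */
+        int idx = alloc_range(map, 4);
+
+        if (idx < 0) {
+                fprintf(stderr, "out of mapping resources, failing map\n");
+                return 1;       /* like sba_map_page() now returning 0 */
+        }
+        return 0;
+}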
+
+Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
+Cc: Fenghua Yu <fenghua.yu@intel.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Tony Luck <tony.luck@intel.com>
+Acked-by: Leonardo Chiquitto <lchiquitto@novell.com>
+Acked-by: Jeff Mahoney <jeffm@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ arch/ia64/hp/common/sba_iommu.c |   38 +++++++++++++++++++++++++++++---------
+ 1 file changed, 29 insertions(+), 9 deletions(-)
+
+--- a/arch/ia64/hp/common/sba_iommu.c
++++ b/arch/ia64/hp/common/sba_iommu.c
+@@ -677,12 +677,19 @@ sba_alloc_range(struct ioc *ioc, struct
+                       spin_unlock_irqrestore(&ioc->saved_lock, flags);
+                       pide = sba_search_bitmap(ioc, dev, pages_needed, 0);
+-                      if (unlikely(pide >= (ioc->res_size << 3)))
+-                              panic(__FILE__ ": I/O MMU @ %p is out of mapping resources\n",
+-                                    ioc->ioc_hpa);
++                      if (unlikely(pide >= (ioc->res_size << 3))) {
++                              printk(KERN_WARNING "%s: I/O MMU @ %p is"
++                                     "out of mapping resources, %u %u %lx\n",
++                                     __func__, ioc->ioc_hpa, ioc->res_size,
++                                     pages_needed, dma_get_seg_boundary(dev));
++                              return -1;
++                      }
+ #else
+-                      panic(__FILE__ ": I/O MMU @ %p is out of mapping resources\n",
+-                            ioc->ioc_hpa);
++                      printk(KERN_WARNING "%s: I/O MMU @ %p is"
++                             "out of mapping resources, %u %u %lx\n",
++                             __func__, ioc->ioc_hpa, ioc->res_size,
++                             pages_needed, dma_get_seg_boundary(dev));
++                      return -1;
+ #endif
+               }
+       }
+@@ -965,6 +972,8 @@ static dma_addr_t sba_map_page(struct de
+ #endif
+       pide = sba_alloc_range(ioc, dev, size);
++      if (pide < 0)
++              return 0;
+       iovp = (dma_addr_t) pide << iovp_shift;
+@@ -1320,6 +1329,7 @@ sba_coalesce_chunks(struct ioc *ioc, str
+       unsigned long dma_offset, dma_len; /* start/len of DMA stream */
+       int n_mappings = 0;
+       unsigned int max_seg_size = dma_get_max_seg_size(dev);
++      int idx;
+       while (nents > 0) {
+               unsigned long vaddr = (unsigned long) sba_sg_address(startsg);
+@@ -1418,16 +1428,22 @@ sba_coalesce_chunks(struct ioc *ioc, str
+               vcontig_sg->dma_length = vcontig_len;
+               dma_len = (dma_len + dma_offset + ~iovp_mask) & iovp_mask;
+               ASSERT(dma_len <= DMA_CHUNK_SIZE);
+-              dma_sg->dma_address = (dma_addr_t) (PIDE_FLAG
+-                      | (sba_alloc_range(ioc, dev, dma_len) << iovp_shift)
+-                      | dma_offset);
++              idx = sba_alloc_range(ioc, dev, dma_len);
++              if (idx < 0) {
++                      dma_sg->dma_length = 0;
++                      return -1;
++              }
++              dma_sg->dma_address = (dma_addr_t)(PIDE_FLAG | (idx << iovp_shift)
++                                                 | dma_offset);
+               n_mappings++;
+       }
+       return n_mappings;
+ }
+-
++static void sba_unmap_sg_attrs(struct device *dev, struct scatterlist *sglist,
++                             int nents, enum dma_data_direction dir,
++                             struct dma_attrs *attrs);
+ /**
+  * sba_map_sg - map Scatter/Gather list
+  * @dev: instance of PCI owned by the driver that's asking.
+@@ -1493,6 +1509,10 @@ static int sba_map_sg_attrs(struct devic
+       ** Access to the virtual address is what forces a two pass algorithm.
+       */
+       coalesced = sba_coalesce_chunks(ioc, dev, sglist, nents);
++      if (coalesced < 0) {
++              sba_unmap_sg_attrs(dev, sglist, nents, dir, attrs);
++              return 0;
++      }
+       /*
+       ** Program the I/O Pdir
diff --git a/queue-2.6.32/hwpoison-abort-on-failed-unmap.patch b/queue-2.6.32/hwpoison-abort-on-failed-unmap.patch
new file mode 100644 (file)
index 0000000..d66b84f
--- /dev/null
@@ -0,0 +1,78 @@
+From 1668bfd5be9d8a52536c4865000fbbe065a3613b Mon Sep 17 00:00:00 2001
+From: Wu Fengguang <fengguang.wu@intel.com>
+Date: Wed, 16 Dec 2009 12:19:58 +0100
+Subject: HWPOISON: abort on failed unmap
+
+From: Wu Fengguang <fengguang.wu@intel.com>
+
+commit 1668bfd5be9d8a52536c4865000fbbe065a3613b upstream.
+
+Don't try to isolate a still-mapped page. Otherwise we will hit the
+BUG_ON(page_mapped(page)) in __remove_from_page_cache().
+
+Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
+Signed-off-by: Andi Kleen <ak@linux.intel.com>
+Signed-off-by: Thomas Renninger <trenn@suse.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ mm/memory-failure.c |   20 +++++++++++++++-----
+ 1 file changed, 15 insertions(+), 5 deletions(-)
+
+--- a/mm/memory-failure.c
++++ b/mm/memory-failure.c
+@@ -637,7 +637,7 @@ static int page_action(struct page_state
+  * Do all that is necessary to remove user space mappings. Unmap
+  * the pages and send SIGBUS to the processes if the data was dirty.
+  */
+-static void hwpoison_user_mappings(struct page *p, unsigned long pfn,
++static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
+                                 int trapno)
+ {
+       enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
+@@ -647,15 +647,18 @@ static void hwpoison_user_mappings(struc
+       int i;
+       int kill = 1;
+-      if (PageReserved(p) || PageCompound(p) || PageSlab(p) || PageKsm(p))
+-              return;
++      if (PageReserved(p) || PageSlab(p))
++              return SWAP_SUCCESS;
+       /*
+        * This check implies we don't kill processes if their pages
+        * are in the swap cache early. Those are always late kills.
+        */
+       if (!page_mapped(p))
+-              return;
++              return SWAP_SUCCESS;
++
++      if (PageCompound(p) || PageKsm(p))
++              return SWAP_FAIL;
+       if (PageSwapCache(p)) {
+               printk(KERN_ERR
+@@ -717,6 +720,8 @@ static void hwpoison_user_mappings(struc
+        */
+       kill_procs_ao(&tokill, !!PageDirty(p), trapno,
+                     ret != SWAP_SUCCESS, pfn);
++
++      return ret;
+ }
+ int __memory_failure(unsigned long pfn, int trapno, int ref)
+@@ -786,8 +791,13 @@ int __memory_failure(unsigned long pfn,
+       /*
+        * Now take care of user space mappings.
++       * Abort on fail: __remove_from_page_cache() assumes unmapped page.
+        */
+-      hwpoison_user_mappings(p, pfn, trapno);
++      if (hwpoison_user_mappings(p, pfn, trapno) != SWAP_SUCCESS) {
++              printk(KERN_ERR "MCE %#lx: cannot unmap page, give up\n", pfn);
++              res = -EBUSY;
++              goto out;
++      }
+       /*
+        * Torn down by someone else?
diff --git a/queue-2.6.32/hwpoison-remove-the-anonymous-entry.patch b/queue-2.6.32/hwpoison-remove-the-anonymous-entry.patch
new file mode 100644 (file)
index 0000000..6ae9ce7
--- /dev/null
@@ -0,0 +1,31 @@
+From 9b9a29ecd75e310f75a9243e1c3538ad34598fcb Mon Sep 17 00:00:00 2001
+From: Wu Fengguang <fengguang.wu@intel.com>
+Date: Wed, 16 Dec 2009 12:19:57 +0100
+Subject: HWPOISON: remove the anonymous entry
+
+From: Wu Fengguang <fengguang.wu@intel.com>
+
+commit 9b9a29ecd75e310f75a9243e1c3538ad34598fcb upstream.
+
+(PG_swapbacked && !PG_lru) pages should not happen.
+Better to treat them as unknown pages.
+
+Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
+Signed-off-by: Andi Kleen <ak@linux.intel.com>
+Signed-off-by: Thomas Renninger <trenn@suse.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ mm/memory-failure.c |    1 -
+ 1 file changed, 1 deletion(-)
+
+--- a/mm/memory-failure.c
++++ b/mm/memory-failure.c
+@@ -589,7 +589,6 @@ static struct page_state {
+       { lru|dirty,    lru|dirty,      "LRU",          me_pagecache_dirty },
+       { lru|dirty,    lru,            "clean LRU",    me_pagecache_clean },
+-      { swapbacked,   swapbacked,     "anonymous",    me_pagecache_clean },
+       /*
+        * Catchall entry: must be at end.
diff --git a/queue-2.6.32/ibmvfc-fix-command-completion-handling.patch b/queue-2.6.32/ibmvfc-fix-command-completion-handling.patch
new file mode 100644 (file)
index 0000000..f8958f4
--- /dev/null
@@ -0,0 +1,73 @@
+From f5832fa2f8dc39adcf3ae348d2d6383163235e79 Mon Sep 17 00:00:00 2001
+From: Brian King <brking@linux.vnet.ibm.com>
+Date: Tue, 20 Apr 2010 14:21:33 -0500
+Subject: [SCSI] ibmvfc: Fix command completion handling
+
+From: Brian King <brking@linux.vnet.ibm.com>
+
+commit f5832fa2f8dc39adcf3ae348d2d6383163235e79 upstream.
+
+Commands which are completed by the VIOS are placed on a CRQ
+in kernel memory for the ibmvfc driver to process. Each CRQ
+entry is 16 bytes. The ibmvfc driver reads the first 8 bytes
+to check if the entry is valid, then reads the next 8 bytes to get
+the handle, which is a pointer to the completed command. This fixes
+an issue seen on Power 7 where the processor reordered the
+loads from memory, resulting in processing command completion
+with a stale handle. This could result in command timeouts,
+and also early completion of commands.
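+
+The hazard is generic to any "poll a valid flag, then read the
+payload" consumer. A userspace analogue of the rmb()/wmb() pairing,
+using C11 acquire/release atomics (illustrative only, not the driver
+code; build with -pthread):
+
+#include <pthread.h>
+#include <stdatomic.h>
+#include <stdio.h>
+
+/* one CRQ-like slot: the handle must be visible before 'valid' is seen */
+struct slot {
+        unsigned long handle;   /* written first by the producer */
+        atomic_uint valid;      /* flag polled by the consumer */
+};
+
+static struct slot s;
+
+static void *producer(void *arg)
+{
+        (void)arg;
+        s.handle = 0xdeadbeef;
+        /* release store: orders the handle write before the flag,
+         * the job a producer-side write barrier does */
+        atomic_store_explicit(&s.valid, 1, memory_order_release);
+        return NULL;
+}
+
+int main(void)
+{
+        pthread_t t;
+
+        pthread_create(&t, NULL, producer, NULL);
+        /* acquire load: everything written before the flag is visible,
+         * the job rmb() does after the valid check in the driver */
+        while (!atomic_load_explicit(&s.valid, memory_order_acquire))
+                ;
+        printf("handle = %#lx\n", s.handle);
+        pthread_join(t, NULL);
+        return 0;
+}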
+
+Signed-off-by: Brian King <brking@linux.vnet.ibm.com>
+Signed-off-by: James Bottomley <James.Bottomley@suse.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ drivers/scsi/ibmvscsi/ibmvfc.c |    6 ++++++
+ 1 file changed, 6 insertions(+)
+
+--- a/drivers/scsi/ibmvscsi/ibmvfc.c
++++ b/drivers/scsi/ibmvscsi/ibmvfc.c
+@@ -2720,6 +2720,7 @@ static struct ibmvfc_async_crq *ibmvfc_n
+       if (crq->valid & 0x80) {
+               if (++async_crq->cur == async_crq->size)
+                       async_crq->cur = 0;
++              rmb();
+       } else
+               crq = NULL;
+@@ -2742,6 +2743,7 @@ static struct ibmvfc_crq *ibmvfc_next_cr
+       if (crq->valid & 0x80) {
+               if (++queue->cur == queue->size)
+                       queue->cur = 0;
++              rmb();
+       } else
+               crq = NULL;
+@@ -2790,12 +2792,14 @@ static void ibmvfc_tasklet(void *data)
+               while ((async = ibmvfc_next_async_crq(vhost)) != NULL) {
+                       ibmvfc_handle_async(async, vhost);
+                       async->valid = 0;
++                      wmb();
+               }
+               /* Pull all the valid messages off the CRQ */
+               while ((crq = ibmvfc_next_crq(vhost)) != NULL) {
+                       ibmvfc_handle_crq(crq, vhost);
+                       crq->valid = 0;
++                      wmb();
+               }
+               vio_enable_interrupts(vdev);
+@@ -2803,10 +2807,12 @@ static void ibmvfc_tasklet(void *data)
+                       vio_disable_interrupts(vdev);
+                       ibmvfc_handle_async(async, vhost);
+                       async->valid = 0;
++                      wmb();
+               } else if ((crq = ibmvfc_next_crq(vhost)) != NULL) {
+                       vio_disable_interrupts(vdev);
+                       ibmvfc_handle_crq(crq, vhost);
+                       crq->valid = 0;
++                      wmb();
+               } else
+                       done = 1;
+       }
diff --git a/queue-2.6.32/ibmvfc-reduce-error-recovery-timeout.patch b/queue-2.6.32/ibmvfc-reduce-error-recovery-timeout.patch
new file mode 100644 (file)
index 0000000..69de386
--- /dev/null
@@ -0,0 +1,44 @@
+From daa142d1773dd3a986f02a8a4da929608d24daaa Mon Sep 17 00:00:00 2001
+From: Brian King <brking@linux.vnet.ibm.com>
+Date: Tue, 20 Apr 2010 14:21:35 -0500
+Subject: [SCSI] ibmvfc: Reduce error recovery timeout
+
+From: Brian King <brking@linux.vnet.ibm.com>
+
+commit daa142d1773dd3a986f02a8a4da929608d24daaa upstream.
+
+If a command times out resulting in EH getting invoked, we wait for the
+aborted commands to come back after sending the abort. Shorten
+the amount of time we wait for these responses, to ensure we don't
+get stuck in EH for several minutes.
+
+Signed-off-by: Brian King <brking@linux.vnet.ibm.com>
+Signed-off-by: James Bottomley <James.Bottomley@suse.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ drivers/scsi/ibmvscsi/ibmvfc.c |    2 +-
+ drivers/scsi/ibmvscsi/ibmvfc.h |    1 +
+ 2 files changed, 2 insertions(+), 1 deletion(-)
+
+--- a/drivers/scsi/ibmvscsi/ibmvfc.c
++++ b/drivers/scsi/ibmvscsi/ibmvfc.c
+@@ -1969,7 +1969,7 @@ static int ibmvfc_wait_for_ops(struct ib
+       DECLARE_COMPLETION_ONSTACK(comp);
+       int wait;
+       unsigned long flags;
+-      signed long timeout = init_timeout * HZ;
++      signed long timeout = IBMVFC_ABORT_WAIT_TIMEOUT * HZ;
+       ENTER;
+       do {
+--- a/drivers/scsi/ibmvscsi/ibmvfc.h
++++ b/drivers/scsi/ibmvscsi/ibmvfc.h
+@@ -38,6 +38,7 @@
+ #define IBMVFC_ADISC_PLUS_CANCEL_TIMEOUT      \
+               (IBMVFC_ADISC_TIMEOUT + IBMVFC_ADISC_CANCEL_TIMEOUT)
+ #define IBMVFC_INIT_TIMEOUT           120
++#define IBMVFC_ABORT_WAIT_TIMEOUT     40
+ #define IBMVFC_MAX_REQUESTS_DEFAULT   100
+ #define IBMVFC_DEBUG                  0
diff --git a/queue-2.6.32/loop-update-mtime-when-writing-using-aops.patch b/queue-2.6.32/loop-update-mtime-when-writing-using-aops.patch
new file mode 100644 (file)
index 0000000..6021cd1
--- /dev/null
@@ -0,0 +1,31 @@
+From 02246c41171097ceab3246f6dc251ac89de6004b Mon Sep 17 00:00:00 2001
+From: Nikanth Karthikesan <knikanth@suse.de>
+Date: Thu, 8 Apr 2010 21:39:31 +0200
+Subject: loop: Update mtime when writing using aops
+
+From: Nikanth Karthikesan <knikanth@suse.de>
+
+commit 02246c41171097ceab3246f6dc251ac89de6004b upstream.
+
+Update mtime when writing to the backing filesystem using the address space
+operations write_begin and write_end.
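+
+For comparison, a rough userspace analogue of what file_update_time()
+achieves (a sketch assuming only mtime should move; the in-kernel
+helper also updates ctime and honours read-only mounts):
+
+#include <fcntl.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/stat.h>
+
+/* set mtime to "now" without touching atime */
+static int touch_mtime(int fd)
+{
+        struct timespec ts[2] = {
+                { .tv_nsec = UTIME_OMIT },      /* atime: leave as-is */
+                { .tv_nsec = UTIME_NOW },       /* mtime: now */
+        };
+
+        return futimens(fd, ts);
+}
+
+int main(int argc, char **argv)
+{
+        int fd;
+
+        if (argc < 2)
+                return 1;
+        fd = open(argv[1], O_WRONLY);
+        if (fd < 0)
+                return 1;
+        if (touch_mtime(fd) != 0)
+                perror("futimens");
+        close(fd);
+        return 0;
+}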
+
+Signed-off-by: Nikanth Karthikesan <knikanth@suse.de>
+Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ drivers/block/loop.c |    2 ++
+ 1 file changed, 2 insertions(+)
+
+--- a/drivers/block/loop.c
++++ b/drivers/block/loop.c
+@@ -238,6 +238,8 @@ static int do_lo_send_aops(struct loop_d
+               if (ret)
+                       goto fail;
++              file_update_time(file);
++
+               transfer_result = lo_do_transfer(lo, WRITE, page, offset,
+                               bvec->bv_page, bv_offs, size, IV);
+               copied = size;
diff --git a/queue-2.6.32/md-raid1-delay-reads-that-could-overtake-behind-writes.patch b/queue-2.6.32/md-raid1-delay-reads-that-could-overtake-behind-writes.patch
new file mode 100644 (file)
index 0000000..c200039
--- /dev/null
@@ -0,0 +1,113 @@
+From e555190d82c0f58e825e3cbd9e6ebe2e7ac713bd Mon Sep 17 00:00:00 2001
+From: NeilBrown <neilb@suse.de>
+Date: Wed, 31 Mar 2010 11:21:44 +1100
+Subject: md/raid1: delay reads that could overtake behind-writes.
+
+From: NeilBrown <neilb@suse.de>
+
+commit e555190d82c0f58e825e3cbd9e6ebe2e7ac713bd upstream.
+
+When a raid1 array is configured to support write-behind
+on some devices, it normally only reads from other devices.
+If all devices are write-behind (because the rest have failed)
+it is possible for a read request to be serviced before a
+behind-write request, which would appear as data corruption.
+
+So when forced to read from a WriteMostly device, wait for any
+write-behind to complete, and don't start any more behind-writes.
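+
+The synchronization is the familiar counter-plus-waitqueue shape: the
+last completion wakes any waiter, and a reader blocks while the count
+is non-zero. A userspace condition-variable sketch of the same shape
+(names mirror the patch, but this is not the md code):
+
+#include <pthread.h>
+#include <stdio.h>
+
+static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
+static pthread_cond_t behind_wait = PTHREAD_COND_INITIALIZER;
+static int behind_writes;
+
+static void behind_write_start(void)
+{
+        pthread_mutex_lock(&lock);
+        behind_writes++;
+        pthread_mutex_unlock(&lock);
+}
+
+/* like atomic_dec_and_test() + wake_up() in bitmap_endwrite() */
+static void behind_write_end(void)
+{
+        pthread_mutex_lock(&lock);
+        if (--behind_writes == 0)
+                pthread_cond_broadcast(&behind_wait);
+        pthread_mutex_unlock(&lock);
+}
+
+/* like the wait_event() the patch adds before a WriteMostly read */
+static void wait_for_behind_writes(void)
+{
+        pthread_mutex_lock(&lock);
+        while (behind_writes > 0)
+                pthread_cond_wait(&behind_wait, &lock);
+        pthread_mutex_unlock(&lock);
+}
+
+int main(void)
+{
+        behind_write_start();
+        behind_write_end();
+        wait_for_behind_writes();       /* returns at once: count is 0 */
+        printf("no behind-writes pending\n");
+        return 0;
+}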
+
+Signed-off-by: NeilBrown <neilb@suse.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+
+---
+ drivers/md/bitmap.c |    4 +++-
+ drivers/md/bitmap.h |    3 +++
+ drivers/md/raid1.c  |   25 ++++++++++++++++++-------
+ 3 files changed, 24 insertions(+), 8 deletions(-)
+
+--- a/drivers/md/bitmap.c
++++ b/drivers/md/bitmap.c
+@@ -1317,7 +1317,8 @@ void bitmap_endwrite(struct bitmap *bitm
+ {
+       if (!bitmap) return;
+       if (behind) {
+-              atomic_dec(&bitmap->behind_writes);
++              if (atomic_dec_and_test(&bitmap->behind_writes))
++                      wake_up(&bitmap->behind_wait);
+               PRINTK(KERN_DEBUG "dec write-behind count %d/%d\n",
+                 atomic_read(&bitmap->behind_writes), bitmap->max_write_behind);
+       }
+@@ -1629,6 +1630,7 @@ int bitmap_create(mddev_t *mddev)
+       atomic_set(&bitmap->pending_writes, 0);
+       init_waitqueue_head(&bitmap->write_wait);
+       init_waitqueue_head(&bitmap->overflow_wait);
++      init_waitqueue_head(&bitmap->behind_wait);
+       bitmap->mddev = mddev;
+--- a/drivers/md/bitmap.h
++++ b/drivers/md/bitmap.h
+@@ -254,6 +254,9 @@ struct bitmap {
+       wait_queue_head_t write_wait;
+       wait_queue_head_t overflow_wait;
++#ifndef __GENKSYMS__
++      wait_queue_head_t behind_wait;
++#endif
+ };
+ /* the bitmap API */
+--- a/drivers/md/raid1.c
++++ b/drivers/md/raid1.c
+@@ -845,6 +845,15 @@ static int make_request(struct request_q
+               }
+               mirror = conf->mirrors + rdisk;
++              if (test_bit(WriteMostly, &mirror->rdev->flags) &&
++                  bitmap) {
++                      /* Reading from a write-mostly device must
++                       * take care not to over-take any writes
++                       * that are 'behind'
++                       */
++                      wait_event(bitmap->behind_wait,
++                                 atomic_read(&bitmap->behind_writes) == 0);
++              }
+               r1_bio->read_disk = rdisk;
+               read_bio = bio_clone(bio, GFP_NOIO);
+@@ -922,9 +931,13 @@ static int make_request(struct request_q
+               set_bit(R1BIO_Degraded, &r1_bio->state);
+       }
+-      /* do behind I/O ? */
++      /* do behind I/O ?
++       * Not if there are too many, or cannot allocate memory,
++       * or a reader on WriteMostly is waiting for behind writes
++       * to flush */
+       if (bitmap &&
+           atomic_read(&bitmap->behind_writes) < bitmap->max_write_behind &&
++          !waitqueue_active(&bitmap->behind_wait) &&
+           (behind_pages = alloc_behind_pages(bio)) != NULL)
+               set_bit(R1BIO_BehindIO, &r1_bio->state);
+@@ -2105,15 +2118,13 @@ static int stop(mddev_t *mddev)
+ {
+       conf_t *conf = mddev->private;
+       struct bitmap *bitmap = mddev->bitmap;
+-      int behind_wait = 0;
+       /* wait for behind writes to complete */
+-      while (bitmap && atomic_read(&bitmap->behind_writes) > 0) {
+-              behind_wait++;
+-              printk(KERN_INFO "raid1: behind writes in progress on device %s, waiting to stop (%d)\n", mdname(mddev), behind_wait);
+-              set_current_state(TASK_UNINTERRUPTIBLE);
+-              schedule_timeout(HZ); /* wait a second */
++      if (bitmap && atomic_read(&bitmap->behind_writes) > 0) {
++              printk(KERN_INFO "raid1: behind writes in progress on device %s, waiting to stop.\n", mdname(mddev));
+               /* need to kick something here to make sure I/O goes? */
++              wait_event(bitmap->behind_wait,
++                         atomic_read(&bitmap->behind_writes) == 0);
+       }
+       raise_barrier(conf);
diff --git a/queue-2.6.32/memory-hotplug-fix-a-bug-on-dev-mem-for-64-bit-kernels.patch b/queue-2.6.32/memory-hotplug-fix-a-bug-on-dev-mem-for-64-bit-kernels.patch
new file mode 100644 (file)
index 0000000..852c227
--- /dev/null
@@ -0,0 +1,74 @@
+From ea0854170c95245a258b386c7a9314399c949fe0 Mon Sep 17 00:00:00 2001
+From: Shaohui Zheng <shaohui.zheng@intel.com>
+Date: Tue, 2 Feb 2010 13:44:16 -0800
+Subject: memory hotplug: fix a bug on /dev/mem for 64-bit kernels
+
+From: Shaohui Zheng <shaohui.zheng@intel.com>
+
+commit ea0854170c95245a258b386c7a9314399c949fe0 upstream.
+
+Newly added memory cannot be accessed via /dev/mem, because we do not
+update the variables high_memory, max_pfn and max_low_pfn.
+
+Add a function update_end_of_memory_vars() to update these variables for
+64-bit kernels.
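+
+As a quick sanity check of the arithmetic, PFN_UP() rounds a byte
+address up to a page frame number. A standalone sketch (assuming
+4 KiB pages): hot-adding 1 GiB at the 4 GiB boundary should push
+end_pfn to 0x140000:
+
+#include <stdio.h>
+
+#define PAGE_SHIFT 12
+#define PAGE_SIZE  (1ULL << PAGE_SHIFT)
+/* round a byte address up to a page frame number, as PFN_UP() does */
+#define PFN_UP(x)  (((x) + PAGE_SIZE - 1) >> PAGE_SHIFT)
+
+int main(void)
+{
+        unsigned long long start = 0x100000000ULL;      /* 4 GiB */
+        unsigned long long size  = 0x40000000ULL;       /* 1 GiB */
+
+        printf("new end_pfn = %#llx\n", PFN_UP(start + size));
+        return 0;
+}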
+
+[akpm@linux-foundation.org: simplify comment]
+Signed-off-by: Shaohui Zheng <shaohui.zheng@intel.com>
+Cc: Andi Kleen <ak@linux.intel.com>
+Cc: Li Haicheng <haicheng.li@intel.com>
+Reviewed-by: Wu Fengguang <fengguang.wu@intel.com>
+Reviewed-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
+Cc: Ingo Molnar <mingo@elte.hu>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Cc: "H. Peter Anvin" <hpa@zytor.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ arch/x86/mm/init_64.c |   19 +++++++++++++++++++
+ 1 file changed, 19 insertions(+)
+
+--- a/arch/x86/mm/init_64.c
++++ b/arch/x86/mm/init_64.c
+@@ -49,6 +49,7 @@
+ #include <asm/numa.h>
+ #include <asm/cacheflush.h>
+ #include <asm/init.h>
++#include <linux/bootmem.h>
+ static unsigned long dma_reserve __initdata;
+@@ -615,6 +616,21 @@ void __init paging_init(void)
+  */
+ #ifdef CONFIG_MEMORY_HOTPLUG
+ /*
++ * After memory hotplug the variables max_pfn, max_low_pfn and high_memory need
++ * updating.
++ */
++static void  update_end_of_memory_vars(u64 start, u64 size)
++{
++      unsigned long end_pfn = PFN_UP(start + size);
++
++      if (end_pfn > max_pfn) {
++              max_pfn = end_pfn;
++              max_low_pfn = end_pfn;
++              high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1;
++      }
++}
++
++/*
+  * Memory is added always to NORMAL zone. This means you will never get
+  * additional DMA/DMA32 memory.
+  */
+@@ -633,6 +649,9 @@ int arch_add_memory(int nid, u64 start,
+       ret = __add_pages(nid, zone, start_pfn, nr_pages);
+       WARN_ON_ONCE(ret);
++      /* update max_pfn, max_low_pfn and high_memory */
++      update_end_of_memory_vars(start, size);
++
+       return ret;
+ }
+ EXPORT_SYMBOL_GPL(arch_add_memory);
diff --git a/queue-2.6.32/mutex-don-t-spin-when-the-owner-cpu-is-offline-or-other-weird-cases.patch b/queue-2.6.32/mutex-don-t-spin-when-the-owner-cpu-is-offline-or-other-weird-cases.patch
new file mode 100644 (file)
index 0000000..9881230
--- /dev/null
@@ -0,0 +1,67 @@
+From 4b402210486c6414fe5fbfd85934a0a22da56b04 Mon Sep 17 00:00:00 2001
+From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
+Date: Fri, 16 Apr 2010 23:20:00 +0200
+Subject: mutex: Don't spin when the owner CPU is offline or other weird cases
+
+From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
+
+commit 4b402210486c6414fe5fbfd85934a0a22da56b04 upstream.
+
+Due to recent load-balancer changes that delay the task migration to
+the next wakeup, the adaptive mutex spinning ends up in a live lock
+when the owner's CPU gets offlined because the cpu_online() check
+lives before the owner running check.
+
+This patch changes mutex_spin_on_owner() to return 0 (don't spin) in
+any case where we aren't sure about the owner struct validity or CPU
+number, or if the said CPU is offline. There is no point going back to
+re-evaluate spinning in corner cases like that; let's just go to
+sleep.
+
+Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
+Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
+LKML-Reference: <1271212509.13059.135.camel@pasglop>
+Signed-off-by: Ingo Molnar <mingo@elte.hu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ kernel/sched.c |    8 ++++----
+ 1 file changed, 4 insertions(+), 4 deletions(-)
+
+--- a/kernel/sched.c
++++ b/kernel/sched.c
+@@ -5590,7 +5590,7 @@ int mutex_spin_on_owner(struct mutex *lo
+        * the mutex owner just released it and exited.
+        */
+       if (probe_kernel_address(&owner->cpu, cpu))
+-              goto out;
++              return 0;
+ #else
+       cpu = owner->cpu;
+ #endif
+@@ -5600,14 +5600,14 @@ int mutex_spin_on_owner(struct mutex *lo
+        * the cpu field may no longer be valid.
+        */
+       if (cpu >= nr_cpumask_bits)
+-              goto out;
++              return 0;
+       /*
+        * We need to validate that we can do a
+        * get_cpu() and that we have the percpu area.
+        */
+       if (!cpu_online(cpu))
+-              goto out;
++              return 0;
+       rq = cpu_rq(cpu);
+@@ -5626,7 +5626,7 @@ int mutex_spin_on_owner(struct mutex *lo
+               cpu_relax();
+       }
+-out:
++
+       return 1;
+ }
+ #endif
diff --git a/queue-2.6.32/nohz-introduce-arch_needs_cpu.patch b/queue-2.6.32/nohz-introduce-arch_needs_cpu.patch
new file mode 100644 (file)
index 0000000..97234a2
--- /dev/null
@@ -0,0 +1,121 @@
+From 3c5d92a0cfb5103c0d5ab74d4ae6373d3af38148 Mon Sep 17 00:00:00 2001
+From: Martin Schwidefsky <schwidefsky@de.ibm.com>
+Date: Tue, 29 Sep 2009 14:25:16 +0200
+Subject: nohz: Introduce arch_needs_cpu
+
+From: Martin Schwidefsky <schwidefsky@de.ibm.com>
+
+commit 3c5d92a0cfb5103c0d5ab74d4ae6373d3af38148 upstream.
+
+Allow the architecture to request a normal jiffy tick when the system
+goes idle and tick_nohz_stop_sched_tick is called. On s390 the hook is
+used to prevent the system from going fully idle if there has been an
+interrupt other than a clock comparator interrupt since the last wakeup.
+
+On s390 the HiperSockets response time for 1 connection ping-pong goes
+down from 42 to 34 microseconds. The CPU cost decreases by 27%.
+
+Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
+LKML-Reference: <20090929122533.402715150@de.ibm.com>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Acked-by: John Jolly <jjolly@suse.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ arch/s390/include/asm/cputime.h |    8 ++++++++
+ arch/s390/kernel/s390_ext.c     |    2 ++
+ arch/s390/kernel/vtime.c        |    2 ++
+ drivers/s390/cio/cio.c          |    1 +
+ include/linux/tick.h            |    3 +++
+ kernel/time/tick-sched.c        |   13 ++++++++-----
+ 6 files changed, 24 insertions(+), 5 deletions(-)
+
+--- a/arch/s390/include/asm/cputime.h
++++ b/arch/s390/include/asm/cputime.h
+@@ -183,6 +183,7 @@ struct s390_idle_data {
+       unsigned long long idle_count;
+       unsigned long long idle_enter;
+       unsigned long long idle_time;
++      int nohz_delay;
+ };
+ DECLARE_PER_CPU(struct s390_idle_data, s390_idle);
+@@ -198,4 +199,11 @@ static inline void s390_idle_check(void)
+               vtime_start_cpu();
+ }
++static inline int s390_nohz_delay(int cpu)
++{
++      return per_cpu(s390_idle, cpu).nohz_delay != 0;
++}
++
++#define arch_needs_cpu(cpu) s390_nohz_delay(cpu)
++
+ #endif /* _S390_CPUTIME_H */
+--- a/arch/s390/kernel/s390_ext.c
++++ b/arch/s390/kernel/s390_ext.c
+@@ -126,6 +126,8 @@ void __irq_entry do_extint(struct pt_reg
+               /* Serve timer interrupts first. */
+               clock_comparator_work();
+       kstat_cpu(smp_processor_id()).irqs[EXTERNAL_INTERRUPT]++;
++      if (code != 0x1004)
++              __get_cpu_var(s390_idle).nohz_delay = 1;
+         index = ext_hash(code);
+       for (p = ext_int_hash[index]; p; p = p->next) {
+               if (likely(p->code == code))
+--- a/arch/s390/kernel/vtime.c
++++ b/arch/s390/kernel/vtime.c
+@@ -167,6 +167,8 @@ void vtime_stop_cpu(void)
+       /* Wait for external, I/O or machine check interrupt. */
+       psw.mask = psw_kernel_bits | PSW_MASK_WAIT | PSW_MASK_IO | PSW_MASK_EXT;
++      idle->nohz_delay = 0;
++
+       /* Check if the CPU timer needs to be reprogrammed. */
+       if (vq->do_spt) {
+               __u64 vmax = VTIMER_MAX_SLICE;
+--- a/drivers/s390/cio/cio.c
++++ b/drivers/s390/cio/cio.c
+@@ -618,6 +618,7 @@ void __irq_entry do_IRQ(struct pt_regs *
+       old_regs = set_irq_regs(regs);
+       s390_idle_check();
+       irq_enter();
++      __get_cpu_var(s390_idle).nohz_delay = 1;
+       if (S390_lowcore.int_clock >= S390_lowcore.clock_comparator)
+               /* Serve timer interrupts first. */
+               clock_comparator_work();
+--- a/include/linux/tick.h
++++ b/include/linux/tick.h
+@@ -98,6 +98,9 @@ extern int tick_check_oneshot_change(int
+ extern struct tick_sched *tick_get_tick_sched(int cpu);
+ extern void tick_check_idle(int cpu);
+ extern int tick_oneshot_mode_active(void);
++#  ifndef arch_needs_cpu
++#   define arch_needs_cpu(cpu) (0)
++#  endif
+ # else
+ static inline void tick_clock_notify(void) { }
+ static inline int tick_check_oneshot_change(int allow_nohz) { return 0; }
+--- a/kernel/time/tick-sched.c
++++ b/kernel/time/tick-sched.c
+@@ -289,12 +289,15 @@ void tick_nohz_stop_sched_tick(int inidl
+                       time_delta = KTIME_MAX;
+       } while (read_seqretry(&xtime_lock, seq));
+-      /* Get the next timer wheel timer */
+-      next_jiffies = get_next_timer_interrupt(last_jiffies);
+-      delta_jiffies = next_jiffies - last_jiffies;
+-
+-      if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu))
++      if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu) ||
++          arch_needs_cpu(cpu)) {
++              next_jiffies = last_jiffies + 1;
+               delta_jiffies = 1;
++      } else {
++              /* Get the next timer wheel timer */
++              next_jiffies = get_next_timer_interrupt(last_jiffies);
++              delta_jiffies = next_jiffies - last_jiffies;
++      }
+       /*
+        * Do not stop the tick, if we are only one off
+        * or if the cpu is required for rcu
diff --git a/queue-2.6.32/nohz-reuse-ktime-in-sub-functions-of-tick_check_idle.patch b/queue-2.6.32/nohz-reuse-ktime-in-sub-functions-of-tick_check_idle.patch
new file mode 100644 (file)
index 0000000..6107b7c
--- /dev/null
@@ -0,0 +1,194 @@
+From eed3b9cf3fe3fcc7a50238dfcab63a63914e8f42 Mon Sep 17 00:00:00 2001
+From: Martin Schwidefsky <schwidefsky@de.ibm.com>
+Date: Tue, 29 Sep 2009 14:25:15 +0200
+Subject: nohz: Reuse ktime in sub-functions of tick_check_idle.
+
+From: Martin Schwidefsky <schwidefsky@de.ibm.com>
+
+commit eed3b9cf3fe3fcc7a50238dfcab63a63914e8f42 upstream.
+
+On a system with NOHZ=y tick_check_idle calls tick_nohz_stop_idle and
+tick_nohz_update_jiffies. Given the right conditions (ts->idle_active
+and/or ts->tick_stopped) both functions get a time stamp with ktime_get.
+The same time stamp can be reused if both functions require one.
+
+On s390 this change has the additional benefit that gcc inlines the
+tick_nohz_stop_idle function into tick_check_idle. The number of
+instructions to execute tick_check_idle drops from 225 to 144
+(without the ktime_get optimization it is 367 vs 215 instructions).
+
+before:
+
+ 0)               |  tick_check_idle() {
+ 0)               |    tick_nohz_stop_idle() {
+ 0)               |      ktime_get() {
+ 0)               |        read_tod_clock() {
+ 0)   0.601 us    |        }
+ 0)   1.765 us    |      }
+ 0)   3.047 us    |    }
+ 0)               |    ktime_get() {
+ 0)               |      read_tod_clock() {
+ 0)   0.570 us    |      }
+ 0)   1.727 us    |    }
+ 0)               |    tick_do_update_jiffies64() {
+ 0)   0.609 us    |    }
+ 0)   8.055 us    |  }
+
+after:
+
+ 0)               |  tick_check_idle() {
+ 0)               |    ktime_get() {
+ 0)               |      read_tod_clock() {
+ 0)   0.617 us    |      }
+ 0)   1.773 us    |    }
+ 0)               |    tick_do_update_jiffies64() {
+ 0)   0.593 us    |    }
+ 0)   4.477 us    |  }
+
+Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
+Cc: john stultz <johnstul@us.ibm.com>
+LKML-Reference: <20090929122533.206589318@de.ibm.com>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Acked-by: John Jolly <jjolly@suse.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ kernel/time/tick-sched.c |   62 +++++++++++++++++++++++++----------------------
+ 1 file changed, 33 insertions(+), 29 deletions(-)
+
+--- a/kernel/time/tick-sched.c
++++ b/kernel/time/tick-sched.c
+@@ -134,18 +134,13 @@ __setup("nohz=", setup_tick_nohz);
+  * value. We do this unconditionally on any cpu, as we don't know whether the
+  * cpu, which has the update task assigned is in a long sleep.
+  */
+-static void tick_nohz_update_jiffies(void)
++static void tick_nohz_update_jiffies(ktime_t now)
+ {
+       int cpu = smp_processor_id();
+       struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
+       unsigned long flags;
+-      ktime_t now;
+-
+-      if (!ts->tick_stopped)
+-              return;
+       cpumask_clear_cpu(cpu, nohz_cpu_mask);
+-      now = ktime_get();
+       ts->idle_waketime = now;
+       local_irq_save(flags);
+@@ -155,20 +150,17 @@ static void tick_nohz_update_jiffies(voi
+       touch_softlockup_watchdog();
+ }
+-static void tick_nohz_stop_idle(int cpu)
++static void tick_nohz_stop_idle(int cpu, ktime_t now)
+ {
+       struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
++      ktime_t delta;
+-      if (ts->idle_active) {
+-              ktime_t now, delta;
+-              now = ktime_get();
+-              delta = ktime_sub(now, ts->idle_entrytime);
+-              ts->idle_lastupdate = now;
+-              ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
+-              ts->idle_active = 0;
++      delta = ktime_sub(now, ts->idle_entrytime);
++      ts->idle_lastupdate = now;
++      ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
++      ts->idle_active = 0;
+-              sched_clock_idle_wakeup_event(0);
+-      }
++      sched_clock_idle_wakeup_event(0);
+ }
+ static ktime_t tick_nohz_start_idle(struct tick_sched *ts)
+@@ -463,7 +455,11 @@ void tick_nohz_restart_sched_tick(void)
+       ktime_t now;
+       local_irq_disable();
+-      tick_nohz_stop_idle(cpu);
++      if (ts->idle_active || (ts->inidle && ts->tick_stopped))
++              now = ktime_get();
++
++      if (ts->idle_active)
++              tick_nohz_stop_idle(cpu, now);
+       if (!ts->inidle || !ts->tick_stopped) {
+               ts->inidle = 0;
+@@ -477,7 +473,6 @@ void tick_nohz_restart_sched_tick(void)
+       /* Update jiffies first */
+       select_nohz_load_balancer(0);
+-      now = ktime_get();
+       tick_do_update_jiffies64(now);
+       cpumask_clear_cpu(cpu, nohz_cpu_mask);
+@@ -611,22 +606,18 @@ static void tick_nohz_switch_to_nohz(voi
+  * timer and do not touch the other magic bits which need to be done
+  * when idle is left.
+  */
+-static void tick_nohz_kick_tick(int cpu)
++static void tick_nohz_kick_tick(int cpu, ktime_t now)
+ {
+ #if 0
+       /* Switch back to 2.6.27 behaviour */
+       struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
+-      ktime_t delta, now;
+-
+-      if (!ts->tick_stopped)
+-              return;
++      ktime_t delta;
+       /*
+        * Do not touch the tick device, when the next expiry is either
+        * already reached or less/equal than the tick period.
+        */
+-      now = ktime_get();
+       delta = ktime_sub(hrtimer_get_expires(&ts->sched_timer), now);
+       if (delta.tv64 <= tick_period.tv64)
+               return;
+@@ -635,9 +626,26 @@ static void tick_nohz_kick_tick(int cpu)
+ #endif
+ }
++static inline void tick_check_nohz(int cpu)
++{
++      struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
++      ktime_t now;
++
++      if (!ts->idle_active && !ts->tick_stopped)
++              return;
++      now = ktime_get();
++      if (ts->idle_active)
++              tick_nohz_stop_idle(cpu, now);
++      if (ts->tick_stopped) {
++              tick_nohz_update_jiffies(now);
++              tick_nohz_kick_tick(cpu, now);
++      }
++}
++
+ #else
+ static inline void tick_nohz_switch_to_nohz(void) { }
++static inline void tick_check_nohz(int cpu) { }
+ #endif /* NO_HZ */
+@@ -647,11 +655,7 @@ static inline void tick_nohz_switch_to_n
+ void tick_check_idle(int cpu)
+ {
+       tick_check_oneshot_broadcast(cpu);
+-#ifdef CONFIG_NO_HZ
+-      tick_nohz_stop_idle(cpu);
+-      tick_nohz_update_jiffies();
+-      tick_nohz_kick_tick(cpu);
+-#endif
++      tick_check_nohz(cpu);
+ }
+ /*
diff --git a/queue-2.6.32/ocfs2-find-proper-end-cpos-for-a-leaf-refcount-block.patch b/queue-2.6.32/ocfs2-find-proper-end-cpos-for-a-leaf-refcount-block.patch
new file mode 100644 (file)
index 0000000..6a150f7
--- /dev/null
@@ -0,0 +1,254 @@
+From 38a04e432768ec0b016f3c687b4de31ac111ae59 Mon Sep 17 00:00:00 2001
+From: Tao Ma <tao.ma@oracle.com>
+Date: Mon, 30 Nov 2009 14:32:19 +0800
+Subject: ocfs2: Find proper end cpos for a leaf refcount block.
+
+From: Tao Ma <tao.ma@oracle.com>
+
+commit 38a04e432768ec0b016f3c687b4de31ac111ae59 upstream.
+
+The ocfs2 refcount tree is stored as an extent tree, while
+the leaf ocfs2_refcount_rec points to a refcount block.
+
+The following steps can trip a kernel panic.
+mkfs.ocfs2 -b 512 -C 1M --fs-features=refcount $DEVICE
+mount -t ocfs2 $DEVICE $MNT_DIR
+FILE_NAME=$RANDOM
+FILE_NAME_1=$RANDOM
+FILE_REF="${FILE_NAME}_ref"
+FILE_REF_1="${FILE_NAME}_ref_1"
+for((i=0;i<305;i++))
+do
+# /mnt/1048576 is a 1048576-byte file.
+cat /mnt/1048576 >> $MNT_DIR/$FILE_NAME
+cat /mnt/1048576 >> $MNT_DIR/$FILE_NAME_1
+done
+for((i=0;i<3;i++))
+do
+cat /mnt/1048576 >> $MNT_DIR/$FILE_NAME
+done
+
+for((i=0;i<2;i++))
+do
+cat /mnt/1048576 >> $MNT_DIR/$FILE_NAME
+cat /mnt/1048576 >> $MNT_DIR/$FILE_NAME_1
+done
+
+cat /mnt/1048576 >> $MNT_DIR/$FILE_NAME
+
+for((i=0;i<11;i++))
+do
+cat /mnt/1048576 >> $MNT_DIR/$FILE_NAME
+cat /mnt/1048576 >> $MNT_DIR/$FILE_NAME_1
+done
+reflink $MNT_DIR/$FILE_NAME $MNT_DIR/$FILE_REF
+# write_f is a program which will write some bytes to a file at offset.
+# write_f -f file_name -l offset -w write_bytes.
+./write_f -f $MNT_DIR/$FILE_REF -l $[310*1048576] -w 4096
+./write_f -f $MNT_DIR/$FILE_REF -l $[306*1048576] -w 4096
+./write_f -f $MNT_DIR/$FILE_REF -l $[311*1048576] -w 4096
+./write_f -f $MNT_DIR/$FILE_NAME -l $[310*1048576] -w 4096
+./write_f -f $MNT_DIR/$FILE_NAME -l $[311*1048576] -w 4096
+reflink $MNT_DIR/$FILE_NAME $MNT_DIR/$FILE_REF_1
+./write_f -f $MNT_DIR/$FILE_NAME -l $[311*1048576] -w 4096
+#kernel panic here.
+
+The reason is that if the ocfs2_extent_rec is the last record
+in a leaf extent block, the old code fails to find a
+suitable end cpos. So this patch walks through the b-tree,
+finds the next subtree root and gets the cpos the next sub-tree
+starts from.
+
+btw, I have run tristan's test case against the patched kernel
+for several days and this type of kernel panic never happened again.
+
+Signed-off-by: Tao Ma <tao.ma@oracle.com>
+Signed-off-by: Joel Becker <joel.becker@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ fs/ocfs2/alloc.c        |   10 ++--
+ fs/ocfs2/alloc.h        |    5 ++
+ fs/ocfs2/refcounttree.c |  117 ++++++++++++++++++++++++++++++++++++++++++++----
+ 3 files changed, 119 insertions(+), 13 deletions(-)
+
+--- a/fs/ocfs2/alloc.c
++++ b/fs/ocfs2/alloc.c
+@@ -1765,9 +1765,9 @@ set_and_inc:
+  *
+  * The array index of the subtree root is passed back.
+  */
+-static int ocfs2_find_subtree_root(struct ocfs2_extent_tree *et,
+-                                 struct ocfs2_path *left,
+-                                 struct ocfs2_path *right)
++int ocfs2_find_subtree_root(struct ocfs2_extent_tree *et,
++                          struct ocfs2_path *left,
++                          struct ocfs2_path *right)
+ {
+       int i = 0;
+@@ -2872,8 +2872,8 @@ out:
+  * This looks similar, but is subtly different to
+  * ocfs2_find_cpos_for_left_leaf().
+  */
+-static int ocfs2_find_cpos_for_right_leaf(struct super_block *sb,
+-                                        struct ocfs2_path *path, u32 *cpos)
++int ocfs2_find_cpos_for_right_leaf(struct super_block *sb,
++                                 struct ocfs2_path *path, u32 *cpos)
+ {
+       int i, j, ret = 0;
+       u64 blkno;
+--- a/fs/ocfs2/alloc.h
++++ b/fs/ocfs2/alloc.h
+@@ -317,4 +317,9 @@ int ocfs2_path_bh_journal_access(handle_
+ int ocfs2_journal_access_path(struct ocfs2_caching_info *ci,
+                             handle_t *handle,
+                             struct ocfs2_path *path);
++int ocfs2_find_cpos_for_right_leaf(struct super_block *sb,
++                                 struct ocfs2_path *path, u32 *cpos);
++int ocfs2_find_subtree_root(struct ocfs2_extent_tree *et,
++                          struct ocfs2_path *left,
++                          struct ocfs2_path *right);
+ #endif /* OCFS2_ALLOC_H */
+--- a/fs/ocfs2/refcounttree.c
++++ b/fs/ocfs2/refcounttree.c
+@@ -969,6 +969,103 @@ out:
+ }
+ /*
++ * Find the end range for a leaf refcount block indicated by
++ * el->l_recs[index].e_blkno.
++ */
++static int ocfs2_get_refcount_cpos_end(struct ocfs2_caching_info *ci,
++                                     struct buffer_head *ref_root_bh,
++                                     struct ocfs2_extent_block *eb,
++                                     struct ocfs2_extent_list *el,
++                                     int index,  u32 *cpos_end)
++{
++      int ret, i, subtree_root;
++      u32 cpos;
++      u64 blkno;
++      struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
++      struct ocfs2_path *left_path = NULL, *right_path = NULL;
++      struct ocfs2_extent_tree et;
++      struct ocfs2_extent_list *tmp_el;
++
++      if (index < le16_to_cpu(el->l_next_free_rec) - 1) {
++              /*
++               * We have a extent rec after index, so just use the e_cpos
++               * of the next extent rec.
++               */
++              *cpos_end = le32_to_cpu(el->l_recs[index+1].e_cpos);
++              return 0;
++      }
++
++      if (!eb || (eb && !eb->h_next_leaf_blk)) {
++              /*
++               * We are the last extent rec, so any high cpos should
++               * be stored in this leaf refcount block.
++               */
++              *cpos_end = UINT_MAX;
++              return 0;
++      }
++
++      /*
++       * If the extent block isn't the last one, we have to find
++       * the subtree root between this extent block and the next
++       * leaf extent block and get the corresponding e_cpos from
++       * the subroot. Otherwise we may corrupt the b-tree.
++       */
++      ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh);
++
++      left_path = ocfs2_new_path_from_et(&et);
++      if (!left_path) {
++              ret = -ENOMEM;
++              mlog_errno(ret);
++              goto out;
++      }
++
++      cpos = le32_to_cpu(eb->h_list.l_recs[index].e_cpos);
++      ret = ocfs2_find_path(ci, left_path, cpos);
++      if (ret) {
++              mlog_errno(ret);
++              goto out;
++      }
++
++      right_path = ocfs2_new_path_from_path(left_path);
++      if (!right_path) {
++              ret = -ENOMEM;
++              mlog_errno(ret);
++              goto out;
++      }
++
++      ret = ocfs2_find_cpos_for_right_leaf(sb, left_path, &cpos);
++      if (ret) {
++              mlog_errno(ret);
++              goto out;
++      }
++
++      ret = ocfs2_find_path(ci, right_path, cpos);
++      if (ret) {
++              mlog_errno(ret);
++              goto out;
++      }
++
++      subtree_root = ocfs2_find_subtree_root(&et, left_path,
++                                             right_path);
++
++      tmp_el = left_path->p_node[subtree_root].el;
++      blkno = left_path->p_node[subtree_root+1].bh->b_blocknr;
++      for (i = 0; i < le32_to_cpu(tmp_el->l_next_free_rec); i++) {
++              if (le64_to_cpu(tmp_el->l_recs[i].e_blkno) == blkno) {
++                      *cpos_end = le32_to_cpu(tmp_el->l_recs[i+1].e_cpos);
++                      break;
++              }
++      }
++
++      BUG_ON(i == le32_to_cpu(tmp_el->l_next_free_rec));
++
++out:
++      ocfs2_free_path(left_path);
++      ocfs2_free_path(right_path);
++      return ret;
++}
++
++/*
+  * Given a cpos and len, try to find the refcount record which contains cpos.
+  * 1. If cpos can be found in one refcount record, return the record.
+  * 2. If cpos can't be found, return a fake record which start from cpos
+@@ -983,10 +1080,10 @@ static int ocfs2_get_refcount_rec(struct
+                                 struct buffer_head **ret_bh)
+ {
+       int ret = 0, i, found;
+-      u32 low_cpos;
++      u32 low_cpos, uninitialized_var(cpos_end);
+       struct ocfs2_extent_list *el;
+-      struct ocfs2_extent_rec *tmp, *rec = NULL;
+-      struct ocfs2_extent_block *eb;
++      struct ocfs2_extent_rec *rec = NULL;
++      struct ocfs2_extent_block *eb = NULL;
+       struct buffer_head *eb_bh = NULL, *ref_leaf_bh = NULL;
+       struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
+       struct ocfs2_refcount_block *rb =
+@@ -1034,12 +1131,16 @@ static int ocfs2_get_refcount_rec(struct
+               }
+       }
+-      /* adjust len when we have ocfs2_extent_rec after it. */
+-      if (found && i < le16_to_cpu(el->l_next_free_rec) - 1) {
+-              tmp = &el->l_recs[i+1];
++      if (found) {
++              ret = ocfs2_get_refcount_cpos_end(ci, ref_root_bh,
++                                                eb, el, i, &cpos_end);
++              if (ret) {
++                      mlog_errno(ret);
++                      goto out;
++              }
+-              if (le32_to_cpu(tmp->e_cpos) < cpos + len)
+-                      len = le32_to_cpu(tmp->e_cpos) - cpos;
++              if (cpos_end < low_cpos + len)
++                      len = cpos_end - low_cpos;
+       }
+       ret = ocfs2_read_refcount_block(ci, le64_to_cpu(rec->e_blkno),
diff --git a/queue-2.6.32/ocfs2-set-ms_posixacl-on-remount.patch b/queue-2.6.32/ocfs2-set-ms_posixacl-on-remount.patch
new file mode 100644 (file)
index 0000000..0ddbc3a
--- /dev/null
@@ -0,0 +1,35 @@
+From 57b09bb5e492c37c1e4273fe4e435ffd1d2ddbe0 Mon Sep 17 00:00:00 2001
+From: Jan Kara <jack@suse.cz>
+Date: Thu, 15 Oct 2009 14:54:05 +0200
+Subject: ocfs2: Set MS_POSIXACL on remount
+
+From: Jan Kara <jack@suse.cz>
+
+commit 57b09bb5e492c37c1e4273fe4e435ffd1d2ddbe0 upstream.
+
+We have to set MS_POSIXACL on remount as well. Otherwise the VFS
+would not know we started supporting ACLs after remount and
+thus ACLs would not work.
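+
+The remount hunk uses the usual clear-then-conditionally-set flag
+idiom; a tiny standalone sketch of it (the flag value matches the VFS
+definition, the rest is illustrative):
+
+#include <stdio.h>
+
+#define MS_POSIXACL (1 << 16)
+
+/* clear the bit, then set it again only if the mount option asks */
+static unsigned long apply_acl_opt(unsigned long s_flags, int want_acl)
+{
+        return (s_flags & ~MS_POSIXACL) | (want_acl ? MS_POSIXACL : 0);
+}
+
+int main(void)
+{
+        printf("%#lx\n", apply_acl_opt(0, 1));                  /* 0x10000 */
+        printf("%#lx\n", apply_acl_opt(MS_POSIXACL, 0));        /* 0 */
+        return 0;
+}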
+
+Signed-off-by: Jan Kara <jack@suse.cz>
+Signed-off-by: Joel Becker <joel.becker@oracle.com>
+Signed-off-by: Mark Fasheh <mfasheh@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ fs/ocfs2/super.c |    4 ++++
+ 1 file changed, 4 insertions(+)
+
+--- a/fs/ocfs2/super.c
++++ b/fs/ocfs2/super.c
+@@ -701,6 +701,10 @@ unlock_osb:
+               if (!ocfs2_is_hard_readonly(osb))
+                       ocfs2_set_journal_params(osb);
++
++              sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
++                      ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) ?
++                                                      MS_POSIXACL : 0);
+       }
+ out:
+       unlock_kernel();
diff --git a/queue-2.6.32/powerpc-eeh-fix-a-bug-when-pci-structure-is-null.patch b/queue-2.6.32/powerpc-eeh-fix-a-bug-when-pci-structure-is-null.patch
new file mode 100644 (file)
index 0000000..d250cce
--- /dev/null
@@ -0,0 +1,105 @@
+From 8d3d50bf1913561ef3b1f5b53115c5a481ba9b1e Mon Sep 17 00:00:00 2001
+From: Breno Leitao <leitao@linux.vnet.ibm.com>
+Date: Wed, 3 Feb 2010 05:56:41 +0000
+Subject: powerpc/eeh: Fix a bug when pci structure is null
+
+From: Breno Leitao <leitao@linux.vnet.ibm.com>
+
+commit 8d3d50bf1913561ef3b1f5b53115c5a481ba9b1e upstream.
+
+During an EEH recovery, the pci_dev structure can be null, mainly if an
+EEH event is detected during a PCI config operation. In this case, the
+pci_dev will not be known (and will be null) and the kernel will crash
+with the following message:
+
+Unable to handle kernel paging request for data at address 0x000000a0
+Faulting instruction address: 0xc00000000006b8b4
+Oops: Kernel access of bad area, sig: 11 [#1]
+
+NIP [c00000000006b8b4] .eeh_event_handler+0x10c/0x1a0
+LR [c00000000006b8a8] .eeh_event_handler+0x100/0x1a0
+Call Trace:
+[c0000003a80dff00] [c00000000006b8a8] .eeh_event_handler+0x100/0x1a0
+[c0000003a80dff90] [c000000000031f1c] .kernel_thread+0x54/0x70
+
+The bug occurs because pci_name() tries to dereference a NULL pointer.
+This patch just guarantees that pci_name() is never called on a NULL pointer.
+
+Signed-off-by: Breno Leitao <leitao@linux.vnet.ibm.com>
+Signed-off-by: Linas Vepstas <linasvepstas@gmail.com>
+Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
+Acked-by: Jeff Mahoney <jeffm@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ arch/powerpc/include/asm/ppc-pci.h          |    5 +++++
+ arch/powerpc/platforms/pseries/eeh.c        |    4 ++--
+ arch/powerpc/platforms/pseries/eeh_driver.c |    4 ++--
+ arch/powerpc/platforms/pseries/eeh_event.c  |    2 +-
+ 4 files changed, 10 insertions(+), 5 deletions(-)
+
+--- a/arch/powerpc/include/asm/ppc-pci.h
++++ b/arch/powerpc/include/asm/ppc-pci.h
+@@ -137,6 +137,11 @@ struct device_node * find_device_pe(stru
+ void eeh_sysfs_add_device(struct pci_dev *pdev);
+ void eeh_sysfs_remove_device(struct pci_dev *pdev);
++static inline const char *eeh_pci_name(struct pci_dev *pdev)
++{
++      return pdev ? pci_name(pdev) : "<null>";
++}
++
+ #endif /* CONFIG_EEH */
+ #else /* CONFIG_PCI */
+--- a/arch/powerpc/platforms/pseries/eeh.c
++++ b/arch/powerpc/platforms/pseries/eeh.c
+@@ -491,7 +491,7 @@ int eeh_dn_check_failure(struct device_n
+           pdn->eeh_mode & EEH_MODE_NOCHECK) {
+               ignored_check++;
+               pr_debug("EEH: Ignored check (%x) for %s %s\n",
+-                       pdn->eeh_mode, pci_name (dev), dn->full_name);
++                       pdn->eeh_mode, eeh_pci_name(dev), dn->full_name);
+               return 0;
+       }
+@@ -515,7 +515,7 @@ int eeh_dn_check_failure(struct device_n
+                       printk (KERN_ERR "EEH: %d reads ignored for recovering device at "
+                               "location=%s driver=%s pci addr=%s\n",
+                               pdn->eeh_check_count, location,
+-                              dev->driver->name, pci_name(dev));
++                              dev->driver->name, eeh_pci_name(dev));
+                       printk (KERN_ERR "EEH: Might be infinite loop in %s driver\n",
+                               dev->driver->name);
+                       dump_stack();
+--- a/arch/powerpc/platforms/pseries/eeh_driver.c
++++ b/arch/powerpc/platforms/pseries/eeh_driver.c
+@@ -353,7 +353,7 @@ struct pci_dn * handle_eeh_events (struc
+               location = location ? location : "unknown";
+               printk(KERN_ERR "EEH: Error: Cannot find partition endpoint "
+                               "for location=%s pci addr=%s\n",
+-                      location, pci_name(event->dev));
++                      location, eeh_pci_name(event->dev));
+               return NULL;
+       }
+@@ -384,7 +384,7 @@ struct pci_dn * handle_eeh_events (struc
+               pci_str = pci_name (frozen_pdn->pcidev);
+               drv_str = pcid_name (frozen_pdn->pcidev);
+       } else {
+-              pci_str = pci_name (event->dev);
++              pci_str = eeh_pci_name(event->dev);
+               drv_str = pcid_name (event->dev);
+       }
+       
+--- a/arch/powerpc/platforms/pseries/eeh_event.c
++++ b/arch/powerpc/platforms/pseries/eeh_event.c
+@@ -80,7 +80,7 @@ static int eeh_event_handler(void * dumm
+       eeh_mark_slot(event->dn, EEH_MODE_RECOVERING);
+       printk(KERN_INFO "EEH: Detected PCI bus error on device %s\n",
+-             pci_name(event->dev));
++             eeh_pci_name(event->dev));
+       pdn = handle_eeh_events(event);
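
The fix above routes every diagnostic use of pci_name() through a
NULL-tolerant wrapper rather than adding a NULL check at each call site. A
minimal userspace sketch of the same pattern; the struct and the pci_name()
stand-in are illustrative, not the kernel's actual definitions:

#include <stdio.h>

struct pci_dev { char name[32]; };

/* stand-in for the kernel's pci_name() */
static const char *pci_name(struct pci_dev *pdev) { return pdev->name; }

/* NULL-tolerant wrapper, mirroring eeh_pci_name() in the patch */
static const char *eeh_pci_name(struct pci_dev *pdev)
{
        return pdev ? pci_name(pdev) : "<null>";
}

int main(void)
{
        struct pci_dev dev = { "0000:00:1f.2" };

        printf("%s\n", eeh_pci_name(&dev));   /* prints the device name */
        printf("%s\n", eeh_pci_name(NULL));   /* prints "<null>", no crash */
        return 0;
}
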
diff --git a/queue-2.6.32/reiserfs-fix-oops-while-creating-privroot-with-selinux-enabled.patch b/queue-2.6.32/reiserfs-fix-oops-while-creating-privroot-with-selinux-enabled.patch
new file mode 100644 (file)
index 0000000..7814d20
--- /dev/null
@@ -0,0 +1,56 @@
+From 6cb4aff0a77cc0e6bae9475d62205319e3ebbf3f Mon Sep 17 00:00:00 2001
+From: Jeff Mahoney <jeffm@suse.com>
+Date: Tue, 23 Mar 2010 13:35:38 -0700
+Subject: reiserfs: fix oops while creating privroot with selinux enabled
+
+From: Jeff Mahoney <jeffm@suse.com>
+
+commit 6cb4aff0a77cc0e6bae9475d62205319e3ebbf3f upstream.
+
+Commit 57fe60df ("reiserfs: add atomic addition of selinux attributes
+during inode creation") contains a bug that will cause it to oops when
+mounting a file system that didn't previously contain extended attributes
+on a system using security.* xattrs.
+
+The issue is that while creating the privroot during mount,
+reiserfs_security_init calls reiserfs_xattr_jcreate_nblocks, which
+dereferences the xattr root.  The xattr root doesn't exist, so we get an
+oops.
+
+Addresses http://bugzilla.kernel.org/show_bug.cgi?id=15309
+
+Signed-off-by: Jeff Mahoney <jeffm@suse.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ fs/reiserfs/xattr_security.c   |    2 +-
+ include/linux/reiserfs_xattr.h |    5 +++++
+ 2 files changed, 6 insertions(+), 1 deletion(-)
+
+--- a/fs/reiserfs/xattr_security.c
++++ b/fs/reiserfs/xattr_security.c
+@@ -75,7 +75,7 @@ int reiserfs_security_init(struct inode
+               return error;
+       }
+-      if (sec->length) {
++      if (sec->length && reiserfs_xattrs_initialized(inode->i_sb)) {
+               blocks = reiserfs_xattr_jcreate_nblocks(inode) +
+                        reiserfs_xattr_nblocks(inode, sec->length);
+               /* We don't want to count the directories twice if we have
+--- a/include/linux/reiserfs_xattr.h
++++ b/include/linux/reiserfs_xattr.h
+@@ -70,6 +70,11 @@ int reiserfs_security_write(struct reise
+ void reiserfs_security_free(struct reiserfs_security_handle *sec);
+ #endif
++static inline int reiserfs_xattrs_initialized(struct super_block *sb)
++{
++      return REISERFS_SB(sb)->priv_root != NULL;
++}
++
+ #define xattr_size(size) ((size) + sizeof(struct reiserfs_xattr_header))
+ static inline loff_t reiserfs_xattr_nblocks(struct inode *inode, loff_t size)
+ {
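
The fix above is a plain initialization guard: before computing xattr block
counts, check that the xattr machinery (the priv_root dentry) exists at all.
A toy sketch of the pattern; the structs and the block estimate below are
invented for illustration, not the reiserfs code:

#include <stdio.h>

struct super { void *priv_root; };  /* stand-in for REISERFS_SB(sb)->priv_root */

static int xattrs_initialized(struct super *sb)
{
        return sb->priv_root != NULL;
}

static int security_init(struct super *sb, int sec_length)
{
        int blocks = 0;

        /* only touch xattr state when it actually exists */
        if (sec_length && xattrs_initialized(sb))
                blocks = 1 + sec_length / 512;  /* illustrative estimate */
        return blocks;
}

int main(void)
{
        struct super fresh = { NULL };            /* mount: privroot not yet created */
        struct super ready = { (void *)&ready };  /* any non-NULL priv_root */

        printf("during mount: %d blocks\n", security_init(&fresh, 100)); /* 0, no oops */
        printf("after init:   %d blocks\n", security_init(&ready, 100));
        return 0;
}
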
diff --git a/queue-2.6.32/reiserfs-properly-honor-read-only-devices.patch b/queue-2.6.32/reiserfs-properly-honor-read-only-devices.patch
new file mode 100644 (file)
index 0000000..265606b
--- /dev/null
@@ -0,0 +1,71 @@
+From 3f8b5ee33293d43ca360771b535dfae8c57259dc Mon Sep 17 00:00:00 2001
+From: Jeff Mahoney <jeffm@suse.com>
+Date: Tue, 23 Mar 2010 13:35:39 -0700
+Subject: reiserfs: properly honor read-only devices
+
+From: Jeff Mahoney <jeffm@suse.com>
+
+commit 3f8b5ee33293d43ca360771b535dfae8c57259dc upstream.
+
+The reiserfs journal behaves inconsistently when determining whether to
+allow a mount of a read-only device.
+
+This is due to the use of the continue_replay variable to short circuit
+the journal scanning.  If it's set, it's assumed that there are
+transactions to replay, but there may not be.  If it's unset, it's assumed
+that there aren't any, and that may not be the case either.
+
+I've observed two failure cases:
+1) Where a clean file system on a read-only device refuses to mount
+2) Where a clean file system on a read-only device passes the
+   optimization and then tries writing the journal header to update
+   the latest mount id.
+
+The former is easily observable by using a freshly created file system on
+a read-only loopback device.
+
+This patch moves the check into journal_read_transaction, where it can
+bail out before it's about to replay a transaction.  That way it can go
+through and skip transactions where appropriate, yet still refuse to mount
+a file system with outstanding transactions.
+
+Signed-off-by: Jeff Mahoney <jeffm@suse.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ fs/reiserfs/journal.c |   15 +++++++++------
+ 1 file changed, 9 insertions(+), 6 deletions(-)
+
+--- a/fs/reiserfs/journal.c
++++ b/fs/reiserfs/journal.c
+@@ -2184,6 +2184,15 @@ static int journal_read_transaction(stru
+               brelse(d_bh);
+               return 1;
+       }
++
++      if (bdev_read_only(sb->s_bdev)) {
++              reiserfs_warning(sb, "clm-2076",
++                               "device is readonly, unable to replay log");
++              brelse(c_bh);
++              brelse(d_bh);
++              return -EROFS;
++      }
++
+       trans_id = get_desc_trans_id(desc);
+       /* now we know we've got a good transaction, and it was inside the valid time ranges */
+       log_blocks = kmalloc(get_desc_trans_len(desc) *
+@@ -2422,12 +2431,6 @@ static int journal_read(struct super_blo
+               goto start_log_replay;
+       }
+-      if (continue_replay && bdev_read_only(sb->s_bdev)) {
+-              reiserfs_warning(sb, "clm-2076",
+-                               "device is readonly, unable to replay log");
+-              return -1;
+-      }
+-
+       /* ok, there are transactions that need to be replayed.  start with the first log block, find
+        ** all the valid transactions, and pick out the oldest.
+        */
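
The structural idea of the patch above is to replace an up-front guess
("continue_replay implies there is work to do") with a check at the moment a
transaction is actually about to be replayed. A compilable toy sketch of that
restructuring, with the journal logic reduced to two flags:

#include <errno.h>
#include <stdio.h>

static int device_read_only = 1;
static int have_transaction = 0;

/* refuse only when we are actually about to replay */
static int replay_one_transaction(void)
{
        if (device_read_only)
                return -EROFS;  /* bail out here, not in the caller's guess */
        /* ... replay work would go here ... */
        return 0;
}

static int journal_read(void)
{
        if (!have_transaction)
                return 0;       /* clean log: a read-only mount is fine */
        return replay_one_transaction();
}

int main(void)
{
        printf("clean log on ro device: %d\n", journal_read());  /* 0 */
        have_transaction = 1;
        printf("dirty log on ro device: %d\n", journal_read());  /* -EROFS */
        return 0;
}
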
diff --git a/queue-2.6.32/sched-cputime-introduce-thread_group_times.patch b/queue-2.6.32/sched-cputime-introduce-thread_group_times.patch
new file mode 100644 (file)
index 0000000..9550a70
--- /dev/null
@@ -0,0 +1,320 @@
+From 0cf55e1ec08bb5a22e068309e2d8ba1180ab4239 Mon Sep 17 00:00:00 2001
+From: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
+Date: Wed, 2 Dec 2009 17:28:07 +0900
+Subject: sched, cputime: Introduce thread_group_times()
+
+From: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
+
+commit 0cf55e1ec08bb5a22e068309e2d8ba1180ab4239 upstream.
+
+This is a real fix for the problem of utime/stime values decreasing,
+described in the thread:
+
+   http://lkml.org/lkml/2009/11/3/522
+
+Now cputime is accounted in the following way:
+
+ - {u,s}time in task_struct are increased every time the thread
+   is interrupted by a tick (timer interrupt).
+
+ - When a thread exits, its {u,s}time are added to signal->{u,s}time,
+   after being adjusted by task_times().
+
+ - When all threads in a thread_group exit, the accumulated {u,s}time
+   (and also c{u,s}time) in the signal struct are added to c{u,s}time
+   in the signal struct of the group's parent.
+
+So {u,s}time in task struct are "raw" tick count, while
+{u,s}time and c{u,s}time in signal struct are "adjusted" values.
+
+And accounted values are used by:
+
+ - task_times(), to get the cputime of a thread:
+   This function returns adjusted values that originate from the raw
+   {u,s}time, scaled by the sum_exec_runtime accounted by CFS.
+
+ - thread_group_cputime(), to get the cputime of a thread group:
+   This function returns the sum of the {u,s}time of all living threads
+   in the group, plus the {u,s}time in the signal struct, which is the
+   sum of the adjusted cputimes of all exited threads that belonged to
+   the group.
+
+The problem is the return value of thread_group_cputime(),
+because it is mixed sum of "raw" value and "adjusted" value:
+
+  group's {u,s}time = foreach(thread){{u,s}time} + exited({u,s}time)
+
+This misbehavior can break {u,s}time monotonicity.
+Assume there is a thread whose raw values are greater than its
+adjusted values (e.g. interrupted by 1000Hz ticks 50 times but
+only running for 45ms); when it exits, the group's cputime will
+decrease (e.g. by 5ms).
+
+To fix this, we could do:
+
+  group's {u,s}time = foreach(t){task_times(t)} + exited({u,s}time)
+
+But task_times() contains costly divisions, so applying it to
+every thread should be avoided.
+
+This patch fixes the above problem in the following way:
+
+ - Modify the thread exit path (= __exit_signal()) not to use task_times().
+   This means the {u,s}time in the signal struct accumulate raw values
+   instead of adjusted values.  As a result, thread_group_cputime()
+   returns a pure sum of "raw" values.
+
+ - Introduce a new function thread_group_times(*task, *utime, *stime)
+   that converts the "raw" values of thread_group_cputime() to "adjusted"
+   values, using the same calculation procedure as task_times().
+
+ - Modify the group exit path (= wait_task_zombie()) to use the newly
+   introduced thread_group_times().  This makes the c{u,s}time in the
+   signal struct hold adjusted values, as before this patch.
+
+ - Replace some thread_group_cputime() calls with thread_group_times().
+   These replacements are applied only where the "adjusted" cputime is
+   conveyed to users, and where task_times() is already used nearby.
+   (i.e. sys_times(), getrusage(), and /proc/<PID>/stat.)
+
+This patch has a positive side effect:
+
+ - Before this patch, if a group contains many short-lived threads
+   (e.g. each running 0.9ms and never interrupted by a tick), the
+   group's cputime could be invisible, since each thread's cputime was
+   accumulated after being adjusted: imagining the adjustment function
+   as adj(ticks, runtime),
+     {adj(0, 0.9) + adj(0, 0.9) + ....} = {0 + 0 + ....} = 0.
+   After this patch this no longer happens, because the adjustment is
+   applied after accumulation.
+
+v2:
+ - remove if()s, put new variables into signal_struct.
+
+Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
+Acked-by: Peter Zijlstra <peterz@infradead.org>
+Cc: Spencer Candland <spencer@bluehost.com>
+Cc: Americo Wang <xiyou.wangcong@gmail.com>
+Cc: Oleg Nesterov <oleg@redhat.com>
+Cc: Balbir Singh <balbir@in.ibm.com>
+Cc: Stanislaw Gruszka <sgruszka@redhat.com>
+LKML-Reference: <4B162517.8040909@jp.fujitsu.com>
+Signed-off-by: Ingo Molnar <mingo@elte.hu>
+Signed-off-by: Jiri Slaby <jslaby@suse.cz>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+
+---
+ fs/proc/array.c       |    5 +----
+ include/linux/sched.h |    4 ++++
+ kernel/exit.c         |   20 ++++++++++++--------
+ kernel/fork.c         |    3 +++
+ kernel/sched.c        |   41 +++++++++++++++++++++++++++++++++++++++++
+ kernel/sys.c          |   18 ++++++++----------
+ 6 files changed, 69 insertions(+), 22 deletions(-)
+
+--- a/fs/proc/array.c
++++ b/fs/proc/array.c
+@@ -405,7 +405,6 @@ static int do_task_stat(struct seq_file
+               /* add up live thread stats at the group level */
+               if (whole) {
+-                      struct task_cputime cputime;
+                       struct task_struct *t = task;
+                       do {
+                               min_flt += t->min_flt;
+@@ -416,9 +415,7 @@ static int do_task_stat(struct seq_file
+                       min_flt += sig->min_flt;
+                       maj_flt += sig->maj_flt;
+-                      thread_group_cputime(task, &cputime);
+-                      utime = cputime.utime;
+-                      stime = cputime.stime;
++                      thread_group_times(task, &utime, &stime);
+                       gtime = cputime_add(gtime, sig->gtime);
+               }
+--- a/include/linux/sched.h
++++ b/include/linux/sched.h
+@@ -628,6 +628,9 @@ struct signal_struct {
+       cputime_t utime, stime, cutime, cstime;
+       cputime_t gtime;
+       cputime_t cgtime;
++#ifndef CONFIG_VIRT_CPU_ACCOUNTING
++      cputime_t prev_utime, prev_stime;
++#endif
+       unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw;
+       unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt;
+       unsigned long inblock, oublock, cinblock, coublock;
+@@ -1725,6 +1728,7 @@ static inline void put_task_struct(struc
+ extern cputime_t task_utime(struct task_struct *p);
+ extern cputime_t task_stime(struct task_struct *p);
+ extern cputime_t task_gtime(struct task_struct *p);
++extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st);
+ /*
+  * Per process flags
+--- a/kernel/exit.c
++++ b/kernel/exit.c
+@@ -110,8 +110,8 @@ static void __exit_signal(struct task_st
+                * We won't ever get here for the group leader, since it
+                * will have been the last reference on the signal_struct.
+                */
+-              sig->utime = cputime_add(sig->utime, task_utime(tsk));
+-              sig->stime = cputime_add(sig->stime, task_stime(tsk));
++              sig->utime = cputime_add(sig->utime, tsk->utime);
++              sig->stime = cputime_add(sig->stime, tsk->stime);
+               sig->gtime = cputime_add(sig->gtime, task_gtime(tsk));
+               sig->min_flt += tsk->min_flt;
+               sig->maj_flt += tsk->maj_flt;
+@@ -1205,6 +1205,7 @@ static int wait_task_zombie(struct wait_
+               struct signal_struct *psig;
+               struct signal_struct *sig;
+               unsigned long maxrss;
++              cputime_t tgutime, tgstime;
+               /*
+                * The resource counters for the group leader are in its
+@@ -1220,20 +1221,23 @@ static int wait_task_zombie(struct wait_
+                * need to protect the access to parent->signal fields,
+                * as other threads in the parent group can be right
+                * here reaping other children at the same time.
++               *
++               * We use thread_group_times() to get times for the thread
++               * group, which consolidates times for all threads in the
++               * group including the group leader.
+                */
++              thread_group_times(p, &tgutime, &tgstime);
+               spin_lock_irq(&p->real_parent->sighand->siglock);
+               psig = p->real_parent->signal;
+               sig = p->signal;
+               psig->cutime =
+                       cputime_add(psig->cutime,
+-                      cputime_add(p->utime,
+-                      cputime_add(sig->utime,
+-                                  sig->cutime)));
++                      cputime_add(tgutime,
++                                  sig->cutime));
+               psig->cstime =
+                       cputime_add(psig->cstime,
+-                      cputime_add(p->stime,
+-                      cputime_add(sig->stime,
+-                                  sig->cstime)));
++                      cputime_add(tgstime,
++                                  sig->cstime));
+               psig->cgtime =
+                       cputime_add(psig->cgtime,
+                       cputime_add(p->gtime,
+--- a/kernel/fork.c
++++ b/kernel/fork.c
+@@ -884,6 +884,9 @@ static int copy_signal(unsigned long clo
+       sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero;
+       sig->gtime = cputime_zero;
+       sig->cgtime = cputime_zero;
++#ifndef CONFIG_VIRT_CPU_ACCOUNTING
++      sig->prev_utime = sig->prev_stime = cputime_zero;
++#endif
+       sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0;
+       sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0;
+       sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0;
+--- a/kernel/sched.c
++++ b/kernel/sched.c
+@@ -5215,6 +5215,16 @@ cputime_t task_stime(struct task_struct
+ {
+       return p->stime;
+ }
++
++void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
++{
++      struct task_cputime cputime;
++
++      thread_group_cputime(p, &cputime);
++
++      *ut = cputime.utime;
++      *st = cputime.stime;
++}
+ #else
+ #ifndef nsecs_to_cputime
+@@ -5258,6 +5268,37 @@ cputime_t task_stime(struct task_struct
+       return p->prev_stime;
+ }
++
++/*
++ * Must be called with siglock held.
++ */
++void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
++{
++      struct signal_struct *sig = p->signal;
++      struct task_cputime cputime;
++      cputime_t rtime, utime, total;
++
++      thread_group_cputime(p, &cputime);
++
++      total = cputime_add(cputime.utime, cputime.stime);
++      rtime = nsecs_to_cputime(cputime.sum_exec_runtime);
++
++      if (total) {
++              u64 temp;
++
++              temp = (u64)(rtime * cputime.utime);
++              do_div(temp, total);
++              utime = (cputime_t)temp;
++      } else
++              utime = rtime;
++
++      sig->prev_utime = max(sig->prev_utime, utime);
++      sig->prev_stime = max(sig->prev_stime,
++                            cputime_sub(rtime, sig->prev_utime));
++
++      *ut = sig->prev_utime;
++      *st = sig->prev_stime;
++}
+ #endif
+ inline cputime_t task_gtime(struct task_struct *p)
+--- a/kernel/sys.c
++++ b/kernel/sys.c
+@@ -911,16 +911,15 @@ change_okay:
+ void do_sys_times(struct tms *tms)
+ {
+-      struct task_cputime cputime;
+-      cputime_t cutime, cstime;
++      cputime_t tgutime, tgstime, cutime, cstime;
+-      thread_group_cputime(current, &cputime);
+       spin_lock_irq(&current->sighand->siglock);
++      thread_group_times(current, &tgutime, &tgstime);
+       cutime = current->signal->cutime;
+       cstime = current->signal->cstime;
+       spin_unlock_irq(&current->sighand->siglock);
+-      tms->tms_utime = cputime_to_clock_t(cputime.utime);
+-      tms->tms_stime = cputime_to_clock_t(cputime.stime);
++      tms->tms_utime = cputime_to_clock_t(tgutime);
++      tms->tms_stime = cputime_to_clock_t(tgstime);
+       tms->tms_cutime = cputime_to_clock_t(cutime);
+       tms->tms_cstime = cputime_to_clock_t(cstime);
+ }
+@@ -1338,8 +1337,7 @@ static void k_getrusage(struct task_stru
+ {
+       struct task_struct *t;
+       unsigned long flags;
+-      cputime_t utime, stime;
+-      struct task_cputime cputime;
++      cputime_t tgutime, tgstime, utime, stime;
+       unsigned long maxrss = 0;
+       memset((char *) r, 0, sizeof *r);
+@@ -1373,9 +1371,9 @@ static void k_getrusage(struct task_stru
+                               break;
+               case RUSAGE_SELF:
+-                      thread_group_cputime(p, &cputime);
+-                      utime = cputime_add(utime, cputime.utime);
+-                      stime = cputime_add(stime, cputime.stime);
++                      thread_group_times(p, &tgutime, &tgstime);
++                      utime = cputime_add(utime, tgutime);
++                      stime = cputime_add(stime, tgstime);
+                       r->ru_nvcsw += p->signal->nvcsw;
+                       r->ru_nivcsw += p->signal->nivcsw;
+                       r->ru_minflt += p->signal->min_flt;
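
A note on the math: the conversion thread_group_times() performs is the same
proportional split task_times() uses - distribute the CFS-accounted runtime
rtime between utime and stime in the ratio of the raw tick counts, then
enforce monotonicity through the prev_* maxima (the kernel does this under
siglock). A standalone arithmetic sketch with plain integers standing in for
cputime_t; the 50-tick/45-unit numbers mirror the example in the changelog
above:

#include <stdio.h>
#include <stdint.h>

static uint64_t prev_utime, prev_stime;

static void group_times(uint64_t raw_utime, uint64_t raw_stime,
                        uint64_t rtime, uint64_t *ut, uint64_t *st)
{
        uint64_t total = raw_utime + raw_stime;
        /* scale runtime by the raw user/total ratio; utime <= rtime holds */
        uint64_t utime = total ? (rtime * raw_utime) / total : rtime;

        /* never report values that go backwards; this assumes rtime itself
         * is monotonic, as cumulative runtime is in the kernel */
        if (utime > prev_utime)
                prev_utime = utime;
        if (rtime - prev_utime > prev_stime)
                prev_stime = rtime - prev_utime;

        *ut = prev_utime;
        *st = prev_stime;
}

int main(void)
{
        uint64_t ut, st;

        /* 50 raw user ticks, 0 system ticks, but only 45 units of real
         * runtime: the raw sum (50) would overstate it; scaling caps it */
        group_times(50, 0, 45, &ut, &st);
        printf("ut=%llu st=%llu\n", (unsigned long long)ut,
               (unsigned long long)st);
        return 0;
}
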
diff --git a/queue-2.6.32/sched-fix-granularity-of-task_u-stime.patch b/queue-2.6.32/sched-fix-granularity-of-task_u-stime.patch
new file mode 100644 (file)
index 0000000..c57e80f
--- /dev/null
@@ -0,0 +1,103 @@
+From 761b1d26df542fd5eb348837351e4d2f3bc7bffe Mon Sep 17 00:00:00 2001
+From: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
+Date: Thu, 12 Nov 2009 13:33:45 +0900
+Subject: sched: Fix granularity of task_u/stime()
+
+From: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
+
+commit 761b1d26df542fd5eb348837351e4d2f3bc7bffe upstream.
+
+Originally task_s/utime() were designed to return clock_t but
+were later changed to return cputime_t by the following commit:
+
+  commit efe567fc8281661524ffa75477a7c4ca9b466c63
+  Author: Christian Borntraeger <borntraeger@de.ibm.com>
+  Date:   Thu Aug 23 15:18:02 2007 +0200
+
+It only changed the type of the return value, not the
+implementation. As a result, the granularity of task_s/utime()
+is still that of clock_t, not that of cputime_t.
+
+So using task_s/utime() in __exit_signal() makes the values
+accumulated into the signal struct rounded and coarse
+grained.
+
+This patch removes the casts to clock_t in task_u/stime(), to keep
+the granularity of cputime_t throughout the calculation.
+
+v2:
+  Use div_u64() to avoid error "undefined reference to `__udivdi3`"
+  on some 32bit systems.
+
+Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
+Acked-by: Peter Zijlstra <peterz@infradead.org>
+Cc: xiyou.wangcong@gmail.com
+Cc: Spencer Candland <spencer@bluehost.com>
+Cc: Oleg Nesterov <oleg@redhat.com>
+Cc: Stanislaw Gruszka <sgruszka@redhat.com>
+LKML-Reference: <4AFB9029.9000208@jp.fujitsu.com>
+Signed-off-by: Ingo Molnar <mingo@elte.hu>
+Signed-off-by: Jiri Slaby <jslaby@suse.cz>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ kernel/sched.c |   22 +++++++++++++---------
+ 1 file changed, 13 insertions(+), 9 deletions(-)
+
+--- a/kernel/sched.c
++++ b/kernel/sched.c
+@@ -5216,41 +5216,45 @@ cputime_t task_stime(struct task_struct
+       return p->stime;
+ }
+ #else
++
++#ifndef nsecs_to_cputime
++# define nsecs_to_cputime(__nsecs) \
++      msecs_to_cputime(div_u64((__nsecs), NSEC_PER_MSEC))
++#endif
++
+ cputime_t task_utime(struct task_struct *p)
+ {
+-      clock_t utime = cputime_to_clock_t(p->utime),
+-              total = utime + cputime_to_clock_t(p->stime);
++      cputime_t utime = p->utime, total = utime + p->stime;
+       u64 temp;
+       /*
+        * Use CFS's precise accounting:
+        */
+-      temp = (u64)nsec_to_clock_t(p->se.sum_exec_runtime);
++      temp = (u64)nsecs_to_cputime(p->se.sum_exec_runtime);
+       if (total) {
+               temp *= utime;
+               do_div(temp, total);
+       }
+-      utime = (clock_t)temp;
++      utime = (cputime_t)temp;
+-      p->prev_utime = max(p->prev_utime, clock_t_to_cputime(utime));
++      p->prev_utime = max(p->prev_utime, utime);
+       return p->prev_utime;
+ }
+ cputime_t task_stime(struct task_struct *p)
+ {
+-      clock_t stime;
++      cputime_t stime;
+       /*
+        * Use CFS's precise accounting. (we subtract utime from
+        * the total, to make sure the total observed by userspace
+        * grows monotonically - apps rely on that):
+        */
+-      stime = nsec_to_clock_t(p->se.sum_exec_runtime) -
+-                      cputime_to_clock_t(task_utime(p));
++      stime = nsecs_to_cputime(p->se.sum_exec_runtime) - task_utime(p);
+       if (stime >= 0)
+-              p->prev_stime = max(p->prev_stime, clock_t_to_cputime(stime));
++              p->prev_stime = max(p->prev_stime, stime);
+       return p->prev_stime;
+ }
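
The granularity loss described above is easy to reproduce with plain
arithmetic: a round trip through clock_t (USER_HZ = 100) truncates runtime
to 10ms units, while converting straight to a jiffies-based cputime_t keeps
finer resolution. The constants below are illustrative assumptions
(HZ = 1000), not the kernel's exact macros:

#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_MSEC 1000000ULL
#define USER_HZ 100ULL   /* clock_t resolution: 10ms units */
#define HZ      1000ULL  /* assumed cputime_t (jiffies) resolution: 1ms */

int main(void)
{
        uint64_t runtime_ns = 123456789;  /* ~123.46ms of CFS runtime */

        /* old path: nsec -> clock_t, truncating to 10ms units */
        uint64_t as_clock_t = runtime_ns * USER_HZ / 1000000000ULL;
        /* new path: nsec -> msec -> cputime_t, keeping 1ms units */
        uint64_t as_cputime = runtime_ns / NSEC_PER_MSEC * HZ / 1000;

        printf("clock_t units: %llu (= %llu ms)\n",
               (unsigned long long)as_clock_t,
               (unsigned long long)(as_clock_t * 1000 / USER_HZ));  /* 120 ms */
        printf("cputime units: %llu (= %llu ms)\n",
               (unsigned long long)as_cputime,
               (unsigned long long)(as_cputime * 1000 / HZ));       /* 123 ms */
        return 0;
}
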
index 9f0301de405c31848db041ef3a59d15d2b3fde08..3eb261dfe892e107bdc715f2dfa8d7e8c89a25b8 100644 (file)
@@ -40,3 +40,71 @@ irq-add-new-irq-flag-irqf_no_suspend.patch
 xen-do-not-suspend-ipi-irqs.patch
 ext4-fix-freeze-deadlock-under-io.patch
 drm-i915-use-rsen-instead-of-htplg-for-tfp410-monitor-detection.patch
+btrfs-avoid-superfluous-tree-log-writeout.patch
+btrfs-add-btrfs_duplicate_item.patch
+btrfs-rewrite-btrfs_drop_extents.patch
+btrfs-fix-disk_i_size-update-corner-case.patch
+btrfs-avoid-orphan-inodes-cleanup-while-replaying-log.patch
+btrfs-avoid-orphan-inodes-cleanup-during-committing-transaction.patch
+btrfs-make-fallocate-2-more-enospc-friendly.patch
+btrfs-make-truncate-2-more-enospc-friendly.patch
+btrfs-pass-transaction-handle-to-security-and-acl-initialization-functions.patch
+btrfs-add-delayed-iput.patch
+btrfs-fix-btrfs_drop_extent_cache-for-skip-pinned-case.patch
+btrfs-fix-per-root-used-space-accounting.patch
+btrfs-don-t-add-extent-0-to-the-free-space-cache-v2.patch
+btrfs-fail-mount-on-bad-mount-options.patch
+btrfs-deny-sys_link-across-subvolumes.patch
+btrfs-show-discard-option-in-proc-mounts.patch
+btrfs-make-metadata-chunks-smaller.patch
+btrfs-make-sure-fallocate-properly-starts-a-transaction.patch
+btrfs-fix-missing-last-entry-in-readdir-3.patch
+btrfs-align-offsets-for-btrfs_ordered_update_i_size.patch
+btrfs-fix-memory-leaks-in-error-paths.patch
+btrfs-fix-race-in-btrfs_mark_extent_written.patch
+btrfs-fix-regression-in-orphan-cleanup.patch
+btrfs-deal-with-null-acl-sent-to-btrfs_set_acl.patch
+btrfs-fix-possible-panic-on-unmount.patch
+btrfs-use-correct-values-when-updating-inode-i_size-on-fallocate.patch
+btrfs-fix-a-memory-leak-in-btrfs_init_acl.patch
+btrfs-run-orphan-cleanup-on-default-fs-root.patch
+btrfs-do-not-mark-the-chunk-as-readonly-if-in-degraded-mode.patch
+btrfs-check-return-value-of-open_bdev_exclusive-properly.patch
+btrfs-check-total-number-of-devices-when-removing-missing.patch
+btrfs-fix-race-between-allocate-and-release-extent-buffer.patch
+btrfs-make-error-return-negative-in-btrfs_sync_file.patch
+btrfs-remove-bug_on-due-to-mounting-bad-filesystem.patch
+btrfs-fix-oopsen-when-dropping-empty-tree.patch
+btrfs-do-not-try-and-lookup-the-file-extent-when-finishing-ordered-io.patch
+btrfs-apply-updated-fallocate-i_size-fix.patch
+btrfs-btrfs_mark_extent_written-uses-the-wrong-slot.patch
+btrfs-kfree-correct-pointer-during-mount-option-parsing.patch
+nohz-introduce-arch_needs_cpu.patch
+nohz-reuse-ktime-in-sub-functions-of-tick_check_idle.patch
+timekeeping-fix-clock_gettime-vsyscall-time-warp.patch
+sched-fix-granularity-of-task_u-stime.patch
+sched-cputime-introduce-thread_group_times.patch
+mutex-don-t-spin-when-the-owner-cpu-is-offline-or-other-weird-cases.patch
+fix-sba-iommu-to-handle-allocation-failure-properly.patch
+crypto-testmgr-fix-complain-about-lack-test-for-internal-used-algorithm.patch
+memory-hotplug-fix-a-bug-on-dev-mem-for-64-bit-kernels.patch
+x86-fix-out-of-order-of-gsi.patch
+hwpoison-remove-the-anonymous-entry.patch
+hwpoison-abort-on-failed-unmap.patch
+powerpc-eeh-fix-a-bug-when-pci-structure-is-null.patch
+acpi-fix-regression-where-_ppc-is-not-read-at-boot-even-when-ignore_ppc-0.patch
+ext4-make-sure-the-move_ext-ioctl-can-t-overwrite-append-only-files.patch
+ext4-fix-optional-arg-mount-options.patch
+reiserfs-properly-honor-read-only-devices.patch
+reiserfs-fix-oops-while-creating-privroot-with-selinux-enabled.patch
+dlm-always-use-gfp_nofs.patch
+dlm-fix-ordering-of-bast-and-cast.patch
+dlm-send-reply-before-bast.patch
+ocfs2-find-proper-end-cpos-for-a-leaf-refcount-block.patch
+ocfs2-set-ms_posixacl-on-remount.patch
+skip-check-for-mandatory-locks-when-unlocking.patch
+loop-update-mtime-when-writing-using-aops.patch
+aic79xx-check-for-non-null-scb-in-ahd_handle_nonpkt_busfree.patch
+ibmvfc-fix-command-completion-handling.patch
+ibmvfc-reduce-error-recovery-timeout.patch
+md-raid1-delay-reads-that-could-overtake-behind-writes.patch
diff --git a/queue-2.6.32/skip-check-for-mandatory-locks-when-unlocking.patch b/queue-2.6.32/skip-check-for-mandatory-locks-when-unlocking.patch
new file mode 100644 (file)
index 0000000..bb8e7c5
--- /dev/null
@@ -0,0 +1,36 @@
+From ee860b6a650360c91f5d5f9a94262aad9be90015 Mon Sep 17 00:00:00 2001
+From: Sachin Prabhu <sprabhu@redhat.com>
+Date: Wed, 10 Mar 2010 10:28:40 -0500
+Subject: [PATCH] Skip check for mandatory locks when unlocking
+
+From: Sachin Prabhu <sprabhu@redhat.com>
+
+commit ee860b6a650360c91f5d5f9a94262aad9be90015 upstream.
+
+ocfs2_lock() will skip locks on a file whose mode is set to 02666. This
+is a problem in cases where the mode of the file is changed after a
+process has obtained a lock on the file.
+
+ocfs2_lock() should skip the check for mandatory locks when unlocking a
+file.
+
+Signed-off-by: Sachin Prabhu <sprabhu@redhat.com>
+Signed-off-by: Joel Becker <joel.becker@oracle.com>
+Signed-off-by: Neil Brown <neilb@suse.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ fs/ocfs2/locks.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/fs/ocfs2/locks.c
++++ b/fs/ocfs2/locks.c
+@@ -133,7 +133,7 @@ int ocfs2_lock(struct file *file, int cm
+       if (!(fl->fl_flags & FL_POSIX))
+               return -ENOLCK;
+-      if (__mandatory_lock(inode))
++      if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK)
+               return -ENOLCK;
+       return ocfs2_plock(osb->cconn, OCFS2_I(inode)->ip_blkno, file, cmd, fl);
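
The scenario the patch above fixes is a mode change to 02666 after a lock was
taken: the old gate then refused even the unlock. A toy sketch of the fixed
gate, with the mandatory-lock test reduced to a flag (the kernel's
__mandatory_lock() inspects the inode's mode bits instead):

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>

static int mandatory_lock_mode;  /* becomes true once the mode is 02666 */

/* sketch of the fixed check at the top of ocfs2_lock() */
static int lock_gate(int fl_type)
{
        if (mandatory_lock_mode && fl_type != F_UNLCK)
                return -ENOLCK;
        return 0;  /* would proceed to ocfs2_plock() */
}

int main(void)
{
        printf("lock before chmod:  %d\n", lock_gate(F_WRLCK));   /* 0 */
        mandatory_lock_mode = 1;               /* chmod 02666 happens here */
        printf("unlock after chmod: %d\n", lock_gate(F_UNLCK));   /* 0; was -ENOLCK */
        return 0;
}
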
diff --git a/queue-2.6.32/timekeeping-fix-clock_gettime-vsyscall-time-warp.patch b/queue-2.6.32/timekeeping-fix-clock_gettime-vsyscall-time-warp.patch
new file mode 100644 (file)
index 0000000..3ec98c2
--- /dev/null
@@ -0,0 +1,159 @@
+From 0696b711e4be45fa104c12329f617beb29c03f78 Mon Sep 17 00:00:00 2001
+From: Lin Ming <ming.m.lin@intel.com>
+Date: Tue, 17 Nov 2009 13:49:50 +0800
+Subject: timekeeping: Fix clock_gettime vsyscall time warp
+
+From: Lin Ming <ming.m.lin@intel.com>
+
+commit 0696b711e4be45fa104c12329f617beb29c03f78 upstream.
+
+Since commit 0a544198 "timekeeping: Move NTP adjusted clock multiplier
+to struct timekeeper" the clock multiplier of vsyscall is updated with
+the unmodified clock multiplier of the clock source and not with the
+NTP adjusted multiplier of the timekeeper.
+
+This causes time warps observable from user space:
+new CLOCK-warp maximum: 120 nsecs,  00000025c337c537 -> 00000025c337c4bf
+
+Add a new argument "mult" to update_vsyscall() and hand in the
+timekeeping internal NTP adjusted multiplier.
+
+Signed-off-by: Lin Ming <ming.m.lin@intel.com>
+Cc: "Zhang Yanmin" <yanmin_zhang@linux.intel.com>
+Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
+Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
+Cc: Tony Luck <tony.luck@intel.com>
+LKML-Reference: <1258436990.17765.83.camel@minggr.sh.intel.com>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Signed-off-by: Kurt Garloff <garloff@suse.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ arch/ia64/kernel/time.c       |    4 ++--
+ arch/powerpc/kernel/time.c    |    5 +++--
+ arch/s390/kernel/time.c       |    3 ++-
+ arch/x86/kernel/vsyscall_64.c |    5 +++--
+ include/linux/clocksource.h   |    6 ++++--
+ kernel/time/timekeeping.c     |    6 +++---
+ 6 files changed, 17 insertions(+), 12 deletions(-)
+
+--- a/arch/ia64/kernel/time.c
++++ b/arch/ia64/kernel/time.c
+@@ -473,7 +473,7 @@ void update_vsyscall_tz(void)
+ {
+ }
+-void update_vsyscall(struct timespec *wall, struct clocksource *c)
++void update_vsyscall(struct timespec *wall, struct clocksource *c, u32 mult)
+ {
+         unsigned long flags;
+@@ -481,7 +481,7 @@ void update_vsyscall(struct timespec *wa
+         /* copy fsyscall clock data */
+         fsyscall_gtod_data.clk_mask = c->mask;
+-        fsyscall_gtod_data.clk_mult = c->mult;
++        fsyscall_gtod_data.clk_mult = mult;
+         fsyscall_gtod_data.clk_shift = c->shift;
+         fsyscall_gtod_data.clk_fsys_mmio = c->fsys_mmio;
+         fsyscall_gtod_data.clk_cycle_last = c->cycle_last;
+--- a/arch/powerpc/kernel/time.c
++++ b/arch/powerpc/kernel/time.c
+@@ -864,7 +864,8 @@ static cycle_t timebase_read(struct cloc
+       return (cycle_t)get_tb();
+ }
+-void update_vsyscall(struct timespec *wall_time, struct clocksource *clock)
++void update_vsyscall(struct timespec *wall_time, struct clocksource *clock,
++                   u32 mult)
+ {
+       u64 t2x, stamp_xsec;
+@@ -877,7 +878,7 @@ void update_vsyscall(struct timespec *wa
+       /* XXX this assumes clock->shift == 22 */
+       /* 4611686018 ~= 2^(20+64-22) / 1e9 */
+-      t2x = (u64) clock->mult * 4611686018ULL;
++      t2x = (u64) mult * 4611686018ULL;
+       stamp_xsec = (u64) xtime.tv_nsec * XSEC_PER_SEC;
+       do_div(stamp_xsec, 1000000000);
+       stamp_xsec += (u64) xtime.tv_sec * XSEC_PER_SEC;
+--- a/arch/s390/kernel/time.c
++++ b/arch/s390/kernel/time.c
+@@ -214,7 +214,8 @@ struct clocksource * __init clocksource_
+       return &clocksource_tod;
+ }
+-void update_vsyscall(struct timespec *wall_time, struct clocksource *clock)
++void update_vsyscall(struct timespec *wall_time, struct clocksource *clock,
++                   u32 mult)
+ {
+       if (clock != &clocksource_tod)
+               return;
+--- a/arch/x86/kernel/vsyscall_64.c
++++ b/arch/x86/kernel/vsyscall_64.c
+@@ -73,7 +73,8 @@ void update_vsyscall_tz(void)
+       write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
+ }
+-void update_vsyscall(struct timespec *wall_time, struct clocksource *clock)
++void update_vsyscall(struct timespec *wall_time, struct clocksource *clock,
++                   u32 mult)
+ {
+       unsigned long flags;
+@@ -82,7 +83,7 @@ void update_vsyscall(struct timespec *wa
+       vsyscall_gtod_data.clock.vread = clock->vread;
+       vsyscall_gtod_data.clock.cycle_last = clock->cycle_last;
+       vsyscall_gtod_data.clock.mask = clock->mask;
+-      vsyscall_gtod_data.clock.mult = clock->mult;
++      vsyscall_gtod_data.clock.mult = mult;
+       vsyscall_gtod_data.clock.shift = clock->shift;
+       vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec;
+       vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec;
+--- a/include/linux/clocksource.h
++++ b/include/linux/clocksource.h
+@@ -282,10 +282,12 @@ extern struct clocksource * __init __wea
+ extern void clocksource_mark_unstable(struct clocksource *cs);
+ #ifdef CONFIG_GENERIC_TIME_VSYSCALL
+-extern void update_vsyscall(struct timespec *ts, struct clocksource *c);
++extern void
++update_vsyscall(struct timespec *ts, struct clocksource *c, u32 mult);
+ extern void update_vsyscall_tz(void);
+ #else
+-static inline void update_vsyscall(struct timespec *ts, struct clocksource *c)
++static inline void
++update_vsyscall(struct timespec *ts, struct clocksource *c, u32 mult)
+ {
+ }
+--- a/kernel/time/timekeeping.c
++++ b/kernel/time/timekeeping.c
+@@ -177,7 +177,7 @@ void timekeeping_leap_insert(int leapsec
+ {
+       xtime.tv_sec += leapsecond;
+       wall_to_monotonic.tv_sec -= leapsecond;
+-      update_vsyscall(&xtime, timekeeper.clock);
++      update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult);
+ }
+ #ifdef CONFIG_GENERIC_TIME
+@@ -337,7 +337,7 @@ int do_settimeofday(struct timespec *tv)
+       timekeeper.ntp_error = 0;
+       ntp_clear();
+-      update_vsyscall(&xtime, timekeeper.clock);
++      update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult);
+       write_sequnlock_irqrestore(&xtime_lock, flags);
+@@ -822,7 +822,7 @@ void update_wall_time(void)
+       update_xtime_cache(nsecs);
+       /* check to see if there is a new clocksource to use */
+-      update_vsyscall(&xtime, timekeeper.clock);
++      update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult);
+ }
+ /**
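
Why the raw multiplier warps time: both the kernel and the vsyscall convert
cycle deltas with nsec = (cycles * mult) >> shift, so if the vsyscall path
uses the clocksource's unadjusted mult while the timekeeper accumulates with
the NTP-slewed mult, the two clocks diverge and userspace can see time jump
backwards across a vsyscall/syscall boundary. A standalone sketch with
made-up numbers, not a real clocksource:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        uint64_t cycles   = 1000000;  /* cycles since cycle_last */
        uint32_t shift    = 10;
        uint32_t raw_mult = 1024;     /* clocksource->mult */
        uint32_t ntp_mult = 1025;     /* timekeeper's mult after NTP slewing */

        /* core timekeeping conversion: nsec = (cycles * mult) >> shift */
        uint64_t kernel_ns   = (cycles * ntp_mult) >> shift;  /* accumulated */
        uint64_t vsyscall_ns = (cycles * raw_mult) >> shift;  /* pre-fix view */

        /* the difference shows up to userspace as a backwards jump */
        printf("kernel: %llu ns, vsyscall: %llu ns, warp: %lld ns\n",
               (unsigned long long)kernel_ns,
               (unsigned long long)vsyscall_ns,
               (long long)(kernel_ns - vsyscall_ns));
        return 0;
}
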
diff --git a/queue-2.6.32/x86-fix-out-of-order-of-gsi.patch b/queue-2.6.32/x86-fix-out-of-order-of-gsi.patch
new file mode 100644 (file)
index 0000000..467a7cf
--- /dev/null
@@ -0,0 +1,176 @@
+From fad539956c9e69749a03f7817d22d1bab87657bf Mon Sep 17 00:00:00 2001
+From: Eric W. Biederman <ebiederm@xmission.com>
+Date: Sun, 28 Feb 2010 01:06:34 -0800
+Subject: x86: Fix out of order of gsi
+
+From: Eric W. Biederman <ebiederm@xmission.com>
+
+commit fad539956c9e69749a03f7817d22d1bab87657bf upstream.
+
+Iranna D Ankad reported that IBM x3950 systems have boot
+problems after this commit:
+
+ |
+ | commit b9c61b70075c87a8612624736faf4a2de5b1ed30
+ |
+ |    x86/pci: update pirq_enable_irq() to setup io apic routing
+ |
+
+The problem is that with the patch, the machine freezes when the
+console=ttyS0,... kernel serial parameter is passed.
+
+It seems to freeze at DVD initialization, and the whole problem
+seems to be DVD/pata related, but is somehow exposed through the
+serial parameter.
+
+Such apic problems can expose really weird behavior:
+
+  ACPI: IOAPIC (id[0x10] address[0xfecff000] gsi_base[0])
+  IOAPIC[0]: apic_id 16, version 0, address 0xfecff000, GSI 0-2
+  ACPI: IOAPIC (id[0x0f] address[0xfec00000] gsi_base[3])
+  IOAPIC[1]: apic_id 15, version 0, address 0xfec00000, GSI 3-38
+  ACPI: IOAPIC (id[0x0e] address[0xfec01000] gsi_base[39])
+  IOAPIC[2]: apic_id 14, version 0, address 0xfec01000, GSI 39-74
+  ACPI: INT_SRC_OVR (bus 0 bus_irq 1 global_irq 4 dfl dfl)
+  ACPI: INT_SRC_OVR (bus 0 bus_irq 0 global_irq 5 dfl dfl)
+  ACPI: INT_SRC_OVR (bus 0 bus_irq 3 global_irq 6 dfl dfl)
+  ACPI: INT_SRC_OVR (bus 0 bus_irq 4 global_irq 7 dfl dfl)
+  ACPI: INT_SRC_OVR (bus 0 bus_irq 6 global_irq 9 dfl dfl)
+  ACPI: INT_SRC_OVR (bus 0 bus_irq 7 global_irq 10 dfl dfl)
+  ACPI: INT_SRC_OVR (bus 0 bus_irq 8 global_irq 11 low edge)
+  ACPI: INT_SRC_OVR (bus 0 bus_irq 9 global_irq 12 dfl dfl)
+  ACPI: INT_SRC_OVR (bus 0 bus_irq 12 global_irq 15 dfl dfl)
+  ACPI: INT_SRC_OVR (bus 0 bus_irq 13 global_irq 16 dfl dfl)
+  ACPI: INT_SRC_OVR (bus 0 bus_irq 14 global_irq 17 low edge)
+  ACPI: INT_SRC_OVR (bus 0 bus_irq 15 global_irq 18 dfl dfl)
+
+It turns out that the system has three io apic controllers, but the
+boot ioapic routing is in the second one, and its gsi_base is
+not 0 - it is using a bunch of INT_SRC_OVR...
+
+So these recent changes:
+
+ 1. only set routing for the first io apic controller
+ 2. assume irq = gsi
+
+... will break that system.
+
+So try to remap those gsis; we need to separate boot_ioapic_idx
+detection out of enable_IO_APIC() and call it early.
+
+So introduce boot_ioapic_idx, and remap_ioapic_gsi()...
+
+ -v2: shift gsi with delta instead of gsi_base of boot_ioapic_idx
+
+ -v3: double check with find_isa_irq_apic(0, mp_INT) to get right
+      boot_ioapic_idx
+
+ -v4: nr_legacy_irqs
+
+ -v5: add print out for boot_ioapic_idx, and also make it could be
+      applied for current kernel and previous kernel
+
+ -v6: add bus_irq in acpi_sci_ioapic_setup, so we can get the override
+      for the right sci mapping...
+
+ -v7: looks like pnpacpi gets irq instead of gsi, so we need to revert
+      them back...
+
+ -v8: split into two patches
+
+ -v9: according to Eric, use fixed 16 for shifting instead of remap
+
+ -v10: still need to touch rsparser.c
+
+ -v11: just revert back to the way Eric suggested...
+      anyway the ioapic in first ioapic is blocked by second...
+
+ -v12: two patches, this one will add more loop but check apic_id and irq > 16
+
+Reported-by: Iranna D Ankad <iranna.ankad@in.ibm.com>
+Bisected-by: Iranna D Ankad <iranna.ankad@in.ibm.com>
+Tested-by: Gary Hade <garyhade@us.ibm.com>
+Signed-off-by: Yinghai Lu <yinghai@kernel.org>
+Cc: Eric W. Biederman <ebiederm@xmission.com>
+Cc: Thomas Renninger <trenn@suse.de>
+Cc: Eric W. Biederman <ebiederm@xmission.com>
+Cc: Suresh Siddha <suresh.b.siddha@intel.com>
+Cc: len.brown@intel.com
+LKML-Reference: <4B8A321A.1000008@kernel.org>
+Signed-off-by: Ingo Molnar <mingo@elte.hu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ arch/x86/kernel/apic/io_apic.c |   28 ++++++++++------------------
+ 1 file changed, 10 insertions(+), 18 deletions(-)
+
+--- a/arch/x86/kernel/apic/io_apic.c
++++ b/arch/x86/kernel/apic/io_apic.c
+@@ -1484,7 +1484,7 @@ static struct {
+ static void __init setup_IO_APIC_irqs(void)
+ {
+-      int apic_id = 0, pin, idx, irq;
++      int apic_id, pin, idx, irq;
+       int notcon = 0;
+       struct irq_desc *desc;
+       struct irq_cfg *cfg;
+@@ -1492,14 +1492,7 @@ static void __init setup_IO_APIC_irqs(vo
+       apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
+-#ifdef CONFIG_ACPI
+-      if (!acpi_disabled && acpi_ioapic) {
+-              apic_id = mp_find_ioapic(0);
+-              if (apic_id < 0)
+-                      apic_id = 0;
+-      }
+-#endif
+-
++      for (apic_id = 0; apic_id < nr_ioapics; apic_id++)
+       for (pin = 0; pin < nr_ioapic_registers[apic_id]; pin++) {
+               idx = find_irq_entry(apic_id, pin, mp_INT);
+               if (idx == -1) {
+@@ -1521,6 +1514,9 @@ static void __init setup_IO_APIC_irqs(vo
+               irq = pin_2_irq(idx, apic_id, pin);
++              if ((apic_id > 0) && (irq > 16))
++                      continue;
++
+               /*
+                * Skip the timer IRQ if there's a quirk handler
+                * installed and if it returns 1:
+@@ -4083,27 +4079,23 @@ int acpi_get_override_irq(int bus_irq, i
+ #ifdef CONFIG_SMP
+ void __init setup_ioapic_dest(void)
+ {
+-      int pin, ioapic = 0, irq, irq_entry;
++      int pin, ioapic, irq, irq_entry;
+       struct irq_desc *desc;
+       const struct cpumask *mask;
+       if (skip_ioapic_setup == 1)
+               return;
+-#ifdef CONFIG_ACPI
+-      if (!acpi_disabled && acpi_ioapic) {
+-              ioapic = mp_find_ioapic(0);
+-              if (ioapic < 0)
+-                      ioapic = 0;
+-      }
+-#endif
+-
++      for (ioapic = 0; ioapic < nr_ioapics; ioapic++)
+       for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) {
+               irq_entry = find_irq_entry(ioapic, pin, mp_INT);
+               if (irq_entry == -1)
+                       continue;
+               irq = pin_2_irq(irq_entry, ioapic, pin);
++              if ((ioapic > 0) && (irq > 16))
++                      continue;
++
+               desc = irq_to_desc(irq);
+               /*