From: Greg Kroah-Hartman Date: Wed, 11 Aug 2010 23:00:30 +0000 (-0700) Subject: .32 patches X-Git-Tag: v2.6.32.19~7 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=8de808454c5ab95d981fd7ad5c933471c38a13fc;p=thirdparty%2Fkernel%2Fstable-queue.git .32 patches --- diff --git a/queue-2.6.32/acpi-fix-regression-where-_ppc-is-not-read-at-boot-even-when-ignore_ppc-0.patch b/queue-2.6.32/acpi-fix-regression-where-_ppc-is-not-read-at-boot-even-when-ignore_ppc-0.patch new file mode 100644 index 00000000000..f81b88f2c22 --- /dev/null +++ b/queue-2.6.32/acpi-fix-regression-where-_ppc-is-not-read-at-boot-even-when-ignore_ppc-0.patch @@ -0,0 +1,43 @@ +From 455c0d71d46e86b0b7ff2c9dcfc19bc162302ee9 Mon Sep 17 00:00:00 2001 +From: Darrick J. Wong +Date: Thu, 18 Feb 2010 10:28:20 -0800 +Subject: ACPI: Fix regression where _PPC is not read at boot even when ignore_ppc=0 + +From: Darrick J. Wong + +commit 455c0d71d46e86b0b7ff2c9dcfc19bc162302ee9 upstream. + +Earlier, Ingo Molnar posted a patch to make it so that the kernel would avoid +reading _PPC on his broken T60. Unfortunately, it seems that with Thomas +Renninger's patch last July to eliminate _PPC evaluations when the processor +driver loads, the kernel never actually reads _PPC at all! This is problematic +if you happen to boot your non-T60 computer in a state where the BIOS _wants_ +_PPC to be something other than zero. + +So, put the _PPC evaluation back into acpi_processor_get_performance_info if +ignore_ppc isn't 1. + +Signed-off-by: Darrick J. Wong +Signed-off-by: Len Brown +Acked-by: Jeff Mahoney +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/acpi/processor_perflib.c | 6 +++++- + 1 file changed, 5 insertions(+), 1 deletion(-) + +--- a/drivers/acpi/processor_perflib.c ++++ b/drivers/acpi/processor_perflib.c +@@ -356,7 +356,11 @@ static int acpi_processor_get_performanc + if (result) + goto update_bios; + +- return 0; ++ /* We need to call _PPC once when cpufreq starts */ ++ if (ignore_ppc != 1) ++ result = acpi_processor_get_platform_limit(pr); ++ ++ return result; + + /* + * Having _PPC but missing frequencies (_PSS, _PCT) is a very good hint that diff --git a/queue-2.6.32/aic79xx-check-for-non-null-scb-in-ahd_handle_nonpkt_busfree.patch b/queue-2.6.32/aic79xx-check-for-non-null-scb-in-ahd_handle_nonpkt_busfree.patch new file mode 100644 index 00000000000..8060837299f --- /dev/null +++ b/queue-2.6.32/aic79xx-check-for-non-null-scb-in-ahd_handle_nonpkt_busfree.patch @@ -0,0 +1,105 @@ +From 534ef056db8a8fb6b9d50188d88ed5d1fbc66673 Mon Sep 17 00:00:00 2001 +From: Hannes Reinecke +Date: Fri, 15 Jan 2010 13:07:34 +0100 +Subject: [SCSI] aic79xx: check for non-NULL scb in ahd_handle_nonpkt_busfree + +From: Hannes Reinecke + +commit 534ef056db8a8fb6b9d50188d88ed5d1fbc66673 upstream. + +When removing several devices aic79xx will occasionally Oops +in ahd_handle_nonpkt_busfree during rescan. Looking at the +code I found that we're indeed not checking if the scb in +question is NULL. So check for it before accessing it. 
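+
+In isolation, the guard this patch applies at each of the three
+negotiation sites looks like the following sketch (simplified; the
+surrounding message-handling logic is omitted, and scb is the value
+fetched earlier in the function, which may be NULL):
+
+	if (scb != NULL) {
+		/*
+		 * Only freeze/requeue when the busfree actually
+		 * gave us an SCB to work with.
+		 */
+		ahd_freeze_devq(ahd, scb);
+		ahd_qinfifo_requeue_tail(ahd, scb);
+	}
+	printerror = 0;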
+ +Signed-off-by: Hannes Reinecke +Signed-off-by: James Bottomley +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/scsi/aic7xxx/aic79xx_core.c | 53 +++++++++++++++++++++--------------- + 1 file changed, 31 insertions(+), 22 deletions(-) + +--- a/drivers/scsi/aic7xxx/aic79xx_core.c ++++ b/drivers/scsi/aic7xxx/aic79xx_core.c +@@ -3171,13 +3171,16 @@ ahd_handle_nonpkt_busfree(struct ahd_sof + tinfo->curr.transport_version = 2; + tinfo->goal.transport_version = 2; + tinfo->goal.ppr_options = 0; +- /* +- * Remove any SCBs in the waiting for selection +- * queue that may also be for this target so +- * that command ordering is preserved. +- */ +- ahd_freeze_devq(ahd, scb); +- ahd_qinfifo_requeue_tail(ahd, scb); ++ if (scb != NULL) { ++ /* ++ * Remove any SCBs in the waiting ++ * for selection queue that may ++ * also be for this target so that ++ * command ordering is preserved. ++ */ ++ ahd_freeze_devq(ahd, scb); ++ ahd_qinfifo_requeue_tail(ahd, scb); ++ } + printerror = 0; + } + } else if (ahd_sent_msg(ahd, AHDMSG_EXT, MSG_EXT_WDTR, FALSE) +@@ -3194,13 +3197,16 @@ ahd_handle_nonpkt_busfree(struct ahd_sof + MSG_EXT_WDTR_BUS_8_BIT, + AHD_TRANS_CUR|AHD_TRANS_GOAL, + /*paused*/TRUE); +- /* +- * Remove any SCBs in the waiting for selection +- * queue that may also be for this target so that +- * command ordering is preserved. +- */ +- ahd_freeze_devq(ahd, scb); +- ahd_qinfifo_requeue_tail(ahd, scb); ++ if (scb != NULL) { ++ /* ++ * Remove any SCBs in the waiting for ++ * selection queue that may also be for ++ * this target so that command ordering ++ * is preserved. ++ */ ++ ahd_freeze_devq(ahd, scb); ++ ahd_qinfifo_requeue_tail(ahd, scb); ++ } + printerror = 0; + } else if (ahd_sent_msg(ahd, AHDMSG_EXT, MSG_EXT_SDTR, FALSE) + && ppr_busfree == 0) { +@@ -3217,13 +3223,16 @@ ahd_handle_nonpkt_busfree(struct ahd_sof + /*ppr_options*/0, + AHD_TRANS_CUR|AHD_TRANS_GOAL, + /*paused*/TRUE); +- /* +- * Remove any SCBs in the waiting for selection +- * queue that may also be for this target so that +- * command ordering is preserved. +- */ +- ahd_freeze_devq(ahd, scb); +- ahd_qinfifo_requeue_tail(ahd, scb); ++ if (scb != NULL) { ++ /* ++ * Remove any SCBs in the waiting for ++ * selection queue that may also be for ++ * this target so that command ordering ++ * is preserved. ++ */ ++ ahd_freeze_devq(ahd, scb); ++ ahd_qinfifo_requeue_tail(ahd, scb); ++ } + printerror = 0; + } else if ((ahd->msg_flags & MSG_FLAG_EXPECT_IDE_BUSFREE) != 0 + && ahd_sent_msg(ahd, AHDMSG_1B, +@@ -3251,7 +3260,7 @@ ahd_handle_nonpkt_busfree(struct ahd_sof + * the message phases. We check it last in case we + * had to send some other message that caused a busfree. + */ +- if (printerror != 0 ++ if (scb != NULL && printerror != 0 + && (lastphase == P_MESGIN || lastphase == P_MESGOUT) + && ((ahd->msg_flags & MSG_FLAG_EXPECT_PPR_BUSFREE) != 0)) { + diff --git a/queue-2.6.32/btrfs-add-btrfs_duplicate_item.patch b/queue-2.6.32/btrfs-add-btrfs_duplicate_item.patch new file mode 100644 index 00000000000..0e2a71a115e --- /dev/null +++ b/queue-2.6.32/btrfs-add-btrfs_duplicate_item.patch @@ -0,0 +1,299 @@ +From ad48fd754676bfae4139be1a897b1ea58f9aaf21 Mon Sep 17 00:00:00 2001 +From: Yan, Zheng +Date: Thu, 12 Nov 2009 09:33:58 +0000 +Subject: Btrfs: Add btrfs_duplicate_item + +From: Yan, Zheng + +commit ad48fd754676bfae4139be1a897b1ea58f9aaf21 upstream. + +btrfs_duplicate_item duplicates item with new key, guaranteeing +the source item and the new items are in the same tree leaf and +contiguous. 
It allows us to split file extent in place, without +using lock_extent to prevent bookend extent race. + +Signed-off-by: Yan Zheng +Signed-off-by: Chris Mason +Acked-by: Jeff Mahoney +Signed-off-by: Greg Kroah-Hartman + + +--- + fs/btrfs/ctree.c | 198 ++++++++++++++++++++++++++++++++++++++----------------- + fs/btrfs/ctree.h | 4 + + 2 files changed, 143 insertions(+), 59 deletions(-) + +--- a/fs/btrfs/ctree.c ++++ b/fs/btrfs/ctree.c +@@ -37,6 +37,11 @@ static int balance_node_right(struct btr + struct extent_buffer *src_buf); + static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct btrfs_path *path, int level, int slot); ++static int setup_items_for_insert(struct btrfs_trans_handle *trans, ++ struct btrfs_root *root, struct btrfs_path *path, ++ struct btrfs_key *cpu_key, u32 *data_size, ++ u32 total_data, u32 total_size, int nr); ++ + + struct btrfs_path *btrfs_alloc_path(void) + { +@@ -2997,75 +3002,85 @@ again: + return ret; + } + +-/* +- * This function splits a single item into two items, +- * giving 'new_key' to the new item and splitting the +- * old one at split_offset (from the start of the item). +- * +- * The path may be released by this operation. After +- * the split, the path is pointing to the old item. The +- * new item is going to be in the same node as the old one. +- * +- * Note, the item being split must be smaller enough to live alone on +- * a tree block with room for one extra struct btrfs_item +- * +- * This allows us to split the item in place, keeping a lock on the +- * leaf the entire time. +- */ +-int btrfs_split_item(struct btrfs_trans_handle *trans, +- struct btrfs_root *root, +- struct btrfs_path *path, +- struct btrfs_key *new_key, +- unsigned long split_offset) ++static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans, ++ struct btrfs_root *root, ++ struct btrfs_path *path, int ins_len) + { +- u32 item_size; ++ struct btrfs_key key; + struct extent_buffer *leaf; +- struct btrfs_key orig_key; +- struct btrfs_item *item; +- struct btrfs_item *new_item; +- int ret = 0; +- int slot; +- u32 nritems; +- u32 orig_offset; +- struct btrfs_disk_key disk_key; +- char *buf; ++ struct btrfs_file_extent_item *fi; ++ u64 extent_len = 0; ++ u32 item_size; ++ int ret; + + leaf = path->nodes[0]; +- btrfs_item_key_to_cpu(leaf, &orig_key, path->slots[0]); +- if (btrfs_leaf_free_space(root, leaf) >= sizeof(struct btrfs_item)) +- goto split; ++ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); ++ ++ BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY && ++ key.type != BTRFS_EXTENT_CSUM_KEY); ++ ++ if (btrfs_leaf_free_space(root, leaf) >= ins_len) ++ return 0; + + item_size = btrfs_item_size_nr(leaf, path->slots[0]); ++ if (key.type == BTRFS_EXTENT_DATA_KEY) { ++ fi = btrfs_item_ptr(leaf, path->slots[0], ++ struct btrfs_file_extent_item); ++ extent_len = btrfs_file_extent_num_bytes(leaf, fi); ++ } + btrfs_release_path(root, path); + +- path->search_for_split = 1; + path->keep_locks = 1; +- +- ret = btrfs_search_slot(trans, root, &orig_key, path, 0, 1); ++ path->search_for_split = 1; ++ ret = btrfs_search_slot(trans, root, &key, path, 0, 1); + path->search_for_split = 0; ++ if (ret < 0) ++ goto err; + ++ ret = -EAGAIN; ++ leaf = path->nodes[0]; + /* if our item isn't there or got smaller, return now */ +- if (ret != 0 || item_size != btrfs_item_size_nr(path->nodes[0], +- path->slots[0])) { +- path->keep_locks = 0; +- return -EAGAIN; ++ if (ret > 0 || item_size != btrfs_item_size_nr(leaf, path->slots[0])) ++ goto err; ++ ++ if (key.type == 
BTRFS_EXTENT_DATA_KEY) { ++ fi = btrfs_item_ptr(leaf, path->slots[0], ++ struct btrfs_file_extent_item); ++ if (extent_len != btrfs_file_extent_num_bytes(leaf, fi)) ++ goto err; + } + + btrfs_set_path_blocking(path); +- ret = split_leaf(trans, root, &orig_key, path, +- sizeof(struct btrfs_item), 1); +- path->keep_locks = 0; ++ ret = split_leaf(trans, root, &key, path, ins_len, 1); + BUG_ON(ret); + ++ path->keep_locks = 0; + btrfs_unlock_up_safe(path, 1); ++ return 0; ++err: ++ path->keep_locks = 0; ++ return ret; ++} ++ ++static noinline int split_item(struct btrfs_trans_handle *trans, ++ struct btrfs_root *root, ++ struct btrfs_path *path, ++ struct btrfs_key *new_key, ++ unsigned long split_offset) ++{ ++ struct extent_buffer *leaf; ++ struct btrfs_item *item; ++ struct btrfs_item *new_item; ++ int slot; ++ char *buf; ++ u32 nritems; ++ u32 item_size; ++ u32 orig_offset; ++ struct btrfs_disk_key disk_key; ++ + leaf = path->nodes[0]; + BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item)); + +-split: +- /* +- * make sure any changes to the path from split_leaf leave it +- * in a blocking state +- */ + btrfs_set_path_blocking(path); + + item = btrfs_item_nr(leaf, path->slots[0]); +@@ -3073,19 +3088,19 @@ split: + item_size = btrfs_item_size(leaf, item); + + buf = kmalloc(item_size, GFP_NOFS); ++ if (!buf) ++ return -ENOMEM; ++ + read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf, + path->slots[0]), item_size); +- slot = path->slots[0] + 1; +- leaf = path->nodes[0]; + ++ slot = path->slots[0] + 1; + nritems = btrfs_header_nritems(leaf); +- + if (slot != nritems) { + /* shift the items */ + memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1), +- btrfs_item_nr_offset(slot), +- (nritems - slot) * sizeof(struct btrfs_item)); +- ++ btrfs_item_nr_offset(slot), ++ (nritems - slot) * sizeof(struct btrfs_item)); + } + + btrfs_cpu_key_to_disk(&disk_key, new_key); +@@ -3113,16 +3128,81 @@ split: + item_size - split_offset); + btrfs_mark_buffer_dirty(leaf); + +- ret = 0; +- if (btrfs_leaf_free_space(root, leaf) < 0) { +- btrfs_print_leaf(root, leaf); +- BUG(); +- } ++ BUG_ON(btrfs_leaf_free_space(root, leaf) < 0); + kfree(buf); ++ return 0; ++} ++ ++/* ++ * This function splits a single item into two items, ++ * giving 'new_key' to the new item and splitting the ++ * old one at split_offset (from the start of the item). ++ * ++ * The path may be released by this operation. After ++ * the split, the path is pointing to the old item. The ++ * new item is going to be in the same node as the old one. ++ * ++ * Note, the item being split must be smaller enough to live alone on ++ * a tree block with room for one extra struct btrfs_item ++ * ++ * This allows us to split the item in place, keeping a lock on the ++ * leaf the entire time. ++ */ ++int btrfs_split_item(struct btrfs_trans_handle *trans, ++ struct btrfs_root *root, ++ struct btrfs_path *path, ++ struct btrfs_key *new_key, ++ unsigned long split_offset) ++{ ++ int ret; ++ ret = setup_leaf_for_split(trans, root, path, ++ sizeof(struct btrfs_item)); ++ if (ret) ++ return ret; ++ ++ ret = split_item(trans, root, path, new_key, split_offset); + return ret; + } + + /* ++ * This function duplicate a item, giving 'new_key' to the new item. ++ * It guarantees both items live in the same tree leaf and the new item ++ * is contiguous with the original item. ++ * ++ * This allows us to split file extent in place, keeping a lock on the ++ * leaf the entire time. 
++ */ ++int btrfs_duplicate_item(struct btrfs_trans_handle *trans, ++ struct btrfs_root *root, ++ struct btrfs_path *path, ++ struct btrfs_key *new_key) ++{ ++ struct extent_buffer *leaf; ++ int ret; ++ u32 item_size; ++ ++ leaf = path->nodes[0]; ++ item_size = btrfs_item_size_nr(leaf, path->slots[0]); ++ ret = setup_leaf_for_split(trans, root, path, ++ item_size + sizeof(struct btrfs_item)); ++ if (ret) ++ return ret; ++ ++ path->slots[0]++; ++ ret = setup_items_for_insert(trans, root, path, new_key, &item_size, ++ item_size, item_size + ++ sizeof(struct btrfs_item), 1); ++ BUG_ON(ret); ++ ++ leaf = path->nodes[0]; ++ memcpy_extent_buffer(leaf, ++ btrfs_item_ptr_offset(leaf, path->slots[0]), ++ btrfs_item_ptr_offset(leaf, path->slots[0] - 1), ++ item_size); ++ return 0; ++} ++ ++/* + * make the item pointed to by the path smaller. new_size indicates + * how small to make it, and from_end tells us if we just chop bytes + * off the end of the item or if we shift the item to chop bytes off +--- a/fs/btrfs/ctree.h ++++ b/fs/btrfs/ctree.h +@@ -2089,6 +2089,10 @@ int btrfs_split_item(struct btrfs_trans_ + struct btrfs_path *path, + struct btrfs_key *new_key, + unsigned long split_offset); ++int btrfs_duplicate_item(struct btrfs_trans_handle *trans, ++ struct btrfs_root *root, ++ struct btrfs_path *path, ++ struct btrfs_key *new_key); + int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_key *key, struct btrfs_path *p, int + ins_len, int cow); diff --git a/queue-2.6.32/btrfs-add-delayed-iput.patch b/queue-2.6.32/btrfs-add-delayed-iput.patch new file mode 100644 index 00000000000..25855f02e7e --- /dev/null +++ b/queue-2.6.32/btrfs-add-delayed-iput.patch @@ -0,0 +1,309 @@ +From 24bbcf0442ee04660a5a030efdbb6d03f1c275cb Mon Sep 17 00:00:00 2001 +From: Yan, Zheng +Date: Thu, 12 Nov 2009 09:36:34 +0000 +Subject: Btrfs: Add delayed iput + +From: Yan, Zheng + +commit 24bbcf0442ee04660a5a030efdbb6d03f1c275cb upstream. + +iput() can trigger new transactions if we are dropping the +final reference, so calling it in btrfs_commit_transaction +may end up deadlock. This patch adds delayed iput to avoid +the issue. 
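+
+Reduced to a sketch, the pattern this patch introduces (using the two
+new helpers added below; the calling contexts are illustrative):
+
+	/* in a context that must not start a new transaction: */
+	if (delay_iput)
+		btrfs_add_delayed_iput(inode);	/* defer the final iput */
+	else
+		iput(inode);
+
+	/* later, from a safe context (cleaner thread, commit_super): */
+	btrfs_run_delayed_iputs(root);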
+ +Signed-off-by: Yan Zheng +Signed-off-by: Chris Mason +Acked-by: Jeff Mahoney +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/ctree.h | 7 +++++- + fs/btrfs/disk-io.c | 4 +++ + fs/btrfs/extent-tree.c | 8 +++--- + fs/btrfs/inode.c | 55 ++++++++++++++++++++++++++++++++++++++++++++++-- + fs/btrfs/ordered-data.c | 10 ++++++-- + fs/btrfs/ordered-data.h | 3 +- + fs/btrfs/relocation.c | 4 +-- + fs/btrfs/super.c | 4 +-- + fs/btrfs/transaction.c | 13 ++++++++--- + 9 files changed, 90 insertions(+), 18 deletions(-) + +--- a/fs/btrfs/ctree.h ++++ b/fs/btrfs/ctree.h +@@ -872,6 +872,9 @@ struct btrfs_fs_info { + struct list_head dead_roots; + struct list_head caching_block_groups; + ++ spinlock_t delayed_iput_lock; ++ struct list_head delayed_iputs; ++ + atomic_t nr_async_submits; + atomic_t async_submit_draining; + atomic_t nr_async_bios; +@@ -2301,7 +2304,7 @@ int btrfs_truncate_inode_items(struct bt + struct inode *inode, u64 new_size, + u32 min_type); + +-int btrfs_start_delalloc_inodes(struct btrfs_root *root); ++int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput); + int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end); + int btrfs_writepages(struct address_space *mapping, + struct writeback_control *wbc); +@@ -2341,6 +2344,8 @@ int btrfs_orphan_del(struct btrfs_trans_ + void btrfs_orphan_cleanup(struct btrfs_root *root); + int btrfs_cont_expand(struct inode *inode, loff_t size); + int btrfs_invalidate_inodes(struct btrfs_root *root); ++void btrfs_add_delayed_iput(struct inode *inode); ++void btrfs_run_delayed_iputs(struct btrfs_root *root); + extern const struct dentry_operations btrfs_dentry_operations; + + /* ioctl.c */ +--- a/fs/btrfs/disk-io.c ++++ b/fs/btrfs/disk-io.c +@@ -1476,6 +1476,7 @@ static int cleaner_kthread(void *arg) + + if (!(root->fs_info->sb->s_flags & MS_RDONLY) && + mutex_trylock(&root->fs_info->cleaner_mutex)) { ++ btrfs_run_delayed_iputs(root); + btrfs_clean_old_snapshots(root); + mutex_unlock(&root->fs_info->cleaner_mutex); + } +@@ -1605,6 +1606,7 @@ struct btrfs_root *open_ctree(struct sup + INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC); + INIT_LIST_HEAD(&fs_info->trans_list); + INIT_LIST_HEAD(&fs_info->dead_roots); ++ INIT_LIST_HEAD(&fs_info->delayed_iputs); + INIT_LIST_HEAD(&fs_info->hashers); + INIT_LIST_HEAD(&fs_info->delalloc_inodes); + INIT_LIST_HEAD(&fs_info->ordered_operations); +@@ -1613,6 +1615,7 @@ struct btrfs_root *open_ctree(struct sup + spin_lock_init(&fs_info->new_trans_lock); + spin_lock_init(&fs_info->ref_cache_lock); + spin_lock_init(&fs_info->fs_roots_radix_lock); ++ spin_lock_init(&fs_info->delayed_iput_lock); + + init_completion(&fs_info->kobj_unregister); + fs_info->tree_root = tree_root; +@@ -2386,6 +2389,7 @@ int btrfs_commit_super(struct btrfs_root + int ret; + + mutex_lock(&root->fs_info->cleaner_mutex); ++ btrfs_run_delayed_iputs(root); + btrfs_clean_old_snapshots(root); + mutex_unlock(&root->fs_info->cleaner_mutex); + +--- a/fs/btrfs/extent-tree.c ++++ b/fs/btrfs/extent-tree.c +@@ -2880,9 +2880,9 @@ static noinline void flush_delalloc_asyn + root = async->root; + info = async->info; + +- btrfs_start_delalloc_inodes(root); ++ btrfs_start_delalloc_inodes(root, 0); + wake_up(&info->flush_wait); +- btrfs_wait_ordered_extents(root, 0); ++ btrfs_wait_ordered_extents(root, 0, 0); + + spin_lock(&info->lock); + info->flushing = 0; +@@ -2956,8 +2956,8 @@ static void flush_delalloc(struct btrfs_ + return; + + flush: +- btrfs_start_delalloc_inodes(root); +- btrfs_wait_ordered_extents(root, 0); ++ 
btrfs_start_delalloc_inodes(root, 0); ++ btrfs_wait_ordered_extents(root, 0, 0); + + spin_lock(&info->lock); + info->flushing = 0; +--- a/fs/btrfs/inode.c ++++ b/fs/btrfs/inode.c +@@ -2022,6 +2022,54 @@ zeroit: + return -EIO; + } + ++struct delayed_iput { ++ struct list_head list; ++ struct inode *inode; ++}; ++ ++void btrfs_add_delayed_iput(struct inode *inode) ++{ ++ struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; ++ struct delayed_iput *delayed; ++ ++ if (atomic_add_unless(&inode->i_count, -1, 1)) ++ return; ++ ++ delayed = kmalloc(sizeof(*delayed), GFP_NOFS | __GFP_NOFAIL); ++ delayed->inode = inode; ++ ++ spin_lock(&fs_info->delayed_iput_lock); ++ list_add_tail(&delayed->list, &fs_info->delayed_iputs); ++ spin_unlock(&fs_info->delayed_iput_lock); ++} ++ ++void btrfs_run_delayed_iputs(struct btrfs_root *root) ++{ ++ LIST_HEAD(list); ++ struct btrfs_fs_info *fs_info = root->fs_info; ++ struct delayed_iput *delayed; ++ int empty; ++ ++ spin_lock(&fs_info->delayed_iput_lock); ++ empty = list_empty(&fs_info->delayed_iputs); ++ spin_unlock(&fs_info->delayed_iput_lock); ++ if (empty) ++ return; ++ ++ down_read(&root->fs_info->cleanup_work_sem); ++ spin_lock(&fs_info->delayed_iput_lock); ++ list_splice_init(&fs_info->delayed_iputs, &list); ++ spin_unlock(&fs_info->delayed_iput_lock); ++ ++ while (!list_empty(&list)) { ++ delayed = list_entry(list.next, struct delayed_iput, list); ++ list_del(&delayed->list); ++ iput(delayed->inode); ++ kfree(delayed); ++ } ++ up_read(&root->fs_info->cleanup_work_sem); ++} ++ + /* + * This creates an orphan entry for the given inode in case something goes + * wrong in the middle of an unlink/truncate. +@@ -5568,7 +5616,7 @@ out_fail: + * some fairly slow code that needs optimization. This walks the list + * of all the inodes with pending delalloc and forces them to disk. + */ +-int btrfs_start_delalloc_inodes(struct btrfs_root *root) ++int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput) + { + struct list_head *head = &root->fs_info->delalloc_inodes; + struct btrfs_inode *binode; +@@ -5587,7 +5635,10 @@ int btrfs_start_delalloc_inodes(struct b + spin_unlock(&root->fs_info->delalloc_lock); + if (inode) { + filemap_flush(inode->i_mapping); +- iput(inode); ++ if (delay_iput) ++ btrfs_add_delayed_iput(inode); ++ else ++ iput(inode); + } + cond_resched(); + spin_lock(&root->fs_info->delalloc_lock); +--- a/fs/btrfs/ordered-data.c ++++ b/fs/btrfs/ordered-data.c +@@ -352,7 +352,8 @@ int btrfs_remove_ordered_extent(struct i + * wait for all the ordered extents in a root. This is done when balancing + * space between drives. 
+ */ +-int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only) ++int btrfs_wait_ordered_extents(struct btrfs_root *root, ++ int nocow_only, int delay_iput) + { + struct list_head splice; + struct list_head *cur; +@@ -389,7 +390,10 @@ int btrfs_wait_ordered_extents(struct bt + if (inode) { + btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_put_ordered_extent(ordered); +- iput(inode); ++ if (delay_iput) ++ btrfs_add_delayed_iput(inode); ++ else ++ iput(inode); + } else { + btrfs_put_ordered_extent(ordered); + } +@@ -447,7 +451,7 @@ again: + btrfs_wait_ordered_range(inode, 0, (u64)-1); + else + filemap_flush(inode->i_mapping); +- iput(inode); ++ btrfs_add_delayed_iput(inode); + } + + cond_resched(); +--- a/fs/btrfs/ordered-data.h ++++ b/fs/btrfs/ordered-data.h +@@ -153,9 +153,10 @@ btrfs_lookup_first_ordered_extent(struct + int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, + struct btrfs_ordered_extent *ordered); + int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum); +-int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only); + int btrfs_run_ordered_operations(struct btrfs_root *root, int wait); + int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *inode); ++int btrfs_wait_ordered_extents(struct btrfs_root *root, ++ int nocow_only, int delay_iput); + #endif +--- a/fs/btrfs/relocation.c ++++ b/fs/btrfs/relocation.c +@@ -3541,8 +3541,8 @@ int btrfs_relocate_block_group(struct bt + (unsigned long long)rc->block_group->key.objectid, + (unsigned long long)rc->block_group->flags); + +- btrfs_start_delalloc_inodes(fs_info->tree_root); +- btrfs_wait_ordered_extents(fs_info->tree_root, 0); ++ btrfs_start_delalloc_inodes(fs_info->tree_root, 0); ++ btrfs_wait_ordered_extents(fs_info->tree_root, 0, 0); + + while (1) { + rc->extents_found = 0; +--- a/fs/btrfs/super.c ++++ b/fs/btrfs/super.c +@@ -405,8 +405,8 @@ int btrfs_sync_fs(struct super_block *sb + return 0; + } + +- btrfs_start_delalloc_inodes(root); +- btrfs_wait_ordered_extents(root, 0); ++ btrfs_start_delalloc_inodes(root, 0); ++ btrfs_wait_ordered_extents(root, 0, 0); + + trans = btrfs_start_transaction(root, 1); + ret = btrfs_commit_transaction(trans, root); +--- a/fs/btrfs/transaction.c ++++ b/fs/btrfs/transaction.c +@@ -333,6 +333,9 @@ static int __btrfs_end_transaction(struc + memset(trans, 0, sizeof(*trans)); + kmem_cache_free(btrfs_trans_handle_cachep, trans); + ++ if (throttle) ++ btrfs_run_delayed_iputs(root); ++ + return 0; + } + +@@ -991,11 +994,11 @@ int btrfs_commit_transaction(struct btrf + mutex_unlock(&root->fs_info->trans_mutex); + + if (flush_on_commit) { +- btrfs_start_delalloc_inodes(root); +- ret = btrfs_wait_ordered_extents(root, 0); ++ btrfs_start_delalloc_inodes(root, 1); ++ ret = btrfs_wait_ordered_extents(root, 0, 1); + BUG_ON(ret); + } else if (snap_pending) { +- ret = btrfs_wait_ordered_extents(root, 1); ++ ret = btrfs_wait_ordered_extents(root, 0, 1); + BUG_ON(ret); + } + +@@ -1113,6 +1116,10 @@ int btrfs_commit_transaction(struct btrf + current->journal_info = NULL; + + kmem_cache_free(btrfs_trans_handle_cachep, trans); ++ ++ if (current != root->fs_info->transaction_kthread) ++ btrfs_run_delayed_iputs(root); ++ + return ret; + } + diff --git a/queue-2.6.32/btrfs-align-offsets-for-btrfs_ordered_update_i_size.patch b/queue-2.6.32/btrfs-align-offsets-for-btrfs_ordered_update_i_size.patch new file mode 100644 index 00000000000..e92d4f677d7 --- /dev/null +++ 
b/queue-2.6.32/btrfs-align-offsets-for-btrfs_ordered_update_i_size.patch @@ -0,0 +1,34 @@ +From a038fab0cb873c75d6675e2bcffce8a3935bdce7 Mon Sep 17 00:00:00 2001 +From: Yan, Zheng +Date: Mon, 28 Dec 2009 05:01:58 +0000 +Subject: Btrfs: align offsets for btrfs_ordered_update_i_size + +From: Yan, Zheng + +commit a038fab0cb873c75d6675e2bcffce8a3935bdce7 upstream. + +Some callers of btrfs_ordered_update_i_size can now pass in +a NULL for the ordered extent to update against. This makes +sure we properly align the offset they pass in when deciding +how much to bump the on disk i_size. + +Signed-off-by: Chris Mason +Acked-by: Jeff Mahoney +Signed-off-by: Greg Kroah-Hartman + + +--- + fs/btrfs/ordered-data.c | 2 ++ + 1 file changed, 2 insertions(+) + +--- a/fs/btrfs/ordered-data.c ++++ b/fs/btrfs/ordered-data.c +@@ -626,6 +626,8 @@ int btrfs_ordered_update_i_size(struct i + + if (ordered) + offset = entry_end(ordered); ++ else ++ offset = ALIGN(offset, BTRFS_I(inode)->root->sectorsize); + + mutex_lock(&tree->mutex); + disk_i_size = BTRFS_I(inode)->disk_i_size; diff --git a/queue-2.6.32/btrfs-apply-updated-fallocate-i_size-fix.patch b/queue-2.6.32/btrfs-apply-updated-fallocate-i_size-fix.patch new file mode 100644 index 00000000000..146fda503e0 --- /dev/null +++ b/queue-2.6.32/btrfs-apply-updated-fallocate-i_size-fix.patch @@ -0,0 +1,34 @@ +From 23b5c50945f2294add0137799400329c0ebba290 Mon Sep 17 00:00:00 2001 +From: Aneesh Kumar K.V +Date: Thu, 4 Feb 2010 11:33:03 -0500 +Subject: Btrfs: apply updated fallocate i_size fix + +From: Aneesh Kumar K.V + +commit 23b5c50945f2294add0137799400329c0ebba290 upstream. + +This version of the i_size fix for fallocate makes sure we only update +the i_size when the current fallocate is really operating outside of +i_size. + +Signed-off-by: Chris Mason +Acked-by: Jeff Mahoney +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/inode.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +--- a/fs/btrfs/inode.c ++++ b/fs/btrfs/inode.c +@@ -5798,7 +5798,9 @@ static int prealloc_file_range(struct in + inode->i_ctime = CURRENT_TIME; + BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC; + if (!(mode & FALLOC_FL_KEEP_SIZE) && +- cur_offset > inode->i_size) { ++ (actual_len > inode->i_size) && ++ (cur_offset > inode->i_size)) { ++ + if (cur_offset > actual_len) + i_size = actual_len; + else diff --git a/queue-2.6.32/btrfs-avoid-orphan-inodes-cleanup-during-committing-transaction.patch b/queue-2.6.32/btrfs-avoid-orphan-inodes-cleanup-during-committing-transaction.patch new file mode 100644 index 00000000000..ab9c40bb16a --- /dev/null +++ b/queue-2.6.32/btrfs-avoid-orphan-inodes-cleanup-during-committing-transaction.patch @@ -0,0 +1,133 @@ +From 2e4bfab97055aa6acdd0637913bd705c2d6506d6 Mon Sep 17 00:00:00 2001 +From: Yan, Zheng +Date: Thu, 12 Nov 2009 09:37:02 +0000 +Subject: Btrfs: Avoid orphan inodes cleanup during committing transaction + +From: Yan, Zheng + +commit 2e4bfab97055aa6acdd0637913bd705c2d6506d6 upstream. + +btrfs_lookup_dentry may trigger orphan cleanup, so it's not good +to call it while committing a transaction. 
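+
+The reordering in create_snapshot, reduced to a sketch (simplified
+from the hunk below; error handling trimmed):
+
+	ret = btrfs_commit_transaction(trans, root);
+	BUG_ON(ret);
+
+	/* safe now: no transaction is being committed here */
+	inode = btrfs_lookup_dentry(dentry->d_parent->d_inode, dentry);
+	if (IS_ERR(inode))
+		return PTR_ERR(inode);
+	d_instantiate(dentry, inode);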
+ +Signed-off-by: Yan Zheng +Signed-off-by: Chris Mason +Acked-by: Jeff Mahoney +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/ioctl.c | 29 +++++++++++++++++------------ + fs/btrfs/transaction.c | 4 ---- + 2 files changed, 17 insertions(+), 16 deletions(-) + +--- a/fs/btrfs/ioctl.c ++++ b/fs/btrfs/ioctl.c +@@ -237,7 +237,6 @@ static noinline int create_subvol(struct + u64 objectid; + u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID; + u64 index = 0; +- unsigned long nr = 1; + + /* + * 1 - inode item +@@ -342,24 +341,21 @@ static noinline int create_subvol(struct + + d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry)); + fail: +- nr = trans->blocks_used; + err = btrfs_commit_transaction(trans, root); + if (err && !ret) + ret = err; + + btrfs_unreserve_metadata_space(root, 6); +- btrfs_btree_balance_dirty(root, nr); + return ret; + } + + static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, + char *name, int namelen) + { ++ struct inode *inode; + struct btrfs_pending_snapshot *pending_snapshot; + struct btrfs_trans_handle *trans; +- int ret = 0; +- int err; +- unsigned long nr = 0; ++ int ret; + + if (!root->ref_cows) + return -EINVAL; +@@ -372,20 +368,20 @@ static int create_snapshot(struct btrfs_ + */ + ret = btrfs_reserve_metadata_space(root, 6); + if (ret) +- goto fail_unlock; ++ goto fail; + + pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS); + if (!pending_snapshot) { + ret = -ENOMEM; + btrfs_unreserve_metadata_space(root, 6); +- goto fail_unlock; ++ goto fail; + } + pending_snapshot->name = kmalloc(namelen + 1, GFP_NOFS); + if (!pending_snapshot->name) { + ret = -ENOMEM; + kfree(pending_snapshot); + btrfs_unreserve_metadata_space(root, 6); +- goto fail_unlock; ++ goto fail; + } + memcpy(pending_snapshot->name, name, namelen); + pending_snapshot->name[namelen] = '\0'; +@@ -395,10 +391,19 @@ static int create_snapshot(struct btrfs_ + pending_snapshot->root = root; + list_add(&pending_snapshot->list, + &trans->transaction->pending_snapshots); +- err = btrfs_commit_transaction(trans, root); ++ ret = btrfs_commit_transaction(trans, root); ++ BUG_ON(ret); ++ btrfs_unreserve_metadata_space(root, 6); + +-fail_unlock: +- btrfs_btree_balance_dirty(root, nr); ++ inode = btrfs_lookup_dentry(dentry->d_parent->d_inode, dentry); ++ if (IS_ERR(inode)) { ++ ret = PTR_ERR(inode); ++ goto fail; ++ } ++ BUG_ON(!inode); ++ d_instantiate(dentry, inode); ++ ret = 0; ++fail: + return ret; + } + +--- a/fs/btrfs/transaction.c ++++ b/fs/btrfs/transaction.c +@@ -796,7 +796,6 @@ static noinline int create_pending_snaps + memcpy(&pending->root_key, &key, sizeof(key)); + fail: + kfree(new_root_item); +- btrfs_unreserve_metadata_space(root, 6); + return ret; + } + +@@ -808,7 +807,6 @@ static noinline int finish_pending_snaps + u64 index = 0; + struct btrfs_trans_handle *trans; + struct inode *parent_inode; +- struct inode *inode; + struct btrfs_root *parent_root; + + parent_inode = pending->dentry->d_parent->d_inode; +@@ -840,8 +838,6 @@ static noinline int finish_pending_snaps + + BUG_ON(ret); + +- inode = btrfs_lookup_dentry(parent_inode, pending->dentry); +- d_instantiate(pending->dentry, inode); + fail: + btrfs_end_transaction(trans, fs_info->fs_root); + return ret; diff --git a/queue-2.6.32/btrfs-avoid-orphan-inodes-cleanup-while-replaying-log.patch b/queue-2.6.32/btrfs-avoid-orphan-inodes-cleanup-while-replaying-log.patch new file mode 100644 index 00000000000..9ad0c2a7f92 --- /dev/null +++ b/queue-2.6.32/btrfs-avoid-orphan-inodes-cleanup-while-replaying-log.patch @@ -0,0 
+1,259 @@ +From c71bf099abddf3e0fdc27f251ba76fca1461d49a Mon Sep 17 00:00:00 2001 +From: Yan, Zheng +Date: Thu, 12 Nov 2009 09:34:40 +0000 +Subject: Btrfs: Avoid orphan inodes cleanup while replaying log + +From: Yan, Zheng + +commit c71bf099abddf3e0fdc27f251ba76fca1461d49a upstream. + +We do log replay in a single transaction, so it's not good to do unbound +operations. This patch cleans up orphan inodes cleanup after replaying +the log. It also avoids doing other unbound operations such as truncating +a file during replaying log. These unbound operations are postponed to +the orphan inode cleanup stage. + +Signed-off-by: Yan Zheng +Signed-off-by: Chris Mason +Acked-by: Jeff Mahoney +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/ctree.h | 5 +++-- + fs/btrfs/disk-io.c | 17 +++++++++++------ + fs/btrfs/inode.c | 19 ++++++++++++++++--- + fs/btrfs/relocation.c | 1 + + fs/btrfs/tree-log.c | 49 ++++++++++++++++++++++++------------------------- + 5 files changed, 55 insertions(+), 36 deletions(-) + +--- a/fs/btrfs/ctree.h ++++ b/fs/btrfs/ctree.h +@@ -859,8 +859,9 @@ struct btrfs_fs_info { + struct mutex ordered_operations_mutex; + struct rw_semaphore extent_commit_sem; + +- struct rw_semaphore subvol_sem; ++ struct rw_semaphore cleanup_work_sem; + ++ struct rw_semaphore subvol_sem; + struct srcu_struct subvol_srcu; + + struct list_head trans_list; +@@ -1034,12 +1035,12 @@ struct btrfs_root { + int ref_cows; + int track_dirty; + int in_radix; ++ int clean_orphans; + + u64 defrag_trans_start; + struct btrfs_key defrag_progress; + struct btrfs_key defrag_max; + int defrag_running; +- int defrag_level; + char *name; + int in_sysfs; + +--- a/fs/btrfs/disk-io.c ++++ b/fs/btrfs/disk-io.c +@@ -892,6 +892,8 @@ static int __setup_root(u32 nodesize, u3 + root->stripesize = stripesize; + root->ref_cows = 0; + root->track_dirty = 0; ++ root->in_radix = 0; ++ root->clean_orphans = 0; + + root->fs_info = fs_info; + root->objectid = objectid; +@@ -928,7 +930,6 @@ static int __setup_root(u32 nodesize, u3 + root->defrag_trans_start = fs_info->generation; + init_completion(&root->kobj_unregister); + root->defrag_running = 0; +- root->defrag_level = 0; + root->root_key.objectid = objectid; + root->anon_super.s_root = NULL; + root->anon_super.s_dev = 0; +@@ -1210,8 +1211,10 @@ again: + ret = radix_tree_insert(&fs_info->fs_roots_radix, + (unsigned long)root->root_key.objectid, + root); +- if (ret == 0) ++ if (ret == 0) { + root->in_radix = 1; ++ root->clean_orphans = 1; ++ } + spin_unlock(&fs_info->fs_roots_radix_lock); + radix_tree_preload_end(); + if (ret) { +@@ -1225,10 +1228,6 @@ again: + ret = btrfs_find_dead_roots(fs_info->tree_root, + root->root_key.objectid); + WARN_ON(ret); +- +- if (!(fs_info->sb->s_flags & MS_RDONLY)) +- btrfs_orphan_cleanup(root); +- + return root; + fail: + free_fs_root(root); +@@ -1689,6 +1688,7 @@ struct btrfs_root *open_ctree(struct sup + mutex_init(&fs_info->cleaner_mutex); + mutex_init(&fs_info->volume_mutex); + init_rwsem(&fs_info->extent_commit_sem); ++ init_rwsem(&fs_info->cleanup_work_sem); + init_rwsem(&fs_info->subvol_sem); + + btrfs_init_free_cluster(&fs_info->meta_alloc_cluster); +@@ -2388,6 +2388,11 @@ int btrfs_commit_super(struct btrfs_root + mutex_lock(&root->fs_info->cleaner_mutex); + btrfs_clean_old_snapshots(root); + mutex_unlock(&root->fs_info->cleaner_mutex); ++ ++ /* wait until ongoing cleanup work done */ ++ down_write(&root->fs_info->cleanup_work_sem); ++ up_write(&root->fs_info->cleanup_work_sem); ++ + trans = btrfs_start_transaction(root, 1); + ret = 
btrfs_commit_transaction(trans, root); + BUG_ON(ret); +--- a/fs/btrfs/inode.c ++++ b/fs/btrfs/inode.c +@@ -2093,16 +2093,17 @@ void btrfs_orphan_cleanup(struct btrfs_r + struct inode *inode; + int ret = 0, nr_unlink = 0, nr_truncate = 0; + +- path = btrfs_alloc_path(); +- if (!path) ++ if (!xchg(&root->clean_orphans, 0)) + return; ++ ++ path = btrfs_alloc_path(); ++ BUG_ON(!path); + path->reada = -1; + + key.objectid = BTRFS_ORPHAN_OBJECTID; + btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY); + key.offset = (u64)-1; + +- + while (1) { + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) { +@@ -3298,6 +3299,11 @@ void btrfs_delete_inode(struct inode *in + } + btrfs_wait_ordered_range(inode, 0, (u64)-1); + ++ if (root->fs_info->log_root_recovering) { ++ BUG_ON(!list_empty(&BTRFS_I(inode)->i_orphan)); ++ goto no_delete; ++ } ++ + if (inode->i_nlink > 0) { + BUG_ON(btrfs_root_refs(&root->root_item) != 0); + goto no_delete; +@@ -3705,6 +3711,13 @@ struct inode *btrfs_lookup_dentry(struct + } + srcu_read_unlock(&root->fs_info->subvol_srcu, index); + ++ if (root != sub_root) { ++ down_read(&root->fs_info->cleanup_work_sem); ++ if (!(inode->i_sb->s_flags & MS_RDONLY)) ++ btrfs_orphan_cleanup(sub_root); ++ up_read(&root->fs_info->cleanup_work_sem); ++ } ++ + return inode; + } + +--- a/fs/btrfs/relocation.c ++++ b/fs/btrfs/relocation.c +@@ -3755,6 +3755,7 @@ out: + BTRFS_DATA_RELOC_TREE_OBJECTID); + if (IS_ERR(fs_root)) + err = PTR_ERR(fs_root); ++ btrfs_orphan_cleanup(fs_root); + } + return err; + } +--- a/fs/btrfs/tree-log.c ++++ b/fs/btrfs/tree-log.c +@@ -930,6 +930,17 @@ out_nowrite: + return 0; + } + ++static int insert_orphan_item(struct btrfs_trans_handle *trans, ++ struct btrfs_root *root, u64 offset) ++{ ++ int ret; ++ ret = btrfs_find_orphan_item(root, offset); ++ if (ret > 0) ++ ret = btrfs_insert_orphan_item(trans, root, offset); ++ return ret; ++} ++ ++ + /* + * There are a few corners where the link count of the file can't + * be properly maintained during replay. So, instead of adding +@@ -997,9 +1008,13 @@ static noinline int fixup_inode_link_cou + } + BTRFS_I(inode)->index_cnt = (u64)-1; + +- if (inode->i_nlink == 0 && S_ISDIR(inode->i_mode)) { +- ret = replay_dir_deletes(trans, root, NULL, path, +- inode->i_ino, 1); ++ if (inode->i_nlink == 0) { ++ if (S_ISDIR(inode->i_mode)) { ++ ret = replay_dir_deletes(trans, root, NULL, path, ++ inode->i_ino, 1); ++ BUG_ON(ret); ++ } ++ ret = insert_orphan_item(trans, root, inode->i_ino); + BUG_ON(ret); + } + btrfs_free_path(path); +@@ -1587,7 +1602,6 @@ static int replay_one_buffer(struct btrf + /* inode keys are done during the first stage */ + if (key.type == BTRFS_INODE_ITEM_KEY && + wc->stage == LOG_WALK_REPLAY_INODES) { +- struct inode *inode; + struct btrfs_inode_item *inode_item; + u32 mode; + +@@ -1603,31 +1617,16 @@ static int replay_one_buffer(struct btrf + eb, i, &key); + BUG_ON(ret); + +- /* for regular files, truncate away +- * extents past the new EOF ++ /* for regular files, make sure corresponding ++ * orhpan item exist. extents past the new EOF ++ * will be truncated later by orphan cleanup. + */ + if (S_ISREG(mode)) { +- inode = read_one_inode(root, +- key.objectid); +- BUG_ON(!inode); +- +- ret = btrfs_truncate_inode_items(wc->trans, +- root, inode, inode->i_size, +- BTRFS_EXTENT_DATA_KEY); ++ ret = insert_orphan_item(wc->trans, root, ++ key.objectid); + BUG_ON(ret); +- +- /* if the nlink count is zero here, the iput +- * will free the inode. 
We bump it to make +- * sure it doesn't get freed until the link +- * count fixup is done +- */ +- if (inode->i_nlink == 0) { +- btrfs_inc_nlink(inode); +- btrfs_update_inode(wc->trans, +- root, inode); +- } +- iput(inode); + } ++ + ret = link_to_fixup_dir(wc->trans, root, + path, key.objectid); + BUG_ON(ret); diff --git a/queue-2.6.32/btrfs-avoid-superfluous-tree-log-writeout.patch b/queue-2.6.32/btrfs-avoid-superfluous-tree-log-writeout.patch new file mode 100644 index 00000000000..c36c1361229 --- /dev/null +++ b/queue-2.6.32/btrfs-avoid-superfluous-tree-log-writeout.patch @@ -0,0 +1,251 @@ +From 8cef4e160d74920ad1725f58c89fd75ec4c4ac38 Mon Sep 17 00:00:00 2001 +From: Yan, Zheng +Date: Thu, 12 Nov 2009 09:33:26 +0000 +Subject: Btrfs: Avoid superfluous tree-log writeout + +From: Yan, Zheng + +commit 8cef4e160d74920ad1725f58c89fd75ec4c4ac38 upstream. + +We allow two log transactions at a time, but use same flag +to mark dirty tree-log btree blocks. So we may flush dirty +blocks belonging to newer log transaction when committing a +log transaction. This patch fixes the issue by using two +flags to mark dirty tree-log btree blocks. + +Signed-off-by: Yan Zheng +Signed-off-by: Chris Mason +Acked-by: Jeff Mahoney +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/disk-io.c | 6 +++--- + fs/btrfs/extent-tree.c | 12 ++++++++++-- + fs/btrfs/transaction.c | 21 +++++++++++---------- + fs/btrfs/transaction.h | 6 +++--- + fs/btrfs/tree-log.c | 33 ++++++++++++++++++++------------- + 5 files changed, 47 insertions(+), 31 deletions(-) + +--- a/fs/btrfs/disk-io.c ++++ b/fs/btrfs/disk-io.c +@@ -980,12 +980,12 @@ int btrfs_free_log_root_tree(struct btrf + + while (1) { + ret = find_first_extent_bit(&log_root_tree->dirty_log_pages, +- 0, &start, &end, EXTENT_DIRTY); ++ 0, &start, &end, EXTENT_DIRTY | EXTENT_NEW); + if (ret) + break; + +- clear_extent_dirty(&log_root_tree->dirty_log_pages, +- start, end, GFP_NOFS); ++ clear_extent_bits(&log_root_tree->dirty_log_pages, start, end, ++ EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS); + } + eb = fs_info->log_root_tree->node; + +--- a/fs/btrfs/extent-tree.c ++++ b/fs/btrfs/extent-tree.c +@@ -4919,8 +4919,16 @@ struct extent_buffer *btrfs_init_new_buf + btrfs_set_buffer_uptodate(buf); + + if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) { +- set_extent_dirty(&root->dirty_log_pages, buf->start, +- buf->start + buf->len - 1, GFP_NOFS); ++ /* ++ * we allow two log transactions at a time, use different ++ * EXENT bit to differentiate dirty pages. 
++ */ ++ if (root->log_transid % 2 == 0) ++ set_extent_dirty(&root->dirty_log_pages, buf->start, ++ buf->start + buf->len - 1, GFP_NOFS); ++ else ++ set_extent_new(&root->dirty_log_pages, buf->start, ++ buf->start + buf->len - 1, GFP_NOFS); + } else { + set_extent_dirty(&trans->transaction->dirty_pages, buf->start, + buf->start + buf->len - 1, GFP_NOFS); +--- a/fs/btrfs/transaction.c ++++ b/fs/btrfs/transaction.c +@@ -354,7 +354,7 @@ int btrfs_end_transaction_throttle(struc + * those extents are sent to disk but does not wait on them + */ + int btrfs_write_marked_extents(struct btrfs_root *root, +- struct extent_io_tree *dirty_pages) ++ struct extent_io_tree *dirty_pages, int mark) + { + int ret; + int err = 0; +@@ -367,7 +367,7 @@ int btrfs_write_marked_extents(struct bt + + while (1) { + ret = find_first_extent_bit(dirty_pages, start, &start, &end, +- EXTENT_DIRTY); ++ mark); + if (ret) + break; + while (start <= end) { +@@ -413,7 +413,7 @@ int btrfs_write_marked_extents(struct bt + * on all the pages and clear them from the dirty pages state tree + */ + int btrfs_wait_marked_extents(struct btrfs_root *root, +- struct extent_io_tree *dirty_pages) ++ struct extent_io_tree *dirty_pages, int mark) + { + int ret; + int err = 0; +@@ -425,12 +425,12 @@ int btrfs_wait_marked_extents(struct btr + unsigned long index; + + while (1) { +- ret = find_first_extent_bit(dirty_pages, 0, &start, &end, +- EXTENT_DIRTY); ++ ret = find_first_extent_bit(dirty_pages, start, &start, &end, ++ mark); + if (ret) + break; + +- clear_extent_dirty(dirty_pages, start, end, GFP_NOFS); ++ clear_extent_bits(dirty_pages, start, end, mark, GFP_NOFS); + while (start <= end) { + index = start >> PAGE_CACHE_SHIFT; + start = (u64)(index + 1) << PAGE_CACHE_SHIFT; +@@ -460,13 +460,13 @@ int btrfs_wait_marked_extents(struct btr + * those extents are on disk for transaction or log commit + */ + int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, +- struct extent_io_tree *dirty_pages) ++ struct extent_io_tree *dirty_pages, int mark) + { + int ret; + int ret2; + +- ret = btrfs_write_marked_extents(root, dirty_pages); +- ret2 = btrfs_wait_marked_extents(root, dirty_pages); ++ ret = btrfs_write_marked_extents(root, dirty_pages, mark); ++ ret2 = btrfs_wait_marked_extents(root, dirty_pages, mark); + return ret || ret2; + } + +@@ -479,7 +479,8 @@ int btrfs_write_and_wait_transaction(str + return filemap_write_and_wait(btree_inode->i_mapping); + } + return btrfs_write_and_wait_marked_extents(root, +- &trans->transaction->dirty_pages); ++ &trans->transaction->dirty_pages, ++ EXTENT_DIRTY); + } + + /* +--- a/fs/btrfs/transaction.h ++++ b/fs/btrfs/transaction.h +@@ -107,10 +107,10 @@ void btrfs_throttle(struct btrfs_root *r + int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, + struct btrfs_root *root); + int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, +- struct extent_io_tree *dirty_pages); ++ struct extent_io_tree *dirty_pages, int mark); + int btrfs_write_marked_extents(struct btrfs_root *root, +- struct extent_io_tree *dirty_pages); ++ struct extent_io_tree *dirty_pages, int mark); + int btrfs_wait_marked_extents(struct btrfs_root *root, +- struct extent_io_tree *dirty_pages); ++ struct extent_io_tree *dirty_pages, int mark); + int btrfs_transaction_in_commit(struct btrfs_fs_info *info); + #endif +--- a/fs/btrfs/tree-log.c ++++ b/fs/btrfs/tree-log.c +@@ -1977,10 +1977,11 @@ int btrfs_sync_log(struct btrfs_trans_ha + { + int index1; + int index2; ++ int mark; + int ret; + struct btrfs_root *log 
= root->log_root; + struct btrfs_root *log_root_tree = root->fs_info->log_root_tree; +- u64 log_transid = 0; ++ unsigned long log_transid = 0; + + mutex_lock(&root->log_mutex); + index1 = root->log_transid % 2; +@@ -2014,24 +2015,29 @@ int btrfs_sync_log(struct btrfs_trans_ha + goto out; + } + ++ log_transid = root->log_transid; ++ if (log_transid % 2 == 0) ++ mark = EXTENT_DIRTY; ++ else ++ mark = EXTENT_NEW; ++ + /* we start IO on all the marked extents here, but we don't actually + * wait for them until later. + */ +- ret = btrfs_write_marked_extents(log, &log->dirty_log_pages); ++ ret = btrfs_write_marked_extents(log, &log->dirty_log_pages, mark); + BUG_ON(ret); + + btrfs_set_root_node(&log->root_item, log->node); + + root->log_batch = 0; +- log_transid = root->log_transid; + root->log_transid++; + log->log_transid = root->log_transid; + root->log_start_pid = 0; + smp_mb(); + /* +- * log tree has been flushed to disk, new modifications of +- * the log will be written to new positions. so it's safe to +- * allow log writers to go in. ++ * IO has been started, blocks of the log tree have WRITTEN flag set ++ * in their headers. new modifications of the log will be written to ++ * new positions. so it's safe to allow log writers to go in. + */ + mutex_unlock(&root->log_mutex); + +@@ -2052,7 +2058,7 @@ int btrfs_sync_log(struct btrfs_trans_ha + + index2 = log_root_tree->log_transid % 2; + if (atomic_read(&log_root_tree->log_commit[index2])) { +- btrfs_wait_marked_extents(log, &log->dirty_log_pages); ++ btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); + wait_log_commit(trans, log_root_tree, + log_root_tree->log_transid); + mutex_unlock(&log_root_tree->log_mutex); +@@ -2072,16 +2078,17 @@ int btrfs_sync_log(struct btrfs_trans_ha + * check the full commit flag again + */ + if (root->fs_info->last_trans_log_full_commit == trans->transid) { +- btrfs_wait_marked_extents(log, &log->dirty_log_pages); ++ btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); + mutex_unlock(&log_root_tree->log_mutex); + ret = -EAGAIN; + goto out_wake_log_root; + } + + ret = btrfs_write_and_wait_marked_extents(log_root_tree, +- &log_root_tree->dirty_log_pages); ++ &log_root_tree->dirty_log_pages, ++ EXTENT_DIRTY | EXTENT_NEW); + BUG_ON(ret); +- btrfs_wait_marked_extents(log, &log->dirty_log_pages); ++ btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); + + btrfs_set_super_log_root(&root->fs_info->super_for_commit, + log_root_tree->node->start); +@@ -2147,12 +2154,12 @@ int btrfs_free_log(struct btrfs_trans_ha + + while (1) { + ret = find_first_extent_bit(&log->dirty_log_pages, +- 0, &start, &end, EXTENT_DIRTY); ++ 0, &start, &end, EXTENT_DIRTY | EXTENT_NEW); + if (ret) + break; + +- clear_extent_dirty(&log->dirty_log_pages, +- start, end, GFP_NOFS); ++ clear_extent_bits(&log->dirty_log_pages, start, end, ++ EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS); + } + + if (log->log_transid > 0) { diff --git a/queue-2.6.32/btrfs-btrfs_mark_extent_written-uses-the-wrong-slot.patch b/queue-2.6.32/btrfs-btrfs_mark_extent_written-uses-the-wrong-slot.patch new file mode 100644 index 00000000000..8f6b44ed6f9 --- /dev/null +++ b/queue-2.6.32/btrfs-btrfs_mark_extent_written-uses-the-wrong-slot.patch @@ -0,0 +1,48 @@ +From 3f6fae9559225741c91f1320090b285da1413290 Mon Sep 17 00:00:00 2001 +From: Shaohua Li +Date: Thu, 11 Feb 2010 07:43:00 +0000 +Subject: Btrfs: btrfs_mark_extent_written uses the wrong slot + +From: Shaohua Li + +commit 3f6fae9559225741c91f1320090b285da1413290 upstream. 
+ +My test do: fallocate a big file and do write. The file is 512M, but +after file write is done btrfs-debug-tree shows: +item 6 key (257 EXTENT_DATA 0) itemoff 3516 itemsize 53 + extent data disk byte 1103101952 nr 536870912 + extent data offset 0 nr 399634432 ram 536870912 + extent compression 0 +Looks like a regression introducted by +6c7d54ac87f338c479d9729e8392eca3f76e11e1, where we set wrong slot. + +Signed-off-by: Shaohua Li +Acked-by: Yan Zheng +Signed-off-by: Chris Mason +Acked-by: Jeff Mahoney +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/file.c | 6 ++++-- + 1 file changed, 4 insertions(+), 2 deletions(-) + +--- a/fs/btrfs/file.c ++++ b/fs/btrfs/file.c +@@ -720,13 +720,15 @@ again: + inode->i_ino, orig_offset); + BUG_ON(ret); + } +- fi = btrfs_item_ptr(leaf, path->slots[0], +- struct btrfs_file_extent_item); + if (del_nr == 0) { ++ fi = btrfs_item_ptr(leaf, path->slots[0], ++ struct btrfs_file_extent_item); + btrfs_set_file_extent_type(leaf, fi, + BTRFS_FILE_EXTENT_REG); + btrfs_mark_buffer_dirty(leaf); + } else { ++ fi = btrfs_item_ptr(leaf, del_slot - 1, ++ struct btrfs_file_extent_item); + btrfs_set_file_extent_type(leaf, fi, + BTRFS_FILE_EXTENT_REG); + btrfs_set_file_extent_num_bytes(leaf, fi, diff --git a/queue-2.6.32/btrfs-check-return-value-of-open_bdev_exclusive-properly.patch b/queue-2.6.32/btrfs-check-return-value-of-open_bdev_exclusive-properly.patch new file mode 100644 index 00000000000..b45a988c236 --- /dev/null +++ b/queue-2.6.32/btrfs-check-return-value-of-open_bdev_exclusive-properly.patch @@ -0,0 +1,37 @@ +From 7f59203abeaf18bf3497b308891f95a4489810ad Mon Sep 17 00:00:00 2001 +From: Josef Bacik +Date: Wed, 27 Jan 2010 02:09:00 +0000 +Subject: Btrfs: check return value of open_bdev_exclusive properly + +From: Josef Bacik + +commit 7f59203abeaf18bf3497b308891f95a4489810ad upstream. + +Hit this problem while testing RAID1 failure stuff. open_bdev_exclusive +returns ERR_PTR(), not NULL. So change the return value properly. This +is important if you accidently specify a device that doesn't exist when +trying to add a new device to an array, you will panic the box +dereferencing bdev. + +Signed-off-by: Josef Bacik +Signed-off-by: Chris Mason +Acked-by: Jeff Mahoney +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/volumes.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +--- a/fs/btrfs/volumes.c ++++ b/fs/btrfs/volumes.c +@@ -1434,8 +1434,8 @@ int btrfs_init_new_device(struct btrfs_r + return -EINVAL; + + bdev = open_bdev_exclusive(device_path, 0, root->fs_info->bdev_holder); +- if (!bdev) +- return -EIO; ++ if (IS_ERR(bdev)) ++ return PTR_ERR(bdev); + + if (root->fs_info->fs_devices->seeding) { + seeding_dev = 1; diff --git a/queue-2.6.32/btrfs-check-total-number-of-devices-when-removing-missing.patch b/queue-2.6.32/btrfs-check-total-number-of-devices-when-removing-missing.patch new file mode 100644 index 00000000000..cde0ebd02ef --- /dev/null +++ b/queue-2.6.32/btrfs-check-total-number-of-devices-when-removing-missing.patch @@ -0,0 +1,51 @@ +From 035fe03a7ad56982b30ab3a522b7b08d58feccd0 Mon Sep 17 00:00:00 2001 +From: Josef Bacik +Date: Wed, 27 Jan 2010 02:09:38 +0000 +Subject: Btrfs: check total number of devices when removing missing + +From: Josef Bacik + +commit 035fe03a7ad56982b30ab3a522b7b08d58feccd0 upstream. + +If you have a disk failure in RAID1 and then add a new disk to the +array, and then try to remove the missing volume, it will fail. 
The +reason is the sanity check only looks at the total number of rw devices, +which is just 2 because we have 2 good disks and 1 bad one. Instead +check the total number of devices in the array to make sure we can +actually remove the device. Tested this with a failed disk setup and +with this test we can now run + +btrfs-vol -r missing /mount/point + +and it works fine. + +Signed-off-by: Josef Bacik +Signed-off-by: Chris Mason +Acked-by: Jeff Mahoney +Signed-off-by: Greg Kroah-Hartman + + +--- + fs/btrfs/volumes.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +--- a/fs/btrfs/volumes.c ++++ b/fs/btrfs/volumes.c +@@ -1135,7 +1135,7 @@ int btrfs_rm_device(struct btrfs_root *r + root->fs_info->avail_metadata_alloc_bits; + + if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && +- root->fs_info->fs_devices->rw_devices <= 4) { ++ root->fs_info->fs_devices->num_devices <= 4) { + printk(KERN_ERR "btrfs: unable to go below four devices " + "on raid10\n"); + ret = -EINVAL; +@@ -1143,7 +1143,7 @@ int btrfs_rm_device(struct btrfs_root *r + } + + if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && +- root->fs_info->fs_devices->rw_devices <= 2) { ++ root->fs_info->fs_devices->num_devices <= 2) { + printk(KERN_ERR "btrfs: unable to go below two " + "devices on raid1\n"); + ret = -EINVAL; diff --git a/queue-2.6.32/btrfs-deal-with-null-acl-sent-to-btrfs_set_acl.patch b/queue-2.6.32/btrfs-deal-with-null-acl-sent-to-btrfs_set_acl.patch new file mode 100644 index 00000000000..83b7d75ebf9 --- /dev/null +++ b/queue-2.6.32/btrfs-deal-with-null-acl-sent-to-btrfs_set_acl.patch @@ -0,0 +1,43 @@ +From a9cc71a60c29a09174bee2fcef8f924c529fd4b7 Mon Sep 17 00:00:00 2001 +From: Chris Mason +Date: Sun, 17 Jan 2010 20:36:18 -0500 +Subject: Btrfs: deal with NULL acl sent to btrfs_set_acl + +From: Chris Mason + +commit a9cc71a60c29a09174bee2fcef8f924c529fd4b7 upstream. + +It is legal for btrfs_set_acl to be sent a NULL acl. This +makes sure we don't dereference it. A similar patch was sent by +Johannes Hirte + +Signed-off-by: Chris Mason +Acked-by: Jeff Mahoney +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/acl.c | 12 +++++++----- + 1 file changed, 7 insertions(+), 5 deletions(-) + +--- a/fs/btrfs/acl.c ++++ b/fs/btrfs/acl.c +@@ -112,12 +112,14 @@ static int btrfs_set_acl(struct btrfs_tr + switch (type) { + case ACL_TYPE_ACCESS: + mode = inode->i_mode; +- ret = posix_acl_equiv_mode(acl, &mode); +- if (ret < 0) +- return ret; +- ret = 0; +- inode->i_mode = mode; + name = POSIX_ACL_XATTR_ACCESS; ++ if (acl) { ++ ret = posix_acl_equiv_mode(acl, &mode); ++ if (ret < 0) ++ return ret; ++ inode->i_mode = mode; ++ } ++ ret = 0; + break; + case ACL_TYPE_DEFAULT: + if (!S_ISDIR(inode->i_mode)) diff --git a/queue-2.6.32/btrfs-deny-sys_link-across-subvolumes.patch b/queue-2.6.32/btrfs-deny-sys_link-across-subvolumes.patch new file mode 100644 index 00000000000..2dda52aabc3 --- /dev/null +++ b/queue-2.6.32/btrfs-deny-sys_link-across-subvolumes.patch @@ -0,0 +1,42 @@ +From 4a8be425a8fb8fbb5d881eb55fa6634c3463b9c9 Mon Sep 17 00:00:00 2001 +From: TARUISI Hiroaki +Date: Thu, 12 Nov 2009 07:14:26 +0000 +Subject: Btrfs: deny sys_link across subvolumes. + +From: TARUISI Hiroaki + +commit 4a8be425a8fb8fbb5d881eb55fa6634c3463b9c9 upstream. + +I rebased Christian Parpart's patch to deny hard link across +subvolumes. Original patch modifies also btrfs_rename, but +I excluded it because we can move across subvolumes now and +it make no problem. +----------------- + +Hard link across subvolumes should not allowed in Btrfs. 
+btrfs_link checks root of 'to' directory is same as root +of 'from' file. If not same, btrfs_link returns -EPERM. + +Signed-off-by: TARUISI Hiroaki +Signed-off-by: Chris Mason +Acked-by: Jeff Mahoney +Signed-off-by: Greg Kroah-Hartman + + +--- + fs/btrfs/inode.c | 4 ++++ + 1 file changed, 4 insertions(+) + +--- a/fs/btrfs/inode.c ++++ b/fs/btrfs/inode.c +@@ -4462,6 +4462,10 @@ static int btrfs_link(struct dentry *old + if (inode->i_nlink == 0) + return -ENOENT; + ++ /* do not allow sys_link's with other subvols of the same device */ ++ if (root->objectid != BTRFS_I(inode)->root->objectid) ++ return -EPERM; ++ + /* + * 1 item for inode ref + * 2 items for dir items diff --git a/queue-2.6.32/btrfs-do-not-mark-the-chunk-as-readonly-if-in-degraded-mode.patch b/queue-2.6.32/btrfs-do-not-mark-the-chunk-as-readonly-if-in-degraded-mode.patch new file mode 100644 index 00000000000..c0c829f8922 --- /dev/null +++ b/queue-2.6.32/btrfs-do-not-mark-the-chunk-as-readonly-if-in-degraded-mode.patch @@ -0,0 +1,41 @@ +From f48b90756bd834dda852ff514f2690d3175b1f44 Mon Sep 17 00:00:00 2001 +From: Josef Bacik +Date: Wed, 27 Jan 2010 02:07:59 +0000 +Subject: Btrfs: do not mark the chunk as readonly if in degraded mode + +From: Josef Bacik + +commit f48b90756bd834dda852ff514f2690d3175b1f44 upstream. + +If a RAID setup has chunks that span multiple disks, and one of those +disks has failed, btrfs_chunk_readonly will return 1 since one of the +disks in that chunk's stripes is dead and therefore not writeable. So +instead if we are in degraded mode, return 0 so we can go ahead and +allocate stuff. Without this patch all of the block groups in a RAID1 +setup will end up read-only, which will mean we can't add new disks to +the array since we won't be able to make allocations. + +Signed-off-by: Josef Bacik +Signed-off-by: Chris Mason +Acked-by: Jeff Mahoney +Signed-off-by: Greg Kroah-Hartman + + +--- + fs/btrfs/volumes.c | 5 +++++ + 1 file changed, 5 insertions(+) + +--- a/fs/btrfs/volumes.c ++++ b/fs/btrfs/volumes.c +@@ -2538,6 +2538,11 @@ int btrfs_chunk_readonly(struct btrfs_ro + if (!em) + return 1; + ++ if (btrfs_test_opt(root, DEGRADED)) { ++ free_extent_map(em); ++ return 0; ++ } ++ + map = (struct map_lookup *)em->bdev; + for (i = 0; i < map->num_stripes; i++) { + if (!map->stripes[i].dev->writeable) { diff --git a/queue-2.6.32/btrfs-do-not-try-and-lookup-the-file-extent-when-finishing-ordered-io.patch b/queue-2.6.32/btrfs-do-not-try-and-lookup-the-file-extent-when-finishing-ordered-io.patch new file mode 100644 index 00000000000..dca70e70b98 --- /dev/null +++ b/queue-2.6.32/btrfs-do-not-try-and-lookup-the-file-extent-when-finishing-ordered-io.patch @@ -0,0 +1,105 @@ +From efd049fb26a162c3830fd3cb1001fdc09b147f3b Mon Sep 17 00:00:00 2001 +From: Josef Bacik +Date: Tue, 2 Feb 2010 20:50:10 +0000 +Subject: Btrfs: do not try and lookup the file extent when finishing ordered io + +From: Josef Bacik + +commit efd049fb26a162c3830fd3cb1001fdc09b147f3b upstream. + +When running the following fio job + +[torrent] +filename=torrent-test +rw=randwrite +size=4g +filesize=4g +bs=4k +ioengine=sync + +you would see long stalls where no work was being done. That is because we were +doing all this extra work to read in the file extent outside of the transaction, +however in the random io case this ends up hurting us because the file extents +are not there to begin with. So axe this logic, since we end up reading in the +file extent when we go to update it anyway. 
This took the fio job from 11 mb/s +with several ~10 second stalls to 24 mb/s to a couple of 1-2 second stalls. + +Signed-off-by: Josef Bacik +Signed-off-by: Chris Mason +Acked-by: Jeff Mahoney +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/inode.c | 46 ++-------------------------------------------- + 1 file changed, 2 insertions(+), 44 deletions(-) + +--- a/fs/btrfs/inode.c ++++ b/fs/btrfs/inode.c +@@ -1680,24 +1680,6 @@ static int insert_reserved_file_extent(s + * before we start the transaction. It limits the amount of btree + * reads required while inside the transaction. + */ +-static noinline void reada_csum(struct btrfs_root *root, +- struct btrfs_path *path, +- struct btrfs_ordered_extent *ordered_extent) +-{ +- struct btrfs_ordered_sum *sum; +- u64 bytenr; +- +- sum = list_entry(ordered_extent->list.next, struct btrfs_ordered_sum, +- list); +- bytenr = sum->sums[0].bytenr; +- +- /* +- * we don't care about the results, the point of this search is +- * just to get the btree leaves into ram +- */ +- btrfs_lookup_csum(NULL, root->fs_info->csum_root, path, bytenr, 0); +-} +- + /* as ordered data IO finishes, this gets called so we can finish + * an ordered extent if the range of bytes in the file it covers are + * fully written. +@@ -1708,7 +1690,6 @@ static int btrfs_finish_ordered_io(struc + struct btrfs_trans_handle *trans; + struct btrfs_ordered_extent *ordered_extent = NULL; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; +- struct btrfs_path *path; + int compressed = 0; + int ret; + +@@ -1716,32 +1697,9 @@ static int btrfs_finish_ordered_io(struc + if (!ret) + return 0; + +- /* +- * before we join the transaction, try to do some of our IO. +- * This will limit the amount of IO that we have to do with +- * the transaction running. We're unlikely to need to do any +- * IO if the file extents are new, the disk_i_size checks +- * covers the most common case. +- */ +- if (start < BTRFS_I(inode)->disk_i_size) { +- path = btrfs_alloc_path(); +- if (path) { +- ret = btrfs_lookup_file_extent(NULL, root, path, +- inode->i_ino, +- start, 0); +- ordered_extent = btrfs_lookup_ordered_extent(inode, +- start); +- if (!list_empty(&ordered_extent->list)) { +- btrfs_release_path(root, path); +- reada_csum(root, path, ordered_extent); +- } +- btrfs_free_path(path); +- } +- } +- +- if (!ordered_extent) +- ordered_extent = btrfs_lookup_ordered_extent(inode, start); ++ ordered_extent = btrfs_lookup_ordered_extent(inode, start); + BUG_ON(!ordered_extent); ++ + if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { + BUG_ON(!list_empty(&ordered_extent->list)); + ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent); diff --git a/queue-2.6.32/btrfs-don-t-add-extent-0-to-the-free-space-cache-v2.patch b/queue-2.6.32/btrfs-don-t-add-extent-0-to-the-free-space-cache-v2.patch new file mode 100644 index 00000000000..a2dda34e1f4 --- /dev/null +++ b/queue-2.6.32/btrfs-don-t-add-extent-0-to-the-free-space-cache-v2.patch @@ -0,0 +1,48 @@ +From 06b2331f8333ec6edf41662757ce8882cc1747d5 Mon Sep 17 00:00:00 2001 +From: Yan, Zheng +Date: Thu, 26 Nov 2009 09:31:11 +0000 +Subject: Btrfs: don't add extent 0 to the free space cache v2 + +From: Yan, Zheng + +commit 06b2331f8333ec6edf41662757ce8882cc1747d5 upstream. + +If block group 0 is completely free, btrfs_read_block_groups will +add extent [0, BTRFS_SUPER_INFO_OFFSET) to the free space cache. 
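
Block group 0 begins at logical offset 0, but the primary superblock copy sits
at BTRFS_SUPER_INFO_OFFSET (64K), so that head region must be excluded rather
than advertised as free. A condensed sketch of the guard, using the names from
the hunks below:

	/* exclude [0, BTRFS_SUPER_INFO_OFFSET) before scanning for free space */
	if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) {
		stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid;
		cache->bytes_super += stripe_len;
		ret = add_excluded_extent(root, cache->key.objectid,
					  stripe_len);
		BUG_ON(ret);
	}

A knock-on effect is that the scan cursor can now land inside an excluded
extent rather than exactly at its start, which is why the second hunk relaxes
the add_new_free_space() test from extent_start == start to
extent_start <= start.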
+ +Signed-off-by: Yan Zheng +Signed-off-by: Chris Mason +Acked-by: Jeff Mahoney +Signed-off-by: Greg Kroah-Hartman + + +--- + fs/btrfs/extent-tree.c | 10 +++++++++- + 1 file changed, 9 insertions(+), 1 deletion(-) + +--- a/fs/btrfs/extent-tree.c ++++ b/fs/btrfs/extent-tree.c +@@ -195,6 +195,14 @@ static int exclude_super_stripes(struct + int stripe_len; + int i, nr, ret; + ++ if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) { ++ stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid; ++ cache->bytes_super += stripe_len; ++ ret = add_excluded_extent(root, cache->key.objectid, ++ stripe_len); ++ BUG_ON(ret); ++ } ++ + for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { + bytenr = btrfs_sb_offset(i); + ret = btrfs_rmap_block(&root->fs_info->mapping_tree, +@@ -255,7 +263,7 @@ static u64 add_new_free_space(struct btr + if (ret) + break; + +- if (extent_start == start) { ++ if (extent_start <= start) { + start = extent_end + 1; + } else if (extent_start > start && extent_start < end) { + size = extent_start - start; diff --git a/queue-2.6.32/btrfs-fail-mount-on-bad-mount-options.patch b/queue-2.6.32/btrfs-fail-mount-on-bad-mount-options.patch new file mode 100644 index 00000000000..9d66a71f976 --- /dev/null +++ b/queue-2.6.32/btrfs-fail-mount-on-bad-mount-options.patch @@ -0,0 +1,50 @@ +From a7a3f7cadd9bdee569243f7ead9550aa16b60e07 Mon Sep 17 00:00:00 2001 +From: Sage Weil +Date: Sat, 7 Nov 2009 06:19:16 +0000 +Subject: Btrfs: fail mount on bad mount options + +From: Sage Weil + +commit a7a3f7cadd9bdee569243f7ead9550aa16b60e07 upstream. + +We shouldn't silently ignore unrecognized options. + +Signed-off-by: Sage Weil +Signed-off-by: Chris Mason +Acked-by: Jeff Mahoney +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/super.c | 9 ++++++++- + 1 file changed, 8 insertions(+), 1 deletion(-) + +--- a/fs/btrfs/super.c ++++ b/fs/btrfs/super.c +@@ -128,6 +128,7 @@ int btrfs_parse_options(struct btrfs_roo + substring_t args[MAX_OPT_ARGS]; + char *p, *num; + int intarg; ++ int ret = 0; + + if (!options) + return 0; +@@ -262,12 +263,18 @@ int btrfs_parse_options(struct btrfs_roo + case Opt_discard: + btrfs_set_opt(info->mount_opt, DISCARD); + break; ++ case Opt_err: ++ printk(KERN_INFO "btrfs: unrecognized mount option " ++ "'%s'\n", p); ++ ret = -EINVAL; ++ goto out; + default: + break; + } + } ++out: + kfree(options); +- return 0; ++ return ret; + } + + /* diff --git a/queue-2.6.32/btrfs-fix-a-memory-leak-in-btrfs_init_acl.patch b/queue-2.6.32/btrfs-fix-a-memory-leak-in-btrfs_init_acl.patch new file mode 100644 index 00000000000..f6924391c50 --- /dev/null +++ b/queue-2.6.32/btrfs-fix-a-memory-leak-in-btrfs_init_acl.patch @@ -0,0 +1,31 @@ +From f858153c367a397235d3e81136741e40e44faf1d Mon Sep 17 00:00:00 2001 +From: Yang Hongyang +Date: Tue, 26 Jan 2010 00:48:23 +0000 +Subject: Btrfs: fix a memory leak in btrfs_init_acl + +From: Yang Hongyang + +commit f858153c367a397235d3e81136741e40e44faf1d upstream. 
+ +In btrfs_init_acl() cloned acl is not released + +Signed-off-by: Yang Hongyang +Signed-off-by: Chris Mason +Acked-by: Jeff Mahoney +Signed-off-by: Greg Kroah-Hartman + + +--- + fs/btrfs/acl.c | 1 + + 1 file changed, 1 insertion(+) + +--- a/fs/btrfs/acl.c ++++ b/fs/btrfs/acl.c +@@ -272,6 +272,7 @@ int btrfs_init_acl(struct btrfs_trans_ha + ACL_TYPE_ACCESS); + } + } ++ posix_acl_release(clone); + } + failed: + posix_acl_release(acl); diff --git a/queue-2.6.32/btrfs-fix-btrfs_drop_extent_cache-for-skip-pinned-case.patch b/queue-2.6.32/btrfs-fix-btrfs_drop_extent_cache-for-skip-pinned-case.patch new file mode 100644 index 00000000000..9ecc378e6c3 --- /dev/null +++ b/queue-2.6.32/btrfs-fix-btrfs_drop_extent_cache-for-skip-pinned-case.patch @@ -0,0 +1,45 @@ +From 55ef68990029fcd8d04d42fc184aa7fb18cf309e Mon Sep 17 00:00:00 2001 +From: Yan, Zheng +Date: Thu, 12 Nov 2009 09:36:44 +0000 +Subject: Btrfs: Fix btrfs_drop_extent_cache for skip pinned case + +From: Yan, Zheng + +commit 55ef68990029fcd8d04d42fc184aa7fb18cf309e upstream. + +The check for skip pinned case is wrong, it may breaks the +while loop too soon. + +Signed-off-by: Yan Zheng +Signed-off-by: Chris Mason +Acked-by: Jeff Mahoney +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/file.c | 10 +++------- + 1 file changed, 3 insertions(+), 7 deletions(-) + +--- a/fs/btrfs/file.c ++++ b/fs/btrfs/file.c +@@ -179,18 +179,14 @@ int btrfs_drop_extent_cache(struct inode + } + flags = em->flags; + if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) { +- if (em->start <= start && +- (!testend || em->start + em->len >= start + len)) { ++ if (testend && em->start + em->len >= start + len) { + free_extent_map(em); + write_unlock(&em_tree->lock); + break; + } +- if (start < em->start) { +- len = em->start - start; +- } else { ++ start = em->start + em->len; ++ if (testend) + len = start + len - (em->start + em->len); +- start = em->start + em->len; +- } + free_extent_map(em); + write_unlock(&em_tree->lock); + continue; diff --git a/queue-2.6.32/btrfs-fix-disk_i_size-update-corner-case.patch b/queue-2.6.32/btrfs-fix-disk_i_size-update-corner-case.patch new file mode 100644 index 00000000000..c742fd4b53a --- /dev/null +++ b/queue-2.6.32/btrfs-fix-disk_i_size-update-corner-case.patch @@ -0,0 +1,448 @@ +From c216775458a2ee345d9412a2770c2916acfb5d30 Mon Sep 17 00:00:00 2001 +From: Yan, Zheng +Date: Thu, 12 Nov 2009 09:34:21 +0000 +Subject: Btrfs: Fix disk_i_size update corner case + +From: Yan, Zheng + +commit c216775458a2ee345d9412a2770c2916acfb5d30 upstream. + +There are some cases file extents are inserted without involving +ordered struct. In these cases, we update disk_i_size directly, +without checking pending ordered extent and DELALLOC bit. This +patch extends btrfs_ordered_update_i_size() to handle these cases. 
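
Concretely, btrfs_ordered_update_i_size() gains an offset parameter so that
both classes of caller can update disk_i_size through one code path; the two
call styles, as they appear in the hunks below:

	/* completing an ordered extent: the offset comes from the entry */
	ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent);

	/* inserting a file extent directly (no ordered extent), e.g. for
	 * preallocation: pass the end offset and a NULL ordered pointer */
	ret = btrfs_ordered_update_i_size(inode, cur_offset, NULL);

When ordered is NULL the function searches the ordered tree at the given
offset itself, and in both cases it checks the DELALLOC bit before moving
disk_i_size forward.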
+ +Signed-off-by: Yan Zheng +Signed-off-by: Chris Mason +Acked-by: Jeff Mahoney +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/btrfs_inode.h | 5 -- + fs/btrfs/inode.c | 71 ++++++++++++++++++++------------ + fs/btrfs/ordered-data.c | 105 +++++++++++++++++++++++++++++++++++++----------- + fs/btrfs/ordered-data.h | 2 + 4 files changed, 127 insertions(+), 56 deletions(-) + +--- a/fs/btrfs/btrfs_inode.h ++++ b/fs/btrfs/btrfs_inode.h +@@ -44,9 +44,6 @@ struct btrfs_inode { + */ + struct extent_io_tree io_failure_tree; + +- /* held while inesrting or deleting extents from files */ +- struct mutex extent_mutex; +- + /* held while logging the inode in tree-log.c */ + struct mutex log_mutex; + +@@ -166,7 +163,7 @@ static inline struct btrfs_inode *BTRFS_ + + static inline void btrfs_i_size_write(struct inode *inode, u64 size) + { +- inode->i_size = size; ++ i_size_write(inode, size); + BTRFS_I(inode)->disk_i_size = size; + } + +--- a/fs/btrfs/inode.c ++++ b/fs/btrfs/inode.c +@@ -188,8 +188,18 @@ static noinline int insert_inline_extent + btrfs_mark_buffer_dirty(leaf); + btrfs_free_path(path); + ++ /* ++ * we're an inline extent, so nobody can ++ * extend the file past i_size without locking ++ * a page we already have locked. ++ * ++ * We must do any isize and inode updates ++ * before we unlock the pages. Otherwise we ++ * could end up racing with unlink. ++ */ + BTRFS_I(inode)->disk_i_size = inode->i_size; + btrfs_update_inode(trans, root, inode); ++ + return 0; + fail: + btrfs_free_path(path); +@@ -415,7 +425,6 @@ again: + start, end, + total_compressed, pages); + } +- btrfs_end_transaction(trans, root); + if (ret == 0) { + /* + * inline extent creation worked, we don't need +@@ -429,9 +438,11 @@ again: + EXTENT_CLEAR_DELALLOC | + EXTENT_CLEAR_ACCOUNTING | + EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK); +- ret = 0; ++ ++ btrfs_end_transaction(trans, root); + goto free_pages_out; + } ++ btrfs_end_transaction(trans, root); + } + + if (will_compress) { +@@ -542,7 +553,6 @@ static noinline int submit_compressed_ex + if (list_empty(&async_cow->extents)) + return 0; + +- trans = btrfs_join_transaction(root, 1); + + while (!list_empty(&async_cow->extents)) { + async_extent = list_entry(async_cow->extents.next, +@@ -589,19 +599,15 @@ retry: + lock_extent(io_tree, async_extent->start, + async_extent->start + async_extent->ram_size - 1, + GFP_NOFS); +- /* +- * here we're doing allocation and writeback of the +- * compressed pages +- */ +- btrfs_drop_extent_cache(inode, async_extent->start, +- async_extent->start + +- async_extent->ram_size - 1, 0); + ++ trans = btrfs_join_transaction(root, 1); + ret = btrfs_reserve_extent(trans, root, + async_extent->compressed_size, + async_extent->compressed_size, + 0, alloc_hint, + (u64)-1, &ins, 1); ++ btrfs_end_transaction(trans, root); ++ + if (ret) { + int i; + for (i = 0; i < async_extent->nr_pages; i++) { +@@ -617,6 +623,14 @@ retry: + goto retry; + } + ++ /* ++ * here we're doing allocation and writeback of the ++ * compressed pages ++ */ ++ btrfs_drop_extent_cache(inode, async_extent->start, ++ async_extent->start + ++ async_extent->ram_size - 1, 0); ++ + em = alloc_extent_map(GFP_NOFS); + em->start = async_extent->start; + em->len = async_extent->ram_size; +@@ -648,8 +662,6 @@ retry: + BTRFS_ORDERED_COMPRESSED); + BUG_ON(ret); + +- btrfs_end_transaction(trans, root); +- + /* + * clear dirty, set writeback and unlock the pages. 
+ */ +@@ -671,13 +683,11 @@ retry: + async_extent->nr_pages); + + BUG_ON(ret); +- trans = btrfs_join_transaction(root, 1); + alloc_hint = ins.objectid + ins.offset; + kfree(async_extent); + cond_resched(); + } + +- btrfs_end_transaction(trans, root); + return 0; + } + +@@ -741,6 +751,7 @@ static noinline int cow_file_range(struc + EXTENT_CLEAR_DIRTY | + EXTENT_SET_WRITEBACK | + EXTENT_END_WRITEBACK); ++ + *nr_written = *nr_written + + (end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE; + *page_started = 1; +@@ -1727,18 +1738,27 @@ static int btrfs_finish_ordered_io(struc + } + } + +- trans = btrfs_join_transaction(root, 1); +- + if (!ordered_extent) + ordered_extent = btrfs_lookup_ordered_extent(inode, start); + BUG_ON(!ordered_extent); +- if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) +- goto nocow; ++ if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { ++ BUG_ON(!list_empty(&ordered_extent->list)); ++ ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent); ++ if (!ret) { ++ trans = btrfs_join_transaction(root, 1); ++ ret = btrfs_update_inode(trans, root, inode); ++ BUG_ON(ret); ++ btrfs_end_transaction(trans, root); ++ } ++ goto out; ++ } + + lock_extent(io_tree, ordered_extent->file_offset, + ordered_extent->file_offset + ordered_extent->len - 1, + GFP_NOFS); + ++ trans = btrfs_join_transaction(root, 1); ++ + if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags)) + compressed = 1; + if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { +@@ -1765,22 +1785,20 @@ static int btrfs_finish_ordered_io(struc + unlock_extent(io_tree, ordered_extent->file_offset, + ordered_extent->file_offset + ordered_extent->len - 1, + GFP_NOFS); +-nocow: + add_pending_csums(trans, inode, ordered_extent->file_offset, + &ordered_extent->list); + +- mutex_lock(&BTRFS_I(inode)->extent_mutex); +- btrfs_ordered_update_i_size(inode, ordered_extent); +- btrfs_update_inode(trans, root, inode); +- btrfs_remove_ordered_extent(inode, ordered_extent); +- mutex_unlock(&BTRFS_I(inode)->extent_mutex); +- ++ /* this also removes the ordered extent from the tree */ ++ btrfs_ordered_update_i_size(inode, 0, ordered_extent); ++ ret = btrfs_update_inode(trans, root, inode); ++ BUG_ON(ret); ++ btrfs_end_transaction(trans, root); ++out: + /* once for us */ + btrfs_put_ordered_extent(ordered_extent); + /* once for the tree */ + btrfs_put_ordered_extent(ordered_extent); + +- btrfs_end_transaction(trans, root); + return 0; + } + +@@ -3562,7 +3580,6 @@ static noinline void init_btrfs_i(struct + INIT_LIST_HEAD(&BTRFS_I(inode)->ordered_operations); + RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node); + btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree); +- mutex_init(&BTRFS_I(inode)->extent_mutex); + mutex_init(&BTRFS_I(inode)->log_mutex); + } + +--- a/fs/btrfs/ordered-data.c ++++ b/fs/btrfs/ordered-data.c +@@ -291,16 +291,16 @@ int btrfs_put_ordered_extent(struct btrf + + /* + * remove an ordered extent from the tree. No references are dropped +- * but, anyone waiting on this extent is woken up. ++ * and you must wake_up entry->wait. You must hold the tree mutex ++ * while you call this function. 
+ */ +-int btrfs_remove_ordered_extent(struct inode *inode, ++static int __btrfs_remove_ordered_extent(struct inode *inode, + struct btrfs_ordered_extent *entry) + { + struct btrfs_ordered_inode_tree *tree; + struct rb_node *node; + + tree = &BTRFS_I(inode)->ordered_tree; +- mutex_lock(&tree->mutex); + node = &entry->rb_node; + rb_erase(node, &tree->tree); + tree->last = NULL; +@@ -326,9 +326,26 @@ int btrfs_remove_ordered_extent(struct i + } + spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); + ++ return 0; ++} ++ ++/* ++ * remove an ordered extent from the tree. No references are dropped ++ * but any waiters are woken. ++ */ ++int btrfs_remove_ordered_extent(struct inode *inode, ++ struct btrfs_ordered_extent *entry) ++{ ++ struct btrfs_ordered_inode_tree *tree; ++ int ret; ++ ++ tree = &BTRFS_I(inode)->ordered_tree; ++ mutex_lock(&tree->mutex); ++ ret = __btrfs_remove_ordered_extent(inode, entry); + mutex_unlock(&tree->mutex); + wake_up(&entry->wait); +- return 0; ++ ++ return ret; + } + + /* +@@ -589,7 +606,7 @@ out: + * After an extent is done, call this to conditionally update the on disk + * i_size. i_size is updated to cover any fully written part of the file. + */ +-int btrfs_ordered_update_i_size(struct inode *inode, ++int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, + struct btrfs_ordered_extent *ordered) + { + struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree; +@@ -597,18 +614,30 @@ int btrfs_ordered_update_i_size(struct i + u64 disk_i_size; + u64 new_i_size; + u64 i_size_test; ++ u64 i_size = i_size_read(inode); + struct rb_node *node; ++ struct rb_node *prev = NULL; + struct btrfs_ordered_extent *test; ++ int ret = 1; ++ ++ if (ordered) ++ offset = entry_end(ordered); + + mutex_lock(&tree->mutex); + disk_i_size = BTRFS_I(inode)->disk_i_size; + ++ /* truncate file */ ++ if (disk_i_size > i_size) { ++ BTRFS_I(inode)->disk_i_size = i_size; ++ ret = 0; ++ goto out; ++ } ++ + /* + * if the disk i_size is already at the inode->i_size, or + * this ordered extent is inside the disk i_size, we're done + */ +- if (disk_i_size >= inode->i_size || +- ordered->file_offset + ordered->len <= disk_i_size) { ++ if (disk_i_size == i_size || offset <= disk_i_size) { + goto out; + } + +@@ -616,8 +645,7 @@ int btrfs_ordered_update_i_size(struct i + * we can't update the disk_isize if there are delalloc bytes + * between disk_i_size and this ordered extent + */ +- if (test_range_bit(io_tree, disk_i_size, +- ordered->file_offset + ordered->len - 1, ++ if (test_range_bit(io_tree, disk_i_size, offset - 1, + EXTENT_DELALLOC, 0, NULL)) { + goto out; + } +@@ -626,20 +654,32 @@ int btrfs_ordered_update_i_size(struct i + * if we find an ordered extent then we can't update disk i_size + * yet + */ +- node = &ordered->rb_node; +- while (1) { +- node = rb_prev(node); +- if (!node) +- break; ++ if (ordered) { ++ node = rb_prev(&ordered->rb_node); ++ } else { ++ prev = tree_search(tree, offset); ++ /* ++ * we insert file extents without involving ordered struct, ++ * so there should be no ordered struct cover this offset ++ */ ++ if (prev) { ++ test = rb_entry(prev, struct btrfs_ordered_extent, ++ rb_node); ++ BUG_ON(offset_in_entry(test, offset)); ++ } ++ node = prev; ++ } ++ while (node) { + test = rb_entry(node, struct btrfs_ordered_extent, rb_node); + if (test->file_offset + test->len <= disk_i_size) + break; +- if (test->file_offset >= inode->i_size) ++ if (test->file_offset >= i_size) + break; + if (test->file_offset >= disk_i_size) + goto out; ++ node 
= rb_prev(node); + } +- new_i_size = min_t(u64, entry_end(ordered), i_size_read(inode)); ++ new_i_size = min_t(u64, offset, i_size); + + /* + * at this point, we know we can safely update i_size to at least +@@ -647,7 +687,14 @@ int btrfs_ordered_update_i_size(struct i + * walk forward and see if ios from higher up in the file have + * finished. + */ +- node = rb_next(&ordered->rb_node); ++ if (ordered) { ++ node = rb_next(&ordered->rb_node); ++ } else { ++ if (prev) ++ node = rb_next(prev); ++ else ++ node = rb_first(&tree->tree); ++ } + i_size_test = 0; + if (node) { + /* +@@ -655,10 +702,10 @@ int btrfs_ordered_update_i_size(struct i + * between our ordered extent and the next one. + */ + test = rb_entry(node, struct btrfs_ordered_extent, rb_node); +- if (test->file_offset > entry_end(ordered)) ++ if (test->file_offset > offset) + i_size_test = test->file_offset; + } else { +- i_size_test = i_size_read(inode); ++ i_size_test = i_size; + } + + /* +@@ -667,15 +714,25 @@ int btrfs_ordered_update_i_size(struct i + * are no delalloc bytes in this area, it is safe to update + * disk_i_size to the end of the region. + */ +- if (i_size_test > entry_end(ordered) && +- !test_range_bit(io_tree, entry_end(ordered), i_size_test - 1, +- EXTENT_DELALLOC, 0, NULL)) { +- new_i_size = min_t(u64, i_size_test, i_size_read(inode)); ++ if (i_size_test > offset && ++ !test_range_bit(io_tree, offset, i_size_test - 1, ++ EXTENT_DELALLOC, 0, NULL)) { ++ new_i_size = min_t(u64, i_size_test, i_size); + } + BTRFS_I(inode)->disk_i_size = new_i_size; ++ ret = 0; + out: ++ /* ++ * we need to remove the ordered extent with the tree lock held ++ * so that other people calling this function don't find our fully ++ * processed ordered entry and skip updating the i_size ++ */ ++ if (ordered) ++ __btrfs_remove_ordered_extent(inode, ordered); + mutex_unlock(&tree->mutex); +- return 0; ++ if (ordered) ++ wake_up(&ordered->wait); ++ return ret; + } + + /* +--- a/fs/btrfs/ordered-data.h ++++ b/fs/btrfs/ordered-data.h +@@ -150,7 +150,7 @@ void btrfs_start_ordered_extent(struct i + int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len); + struct btrfs_ordered_extent * + btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset); +-int btrfs_ordered_update_i_size(struct inode *inode, ++int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, + struct btrfs_ordered_extent *ordered); + int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum); + int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only); diff --git a/queue-2.6.32/btrfs-fix-memory-leaks-in-error-paths.patch b/queue-2.6.32/btrfs-fix-memory-leaks-in-error-paths.patch new file mode 100644 index 00000000000..0ba1012d87d --- /dev/null +++ b/queue-2.6.32/btrfs-fix-memory-leaks-in-error-paths.patch @@ -0,0 +1,53 @@ +From 2423fdfb96e3f9ff3baeb6c4c78d74145547891d Mon Sep 17 00:00:00 2001 +From: Jiri Slaby +Date: Wed, 6 Jan 2010 16:57:22 +0000 +Subject: Btrfs, fix memory leaks in error paths + +From: Jiri Slaby + +commit 2423fdfb96e3f9ff3baeb6c4c78d74145547891d upstream. + +Stanse found 2 memory leaks in relocate_block_group and +__btrfs_map_block. cluster and multi are not freed/assigned on all +paths. Fix that. 
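
The rule being enforced is the standard one: every error return must free
whatever earlier allocations the caller will otherwise never see again. A
minimal generic sketch of the pattern (hypothetical a/b allocations, not the
btrfs code):

	a = kzalloc(sizeof(*a), GFP_NOFS);
	if (!a)
		return -ENOMEM;

	b = kzalloc(sizeof(*b), GFP_NOFS);
	if (!b) {
		kfree(a);	/* unwind the allocation that succeeded */
		return -ENOMEM;
	}

In relocate_block_group() the leaked object is cluster when
btrfs_alloc_path() fails; in __btrfs_map_block() it is multi when the early
unplug_page return is taken.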
+ +Signed-off-by: Jiri Slaby +Cc: linux-btrfs@vger.kernel.org +Signed-off-by: Chris Mason +Acked-by: Jeff Mahoney +Signed-off-by: Greg Kroah-Hartman + + +--- + fs/btrfs/relocation.c | 4 +++- + fs/btrfs/volumes.c | 4 +++- + 2 files changed, 6 insertions(+), 2 deletions(-) + +--- a/fs/btrfs/relocation.c ++++ b/fs/btrfs/relocation.c +@@ -3281,8 +3281,10 @@ static noinline_for_stack int relocate_b + return -ENOMEM; + + path = btrfs_alloc_path(); +- if (!path) ++ if (!path) { ++ kfree(cluster); + return -ENOMEM; ++ } + + rc->extents_found = 0; + rc->extents_skipped = 0; +--- a/fs/btrfs/volumes.c ++++ b/fs/btrfs/volumes.c +@@ -2649,8 +2649,10 @@ again: + em = lookup_extent_mapping(em_tree, logical, *length); + read_unlock(&em_tree->lock); + +- if (!em && unplug_page) ++ if (!em && unplug_page) { ++ kfree(multi); + return 0; ++ } + + if (!em) { + printk(KERN_CRIT "unable to find logical %llu len %llu\n", diff --git a/queue-2.6.32/btrfs-fix-missing-last-entry-in-readdir-3.patch b/queue-2.6.32/btrfs-fix-missing-last-entry-in-readdir-3.patch new file mode 100644 index 00000000000..49a506bd61a --- /dev/null +++ b/queue-2.6.32/btrfs-fix-missing-last-entry-in-readdir-3.patch @@ -0,0 +1,46 @@ +From 406266ab9ac8ed8b085c58aacd9e3161480dc5d5 Mon Sep 17 00:00:00 2001 +From: Jan Engelhardt +Date: Wed, 9 Dec 2009 22:00:38 +0000 +Subject: btrfs: fix missing last-entry in readdir(3) + +From: Jan Engelhardt + +commit 406266ab9ac8ed8b085c58aacd9e3161480dc5d5 upstream. + +parent 49313cdac7b34c9f7ecbb1780cfc648b1c082cd7 (v2.6.32-1-g49313cd) +commit ff48c08e1c05c67e8348ab6f8a24de8034e0e34d +Author: Jan Engelhardt +Date: Wed Dec 9 22:57:36 2009 +0100 + +Btrfs: fix missing last-entry in readdir(3) + +When one does a 32-bit readdir(3), the last entry of a directory is +missing. This is however not due to passing a large value to filldir, +but it seems to have to do with glibc doing telldir or something +quirky. In any case, this patch fixes it in practice. + +Signed-off-by: Jan Engelhardt +Signed-off-by: Chris Mason +Acked-by: Jeff Mahoney +Signed-off-by: Greg Kroah-Hartman + + +--- + fs/btrfs/inode.c | 6 +++++- + 1 file changed, 5 insertions(+), 1 deletion(-) + +--- a/fs/btrfs/inode.c ++++ b/fs/btrfs/inode.c +@@ -3995,7 +3995,11 @@ skip: + + /* Reached end of directory/root. Bump pos past the last item. */ + if (key_type == BTRFS_DIR_INDEX_KEY) +- filp->f_pos = INT_LIMIT(off_t); ++ /* ++ * 32-bit glibc will use getdents64, but then strtol - ++ * so the last number we can serve is this. ++ */ ++ filp->f_pos = 0x7fffffff; + else + filp->f_pos++; + nopos: diff --git a/queue-2.6.32/btrfs-fix-oopsen-when-dropping-empty-tree.patch b/queue-2.6.32/btrfs-fix-oopsen-when-dropping-empty-tree.patch new file mode 100644 index 00000000000..dc184fcc8f0 --- /dev/null +++ b/queue-2.6.32/btrfs-fix-oopsen-when-dropping-empty-tree.patch @@ -0,0 +1,46 @@ +From 7a7965f83e89f0be506a96769938a721e4e5ae50 Mon Sep 17 00:00:00 2001 +From: Yan, Zheng +Date: Mon, 1 Feb 2010 02:41:17 +0000 +Subject: Btrfs: Fix oopsen when dropping empty tree. + +From: Yan, Zheng + +commit 7a7965f83e89f0be506a96769938a721e4e5ae50 upstream. + +When dropping a empty tree, walk_down_tree() skips checking +extent information for the tree root. This will triggers a +BUG_ON in walk_up_proc(). 
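
For an empty tree the root is a leaf with zero items, so the old loop hit the
slot-bounds check first and bailed out before walk_down_proc() had processed
the root's extent information; walk_up_proc() then tripped its BUG_ON. The fix
reorders the loop so the bounds check only guards the descent; a sketch, per
the hunk below:

	while (level >= 0) {
		/* always process the current node, the tree root included */
		ret = walk_down_proc(trans, root, path, wc, lookup_info);
		if (ret > 0)
			break;

		if (level == 0)
			break;

		/* only the actual descent needs a child slot to exist */
		if (path->slots[level] >=
		    btrfs_header_nritems(path->nodes[level]))
			break;

		ret = do_walk_down(trans, root, path, wc, &lookup_info);
		if (ret > 0)
			path->slots[level]++;
	}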
+ +Signed-off-by: Yan Zheng +Signed-off-by: Chris Mason +Acked-by: Jeff Mahoney +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/extent-tree.c | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + +--- a/fs/btrfs/extent-tree.c ++++ b/fs/btrfs/extent-tree.c +@@ -5402,10 +5402,6 @@ static noinline int walk_down_tree(struc + int ret; + + while (level >= 0) { +- if (path->slots[level] >= +- btrfs_header_nritems(path->nodes[level])) +- break; +- + ret = walk_down_proc(trans, root, path, wc, lookup_info); + if (ret > 0) + break; +@@ -5413,6 +5409,10 @@ static noinline int walk_down_tree(struc + if (level == 0) + break; + ++ if (path->slots[level] >= ++ btrfs_header_nritems(path->nodes[level])) ++ break; ++ + ret = do_walk_down(trans, root, path, wc, &lookup_info); + if (ret > 0) { + path->slots[level]++; diff --git a/queue-2.6.32/btrfs-fix-per-root-used-space-accounting.patch b/queue-2.6.32/btrfs-fix-per-root-used-space-accounting.patch new file mode 100644 index 00000000000..da62b76a3e0 --- /dev/null +++ b/queue-2.6.32/btrfs-fix-per-root-used-space-accounting.patch @@ -0,0 +1,208 @@ +From 86b9f2eca5e0984145e3c7698a7cd6dd65c2a93f Mon Sep 17 00:00:00 2001 +From: Yan, Zheng +Date: Thu, 12 Nov 2009 09:36:50 +0000 +Subject: Btrfs: Fix per root used space accounting + +From: Yan, Zheng + +commit 86b9f2eca5e0984145e3c7698a7cd6dd65c2a93f upstream. + +The bytes_used field in root item was originally planned to +trace the amount of used data and tree blocks. But it never +worked right since we can't trace freeing of data accurately. +This patch changes it to only trace the amount of tree blocks. + +Signed-off-by: Yan Zheng +Signed-off-by: Chris Mason +Acked-by: Jeff Mahoney +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/ctree.c | 31 ++++++++++++++----------------- + fs/btrfs/ctree.h | 4 ++++ + fs/btrfs/extent-tree.c | 31 +++++++++++++++++++++++-------- + fs/btrfs/ioctl.c | 2 +- + fs/btrfs/transaction.c | 6 +++++- + 5 files changed, 47 insertions(+), 27 deletions(-) + +--- a/fs/btrfs/ctree.c ++++ b/fs/btrfs/ctree.c +@@ -456,9 +456,8 @@ static noinline int __btrfs_cow_block(st + extent_buffer_get(cow); + spin_unlock(&root->node_lock); + +- btrfs_free_extent(trans, root, buf->start, buf->len, +- parent_start, root->root_key.objectid, +- level, 0); ++ btrfs_free_tree_block(trans, root, buf->start, buf->len, ++ parent_start, root->root_key.objectid, level); + free_extent_buffer(buf); + add_root_to_dirty_list(root); + } else { +@@ -473,9 +472,8 @@ static noinline int __btrfs_cow_block(st + btrfs_set_node_ptr_generation(parent, parent_slot, + trans->transid); + btrfs_mark_buffer_dirty(parent); +- btrfs_free_extent(trans, root, buf->start, buf->len, +- parent_start, root->root_key.objectid, +- level, 0); ++ btrfs_free_tree_block(trans, root, buf->start, buf->len, ++ parent_start, root->root_key.objectid, level); + } + if (unlock_orig) + btrfs_tree_unlock(buf); +@@ -1035,8 +1033,8 @@ static noinline int balance_level(struct + btrfs_tree_unlock(mid); + /* once for the path */ + free_extent_buffer(mid); +- ret = btrfs_free_extent(trans, root, mid->start, mid->len, +- 0, root->root_key.objectid, level, 1); ++ ret = btrfs_free_tree_block(trans, root, mid->start, mid->len, ++ 0, root->root_key.objectid, level); + /* once for the root ptr */ + free_extent_buffer(mid); + return ret; +@@ -1100,10 +1098,10 @@ static noinline int balance_level(struct + 1); + if (wret) + ret = wret; +- wret = btrfs_free_extent(trans, root, bytenr, +- blocksize, 0, +- root->root_key.objectid, +- level, 0); ++ wret = 
btrfs_free_tree_block(trans, root, ++ bytenr, blocksize, 0, ++ root->root_key.objectid, ++ level); + if (wret) + ret = wret; + } else { +@@ -1148,9 +1146,8 @@ static noinline int balance_level(struct + wret = del_ptr(trans, root, path, level + 1, pslot); + if (wret) + ret = wret; +- wret = btrfs_free_extent(trans, root, bytenr, blocksize, +- 0, root->root_key.objectid, +- level, 0); ++ wret = btrfs_free_tree_block(trans, root, bytenr, blocksize, ++ 0, root->root_key.objectid, level); + if (wret) + ret = wret; + } else { +@@ -3794,8 +3791,8 @@ static noinline int btrfs_del_leaf(struc + */ + btrfs_unlock_up_safe(path, 0); + +- ret = btrfs_free_extent(trans, root, leaf->start, leaf->len, +- 0, root->root_key.objectid, 0, 0); ++ ret = btrfs_free_tree_block(trans, root, leaf->start, leaf->len, ++ 0, root->root_key.objectid, 0); + return ret; + } + /* +--- a/fs/btrfs/ctree.h ++++ b/fs/btrfs/ctree.h +@@ -1982,6 +1982,10 @@ struct extent_buffer *btrfs_alloc_free_b + u64 parent, u64 root_objectid, + struct btrfs_disk_key *key, int level, + u64 hint, u64 empty_size); ++int btrfs_free_tree_block(struct btrfs_trans_handle *trans, ++ struct btrfs_root *root, ++ u64 bytenr, u32 blocksize, ++ u64 parent, u64 root_objectid, int level); + struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u32 blocksize, +--- a/fs/btrfs/extent-tree.c ++++ b/fs/btrfs/extent-tree.c +@@ -3454,14 +3454,6 @@ static int update_block_group(struct btr + else + old_val -= num_bytes; + btrfs_set_super_bytes_used(&info->super_copy, old_val); +- +- /* block accounting for root item */ +- old_val = btrfs_root_used(&root->root_item); +- if (alloc) +- old_val += num_bytes; +- else +- old_val -= num_bytes; +- btrfs_set_root_used(&root->root_item, old_val); + spin_unlock(&info->delalloc_lock); + + while (total) { +@@ -4049,6 +4041,21 @@ int btrfs_free_extent(struct btrfs_trans + return ret; + } + ++int btrfs_free_tree_block(struct btrfs_trans_handle *trans, ++ struct btrfs_root *root, ++ u64 bytenr, u32 blocksize, ++ u64 parent, u64 root_objectid, int level) ++{ ++ u64 used; ++ spin_lock(&root->node_lock); ++ used = btrfs_root_used(&root->root_item) - blocksize; ++ btrfs_set_root_used(&root->root_item, used); ++ spin_unlock(&root->node_lock); ++ ++ return btrfs_free_extent(trans, root, bytenr, blocksize, ++ parent, root_objectid, level, 0); ++} ++ + static u64 stripe_align(struct btrfs_root *root, u64 val) + { + u64 mask = ((u64)root->stripesize - 1); +@@ -4897,6 +4904,14 @@ static int alloc_tree_block(struct btrfs + extent_op); + BUG_ON(ret); + } ++ ++ if (root_objectid == root->root_key.objectid) { ++ u64 used; ++ spin_lock(&root->node_lock); ++ used = btrfs_root_used(&root->root_item) + num_bytes; ++ btrfs_set_root_used(&root->root_item, used); ++ spin_unlock(&root->node_lock); ++ } + return ret; + } + +--- a/fs/btrfs/ioctl.c ++++ b/fs/btrfs/ioctl.c +@@ -289,7 +289,7 @@ static noinline int create_subvol(struct + btrfs_set_root_generation(&root_item, trans->transid); + btrfs_set_root_level(&root_item, 0); + btrfs_set_root_refs(&root_item, 1); +- btrfs_set_root_used(&root_item, 0); ++ btrfs_set_root_used(&root_item, leaf->len); + btrfs_set_root_last_snapshot(&root_item, 0); + + memset(&root_item.drop_progress, 0, sizeof(root_item.drop_progress)); +--- a/fs/btrfs/transaction.c ++++ b/fs/btrfs/transaction.c +@@ -501,13 +501,16 @@ static int update_cowonly_root(struct bt + { + int ret; + u64 old_root_bytenr; ++ u64 old_root_used; + struct btrfs_root *tree_root = 
root->fs_info->tree_root; + ++ old_root_used = btrfs_root_used(&root->root_item); + btrfs_write_dirty_block_groups(trans, root); + + while (1) { + old_root_bytenr = btrfs_root_bytenr(&root->root_item); +- if (old_root_bytenr == root->node->start) ++ if (old_root_bytenr == root->node->start && ++ old_root_used == btrfs_root_used(&root->root_item)) + break; + + btrfs_set_root_node(&root->root_item, root->node); +@@ -516,6 +519,7 @@ static int update_cowonly_root(struct bt + &root->root_item); + BUG_ON(ret); + ++ old_root_used = btrfs_root_used(&root->root_item); + ret = btrfs_write_dirty_block_groups(trans, root); + BUG_ON(ret); + } diff --git a/queue-2.6.32/btrfs-fix-possible-panic-on-unmount.patch b/queue-2.6.32/btrfs-fix-possible-panic-on-unmount.patch new file mode 100644 index 00000000000..0fbc3d9f134 --- /dev/null +++ b/queue-2.6.32/btrfs-fix-possible-panic-on-unmount.patch @@ -0,0 +1,123 @@ +From 11dfe35a0108097f2df1f042c485fa7f758c2cdf Mon Sep 17 00:00:00 2001 +From: Josef Bacik +Date: Fri, 13 Nov 2009 20:12:59 +0000 +Subject: Btrfs: fix possible panic on unmount + +From: Josef Bacik + +commit 11dfe35a0108097f2df1f042c485fa7f758c2cdf upstream. + +We can race with the unmount of an fs and the stopping of a kthread where we +will free the block group before we're done using it. The reason for this is +because we do not hold a reference on the block group while its caching, since +the allocator drops its reference once it exits or moves on to the next block +group. This patch fixes the problem by taking a reference to the block group +before we start caching and dropping it when we're done to make sure all +accesses to the block group are safe. Thanks, + +Signed-off-by: Josef Bacik +Signed-off-by: Chris Mason +Acked-by: Jeff Mahoney +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/extent-tree.c | 32 +++++++++++++++++++------------- + 1 file changed, 19 insertions(+), 13 deletions(-) + +--- a/fs/btrfs/extent-tree.c ++++ b/fs/btrfs/extent-tree.c +@@ -83,6 +83,17 @@ static int block_group_bits(struct btrfs + return (cache->flags & bits) == bits; + } + ++void btrfs_get_block_group(struct btrfs_block_group_cache *cache) ++{ ++ atomic_inc(&cache->count); ++} ++ ++void btrfs_put_block_group(struct btrfs_block_group_cache *cache) ++{ ++ if (atomic_dec_and_test(&cache->count)) ++ kfree(cache); ++} ++ + /* + * this adds the block group to the fs_info rb tree for the block group + * cache +@@ -156,7 +167,7 @@ block_group_cache_tree_search(struct btr + } + } + if (ret) +- atomic_inc(&ret->count); ++ btrfs_get_block_group(ret); + spin_unlock(&info->block_group_cache_lock); + + return ret; +@@ -407,6 +418,8 @@ err: + + put_caching_control(caching_ctl); + atomic_dec(&block_group->space_info->caching_threads); ++ btrfs_put_block_group(block_group); ++ + return 0; + } + +@@ -447,6 +460,7 @@ static int cache_block_group(struct btrf + up_write(&fs_info->extent_commit_sem); + + atomic_inc(&cache->space_info->caching_threads); ++ btrfs_get_block_group(cache); + + tsk = kthread_run(caching_kthread, cache, "btrfs-cache-%llu\n", + cache->key.objectid); +@@ -486,12 +500,6 @@ struct btrfs_block_group_cache *btrfs_lo + return cache; + } + +-void btrfs_put_block_group(struct btrfs_block_group_cache *cache) +-{ +- if (atomic_dec_and_test(&cache->count)) +- kfree(cache); +-} +- + static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info, + u64 flags) + { +@@ -2582,7 +2590,7 @@ next_block_group(struct btrfs_root *root + if (node) { + cache = rb_entry(node, struct btrfs_block_group_cache, + 
cache_node); +- atomic_inc(&cache->count); ++ btrfs_get_block_group(cache); + } else + cache = NULL; + spin_unlock(&root->fs_info->block_group_cache_lock); +@@ -4227,7 +4235,7 @@ search: + u64 offset; + int cached; + +- atomic_inc(&block_group->count); ++ btrfs_get_block_group(block_group); + search_start = block_group->key.objectid; + + have_block_group: +@@ -4315,7 +4323,7 @@ have_block_group: + + btrfs_put_block_group(block_group); + block_group = last_ptr->block_group; +- atomic_inc(&block_group->count); ++ btrfs_get_block_group(block_group); + spin_unlock(&last_ptr->lock); + spin_unlock(&last_ptr->refill_lock); + +@@ -7395,9 +7403,7 @@ int btrfs_free_block_groups(struct btrfs + wait_block_group_cache_done(block_group); + + btrfs_remove_free_space_cache(block_group); +- +- WARN_ON(atomic_read(&block_group->count) != 1); +- kfree(block_group); ++ btrfs_put_block_group(block_group); + + spin_lock(&info->block_group_cache_lock); + } diff --git a/queue-2.6.32/btrfs-fix-race-between-allocate-and-release-extent-buffer.patch b/queue-2.6.32/btrfs-fix-race-between-allocate-and-release-extent-buffer.patch new file mode 100644 index 00000000000..da43bdf04a0 --- /dev/null +++ b/queue-2.6.32/btrfs-fix-race-between-allocate-and-release-extent-buffer.patch @@ -0,0 +1,36 @@ +From f044ba7835b84e69c68b620ca8fa27e5ef67759d Mon Sep 17 00:00:00 2001 +From: Yan, Zheng +Date: Thu, 4 Feb 2010 08:46:56 +0000 +Subject: Btrfs: fix race between allocate and release extent buffer. + +From: Yan, Zheng + +commit f044ba7835b84e69c68b620ca8fa27e5ef67759d upstream. + +Increase extent buffer's reference count while holding the lock. +Otherwise it can race with try_release_extent_buffer. + +Signed-off-by: Yan Zheng +Signed-off-by: Chris Mason +Acked-by: Jeff Mahoney +Signed-off-by: Greg Kroah-Hartman + + +--- + fs/btrfs/extent_io.c | 3 +-- + 1 file changed, 1 insertion(+), 2 deletions(-) + +--- a/fs/btrfs/extent_io.c ++++ b/fs/btrfs/extent_io.c +@@ -3165,10 +3165,9 @@ struct extent_buffer *alloc_extent_buffe + spin_unlock(&tree->buffer_lock); + goto free_eb; + } +- spin_unlock(&tree->buffer_lock); +- + /* add one reference for the tree */ + atomic_inc(&eb->refs); ++ spin_unlock(&tree->buffer_lock); + return eb; + + free_eb: diff --git a/queue-2.6.32/btrfs-fix-race-in-btrfs_mark_extent_written.patch b/queue-2.6.32/btrfs-fix-race-in-btrfs_mark_extent_written.patch new file mode 100644 index 00000000000..9705b0bcca8 --- /dev/null +++ b/queue-2.6.32/btrfs-fix-race-in-btrfs_mark_extent_written.patch @@ -0,0 +1,195 @@ +From 6c7d54ac87f338c479d9729e8392eca3f76e11e1 Mon Sep 17 00:00:00 2001 +From: Yan, Zheng +Date: Fri, 15 Jan 2010 08:43:09 +0000 +Subject: Btrfs: Fix race in btrfs_mark_extent_written + +From: Yan, Zheng + +commit 6c7d54ac87f338c479d9729e8392eca3f76e11e1 upstream. + +Fix bug reported by Johannes Hirte. The reason of that bug +is btrfs_del_items is called after btrfs_duplicate_item and +btrfs_del_items triggers tree balance. The fix is check that +case and call btrfs_search_slot when needed. 
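
The shape of the fix: a recow flag is set whenever the extent had to be split
(btrfs_duplicate_item() plus btrfs_del_items() can rebalance the leaf), and
the merge paths refuse to trust their cached slots once it is set; a sketch,
per the hunk below:

	if (extent_mergeable(leaf, path->slots[0] + 1,
			     inode->i_ino, bytenr, orig_offset,
			     &other_start, &other_end)) {
		if (recow) {
			/* the leaf may have been rebalanced since the
			 * search; drop the path and search again */
			btrfs_release_path(root, path);
			goto again;
		}
		/* ... safe to merge with the neighbouring extent ... */
	}

Note that extent_mergeable() also gains an orig_offset argument, so only
extents carved from the same original allocation are considered mergeable.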
+ +Signed-off-by: Yan Zheng +Signed-off-by: Chris Mason +Acked-by: Jeff Mahoney +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/file.c | 100 ++++++++++++++++++++++++++++++++++++++++++++------------ + 1 file changed, 80 insertions(+), 20 deletions(-) + +--- a/fs/btrfs/file.c ++++ b/fs/btrfs/file.c +@@ -506,7 +506,8 @@ next_slot: + } + + static int extent_mergeable(struct extent_buffer *leaf, int slot, +- u64 objectid, u64 bytenr, u64 *start, u64 *end) ++ u64 objectid, u64 bytenr, u64 orig_offset, ++ u64 *start, u64 *end) + { + struct btrfs_file_extent_item *fi; + struct btrfs_key key; +@@ -522,6 +523,7 @@ static int extent_mergeable(struct exten + fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); + if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG || + btrfs_file_extent_disk_bytenr(leaf, fi) != bytenr || ++ btrfs_file_extent_offset(leaf, fi) != key.offset - orig_offset || + btrfs_file_extent_compression(leaf, fi) || + btrfs_file_extent_encryption(leaf, fi) || + btrfs_file_extent_other_encoding(leaf, fi)) +@@ -561,6 +563,7 @@ int btrfs_mark_extent_written(struct btr + u64 split; + int del_nr = 0; + int del_slot = 0; ++ int recow; + int ret; + + btrfs_drop_extent_cache(inode, start, end - 1, 0); +@@ -568,6 +571,7 @@ int btrfs_mark_extent_written(struct btr + path = btrfs_alloc_path(); + BUG_ON(!path); + again: ++ recow = 0; + split = start; + key.objectid = inode->i_ino; + key.type = BTRFS_EXTENT_DATA_KEY; +@@ -591,12 +595,60 @@ again: + bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); + num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); + orig_offset = key.offset - btrfs_file_extent_offset(leaf, fi); ++ memcpy(&new_key, &key, sizeof(new_key)); ++ ++ if (start == key.offset && end < extent_end) { ++ other_start = 0; ++ other_end = start; ++ if (extent_mergeable(leaf, path->slots[0] - 1, ++ inode->i_ino, bytenr, orig_offset, ++ &other_start, &other_end)) { ++ new_key.offset = end; ++ btrfs_set_item_key_safe(trans, root, path, &new_key); ++ fi = btrfs_item_ptr(leaf, path->slots[0], ++ struct btrfs_file_extent_item); ++ btrfs_set_file_extent_num_bytes(leaf, fi, ++ extent_end - end); ++ btrfs_set_file_extent_offset(leaf, fi, ++ end - orig_offset); ++ fi = btrfs_item_ptr(leaf, path->slots[0] - 1, ++ struct btrfs_file_extent_item); ++ btrfs_set_file_extent_num_bytes(leaf, fi, ++ end - other_start); ++ btrfs_mark_buffer_dirty(leaf); ++ goto out; ++ } ++ } ++ ++ if (start > key.offset && end == extent_end) { ++ other_start = end; ++ other_end = 0; ++ if (extent_mergeable(leaf, path->slots[0] + 1, ++ inode->i_ino, bytenr, orig_offset, ++ &other_start, &other_end)) { ++ fi = btrfs_item_ptr(leaf, path->slots[0], ++ struct btrfs_file_extent_item); ++ btrfs_set_file_extent_num_bytes(leaf, fi, ++ start - key.offset); ++ path->slots[0]++; ++ new_key.offset = start; ++ btrfs_set_item_key_safe(trans, root, path, &new_key); ++ ++ fi = btrfs_item_ptr(leaf, path->slots[0], ++ struct btrfs_file_extent_item); ++ btrfs_set_file_extent_num_bytes(leaf, fi, ++ other_end - start); ++ btrfs_set_file_extent_offset(leaf, fi, ++ start - orig_offset); ++ btrfs_mark_buffer_dirty(leaf); ++ goto out; ++ } ++ } + + while (start > key.offset || end < extent_end) { + if (key.offset == start) + split = end; + +- memcpy(&new_key, &key, sizeof(new_key)); + new_key.offset = split; + ret = btrfs_duplicate_item(trans, root, path, &new_key); + if (ret == -EAGAIN) { +@@ -631,15 +683,18 @@ again: + path->slots[0]--; + extent_end = end; + } ++ recow = 1; + } + +- fi = btrfs_item_ptr(leaf, 
path->slots[0], +- struct btrfs_file_extent_item); +- + other_start = end; + other_end = 0; +- if (extent_mergeable(leaf, path->slots[0] + 1, inode->i_ino, +- bytenr, &other_start, &other_end)) { ++ if (extent_mergeable(leaf, path->slots[0] + 1, ++ inode->i_ino, bytenr, orig_offset, ++ &other_start, &other_end)) { ++ if (recow) { ++ btrfs_release_path(root, path); ++ goto again; ++ } + extent_end = other_end; + del_slot = path->slots[0] + 1; + del_nr++; +@@ -650,8 +705,13 @@ again: + } + other_start = 0; + other_end = start; +- if (extent_mergeable(leaf, path->slots[0] - 1, inode->i_ino, +- bytenr, &other_start, &other_end)) { ++ if (extent_mergeable(leaf, path->slots[0] - 1, ++ inode->i_ino, bytenr, orig_offset, ++ &other_start, &other_end)) { ++ if (recow) { ++ btrfs_release_path(root, path); ++ goto again; ++ } + key.offset = other_start; + del_slot = path->slots[0]; + del_nr++; +@@ -660,22 +720,22 @@ again: + inode->i_ino, orig_offset); + BUG_ON(ret); + } ++ fi = btrfs_item_ptr(leaf, path->slots[0], ++ struct btrfs_file_extent_item); + if (del_nr == 0) { + btrfs_set_file_extent_type(leaf, fi, + BTRFS_FILE_EXTENT_REG); + btrfs_mark_buffer_dirty(leaf); +- goto out; +- } +- +- fi = btrfs_item_ptr(leaf, del_slot - 1, +- struct btrfs_file_extent_item); +- btrfs_set_file_extent_type(leaf, fi, BTRFS_FILE_EXTENT_REG); +- btrfs_set_file_extent_num_bytes(leaf, fi, +- extent_end - key.offset); +- btrfs_mark_buffer_dirty(leaf); ++ } else { ++ btrfs_set_file_extent_type(leaf, fi, ++ BTRFS_FILE_EXTENT_REG); ++ btrfs_set_file_extent_num_bytes(leaf, fi, ++ extent_end - key.offset); ++ btrfs_mark_buffer_dirty(leaf); + +- ret = btrfs_del_items(trans, root, path, del_slot, del_nr); +- BUG_ON(ret); ++ ret = btrfs_del_items(trans, root, path, del_slot, del_nr); ++ BUG_ON(ret); ++ } + out: + btrfs_free_path(path); + return 0; diff --git a/queue-2.6.32/btrfs-fix-regression-in-orphan-cleanup.patch b/queue-2.6.32/btrfs-fix-regression-in-orphan-cleanup.patch new file mode 100644 index 00000000000..a7fef3722ee --- /dev/null +++ b/queue-2.6.32/btrfs-fix-regression-in-orphan-cleanup.patch @@ -0,0 +1,95 @@ +From 6c090a11e1c403b727a6a8eff0b97d5fb9e95cb5 Mon Sep 17 00:00:00 2001 +From: Josef Bacik +Date: Fri, 15 Jan 2010 20:08:22 +0000 +Subject: Btrfs: fix regression in orphan cleanup + +From: Josef Bacik + +commit 6c090a11e1c403b727a6a8eff0b97d5fb9e95cb5 upstream. + +Currently orphan cleanup only ever gets triggered if we cross subvolumes during +a lookup, which means that if we just mount a plain jane fs that has orphans in +it, they will never get cleaned up. This results in panic's like these + +http://www.kerneloops.org/oops.php?number=1109085 + +where adding an orphan entry results in -EEXIST being returned and we panic. In +order to fix this, we check to see on lookup if our root has had the orphan +cleanup done, and if not go ahead and do it. 
This is easily reproduceable by +running this testcase + +#include +#include +#include +#include +#include +#include + +int main(int argc, char **argv) +{ + char data[4096]; + char newdata[4096]; + int fd1, fd2; + + memset(data, 'a', 4096); + memset(newdata, 'b', 4096); + + while (1) { + int i; + + fd1 = creat("file1", 0666); + if (fd1 < 0) + break; + + for (i = 0; i < 512; i++) + write(fd1, data, 4096); + + fsync(fd1); + close(fd1); + + fd2 = creat("file2", 0666); + if (fd2 < 0) + break; + + ftruncate(fd2, 4096 * 512); + + for (i = 0; i < 512; i++) + write(fd2, newdata, 4096); + close(fd2); + + i = rename("file2", "file1"); + unlink("file1"); + } + + return 0; +} + +and then pulling the power on the box, and then trying to run that test again +when the box comes back up. I've tested this locally and it fixes the problem. +Thanks to Tomas Carnecky for helping me track this down initially. + +Signed-off-by: Josef Bacik +Signed-off-by: Chris Mason +Acked-by: Jeff Mahoney +Signed-off-by: Greg Kroah-Hartman + + +--- + fs/btrfs/inode.c | 6 ++++++ + 1 file changed, 6 insertions(+) + +--- a/fs/btrfs/inode.c ++++ b/fs/btrfs/inode.c +@@ -3796,6 +3796,12 @@ struct inode *btrfs_lookup_dentry(struct + + if (location.type == BTRFS_INODE_ITEM_KEY) { + inode = btrfs_iget(dir->i_sb, &location, root); ++ if (unlikely(root->clean_orphans) && ++ !(inode->i_sb->s_flags & MS_RDONLY)) { ++ down_read(&root->fs_info->cleanup_work_sem); ++ btrfs_orphan_cleanup(root); ++ up_read(&root->fs_info->cleanup_work_sem); ++ } + return inode; + } + diff --git a/queue-2.6.32/btrfs-kfree-correct-pointer-during-mount-option-parsing.patch b/queue-2.6.32/btrfs-kfree-correct-pointer-during-mount-option-parsing.patch new file mode 100644 index 00000000000..f02572bc4f3 --- /dev/null +++ b/queue-2.6.32/btrfs-kfree-correct-pointer-during-mount-option-parsing.patch @@ -0,0 +1,50 @@ +From da495ecc0fb096b383754952a1c152147bc95b52 Mon Sep 17 00:00:00 2001 +From: Josef Bacik +Date: Thu, 25 Feb 2010 20:38:35 +0000 +Subject: Btrfs: kfree correct pointer during mount option parsing + +From: Josef Bacik + +commit da495ecc0fb096b383754952a1c152147bc95b52 upstream. + +We kstrdup the options string, but then strsep screws with the pointer, +so when we kfree() it, we're not giving it the right pointer. 
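
The same trap is easy to demonstrate in ordinary userspace C: strsep()
advances the cursor it is handed, so free() must be given the pointer the
allocation returned, never the cursor. A minimal standalone illustration (not
the btrfs code; the mount string is made up):

	#include <stdlib.h>
	#include <string.h>

	int main(void)
	{
		char *orig = strdup("noatime,compress,ssd");
		char *options = orig;	/* cursor strsep() will advance */
		char *p;

		while ((p = strsep(&options, ",")) != NULL)
			;	/* parse token p */

		free(orig);	/* freeing 'options' here would be wrong:
				 * strsep() has left it NULL */
		return 0;
	}

The patch applies exactly this discipline by remembering the kstrdup() result
in orig and passing that to kfree().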
+ +Tested-by: Andy Lutomirski + +Signed-off-by: Chris Mason +Acked-by: Jeff Mahoney +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/super.c | 5 +++-- + 1 file changed, 3 insertions(+), 2 deletions(-) + +--- a/fs/btrfs/super.c ++++ b/fs/btrfs/super.c +@@ -126,7 +126,7 @@ int btrfs_parse_options(struct btrfs_roo + { + struct btrfs_fs_info *info = root->fs_info; + substring_t args[MAX_OPT_ARGS]; +- char *p, *num; ++ char *p, *num, *orig; + int intarg; + int ret = 0; + +@@ -141,6 +141,7 @@ int btrfs_parse_options(struct btrfs_roo + if (!options) + return -ENOMEM; + ++ orig = options; + + while ((p = strsep(&options, ",")) != NULL) { + int token; +@@ -273,7 +274,7 @@ int btrfs_parse_options(struct btrfs_roo + } + } + out: +- kfree(options); ++ kfree(orig); + return ret; + } + diff --git a/queue-2.6.32/btrfs-make-error-return-negative-in-btrfs_sync_file.patch b/queue-2.6.32/btrfs-make-error-return-negative-in-btrfs_sync_file.patch new file mode 100644 index 00000000000..5b7f0c0afa4 --- /dev/null +++ b/queue-2.6.32/btrfs-make-error-return-negative-in-btrfs_sync_file.patch @@ -0,0 +1,31 @@ +From 014e4ac4f7d9c981750491fa40ea35efadc9ed49 Mon Sep 17 00:00:00 2001 +From: Roel Kluin +Date: Fri, 29 Jan 2010 10:42:11 +0000 +Subject: Btrfs: make error return negative in btrfs_sync_file() + +From: Roel Kluin + +commit 014e4ac4f7d9c981750491fa40ea35efadc9ed49 upstream. + +It appears the error return should be negative + +Signed-off-by: Roel Kluin +Signed-off-by: Chris Mason +Acked-by: Jeff Mahoney +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/file.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/fs/btrfs/file.c ++++ b/fs/btrfs/file.c +@@ -1133,7 +1133,7 @@ int btrfs_sync_file(struct file *file, s + } + mutex_lock(&dentry->d_inode->i_mutex); + out: +- return ret > 0 ? EIO : ret; ++ return ret > 0 ? -EIO : ret; + } + + static const struct vm_operations_struct btrfs_file_vm_ops = { diff --git a/queue-2.6.32/btrfs-make-fallocate-2-more-enospc-friendly.patch b/queue-2.6.32/btrfs-make-fallocate-2-more-enospc-friendly.patch new file mode 100644 index 00000000000..ad8c47c6bbb --- /dev/null +++ b/queue-2.6.32/btrfs-make-fallocate-2-more-enospc-friendly.patch @@ -0,0 +1,170 @@ +From 5a303d5d4b8055d2e5a03e92d04745bfc5881a22 Mon Sep 17 00:00:00 2001 +From: Yan, Zheng +Date: Thu, 12 Nov 2009 09:34:52 +0000 +Subject: Btrfs: Make fallocate(2) more ENOSPC friendly + +From: Yan, Zheng + +commit 5a303d5d4b8055d2e5a03e92d04745bfc5881a22 upstream. + +fallocate(2) may allocate large number of file extents, so it's not +good to do it in a single transaction. This patch make fallocate(2) +start a new transaction for each file extents it allocates. 
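
The resulting loop commits one short-lived transaction per allocated extent
instead of pinning a single transaction across the whole preallocation, which
is what made large fallocate(2) calls so unfriendly to ENOSPC accounting;
schematically (per the inode.c hunk below):

	while (num_bytes > 0) {
		/* reserve space for one extent, then hold a transaction
		 * only long enough to insert it and update the inode */
		trans = btrfs_start_transaction(root, 1);
		/* ... btrfs_reserve_extent() + insert_reserved_file_extent() ... */
		btrfs_end_transaction(trans, root);
	}

A follow-up in this same series ("make sure fallocate properly starts a
transaction") corrects where in this loop the transaction is actually started
relative to the extent reservation.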
+ +Signed-off-by: Yan Zheng +Signed-off-by: Chris Mason +Acked-by: Jeff Mahoney +Signed-off-by: Greg Kroah-Hartman + + +--- + fs/btrfs/inode.c | 65 +++++++++++++++++++++++++++---------------------------- + 1 file changed, 32 insertions(+), 33 deletions(-) + +--- a/fs/btrfs/inode.c ++++ b/fs/btrfs/inode.c +@@ -5664,10 +5664,10 @@ out_fail: + return err; + } + +-static int prealloc_file_range(struct btrfs_trans_handle *trans, +- struct inode *inode, u64 start, u64 end, ++static int prealloc_file_range(struct inode *inode, u64 start, u64 end, + u64 alloc_hint, int mode) + { ++ struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_key ins; + u64 alloc_size; +@@ -5678,17 +5678,23 @@ static int prealloc_file_range(struct bt + while (num_bytes > 0) { + alloc_size = min(num_bytes, root->fs_info->max_extent); + +- ret = btrfs_reserve_metadata_space(root, 1); +- if (ret) +- goto out; +- + ret = btrfs_reserve_extent(trans, root, alloc_size, + root->sectorsize, 0, alloc_hint, + (u64)-1, &ins, 1); + if (ret) { + WARN_ON(1); +- goto out; ++ break; + } ++ ++ ret = btrfs_reserve_metadata_space(root, 3); ++ if (ret) { ++ btrfs_free_reserved_extent(root, ins.objectid, ++ ins.offset); ++ break; ++ } ++ ++ trans = btrfs_start_transaction(root, 1); ++ + ret = insert_reserved_file_extent(trans, inode, + cur_offset, ins.objectid, + ins.offset, ins.offset, +@@ -5697,22 +5703,25 @@ static int prealloc_file_range(struct bt + BUG_ON(ret); + btrfs_drop_extent_cache(inode, cur_offset, + cur_offset + ins.offset -1, 0); ++ + num_bytes -= ins.offset; + cur_offset += ins.offset; + alloc_hint = ins.objectid + ins.offset; +- btrfs_unreserve_metadata_space(root, 1); +- } +-out: +- if (cur_offset > start) { ++ + inode->i_ctime = CURRENT_TIME; + BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC; + if (!(mode & FALLOC_FL_KEEP_SIZE) && +- cur_offset > i_size_read(inode)) +- btrfs_i_size_write(inode, cur_offset); ++ cur_offset > inode->i_size) { ++ i_size_write(inode, cur_offset); ++ btrfs_ordered_update_i_size(inode, cur_offset, NULL); ++ } ++ + ret = btrfs_update_inode(trans, root, inode); + BUG_ON(ret); +- } + ++ btrfs_end_transaction(trans, root); ++ btrfs_unreserve_metadata_space(root, 3); ++ } + return ret; + } + +@@ -5727,8 +5736,6 @@ static long btrfs_fallocate(struct inode + u64 locked_end; + u64 mask = BTRFS_I(inode)->root->sectorsize - 1; + struct extent_map *em; +- struct btrfs_trans_handle *trans; +- struct btrfs_root *root; + int ret; + + alloc_start = offset & ~mask; +@@ -5747,9 +5754,7 @@ static long btrfs_fallocate(struct inode + goto out; + } + +- root = BTRFS_I(inode)->root; +- +- ret = btrfs_check_data_free_space(root, inode, ++ ret = btrfs_check_data_free_space(BTRFS_I(inode)->root, inode, + alloc_end - alloc_start); + if (ret) + goto out; +@@ -5758,12 +5763,6 @@ static long btrfs_fallocate(struct inode + while (1) { + struct btrfs_ordered_extent *ordered; + +- trans = btrfs_start_transaction(BTRFS_I(inode)->root, 1); +- if (!trans) { +- ret = -EIO; +- goto out_free; +- } +- + /* the extent lock is ordered inside the running + * transaction + */ +@@ -5777,8 +5776,6 @@ static long btrfs_fallocate(struct inode + btrfs_put_ordered_extent(ordered); + unlock_extent(&BTRFS_I(inode)->io_tree, + alloc_start, locked_end, GFP_NOFS); +- btrfs_end_transaction(trans, BTRFS_I(inode)->root); +- + /* + * we can't wait on the range with the transaction + * running or with the extent lock held +@@ -5799,9 +5796,12 @@ static long btrfs_fallocate(struct inode + BUG_ON(IS_ERR(em) || !em); 
+ last_byte = min(extent_map_end(em), alloc_end); + last_byte = (last_byte + mask) & ~mask; +- if (em->block_start == EXTENT_MAP_HOLE) { +- ret = prealloc_file_range(trans, inode, cur_offset, +- last_byte, alloc_hint, mode); ++ if (em->block_start == EXTENT_MAP_HOLE || ++ (cur_offset >= inode->i_size && ++ !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { ++ ret = prealloc_file_range(inode, ++ cur_offset, last_byte, ++ alloc_hint, mode); + if (ret < 0) { + free_extent_map(em); + break; +@@ -5820,9 +5820,8 @@ static long btrfs_fallocate(struct inode + unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, + GFP_NOFS); + +- btrfs_end_transaction(trans, BTRFS_I(inode)->root); +-out_free: +- btrfs_free_reserved_data_space(root, inode, alloc_end - alloc_start); ++ btrfs_free_reserved_data_space(BTRFS_I(inode)->root, inode, ++ alloc_end - alloc_start); + out: + mutex_unlock(&inode->i_mutex); + return ret; diff --git a/queue-2.6.32/btrfs-make-metadata-chunks-smaller.patch b/queue-2.6.32/btrfs-make-metadata-chunks-smaller.patch new file mode 100644 index 00000000000..5bc3768f4f5 --- /dev/null +++ b/queue-2.6.32/btrfs-make-metadata-chunks-smaller.patch @@ -0,0 +1,67 @@ +From 83d3c9696fed237a3d96fce18299e2fcf112109f Mon Sep 17 00:00:00 2001 +From: Josef Bacik +Date: Mon, 7 Dec 2009 21:45:59 +0000 +Subject: Btrfs: make metadata chunks smaller + +From: Josef Bacik + +commit 83d3c9696fed237a3d96fce18299e2fcf112109f upstream. + +This patch makes us a bit less zealous about making sure we have enough free +metadata space by pearing down the size of new metadata chunks to 256mb instead +of 1gb. Also, we used to try an allocate metadata chunks when allocating data, +but that sort of thing is done elsewhere now so we can just remove it. With my +-ENOSPC test I used to have 3gb reserved for metadata out of 75gb, now I have +1.7gb. 
Thanks, + +Signed-off-by: Josef Bacik +Signed-off-by: Chris Mason +Acked-by: Jeff Mahoney +Signed-off-by: Greg Kroah-Hartman + + +--- + fs/btrfs/extent-tree.c | 11 +---------- + fs/btrfs/volumes.c | 2 +- + 2 files changed, 2 insertions(+), 11 deletions(-) + +--- a/fs/btrfs/extent-tree.c ++++ b/fs/btrfs/extent-tree.c +@@ -4593,7 +4593,6 @@ int btrfs_reserve_extent(struct btrfs_tr + { + int ret; + u64 search_start = 0; +- struct btrfs_fs_info *info = root->fs_info; + + data = btrfs_get_alloc_profile(root, data); + again: +@@ -4601,17 +4600,9 @@ again: + * the only place that sets empty_size is btrfs_realloc_node, which + * is not called recursively on allocations + */ +- if (empty_size || root->ref_cows) { +- if (!(data & BTRFS_BLOCK_GROUP_METADATA)) { +- ret = do_chunk_alloc(trans, root->fs_info->extent_root, +- 2 * 1024 * 1024, +- BTRFS_BLOCK_GROUP_METADATA | +- (info->metadata_alloc_profile & +- info->avail_metadata_alloc_bits), 0); +- } ++ if (empty_size || root->ref_cows) + ret = do_chunk_alloc(trans, root->fs_info->extent_root, + num_bytes + 2 * 1024 * 1024, data, 0); +- } + + WARN_ON(num_bytes < root->sectorsize); + ret = find_free_extent(trans, root, num_bytes, empty_size, +--- a/fs/btrfs/volumes.c ++++ b/fs/btrfs/volumes.c +@@ -2209,7 +2209,7 @@ static int __btrfs_alloc_chunk(struct bt + max_chunk_size = 10 * calc_size; + min_stripe_size = 64 * 1024 * 1024; + } else if (type & BTRFS_BLOCK_GROUP_METADATA) { +- max_chunk_size = 4 * calc_size; ++ max_chunk_size = 256 * 1024 * 1024; + min_stripe_size = 32 * 1024 * 1024; + } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { + calc_size = 8 * 1024 * 1024; diff --git a/queue-2.6.32/btrfs-make-sure-fallocate-properly-starts-a-transaction.patch b/queue-2.6.32/btrfs-make-sure-fallocate-properly-starts-a-transaction.patch new file mode 100644 index 00000000000..130476dbced --- /dev/null +++ b/queue-2.6.32/btrfs-make-sure-fallocate-properly-starts-a-transaction.patch @@ -0,0 +1,64 @@ +From 3a1abec9f6880cf406593c392636199ea1c6c917 Mon Sep 17 00:00:00 2001 +From: Chris Mason +Date: Thu, 17 Dec 2009 15:47:17 -0500 +Subject: Btrfs: make sure fallocate properly starts a transaction + +From: Chris Mason + +commit 3a1abec9f6880cf406593c392636199ea1c6c917 upstream. + +The recent patch to make fallocate enospc friendly would send +down a NULL trans handle to the allocator. This moves the +transaction start to properly fix things. 
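
The ordering matters because btrfs_reserve_extent() takes the transaction
handle as its first argument: after the previous change it could be reached
with no transaction open at all. The corrected flow, per the hunk below,
starts the handle first and makes the error paths close it:

	trans = btrfs_start_transaction(root, 1);

	ret = btrfs_reserve_extent(trans, root, alloc_size,
				   root->sectorsize, 0, alloc_hint,
				   (u64)-1, &ins, 1);
	if (ret)
		goto stop_trans;
	/* ... */
stop_trans:
	btrfs_end_transaction(trans, root);	/* don't leak the open handle */
	return ret;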
+ +Signed-off-by: Chris Mason +Acked-by: Jeff Mahoney +Signed-off-by: Greg Kroah-Hartman + + +--- + fs/btrfs/inode.c | 13 +++++++++---- + 1 file changed, 9 insertions(+), 4 deletions(-) + +--- a/fs/btrfs/inode.c ++++ b/fs/btrfs/inode.c +@@ -5802,23 +5802,23 @@ static int prealloc_file_range(struct in + while (num_bytes > 0) { + alloc_size = min(num_bytes, root->fs_info->max_extent); + ++ trans = btrfs_start_transaction(root, 1); ++ + ret = btrfs_reserve_extent(trans, root, alloc_size, + root->sectorsize, 0, alloc_hint, + (u64)-1, &ins, 1); + if (ret) { + WARN_ON(1); +- break; ++ goto stop_trans; + } + + ret = btrfs_reserve_metadata_space(root, 3); + if (ret) { + btrfs_free_reserved_extent(root, ins.objectid, + ins.offset); +- break; ++ goto stop_trans; + } + +- trans = btrfs_start_transaction(root, 1); +- + ret = insert_reserved_file_extent(trans, inode, + cur_offset, ins.objectid, + ins.offset, ins.offset, +@@ -5847,6 +5847,11 @@ static int prealloc_file_range(struct in + btrfs_unreserve_metadata_space(root, 3); + } + return ret; ++ ++stop_trans: ++ btrfs_end_transaction(trans, root); ++ return ret; ++ + } + + static long btrfs_fallocate(struct inode *inode, int mode, diff --git a/queue-2.6.32/btrfs-make-truncate-2-more-enospc-friendly.patch b/queue-2.6.32/btrfs-make-truncate-2-more-enospc-friendly.patch new file mode 100644 index 00000000000..4fb7e9e8773 --- /dev/null +++ b/queue-2.6.32/btrfs-make-truncate-2-more-enospc-friendly.patch @@ -0,0 +1,568 @@ +From 8082510e7124cc50d728f1b875639cb4e22312cc Mon Sep 17 00:00:00 2001 +From: Yan, Zheng +Date: Thu, 12 Nov 2009 09:35:36 +0000 +Subject: Btrfs: Make truncate(2) more ENOSPC friendly + +From: Yan, Zheng + +commit 8082510e7124cc50d728f1b875639cb4e22312cc upstream. + +truncating and deleting regular files are unbound operations, +so it's not good to do them in a single transaction. This +patch makes btrfs_truncate and btrfs_delete_inode start a +new transaction after all items in a tree leaf are deleted. + +Signed-off-by: Yan Zheng +Signed-off-by: Chris Mason +Acked-by: Jeff Mahoney +Signed-off-by: Greg Kroah-Hartman + + +--- + fs/btrfs/inode.c | 316 ++++++++++++++++++++++++++++++-------------------- + fs/btrfs/relocation.c | 33 +++-- + 2 files changed, 212 insertions(+), 137 deletions(-) + +--- a/fs/btrfs/inode.c ++++ b/fs/btrfs/inode.c +@@ -2848,37 +2848,40 @@ out: + * min_type is the minimum key type to truncate down to. If set to 0, this + * will kill all the items on this inode, including the INODE_ITEM_KEY. 
+ */ +-noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, +- struct btrfs_root *root, +- struct inode *inode, +- u64 new_size, u32 min_type) ++int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, ++ struct btrfs_root *root, ++ struct inode *inode, ++ u64 new_size, u32 min_type) + { +- int ret; + struct btrfs_path *path; +- struct btrfs_key key; +- struct btrfs_key found_key; +- u32 found_type = (u8)-1; + struct extent_buffer *leaf; + struct btrfs_file_extent_item *fi; ++ struct btrfs_key key; ++ struct btrfs_key found_key; + u64 extent_start = 0; + u64 extent_num_bytes = 0; + u64 extent_offset = 0; + u64 item_end = 0; ++ u64 mask = root->sectorsize - 1; ++ u32 found_type = (u8)-1; + int found_extent; + int del_item; + int pending_del_nr = 0; + int pending_del_slot = 0; + int extent_type = -1; + int encoding; +- u64 mask = root->sectorsize - 1; ++ int ret; ++ int err = 0; ++ ++ BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY); + + if (root->ref_cows) + btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0); ++ + path = btrfs_alloc_path(); + BUG_ON(!path); + path->reada = -1; + +- /* FIXME, add redo link to tree so we don't leak on crash */ + key.objectid = inode->i_ino; + key.offset = (u64)-1; + key.type = (u8)-1; +@@ -2886,17 +2889,17 @@ noinline int btrfs_truncate_inode_items( + search_again: + path->leave_spinning = 1; + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); +- if (ret < 0) +- goto error; ++ if (ret < 0) { ++ err = ret; ++ goto out; ++ } + + if (ret > 0) { + /* there are no items in the tree for us to truncate, we're + * done + */ +- if (path->slots[0] == 0) { +- ret = 0; +- goto error; +- } ++ if (path->slots[0] == 0) ++ goto out; + path->slots[0]--; + } + +@@ -2931,28 +2934,17 @@ search_again: + } + item_end--; + } +- if (item_end < new_size) { +- if (found_type == BTRFS_DIR_ITEM_KEY) +- found_type = BTRFS_INODE_ITEM_KEY; +- else if (found_type == BTRFS_EXTENT_ITEM_KEY) +- found_type = BTRFS_EXTENT_DATA_KEY; +- else if (found_type == BTRFS_EXTENT_DATA_KEY) +- found_type = BTRFS_XATTR_ITEM_KEY; +- else if (found_type == BTRFS_XATTR_ITEM_KEY) +- found_type = BTRFS_INODE_REF_KEY; +- else if (found_type) +- found_type--; +- else ++ if (found_type > min_type) { ++ del_item = 1; ++ } else { ++ if (item_end < new_size) + break; +- btrfs_set_key_type(&key, found_type); +- goto next; ++ if (found_key.offset >= new_size) ++ del_item = 1; ++ else ++ del_item = 0; + } +- if (found_key.offset >= new_size) +- del_item = 1; +- else +- del_item = 0; + found_extent = 0; +- + /* FIXME, shrink the extent if the ref count is only 1 */ + if (found_type != BTRFS_EXTENT_DATA_KEY) + goto delete; +@@ -3039,42 +3031,36 @@ delete: + inode->i_ino, extent_offset); + BUG_ON(ret); + } +-next: +- if (path->slots[0] == 0) { +- if (pending_del_nr) +- goto del_pending; +- btrfs_release_path(root, path); +- if (found_type == BTRFS_INODE_ITEM_KEY) +- break; +- goto search_again; +- } + +- path->slots[0]--; +- if (pending_del_nr && +- path->slots[0] + 1 != pending_del_slot) { +- struct btrfs_key debug; +-del_pending: +- btrfs_item_key_to_cpu(path->nodes[0], &debug, +- pending_del_slot); +- ret = btrfs_del_items(trans, root, path, +- pending_del_slot, +- pending_del_nr); +- BUG_ON(ret); +- pending_del_nr = 0; ++ if (found_type == BTRFS_INODE_ITEM_KEY) ++ break; ++ ++ if (path->slots[0] == 0 || ++ path->slots[0] != pending_del_slot) { ++ if (root->ref_cows) { ++ err = -EAGAIN; ++ goto out; ++ } ++ if (pending_del_nr) { ++ ret = btrfs_del_items(trans, 
root, path, ++ pending_del_slot, ++ pending_del_nr); ++ BUG_ON(ret); ++ pending_del_nr = 0; ++ } + btrfs_release_path(root, path); +- if (found_type == BTRFS_INODE_ITEM_KEY) +- break; + goto search_again; ++ } else { ++ path->slots[0]--; + } + } +- ret = 0; +-error: ++out: + if (pending_del_nr) { + ret = btrfs_del_items(trans, root, path, pending_del_slot, + pending_del_nr); + } + btrfs_free_path(path); +- return ret; ++ return err; + } + + /* +@@ -3194,10 +3180,6 @@ int btrfs_cont_expand(struct inode *inod + if (size <= hole_start) + return 0; + +- err = btrfs_truncate_page(inode->i_mapping, inode->i_size); +- if (err) +- return err; +- + while (1) { + struct btrfs_ordered_extent *ordered; + btrfs_wait_ordered_range(inode, hole_start, +@@ -3210,9 +3192,6 @@ int btrfs_cont_expand(struct inode *inod + btrfs_put_ordered_extent(ordered); + } + +- trans = btrfs_start_transaction(root, 1); +- btrfs_set_trans_block_group(trans, inode); +- + cur_offset = hole_start; + while (1) { + em = btrfs_get_extent(inode, NULL, 0, cur_offset, +@@ -3220,38 +3199,120 @@ int btrfs_cont_expand(struct inode *inod + BUG_ON(IS_ERR(em) || !em); + last_byte = min(extent_map_end(em), block_end); + last_byte = (last_byte + mask) & ~mask; +- if (test_bit(EXTENT_FLAG_VACANCY, &em->flags)) { ++ if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { + u64 hint_byte = 0; + hole_size = last_byte - cur_offset; +- err = btrfs_drop_extents(trans, inode, cur_offset, +- cur_offset + hole_size, +- &hint_byte, 1); +- if (err) +- break; + +- err = btrfs_reserve_metadata_space(root, 1); ++ err = btrfs_reserve_metadata_space(root, 2); + if (err) + break; + ++ trans = btrfs_start_transaction(root, 1); ++ btrfs_set_trans_block_group(trans, inode); ++ ++ err = btrfs_drop_extents(trans, inode, cur_offset, ++ cur_offset + hole_size, ++ &hint_byte, 1); ++ BUG_ON(err); ++ + err = btrfs_insert_file_extent(trans, root, + inode->i_ino, cur_offset, 0, + 0, hole_size, 0, hole_size, + 0, 0, 0); ++ BUG_ON(err); ++ + btrfs_drop_extent_cache(inode, hole_start, + last_byte - 1, 0); +- btrfs_unreserve_metadata_space(root, 1); ++ ++ btrfs_end_transaction(trans, root); ++ btrfs_unreserve_metadata_space(root, 2); + } + free_extent_map(em); + cur_offset = last_byte; +- if (err || cur_offset >= block_end) ++ if (cur_offset >= block_end) + break; + } + +- btrfs_end_transaction(trans, root); + unlock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS); + return err; + } + ++static int btrfs_setattr_size(struct inode *inode, struct iattr *attr) ++{ ++ struct btrfs_root *root = BTRFS_I(inode)->root; ++ struct btrfs_trans_handle *trans; ++ unsigned long nr; ++ int ret; ++ ++ if (attr->ia_size == inode->i_size) ++ return 0; ++ ++ if (attr->ia_size > inode->i_size) { ++ unsigned long limit; ++ limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; ++ if (attr->ia_size > inode->i_sb->s_maxbytes) ++ return -EFBIG; ++ if (limit != RLIM_INFINITY && attr->ia_size > limit) { ++ send_sig(SIGXFSZ, current, 0); ++ return -EFBIG; ++ } ++ } ++ ++ ret = btrfs_reserve_metadata_space(root, 1); ++ if (ret) ++ return ret; ++ ++ trans = btrfs_start_transaction(root, 1); ++ btrfs_set_trans_block_group(trans, inode); ++ ++ ret = btrfs_orphan_add(trans, inode); ++ BUG_ON(ret); ++ ++ nr = trans->blocks_used; ++ btrfs_end_transaction(trans, root); ++ btrfs_unreserve_metadata_space(root, 1); ++ btrfs_btree_balance_dirty(root, nr); ++ ++ if (attr->ia_size > inode->i_size) { ++ ret = btrfs_cont_expand(inode, attr->ia_size); ++ if (ret) { ++ btrfs_truncate(inode); ++ return ret; ++ } ++ ++ 
i_size_write(inode, attr->ia_size); ++ btrfs_ordered_update_i_size(inode, inode->i_size, NULL); ++ ++ trans = btrfs_start_transaction(root, 1); ++ btrfs_set_trans_block_group(trans, inode); ++ ++ ret = btrfs_update_inode(trans, root, inode); ++ BUG_ON(ret); ++ if (inode->i_nlink > 0) { ++ ret = btrfs_orphan_del(trans, inode); ++ BUG_ON(ret); ++ } ++ nr = trans->blocks_used; ++ btrfs_end_transaction(trans, root); ++ btrfs_btree_balance_dirty(root, nr); ++ return 0; ++ } ++ ++ /* ++ * We're truncating a file that used to have good data down to ++ * zero. Make sure it gets into the ordered flush list so that ++ * any new writes get down to disk quickly. ++ */ ++ if (attr->ia_size == 0) ++ BTRFS_I(inode)->ordered_data_close = 1; ++ ++ /* we don't support swapfiles, so vmtruncate shouldn't fail */ ++ ret = vmtruncate(inode, attr->ia_size); ++ BUG_ON(ret); ++ ++ return 0; ++} ++ + static int btrfs_setattr(struct dentry *dentry, struct iattr *attr) + { + struct inode *inode = dentry->d_inode; +@@ -3262,23 +3323,14 @@ static int btrfs_setattr(struct dentry * + return err; + + if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) { +- if (attr->ia_size > inode->i_size) { +- err = btrfs_cont_expand(inode, attr->ia_size); +- if (err) +- return err; +- } else if (inode->i_size > 0 && +- attr->ia_size == 0) { +- +- /* we're truncating a file that used to have good +- * data down to zero. Make sure it gets into +- * the ordered flush list so that any new writes +- * get down to disk quickly. +- */ +- BTRFS_I(inode)->ordered_data_close = 1; +- } ++ err = btrfs_setattr_size(inode, attr); ++ if (err) ++ return err; + } ++ attr->ia_valid &= ~ATTR_SIZE; + +- err = inode_setattr(inode, attr); ++ if (attr->ia_valid) ++ err = inode_setattr(inode, attr); + + if (!err && ((attr->ia_valid & ATTR_MODE))) + err = btrfs_acl_chmod(inode); +@@ -3310,30 +3362,32 @@ void btrfs_delete_inode(struct inode *in + } + + btrfs_i_size_write(inode, 0); +- trans = btrfs_join_transaction(root, 1); + +- btrfs_set_trans_block_group(trans, inode); +- ret = btrfs_truncate_inode_items(trans, root, inode, inode->i_size, 0); +- if (ret) { +- btrfs_orphan_del(NULL, inode); +- goto no_delete_lock; +- } ++ while (1) { ++ trans = btrfs_start_transaction(root, 1); ++ btrfs_set_trans_block_group(trans, inode); ++ ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0); + +- btrfs_orphan_del(trans, inode); ++ if (ret != -EAGAIN) ++ break; + +- nr = trans->blocks_used; +- clear_inode(inode); ++ nr = trans->blocks_used; ++ btrfs_end_transaction(trans, root); ++ trans = NULL; ++ btrfs_btree_balance_dirty(root, nr); ++ } + +- btrfs_end_transaction(trans, root); +- btrfs_btree_balance_dirty(root, nr); +- return; ++ if (ret == 0) { ++ ret = btrfs_orphan_del(trans, inode); ++ BUG_ON(ret); ++ } + +-no_delete_lock: + nr = trans->blocks_used; + btrfs_end_transaction(trans, root); + btrfs_btree_balance_dirty(root, nr); + no_delete: + clear_inode(inode); ++ return; + } + + /* +@@ -5097,17 +5151,20 @@ static void btrfs_truncate(struct inode + unsigned long nr; + u64 mask = root->sectorsize - 1; + +- if (!S_ISREG(inode->i_mode)) +- return; +- if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) ++ if (!S_ISREG(inode->i_mode)) { ++ WARN_ON(1); + return; ++ } + + ret = btrfs_truncate_page(inode->i_mapping, inode->i_size); + if (ret) + return; ++ + btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1); ++ btrfs_ordered_update_i_size(inode, inode->i_size, NULL); + + trans = btrfs_start_transaction(root, 1); ++ btrfs_set_trans_block_group(trans, 
inode); + + /* + * setattr is responsible for setting the ordered_data_close flag, +@@ -5129,21 +5186,32 @@ static void btrfs_truncate(struct inode + if (inode->i_size == 0 && BTRFS_I(inode)->ordered_data_close) + btrfs_add_ordered_operation(trans, root, inode); + +- btrfs_set_trans_block_group(trans, inode); +- btrfs_i_size_write(inode, inode->i_size); ++ while (1) { ++ ret = btrfs_truncate_inode_items(trans, root, inode, ++ inode->i_size, ++ BTRFS_EXTENT_DATA_KEY); ++ if (ret != -EAGAIN) ++ break; + +- ret = btrfs_orphan_add(trans, inode); +- if (ret) +- goto out; +- /* FIXME, add redo link to tree so we don't leak on crash */ +- ret = btrfs_truncate_inode_items(trans, root, inode, inode->i_size, +- BTRFS_EXTENT_DATA_KEY); +- btrfs_update_inode(trans, root, inode); ++ ret = btrfs_update_inode(trans, root, inode); ++ BUG_ON(ret); ++ ++ nr = trans->blocks_used; ++ btrfs_end_transaction(trans, root); ++ btrfs_btree_balance_dirty(root, nr); ++ ++ trans = btrfs_start_transaction(root, 1); ++ btrfs_set_trans_block_group(trans, inode); ++ } + +- ret = btrfs_orphan_del(trans, inode); ++ if (ret == 0 && inode->i_nlink > 0) { ++ ret = btrfs_orphan_del(trans, inode); ++ BUG_ON(ret); ++ } ++ ++ ret = btrfs_update_inode(trans, root, inode); + BUG_ON(ret); + +-out: + nr = trans->blocks_used; + ret = btrfs_end_transaction_throttle(trans, root); + BUG_ON(ret); +@@ -5240,9 +5308,9 @@ void btrfs_destroy_inode(struct inode *i + + spin_lock(&root->list_lock); + if (!list_empty(&BTRFS_I(inode)->i_orphan)) { +- printk(KERN_ERR "BTRFS: inode %lu: inode still on the orphan" +- " list\n", inode->i_ino); +- dump_stack(); ++ printk(KERN_INFO "BTRFS: inode %lu still on the orphan list\n", ++ inode->i_ino); ++ list_del_init(&BTRFS_I(inode)->i_orphan); + } + spin_unlock(&root->list_lock); + +--- a/fs/btrfs/relocation.c ++++ b/fs/btrfs/relocation.c +@@ -1561,6 +1561,20 @@ static int invalidate_extent_cache(struc + return 0; + } + ++static void put_inodes(struct list_head *list) ++{ ++ struct inodevec *ivec; ++ while (!list_empty(list)) { ++ ivec = list_entry(list->next, struct inodevec, list); ++ list_del(&ivec->list); ++ while (ivec->nr > 0) { ++ ivec->nr--; ++ iput(ivec->inode[ivec->nr]); ++ } ++ kfree(ivec); ++ } ++} ++ + static int find_next_key(struct btrfs_path *path, int level, + struct btrfs_key *key) + +@@ -1723,6 +1737,11 @@ static noinline_for_stack int merge_relo + + btrfs_btree_balance_dirty(root, nr); + ++ /* ++ * put inodes outside transaction, otherwise we may deadlock. 
++ */ ++ put_inodes(&inode_list); ++ + if (replaced && rc->stage == UPDATE_DATA_PTRS) + invalidate_extent_cache(root, &key, &next_key); + } +@@ -1752,19 +1771,7 @@ out: + + btrfs_btree_balance_dirty(root, nr); + +- /* +- * put inodes while we aren't holding the tree locks +- */ +- while (!list_empty(&inode_list)) { +- struct inodevec *ivec; +- ivec = list_entry(inode_list.next, struct inodevec, list); +- list_del(&ivec->list); +- while (ivec->nr > 0) { +- ivec->nr--; +- iput(ivec->inode[ivec->nr]); +- } +- kfree(ivec); +- } ++ put_inodes(&inode_list); + + if (replaced && rc->stage == UPDATE_DATA_PTRS) + invalidate_extent_cache(root, &key, &next_key); diff --git a/queue-2.6.32/btrfs-pass-transaction-handle-to-security-and-acl-initialization-functions.patch b/queue-2.6.32/btrfs-pass-transaction-handle-to-security-and-acl-initialization-functions.patch new file mode 100644 index 00000000000..6b7389ef599 --- /dev/null +++ b/queue-2.6.32/btrfs-pass-transaction-handle-to-security-and-acl-initialization-functions.patch @@ -0,0 +1,419 @@ +From f34f57a3ab4e73304d78c125682f1a53cd3975f2 Mon Sep 17 00:00:00 2001 +From: Yan, Zheng +Date: Thu, 12 Nov 2009 09:35:27 +0000 +Subject: Btrfs: Pass transaction handle to security and ACL initialization functions + +From: Yan, Zheng + +commit f34f57a3ab4e73304d78c125682f1a53cd3975f2 upstream. + +Pass transaction handle down to security and ACL initialization +functions, so we can avoid starting nested transactions + +Signed-off-by: Yan Zheng +Signed-off-by: Chris Mason +Acked-by: Jeff Mahoney +Signed-off-by: Greg Kroah-Hartman + + +--- + fs/btrfs/acl.c | 23 ++++++++------ + fs/btrfs/ctree.h | 13 +++++--- + fs/btrfs/dir-item.c | 19 ++++-------- + fs/btrfs/inode.c | 15 +++++---- + fs/btrfs/xattr.c | 80 +++++++++++++++++++++++++++++++++++----------------- + fs/btrfs/xattr.h | 9 +++-- + 6 files changed, 96 insertions(+), 63 deletions(-) + +--- a/fs/btrfs/acl.c ++++ b/fs/btrfs/acl.c +@@ -94,7 +94,8 @@ static int btrfs_xattr_get_acl(struct in + /* + * Needs to be called with fs_mutex held + */ +-static int btrfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) ++static int btrfs_set_acl(struct btrfs_trans_handle *trans, ++ struct inode *inode, struct posix_acl *acl, int type) + { + int ret, size = 0; + const char *name; +@@ -140,8 +141,7 @@ static int btrfs_set_acl(struct inode *i + goto out; + } + +- ret = __btrfs_setxattr(inode, name, value, size, 0); +- ++ ret = __btrfs_setxattr(trans, inode, name, value, size, 0); + out: + kfree(value); + +@@ -154,7 +154,7 @@ out: + static int btrfs_xattr_set_acl(struct inode *inode, int type, + const void *value, size_t size) + { +- int ret = 0; ++ int ret; + struct posix_acl *acl = NULL; + + if (!is_owner_or_cap(inode)) +@@ -170,7 +170,7 @@ static int btrfs_xattr_set_acl(struct in + } + } + +- ret = btrfs_set_acl(inode, acl, type); ++ ret = btrfs_set_acl(NULL, inode, acl, type); + + posix_acl_release(acl); + +@@ -224,7 +224,8 @@ int btrfs_check_acl(struct inode *inode, + * stuff has been fixed to work with that. If the locking stuff changes, we + * need to re-evaluate the acl locking stuff. 
+ */ +-int btrfs_init_acl(struct inode *inode, struct inode *dir) ++int btrfs_init_acl(struct btrfs_trans_handle *trans, ++ struct inode *inode, struct inode *dir) + { + struct posix_acl *acl = NULL; + int ret = 0; +@@ -249,7 +250,8 @@ int btrfs_init_acl(struct inode *inode, + mode_t mode; + + if (S_ISDIR(inode->i_mode)) { +- ret = btrfs_set_acl(inode, acl, ACL_TYPE_DEFAULT); ++ ret = btrfs_set_acl(trans, inode, acl, ++ ACL_TYPE_DEFAULT); + if (ret) + goto failed; + } +@@ -264,7 +266,7 @@ int btrfs_init_acl(struct inode *inode, + inode->i_mode = mode; + if (ret > 0) { + /* we need an acl */ +- ret = btrfs_set_acl(inode, clone, ++ ret = btrfs_set_acl(trans, inode, clone, + ACL_TYPE_ACCESS); + } + } +@@ -297,7 +299,7 @@ int btrfs_acl_chmod(struct inode *inode) + + ret = posix_acl_chmod_masq(clone, inode->i_mode); + if (!ret) +- ret = btrfs_set_acl(inode, clone, ACL_TYPE_ACCESS); ++ ret = btrfs_set_acl(NULL, inode, clone, ACL_TYPE_ACCESS); + + posix_acl_release(clone); + +@@ -323,7 +325,8 @@ int btrfs_acl_chmod(struct inode *inode) + return 0; + } + +-int btrfs_init_acl(struct inode *inode, struct inode *dir) ++int btrfs_init_acl(struct btrfs_trans_handle *trans, ++ struct inode *inode, struct inode *dir) + { + return 0; + } +--- a/fs/btrfs/ctree.h ++++ b/fs/btrfs/ctree.h +@@ -310,6 +310,9 @@ struct btrfs_header { + #define BTRFS_MAX_INLINE_DATA_SIZE(r) (BTRFS_LEAF_DATA_SIZE(r) - \ + sizeof(struct btrfs_item) - \ + sizeof(struct btrfs_file_extent_item)) ++#define BTRFS_MAX_XATTR_SIZE(r) (BTRFS_LEAF_DATA_SIZE(r) - \ ++ sizeof(struct btrfs_item) -\ ++ sizeof(struct btrfs_dir_item)) + + + /* +@@ -2201,9 +2204,10 @@ int btrfs_delete_one_dir_name(struct btr + struct btrfs_path *path, + struct btrfs_dir_item *di); + int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans, +- struct btrfs_root *root, const char *name, +- u16 name_len, const void *data, u16 data_len, +- u64 dir); ++ struct btrfs_root *root, ++ struct btrfs_path *path, u64 objectid, ++ const char *name, u16 name_len, ++ const void *data, u16 data_len); + struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 dir, +@@ -2382,7 +2386,8 @@ int btrfs_check_acl(struct inode *inode, + #else + #define btrfs_check_acl NULL + #endif +-int btrfs_init_acl(struct inode *inode, struct inode *dir); ++int btrfs_init_acl(struct btrfs_trans_handle *trans, ++ struct inode *inode, struct inode *dir); + int btrfs_acl_chmod(struct inode *inode); + + /* relocation.c */ +--- a/fs/btrfs/dir-item.c ++++ b/fs/btrfs/dir-item.c +@@ -68,12 +68,12 @@ static struct btrfs_dir_item *insert_wit + * into the tree + */ + int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans, +- struct btrfs_root *root, const char *name, +- u16 name_len, const void *data, u16 data_len, +- u64 dir) ++ struct btrfs_root *root, ++ struct btrfs_path *path, u64 objectid, ++ const char *name, u16 name_len, ++ const void *data, u16 data_len) + { + int ret = 0; +- struct btrfs_path *path; + struct btrfs_dir_item *dir_item; + unsigned long name_ptr, data_ptr; + struct btrfs_key key, location; +@@ -81,15 +81,11 @@ int btrfs_insert_xattr_item(struct btrfs + struct extent_buffer *leaf; + u32 data_size; + +- key.objectid = dir; ++ BUG_ON(name_len + data_len > BTRFS_MAX_XATTR_SIZE(root)); ++ ++ key.objectid = objectid; + btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY); + key.offset = btrfs_name_hash(name, name_len); +- path = btrfs_alloc_path(); +- if (!path) +- return -ENOMEM; +- if (name_len + data_len + 
sizeof(struct btrfs_dir_item) > +- BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item)) +- return -ENOSPC; + + data_size = sizeof(*dir_item) + name_len + data_len; + dir_item = insert_with_overflow(trans, root, path, &key, data_size, +@@ -117,7 +113,6 @@ int btrfs_insert_xattr_item(struct btrfs + write_extent_buffer(leaf, data, data_ptr, data_len); + btrfs_mark_buffer_dirty(path->nodes[0]); + +- btrfs_free_path(path); + return ret; + } + +--- a/fs/btrfs/inode.c ++++ b/fs/btrfs/inode.c +@@ -88,13 +88,14 @@ static noinline int cow_file_range(struc + u64 start, u64 end, int *page_started, + unsigned long *nr_written, int unlock); + +-static int btrfs_init_inode_security(struct inode *inode, struct inode *dir) ++static int btrfs_init_inode_security(struct btrfs_trans_handle *trans, ++ struct inode *inode, struct inode *dir) + { + int err; + +- err = btrfs_init_acl(inode, dir); ++ err = btrfs_init_acl(trans, inode, dir); + if (!err) +- err = btrfs_xattr_security_init(inode, dir); ++ err = btrfs_xattr_security_init(trans, inode, dir); + return err; + } + +@@ -4296,7 +4297,7 @@ static int btrfs_mknod(struct inode *dir + if (IS_ERR(inode)) + goto out_unlock; + +- err = btrfs_init_inode_security(inode, dir); ++ err = btrfs_init_inode_security(trans, inode, dir); + if (err) { + drop_inode = 1; + goto out_unlock; +@@ -4367,7 +4368,7 @@ static int btrfs_create(struct inode *di + if (IS_ERR(inode)) + goto out_unlock; + +- err = btrfs_init_inode_security(inode, dir); ++ err = btrfs_init_inode_security(trans, inode, dir); + if (err) { + drop_inode = 1; + goto out_unlock; +@@ -4500,7 +4501,7 @@ static int btrfs_mkdir(struct inode *dir + + drop_on_err = 1; + +- err = btrfs_init_inode_security(inode, dir); ++ err = btrfs_init_inode_security(trans, inode, dir); + if (err) + goto out_fail; + +@@ -5660,7 +5661,7 @@ static int btrfs_symlink(struct inode *d + if (IS_ERR(inode)) + goto out_unlock; + +- err = btrfs_init_inode_security(inode, dir); ++ err = btrfs_init_inode_security(trans, inode, dir); + if (err) { + drop_inode = 1; + goto out_unlock; +--- a/fs/btrfs/xattr.c ++++ b/fs/btrfs/xattr.c +@@ -85,22 +85,23 @@ out: + return ret; + } + +-int __btrfs_setxattr(struct inode *inode, const char *name, +- const void *value, size_t size, int flags) ++static int do_setxattr(struct btrfs_trans_handle *trans, ++ struct inode *inode, const char *name, ++ const void *value, size_t size, int flags) + { + struct btrfs_dir_item *di; + struct btrfs_root *root = BTRFS_I(inode)->root; +- struct btrfs_trans_handle *trans; + struct btrfs_path *path; +- int ret = 0, mod = 0; ++ size_t name_len = strlen(name); ++ int ret = 0; ++ ++ if (name_len + size > BTRFS_MAX_XATTR_SIZE(root)) ++ return -ENOSPC; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + +- trans = btrfs_join_transaction(root, 1); +- btrfs_set_trans_block_group(trans, inode); +- + /* first lets see if we already have this xattr */ + di = btrfs_lookup_xattr(trans, root, path, inode->i_ino, name, + strlen(name), -1); +@@ -118,15 +119,12 @@ int __btrfs_setxattr(struct inode *inode + } + + ret = btrfs_delete_one_dir_name(trans, root, path, di); +- if (ret) +- goto out; ++ BUG_ON(ret); + btrfs_release_path(root, path); + + /* if we don't have a value then we are removing the xattr */ +- if (!value) { +- mod = 1; ++ if (!value) + goto out; +- } + } else { + btrfs_release_path(root, path); + +@@ -138,20 +136,45 @@ int __btrfs_setxattr(struct inode *inode + } + + /* ok we have to create a completely new xattr */ +- ret = btrfs_insert_xattr_item(trans, root, 
name, strlen(name), +- value, size, inode->i_ino); ++ ret = btrfs_insert_xattr_item(trans, root, path, inode->i_ino, ++ name, name_len, value, size); ++ BUG_ON(ret); ++out: ++ btrfs_free_path(path); ++ return ret; ++} ++ ++int __btrfs_setxattr(struct btrfs_trans_handle *trans, ++ struct inode *inode, const char *name, ++ const void *value, size_t size, int flags) ++{ ++ struct btrfs_root *root = BTRFS_I(inode)->root; ++ int ret; ++ ++ if (trans) ++ return do_setxattr(trans, inode, name, value, size, flags); ++ ++ ret = btrfs_reserve_metadata_space(root, 2); + if (ret) +- goto out; +- mod = 1; ++ return ret; + +-out: +- if (mod) { +- inode->i_ctime = CURRENT_TIME; +- ret = btrfs_update_inode(trans, root, inode); ++ trans = btrfs_start_transaction(root, 1); ++ if (!trans) { ++ ret = -ENOMEM; ++ goto out; + } ++ btrfs_set_trans_block_group(trans, inode); + +- btrfs_end_transaction(trans, root); +- btrfs_free_path(path); ++ ret = do_setxattr(trans, inode, name, value, size, flags); ++ if (ret) ++ goto out; ++ ++ inode->i_ctime = CURRENT_TIME; ++ ret = btrfs_update_inode(trans, root, inode); ++ BUG_ON(ret); ++out: ++ btrfs_end_transaction_throttle(trans, root); ++ btrfs_unreserve_metadata_space(root, 2); + return ret; + } + +@@ -314,7 +337,9 @@ int btrfs_setxattr(struct dentry *dentry + + if (size == 0) + value = ""; /* empty EA, do not remove */ +- return __btrfs_setxattr(dentry->d_inode, name, value, size, flags); ++ ++ return __btrfs_setxattr(NULL, dentry->d_inode, name, value, size, ++ flags); + } + + int btrfs_removexattr(struct dentry *dentry, const char *name) +@@ -329,10 +354,13 @@ int btrfs_removexattr(struct dentry *den + + if (!btrfs_is_valid_xattr(name)) + return -EOPNOTSUPP; +- return __btrfs_setxattr(dentry->d_inode, name, NULL, 0, XATTR_REPLACE); ++ ++ return __btrfs_setxattr(NULL, dentry->d_inode, name, NULL, 0, ++ XATTR_REPLACE); + } + +-int btrfs_xattr_security_init(struct inode *inode, struct inode *dir) ++int btrfs_xattr_security_init(struct btrfs_trans_handle *trans, ++ struct inode *inode, struct inode *dir) + { + int err; + size_t len; +@@ -354,7 +382,7 @@ int btrfs_xattr_security_init(struct ino + } else { + strcpy(name, XATTR_SECURITY_PREFIX); + strcpy(name + XATTR_SECURITY_PREFIX_LEN, suffix); +- err = __btrfs_setxattr(inode, name, value, len, 0); ++ err = __btrfs_setxattr(trans, inode, name, value, len, 0); + kfree(name); + } + +--- a/fs/btrfs/xattr.h ++++ b/fs/btrfs/xattr.h +@@ -27,15 +27,16 @@ extern struct xattr_handler *btrfs_xattr + + extern ssize_t __btrfs_getxattr(struct inode *inode, const char *name, + void *buffer, size_t size); +-extern int __btrfs_setxattr(struct inode *inode, const char *name, +- const void *value, size_t size, int flags); +- ++extern int __btrfs_setxattr(struct btrfs_trans_handle *trans, ++ struct inode *inode, const char *name, ++ const void *value, size_t size, int flags); + extern ssize_t btrfs_getxattr(struct dentry *dentry, const char *name, + void *buffer, size_t size); + extern int btrfs_setxattr(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags); + extern int btrfs_removexattr(struct dentry *dentry, const char *name); + +-extern int btrfs_xattr_security_init(struct inode *inode, struct inode *dir); ++extern int btrfs_xattr_security_init(struct btrfs_trans_handle *trans, ++ struct inode *inode, struct inode *dir); + + #endif /* __XATTR__ */ diff --git a/queue-2.6.32/btrfs-remove-bug_on-due-to-mounting-bad-filesystem.patch b/queue-2.6.32/btrfs-remove-bug_on-due-to-mounting-bad-filesystem.patch new 
file mode 100644
index 00000000000..d26e5598f9f
--- /dev/null
+++ b/queue-2.6.32/btrfs-remove-bug_on-due-to-mounting-bad-filesystem.patch
@@ -0,0 +1,63 @@
+From d7ce5843bb28ada6845ab2ae8510ba3f12d33154 Mon Sep 17 00:00:00 2001
+From: Miao Xie
+Date: Tue, 2 Feb 2010 08:46:44 +0000
+Subject: Btrfs: remove BUG_ON() due to mounting bad filesystem
+
+From: Miao Xie
+
+commit d7ce5843bb28ada6845ab2ae8510ba3f12d33154 upstream.
+
+Mounting a bad filesystem caused a BUG_ON(). These are the steps to
+reproduce it:
+ # mkfs.btrfs /dev/sda2
+ # mount /dev/sda2 /mnt
+ # mkfs.btrfs /dev/sda1 /dev/sda2
+ (the program says that /dev/sda2 was mounted, and then exits.)
+ # umount /mnt
+ # mount /dev/sda1 /mnt
+
+At the third step, mkfs.btrfs exited partway through making the filesystem,
+so the initialization of the filesystem never finished. The filesystem was
+left in a bad state, and mounting it triggered the BUG_ON(). But a BUG_ON()
+should flag broken kernel code, not a bad user operation, so this is a btrfs
+bug.
+
+This patch fixes it.
+
+Signed-off-by: Miao Xie
+Signed-off-by: Chris Mason
+Acked-by: Jeff Mahoney
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ fs/btrfs/disk-io.c | 7 ++++++-
+ fs/btrfs/relocation.c | 3 ++-
+ 2 files changed, 8 insertions(+), 2 deletions(-)
+
+--- a/fs/btrfs/disk-io.c
++++ b/fs/btrfs/disk-io.c
+@@ -1982,7 +1982,12 @@ struct btrfs_root *open_ctree(struct sup
+
+ if (!(sb->s_flags & MS_RDONLY)) {
+ ret = btrfs_recover_relocation(tree_root);
+- BUG_ON(ret);
++ if (ret < 0) {
++ printk(KERN_WARNING
++ "btrfs: failed to recover relocation\n");
++ err = -EINVAL;
++ goto fail_trans_kthread;
++ }
+ }
+
+ location.objectid = BTRFS_FS_TREE_OBJECTID;
+--- a/fs/btrfs/relocation.c
++++ b/fs/btrfs/relocation.c
+@@ -3764,7 +3764,8 @@ out:
+ BTRFS_DATA_RELOC_TREE_OBJECTID);
+ if (IS_ERR(fs_root))
+ err = PTR_ERR(fs_root);
+- btrfs_orphan_cleanup(fs_root);
++ else
++ btrfs_orphan_cleanup(fs_root);
+ }
+ return err;
+ }
diff --git a/queue-2.6.32/btrfs-rewrite-btrfs_drop_extents.patch b/queue-2.6.32/btrfs-rewrite-btrfs_drop_extents.patch
new file mode 100644
index 00000000000..a0f4de3bd3f
--- /dev/null
+++ b/queue-2.6.32/btrfs-rewrite-btrfs_drop_extents.patch
@@ -0,0 +1,943 @@
+From 920bbbfb05c9fce22e088d20eb9dcb8f96342de9 Mon Sep 17 00:00:00 2001
+From: Yan, Zheng
+Date: Thu, 12 Nov 2009 09:34:08 +0000
+Subject: Btrfs: Rewrite btrfs_drop_extents
+
+From: Yan, Zheng
+
+commit 920bbbfb05c9fce22e088d20eb9dcb8f96342de9 upstream.
+
+Rewrite btrfs_drop_extents by using btrfs_duplicate_item, so we can
+avoid calling lock_extent within a transaction.
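The rewritten function proceeds by case analysis on how the drop range [start, end) overlaps an existing extent item [key.offset, extent_end). Below is a standalone sketch of just that classification (the enum and function names are invented for illustration; the real code edits btree items in place rather than returning a value):

#include <stdio.h>

enum overlap {
	SPLIT_BOTH,	/* range inside extent: btrfs_duplicate_item, keep both ends */
	TRIM_FRONT,	/* range covers the head: move the key offset up to 'end' */
	TRIM_BACK,	/* range covers the tail: shrink num_bytes */
	DROP_WHOLE	/* extent fully inside range: queue the item for deletion */
};

static enum overlap classify(unsigned long long start, unsigned long long end,
			     unsigned long long key_offset,
			     unsigned long long extent_end)
{
	if (start > key_offset && end < extent_end)
		return SPLIT_BOTH;
	if (start <= key_offset && end < extent_end)
		return TRIM_FRONT;
	if (start > key_offset && end >= extent_end)
		return TRIM_BACK;
	return DROP_WHOLE;
}

int main(void)
{
	/* One example of each case; mirrors the ASCII diagrams in the patch. */
	printf("%d %d %d %d\n",
	       classify(10, 20, 0, 30),		/* SPLIT_BOTH */
	       classify(0, 20, 10, 30),		/* TRIM_FRONT */
	       classify(10, 40, 0, 30),		/* TRIM_BACK */
	       classify(0, 40, 10, 30));	/* DROP_WHOLE */
	return 0;
}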
+ +Signed-off-by: Yan Zheng +Signed-off-by: Chris Mason +Acked-by: Jeff Mahoney +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/ctree.h | 7 + fs/btrfs/file.c | 661 ++++++++++++++++++++-------------------------------- + fs/btrfs/inode.c | 27 -- + fs/btrfs/ioctl.c | 3 + fs/btrfs/tree-log.c | 4 + 5 files changed, 278 insertions(+), 424 deletions(-) + +--- a/fs/btrfs/ctree.h ++++ b/fs/btrfs/ctree.h +@@ -2349,12 +2349,9 @@ int btrfs_drop_extent_cache(struct inode + int skip_pinned); + int btrfs_check_file(struct btrfs_root *root, struct inode *inode); + extern const struct file_operations btrfs_file_operations; +-int btrfs_drop_extents(struct btrfs_trans_handle *trans, +- struct btrfs_root *root, struct inode *inode, +- u64 start, u64 end, u64 locked_end, +- u64 inline_limit, u64 *hint_block, int drop_cache); ++int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode, ++ u64 start, u64 end, u64 *hint_byte, int drop_cache); + int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, +- struct btrfs_root *root, + struct inode *inode, u64 start, u64 end); + int btrfs_release_file(struct inode *inode, struct file *file); + +--- a/fs/btrfs/file.c ++++ b/fs/btrfs/file.c +@@ -265,319 +265,247 @@ int btrfs_drop_extent_cache(struct inode + * If an extent intersects the range but is not entirely inside the range + * it is either truncated or split. Anything entirely inside the range + * is deleted from the tree. +- * +- * inline_limit is used to tell this code which offsets in the file to keep +- * if they contain inline extents. + */ +-noinline int btrfs_drop_extents(struct btrfs_trans_handle *trans, +- struct btrfs_root *root, struct inode *inode, +- u64 start, u64 end, u64 locked_end, +- u64 inline_limit, u64 *hint_byte, int drop_cache) ++int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode, ++ u64 start, u64 end, u64 *hint_byte, int drop_cache) + { +- u64 extent_end = 0; +- u64 search_start = start; +- u64 ram_bytes = 0; +- u64 disk_bytenr = 0; +- u64 orig_locked_end = locked_end; +- u8 compression; +- u8 encryption; +- u16 other_encoding = 0; ++ struct btrfs_root *root = BTRFS_I(inode)->root; + struct extent_buffer *leaf; +- struct btrfs_file_extent_item *extent; ++ struct btrfs_file_extent_item *fi; + struct btrfs_path *path; + struct btrfs_key key; +- struct btrfs_file_extent_item old; +- int keep; +- int slot; +- int bookend; +- int found_type = 0; +- int found_extent; +- int found_inline; ++ struct btrfs_key new_key; ++ u64 search_start = start; ++ u64 disk_bytenr = 0; ++ u64 num_bytes = 0; ++ u64 extent_offset = 0; ++ u64 extent_end = 0; ++ int del_nr = 0; ++ int del_slot = 0; ++ int extent_type; + int recow; + int ret; + +- inline_limit = 0; + if (drop_cache) + btrfs_drop_extent_cache(inode, start, end - 1, 0); + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; ++ + while (1) { + recow = 0; +- btrfs_release_path(root, path); + ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino, + search_start, -1); + if (ret < 0) +- goto out; +- if (ret > 0) { +- if (path->slots[0] == 0) { +- ret = 0; +- goto out; +- } +- path->slots[0]--; ++ break; ++ if (ret > 0 && path->slots[0] > 0 && search_start == start) { ++ leaf = path->nodes[0]; ++ btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1); ++ if (key.objectid == inode->i_ino && ++ key.type == BTRFS_EXTENT_DATA_KEY) ++ path->slots[0]--; + } ++ ret = 0; + next_slot: +- keep = 0; +- bookend = 0; +- found_extent = 0; +- found_inline = 0; +- compression = 0; +- encryption = 0; +- 
extent = NULL; + leaf = path->nodes[0]; +- slot = path->slots[0]; +- ret = 0; +- btrfs_item_key_to_cpu(leaf, &key, slot); +- if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY && +- key.offset >= end) { +- goto out; +- } +- if (btrfs_key_type(&key) > BTRFS_EXTENT_DATA_KEY || +- key.objectid != inode->i_ino) { +- goto out; +- } +- if (recow) { +- search_start = max(key.offset, start); +- continue; +- } +- if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY) { +- extent = btrfs_item_ptr(leaf, slot, +- struct btrfs_file_extent_item); +- found_type = btrfs_file_extent_type(leaf, extent); +- compression = btrfs_file_extent_compression(leaf, +- extent); +- encryption = btrfs_file_extent_encryption(leaf, +- extent); +- other_encoding = btrfs_file_extent_other_encoding(leaf, +- extent); +- if (found_type == BTRFS_FILE_EXTENT_REG || +- found_type == BTRFS_FILE_EXTENT_PREALLOC) { +- extent_end = +- btrfs_file_extent_disk_bytenr(leaf, +- extent); +- if (extent_end) +- *hint_byte = extent_end; +- +- extent_end = key.offset + +- btrfs_file_extent_num_bytes(leaf, extent); +- ram_bytes = btrfs_file_extent_ram_bytes(leaf, +- extent); +- found_extent = 1; +- } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { +- found_inline = 1; +- extent_end = key.offset + +- btrfs_file_extent_inline_len(leaf, extent); ++ if (path->slots[0] >= btrfs_header_nritems(leaf)) { ++ BUG_ON(del_nr > 0); ++ ret = btrfs_next_leaf(root, path); ++ if (ret < 0) ++ break; ++ if (ret > 0) { ++ ret = 0; ++ break; + } ++ leaf = path->nodes[0]; ++ recow = 1; ++ } ++ ++ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); ++ if (key.objectid > inode->i_ino || ++ key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= end) ++ break; ++ ++ fi = btrfs_item_ptr(leaf, path->slots[0], ++ struct btrfs_file_extent_item); ++ extent_type = btrfs_file_extent_type(leaf, fi); ++ ++ if (extent_type == BTRFS_FILE_EXTENT_REG || ++ extent_type == BTRFS_FILE_EXTENT_PREALLOC) { ++ disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); ++ num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); ++ extent_offset = btrfs_file_extent_offset(leaf, fi); ++ extent_end = key.offset + ++ btrfs_file_extent_num_bytes(leaf, fi); ++ } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { ++ extent_end = key.offset + ++ btrfs_file_extent_inline_len(leaf, fi); + } else { ++ WARN_ON(1); + extent_end = search_start; + } + +- /* we found nothing we can drop */ +- if ((!found_extent && !found_inline) || +- search_start >= extent_end) { +- int nextret; +- u32 nritems; +- nritems = btrfs_header_nritems(leaf); +- if (slot >= nritems - 1) { +- nextret = btrfs_next_leaf(root, path); +- if (nextret) +- goto out; +- recow = 1; +- } else { +- path->slots[0]++; +- } ++ if (extent_end <= search_start) { ++ path->slots[0]++; + goto next_slot; + } + +- if (end <= extent_end && start >= key.offset && found_inline) +- *hint_byte = EXTENT_MAP_INLINE; ++ search_start = max(key.offset, start); ++ if (recow) { ++ btrfs_release_path(root, path); ++ continue; ++ } + +- if (found_extent) { +- read_extent_buffer(leaf, &old, (unsigned long)extent, +- sizeof(old)); +- } +- +- if (end < extent_end && end >= key.offset) { +- bookend = 1; +- if (found_inline && start <= key.offset) +- keep = 1; +- } +- +- if (bookend && found_extent) { +- if (locked_end < extent_end) { +- ret = try_lock_extent(&BTRFS_I(inode)->io_tree, +- locked_end, extent_end - 1, +- GFP_NOFS); +- if (!ret) { +- btrfs_release_path(root, path); +- lock_extent(&BTRFS_I(inode)->io_tree, +- locked_end, extent_end - 1, +- GFP_NOFS); +- locked_end = 
extent_end; +- continue; +- } +- locked_end = extent_end; ++ /* ++ * | - range to drop - | ++ * | -------- extent -------- | ++ */ ++ if (start > key.offset && end < extent_end) { ++ BUG_ON(del_nr > 0); ++ BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE); ++ ++ memcpy(&new_key, &key, sizeof(new_key)); ++ new_key.offset = start; ++ ret = btrfs_duplicate_item(trans, root, path, ++ &new_key); ++ if (ret == -EAGAIN) { ++ btrfs_release_path(root, path); ++ continue; + } +- disk_bytenr = le64_to_cpu(old.disk_bytenr); +- if (disk_bytenr != 0) { ++ if (ret < 0) ++ break; ++ ++ leaf = path->nodes[0]; ++ fi = btrfs_item_ptr(leaf, path->slots[0] - 1, ++ struct btrfs_file_extent_item); ++ btrfs_set_file_extent_num_bytes(leaf, fi, ++ start - key.offset); ++ ++ fi = btrfs_item_ptr(leaf, path->slots[0], ++ struct btrfs_file_extent_item); ++ ++ extent_offset += start - key.offset; ++ btrfs_set_file_extent_offset(leaf, fi, extent_offset); ++ btrfs_set_file_extent_num_bytes(leaf, fi, ++ extent_end - start); ++ btrfs_mark_buffer_dirty(leaf); ++ ++ if (disk_bytenr > 0) { + ret = btrfs_inc_extent_ref(trans, root, +- disk_bytenr, +- le64_to_cpu(old.disk_num_bytes), 0, +- root->root_key.objectid, +- key.objectid, key.offset - +- le64_to_cpu(old.offset)); ++ disk_bytenr, num_bytes, 0, ++ root->root_key.objectid, ++ new_key.objectid, ++ start - extent_offset); + BUG_ON(ret); ++ *hint_byte = disk_bytenr; + } ++ key.offset = start; + } ++ /* ++ * | ---- range to drop ----- | ++ * | -------- extent -------- | ++ */ ++ if (start <= key.offset && end < extent_end) { ++ BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE); ++ ++ memcpy(&new_key, &key, sizeof(new_key)); ++ new_key.offset = end; ++ btrfs_set_item_key_safe(trans, root, path, &new_key); + +- if (found_inline) { +- u64 mask = root->sectorsize - 1; +- search_start = (extent_end + mask) & ~mask; +- } else +- search_start = extent_end; +- +- /* truncate existing extent */ +- if (start > key.offset) { +- u64 new_num; +- u64 old_num; +- keep = 1; +- WARN_ON(start & (root->sectorsize - 1)); +- if (found_extent) { +- new_num = start - key.offset; +- old_num = btrfs_file_extent_num_bytes(leaf, +- extent); +- *hint_byte = +- btrfs_file_extent_disk_bytenr(leaf, +- extent); +- if (btrfs_file_extent_disk_bytenr(leaf, +- extent)) { +- inode_sub_bytes(inode, old_num - +- new_num); +- } +- btrfs_set_file_extent_num_bytes(leaf, +- extent, new_num); +- btrfs_mark_buffer_dirty(leaf); +- } else if (key.offset < inline_limit && +- (end > extent_end) && +- (inline_limit < extent_end)) { +- u32 new_size; +- new_size = btrfs_file_extent_calc_inline_size( +- inline_limit - key.offset); +- inode_sub_bytes(inode, extent_end - +- inline_limit); +- btrfs_set_file_extent_ram_bytes(leaf, extent, +- new_size); +- if (!compression && !encryption) { +- btrfs_truncate_item(trans, root, path, +- new_size, 1); +- } ++ extent_offset += end - key.offset; ++ btrfs_set_file_extent_offset(leaf, fi, extent_offset); ++ btrfs_set_file_extent_num_bytes(leaf, fi, ++ extent_end - end); ++ btrfs_mark_buffer_dirty(leaf); ++ if (disk_bytenr > 0) { ++ inode_sub_bytes(inode, end - key.offset); ++ *hint_byte = disk_bytenr; + } ++ break; + } +- /* delete the entire extent */ +- if (!keep) { +- if (found_inline) +- inode_sub_bytes(inode, extent_end - +- key.offset); +- ret = btrfs_del_item(trans, root, path); +- /* TODO update progress marker and return */ +- BUG_ON(ret); +- extent = NULL; +- btrfs_release_path(root, path); +- /* the extent will be freed later */ +- } +- if (bookend && found_inline && start <= 
key.offset) { +- u32 new_size; +- new_size = btrfs_file_extent_calc_inline_size( +- extent_end - end); +- inode_sub_bytes(inode, end - key.offset); +- btrfs_set_file_extent_ram_bytes(leaf, extent, +- new_size); +- if (!compression && !encryption) +- ret = btrfs_truncate_item(trans, root, path, +- new_size, 0); +- BUG_ON(ret); +- } +- /* create bookend, splitting the extent in two */ +- if (bookend && found_extent) { +- struct btrfs_key ins; +- ins.objectid = inode->i_ino; +- ins.offset = end; +- btrfs_set_key_type(&ins, BTRFS_EXTENT_DATA_KEY); +- +- btrfs_release_path(root, path); +- path->leave_spinning = 1; +- ret = btrfs_insert_empty_item(trans, root, path, &ins, +- sizeof(*extent)); +- BUG_ON(ret); + +- leaf = path->nodes[0]; +- extent = btrfs_item_ptr(leaf, path->slots[0], +- struct btrfs_file_extent_item); +- write_extent_buffer(leaf, &old, +- (unsigned long)extent, sizeof(old)); +- +- btrfs_set_file_extent_compression(leaf, extent, +- compression); +- btrfs_set_file_extent_encryption(leaf, extent, +- encryption); +- btrfs_set_file_extent_other_encoding(leaf, extent, +- other_encoding); +- btrfs_set_file_extent_offset(leaf, extent, +- le64_to_cpu(old.offset) + end - key.offset); +- WARN_ON(le64_to_cpu(old.num_bytes) < +- (extent_end - end)); +- btrfs_set_file_extent_num_bytes(leaf, extent, +- extent_end - end); ++ search_start = extent_end; ++ /* ++ * | ---- range to drop ----- | ++ * | -------- extent -------- | ++ */ ++ if (start > key.offset && end >= extent_end) { ++ BUG_ON(del_nr > 0); ++ BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE); + +- /* +- * set the ram bytes to the size of the full extent +- * before splitting. This is a worst case flag, +- * but its the best we can do because we don't know +- * how splitting affects compression +- */ +- btrfs_set_file_extent_ram_bytes(leaf, extent, +- ram_bytes); +- btrfs_set_file_extent_type(leaf, extent, found_type); +- +- btrfs_unlock_up_safe(path, 1); +- btrfs_mark_buffer_dirty(path->nodes[0]); +- btrfs_set_lock_blocking(path->nodes[0]); ++ btrfs_set_file_extent_num_bytes(leaf, fi, ++ start - key.offset); ++ btrfs_mark_buffer_dirty(leaf); ++ if (disk_bytenr > 0) { ++ inode_sub_bytes(inode, extent_end - start); ++ *hint_byte = disk_bytenr; ++ } ++ if (end == extent_end) ++ break; + +- path->leave_spinning = 0; +- btrfs_release_path(root, path); +- if (disk_bytenr != 0) +- inode_add_bytes(inode, extent_end - end); ++ path->slots[0]++; ++ goto next_slot; + } + +- if (found_extent && !keep) { +- u64 old_disk_bytenr = le64_to_cpu(old.disk_bytenr); ++ /* ++ * | ---- range to drop ----- | ++ * | ------ extent ------ | ++ */ ++ if (start <= key.offset && end >= extent_end) { ++ if (del_nr == 0) { ++ del_slot = path->slots[0]; ++ del_nr = 1; ++ } else { ++ BUG_ON(del_slot + del_nr != path->slots[0]); ++ del_nr++; ++ } + +- if (old_disk_bytenr != 0) { ++ if (extent_type == BTRFS_FILE_EXTENT_INLINE) { + inode_sub_bytes(inode, +- le64_to_cpu(old.num_bytes)); ++ extent_end - key.offset); ++ extent_end = ALIGN(extent_end, ++ root->sectorsize); ++ } else if (disk_bytenr > 0) { + ret = btrfs_free_extent(trans, root, +- old_disk_bytenr, +- le64_to_cpu(old.disk_num_bytes), +- 0, root->root_key.objectid, ++ disk_bytenr, num_bytes, 0, ++ root->root_key.objectid, + key.objectid, key.offset - +- le64_to_cpu(old.offset)); ++ extent_offset); + BUG_ON(ret); +- *hint_byte = old_disk_bytenr; ++ inode_sub_bytes(inode, ++ extent_end - key.offset); ++ *hint_byte = disk_bytenr; + } +- } + +- if (search_start >= end) { +- ret = 0; +- goto out; ++ if (end == 
extent_end) ++ break; ++ ++ if (path->slots[0] + 1 < btrfs_header_nritems(leaf)) { ++ path->slots[0]++; ++ goto next_slot; ++ } ++ ++ ret = btrfs_del_items(trans, root, path, del_slot, ++ del_nr); ++ BUG_ON(ret); ++ ++ del_nr = 0; ++ del_slot = 0; ++ ++ btrfs_release_path(root, path); ++ continue; + } ++ ++ BUG_ON(1); + } +-out: +- btrfs_free_path(path); +- if (locked_end > orig_locked_end) { +- unlock_extent(&BTRFS_I(inode)->io_tree, orig_locked_end, +- locked_end - 1, GFP_NOFS); ++ ++ if (del_nr > 0) { ++ ret = btrfs_del_items(trans, root, path, del_slot, del_nr); ++ BUG_ON(ret); + } ++ ++ btrfs_free_path(path); + return ret; + } + +@@ -620,23 +548,23 @@ static int extent_mergeable(struct exten + * two or three. + */ + int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, +- struct btrfs_root *root, + struct inode *inode, u64 start, u64 end) + { ++ struct btrfs_root *root = BTRFS_I(inode)->root; + struct extent_buffer *leaf; + struct btrfs_path *path; + struct btrfs_file_extent_item *fi; + struct btrfs_key key; ++ struct btrfs_key new_key; + u64 bytenr; + u64 num_bytes; + u64 extent_end; + u64 orig_offset; + u64 other_start; + u64 other_end; +- u64 split = start; +- u64 locked_end = end; +- int extent_type; +- int split_end = 1; ++ u64 split; ++ int del_nr = 0; ++ int del_slot = 0; + int ret; + + btrfs_drop_extent_cache(inode, start, end - 1, 0); +@@ -644,12 +572,10 @@ int btrfs_mark_extent_written(struct btr + path = btrfs_alloc_path(); + BUG_ON(!path); + again: ++ split = start; + key.objectid = inode->i_ino; + key.type = BTRFS_EXTENT_DATA_KEY; +- if (split == start) +- key.offset = split; +- else +- key.offset = split - 1; ++ key.offset = split; + + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret > 0 && path->slots[0] > 0) +@@ -661,8 +587,8 @@ again: + key.type != BTRFS_EXTENT_DATA_KEY); + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); +- extent_type = btrfs_file_extent_type(leaf, fi); +- BUG_ON(extent_type != BTRFS_FILE_EXTENT_PREALLOC); ++ BUG_ON(btrfs_file_extent_type(leaf, fi) != ++ BTRFS_FILE_EXTENT_PREALLOC); + extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi); + BUG_ON(key.offset > start || extent_end < end); + +@@ -670,150 +596,91 @@ again: + num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); + orig_offset = key.offset - btrfs_file_extent_offset(leaf, fi); + +- if (key.offset == start) +- split = end; +- +- if (key.offset == start && extent_end == end) { +- int del_nr = 0; +- int del_slot = 0; +- other_start = end; +- other_end = 0; +- if (extent_mergeable(leaf, path->slots[0] + 1, inode->i_ino, +- bytenr, &other_start, &other_end)) { +- extent_end = other_end; +- del_slot = path->slots[0] + 1; +- del_nr++; +- ret = btrfs_free_extent(trans, root, bytenr, num_bytes, +- 0, root->root_key.objectid, +- inode->i_ino, orig_offset); +- BUG_ON(ret); +- } +- other_start = 0; +- other_end = start; +- if (extent_mergeable(leaf, path->slots[0] - 1, inode->i_ino, +- bytenr, &other_start, &other_end)) { +- key.offset = other_start; +- del_slot = path->slots[0]; +- del_nr++; +- ret = btrfs_free_extent(trans, root, bytenr, num_bytes, +- 0, root->root_key.objectid, +- inode->i_ino, orig_offset); +- BUG_ON(ret); +- } +- split_end = 0; +- if (del_nr == 0) { +- btrfs_set_file_extent_type(leaf, fi, +- BTRFS_FILE_EXTENT_REG); +- goto done; ++ while (start > key.offset || end < extent_end) { ++ if (key.offset == start) ++ split = end; ++ ++ memcpy(&new_key, &key, sizeof(new_key)); ++ new_key.offset = split; ++ ret = 
btrfs_duplicate_item(trans, root, path, &new_key); ++ if (ret == -EAGAIN) { ++ btrfs_release_path(root, path); ++ goto again; + } ++ BUG_ON(ret < 0); + +- fi = btrfs_item_ptr(leaf, del_slot - 1, ++ leaf = path->nodes[0]; ++ fi = btrfs_item_ptr(leaf, path->slots[0] - 1, + struct btrfs_file_extent_item); +- btrfs_set_file_extent_type(leaf, fi, BTRFS_FILE_EXTENT_REG); + btrfs_set_file_extent_num_bytes(leaf, fi, +- extent_end - key.offset); ++ split - key.offset); ++ ++ fi = btrfs_item_ptr(leaf, path->slots[0], ++ struct btrfs_file_extent_item); ++ ++ btrfs_set_file_extent_offset(leaf, fi, split - orig_offset); ++ btrfs_set_file_extent_num_bytes(leaf, fi, ++ extent_end - split); + btrfs_mark_buffer_dirty(leaf); + +- ret = btrfs_del_items(trans, root, path, del_slot, del_nr); ++ ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0, ++ root->root_key.objectid, ++ inode->i_ino, orig_offset); + BUG_ON(ret); +- goto release; +- } else if (split == start) { +- if (locked_end < extent_end) { +- ret = try_lock_extent(&BTRFS_I(inode)->io_tree, +- locked_end, extent_end - 1, GFP_NOFS); +- if (!ret) { +- btrfs_release_path(root, path); +- lock_extent(&BTRFS_I(inode)->io_tree, +- locked_end, extent_end - 1, GFP_NOFS); +- locked_end = extent_end; +- goto again; +- } +- locked_end = extent_end; +- } +- btrfs_set_file_extent_num_bytes(leaf, fi, split - key.offset); +- } else { +- BUG_ON(key.offset != start); +- key.offset = split; +- btrfs_set_file_extent_offset(leaf, fi, key.offset - +- orig_offset); +- btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - split); +- btrfs_set_item_key_safe(trans, root, path, &key); +- extent_end = split; +- } + +- if (extent_end == end) { +- split_end = 0; +- extent_type = BTRFS_FILE_EXTENT_REG; +- } +- if (extent_end == end && split == start) { +- other_start = end; +- other_end = 0; +- if (extent_mergeable(leaf, path->slots[0] + 1, inode->i_ino, +- bytenr, &other_start, &other_end)) { +- path->slots[0]++; +- fi = btrfs_item_ptr(leaf, path->slots[0], +- struct btrfs_file_extent_item); +- key.offset = split; +- btrfs_set_item_key_safe(trans, root, path, &key); +- btrfs_set_file_extent_offset(leaf, fi, key.offset - +- orig_offset); +- btrfs_set_file_extent_num_bytes(leaf, fi, +- other_end - split); +- goto done; +- } +- } +- if (extent_end == end && split == end) { +- other_start = 0; +- other_end = start; +- if (extent_mergeable(leaf, path->slots[0] - 1 , inode->i_ino, +- bytenr, &other_start, &other_end)) { ++ if (split == start) { ++ key.offset = start; ++ } else { ++ BUG_ON(start != key.offset); + path->slots[0]--; +- fi = btrfs_item_ptr(leaf, path->slots[0], +- struct btrfs_file_extent_item); +- btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - +- other_start); +- goto done; ++ extent_end = end; + } + } + +- btrfs_mark_buffer_dirty(leaf); +- +- ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0, +- root->root_key.objectid, +- inode->i_ino, orig_offset); +- BUG_ON(ret); +- btrfs_release_path(root, path); +- +- key.offset = start; +- ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*fi)); +- BUG_ON(ret); +- +- leaf = path->nodes[0]; + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); +- btrfs_set_file_extent_generation(leaf, fi, trans->transid); +- btrfs_set_file_extent_type(leaf, fi, extent_type); +- btrfs_set_file_extent_disk_bytenr(leaf, fi, bytenr); +- btrfs_set_file_extent_disk_num_bytes(leaf, fi, num_bytes); +- btrfs_set_file_extent_offset(leaf, fi, key.offset - orig_offset); +- 
btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - key.offset); +- btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes); +- btrfs_set_file_extent_compression(leaf, fi, 0); +- btrfs_set_file_extent_encryption(leaf, fi, 0); +- btrfs_set_file_extent_other_encoding(leaf, fi, 0); +-done: +- btrfs_mark_buffer_dirty(leaf); + +-release: +- btrfs_release_path(root, path); +- if (split_end && split == start) { +- split = end; +- goto again; ++ other_start = end; ++ other_end = 0; ++ if (extent_mergeable(leaf, path->slots[0] + 1, inode->i_ino, ++ bytenr, &other_start, &other_end)) { ++ extent_end = other_end; ++ del_slot = path->slots[0] + 1; ++ del_nr++; ++ ret = btrfs_free_extent(trans, root, bytenr, num_bytes, ++ 0, root->root_key.objectid, ++ inode->i_ino, orig_offset); ++ BUG_ON(ret); ++ } ++ other_start = 0; ++ other_end = start; ++ if (extent_mergeable(leaf, path->slots[0] - 1, inode->i_ino, ++ bytenr, &other_start, &other_end)) { ++ key.offset = other_start; ++ del_slot = path->slots[0]; ++ del_nr++; ++ ret = btrfs_free_extent(trans, root, bytenr, num_bytes, ++ 0, root->root_key.objectid, ++ inode->i_ino, orig_offset); ++ BUG_ON(ret); + } +- if (locked_end > end) { +- unlock_extent(&BTRFS_I(inode)->io_tree, end, locked_end - 1, +- GFP_NOFS); ++ if (del_nr == 0) { ++ btrfs_set_file_extent_type(leaf, fi, ++ BTRFS_FILE_EXTENT_REG); ++ btrfs_mark_buffer_dirty(leaf); ++ goto out; + } ++ ++ fi = btrfs_item_ptr(leaf, del_slot - 1, ++ struct btrfs_file_extent_item); ++ btrfs_set_file_extent_type(leaf, fi, BTRFS_FILE_EXTENT_REG); ++ btrfs_set_file_extent_num_bytes(leaf, fi, ++ extent_end - key.offset); ++ btrfs_mark_buffer_dirty(leaf); ++ ++ ret = btrfs_del_items(trans, root, path, del_slot, del_nr); ++ BUG_ON(ret); ++out: + btrfs_free_path(path); + return 0; + } +--- a/fs/btrfs/inode.c ++++ b/fs/btrfs/inode.c +@@ -230,8 +230,7 @@ static noinline int cow_file_range_inlin + return 1; + } + +- ret = btrfs_drop_extents(trans, root, inode, start, +- aligned_end, aligned_end, start, ++ ret = btrfs_drop_extents(trans, inode, start, aligned_end, + &hint_byte, 1); + BUG_ON(ret); + +@@ -1596,7 +1595,6 @@ static int insert_reserved_file_extent(s + struct inode *inode, u64 file_pos, + u64 disk_bytenr, u64 disk_num_bytes, + u64 num_bytes, u64 ram_bytes, +- u64 locked_end, + u8 compression, u8 encryption, + u16 other_encoding, int extent_type) + { +@@ -1622,9 +1620,8 @@ static int insert_reserved_file_extent(s + * the caller is expected to unpin it and allow it to be merged + * with the others. 
+ */ +- ret = btrfs_drop_extents(trans, root, inode, file_pos, +- file_pos + num_bytes, locked_end, +- file_pos, &hint, 0); ++ ret = btrfs_drop_extents(trans, inode, file_pos, file_pos + num_bytes, ++ &hint, 0); + BUG_ON(ret); + + ins.objectid = inode->i_ino; +@@ -1746,7 +1743,7 @@ static int btrfs_finish_ordered_io(struc + compressed = 1; + if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { + BUG_ON(compressed); +- ret = btrfs_mark_extent_written(trans, root, inode, ++ ret = btrfs_mark_extent_written(trans, inode, + ordered_extent->file_offset, + ordered_extent->file_offset + + ordered_extent->len); +@@ -1758,8 +1755,6 @@ static int btrfs_finish_ordered_io(struc + ordered_extent->disk_len, + ordered_extent->len, + ordered_extent->len, +- ordered_extent->file_offset + +- ordered_extent->len, + compressed, 0, 0, + BTRFS_FILE_EXTENT_REG); + unpin_extent_cache(&BTRFS_I(inode)->extent_tree, +@@ -3209,11 +3204,9 @@ int btrfs_cont_expand(struct inode *inod + if (test_bit(EXTENT_FLAG_VACANCY, &em->flags)) { + u64 hint_byte = 0; + hole_size = last_byte - cur_offset; +- err = btrfs_drop_extents(trans, root, inode, +- cur_offset, ++ err = btrfs_drop_extents(trans, inode, cur_offset, + cur_offset + hole_size, +- block_end, +- cur_offset, &hint_byte, 1); ++ &hint_byte, 1); + if (err) + break; + +@@ -5643,7 +5636,7 @@ out_fail: + + static int prealloc_file_range(struct btrfs_trans_handle *trans, + struct inode *inode, u64 start, u64 end, +- u64 locked_end, u64 alloc_hint, int mode) ++ u64 alloc_hint, int mode) + { + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_key ins; +@@ -5669,8 +5662,7 @@ static int prealloc_file_range(struct bt + ret = insert_reserved_file_extent(trans, inode, + cur_offset, ins.objectid, + ins.offset, ins.offset, +- ins.offset, locked_end, +- 0, 0, 0, ++ ins.offset, 0, 0, 0, + BTRFS_FILE_EXTENT_PREALLOC); + BUG_ON(ret); + btrfs_drop_extent_cache(inode, cur_offset, +@@ -5779,8 +5771,7 @@ static long btrfs_fallocate(struct inode + last_byte = (last_byte + mask) & ~mask; + if (em->block_start == EXTENT_MAP_HOLE) { + ret = prealloc_file_range(trans, inode, cur_offset, +- last_byte, locked_end + 1, +- alloc_hint, mode); ++ last_byte, alloc_hint, mode); + if (ret < 0) { + free_extent_map(em); + break; +--- a/fs/btrfs/ioctl.c ++++ b/fs/btrfs/ioctl.c +@@ -1032,8 +1032,7 @@ static noinline long btrfs_ioctl_clone(s + BUG_ON(!trans); + + /* punch hole in destination first */ +- btrfs_drop_extents(trans, root, inode, off, off + len, +- off + len, 0, &hint_byte, 1); ++ btrfs_drop_extents(trans, inode, off, off + len, &hint_byte, 1); + + /* clone data */ + key.objectid = src->i_ino; +--- a/fs/btrfs/tree-log.c ++++ b/fs/btrfs/tree-log.c +@@ -542,8 +542,8 @@ static noinline int replay_one_extent(st + + saved_nbytes = inode_get_bytes(inode); + /* drop any overlapping extents */ +- ret = btrfs_drop_extents(trans, root, inode, +- start, extent_end, extent_end, start, &alloc_hint, 1); ++ ret = btrfs_drop_extents(trans, inode, start, extent_end, ++ &alloc_hint, 1); + BUG_ON(ret); + + if (found_type == BTRFS_FILE_EXTENT_REG || diff --git a/queue-2.6.32/btrfs-run-orphan-cleanup-on-default-fs-root.patch b/queue-2.6.32/btrfs-run-orphan-cleanup-on-default-fs-root.patch new file mode 100644 index 00000000000..f9d7ca2120b --- /dev/null +++ b/queue-2.6.32/btrfs-run-orphan-cleanup-on-default-fs-root.patch @@ -0,0 +1,61 @@ +From e3acc2a6850efff647f1c5458524eb3a8bcba20a Mon Sep 17 00:00:00 2001 +From: Josef Bacik +Date: Tue, 26 Jan 2010 14:30:53 +0000 +Subject: Btrfs: run orphan 
cleanup on default fs root + +From: Josef Bacik + +commit e3acc2a6850efff647f1c5458524eb3a8bcba20a upstream. + +This patch revert's commit + +6c090a11e1c403b727a6a8eff0b97d5fb9e95cb5 + +Since it introduces this problem where we can run orphan cleanup on a +volume that can have orphan entries re-added. Instead of my original +fix, Yan Zheng pointed out that we can just revert my original fix and +then run the orphan cleanup in open_ctree after we look up the fs_root. +I have tested this with all the tests that gave me problems and this +patch fixes both problems. Thanks, + +Signed-off-by: Josef Bacik +Signed-off-by: Chris Mason +Acked-by: Jeff Mahoney +Signed-off-by: Greg Kroah-Hartman + + +--- + fs/btrfs/disk-io.c | 6 ++++++ + fs/btrfs/inode.c | 6 ------ + 2 files changed, 6 insertions(+), 6 deletions(-) + +--- a/fs/btrfs/disk-io.c ++++ b/fs/btrfs/disk-io.c +@@ -1993,6 +1993,12 @@ struct btrfs_root *open_ctree(struct sup + if (!fs_info->fs_root) + goto fail_trans_kthread; + ++ if (!(sb->s_flags & MS_RDONLY)) { ++ down_read(&fs_info->cleanup_work_sem); ++ btrfs_orphan_cleanup(fs_info->fs_root); ++ up_read(&fs_info->cleanup_work_sem); ++ } ++ + return tree_root; + + fail_trans_kthread: +--- a/fs/btrfs/inode.c ++++ b/fs/btrfs/inode.c +@@ -3796,12 +3796,6 @@ struct inode *btrfs_lookup_dentry(struct + + if (location.type == BTRFS_INODE_ITEM_KEY) { + inode = btrfs_iget(dir->i_sb, &location, root); +- if (unlikely(root->clean_orphans) && +- !(inode->i_sb->s_flags & MS_RDONLY)) { +- down_read(&root->fs_info->cleanup_work_sem); +- btrfs_orphan_cleanup(root); +- up_read(&root->fs_info->cleanup_work_sem); +- } + return inode; + } + diff --git a/queue-2.6.32/btrfs-show-discard-option-in-proc-mounts.patch b/queue-2.6.32/btrfs-show-discard-option-in-proc-mounts.patch new file mode 100644 index 00000000000..9299cc2c072 --- /dev/null +++ b/queue-2.6.32/btrfs-show-discard-option-in-proc-mounts.patch @@ -0,0 +1,34 @@ +From 20a5239a5d0f340e29827a6a2d28a138001c44b8 Mon Sep 17 00:00:00 2001 +From: Matthew Wilcox +Date: Mon, 14 Dec 2009 22:01:12 +0000 +Subject: Btrfs: Show discard option in /proc/mounts + +From: Matthew Wilcox + +commit 20a5239a5d0f340e29827a6a2d28a138001c44b8 upstream. + +Christoph's patch e244a0aeb6a599c19a7c802cda6e2d67c847b154 doesn't display +the discard option in /proc/mounts, leading to some confusion for me. +Here's the missing bit. 
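[ For illustration: the shape of a ->show_options handler, which the two-line fix below extends. Each mount flag gets a test-and-print pair, and a missing pair is exactly how an option fails to appear in /proc/mounts. "foo_fs_info" and its field are hypothetical; seq_puts() and s_fs_info are real kernel API: ]

	#include <linux/fs.h>
	#include <linux/mount.h>
	#include <linux/seq_file.h>

	struct foo_fs_info {
		int discard_enabled;	/* hypothetical per-sb option flag */
	};

	static int foo_show_options(struct seq_file *seq, struct vfsmount *vfs)
	{
		struct foo_fs_info *info = vfs->mnt_sb->s_fs_info;

		/* one test-and-print pair per mount option */
		if (info->discard_enabled)
			seq_puts(seq, ",discard");
		return 0;
	}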
+ +Signed-off-by: Matthew Wilcox +Signed-off-by: Chris Mason +Acked-by: Jeff Mahoney +Signed-off-by: Greg Kroah-Hartman + + +--- + fs/btrfs/super.c | 2 ++ + 1 file changed, 2 insertions(+) + +--- a/fs/btrfs/super.c ++++ b/fs/btrfs/super.c +@@ -457,6 +457,8 @@ static int btrfs_show_options(struct seq + seq_puts(seq, ",notreelog"); + if (btrfs_test_opt(root, FLUSHONCOMMIT)) + seq_puts(seq, ",flushoncommit"); ++ if (btrfs_test_opt(root, DISCARD)) ++ seq_puts(seq, ",discard"); + if (!(root->fs_info->sb->s_flags & MS_POSIXACL)) + seq_puts(seq, ",noacl"); + return 0; diff --git a/queue-2.6.32/btrfs-use-correct-values-when-updating-inode-i_size-on-fallocate.patch b/queue-2.6.32/btrfs-use-correct-values-when-updating-inode-i_size-on-fallocate.patch new file mode 100644 index 00000000000..81cfe445740 --- /dev/null +++ b/queue-2.6.32/btrfs-use-correct-values-when-updating-inode-i_size-on-fallocate.patch @@ -0,0 +1,71 @@ +From d1ea6a61454e7d7ff0873d0ad1ae27d5807da0d3 Mon Sep 17 00:00:00 2001 +From: Aneesh Kumar K.V +Date: Wed, 20 Jan 2010 07:28:54 +0000 +Subject: Btrfs: Use correct values when updating inode i_size on fallocate + +From: Aneesh Kumar K.V + +commit d1ea6a61454e7d7ff0873d0ad1ae27d5807da0d3 upstream. + +commit f2bc9dd07e3424c4ec5f3949961fe053d47bc825 +Author: Aneesh Kumar K.V +Date: Wed Jan 20 12:57:53 2010 +0530 + + Btrfs: Use correct values when updating inode i_size on fallocate + + Even though we allocate more, we should be updating inode i_size + as per the arguments passed + + Signed-off-by: Aneesh Kumar K.V + +Signed-off-by: Chris Mason +Acked-by: Jeff Mahoney +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/inode.c | 13 +++++++++---- + 1 file changed, 9 insertions(+), 4 deletions(-) + +--- a/fs/btrfs/inode.c ++++ b/fs/btrfs/inode.c +@@ -5799,7 +5799,7 @@ out_fail: + } + + static int prealloc_file_range(struct inode *inode, u64 start, u64 end, +- u64 alloc_hint, int mode) ++ u64 alloc_hint, int mode, loff_t actual_len) + { + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(inode)->root; +@@ -5808,6 +5808,7 @@ static int prealloc_file_range(struct in + u64 cur_offset = start; + u64 num_bytes = end - start; + int ret = 0; ++ u64 i_size; + + while (num_bytes > 0) { + alloc_size = min(num_bytes, root->fs_info->max_extent); +@@ -5846,8 +5847,12 @@ static int prealloc_file_range(struct in + BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC; + if (!(mode & FALLOC_FL_KEEP_SIZE) && + cur_offset > inode->i_size) { +- i_size_write(inode, cur_offset); +- btrfs_ordered_update_i_size(inode, cur_offset, NULL); ++ if (cur_offset > actual_len) ++ i_size = actual_len; ++ else ++ i_size = cur_offset; ++ i_size_write(inode, i_size); ++ btrfs_ordered_update_i_size(inode, i_size, NULL); + } + + ret = btrfs_update_inode(trans, root, inode); +@@ -5940,7 +5945,7 @@ static long btrfs_fallocate(struct inode + !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { + ret = prealloc_file_range(inode, + cur_offset, last_byte, +- alloc_hint, mode); ++ alloc_hint, mode, offset+len); + if (ret < 0) { + free_extent_map(em); + break; diff --git a/queue-2.6.32/crypto-testmgr-fix-complain-about-lack-test-for-internal-used-algorithm.patch b/queue-2.6.32/crypto-testmgr-fix-complain-about-lack-test-for-internal-used-algorithm.patch new file mode 100644 index 00000000000..69378697ff0 --- /dev/null +++ b/queue-2.6.32/crypto-testmgr-fix-complain-about-lack-test-for-internal-used-algorithm.patch @@ -0,0 +1,142 @@ +From 863b557a88f8c033f7419fabafef4712a5055f85 Mon Sep 17 00:00:00 2001 +From: Youquan, Song +Date: 
Wed, 23 Dec 2009 19:45:20 +0800 +Subject: crypto: testmgr - Fix complain about lack test for internal used algorithm + +From: Youquan, Song + +commit 863b557a88f8c033f7419fabafef4712a5055f85 upstream. + +When load aesni-intel and ghash_clmulni-intel driver,kernel will complain no + test for some internal used algorithm. +The strange information as following: + +alg: No test for __aes-aesni (__driver-aes-aesni) +alg: No test for __ecb-aes-aesni (__driver-ecb-aes-aesni) +alg: No test for __cbc-aes-aesni (__driver-cbc-aes-aesni) +alg: No test for __ecb-aes-aesni (cryptd(__driver-ecb-aes-aesni) +alg: No test for __ghash (__ghash-pclmulqdqni) +alg: No test for __ghash (cryptd(__ghash-pclmulqdqni)) + +This patch add NULL test entries for these algorithm and driver. + +Signed-off-by: Youquan, Song +Signed-off-by: Ying, Huang +Signed-off-by: Herbert Xu +Acked-by: Jiri Kosina +Signed-off-by: Greg Kroah-Hartman + +--- + crypto/testmgr.c | 84 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ + 1 file changed, 84 insertions(+) + +--- a/crypto/testmgr.c ++++ b/crypto/testmgr.c +@@ -1477,9 +1477,54 @@ static int alg_test_cprng(const struct a + return err; + } + ++static int alg_test_null(const struct alg_test_desc *desc, ++ const char *driver, u32 type, u32 mask) ++{ ++ return 0; ++} ++ + /* Please keep this list sorted by algorithm name. */ + static const struct alg_test_desc alg_test_descs[] = { + { ++ .alg = "__driver-cbc-aes-aesni", ++ .test = alg_test_null, ++ .suite = { ++ .cipher = { ++ .enc = { ++ .vecs = NULL, ++ .count = 0 ++ }, ++ .dec = { ++ .vecs = NULL, ++ .count = 0 ++ } ++ } ++ } ++ }, { ++ .alg = "__driver-ecb-aes-aesni", ++ .test = alg_test_null, ++ .suite = { ++ .cipher = { ++ .enc = { ++ .vecs = NULL, ++ .count = 0 ++ }, ++ .dec = { ++ .vecs = NULL, ++ .count = 0 ++ } ++ } ++ } ++ }, { ++ .alg = "__ghash-pclmulqdqni", ++ .test = alg_test_null, ++ .suite = { ++ .hash = { ++ .vecs = NULL, ++ .count = 0 ++ } ++ } ++ }, { + .alg = "ansi_cprng", + .test = alg_test_cprng, + .fips_allowed = 1, +@@ -1623,6 +1668,30 @@ static const struct alg_test_desc alg_te + } + } + }, { ++ .alg = "cryptd(__driver-ecb-aes-aesni)", ++ .test = alg_test_null, ++ .suite = { ++ .cipher = { ++ .enc = { ++ .vecs = NULL, ++ .count = 0 ++ }, ++ .dec = { ++ .vecs = NULL, ++ .count = 0 ++ } ++ } ++ } ++ }, { ++ .alg = "cryptd(__ghash-pclmulqdqni)", ++ .test = alg_test_null, ++ .suite = { ++ .hash = { ++ .vecs = NULL, ++ .count = 0 ++ } ++ } ++ }, { + .alg = "ctr(aes)", + .test = alg_test_skcipher, + .fips_allowed = 1, +@@ -1668,6 +1737,21 @@ static const struct alg_test_desc alg_te + } + } + } ++ }, { ++ .alg = "ecb(__aes-aesni)", ++ .test = alg_test_null, ++ .suite = { ++ .cipher = { ++ .enc = { ++ .vecs = NULL, ++ .count = 0 ++ }, ++ .dec = { ++ .vecs = NULL, ++ .count = 0 ++ } ++ } ++ } + }, { + .alg = "ecb(aes)", + .test = alg_test_skcipher, diff --git a/queue-2.6.32/dlm-always-use-gfp_nofs.patch b/queue-2.6.32/dlm-always-use-gfp_nofs.patch new file mode 100644 index 00000000000..e4169bfb1c3 --- /dev/null +++ b/queue-2.6.32/dlm-always-use-gfp_nofs.patch @@ -0,0 +1,467 @@ +From 573c24c4af6664ffcd9aa7ba617a35fde2b95534 Mon Sep 17 00:00:00 2001 +From: David Teigland +Date: Mon, 30 Nov 2009 16:34:43 -0600 +Subject: dlm: always use GFP_NOFS + +From: David Teigland + +commit 573c24c4af6664ffcd9aa7ba617a35fde2b95534 upstream. + +Replace all GFP_KERNEL and ls_allocation with GFP_NOFS. +ls_allocation would be GFP_KERNEL for userland lockspaces +and GFP_NOFS for file system lockspaces. 
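[ For context, a minimal sketch of why the distinction matters in an allocation path that filesystem reclaim can re-enter. The "dlm_msg" envelope and helper names are hypothetical; the GFP semantics are the real ones: ]

	#include <linux/slab.h>

	struct dlm_msg {		/* hypothetical message envelope */
		int type;
		char data[];
	};

	/* GFP_KERNEL lets reclaim write back dirty fs pages; if that
	 * writeback needs a dlm lock, and granting the lock needs this
	 * very allocation, the thread deadlocks on itself. */
	static struct dlm_msg *alloc_msg_deadlock_prone(size_t len)
	{
		return kmalloc(sizeof(struct dlm_msg) + len, GFP_KERNEL);
	}

	/* GFP_NOFS still reclaims, but never calls into filesystems,
	 * so reclaim cannot loop back into the dlm. */
	static struct dlm_msg *alloc_msg(size_t len)
	{
		return kmalloc(sizeof(struct dlm_msg) + len, GFP_NOFS);
	}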
+ +It was discovered that any lockspaces on the system can +affect all others by triggering memory reclaim in the +file system which could in turn call back into the dlm +to acquire locks, deadlocking dlm threads that were +shared by all lockspaces, like dlm_recv. + +Signed-off-by: David Teigland +Signed-off-by: Greg Kroah-Hartman + +--- + fs/dlm/config.c | 24 ++++++++++++------------ + fs/dlm/debug_fs.c | 2 +- + fs/dlm/dir.c | 7 +++---- + fs/dlm/dlm_internal.h | 1 - + fs/dlm/lock.c | 6 +++--- + fs/dlm/lockspace.c | 15 +++++---------- + fs/dlm/lowcomms.c | 6 +++--- + fs/dlm/member.c | 8 ++++---- + fs/dlm/memory.c | 6 +++--- + fs/dlm/netlink.c | 2 +- + fs/dlm/plock.c | 6 +++--- + fs/dlm/rcom.c | 2 +- + fs/dlm/requestqueue.c | 2 +- + fs/dlm/user.c | 12 ++++++------ + 14 files changed, 46 insertions(+), 53 deletions(-) + +--- a/fs/dlm/config.c ++++ b/fs/dlm/config.c +@@ -410,10 +410,10 @@ static struct config_group *make_cluster + struct dlm_comms *cms = NULL; + void *gps = NULL; + +- cl = kzalloc(sizeof(struct dlm_cluster), GFP_KERNEL); +- gps = kcalloc(3, sizeof(struct config_group *), GFP_KERNEL); +- sps = kzalloc(sizeof(struct dlm_spaces), GFP_KERNEL); +- cms = kzalloc(sizeof(struct dlm_comms), GFP_KERNEL); ++ cl = kzalloc(sizeof(struct dlm_cluster), GFP_NOFS); ++ gps = kcalloc(3, sizeof(struct config_group *), GFP_NOFS); ++ sps = kzalloc(sizeof(struct dlm_spaces), GFP_NOFS); ++ cms = kzalloc(sizeof(struct dlm_comms), GFP_NOFS); + + if (!cl || !gps || !sps || !cms) + goto fail; +@@ -482,9 +482,9 @@ static struct config_group *make_space(s + struct dlm_nodes *nds = NULL; + void *gps = NULL; + +- sp = kzalloc(sizeof(struct dlm_space), GFP_KERNEL); +- gps = kcalloc(2, sizeof(struct config_group *), GFP_KERNEL); +- nds = kzalloc(sizeof(struct dlm_nodes), GFP_KERNEL); ++ sp = kzalloc(sizeof(struct dlm_space), GFP_NOFS); ++ gps = kcalloc(2, sizeof(struct config_group *), GFP_NOFS); ++ nds = kzalloc(sizeof(struct dlm_nodes), GFP_NOFS); + + if (!sp || !gps || !nds) + goto fail; +@@ -536,7 +536,7 @@ static struct config_item *make_comm(str + { + struct dlm_comm *cm; + +- cm = kzalloc(sizeof(struct dlm_comm), GFP_KERNEL); ++ cm = kzalloc(sizeof(struct dlm_comm), GFP_NOFS); + if (!cm) + return ERR_PTR(-ENOMEM); + +@@ -569,7 +569,7 @@ static struct config_item *make_node(str + struct dlm_space *sp = config_item_to_space(g->cg_item.ci_parent); + struct dlm_node *nd; + +- nd = kzalloc(sizeof(struct dlm_node), GFP_KERNEL); ++ nd = kzalloc(sizeof(struct dlm_node), GFP_NOFS); + if (!nd) + return ERR_PTR(-ENOMEM); + +@@ -705,7 +705,7 @@ static ssize_t comm_addr_write(struct dl + if (cm->addr_count >= DLM_MAX_ADDR_COUNT) + return -ENOSPC; + +- addr = kzalloc(sizeof(*addr), GFP_KERNEL); ++ addr = kzalloc(sizeof(*addr), GFP_NOFS); + if (!addr) + return -ENOMEM; + +@@ -868,7 +868,7 @@ int dlm_nodeid_list(char *lsname, int ** + + ids_count = sp->members_count; + +- ids = kcalloc(ids_count, sizeof(int), GFP_KERNEL); ++ ids = kcalloc(ids_count, sizeof(int), GFP_NOFS); + if (!ids) { + rv = -ENOMEM; + goto out; +@@ -886,7 +886,7 @@ int dlm_nodeid_list(char *lsname, int ** + if (!new_count) + goto out_ids; + +- new = kcalloc(new_count, sizeof(int), GFP_KERNEL); ++ new = kcalloc(new_count, sizeof(int), GFP_NOFS); + if (!new) { + kfree(ids); + rv = -ENOMEM; +--- a/fs/dlm/debug_fs.c ++++ b/fs/dlm/debug_fs.c +@@ -404,7 +404,7 @@ static void *table_seq_start(struct seq_ + if (bucket >= ls->ls_rsbtbl_size) + return NULL; + +- ri = kzalloc(sizeof(struct rsbtbl_iter), GFP_KERNEL); ++ ri = kzalloc(sizeof(struct rsbtbl_iter), 
GFP_NOFS); + if (!ri) + return NULL; + if (n == 0) +--- a/fs/dlm/dir.c ++++ b/fs/dlm/dir.c +@@ -49,8 +49,7 @@ static struct dlm_direntry *get_free_de( + spin_unlock(&ls->ls_recover_list_lock); + + if (!found) +- de = kzalloc(sizeof(struct dlm_direntry) + len, +- ls->ls_allocation); ++ de = kzalloc(sizeof(struct dlm_direntry) + len, GFP_NOFS); + return de; + } + +@@ -212,7 +211,7 @@ int dlm_recover_directory(struct dlm_ls + + dlm_dir_clear(ls); + +- last_name = kmalloc(DLM_RESNAME_MAXLEN, ls->ls_allocation); ++ last_name = kmalloc(DLM_RESNAME_MAXLEN, GFP_NOFS); + if (!last_name) + goto out; + +@@ -323,7 +322,7 @@ static int get_entry(struct dlm_ls *ls, + if (namelen > DLM_RESNAME_MAXLEN) + return -EINVAL; + +- de = kzalloc(sizeof(struct dlm_direntry) + namelen, ls->ls_allocation); ++ de = kzalloc(sizeof(struct dlm_direntry) + namelen, GFP_NOFS); + if (!de) + return -ENOMEM; + +--- a/fs/dlm/dlm_internal.h ++++ b/fs/dlm/dlm_internal.h +@@ -473,7 +473,6 @@ struct dlm_ls { + int ls_low_nodeid; + int ls_total_weight; + int *ls_node_array; +- gfp_t ls_allocation; + + struct dlm_rsb ls_stub_rsb; /* for returning errors */ + struct dlm_lkb ls_stub_lkb; /* for returning errors */ +--- a/fs/dlm/lock.c ++++ b/fs/dlm/lock.c +@@ -2689,7 +2689,7 @@ static int _create_message(struct dlm_ls + pass into lowcomms_commit and a message buffer (mb) that we + write our data into */ + +- mh = dlm_lowcomms_get_buffer(to_nodeid, mb_len, ls->ls_allocation, &mb); ++ mh = dlm_lowcomms_get_buffer(to_nodeid, mb_len, GFP_NOFS, &mb); + if (!mh) + return -ENOBUFS; + +@@ -4512,7 +4512,7 @@ int dlm_user_request(struct dlm_ls *ls, + } + + if (flags & DLM_LKF_VALBLK) { +- ua->lksb.sb_lvbptr = kzalloc(DLM_USER_LVB_LEN, GFP_KERNEL); ++ ua->lksb.sb_lvbptr = kzalloc(DLM_USER_LVB_LEN, GFP_NOFS); + if (!ua->lksb.sb_lvbptr) { + kfree(ua); + __put_lkb(ls, lkb); +@@ -4582,7 +4582,7 @@ int dlm_user_convert(struct dlm_ls *ls, + ua = lkb->lkb_ua; + + if (flags & DLM_LKF_VALBLK && !ua->lksb.sb_lvbptr) { +- ua->lksb.sb_lvbptr = kzalloc(DLM_USER_LVB_LEN, GFP_KERNEL); ++ ua->lksb.sb_lvbptr = kzalloc(DLM_USER_LVB_LEN, GFP_NOFS); + if (!ua->lksb.sb_lvbptr) { + error = -ENOMEM; + goto out_put; +--- a/fs/dlm/lockspace.c ++++ b/fs/dlm/lockspace.c +@@ -430,7 +430,7 @@ static int new_lockspace(const char *nam + + error = -ENOMEM; + +- ls = kzalloc(sizeof(struct dlm_ls) + namelen, GFP_KERNEL); ++ ls = kzalloc(sizeof(struct dlm_ls) + namelen, GFP_NOFS); + if (!ls) + goto out; + memcpy(ls->ls_name, name, namelen); +@@ -443,11 +443,6 @@ static int new_lockspace(const char *nam + if (flags & DLM_LSFL_TIMEWARN) + set_bit(LSFL_TIMEWARN, &ls->ls_flags); + +- if (flags & DLM_LSFL_FS) +- ls->ls_allocation = GFP_NOFS; +- else +- ls->ls_allocation = GFP_KERNEL; +- + /* ls_exflags are forced to match among nodes, and we don't + need to require all nodes to have some flags set */ + ls->ls_exflags = (flags & ~(DLM_LSFL_TIMEWARN | DLM_LSFL_FS | +@@ -456,7 +451,7 @@ static int new_lockspace(const char *nam + size = dlm_config.ci_rsbtbl_size; + ls->ls_rsbtbl_size = size; + +- ls->ls_rsbtbl = kmalloc(sizeof(struct dlm_rsbtable) * size, GFP_KERNEL); ++ ls->ls_rsbtbl = kmalloc(sizeof(struct dlm_rsbtable) * size, GFP_NOFS); + if (!ls->ls_rsbtbl) + goto out_lsfree; + for (i = 0; i < size; i++) { +@@ -468,7 +463,7 @@ static int new_lockspace(const char *nam + size = dlm_config.ci_lkbtbl_size; + ls->ls_lkbtbl_size = size; + +- ls->ls_lkbtbl = kmalloc(sizeof(struct dlm_lkbtable) * size, GFP_KERNEL); ++ ls->ls_lkbtbl = kmalloc(sizeof(struct dlm_lkbtable) * size, GFP_NOFS); + 
if (!ls->ls_lkbtbl) + goto out_rsbfree; + for (i = 0; i < size; i++) { +@@ -480,7 +475,7 @@ static int new_lockspace(const char *nam + size = dlm_config.ci_dirtbl_size; + ls->ls_dirtbl_size = size; + +- ls->ls_dirtbl = kmalloc(sizeof(struct dlm_dirtable) * size, GFP_KERNEL); ++ ls->ls_dirtbl = kmalloc(sizeof(struct dlm_dirtable) * size, GFP_NOFS); + if (!ls->ls_dirtbl) + goto out_lkbfree; + for (i = 0; i < size; i++) { +@@ -527,7 +522,7 @@ static int new_lockspace(const char *nam + mutex_init(&ls->ls_requestqueue_mutex); + mutex_init(&ls->ls_clear_proc_locks); + +- ls->ls_recover_buf = kmalloc(dlm_config.ci_buffer_size, GFP_KERNEL); ++ ls->ls_recover_buf = kmalloc(dlm_config.ci_buffer_size, GFP_NOFS); + if (!ls->ls_recover_buf) + goto out_dirfree; + +--- a/fs/dlm/lowcomms.c ++++ b/fs/dlm/lowcomms.c +@@ -1060,7 +1060,7 @@ static void init_local(void) + if (dlm_our_addr(&sas, i)) + break; + +- addr = kmalloc(sizeof(*addr), GFP_KERNEL); ++ addr = kmalloc(sizeof(*addr), GFP_NOFS); + if (!addr) + break; + memcpy(addr, &sas, sizeof(*addr)); +@@ -1099,7 +1099,7 @@ static int sctp_listen_for_all(void) + struct sockaddr_storage localaddr; + struct sctp_event_subscribe subscribe; + int result = -EINVAL, num = 1, i, addr_len; +- struct connection *con = nodeid2con(0, GFP_KERNEL); ++ struct connection *con = nodeid2con(0, GFP_NOFS); + int bufsize = NEEDED_RMEM; + + if (!con) +@@ -1171,7 +1171,7 @@ out: + static int tcp_listen_for_all(void) + { + struct socket *sock = NULL; +- struct connection *con = nodeid2con(0, GFP_KERNEL); ++ struct connection *con = nodeid2con(0, GFP_NOFS); + int result = -EINVAL; + + if (!con) +--- a/fs/dlm/member.c ++++ b/fs/dlm/member.c +@@ -48,7 +48,7 @@ static int dlm_add_member(struct dlm_ls + struct dlm_member *memb; + int w, error; + +- memb = kzalloc(sizeof(struct dlm_member), ls->ls_allocation); ++ memb = kzalloc(sizeof(struct dlm_member), GFP_NOFS); + if (!memb) + return -ENOMEM; + +@@ -143,7 +143,7 @@ static void make_member_array(struct dlm + + ls->ls_total_weight = total; + +- array = kmalloc(sizeof(int) * total, ls->ls_allocation); ++ array = kmalloc(sizeof(int) * total, GFP_NOFS); + if (!array) + return; + +@@ -226,7 +226,7 @@ int dlm_recover_members(struct dlm_ls *l + continue; + log_debug(ls, "new nodeid %d is a re-added member", rv->new[i]); + +- memb = kzalloc(sizeof(struct dlm_member), ls->ls_allocation); ++ memb = kzalloc(sizeof(struct dlm_member), GFP_NOFS); + if (!memb) + return -ENOMEM; + memb->nodeid = rv->new[i]; +@@ -341,7 +341,7 @@ int dlm_ls_start(struct dlm_ls *ls) + int *ids = NULL, *new = NULL; + int error, ids_count = 0, new_count = 0; + +- rv = kzalloc(sizeof(struct dlm_recover), ls->ls_allocation); ++ rv = kzalloc(sizeof(struct dlm_recover), GFP_NOFS); + if (!rv) + return -ENOMEM; + +--- a/fs/dlm/memory.c ++++ b/fs/dlm/memory.c +@@ -39,7 +39,7 @@ char *dlm_allocate_lvb(struct dlm_ls *ls + { + char *p; + +- p = kzalloc(ls->ls_lvblen, ls->ls_allocation); ++ p = kzalloc(ls->ls_lvblen, GFP_NOFS); + return p; + } + +@@ -57,7 +57,7 @@ struct dlm_rsb *dlm_allocate_rsb(struct + + DLM_ASSERT(namelen <= DLM_RESNAME_MAXLEN,); + +- r = kzalloc(sizeof(*r) + namelen, ls->ls_allocation); ++ r = kzalloc(sizeof(*r) + namelen, GFP_NOFS); + return r; + } + +@@ -72,7 +72,7 @@ struct dlm_lkb *dlm_allocate_lkb(struct + { + struct dlm_lkb *lkb; + +- lkb = kmem_cache_zalloc(lkb_cache, ls->ls_allocation); ++ lkb = kmem_cache_zalloc(lkb_cache, GFP_NOFS); + return lkb; + } + +--- a/fs/dlm/netlink.c ++++ b/fs/dlm/netlink.c +@@ -26,7 +26,7 @@ static int prepare_data(u8 
cmd, struct s + struct sk_buff *skb; + void *data; + +- skb = genlmsg_new(size, GFP_KERNEL); ++ skb = genlmsg_new(size, GFP_NOFS); + if (!skb) + return -ENOMEM; + +--- a/fs/dlm/plock.c ++++ b/fs/dlm/plock.c +@@ -82,7 +82,7 @@ int dlm_posix_lock(dlm_lockspace_t *lock + if (!ls) + return -EINVAL; + +- xop = kzalloc(sizeof(*xop), GFP_KERNEL); ++ xop = kzalloc(sizeof(*xop), GFP_NOFS); + if (!xop) { + rv = -ENOMEM; + goto out; +@@ -211,7 +211,7 @@ int dlm_posix_unlock(dlm_lockspace_t *lo + if (!ls) + return -EINVAL; + +- op = kzalloc(sizeof(*op), GFP_KERNEL); ++ op = kzalloc(sizeof(*op), GFP_NOFS); + if (!op) { + rv = -ENOMEM; + goto out; +@@ -266,7 +266,7 @@ int dlm_posix_get(dlm_lockspace_t *locks + if (!ls) + return -EINVAL; + +- op = kzalloc(sizeof(*op), GFP_KERNEL); ++ op = kzalloc(sizeof(*op), GFP_NOFS); + if (!op) { + rv = -ENOMEM; + goto out; +--- a/fs/dlm/rcom.c ++++ b/fs/dlm/rcom.c +@@ -38,7 +38,7 @@ static int create_rcom(struct dlm_ls *ls + char *mb; + int mb_len = sizeof(struct dlm_rcom) + len; + +- mh = dlm_lowcomms_get_buffer(to_nodeid, mb_len, ls->ls_allocation, &mb); ++ mh = dlm_lowcomms_get_buffer(to_nodeid, mb_len, GFP_NOFS, &mb); + if (!mh) { + log_print("create_rcom to %d type %d len %d ENOBUFS", + to_nodeid, type, len); +--- a/fs/dlm/requestqueue.c ++++ b/fs/dlm/requestqueue.c +@@ -35,7 +35,7 @@ void dlm_add_requestqueue(struct dlm_ls + struct rq_entry *e; + int length = ms->m_header.h_length - sizeof(struct dlm_message); + +- e = kmalloc(sizeof(struct rq_entry) + length, ls->ls_allocation); ++ e = kmalloc(sizeof(struct rq_entry) + length, GFP_NOFS); + if (!e) { + log_print("dlm_add_requestqueue: out of memory len %d", length); + return; +--- a/fs/dlm/user.c ++++ b/fs/dlm/user.c +@@ -267,7 +267,7 @@ static int device_user_lock(struct dlm_u + goto out; + } + +- ua = kzalloc(sizeof(struct dlm_user_args), GFP_KERNEL); ++ ua = kzalloc(sizeof(struct dlm_user_args), GFP_NOFS); + if (!ua) + goto out; + ua->proc = proc; +@@ -307,7 +307,7 @@ static int device_user_unlock(struct dlm + if (!ls) + return -ENOENT; + +- ua = kzalloc(sizeof(struct dlm_user_args), GFP_KERNEL); ++ ua = kzalloc(sizeof(struct dlm_user_args), GFP_NOFS); + if (!ua) + goto out; + ua->proc = proc; +@@ -352,7 +352,7 @@ static int dlm_device_register(struct dl + + error = -ENOMEM; + len = strlen(name) + strlen(name_prefix) + 2; +- ls->ls_device.name = kzalloc(len, GFP_KERNEL); ++ ls->ls_device.name = kzalloc(len, GFP_NOFS); + if (!ls->ls_device.name) + goto fail; + +@@ -520,7 +520,7 @@ static ssize_t device_write(struct file + #endif + return -EINVAL; + +- kbuf = kzalloc(count + 1, GFP_KERNEL); ++ kbuf = kzalloc(count + 1, GFP_NOFS); + if (!kbuf) + return -ENOMEM; + +@@ -546,7 +546,7 @@ static ssize_t device_write(struct file + + /* add 1 after namelen so that the name string is terminated */ + kbuf = kzalloc(sizeof(struct dlm_write_request) + namelen + 1, +- GFP_KERNEL); ++ GFP_NOFS); + if (!kbuf) { + kfree(k32buf); + return -ENOMEM; +@@ -648,7 +648,7 @@ static int device_open(struct inode *ino + if (!ls) + return -ENOENT; + +- proc = kzalloc(sizeof(struct dlm_user_proc), GFP_KERNEL); ++ proc = kzalloc(sizeof(struct dlm_user_proc), GFP_NOFS); + if (!proc) { + dlm_put_lockspace(ls); + return -ENOMEM; diff --git a/queue-2.6.32/dlm-fix-ordering-of-bast-and-cast.patch b/queue-2.6.32/dlm-fix-ordering-of-bast-and-cast.patch new file mode 100644 index 00000000000..4988b2b3d72 --- /dev/null +++ b/queue-2.6.32/dlm-fix-ordering-of-bast-and-cast.patch @@ -0,0 +1,272 @@ +From 7fe2b3190b8b299409f13cf3a6f85c2bd371f8bb Mon Sep 
17 00:00:00 2001 +From: David Teigland +Date: Wed, 24 Feb 2010 11:08:18 -0600 +Subject: dlm: fix ordering of bast and cast + +From: David Teigland + +commit 7fe2b3190b8b299409f13cf3a6f85c2bd371f8bb upstream. + +When both blocking and completion callbacks are queued for lock, +the dlm would always deliver the completion callback (cast) first. +In some cases the blocking callback (bast) is queued before the +cast, though, and should be delivered first. This patch keeps +track of the order in which they were queued and delivers them +in that order. + +This patch also keeps track of the granted mode in the last cast +and eliminates the following bast if the bast mode is compatible +with the preceding cast mode. This happens when a remotely mastered +lock is demoted, e.g. EX->NL, in which case the local node queues +a cast immediately after sending the demote message. In this way +a cast can be queued for a mode, e.g. NL, that makes an in-transit +bast extraneous. + +Signed-off-by: David Teigland +Signed-off-by: Greg Kroah-Hartman + +--- + fs/dlm/ast.c | 72 +++++++++++++++++++++++++++++++++++++++----------- + fs/dlm/ast.h | 4 +- + fs/dlm/dlm_internal.h | 10 +++++- + fs/dlm/lock.c | 4 +- + fs/dlm/user.c | 10 ++++-- + fs/dlm/user.h | 4 +- + 6 files changed, 77 insertions(+), 27 deletions(-) + +--- a/fs/dlm/ast.c ++++ b/fs/dlm/ast.c +@@ -2,7 +2,7 @@ + ******************************************************************************* + ** + ** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +-** Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. ++** Copyright (C) 2004-2010 Red Hat, Inc. All rights reserved. + ** + ** This copyrighted material is made available to anyone wishing to use, + ** modify, copy, or redistribute it subject to the terms and conditions +@@ -33,10 +33,10 @@ void dlm_del_ast(struct dlm_lkb *lkb) + spin_unlock(&ast_queue_lock); + } + +-void dlm_add_ast(struct dlm_lkb *lkb, int type, int bastmode) ++void dlm_add_ast(struct dlm_lkb *lkb, int type, int mode) + { + if (lkb->lkb_flags & DLM_IFL_USER) { +- dlm_user_add_ast(lkb, type, bastmode); ++ dlm_user_add_ast(lkb, type, mode); + return; + } + +@@ -44,10 +44,21 @@ void dlm_add_ast(struct dlm_lkb *lkb, in + if (!(lkb->lkb_ast_type & (AST_COMP | AST_BAST))) { + kref_get(&lkb->lkb_ref); + list_add_tail(&lkb->lkb_astqueue, &ast_queue); ++ lkb->lkb_ast_first = type; + } ++ ++ /* sanity check, this should not happen */ ++ ++ if ((type == AST_COMP) && (lkb->lkb_ast_type & AST_COMP)) ++ log_print("repeat cast %d castmode %d lock %x %s", ++ mode, lkb->lkb_castmode, ++ lkb->lkb_id, lkb->lkb_resource->res_name); ++ + lkb->lkb_ast_type |= type; +- if (bastmode) +- lkb->lkb_bastmode = bastmode; ++ if (type == AST_BAST) ++ lkb->lkb_bastmode = mode; ++ else ++ lkb->lkb_castmode = mode; + spin_unlock(&ast_queue_lock); + + set_bit(WAKE_ASTS, &astd_wakeflags); +@@ -59,9 +70,9 @@ static void process_asts(void) + struct dlm_ls *ls = NULL; + struct dlm_rsb *r = NULL; + struct dlm_lkb *lkb; +- void (*cast) (void *astparam); +- void (*bast) (void *astparam, int mode); +- int type = 0, bastmode; ++ void (*castfn) (void *astparam); ++ void (*bastfn) (void *astparam, int mode); ++ int type, first, bastmode, castmode, do_bast, do_cast, last_castmode; + + repeat: + spin_lock(&ast_queue_lock); +@@ -75,17 +86,48 @@ repeat: + list_del(&lkb->lkb_astqueue); + type = lkb->lkb_ast_type; + lkb->lkb_ast_type = 0; ++ first = lkb->lkb_ast_first; ++ lkb->lkb_ast_first = 0; + bastmode = lkb->lkb_bastmode; +- ++ castmode = lkb->lkb_castmode; ++ 
castfn = lkb->lkb_astfn; ++ bastfn = lkb->lkb_bastfn; + spin_unlock(&ast_queue_lock); +- cast = lkb->lkb_astfn; +- bast = lkb->lkb_bastfn; + +- if ((type & AST_COMP) && cast) +- cast(lkb->lkb_astparam); ++ do_cast = (type & AST_COMP) && castfn; ++ do_bast = (type & AST_BAST) && bastfn; ++ ++ /* Skip a bast if its blocking mode is compatible with the ++ granted mode of the preceding cast. */ + +- if ((type & AST_BAST) && bast) +- bast(lkb->lkb_astparam, bastmode); ++ if (do_bast) { ++ if (first == AST_COMP) ++ last_castmode = castmode; ++ else ++ last_castmode = lkb->lkb_castmode_done; ++ if (dlm_modes_compat(bastmode, last_castmode)) ++ do_bast = 0; ++ } ++ ++ if (first == AST_COMP) { ++ if (do_cast) ++ castfn(lkb->lkb_astparam); ++ if (do_bast) ++ bastfn(lkb->lkb_astparam, bastmode); ++ } else if (first == AST_BAST) { ++ if (do_bast) ++ bastfn(lkb->lkb_astparam, bastmode); ++ if (do_cast) ++ castfn(lkb->lkb_astparam); ++ } else { ++ log_error(ls, "bad ast_first %d ast_type %d", ++ first, type); ++ } ++ ++ if (do_cast) ++ lkb->lkb_castmode_done = castmode; ++ if (do_bast) ++ lkb->lkb_bastmode_done = bastmode; + + /* this removes the reference added by dlm_add_ast + and may result in the lkb being freed */ +--- a/fs/dlm/ast.h ++++ b/fs/dlm/ast.h +@@ -1,7 +1,7 @@ + /****************************************************************************** + ******************************************************************************* + ** +-** Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved. ++** Copyright (C) 2005-2010 Red Hat, Inc. All rights reserved. + ** + ** This copyrighted material is made available to anyone wishing to use, + ** modify, copy, or redistribute it subject to the terms and conditions +@@ -13,7 +13,7 @@ + #ifndef __ASTD_DOT_H__ + #define __ASTD_DOT_H__ + +-void dlm_add_ast(struct dlm_lkb *lkb, int type, int bastmode); ++void dlm_add_ast(struct dlm_lkb *lkb, int type, int mode); + void dlm_del_ast(struct dlm_lkb *lkb); + + void dlm_astd_wake(void); +--- a/fs/dlm/dlm_internal.h ++++ b/fs/dlm/dlm_internal.h +@@ -2,7 +2,7 @@ + ******************************************************************************* + ** + ** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. +-** Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. ++** Copyright (C) 2004-2010 Red Hat, Inc. All rights reserved. 
+ ** + ** This copyrighted material is made available to anyone wishing to use, + ** modify, copy, or redistribute it subject to the terms and conditions +@@ -232,11 +232,17 @@ struct dlm_lkb { + int8_t lkb_status; /* granted, waiting, convert */ + int8_t lkb_rqmode; /* requested lock mode */ + int8_t lkb_grmode; /* granted lock mode */ +- int8_t lkb_bastmode; /* requested mode */ + int8_t lkb_highbast; /* highest mode bast sent for */ ++ + int8_t lkb_wait_type; /* type of reply waiting for */ + int8_t lkb_wait_count; + int8_t lkb_ast_type; /* type of ast queued for */ ++ int8_t lkb_ast_first; /* type of first ast queued */ ++ ++ int8_t lkb_bastmode; /* req mode of queued bast */ ++ int8_t lkb_castmode; /* gr mode of queued cast */ ++ int8_t lkb_bastmode_done; /* last delivered bastmode */ ++ int8_t lkb_castmode_done; /* last delivered castmode */ + + struct list_head lkb_idtbl_list; /* lockspace lkbtbl */ + struct list_head lkb_statequeue; /* rsb g/c/w list */ +--- a/fs/dlm/lock.c ++++ b/fs/dlm/lock.c +@@ -1,7 +1,7 @@ + /****************************************************************************** + ******************************************************************************* + ** +-** Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved. ++** Copyright (C) 2005-2010 Red Hat, Inc. All rights reserved. + ** + ** This copyrighted material is made available to anyone wishing to use, + ** modify, copy, or redistribute it subject to the terms and conditions +@@ -307,7 +307,7 @@ static void queue_cast(struct dlm_rsb *r + lkb->lkb_lksb->sb_status = rv; + lkb->lkb_lksb->sb_flags = lkb->lkb_sbflags; + +- dlm_add_ast(lkb, AST_COMP, 0); ++ dlm_add_ast(lkb, AST_COMP, lkb->lkb_grmode); + } + + static inline void queue_cast_overlap(struct dlm_rsb *r, struct dlm_lkb *lkb) +--- a/fs/dlm/user.c ++++ b/fs/dlm/user.c +@@ -1,5 +1,5 @@ + /* +- * Copyright (C) 2006-2009 Red Hat, Inc. All rights reserved. ++ * Copyright (C) 2006-2010 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions +@@ -173,7 +173,7 @@ static int lkb_is_endoflife(struct dlm_l + /* we could possibly check if the cancel of an orphan has resulted in the lkb + being removed and then remove that lkb from the orphans list and free it */ + +-void dlm_user_add_ast(struct dlm_lkb *lkb, int type, int bastmode) ++void dlm_user_add_ast(struct dlm_lkb *lkb, int type, int mode) + { + struct dlm_ls *ls; + struct dlm_user_args *ua; +@@ -206,8 +206,10 @@ void dlm_user_add_ast(struct dlm_lkb *lk + + ast_type = lkb->lkb_ast_type; + lkb->lkb_ast_type |= type; +- if (bastmode) +- lkb->lkb_bastmode = bastmode; ++ if (type == AST_BAST) ++ lkb->lkb_bastmode = mode; ++ else ++ lkb->lkb_castmode = mode; + + if (!ast_type) { + kref_get(&lkb->lkb_ref); +--- a/fs/dlm/user.h ++++ b/fs/dlm/user.h +@@ -1,5 +1,5 @@ + /* +- * Copyright (C) 2006-2008 Red Hat, Inc. All rights reserved. ++ * Copyright (C) 2006-2010 Red Hat, Inc. All rights reserved. 
+ * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions +@@ -9,7 +9,7 @@ + #ifndef __USER_DOT_H__ + #define __USER_DOT_H__ + +-void dlm_user_add_ast(struct dlm_lkb *lkb, int type, int bastmode); ++void dlm_user_add_ast(struct dlm_lkb *lkb, int type, int mode); + int dlm_user_init(void); + void dlm_user_exit(void); + int dlm_device_deregister(struct dlm_ls *ls); diff --git a/queue-2.6.32/dlm-send-reply-before-bast.patch b/queue-2.6.32/dlm-send-reply-before-bast.patch new file mode 100644 index 00000000000..8cefd14e949 --- /dev/null +++ b/queue-2.6.32/dlm-send-reply-before-bast.patch @@ -0,0 +1,285 @@ +From cf6620acc0f6fac57968aafef79ab372bdcf6157 Mon Sep 17 00:00:00 2001 +From: David Teigland +Date: Wed, 24 Feb 2010 11:59:23 -0600 +Subject: dlm: send reply before bast + +From: David Teigland + +commit cf6620acc0f6fac57968aafef79ab372bdcf6157 upstream. + +When the lock master processes a successful operation (request, +convert, cancel, or unlock), it will process the effects of the +change before sending the reply for the operation. The "effects" +of the operation are: + +- blocking callbacks (basts) for any newly granted locks +- waiting or converting locks that can now be granted + +The cast is queued on the local node when the reply from the lock +master is received. This means that a lock holder can receive a +bast for a lock mode that is doesn't yet know has been granted. + +Signed-off-by: David Teigland +Signed-off-by: Greg Kroah-Hartman + +--- + fs/dlm/lock.c | 110 ++++++++++++++++++++++++++++++++++++++++++++-------------- + 1 file changed, 84 insertions(+), 26 deletions(-) + +--- a/fs/dlm/lock.c ++++ b/fs/dlm/lock.c +@@ -2280,20 +2280,30 @@ static int do_request(struct dlm_rsb *r, + if (can_be_queued(lkb)) { + error = -EINPROGRESS; + add_lkb(r, lkb, DLM_LKSTS_WAITING); +- send_blocking_asts(r, lkb); + add_timeout(lkb); + goto out; + } + + error = -EAGAIN; +- if (force_blocking_asts(lkb)) +- send_blocking_asts_all(r, lkb); + queue_cast(r, lkb, -EAGAIN); +- + out: + return error; + } + ++static void do_request_effects(struct dlm_rsb *r, struct dlm_lkb *lkb, ++ int error) ++{ ++ switch (error) { ++ case -EAGAIN: ++ if (force_blocking_asts(lkb)) ++ send_blocking_asts_all(r, lkb); ++ break; ++ case -EINPROGRESS: ++ send_blocking_asts(r, lkb); ++ break; ++ } ++} ++ + static int do_convert(struct dlm_rsb *r, struct dlm_lkb *lkb) + { + int error = 0; +@@ -2304,7 +2314,6 @@ static int do_convert(struct dlm_rsb *r, + if (can_be_granted(r, lkb, 1, &deadlk)) { + grant_lock(r, lkb); + queue_cast(r, lkb, 0); +- grant_pending_locks(r); + goto out; + } + +@@ -2334,7 +2343,6 @@ static int do_convert(struct dlm_rsb *r, + if (_can_be_granted(r, lkb, 1)) { + grant_lock(r, lkb); + queue_cast(r, lkb, 0); +- grant_pending_locks(r); + goto out; + } + /* else fall through and move to convert queue */ +@@ -2344,28 +2352,47 @@ static int do_convert(struct dlm_rsb *r, + error = -EINPROGRESS; + del_lkb(r, lkb); + add_lkb(r, lkb, DLM_LKSTS_CONVERT); +- send_blocking_asts(r, lkb); + add_timeout(lkb); + goto out; + } + + error = -EAGAIN; +- if (force_blocking_asts(lkb)) +- send_blocking_asts_all(r, lkb); + queue_cast(r, lkb, -EAGAIN); +- + out: + return error; + } + ++static void do_convert_effects(struct dlm_rsb *r, struct dlm_lkb *lkb, ++ int error) ++{ ++ switch (error) { ++ case 0: ++ grant_pending_locks(r); ++ /* grant_pending_locks also sends basts */ ++ break; ++ case -EAGAIN: ++ if (force_blocking_asts(lkb)) ++ 
send_blocking_asts_all(r, lkb); ++ break; ++ case -EINPROGRESS: ++ send_blocking_asts(r, lkb); ++ break; ++ } ++} ++ + static int do_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb) + { + remove_lock(r, lkb); + queue_cast(r, lkb, -DLM_EUNLOCK); +- grant_pending_locks(r); + return -DLM_EUNLOCK; + } + ++static void do_unlock_effects(struct dlm_rsb *r, struct dlm_lkb *lkb, ++ int error) ++{ ++ grant_pending_locks(r); ++} ++ + /* returns: 0 did nothing, -DLM_ECANCEL canceled lock */ + + static int do_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb) +@@ -2375,12 +2402,18 @@ static int do_cancel(struct dlm_rsb *r, + error = revert_lock(r, lkb); + if (error) { + queue_cast(r, lkb, -DLM_ECANCEL); +- grant_pending_locks(r); + return -DLM_ECANCEL; + } + return 0; + } + ++static void do_cancel_effects(struct dlm_rsb *r, struct dlm_lkb *lkb, ++ int error) ++{ ++ if (error) ++ grant_pending_locks(r); ++} ++ + /* + * Four stage 3 varieties: + * _request_lock(), _convert_lock(), _unlock_lock(), _cancel_lock() +@@ -2402,11 +2435,15 @@ static int _request_lock(struct dlm_rsb + goto out; + } + +- if (is_remote(r)) ++ if (is_remote(r)) { + /* receive_request() calls do_request() on remote node */ + error = send_request(r, lkb); +- else ++ } else { + error = do_request(r, lkb); ++ /* for remote locks the request_reply is sent ++ between do_request and do_request_effects */ ++ do_request_effects(r, lkb, error); ++ } + out: + return error; + } +@@ -2417,11 +2454,15 @@ static int _convert_lock(struct dlm_rsb + { + int error; + +- if (is_remote(r)) ++ if (is_remote(r)) { + /* receive_convert() calls do_convert() on remote node */ + error = send_convert(r, lkb); +- else ++ } else { + error = do_convert(r, lkb); ++ /* for remote locks the convert_reply is sent ++ between do_convert and do_convert_effects */ ++ do_convert_effects(r, lkb, error); ++ } + + return error; + } +@@ -2432,11 +2473,15 @@ static int _unlock_lock(struct dlm_rsb * + { + int error; + +- if (is_remote(r)) ++ if (is_remote(r)) { + /* receive_unlock() calls do_unlock() on remote node */ + error = send_unlock(r, lkb); +- else ++ } else { + error = do_unlock(r, lkb); ++ /* for remote locks the unlock_reply is sent ++ between do_unlock and do_unlock_effects */ ++ do_unlock_effects(r, lkb, error); ++ } + + return error; + } +@@ -2447,11 +2492,15 @@ static int _cancel_lock(struct dlm_rsb * + { + int error; + +- if (is_remote(r)) ++ if (is_remote(r)) { + /* receive_cancel() calls do_cancel() on remote node */ + error = send_cancel(r, lkb); +- else ++ } else { + error = do_cancel(r, lkb); ++ /* for remote locks the cancel_reply is sent ++ between do_cancel and do_cancel_effects */ ++ do_cancel_effects(r, lkb, error); ++ } + + return error; + } +@@ -3191,6 +3240,7 @@ static void receive_request(struct dlm_l + attach_lkb(r, lkb); + error = do_request(r, lkb); + send_request_reply(r, lkb, error); ++ do_request_effects(r, lkb, error); + + unlock_rsb(r); + put_rsb(r); +@@ -3226,15 +3276,19 @@ static void receive_convert(struct dlm_l + goto out; + + receive_flags(lkb, ms); ++ + error = receive_convert_args(ls, lkb, ms); +- if (error) +- goto out_reply; ++ if (error) { ++ send_convert_reply(r, lkb, error); ++ goto out; ++ } ++ + reply = !down_conversion(lkb); + + error = do_convert(r, lkb); +- out_reply: + if (reply) + send_convert_reply(r, lkb, error); ++ do_convert_effects(r, lkb, error); + out: + unlock_rsb(r); + put_rsb(r); +@@ -3266,13 +3320,16 @@ static void receive_unlock(struct dlm_ls + goto out; + + receive_flags(lkb, ms); ++ + error = 
receive_unlock_args(ls, lkb, ms); +- if (error) +- goto out_reply; ++ if (error) { ++ send_unlock_reply(r, lkb, error); ++ goto out; ++ } + + error = do_unlock(r, lkb); +- out_reply: + send_unlock_reply(r, lkb, error); ++ do_unlock_effects(r, lkb, error); + out: + unlock_rsb(r); + put_rsb(r); +@@ -3307,6 +3364,7 @@ static void receive_cancel(struct dlm_ls + + error = do_cancel(r, lkb); + send_cancel_reply(r, lkb, error); ++ do_cancel_effects(r, lkb, error); + out: + unlock_rsb(r); + put_rsb(r); diff --git a/queue-2.6.32/ext4-fix-optional-arg-mount-options.patch b/queue-2.6.32/ext4-fix-optional-arg-mount-options.patch new file mode 100644 index 00000000000..31e76ff7567 --- /dev/null +++ b/queue-2.6.32/ext4-fix-optional-arg-mount-options.patch @@ -0,0 +1,77 @@ +From 15121c18a22ae483279f76dc9e554334b800d0f7 Mon Sep 17 00:00:00 2001 +From: Eric Sandeen +Date: Mon, 15 Feb 2010 20:17:55 -0500 +Subject: ext4: Fix optional-arg mount options + +From: Eric Sandeen + +commit 15121c18a22ae483279f76dc9e554334b800d0f7 upstream. + +We have 2 mount options, "barrier" and "auto_da_alloc" which may or +may not take a 1/0 argument. This causes the ext4 superblock mount +code to subtract uninitialized pointers and pass the result to +kmalloc, which results in very noisy failures. + +Per Ted's suggestion, initialize the args struct so that +we know whether match_token() found an argument for the +option, and skip match_int() if not. + +Also, return error (0) from parse_options if we thought +we found an argument, but match_int() Fails. + +Reported-by: Michael S. Tsirkin +Signed-off-by: Eric Sandeen +Signed-off-by: "Theodore Ts'o" +Acked-by: Jeff Mahoney +Signed-off-by: Greg Kroah-Hartman + +--- + fs/ext4/super.c | 23 +++++++++++++++-------- + 1 file changed, 15 insertions(+), 8 deletions(-) + +--- a/fs/ext4/super.c ++++ b/fs/ext4/super.c +@@ -1218,6 +1218,11 @@ static int parse_options(char *options, + if (!*p) + continue; + ++ /* ++ * Initialize args struct so we know whether arg was ++ * found; some options take optional arguments. 
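++ * (Illustrative aside: a bare "barrier" matches a token pattern with
++ * no format specifier, so args[0].from stays NULL, while "barrier=0"
++ * leaves args[0].from pointing at the "0" for match_int() below.)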
++ */ ++ args[0].to = args[0].from = 0; + token = match_token(p, tokens, args); + switch (token) { + case Opt_bsd_df: +@@ -1503,10 +1508,11 @@ set_qf_format: + clear_opt(sbi->s_mount_opt, BARRIER); + break; + case Opt_barrier: +- if (match_int(&args[0], &option)) { +- set_opt(sbi->s_mount_opt, BARRIER); +- break; +- } ++ if (args[0].from) { ++ if (match_int(&args[0], &option)) ++ return 0; ++ } else ++ option = 1; /* No argument, default to 1 */ + if (option) + set_opt(sbi->s_mount_opt, BARRIER); + else +@@ -1579,10 +1585,11 @@ set_qf_format: + set_opt(sbi->s_mount_opt,NO_AUTO_DA_ALLOC); + break; + case Opt_auto_da_alloc: +- if (match_int(&args[0], &option)) { +- clear_opt(sbi->s_mount_opt, NO_AUTO_DA_ALLOC); +- break; +- } ++ if (args[0].from) { ++ if (match_int(&args[0], &option)) ++ return 0; ++ } else ++ option = 1; /* No argument, default to 1 */ + if (option) + clear_opt(sbi->s_mount_opt, NO_AUTO_DA_ALLOC); + else diff --git a/queue-2.6.32/ext4-make-sure-the-move_ext-ioctl-can-t-overwrite-append-only-files.patch b/queue-2.6.32/ext4-make-sure-the-move_ext-ioctl-can-t-overwrite-append-only-files.patch new file mode 100644 index 00000000000..394ea668d98 --- /dev/null +++ b/queue-2.6.32/ext4-make-sure-the-move_ext-ioctl-can-t-overwrite-append-only-files.patch @@ -0,0 +1,34 @@ +From 1f5a81e41f8b1a782c68d3843e9ec1bfaadf7d72 Mon Sep 17 00:00:00 2001 +From: Theodore Ts'o +Date: Wed, 2 Jun 2010 22:04:39 -0400 +Subject: ext4: Make sure the MOVE_EXT ioctl can't overwrite append-only files + +From: Theodore Ts'o + +commit 1f5a81e41f8b1a782c68d3843e9ec1bfaadf7d72 upstream. + +Dan Roseberg has reported a problem with the MOVE_EXT ioctl. If the +donor file is an append-only file, we should not allow the operation +to proceed, lest we end up overwriting the contents of an append-only +file. + +Signed-off-by: "Theodore Ts'o" +Cc: Dan Rosenberg +Signed-off-by: Greg Kroah-Hartman + +--- + fs/ext4/move_extent.c | 3 +++ + 1 file changed, 3 insertions(+) + +--- a/fs/ext4/move_extent.c ++++ b/fs/ext4/move_extent.c +@@ -958,6 +958,9 @@ mext_check_arguments(struct inode *orig_ + return -EINVAL; + } + ++ if (IS_IMMUTABLE(donor_inode) || IS_APPEND(donor_inode)) ++ return -EPERM; ++ + /* Ext4 move extent does not support swapfile */ + if (IS_SWAPFILE(orig_inode) || IS_SWAPFILE(donor_inode)) { + ext4_debug("ext4 move extent: The argument files should " diff --git a/queue-2.6.32/fix-sba-iommu-to-handle-allocation-failure-properly.patch b/queue-2.6.32/fix-sba-iommu-to-handle-allocation-failure-properly.patch new file mode 100644 index 00000000000..c235c0af721 --- /dev/null +++ b/queue-2.6.32/fix-sba-iommu-to-handle-allocation-failure-properly.patch @@ -0,0 +1,110 @@ +From e2a465675dc089e9a56ba2fa2a5fbd9bd8844d18 Mon Sep 17 00:00:00 2001 +From: FUJITA Tomonori +Date: Tue, 17 Nov 2009 14:44:35 -0800 +Subject: [IA64] fix SBA IOMMU to handle allocation failure properly + +From: FUJITA Tomonori + +commit e2a465675dc089e9a56ba2fa2a5fbd9bd8844d18 upstream. + +It's possible that SBA IOMMU might fail to find I/O space under heavy +I/Os. SBA IOMMU panics on allocation failure but it shouldn't; drivers +can handle the failure. The majority of other IOMMU drivers don't panic +on allocation failure. + +This patch fixes SBA IOMMU path to handle allocation failure properly. 
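[ A condensed sketch of the new control flow: exhaustion becomes an error the mapping layer hands back to the driver instead of a panic. find_free_run() stands in for the real sba_search_bitmap(), and the names here are simplified: ]

	#include <linux/device.h>

	long find_free_run(struct device *dev, unsigned long pages);
		/* stand-in for sba_search_bitmap() */

	static long iommu_alloc_range(struct device *dev, unsigned long pages,
				      unsigned long res_bits)
	{
		long pide = find_free_run(dev, pages);

		if (pide >= (long)res_bits) {
			dev_warn(dev, "IOMMU out of mapping resources\n");
			return -1;		/* previously: panic() */
		}
		return pide;
	}

[ The mapping entry points then turn -1 into a zero DMA address, which the DMA API reports as a failure the driver can back off from and retry. ]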
+ +Signed-off-by: FUJITA Tomonori +Cc: Fenghua Yu +Signed-off-by: Andrew Morton +Signed-off-by: Tony Luck +Acked-by: Leonardo Chiquitto +Acked-by: Jeff Mahoney +Signed-off-by: Greg Kroah-Hartman + +--- + arch/ia64/hp/common/sba_iommu.c | 38 +++++++++++++++++++++++++++++--------- + 1 file changed, 29 insertions(+), 9 deletions(-) + +--- a/arch/ia64/hp/common/sba_iommu.c ++++ b/arch/ia64/hp/common/sba_iommu.c +@@ -677,12 +677,19 @@ sba_alloc_range(struct ioc *ioc, struct + spin_unlock_irqrestore(&ioc->saved_lock, flags); + + pide = sba_search_bitmap(ioc, dev, pages_needed, 0); +- if (unlikely(pide >= (ioc->res_size << 3))) +- panic(__FILE__ ": I/O MMU @ %p is out of mapping resources\n", +- ioc->ioc_hpa); ++ if (unlikely(pide >= (ioc->res_size << 3))) { ++ printk(KERN_WARNING "%s: I/O MMU @ %p is" ++ "out of mapping resources, %u %u %lx\n", ++ __func__, ioc->ioc_hpa, ioc->res_size, ++ pages_needed, dma_get_seg_boundary(dev)); ++ return -1; ++ } + #else +- panic(__FILE__ ": I/O MMU @ %p is out of mapping resources\n", +- ioc->ioc_hpa); ++ printk(KERN_WARNING "%s: I/O MMU @ %p is" ++ "out of mapping resources, %u %u %lx\n", ++ __func__, ioc->ioc_hpa, ioc->res_size, ++ pages_needed, dma_get_seg_boundary(dev)); ++ return -1; + #endif + } + } +@@ -965,6 +972,8 @@ static dma_addr_t sba_map_page(struct de + #endif + + pide = sba_alloc_range(ioc, dev, size); ++ if (pide < 0) ++ return 0; + + iovp = (dma_addr_t) pide << iovp_shift; + +@@ -1320,6 +1329,7 @@ sba_coalesce_chunks(struct ioc *ioc, str + unsigned long dma_offset, dma_len; /* start/len of DMA stream */ + int n_mappings = 0; + unsigned int max_seg_size = dma_get_max_seg_size(dev); ++ int idx; + + while (nents > 0) { + unsigned long vaddr = (unsigned long) sba_sg_address(startsg); +@@ -1418,16 +1428,22 @@ sba_coalesce_chunks(struct ioc *ioc, str + vcontig_sg->dma_length = vcontig_len; + dma_len = (dma_len + dma_offset + ~iovp_mask) & iovp_mask; + ASSERT(dma_len <= DMA_CHUNK_SIZE); +- dma_sg->dma_address = (dma_addr_t) (PIDE_FLAG +- | (sba_alloc_range(ioc, dev, dma_len) << iovp_shift) +- | dma_offset); ++ idx = sba_alloc_range(ioc, dev, dma_len); ++ if (idx < 0) { ++ dma_sg->dma_length = 0; ++ return -1; ++ } ++ dma_sg->dma_address = (dma_addr_t)(PIDE_FLAG | (idx << iovp_shift) ++ | dma_offset); + n_mappings++; + } + + return n_mappings; + } + +- ++static void sba_unmap_sg_attrs(struct device *dev, struct scatterlist *sglist, ++ int nents, enum dma_data_direction dir, ++ struct dma_attrs *attrs); + /** + * sba_map_sg - map Scatter/Gather list + * @dev: instance of PCI owned by the driver that's asking. +@@ -1493,6 +1509,10 @@ static int sba_map_sg_attrs(struct devic + ** Access to the virtual address is what forces a two pass algorithm. + */ + coalesced = sba_coalesce_chunks(ioc, dev, sglist, nents); ++ if (coalesced < 0) { ++ sba_unmap_sg_attrs(dev, sglist, nents, dir, attrs); ++ return 0; ++ } + + /* + ** Program the I/O Pdir diff --git a/queue-2.6.32/hwpoison-abort-on-failed-unmap.patch b/queue-2.6.32/hwpoison-abort-on-failed-unmap.patch new file mode 100644 index 00000000000..d66b84f6450 --- /dev/null +++ b/queue-2.6.32/hwpoison-abort-on-failed-unmap.patch @@ -0,0 +1,78 @@ +From 1668bfd5be9d8a52536c4865000fbbe065a3613b Mon Sep 17 00:00:00 2001 +From: Wu Fengguang +Date: Wed, 16 Dec 2009 12:19:58 +0100 +Subject: HWPOISON: abort on failed unmap + +From: Wu Fengguang + +commit 1668bfd5be9d8a52536c4865000fbbe065a3613b upstream. + +Don't try to isolate a still mapped page. 
Otherwise we will hit the +BUG_ON(page_mapped(page)) in __remove_from_page_cache(). + +Signed-off-by: Wu Fengguang +Signed-off-by: Andi Kleen +Signed-off-by: Thomas Renninger +Signed-off-by: Greg Kroah-Hartman + +--- + mm/memory-failure.c | 20 +++++++++++++++----- + 1 file changed, 15 insertions(+), 5 deletions(-) + +--- a/mm/memory-failure.c ++++ b/mm/memory-failure.c +@@ -637,7 +637,7 @@ static int page_action(struct page_state + * Do all that is necessary to remove user space mappings. Unmap + * the pages and send SIGBUS to the processes if the data was dirty. + */ +-static void hwpoison_user_mappings(struct page *p, unsigned long pfn, ++static int hwpoison_user_mappings(struct page *p, unsigned long pfn, + int trapno) + { + enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS; +@@ -647,15 +647,18 @@ static void hwpoison_user_mappings(struc + int i; + int kill = 1; + +- if (PageReserved(p) || PageCompound(p) || PageSlab(p) || PageKsm(p)) +- return; ++ if (PageReserved(p) || PageSlab(p)) ++ return SWAP_SUCCESS; + + /* + * This check implies we don't kill processes if their pages + * are in the swap cache early. Those are always late kills. + */ + if (!page_mapped(p)) +- return; ++ return SWAP_SUCCESS; ++ ++ if (PageCompound(p) || PageKsm(p)) ++ return SWAP_FAIL; + + if (PageSwapCache(p)) { + printk(KERN_ERR +@@ -717,6 +720,8 @@ static void hwpoison_user_mappings(struc + */ + kill_procs_ao(&tokill, !!PageDirty(p), trapno, + ret != SWAP_SUCCESS, pfn); ++ ++ return ret; + } + + int __memory_failure(unsigned long pfn, int trapno, int ref) +@@ -786,8 +791,13 @@ int __memory_failure(unsigned long pfn, + + /* + * Now take care of user space mappings. ++ * Abort on fail: __remove_from_page_cache() assumes unmapped page. + */ +- hwpoison_user_mappings(p, pfn, trapno); ++ if (hwpoison_user_mappings(p, pfn, trapno) != SWAP_SUCCESS) { ++ printk(KERN_ERR "MCE %#lx: cannot unmap page, give up\n", pfn); ++ res = -EBUSY; ++ goto out; ++ } + + /* + * Torn down by someone else? diff --git a/queue-2.6.32/hwpoison-remove-the-anonymous-entry.patch b/queue-2.6.32/hwpoison-remove-the-anonymous-entry.patch new file mode 100644 index 00000000000..6ae9ce72182 --- /dev/null +++ b/queue-2.6.32/hwpoison-remove-the-anonymous-entry.patch @@ -0,0 +1,31 @@ +From 9b9a29ecd75e310f75a9243e1c3538ad34598fcb Mon Sep 17 00:00:00 2001 +From: Wu Fengguang +Date: Wed, 16 Dec 2009 12:19:57 +0100 +Subject: HWPOISON: remove the anonymous entry + +From: Wu Fengguang + +commit 9b9a29ecd75e310f75a9243e1c3538ad34598fcb upstream. + +(PG_swapbacked && !PG_lru) pages should not happen. +Better to treat them as unknown pages. + +Signed-off-by: Wu Fengguang +Signed-off-by: Andi Kleen +Signed-off-by: Thomas Renninger +Signed-off-by: Greg Kroah-Hartman + +--- + mm/memory-failure.c | 1 - + 1 file changed, 1 deletion(-) + +--- a/mm/memory-failure.c ++++ b/mm/memory-failure.c +@@ -589,7 +589,6 @@ static struct page_state { + + { lru|dirty, lru|dirty, "LRU", me_pagecache_dirty }, + { lru|dirty, lru, "clean LRU", me_pagecache_clean }, +- { swapbacked, swapbacked, "anonymous", me_pagecache_clean }, + + /* + * Catchall entry: must be at end. 
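[ Both hwpoison fixes above tighten the same contract; a condensed sketch of the unmap step as it now behaves. poison_unmap() is a simplified stand-in for hwpoison_user_mappings(), with the swapcache and kill-list handling omitted: ]

	#include <linux/mm.h>
	#include <linux/rmap.h>

	static int poison_unmap(struct page *p)
	{
		if (PageReserved(p) || PageSlab(p))
			return SWAP_SUCCESS;	/* kernel pages: handled elsewhere */
		if (!page_mapped(p))
			return SWAP_SUCCESS;	/* nothing left to unmap */
		if (PageCompound(p) || PageKsm(p))
			return SWAP_FAIL;	/* not unmappable yet: caller aborts */
		return try_to_unmap(p, TTU_UNMAP | TTU_IGNORE_MLOCK |
					TTU_IGNORE_ACCESS);
	}

	/* the caller now refuses to touch the page cache unless this
	 * returned SWAP_SUCCESS, avoiding BUG_ON(page_mapped(page)) */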
diff --git a/queue-2.6.32/ibmvfc-fix-command-completion-handling.patch b/queue-2.6.32/ibmvfc-fix-command-completion-handling.patch new file mode 100644 index 00000000000..f8958f4ae63 --- /dev/null +++ b/queue-2.6.32/ibmvfc-fix-command-completion-handling.patch @@ -0,0 +1,73 @@ +From f5832fa2f8dc39adcf3ae348d2d6383163235e79 Mon Sep 17 00:00:00 2001 +From: Brian King +Date: Tue, 20 Apr 2010 14:21:33 -0500 +Subject: [SCSI] ibmvfc: Fix command completion handling + +From: Brian King + +commit f5832fa2f8dc39adcf3ae348d2d6383163235e79 upstream. + +Commands which are completed by the VIOS are placed on a CRQ +in kernel memory for the ibmvfc driver to process. Each CRQ +entry is 16 bytes. The ibmvfc driver reads the first 8 bytes +to check if the entry is valid, then reads the next 8 bytes to get +the handle, which is a pointer the completed command. This fixes +an issue seen on Power 7 where the processor reordered the +loads from memory, resulting in processing command completion +with a stale handle. This could result in command timeouts, +and also early completion of commands. + +Signed-off-by: Brian King +Signed-off-by: James Bottomley +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/scsi/ibmvscsi/ibmvfc.c | 6 ++++++ + 1 file changed, 6 insertions(+) + +--- a/drivers/scsi/ibmvscsi/ibmvfc.c ++++ b/drivers/scsi/ibmvscsi/ibmvfc.c +@@ -2720,6 +2720,7 @@ static struct ibmvfc_async_crq *ibmvfc_n + if (crq->valid & 0x80) { + if (++async_crq->cur == async_crq->size) + async_crq->cur = 0; ++ rmb(); + } else + crq = NULL; + +@@ -2742,6 +2743,7 @@ static struct ibmvfc_crq *ibmvfc_next_cr + if (crq->valid & 0x80) { + if (++queue->cur == queue->size) + queue->cur = 0; ++ rmb(); + } else + crq = NULL; + +@@ -2790,12 +2792,14 @@ static void ibmvfc_tasklet(void *data) + while ((async = ibmvfc_next_async_crq(vhost)) != NULL) { + ibmvfc_handle_async(async, vhost); + async->valid = 0; ++ wmb(); + } + + /* Pull all the valid messages off the CRQ */ + while ((crq = ibmvfc_next_crq(vhost)) != NULL) { + ibmvfc_handle_crq(crq, vhost); + crq->valid = 0; ++ wmb(); + } + + vio_enable_interrupts(vdev); +@@ -2803,10 +2807,12 @@ static void ibmvfc_tasklet(void *data) + vio_disable_interrupts(vdev); + ibmvfc_handle_async(async, vhost); + async->valid = 0; ++ wmb(); + } else if ((crq = ibmvfc_next_crq(vhost)) != NULL) { + vio_disable_interrupts(vdev); + ibmvfc_handle_crq(crq, vhost); + crq->valid = 0; ++ wmb(); + } else + done = 1; + } diff --git a/queue-2.6.32/ibmvfc-reduce-error-recovery-timeout.patch b/queue-2.6.32/ibmvfc-reduce-error-recovery-timeout.patch new file mode 100644 index 00000000000..69de386571a --- /dev/null +++ b/queue-2.6.32/ibmvfc-reduce-error-recovery-timeout.patch @@ -0,0 +1,44 @@ +From daa142d1773dd3a986f02a8a4da929608d24daaa Mon Sep 17 00:00:00 2001 +From: Brian King +Date: Tue, 20 Apr 2010 14:21:35 -0500 +Subject: [SCSI] ibmvfc: Reduce error recovery timeout + +From: Brian King + +commit daa142d1773dd3a986f02a8a4da929608d24daaa upstream. + +If a command times out resulting in EH getting invoked, we wait for the +aborted commands to come back after sending the abort. Shorten +the amount of time we wait for these responses, to ensure we don't +get stuck in EH for several minutes. 
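[ On the completion-handling fix above: the bug and the cure reduce to a classic ordering problem between a valid flag and its payload. A minimal sketch, with the ring and entry layout simplified from the ibmvfc structures: ]

	#include <linux/types.h>
	#include <asm/system.h>		/* rmb()/wmb() in 2.6.32 */

	struct crq_entry {
		volatile u8 valid;	/* 0x80 set by the VIOS when ready */
		u8 pad[7];
		u64 handle;		/* refers to the completed command */
	};

	static struct crq_entry *next_crq(struct crq_entry *ring,
					  unsigned int *cur, unsigned int size)
	{
		struct crq_entry *crq = &ring[*cur];

		if (!(crq->valid & 0x80))
			return NULL;
		if (++*cur == size)
			*cur = 0;
		rmb();	/* order the 'valid' load before any 'handle' load,
			 * or a reordered read hands back a stale handle */
		return crq;
	}

	/* after handling an entry the driver publishes the clear:
	 *	crq->valid = 0;
	 *	wmb();
	 */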
+ +Signed-off-by: Brian King +Signed-off-by: James Bottomley +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/scsi/ibmvscsi/ibmvfc.c | 2 +- + drivers/scsi/ibmvscsi/ibmvfc.h | 1 + + 2 files changed, 2 insertions(+), 1 deletion(-) + +--- a/drivers/scsi/ibmvscsi/ibmvfc.c ++++ b/drivers/scsi/ibmvscsi/ibmvfc.c +@@ -1969,7 +1969,7 @@ static int ibmvfc_wait_for_ops(struct ib + DECLARE_COMPLETION_ONSTACK(comp); + int wait; + unsigned long flags; +- signed long timeout = init_timeout * HZ; ++ signed long timeout = IBMVFC_ABORT_WAIT_TIMEOUT * HZ; + + ENTER; + do { +--- a/drivers/scsi/ibmvscsi/ibmvfc.h ++++ b/drivers/scsi/ibmvscsi/ibmvfc.h +@@ -38,6 +38,7 @@ + #define IBMVFC_ADISC_PLUS_CANCEL_TIMEOUT \ + (IBMVFC_ADISC_TIMEOUT + IBMVFC_ADISC_CANCEL_TIMEOUT) + #define IBMVFC_INIT_TIMEOUT 120 ++#define IBMVFC_ABORT_WAIT_TIMEOUT 40 + #define IBMVFC_MAX_REQUESTS_DEFAULT 100 + + #define IBMVFC_DEBUG 0 diff --git a/queue-2.6.32/loop-update-mtime-when-writing-using-aops.patch b/queue-2.6.32/loop-update-mtime-when-writing-using-aops.patch new file mode 100644 index 00000000000..6021cd1a22e --- /dev/null +++ b/queue-2.6.32/loop-update-mtime-when-writing-using-aops.patch @@ -0,0 +1,31 @@ +From 02246c41171097ceab3246f6dc251ac89de6004b Mon Sep 17 00:00:00 2001 +From: Nikanth Karthikesan +Date: Thu, 8 Apr 2010 21:39:31 +0200 +Subject: loop: Update mtime when writing using aops + +From: Nikanth Karthikesan + +commit 02246c41171097ceab3246f6dc251ac89de6004b upstream. + +Update mtime when writing to backing filesystem using the address space +operations write_begin and write_end. + +Signed-off-by: Nikanth Karthikesan +Signed-off-by: Jens Axboe +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/block/loop.c | 2 ++ + 1 file changed, 2 insertions(+) + +--- a/drivers/block/loop.c ++++ b/drivers/block/loop.c +@@ -238,6 +238,8 @@ static int do_lo_send_aops(struct loop_d + if (ret) + goto fail; + ++ file_update_time(file); ++ + transfer_result = lo_do_transfer(lo, WRITE, page, offset, + bvec->bv_page, bv_offs, size, IV); + copied = size; diff --git a/queue-2.6.32/md-raid1-delay-reads-that-could-overtake-behind-writes.patch b/queue-2.6.32/md-raid1-delay-reads-that-could-overtake-behind-writes.patch new file mode 100644 index 00000000000..c200039a84b --- /dev/null +++ b/queue-2.6.32/md-raid1-delay-reads-that-could-overtake-behind-writes.patch @@ -0,0 +1,113 @@ +From e555190d82c0f58e825e3cbd9e6ebe2e7ac713bd Mon Sep 17 00:00:00 2001 +From: NeilBrown +Date: Wed, 31 Mar 2010 11:21:44 +1100 +Subject: md/raid1: delay reads that could overtake behind-writes. + +From: NeilBrown + +commit e555190d82c0f58e825e3cbd9e6ebe2e7ac713bd upstream. + +When a raid1 array is configured to support write-behind +on some devices, it normally only reads from other devices. +If all devices are write-behind (because the rest have failed) +it is possible for a read request to be serviced before a +behind-write request, which would appear as data corruption. + +So when forced to read from a WriteMostly device, wait for any +write-behind to complete, and don't start any more behind-writes. 
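[ The mechanism the hunks below add, reduced to its core: an atomic in-flight counter paired with a waitqueue. Only the md/bitmap plumbing is stripped away; the primitives shown are the real kernel ones: ]

	#include <linux/wait.h>
	#include <asm/atomic.h>

	static atomic_t behind_writes = ATOMIC_INIT(0);
	static DECLARE_WAIT_QUEUE_HEAD(behind_wait);

	static void behind_write_done(void)
	{
		/* the last completing write wakes any waiting reader */
		if (atomic_dec_and_test(&behind_writes))
			wake_up(&behind_wait);
	}

	static void read_from_write_mostly_device(void)
	{
		/* a read must not overtake writes still buffered behind */
		wait_event(behind_wait,
			   atomic_read(&behind_writes) == 0);
		/* safe to issue the read now */
	}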
+ +Signed-off-by: NeilBrown +Signed-off-by: Greg Kroah-Hartman + + +--- + drivers/md/bitmap.c | 4 +++- + drivers/md/bitmap.h | 3 +++ + drivers/md/raid1.c | 25 ++++++++++++++++++------- + 3 files changed, 24 insertions(+), 8 deletions(-) + +--- a/drivers/md/bitmap.c ++++ b/drivers/md/bitmap.c +@@ -1317,7 +1317,8 @@ void bitmap_endwrite(struct bitmap *bitm + { + if (!bitmap) return; + if (behind) { +- atomic_dec(&bitmap->behind_writes); ++ if (atomic_dec_and_test(&bitmap->behind_writes)) ++ wake_up(&bitmap->behind_wait); + PRINTK(KERN_DEBUG "dec write-behind count %d/%d\n", + atomic_read(&bitmap->behind_writes), bitmap->max_write_behind); + } +@@ -1629,6 +1630,7 @@ int bitmap_create(mddev_t *mddev) + atomic_set(&bitmap->pending_writes, 0); + init_waitqueue_head(&bitmap->write_wait); + init_waitqueue_head(&bitmap->overflow_wait); ++ init_waitqueue_head(&bitmap->behind_wait); + + bitmap->mddev = mddev; + +--- a/drivers/md/bitmap.h ++++ b/drivers/md/bitmap.h +@@ -254,6 +254,9 @@ struct bitmap { + wait_queue_head_t write_wait; + wait_queue_head_t overflow_wait; + ++#ifndef __GENKSYMS__ ++ wait_queue_head_t behind_wait; ++#endif + }; + + /* the bitmap API */ +--- a/drivers/md/raid1.c ++++ b/drivers/md/raid1.c +@@ -845,6 +845,15 @@ static int make_request(struct request_q + } + mirror = conf->mirrors + rdisk; + ++ if (test_bit(WriteMostly, &mirror->rdev->flags) && ++ bitmap) { ++ /* Reading from a write-mostly device must ++ * take care not to over-take any writes ++ * that are 'behind' ++ */ ++ wait_event(bitmap->behind_wait, ++ atomic_read(&bitmap->behind_writes) == 0); ++ } + r1_bio->read_disk = rdisk; + + read_bio = bio_clone(bio, GFP_NOIO); +@@ -922,9 +931,13 @@ static int make_request(struct request_q + set_bit(R1BIO_Degraded, &r1_bio->state); + } + +- /* do behind I/O ? */ ++ /* do behind I/O ? ++ * Not if there are too many, or cannot allocate memory, ++ * or a reader on WriteMostly is waiting for behind writes ++ * to flush */ + if (bitmap && + atomic_read(&bitmap->behind_writes) < bitmap->max_write_behind && ++ !waitqueue_active(&bitmap->behind_wait) && + (behind_pages = alloc_behind_pages(bio)) != NULL) + set_bit(R1BIO_BehindIO, &r1_bio->state); + +@@ -2105,15 +2118,13 @@ static int stop(mddev_t *mddev) + { + conf_t *conf = mddev->private; + struct bitmap *bitmap = mddev->bitmap; +- int behind_wait = 0; + + /* wait for behind writes to complete */ +- while (bitmap && atomic_read(&bitmap->behind_writes) > 0) { +- behind_wait++; +- printk(KERN_INFO "raid1: behind writes in progress on device %s, waiting to stop (%d)\n", mdname(mddev), behind_wait); +- set_current_state(TASK_UNINTERRUPTIBLE); +- schedule_timeout(HZ); /* wait a second */ ++ if (bitmap && atomic_read(&bitmap->behind_writes) > 0) { ++ printk(KERN_INFO "raid1: behind writes in progress on device %s, waiting to stop.\n", mdname(mddev)); + /* need to kick something here to make sure I/O goes? 
*/ ++ wait_event(bitmap->behind_wait, ++ atomic_read(&bitmap->behind_writes) == 0); + } + + raise_barrier(conf); diff --git a/queue-2.6.32/memory-hotplug-fix-a-bug-on-dev-mem-for-64-bit-kernels.patch b/queue-2.6.32/memory-hotplug-fix-a-bug-on-dev-mem-for-64-bit-kernels.patch new file mode 100644 index 00000000000..852c227dede --- /dev/null +++ b/queue-2.6.32/memory-hotplug-fix-a-bug-on-dev-mem-for-64-bit-kernels.patch @@ -0,0 +1,74 @@ +From ea0854170c95245a258b386c7a9314399c949fe0 Mon Sep 17 00:00:00 2001 +From: Shaohui Zheng +Date: Tue, 2 Feb 2010 13:44:16 -0800 +Subject: memory hotplug: fix a bug on /dev/mem for 64-bit kernels + +From: Shaohui Zheng + +commit ea0854170c95245a258b386c7a9314399c949fe0 upstream. + +Newly added memory can not be accessed via /dev/mem, because we do not +update the variables high_memory, max_pfn and max_low_pfn. + +Add a function update_end_of_memory_vars() to update these variables for +64-bit kernels. + +[akpm@linux-foundation.org: simplify comment] +Signed-off-by: Shaohui Zheng +Cc: Andi Kleen +Cc: Li Haicheng +Reviewed-by: Wu Fengguang +Reviewed-by: KAMEZAWA Hiroyuki +Cc: Ingo Molnar +Cc: Thomas Gleixner +Cc: "H. Peter Anvin" +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + arch/x86/mm/init_64.c | 19 +++++++++++++++++++ + 1 file changed, 19 insertions(+) + +--- a/arch/x86/mm/init_64.c ++++ b/arch/x86/mm/init_64.c +@@ -49,6 +49,7 @@ + #include + #include + #include ++#include + + static unsigned long dma_reserve __initdata; + +@@ -615,6 +616,21 @@ void __init paging_init(void) + */ + #ifdef CONFIG_MEMORY_HOTPLUG + /* ++ * After memory hotplug the variables max_pfn, max_low_pfn and high_memory need ++ * updating. ++ */ ++static void update_end_of_memory_vars(u64 start, u64 size) ++{ ++ unsigned long end_pfn = PFN_UP(start + size); ++ ++ if (end_pfn > max_pfn) { ++ max_pfn = end_pfn; ++ max_low_pfn = end_pfn; ++ high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1; ++ } ++} ++ ++/* + * Memory is added always to NORMAL zone. This means you will never get + * additional DMA/DMA32 memory. + */ +@@ -633,6 +649,9 @@ int arch_add_memory(int nid, u64 start, + ret = __add_pages(nid, zone, start_pfn, nr_pages); + WARN_ON_ONCE(ret); + ++ /* update max_pfn, max_low_pfn and high_memory */ ++ update_end_of_memory_vars(start, size); ++ + return ret; + } + EXPORT_SYMBOL_GPL(arch_add_memory); diff --git a/queue-2.6.32/mutex-don-t-spin-when-the-owner-cpu-is-offline-or-other-weird-cases.patch b/queue-2.6.32/mutex-don-t-spin-when-the-owner-cpu-is-offline-or-other-weird-cases.patch new file mode 100644 index 00000000000..98812308afd --- /dev/null +++ b/queue-2.6.32/mutex-don-t-spin-when-the-owner-cpu-is-offline-or-other-weird-cases.patch @@ -0,0 +1,67 @@ +From 4b402210486c6414fe5fbfd85934a0a22da56b04 Mon Sep 17 00:00:00 2001 +From: Benjamin Herrenschmidt +Date: Fri, 16 Apr 2010 23:20:00 +0200 +Subject: mutex: Don't spin when the owner CPU is offline or other weird cases + +From: Benjamin Herrenschmidt + +commit 4b402210486c6414fe5fbfd85934a0a22da56b04 upstream. + +Due to recent load-balancer changes that delay the task migration to +the next wakeup, the adaptive mutex spinning ends up in a live lock +when the owner's CPU gets offlined because the cpu_online() check +lives before the owner running check. + +This patch changes mutex_spin_on_owner() to return 0 (don't spin) in +any case where we aren't sure about the owner struct validity or CPU +number, and if the said CPU is offline. 
There is no point going back & +re-evaluate spinning in corner cases like that, let's just go to +sleep. + +Signed-off-by: Benjamin Herrenschmidt +Signed-off-by: Peter Zijlstra +LKML-Reference: <1271212509.13059.135.camel@pasglop> +Signed-off-by: Ingo Molnar +Signed-off-by: Greg Kroah-Hartman + +--- + kernel/sched.c | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + +--- a/kernel/sched.c ++++ b/kernel/sched.c +@@ -5590,7 +5590,7 @@ int mutex_spin_on_owner(struct mutex *lo + * the mutex owner just released it and exited. + */ + if (probe_kernel_address(&owner->cpu, cpu)) +- goto out; ++ return 0; + #else + cpu = owner->cpu; + #endif +@@ -5600,14 +5600,14 @@ int mutex_spin_on_owner(struct mutex *lo + * the cpu field may no longer be valid. + */ + if (cpu >= nr_cpumask_bits) +- goto out; ++ return 0; + + /* + * We need to validate that we can do a + * get_cpu() and that we have the percpu area. + */ + if (!cpu_online(cpu)) +- goto out; ++ return 0; + + rq = cpu_rq(cpu); + +@@ -5626,7 +5626,7 @@ int mutex_spin_on_owner(struct mutex *lo + + cpu_relax(); + } +-out: ++ + return 1; + } + #endif diff --git a/queue-2.6.32/nohz-introduce-arch_needs_cpu.patch b/queue-2.6.32/nohz-introduce-arch_needs_cpu.patch new file mode 100644 index 00000000000..97234a257fd --- /dev/null +++ b/queue-2.6.32/nohz-introduce-arch_needs_cpu.patch @@ -0,0 +1,121 @@ +From 3c5d92a0cfb5103c0d5ab74d4ae6373d3af38148 Mon Sep 17 00:00:00 2001 +From: Martin Schwidefsky +Date: Tue, 29 Sep 2009 14:25:16 +0200 +Subject: nohz: Introduce arch_needs_cpu + +From: Martin Schwidefsky + +commit 3c5d92a0cfb5103c0d5ab74d4ae6373d3af38148 upstream. + +Allow the architecture to request a normal jiffy tick when the system +goes idle and tick_nohz_stop_sched_tick is called . On s390 the hook is +used to prevent the system going fully idle if there has been an +interrupt other than a clock comparator interrupt since the last wakeup. + +On s390 the HiperSockets response time for 1 connection ping-pong goes +down from 42 to 34 microseconds. The CPU cost decreases by 27%. + +Signed-off-by: Martin Schwidefsky +LKML-Reference: <20090929122533.402715150@de.ibm.com> +Signed-off-by: Thomas Gleixner +Acked-by: John Jolly +Signed-off-by: Greg Kroah-Hartman + +--- + arch/s390/include/asm/cputime.h | 8 ++++++++ + arch/s390/kernel/s390_ext.c | 2 ++ + arch/s390/kernel/vtime.c | 2 ++ + drivers/s390/cio/cio.c | 1 + + include/linux/tick.h | 3 +++ + kernel/time/tick-sched.c | 13 ++++++++----- + 6 files changed, 24 insertions(+), 5 deletions(-) + +--- a/arch/s390/include/asm/cputime.h ++++ b/arch/s390/include/asm/cputime.h +@@ -183,6 +183,7 @@ struct s390_idle_data { + unsigned long long idle_count; + unsigned long long idle_enter; + unsigned long long idle_time; ++ int nohz_delay; + }; + + DECLARE_PER_CPU(struct s390_idle_data, s390_idle); +@@ -198,4 +199,11 @@ static inline void s390_idle_check(void) + vtime_start_cpu(); + } + ++static inline int s390_nohz_delay(int cpu) ++{ ++ return per_cpu(s390_idle, cpu).nohz_delay != 0; ++} ++ ++#define arch_needs_cpu(cpu) s390_nohz_delay(cpu) ++ + #endif /* _S390_CPUTIME_H */ +--- a/arch/s390/kernel/s390_ext.c ++++ b/arch/s390/kernel/s390_ext.c +@@ -126,6 +126,8 @@ void __irq_entry do_extint(struct pt_reg + /* Serve timer interrupts first. 
*/ + clock_comparator_work(); + kstat_cpu(smp_processor_id()).irqs[EXTERNAL_INTERRUPT]++; ++ if (code != 0x1004) ++ __get_cpu_var(s390_idle).nohz_delay = 1; + index = ext_hash(code); + for (p = ext_int_hash[index]; p; p = p->next) { + if (likely(p->code == code)) +--- a/arch/s390/kernel/vtime.c ++++ b/arch/s390/kernel/vtime.c +@@ -167,6 +167,8 @@ void vtime_stop_cpu(void) + /* Wait for external, I/O or machine check interrupt. */ + psw.mask = psw_kernel_bits | PSW_MASK_WAIT | PSW_MASK_IO | PSW_MASK_EXT; + ++ idle->nohz_delay = 0; ++ + /* Check if the CPU timer needs to be reprogrammed. */ + if (vq->do_spt) { + __u64 vmax = VTIMER_MAX_SLICE; +--- a/drivers/s390/cio/cio.c ++++ b/drivers/s390/cio/cio.c +@@ -618,6 +618,7 @@ void __irq_entry do_IRQ(struct pt_regs * + old_regs = set_irq_regs(regs); + s390_idle_check(); + irq_enter(); ++ __get_cpu_var(s390_idle).nohz_delay = 1; + if (S390_lowcore.int_clock >= S390_lowcore.clock_comparator) + /* Serve timer interrupts first. */ + clock_comparator_work(); +--- a/include/linux/tick.h ++++ b/include/linux/tick.h +@@ -98,6 +98,9 @@ extern int tick_check_oneshot_change(int + extern struct tick_sched *tick_get_tick_sched(int cpu); + extern void tick_check_idle(int cpu); + extern int tick_oneshot_mode_active(void); ++# ifndef arch_needs_cpu ++# define arch_needs_cpu(cpu) (0) ++# endif + # else + static inline void tick_clock_notify(void) { } + static inline int tick_check_oneshot_change(int allow_nohz) { return 0; } +--- a/kernel/time/tick-sched.c ++++ b/kernel/time/tick-sched.c +@@ -289,12 +289,15 @@ void tick_nohz_stop_sched_tick(int inidl + time_delta = KTIME_MAX; + } while (read_seqretry(&xtime_lock, seq)); + +- /* Get the next timer wheel timer */ +- next_jiffies = get_next_timer_interrupt(last_jiffies); +- delta_jiffies = next_jiffies - last_jiffies; +- +- if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu)) ++ if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu) || ++ arch_needs_cpu(cpu)) { ++ next_jiffies = last_jiffies + 1; + delta_jiffies = 1; ++ } else { ++ /* Get the next timer wheel timer */ ++ next_jiffies = get_next_timer_interrupt(last_jiffies); ++ delta_jiffies = next_jiffies - last_jiffies; ++ } + /* + * Do not stop the tick, if we are only one off + * or if the cpu is required for rcu diff --git a/queue-2.6.32/nohz-reuse-ktime-in-sub-functions-of-tick_check_idle.patch b/queue-2.6.32/nohz-reuse-ktime-in-sub-functions-of-tick_check_idle.patch new file mode 100644 index 00000000000..6107b7cc34a --- /dev/null +++ b/queue-2.6.32/nohz-reuse-ktime-in-sub-functions-of-tick_check_idle.patch @@ -0,0 +1,194 @@ +From eed3b9cf3fe3fcc7a50238dfcab63a63914e8f42 Mon Sep 17 00:00:00 2001 +From: Martin Schwidefsky +Date: Tue, 29 Sep 2009 14:25:15 +0200 +Subject: nohz: Reuse ktime in sub-functions of tick_check_idle. + +From: Martin Schwidefsky + +commit eed3b9cf3fe3fcc7a50238dfcab63a63914e8f42 upstream. + +On a system with NOHZ=y tick_check_idle calls tick_nohz_stop_idle and +tick_nohz_update_jiffies. Given the right conditions (ts->idle_active +and/or ts->tick_stopped) both function get a time stamp with ktime_get. +The same time stamp can be reused if both function require one. + +On s390 this change has the additional benefit that gcc inlines the +tick_nohz_stop_idle function into tick_check_idle. The number of +instructions to execute tick_check_idle drops from 225 to 144 +(without the ktime_get optimization it is 367 vs 215 instructions). 
+ +before: + + 0) | tick_check_idle() { + 0) | tick_nohz_stop_idle() { + 0) | ktime_get() { + 0) | read_tod_clock() { + 0) 0.601 us | } + 0) 1.765 us | } + 0) 3.047 us | } + 0) | ktime_get() { + 0) | read_tod_clock() { + 0) 0.570 us | } + 0) 1.727 us | } + 0) | tick_do_update_jiffies64() { + 0) 0.609 us | } + 0) 8.055 us | } + +after: + + 0) | tick_check_idle() { + 0) | ktime_get() { + 0) | read_tod_clock() { + 0) 0.617 us | } + 0) 1.773 us | } + 0) | tick_do_update_jiffies64() { + 0) 0.593 us | } + 0) 4.477 us | } + +Signed-off-by: Martin Schwidefsky +Cc: john stultz +LKML-Reference: <20090929122533.206589318@de.ibm.com> +Signed-off-by: Thomas Gleixner +Acked-by: John Jolly +Signed-off-by: Greg Kroah-Hartman + +--- + kernel/time/tick-sched.c | 62 +++++++++++++++++++++++++---------------------- + 1 file changed, 33 insertions(+), 29 deletions(-) + +--- a/kernel/time/tick-sched.c ++++ b/kernel/time/tick-sched.c +@@ -134,18 +134,13 @@ __setup("nohz=", setup_tick_nohz); + * value. We do this unconditionally on any cpu, as we don't know whether the + * cpu, which has the update task assigned is in a long sleep. + */ +-static void tick_nohz_update_jiffies(void) ++static void tick_nohz_update_jiffies(ktime_t now) + { + int cpu = smp_processor_id(); + struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); + unsigned long flags; +- ktime_t now; +- +- if (!ts->tick_stopped) +- return; + + cpumask_clear_cpu(cpu, nohz_cpu_mask); +- now = ktime_get(); + ts->idle_waketime = now; + + local_irq_save(flags); +@@ -155,20 +150,17 @@ static void tick_nohz_update_jiffies(voi + touch_softlockup_watchdog(); + } + +-static void tick_nohz_stop_idle(int cpu) ++static void tick_nohz_stop_idle(int cpu, ktime_t now) + { + struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); ++ ktime_t delta; + +- if (ts->idle_active) { +- ktime_t now, delta; +- now = ktime_get(); +- delta = ktime_sub(now, ts->idle_entrytime); +- ts->idle_lastupdate = now; +- ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta); +- ts->idle_active = 0; ++ delta = ktime_sub(now, ts->idle_entrytime); ++ ts->idle_lastupdate = now; ++ ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta); ++ ts->idle_active = 0; + +- sched_clock_idle_wakeup_event(0); +- } ++ sched_clock_idle_wakeup_event(0); + } + + static ktime_t tick_nohz_start_idle(struct tick_sched *ts) +@@ -463,7 +455,11 @@ void tick_nohz_restart_sched_tick(void) + ktime_t now; + + local_irq_disable(); +- tick_nohz_stop_idle(cpu); ++ if (ts->idle_active || (ts->inidle && ts->tick_stopped)) ++ now = ktime_get(); ++ ++ if (ts->idle_active) ++ tick_nohz_stop_idle(cpu, now); + + if (!ts->inidle || !ts->tick_stopped) { + ts->inidle = 0; +@@ -477,7 +473,6 @@ void tick_nohz_restart_sched_tick(void) + + /* Update jiffies first */ + select_nohz_load_balancer(0); +- now = ktime_get(); + tick_do_update_jiffies64(now); + cpumask_clear_cpu(cpu, nohz_cpu_mask); + +@@ -611,22 +606,18 @@ static void tick_nohz_switch_to_nohz(voi + * timer and do not touch the other magic bits which need to be done + * when idle is left. + */ +-static void tick_nohz_kick_tick(int cpu) ++static void tick_nohz_kick_tick(int cpu, ktime_t now) + { + #if 0 + /* Switch back to 2.6.27 behaviour */ + + struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); +- ktime_t delta, now; +- +- if (!ts->tick_stopped) +- return; ++ ktime_t delta; + + /* + * Do not touch the tick device, when the next expiry is either + * already reached or less/equal than the tick period. 
+ */ +- now = ktime_get(); + delta = ktime_sub(hrtimer_get_expires(&ts->sched_timer), now); + if (delta.tv64 <= tick_period.tv64) + return; +@@ -635,9 +626,26 @@ static void tick_nohz_kick_tick(int cpu) + #endif + } + ++static inline void tick_check_nohz(int cpu) ++{ ++ struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); ++ ktime_t now; ++ ++ if (!ts->idle_active && !ts->tick_stopped) ++ return; ++ now = ktime_get(); ++ if (ts->idle_active) ++ tick_nohz_stop_idle(cpu, now); ++ if (ts->tick_stopped) { ++ tick_nohz_update_jiffies(now); ++ tick_nohz_kick_tick(cpu, now); ++ } ++} ++ + #else + + static inline void tick_nohz_switch_to_nohz(void) { } ++static inline void tick_check_nohz(int cpu) { } + + #endif /* NO_HZ */ + +@@ -647,11 +655,7 @@ static inline void tick_nohz_switch_to_n + void tick_check_idle(int cpu) + { + tick_check_oneshot_broadcast(cpu); +-#ifdef CONFIG_NO_HZ +- tick_nohz_stop_idle(cpu); +- tick_nohz_update_jiffies(); +- tick_nohz_kick_tick(cpu); +-#endif ++ tick_check_nohz(cpu); + } + + /* diff --git a/queue-2.6.32/ocfs2-find-proper-end-cpos-for-a-leaf-refcount-block.patch b/queue-2.6.32/ocfs2-find-proper-end-cpos-for-a-leaf-refcount-block.patch new file mode 100644 index 00000000000..6a150f7b75b --- /dev/null +++ b/queue-2.6.32/ocfs2-find-proper-end-cpos-for-a-leaf-refcount-block.patch @@ -0,0 +1,254 @@ +From 38a04e432768ec0b016f3c687b4de31ac111ae59 Mon Sep 17 00:00:00 2001 +From: Tao Ma +Date: Mon, 30 Nov 2009 14:32:19 +0800 +Subject: ocfs2: Find proper end cpos for a leaf refcount block. + +From: Tao Ma + +commit 38a04e432768ec0b016f3c687b4de31ac111ae59 upstream. + +ocfs2 refcount tree is stored as an extent tree while +the leaf ocfs2_refcount_rec points to a refcount block. + +The following step can trip a kernel panic. +mkfs.ocfs2 -b 512 -C 1M --fs-features=refcount $DEVICE +mount -t ocfs2 $DEVICE $MNT_DIR +FILE_NAME=$RANDOM +FILE_NAME_1=$RANDOM +FILE_REF="${FILE_NAME}_ref" +FILE_REF_1="${FILE_NAME}_ref_1" +for((i=0;i<305;i++)) +do +# /mnt/1048576 is a file with 1048576 sizes. +cat /mnt/1048576 >> $MNT_DIR/$FILE_NAME +cat /mnt/1048576 >> $MNT_DIR/$FILE_NAME_1 +done +for((i=0;i<3;i++)) +do +cat /mnt/1048576 >> $MNT_DIR/$FILE_NAME +done + +for((i=0;i<2;i++)) +do +cat /mnt/1048576 >> $MNT_DIR/$FILE_NAME +cat /mnt/1048576 >> $MNT_DIR/$FILE_NAME_1 +done + +cat /mnt/1048576 >> $MNT_DIR/$FILE_NAME + +for((i=0;i<11;i++)) +do +cat /mnt/1048576 >> $MNT_DIR/$FILE_NAME +cat /mnt/1048576 >> $MNT_DIR/$FILE_NAME_1 +done +reflink $MNT_DIR/$FILE_NAME $MNT_DIR/$FILE_REF +# write_f is a program which will write some bytes to a file at offset. +# write_f -f file_name -l offset -w write_bytes. +./write_f -f $MNT_DIR/$FILE_REF -l $[310*1048576] -w 4096 +./write_f -f $MNT_DIR/$FILE_REF -l $[306*1048576] -w 4096 +./write_f -f $MNT_DIR/$FILE_REF -l $[311*1048576] -w 4096 +./write_f -f $MNT_DIR/$FILE_NAME -l $[310*1048576] -w 4096 +./write_f -f $MNT_DIR/$FILE_NAME -l $[311*1048576] -w 4096 +reflink $MNT_DIR/$FILE_NAME $MNT_DIR/$FILE_REF_1 +./write_f -f $MNT_DIR/$FILE_NAME -l $[311*1048576] -w 4096 +#kernel panic here. + +The reason is that if the ocfs2_extent_rec is the last record +in a leaf extent block, the old solution fails to find the +suitable end cpos. So this patch try to walk through the b-tree, +find the next sub root and get the c_pos the next sub-tree starts +from. + +btw, I have runned tristan's test case against the patched kernel +for several days and this type of kernel panic never happens again. 
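+
+The walk can be pictured on a toy two-level tree: the end of a leaf's
+last record is the separator key after the deepest ancestor where the
+path does not take the last child. A simplified, self-contained sketch
+(hypothetical structures, not the ocfs2 types):
+
+    #include <stdio.h>
+
+    struct level {
+            unsigned int cpos[4];  /* start cpos of each child */
+            int nr;                /* children in use */
+            int slot;              /* which child the path takes */
+    };
+
+    static unsigned int end_cpos(struct level *path, int depth)
+    {
+            for (int i = depth - 1; i >= 0; i--)
+                    if (path[i].slot < path[i].nr - 1)
+                            return path[i].cpos[path[i].slot + 1];
+            return ~0u;            /* rightmost leaf: ends at UINT_MAX */
+    }
+
+    int main(void)
+    {
+            struct level path[2] = {
+                    { .cpos = {0, 100, 200}, .nr = 3, .slot = 1 }, /* root */
+                    { .cpos = {100, 150},    .nr = 2, .slot = 1 }, /* last */
+            };
+            /* prints 200: found at the root, the "subtree root" here */
+            printf("end cpos = %u\n", end_cpos(path, 2));
+            return 0;
+    }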
+ +Signed-off-by: Tao Ma +Signed-off-by: Joel Becker +Signed-off-by: Greg Kroah-Hartman + +--- + fs/ocfs2/alloc.c | 10 ++-- + fs/ocfs2/alloc.h | 5 ++ + fs/ocfs2/refcounttree.c | 117 ++++++++++++++++++++++++++++++++++++++++++++---- + 3 files changed, 119 insertions(+), 13 deletions(-) + +--- a/fs/ocfs2/alloc.c ++++ b/fs/ocfs2/alloc.c +@@ -1765,9 +1765,9 @@ set_and_inc: + * + * The array index of the subtree root is passed back. + */ +-static int ocfs2_find_subtree_root(struct ocfs2_extent_tree *et, +- struct ocfs2_path *left, +- struct ocfs2_path *right) ++int ocfs2_find_subtree_root(struct ocfs2_extent_tree *et, ++ struct ocfs2_path *left, ++ struct ocfs2_path *right) + { + int i = 0; + +@@ -2872,8 +2872,8 @@ out: + * This looks similar, but is subtly different to + * ocfs2_find_cpos_for_left_leaf(). + */ +-static int ocfs2_find_cpos_for_right_leaf(struct super_block *sb, +- struct ocfs2_path *path, u32 *cpos) ++int ocfs2_find_cpos_for_right_leaf(struct super_block *sb, ++ struct ocfs2_path *path, u32 *cpos) + { + int i, j, ret = 0; + u64 blkno; +--- a/fs/ocfs2/alloc.h ++++ b/fs/ocfs2/alloc.h +@@ -317,4 +317,9 @@ int ocfs2_path_bh_journal_access(handle_ + int ocfs2_journal_access_path(struct ocfs2_caching_info *ci, + handle_t *handle, + struct ocfs2_path *path); ++int ocfs2_find_cpos_for_right_leaf(struct super_block *sb, ++ struct ocfs2_path *path, u32 *cpos); ++int ocfs2_find_subtree_root(struct ocfs2_extent_tree *et, ++ struct ocfs2_path *left, ++ struct ocfs2_path *right); + #endif /* OCFS2_ALLOC_H */ +--- a/fs/ocfs2/refcounttree.c ++++ b/fs/ocfs2/refcounttree.c +@@ -969,6 +969,103 @@ out: + } + + /* ++ * Find the end range for a leaf refcount block indicated by ++ * el->l_recs[index].e_blkno. ++ */ ++static int ocfs2_get_refcount_cpos_end(struct ocfs2_caching_info *ci, ++ struct buffer_head *ref_root_bh, ++ struct ocfs2_extent_block *eb, ++ struct ocfs2_extent_list *el, ++ int index, u32 *cpos_end) ++{ ++ int ret, i, subtree_root; ++ u32 cpos; ++ u64 blkno; ++ struct super_block *sb = ocfs2_metadata_cache_get_super(ci); ++ struct ocfs2_path *left_path = NULL, *right_path = NULL; ++ struct ocfs2_extent_tree et; ++ struct ocfs2_extent_list *tmp_el; ++ ++ if (index < le16_to_cpu(el->l_next_free_rec) - 1) { ++ /* ++ * We have a extent rec after index, so just use the e_cpos ++ * of the next extent rec. ++ */ ++ *cpos_end = le32_to_cpu(el->l_recs[index+1].e_cpos); ++ return 0; ++ } ++ ++ if (!eb || (eb && !eb->h_next_leaf_blk)) { ++ /* ++ * We are the last extent rec, so any high cpos should ++ * be stored in this leaf refcount block. ++ */ ++ *cpos_end = UINT_MAX; ++ return 0; ++ } ++ ++ /* ++ * If the extent block isn't the last one, we have to find ++ * the subtree root between this extent block and the next ++ * leaf extent block and get the corresponding e_cpos from ++ * the subroot. Otherwise we may corrupt the b-tree. 
++ */ ++ ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh); ++ ++ left_path = ocfs2_new_path_from_et(&et); ++ if (!left_path) { ++ ret = -ENOMEM; ++ mlog_errno(ret); ++ goto out; ++ } ++ ++ cpos = le32_to_cpu(eb->h_list.l_recs[index].e_cpos); ++ ret = ocfs2_find_path(ci, left_path, cpos); ++ if (ret) { ++ mlog_errno(ret); ++ goto out; ++ } ++ ++ right_path = ocfs2_new_path_from_path(left_path); ++ if (!right_path) { ++ ret = -ENOMEM; ++ mlog_errno(ret); ++ goto out; ++ } ++ ++ ret = ocfs2_find_cpos_for_right_leaf(sb, left_path, &cpos); ++ if (ret) { ++ mlog_errno(ret); ++ goto out; ++ } ++ ++ ret = ocfs2_find_path(ci, right_path, cpos); ++ if (ret) { ++ mlog_errno(ret); ++ goto out; ++ } ++ ++ subtree_root = ocfs2_find_subtree_root(&et, left_path, ++ right_path); ++ ++ tmp_el = left_path->p_node[subtree_root].el; ++ blkno = left_path->p_node[subtree_root+1].bh->b_blocknr; ++ for (i = 0; i < le32_to_cpu(tmp_el->l_next_free_rec); i++) { ++ if (le64_to_cpu(tmp_el->l_recs[i].e_blkno) == blkno) { ++ *cpos_end = le32_to_cpu(tmp_el->l_recs[i+1].e_cpos); ++ break; ++ } ++ } ++ ++ BUG_ON(i == le32_to_cpu(tmp_el->l_next_free_rec)); ++ ++out: ++ ocfs2_free_path(left_path); ++ ocfs2_free_path(right_path); ++ return ret; ++} ++ ++/* + * Given a cpos and len, try to find the refcount record which contains cpos. + * 1. If cpos can be found in one refcount record, return the record. + * 2. If cpos can't be found, return a fake record which start from cpos +@@ -983,10 +1080,10 @@ static int ocfs2_get_refcount_rec(struct + struct buffer_head **ret_bh) + { + int ret = 0, i, found; +- u32 low_cpos; ++ u32 low_cpos, uninitialized_var(cpos_end); + struct ocfs2_extent_list *el; +- struct ocfs2_extent_rec *tmp, *rec = NULL; +- struct ocfs2_extent_block *eb; ++ struct ocfs2_extent_rec *rec = NULL; ++ struct ocfs2_extent_block *eb = NULL; + struct buffer_head *eb_bh = NULL, *ref_leaf_bh = NULL; + struct super_block *sb = ocfs2_metadata_cache_get_super(ci); + struct ocfs2_refcount_block *rb = +@@ -1034,12 +1131,16 @@ static int ocfs2_get_refcount_rec(struct + } + } + +- /* adjust len when we have ocfs2_extent_rec after it. */ +- if (found && i < le16_to_cpu(el->l_next_free_rec) - 1) { +- tmp = &el->l_recs[i+1]; ++ if (found) { ++ ret = ocfs2_get_refcount_cpos_end(ci, ref_root_bh, ++ eb, el, i, &cpos_end); ++ if (ret) { ++ mlog_errno(ret); ++ goto out; ++ } + +- if (le32_to_cpu(tmp->e_cpos) < cpos + len) +- len = le32_to_cpu(tmp->e_cpos) - cpos; ++ if (cpos_end < low_cpos + len) ++ len = cpos_end - low_cpos; + } + + ret = ocfs2_read_refcount_block(ci, le64_to_cpu(rec->e_blkno), diff --git a/queue-2.6.32/ocfs2-set-ms_posixacl-on-remount.patch b/queue-2.6.32/ocfs2-set-ms_posixacl-on-remount.patch new file mode 100644 index 00000000000..0ddbc3a5d17 --- /dev/null +++ b/queue-2.6.32/ocfs2-set-ms_posixacl-on-remount.patch @@ -0,0 +1,35 @@ +From 57b09bb5e492c37c1e4273fe4e435ffd1d2ddbe0 Mon Sep 17 00:00:00 2001 +From: Jan Kara +Date: Thu, 15 Oct 2009 14:54:05 +0200 +Subject: ocfs2: Set MS_POSIXACL on remount + +From: Jan Kara + +commit 57b09bb5e492c37c1e4273fe4e435ffd1d2ddbe0 upstream. + +We have to set MS_POSIXACL on remount as well. Otherwise VFS +would not know we started supporting ACLs after remount and +thus ACLs would not work. 
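+
+The update itself is the usual clear-then-set idiom on the superblock
+flags; a minimal sketch (the 1 << 16 value matches MS_POSIXACL in this
+kernel, the helper is illustrative):
+
+    #define MS_POSIXACL (1 << 16)  /* sb flag the VFS checks for ACLs */
+
+    static void sync_posixacl_flag(unsigned long *s_flags, int want_acl)
+    {
+            *s_flags = (*s_flags & ~MS_POSIXACL) |
+                       (want_acl ? MS_POSIXACL : 0);
+    }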
+ +Signed-off-by: Jan Kara +Signed-off-by: Joel Becker +Signed-off-by: Mark Fasheh +Signed-off-by: Greg Kroah-Hartman + +--- + fs/ocfs2/super.c | 4 ++++ + 1 file changed, 4 insertions(+) + +--- a/fs/ocfs2/super.c ++++ b/fs/ocfs2/super.c +@@ -701,6 +701,10 @@ unlock_osb: + + if (!ocfs2_is_hard_readonly(osb)) + ocfs2_set_journal_params(osb); ++ ++ sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | ++ ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) ? ++ MS_POSIXACL : 0); + } + out: + unlock_kernel(); diff --git a/queue-2.6.32/powerpc-eeh-fix-a-bug-when-pci-structure-is-null.patch b/queue-2.6.32/powerpc-eeh-fix-a-bug-when-pci-structure-is-null.patch new file mode 100644 index 00000000000..d250cce9465 --- /dev/null +++ b/queue-2.6.32/powerpc-eeh-fix-a-bug-when-pci-structure-is-null.patch @@ -0,0 +1,105 @@ +From 8d3d50bf1913561ef3b1f5b53115c5a481ba9b1e Mon Sep 17 00:00:00 2001 +From: Breno Leitao +Date: Wed, 3 Feb 2010 05:56:41 +0000 +Subject: powerpc/eeh: Fix a bug when pci structure is null + +From: Breno Leitao + +commit 8d3d50bf1913561ef3b1f5b53115c5a481ba9b1e upstream. + +During a EEH recover, the pci_dev structure can be null, mainly if an +eeh event is detected during cpi config operation. In this case, the +pci_dev will not be known (and will be null) the kernel will crash +with the following message: + +Unable to handle kernel paging request for data at address 0x000000a0 +Faulting instruction address: 0xc00000000006b8b4 +Oops: Kernel access of bad area, sig: 11 [#1] + +NIP [c00000000006b8b4] .eeh_event_handler+0x10c/0x1a0 +LR [c00000000006b8a8] .eeh_event_handler+0x100/0x1a0 +Call Trace: +[c0000003a80dff00] [c00000000006b8a8] .eeh_event_handler+0x100/0x1a0 +[c0000003a80dff90] [c000000000031f1c] .kernel_thread+0x54/0x70 + +The bug occurs because pci_name() tries to access a null pointer. +This patch just guarantee that pci_name() is not called on Null pointers. + +Signed-off-by: Breno Leitao +Signed-off-by: Linas Vepstas +Signed-off-by: Benjamin Herrenschmidt +Acked-by: Jeff Mahoney +Signed-off-by: Greg Kroah-Hartman + +--- + arch/powerpc/include/asm/ppc-pci.h | 5 +++++ + arch/powerpc/platforms/pseries/eeh.c | 4 ++-- + arch/powerpc/platforms/pseries/eeh_driver.c | 4 ++-- + arch/powerpc/platforms/pseries/eeh_event.c | 2 +- + 4 files changed, 10 insertions(+), 5 deletions(-) + +--- a/arch/powerpc/include/asm/ppc-pci.h ++++ b/arch/powerpc/include/asm/ppc-pci.h +@@ -137,6 +137,11 @@ struct device_node * find_device_pe(stru + void eeh_sysfs_add_device(struct pci_dev *pdev); + void eeh_sysfs_remove_device(struct pci_dev *pdev); + ++static inline const char *eeh_pci_name(struct pci_dev *pdev) ++{ ++ return pdev ? 
pci_name(pdev) : ""; ++} ++ + #endif /* CONFIG_EEH */ + + #else /* CONFIG_PCI */ +--- a/arch/powerpc/platforms/pseries/eeh.c ++++ b/arch/powerpc/platforms/pseries/eeh.c +@@ -491,7 +491,7 @@ int eeh_dn_check_failure(struct device_n + pdn->eeh_mode & EEH_MODE_NOCHECK) { + ignored_check++; + pr_debug("EEH: Ignored check (%x) for %s %s\n", +- pdn->eeh_mode, pci_name (dev), dn->full_name); ++ pdn->eeh_mode, eeh_pci_name(dev), dn->full_name); + return 0; + } + +@@ -515,7 +515,7 @@ int eeh_dn_check_failure(struct device_n + printk (KERN_ERR "EEH: %d reads ignored for recovering device at " + "location=%s driver=%s pci addr=%s\n", + pdn->eeh_check_count, location, +- dev->driver->name, pci_name(dev)); ++ dev->driver->name, eeh_pci_name(dev)); + printk (KERN_ERR "EEH: Might be infinite loop in %s driver\n", + dev->driver->name); + dump_stack(); +--- a/arch/powerpc/platforms/pseries/eeh_driver.c ++++ b/arch/powerpc/platforms/pseries/eeh_driver.c +@@ -353,7 +353,7 @@ struct pci_dn * handle_eeh_events (struc + location = location ? location : "unknown"; + printk(KERN_ERR "EEH: Error: Cannot find partition endpoint " + "for location=%s pci addr=%s\n", +- location, pci_name(event->dev)); ++ location, eeh_pci_name(event->dev)); + return NULL; + } + +@@ -384,7 +384,7 @@ struct pci_dn * handle_eeh_events (struc + pci_str = pci_name (frozen_pdn->pcidev); + drv_str = pcid_name (frozen_pdn->pcidev); + } else { +- pci_str = pci_name (event->dev); ++ pci_str = eeh_pci_name(event->dev); + drv_str = pcid_name (event->dev); + } + +--- a/arch/powerpc/platforms/pseries/eeh_event.c ++++ b/arch/powerpc/platforms/pseries/eeh_event.c +@@ -80,7 +80,7 @@ static int eeh_event_handler(void * dumm + eeh_mark_slot(event->dn, EEH_MODE_RECOVERING); + + printk(KERN_INFO "EEH: Detected PCI bus error on device %s\n", +- pci_name(event->dev)); ++ eeh_pci_name(event->dev)); + + pdn = handle_eeh_events(event); + diff --git a/queue-2.6.32/reiserfs-fix-oops-while-creating-privroot-with-selinux-enabled.patch b/queue-2.6.32/reiserfs-fix-oops-while-creating-privroot-with-selinux-enabled.patch new file mode 100644 index 00000000000..7814d20220b --- /dev/null +++ b/queue-2.6.32/reiserfs-fix-oops-while-creating-privroot-with-selinux-enabled.patch @@ -0,0 +1,56 @@ +From 6cb4aff0a77cc0e6bae9475d62205319e3ebbf3f Mon Sep 17 00:00:00 2001 +From: Jeff Mahoney +Date: Tue, 23 Mar 2010 13:35:38 -0700 +Subject: reiserfs: fix oops while creating privroot with selinux enabled + +From: Jeff Mahoney + +commit 6cb4aff0a77cc0e6bae9475d62205319e3ebbf3f upstream. + +Commit 57fe60df ("reiserfs: add atomic addition of selinux attributes +during inode creation") contains a bug that will cause it to oops when +mounting a file system that didn't previously contain extended attributes +on a system using security.* xattrs. + +The issue is that while creating the privroot during mount +reiserfs_security_init calls reiserfs_xattr_jcreate_nblocks which +dereferences the xattr root. The xattr root doesn't exist, so we get an +oops. 
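+
+The fix is the standard guard for an optional subsystem: test that the
+xattr root exists before sizing a transaction that would touch it. A
+minimal sketch of the pattern (simplified stand-in types):
+
+    #include <stddef.h>
+
+    struct sb_info { void *priv_root; };  /* stand-in for REISERFS_SB() */
+
+    static inline int xattrs_initialized(struct sb_info *s)
+    {
+            return s->priv_root != NULL;
+    }
+
+    /* only size the xattr part of the transaction when it can exist */
+    static int need_xattr_credits(struct sb_info *s, int sec_len)
+    {
+            return sec_len && xattrs_initialized(s);
+    }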
+ +Addresses http://bugzilla.kernel.org/show_bug.cgi?id=15309 + +Signed-off-by: Jeff Mahoney +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + fs/reiserfs/xattr_security.c | 2 +- + include/linux/reiserfs_xattr.h | 5 +++++ + 2 files changed, 6 insertions(+), 1 deletion(-) + +--- a/fs/reiserfs/xattr_security.c ++++ b/fs/reiserfs/xattr_security.c +@@ -75,7 +75,7 @@ int reiserfs_security_init(struct inode + return error; + } + +- if (sec->length) { ++ if (sec->length && reiserfs_xattrs_initialized(inode->i_sb)) { + blocks = reiserfs_xattr_jcreate_nblocks(inode) + + reiserfs_xattr_nblocks(inode, sec->length); + /* We don't want to count the directories twice if we have +--- a/include/linux/reiserfs_xattr.h ++++ b/include/linux/reiserfs_xattr.h +@@ -70,6 +70,11 @@ int reiserfs_security_write(struct reise + void reiserfs_security_free(struct reiserfs_security_handle *sec); + #endif + ++static inline int reiserfs_xattrs_initialized(struct super_block *sb) ++{ ++ return REISERFS_SB(sb)->priv_root != NULL; ++} ++ + #define xattr_size(size) ((size) + sizeof(struct reiserfs_xattr_header)) + static inline loff_t reiserfs_xattr_nblocks(struct inode *inode, loff_t size) + { diff --git a/queue-2.6.32/reiserfs-properly-honor-read-only-devices.patch b/queue-2.6.32/reiserfs-properly-honor-read-only-devices.patch new file mode 100644 index 00000000000..265606b1275 --- /dev/null +++ b/queue-2.6.32/reiserfs-properly-honor-read-only-devices.patch @@ -0,0 +1,71 @@ +From 3f8b5ee33293d43ca360771b535dfae8c57259dc Mon Sep 17 00:00:00 2001 +From: Jeff Mahoney +Date: Tue, 23 Mar 2010 13:35:39 -0700 +Subject: reiserfs: properly honor read-only devices + +From: Jeff Mahoney + +commit 3f8b5ee33293d43ca360771b535dfae8c57259dc upstream. + +The reiserfs journal behaves inconsistently when determining whether to +allow a mount of a read-only device. + +This is due to the use of the continue_replay variable to short circuit +the journal scanning. If it's set, it's assumed that there are +transactions to replay, but there may not be. If it's unset, it's assumed +that there aren't any, and that may not be the case either. + +I've observed two failure cases: +1) Where a clean file system on a read-only device refuses to mount +2) Where a clean file system on a read-only device passes the + optimization and then tries writing the journal header to update + the latest mount id. + +The former is easily observable by using a freshly created file system on +a read-only loopback device. + +This patch moves the check into journal_read_transaction, where it can +bail out before it's about to replay a transaction. That way it can go +through and skip transactions where appropriate, yet still refuse to mount +a file system with outstanding transactions. 
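+
+The resulting control flow, reduced to its essentials (illustrative
+signature, not the reiserfs one):
+
+    #include <errno.h>
+
+    static int replay_journal(int dev_read_only, int pending_transactions)
+    {
+            if (!pending_transactions)
+                    return 0;        /* clean log: read-only device mounts */
+            if (dev_read_only)
+                    return -EROFS;   /* dirty log on a read-only device */
+            /* ... replay, then write back the journal header ... */
+            return 0;
+    }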
+ +Signed-off-by: Jeff Mahoney +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + fs/reiserfs/journal.c | 15 +++++++++------ + 1 file changed, 9 insertions(+), 6 deletions(-) + +--- a/fs/reiserfs/journal.c ++++ b/fs/reiserfs/journal.c +@@ -2184,6 +2184,15 @@ static int journal_read_transaction(stru + brelse(d_bh); + return 1; + } ++ ++ if (bdev_read_only(sb->s_bdev)) { ++ reiserfs_warning(sb, "clm-2076", ++ "device is readonly, unable to replay log"); ++ brelse(c_bh); ++ brelse(d_bh); ++ return -EROFS; ++ } ++ + trans_id = get_desc_trans_id(desc); + /* now we know we've got a good transaction, and it was inside the valid time ranges */ + log_blocks = kmalloc(get_desc_trans_len(desc) * +@@ -2422,12 +2431,6 @@ static int journal_read(struct super_blo + goto start_log_replay; + } + +- if (continue_replay && bdev_read_only(sb->s_bdev)) { +- reiserfs_warning(sb, "clm-2076", +- "device is readonly, unable to replay log"); +- return -1; +- } +- + /* ok, there are transactions that need to be replayed. start with the first log block, find + ** all the valid transactions, and pick out the oldest. + */ diff --git a/queue-2.6.32/sched-cputime-introduce-thread_group_times.patch b/queue-2.6.32/sched-cputime-introduce-thread_group_times.patch new file mode 100644 index 00000000000..9550a702fdd --- /dev/null +++ b/queue-2.6.32/sched-cputime-introduce-thread_group_times.patch @@ -0,0 +1,320 @@ +From 0cf55e1ec08bb5a22e068309e2d8ba1180ab4239 Mon Sep 17 00:00:00 2001 +From: Hidetoshi Seto +Date: Wed, 2 Dec 2009 17:28:07 +0900 +Subject: sched, cputime: Introduce thread_group_times() + +From: Hidetoshi Seto + +commit 0cf55e1ec08bb5a22e068309e2d8ba1180ab4239 upstream. + +This is a real fix for problem of utime/stime values decreasing +described in the thread: + + http://lkml.org/lkml/2009/11/3/522 + +Now cputime is accounted in the following way: + + - {u,s}time in task_struct are increased every time when the thread + is interrupted by a tick (timer interrupt). + + - When a thread exits, its {u,s}time are added to signal->{u,s}time, + after adjusted by task_times(). + + - When all threads in a thread_group exits, accumulated {u,s}time + (and also c{u,s}time) in signal struct are added to c{u,s}time + in signal struct of the group's parent. + +So {u,s}time in task struct are "raw" tick count, while +{u,s}time and c{u,s}time in signal struct are "adjusted" values. + +And accounted values are used by: + + - task_times(), to get cputime of a thread: + This function returns adjusted values that originates from raw + {u,s}time and scaled by sum_exec_runtime that accounted by CFS. + + - thread_group_cputime(), to get cputime of a thread group: + This function returns sum of all {u,s}time of living threads in + the group, plus {u,s}time in the signal struct that is sum of + adjusted cputimes of all exited threads belonged to the group. + +The problem is the return value of thread_group_cputime(), +because it is mixed sum of "raw" value and "adjusted" value: + + group's {u,s}time = foreach(thread){{u,s}time} + exited({u,s}time) + +This misbehavior can break {u,s}time monotonicity. +Assume that if there is a thread that have raw values greater +than adjusted values (e.g. interrupted by 1000Hz ticks 50 times +but only runs 45ms) and if it exits, cputime will decrease (e.g. +-5ms). 
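+
+With made-up but representative numbers, the mixed sum goes backwards;
+a toy illustration:
+
+    #include <stdio.h>
+
+    int main(void)
+    {
+            long others = 100;  /* raw ms from still-living threads     */
+            long raw    = 50;   /* exiting thread: 50 ticks = 50 ms raw */
+            long adj    = 45;   /* task_times(): it only ran 45 ms      */
+
+            printf("group utime before exit: %ld ms\n", others + raw);
+            printf("group utime after  exit: %ld ms\n", others + adj);
+            return 0;           /* 150 ms, then 145 ms: not monotonic */
+    }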
+ +To fix this, we could do: + + group's {u,s}time = foreach(t){task_times(t)} + exited({u,s}time) + +But task_times() contains hard divisions, so applying it for +every thread should be avoided. + +This patch fixes the above problem in the following way: + + - Modify thread's exit (= __exit_signal()) not to use task_times(). + It means {u,s}time in signal struct accumulates raw values instead + of adjusted values. As the result it makes thread_group_cputime() + to return pure sum of "raw" values. + + - Introduce a new function thread_group_times(*task, *utime, *stime) + that converts "raw" values of thread_group_cputime() to "adjusted" + values, in same calculation procedure as task_times(). + + - Modify group's exit (= wait_task_zombie()) to use this introduced + thread_group_times(). It make c{u,s}time in signal struct to + have adjusted values like before this patch. + + - Replace some thread_group_cputime() by thread_group_times(). + This replacements are only applied where conveys the "adjusted" + cputime to users, and where already uses task_times() near by it. + (i.e. sys_times(), getrusage(), and /proc//stat.) + +This patch have a positive side effect: + + - Before this patch, if a group contains many short-life threads + (e.g. runs 0.9ms and not interrupted by ticks), the group's + cputime could be invisible since thread's cputime was accumulated + after adjusted: imagine adjustment function as adj(ticks, runtime), + {adj(0, 0.9) + adj(0, 0.9) + ....} = {0 + 0 + ....} = 0. + After this patch it will not happen because the adjustment is + applied after accumulated. + +v2: + - remove if()s, put new variables into signal_struct. + +Signed-off-by: Hidetoshi Seto +Acked-by: Peter Zijlstra +Cc: Spencer Candland +Cc: Americo Wang +Cc: Oleg Nesterov +Cc: Balbir Singh +Cc: Stanislaw Gruszka +LKML-Reference: <4B162517.8040909@jp.fujitsu.com> +Signed-off-by: Ingo Molnar +Signed-off-by: Jiri Slaby +Signed-off-by: Greg Kroah-Hartman + + +--- + fs/proc/array.c | 5 +---- + include/linux/sched.h | 4 ++++ + kernel/exit.c | 20 ++++++++++++-------- + kernel/fork.c | 3 +++ + kernel/sched.c | 41 +++++++++++++++++++++++++++++++++++++++++ + kernel/sys.c | 18 ++++++++---------- + 6 files changed, 69 insertions(+), 22 deletions(-) + +--- a/fs/proc/array.c ++++ b/fs/proc/array.c +@@ -405,7 +405,6 @@ static int do_task_stat(struct seq_file + + /* add up live thread stats at the group level */ + if (whole) { +- struct task_cputime cputime; + struct task_struct *t = task; + do { + min_flt += t->min_flt; +@@ -416,9 +415,7 @@ static int do_task_stat(struct seq_file + + min_flt += sig->min_flt; + maj_flt += sig->maj_flt; +- thread_group_cputime(task, &cputime); +- utime = cputime.utime; +- stime = cputime.stime; ++ thread_group_times(task, &utime, &stime); + gtime = cputime_add(gtime, sig->gtime); + } + +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -628,6 +628,9 @@ struct signal_struct { + cputime_t utime, stime, cutime, cstime; + cputime_t gtime; + cputime_t cgtime; ++#ifndef CONFIG_VIRT_CPU_ACCOUNTING ++ cputime_t prev_utime, prev_stime; ++#endif + unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw; + unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt; + unsigned long inblock, oublock, cinblock, coublock; +@@ -1725,6 +1728,7 @@ static inline void put_task_struct(struc + extern cputime_t task_utime(struct task_struct *p); + extern cputime_t task_stime(struct task_struct *p); + extern cputime_t task_gtime(struct task_struct *p); ++extern void thread_group_times(struct task_struct *p, cputime_t *ut, 
cputime_t *st); + + /* + * Per process flags +--- a/kernel/exit.c ++++ b/kernel/exit.c +@@ -110,8 +110,8 @@ static void __exit_signal(struct task_st + * We won't ever get here for the group leader, since it + * will have been the last reference on the signal_struct. + */ +- sig->utime = cputime_add(sig->utime, task_utime(tsk)); +- sig->stime = cputime_add(sig->stime, task_stime(tsk)); ++ sig->utime = cputime_add(sig->utime, tsk->utime); ++ sig->stime = cputime_add(sig->stime, tsk->stime); + sig->gtime = cputime_add(sig->gtime, task_gtime(tsk)); + sig->min_flt += tsk->min_flt; + sig->maj_flt += tsk->maj_flt; +@@ -1205,6 +1205,7 @@ static int wait_task_zombie(struct wait_ + struct signal_struct *psig; + struct signal_struct *sig; + unsigned long maxrss; ++ cputime_t tgutime, tgstime; + + /* + * The resource counters for the group leader are in its +@@ -1220,20 +1221,23 @@ static int wait_task_zombie(struct wait_ + * need to protect the access to parent->signal fields, + * as other threads in the parent group can be right + * here reaping other children at the same time. ++ * ++ * We use thread_group_times() to get times for the thread ++ * group, which consolidates times for all threads in the ++ * group including the group leader. + */ ++ thread_group_times(p, &tgutime, &tgstime); + spin_lock_irq(&p->real_parent->sighand->siglock); + psig = p->real_parent->signal; + sig = p->signal; + psig->cutime = + cputime_add(psig->cutime, +- cputime_add(p->utime, +- cputime_add(sig->utime, +- sig->cutime))); ++ cputime_add(tgutime, ++ sig->cutime)); + psig->cstime = + cputime_add(psig->cstime, +- cputime_add(p->stime, +- cputime_add(sig->stime, +- sig->cstime))); ++ cputime_add(tgstime, ++ sig->cstime)); + psig->cgtime = + cputime_add(psig->cgtime, + cputime_add(p->gtime, +--- a/kernel/fork.c ++++ b/kernel/fork.c +@@ -884,6 +884,9 @@ static int copy_signal(unsigned long clo + sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero; + sig->gtime = cputime_zero; + sig->cgtime = cputime_zero; ++#ifndef CONFIG_VIRT_CPU_ACCOUNTING ++ sig->prev_utime = sig->prev_stime = cputime_zero; ++#endif + sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0; + sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0; + sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0; +--- a/kernel/sched.c ++++ b/kernel/sched.c +@@ -5215,6 +5215,16 @@ cputime_t task_stime(struct task_struct + { + return p->stime; + } ++ ++void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) ++{ ++ struct task_cputime cputime; ++ ++ thread_group_cputime(p, &cputime); ++ ++ *ut = cputime.utime; ++ *st = cputime.stime; ++} + #else + + #ifndef nsecs_to_cputime +@@ -5258,6 +5268,37 @@ cputime_t task_stime(struct task_struct + + return p->prev_stime; + } ++ ++/* ++ * Must be called with siglock held. 
++ */ ++void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) ++{ ++ struct signal_struct *sig = p->signal; ++ struct task_cputime cputime; ++ cputime_t rtime, utime, total; ++ ++ thread_group_cputime(p, &cputime); ++ ++ total = cputime_add(cputime.utime, cputime.stime); ++ rtime = nsecs_to_cputime(cputime.sum_exec_runtime); ++ ++ if (total) { ++ u64 temp; ++ ++ temp = (u64)(rtime * cputime.utime); ++ do_div(temp, total); ++ utime = (cputime_t)temp; ++ } else ++ utime = rtime; ++ ++ sig->prev_utime = max(sig->prev_utime, utime); ++ sig->prev_stime = max(sig->prev_stime, ++ cputime_sub(rtime, sig->prev_utime)); ++ ++ *ut = sig->prev_utime; ++ *st = sig->prev_stime; ++} + #endif + + inline cputime_t task_gtime(struct task_struct *p) +--- a/kernel/sys.c ++++ b/kernel/sys.c +@@ -911,16 +911,15 @@ change_okay: + + void do_sys_times(struct tms *tms) + { +- struct task_cputime cputime; +- cputime_t cutime, cstime; ++ cputime_t tgutime, tgstime, cutime, cstime; + +- thread_group_cputime(current, &cputime); + spin_lock_irq(¤t->sighand->siglock); ++ thread_group_times(current, &tgutime, &tgstime); + cutime = current->signal->cutime; + cstime = current->signal->cstime; + spin_unlock_irq(¤t->sighand->siglock); +- tms->tms_utime = cputime_to_clock_t(cputime.utime); +- tms->tms_stime = cputime_to_clock_t(cputime.stime); ++ tms->tms_utime = cputime_to_clock_t(tgutime); ++ tms->tms_stime = cputime_to_clock_t(tgstime); + tms->tms_cutime = cputime_to_clock_t(cutime); + tms->tms_cstime = cputime_to_clock_t(cstime); + } +@@ -1338,8 +1337,7 @@ static void k_getrusage(struct task_stru + { + struct task_struct *t; + unsigned long flags; +- cputime_t utime, stime; +- struct task_cputime cputime; ++ cputime_t tgutime, tgstime, utime, stime; + unsigned long maxrss = 0; + + memset((char *) r, 0, sizeof *r); +@@ -1373,9 +1371,9 @@ static void k_getrusage(struct task_stru + break; + + case RUSAGE_SELF: +- thread_group_cputime(p, &cputime); +- utime = cputime_add(utime, cputime.utime); +- stime = cputime_add(stime, cputime.stime); ++ thread_group_times(p, &tgutime, &tgstime); ++ utime = cputime_add(utime, tgutime); ++ stime = cputime_add(stime, tgstime); + r->ru_nvcsw += p->signal->nvcsw; + r->ru_nivcsw += p->signal->nivcsw; + r->ru_minflt += p->signal->min_flt; diff --git a/queue-2.6.32/sched-fix-granularity-of-task_u-stime.patch b/queue-2.6.32/sched-fix-granularity-of-task_u-stime.patch new file mode 100644 index 00000000000..c57e80f825e --- /dev/null +++ b/queue-2.6.32/sched-fix-granularity-of-task_u-stime.patch @@ -0,0 +1,103 @@ +From 761b1d26df542fd5eb348837351e4d2f3bc7bffe Mon Sep 17 00:00:00 2001 +From: Hidetoshi Seto +Date: Thu, 12 Nov 2009 13:33:45 +0900 +Subject: sched: Fix granularity of task_u/stime() + +From: Hidetoshi Seto + +commit 761b1d26df542fd5eb348837351e4d2f3bc7bffe upstream. + +Originally task_s/utime() were designed to return clock_t but +later changed to return cputime_t by following commit: + + commit efe567fc8281661524ffa75477a7c4ca9b466c63 + Author: Christian Borntraeger + Date: Thu Aug 23 15:18:02 2007 +0200 + +It only changed the type of return value, but not the +implementation. As the result the granularity of task_s/utime() +is still that of clock_t, not that of cputime_t. + +So using task_s/utime() in __exit_signal() makes values +accumulated to the signal struct to be rounded and coarse +grained. + +This patch removes casts to clock_t in task_u/stime(), to keep +granularity of cputime_t over the calculation. 
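+
+The rounding is easy to see with assumed configuration values, HZ=1000
+(cputime_t counting 1 ms jiffies) and USER_HZ=100 (clock_t counting
+10 ms units):
+
+    #include <stdio.h>
+
+    int main(void)
+    {
+            unsigned long utime = 123;  /* 123 ms of cpu time */
+            unsigned long via_clock_t = (utime / 10) * 10;
+
+            /* prints 120 vs 123: up to 9 ms lost per accumulation */
+            printf("accumulated %lu ms instead of %lu ms\n",
+                   via_clock_t, utime);
+            return 0;
+    }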
+ +v2: + Use div_u64() to avoid error "undefined reference to `__udivdi3`" + on some 32bit systems. + +Signed-off-by: Hidetoshi Seto +Acked-by: Peter Zijlstra +Cc: xiyou.wangcong@gmail.com +Cc: Spencer Candland +Cc: Oleg Nesterov +Cc: Stanislaw Gruszka +LKML-Reference: <4AFB9029.9000208@jp.fujitsu.com> +Signed-off-by: Ingo Molnar +Signed-off-by: Jiri Slaby +Signed-off-by: Greg Kroah-Hartman + +--- + kernel/sched.c | 22 +++++++++++++--------- + 1 file changed, 13 insertions(+), 9 deletions(-) + +--- a/kernel/sched.c ++++ b/kernel/sched.c +@@ -5216,41 +5216,45 @@ cputime_t task_stime(struct task_struct + return p->stime; + } + #else ++ ++#ifndef nsecs_to_cputime ++# define nsecs_to_cputime(__nsecs) \ ++ msecs_to_cputime(div_u64((__nsecs), NSEC_PER_MSEC)) ++#endif ++ + cputime_t task_utime(struct task_struct *p) + { +- clock_t utime = cputime_to_clock_t(p->utime), +- total = utime + cputime_to_clock_t(p->stime); ++ cputime_t utime = p->utime, total = utime + p->stime; + u64 temp; + + /* + * Use CFS's precise accounting: + */ +- temp = (u64)nsec_to_clock_t(p->se.sum_exec_runtime); ++ temp = (u64)nsecs_to_cputime(p->se.sum_exec_runtime); + + if (total) { + temp *= utime; + do_div(temp, total); + } +- utime = (clock_t)temp; ++ utime = (cputime_t)temp; + +- p->prev_utime = max(p->prev_utime, clock_t_to_cputime(utime)); ++ p->prev_utime = max(p->prev_utime, utime); + return p->prev_utime; + } + + cputime_t task_stime(struct task_struct *p) + { +- clock_t stime; ++ cputime_t stime; + + /* + * Use CFS's precise accounting. (we subtract utime from + * the total, to make sure the total observed by userspace + * grows monotonically - apps rely on that): + */ +- stime = nsec_to_clock_t(p->se.sum_exec_runtime) - +- cputime_to_clock_t(task_utime(p)); ++ stime = nsecs_to_cputime(p->se.sum_exec_runtime) - task_utime(p); + + if (stime >= 0) +- p->prev_stime = max(p->prev_stime, clock_t_to_cputime(stime)); ++ p->prev_stime = max(p->prev_stime, stime); + + return p->prev_stime; + } diff --git a/queue-2.6.32/series b/queue-2.6.32/series index 9f0301de405..3eb261dfe89 100644 --- a/queue-2.6.32/series +++ b/queue-2.6.32/series @@ -40,3 +40,71 @@ irq-add-new-irq-flag-irqf_no_suspend.patch xen-do-not-suspend-ipi-irqs.patch ext4-fix-freeze-deadlock-under-io.patch drm-i915-use-rsen-instead-of-htplg-for-tfp410-monitor-detection.patch +btrfs-avoid-superfluous-tree-log-writeout.patch +btrfs-add-btrfs_duplicate_item.patch +btrfs-rewrite-btrfs_drop_extents.patch +btrfs-fix-disk_i_size-update-corner-case.patch +btrfs-avoid-orphan-inodes-cleanup-while-replaying-log.patch +btrfs-avoid-orphan-inodes-cleanup-during-committing-transaction.patch +btrfs-make-fallocate-2-more-enospc-friendly.patch +btrfs-make-truncate-2-more-enospc-friendly.patch +btrfs-pass-transaction-handle-to-security-and-acl-initialization-functions.patch +btrfs-add-delayed-iput.patch +btrfs-fix-btrfs_drop_extent_cache-for-skip-pinned-case.patch +btrfs-fix-per-root-used-space-accounting.patch +btrfs-don-t-add-extent-0-to-the-free-space-cache-v2.patch +btrfs-fail-mount-on-bad-mount-options.patch +btrfs-deny-sys_link-across-subvolumes.patch +btrfs-show-discard-option-in-proc-mounts.patch +btrfs-make-metadata-chunks-smaller.patch +btrfs-make-sure-fallocate-properly-starts-a-transaction.patch +btrfs-fix-missing-last-entry-in-readdir-3.patch +btrfs-align-offsets-for-btrfs_ordered_update_i_size.patch +btrfs-fix-memory-leaks-in-error-paths.patch +btrfs-fix-race-in-btrfs_mark_extent_written.patch +btrfs-fix-regression-in-orphan-cleanup.patch 
+btrfs-deal-with-null-acl-sent-to-btrfs_set_acl.patch +btrfs-fix-possible-panic-on-unmount.patch +btrfs-use-correct-values-when-updating-inode-i_size-on-fallocate.patch +btrfs-fix-a-memory-leak-in-btrfs_init_acl.patch +btrfs-run-orphan-cleanup-on-default-fs-root.patch +btrfs-do-not-mark-the-chunk-as-readonly-if-in-degraded-mode.patch +btrfs-check-return-value-of-open_bdev_exclusive-properly.patch +btrfs-check-total-number-of-devices-when-removing-missing.patch +btrfs-fix-race-between-allocate-and-release-extent-buffer.patch +btrfs-make-error-return-negative-in-btrfs_sync_file.patch +btrfs-remove-bug_on-due-to-mounting-bad-filesystem.patch +btrfs-fix-oopsen-when-dropping-empty-tree.patch +btrfs-do-not-try-and-lookup-the-file-extent-when-finishing-ordered-io.patch +btrfs-apply-updated-fallocate-i_size-fix.patch +btrfs-btrfs_mark_extent_written-uses-the-wrong-slot.patch +btrfs-kfree-correct-pointer-during-mount-option-parsing.patch +nohz-introduce-arch_needs_cpu.patch +nohz-reuse-ktime-in-sub-functions-of-tick_check_idle.patch +timekeeping-fix-clock_gettime-vsyscall-time-warp.patch +sched-fix-granularity-of-task_u-stime.patch +sched-cputime-introduce-thread_group_times.patch +mutex-don-t-spin-when-the-owner-cpu-is-offline-or-other-weird-cases.patch +fix-sba-iommu-to-handle-allocation-failure-properly.patch +crypto-testmgr-fix-complain-about-lack-test-for-internal-used-algorithm.patch +memory-hotplug-fix-a-bug-on-dev-mem-for-64-bit-kernels.patch +x86-fix-out-of-order-of-gsi.patch +hwpoison-remove-the-anonymous-entry.patch +hwpoison-abort-on-failed-unmap.patch +powerpc-eeh-fix-a-bug-when-pci-structure-is-null.patch +acpi-fix-regression-where-_ppc-is-not-read-at-boot-even-when-ignore_ppc-0.patch +ext4-make-sure-the-move_ext-ioctl-can-t-overwrite-append-only-files.patch +ext4-fix-optional-arg-mount-options.patch +reiserfs-properly-honor-read-only-devices.patch +reiserfs-fix-oops-while-creating-privroot-with-selinux-enabled.patch +dlm-always-use-gfp_nofs.patch +dlm-fix-ordering-of-bast-and-cast.patch +dlm-send-reply-before-bast.patch +ocfs2-find-proper-end-cpos-for-a-leaf-refcount-block.patch +ocfs2-set-ms_posixacl-on-remount.patch +skip-check-for-mandatory-locks-when-unlocking.patch +loop-update-mtime-when-writing-using-aops.patch +aic79xx-check-for-non-null-scb-in-ahd_handle_nonpkt_busfree.patch +ibmvfc-fix-command-completion-handling.patch +ibmvfc-reduce-error-recovery-timeout.patch +md-raid1-delay-reads-that-could-overtake-behind-writes.patch diff --git a/queue-2.6.32/skip-check-for-mandatory-locks-when-unlocking.patch b/queue-2.6.32/skip-check-for-mandatory-locks-when-unlocking.patch new file mode 100644 index 00000000000..bb8e7c563d4 --- /dev/null +++ b/queue-2.6.32/skip-check-for-mandatory-locks-when-unlocking.patch @@ -0,0 +1,36 @@ +From ee860b6a650360c91f5d5f9a94262aad9be90015 Mon Sep 17 00:00:00 2001 +From: Sachin Prabhu +Date: Wed, 10 Mar 2010 10:28:40 -0500 +Subject: [PATCH] Skip check for mandatory locks when unlocking + +From: Sachin Prabhu + +commit ee860b6a650360c91f5d5f9a94262aad9be90015 upstream. + +ocfs2_lock() will skip locks on file which has mode set to 02666. This +is a problem in cases where the mode of the file is changed after a +process has obtained a lock on the file. + +ocfs2_lock() should skip the check for mandatory locks when unlocking a +file. 
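+
+The sequence that trips it, as a hypothetical reproducer (assumes an
+ocfs2 mount honoring mandatory locks; on other filesystems the unlock
+succeeds either way):
+
+    #include <fcntl.h>
+    #include <stdio.h>
+    #include <sys/stat.h>
+    #include <unistd.h>
+
+    int main(void)
+    {
+            struct flock fl = { .l_type = F_WRLCK, .l_whence = SEEK_SET };
+            int fd = open("testfile", O_RDWR | O_CREAT, 0644);
+
+            if (fd < 0 || fcntl(fd, F_SETLK, &fl) < 0)
+                    return 1;
+            fchmod(fd, 02666);   /* setgid, no group exec: mandatory mode */
+
+            fl.l_type = F_UNLCK; /* before this fix: refused with ENOLCK */
+            if (fcntl(fd, F_SETLK, &fl) < 0)
+                    perror("unlock");
+            close(fd);
+            return 0;
+    }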
+ +Signed-off-by: Sachin Prabhu +Signed-off-by: Joel Becker +Signed-off-by: Neil Brown +Signed-off-by: Greg Kroah-Hartman + +--- + fs/ocfs2/locks.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/fs/ocfs2/locks.c ++++ b/fs/ocfs2/locks.c +@@ -133,7 +133,7 @@ int ocfs2_lock(struct file *file, int cm + + if (!(fl->fl_flags & FL_POSIX)) + return -ENOLCK; +- if (__mandatory_lock(inode)) ++ if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK) + return -ENOLCK; + + return ocfs2_plock(osb->cconn, OCFS2_I(inode)->ip_blkno, file, cmd, fl); diff --git a/queue-2.6.32/timekeeping-fix-clock_gettime-vsyscall-time-warp.patch b/queue-2.6.32/timekeeping-fix-clock_gettime-vsyscall-time-warp.patch new file mode 100644 index 00000000000..3ec98c2e567 --- /dev/null +++ b/queue-2.6.32/timekeeping-fix-clock_gettime-vsyscall-time-warp.patch @@ -0,0 +1,159 @@ +From 0696b711e4be45fa104c12329f617beb29c03f78 Mon Sep 17 00:00:00 2001 +From: Lin Ming +Date: Tue, 17 Nov 2009 13:49:50 +0800 +Subject: timekeeping: Fix clock_gettime vsyscall time warp + +From: Lin Ming + +commit 0696b711e4be45fa104c12329f617beb29c03f78 upstream. + +Since commit 0a544198 "timekeeping: Move NTP adjusted clock multiplier +to struct timekeeper" the clock multiplier of vsyscall is updated with +the unmodified clock multiplier of the clock source and not with the +NTP adjusted multiplier of the timekeeper. + +This causes user space observerable time warps: +new CLOCK-warp maximum: 120 nsecs, 00000025c337c537 -> 00000025c337c4bf + +Add a new argument "mult" to update_vsyscall() and hand in the +timekeeping internal NTP adjusted multiplier. + +Signed-off-by: Lin Ming +Cc: "Zhang Yanmin" +Cc: Martin Schwidefsky +Cc: Benjamin Herrenschmidt +Cc: Tony Luck +LKML-Reference: <1258436990.17765.83.camel@minggr.sh.intel.com> +Signed-off-by: Thomas Gleixner +Signed-off-by: Kurt Garloff +Signed-off-by: Greg Kroah-Hartman + +--- + arch/ia64/kernel/time.c | 4 ++-- + arch/powerpc/kernel/time.c | 5 +++-- + arch/s390/kernel/time.c | 3 ++- + arch/x86/kernel/vsyscall_64.c | 5 +++-- + include/linux/clocksource.h | 6 ++++-- + kernel/time/timekeeping.c | 6 +++--- + 6 files changed, 17 insertions(+), 12 deletions(-) + +--- a/arch/ia64/kernel/time.c ++++ b/arch/ia64/kernel/time.c +@@ -473,7 +473,7 @@ void update_vsyscall_tz(void) + { + } + +-void update_vsyscall(struct timespec *wall, struct clocksource *c) ++void update_vsyscall(struct timespec *wall, struct clocksource *c, u32 mult) + { + unsigned long flags; + +@@ -481,7 +481,7 @@ void update_vsyscall(struct timespec *wa + + /* copy fsyscall clock data */ + fsyscall_gtod_data.clk_mask = c->mask; +- fsyscall_gtod_data.clk_mult = c->mult; ++ fsyscall_gtod_data.clk_mult = mult; + fsyscall_gtod_data.clk_shift = c->shift; + fsyscall_gtod_data.clk_fsys_mmio = c->fsys_mmio; + fsyscall_gtod_data.clk_cycle_last = c->cycle_last; +--- a/arch/powerpc/kernel/time.c ++++ b/arch/powerpc/kernel/time.c +@@ -864,7 +864,8 @@ static cycle_t timebase_read(struct cloc + return (cycle_t)get_tb(); + } + +-void update_vsyscall(struct timespec *wall_time, struct clocksource *clock) ++void update_vsyscall(struct timespec *wall_time, struct clocksource *clock, ++ u32 mult) + { + u64 t2x, stamp_xsec; + +@@ -877,7 +878,7 @@ void update_vsyscall(struct timespec *wa + + /* XXX this assumes clock->shift == 22 */ + /* 4611686018 ~= 2^(20+64-22) / 1e9 */ +- t2x = (u64) clock->mult * 4611686018ULL; ++ t2x = (u64) mult * 4611686018ULL; + stamp_xsec = (u64) xtime.tv_nsec * XSEC_PER_SEC; + do_div(stamp_xsec, 1000000000); + stamp_xsec += 
+
+Signed-off-by: Lin Ming
+Cc: "Zhang Yanmin"
+Cc: Martin Schwidefsky
+Cc: Benjamin Herrenschmidt
+Cc: Tony Luck
+LKML-Reference: <1258436990.17765.83.camel@minggr.sh.intel.com>
+Signed-off-by: Thomas Gleixner
+Signed-off-by: Kurt Garloff
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ arch/ia64/kernel/time.c       |    4 ++--
+ arch/powerpc/kernel/time.c    |    5 +++--
+ arch/s390/kernel/time.c       |    3 ++-
+ arch/x86/kernel/vsyscall_64.c |    5 +++--
+ include/linux/clocksource.h   |    6 ++++--
+ kernel/time/timekeeping.c     |    6 +++---
+ 6 files changed, 17 insertions(+), 12 deletions(-)
+
+--- a/arch/ia64/kernel/time.c
++++ b/arch/ia64/kernel/time.c
+@@ -473,7 +473,7 @@ void update_vsyscall_tz(void)
+ {
+ }
+ 
+-void update_vsyscall(struct timespec *wall, struct clocksource *c)
++void update_vsyscall(struct timespec *wall, struct clocksource *c, u32 mult)
+ {
+ 	unsigned long flags;
+ 
+@@ -481,7 +481,7 @@ void update_vsyscall(struct timespec *wa
+ 
+ 	/* copy fsyscall clock data */
+ 	fsyscall_gtod_data.clk_mask = c->mask;
+-	fsyscall_gtod_data.clk_mult = c->mult;
++	fsyscall_gtod_data.clk_mult = mult;
+ 	fsyscall_gtod_data.clk_shift = c->shift;
+ 	fsyscall_gtod_data.clk_fsys_mmio = c->fsys_mmio;
+ 	fsyscall_gtod_data.clk_cycle_last = c->cycle_last;
+--- a/arch/powerpc/kernel/time.c
++++ b/arch/powerpc/kernel/time.c
+@@ -864,7 +864,8 @@ static cycle_t timebase_read(struct cloc
+ 	return (cycle_t)get_tb();
+ }
+ 
+-void update_vsyscall(struct timespec *wall_time, struct clocksource *clock)
++void update_vsyscall(struct timespec *wall_time, struct clocksource *clock,
++		     u32 mult)
+ {
+ 	u64 t2x, stamp_xsec;
+ 
+@@ -877,7 +878,7 @@ void update_vsyscall(struct timespec *wa
+ 
+ 	/* XXX this assumes clock->shift == 22 */
+ 	/* 4611686018 ~= 2^(20+64-22) / 1e9 */
+-	t2x = (u64) clock->mult * 4611686018ULL;
++	t2x = (u64) mult * 4611686018ULL;
+ 	stamp_xsec = (u64) xtime.tv_nsec * XSEC_PER_SEC;
+ 	do_div(stamp_xsec, 1000000000);
+ 	stamp_xsec += (u64) xtime.tv_sec * XSEC_PER_SEC;
+--- a/arch/s390/kernel/time.c
++++ b/arch/s390/kernel/time.c
+@@ -214,7 +214,8 @@ struct clocksource * __init clocksource_
+ 	return &clocksource_tod;
+ }
+ 
+-void update_vsyscall(struct timespec *wall_time, struct clocksource *clock)
++void update_vsyscall(struct timespec *wall_time, struct clocksource *clock,
++		     u32 mult)
+ {
+ 	if (clock != &clocksource_tod)
+ 		return;
+--- a/arch/x86/kernel/vsyscall_64.c
++++ b/arch/x86/kernel/vsyscall_64.c
+@@ -73,7 +73,8 @@ void update_vsyscall_tz(void)
+ 	write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
+ }
+ 
+-void update_vsyscall(struct timespec *wall_time, struct clocksource *clock)
++void update_vsyscall(struct timespec *wall_time, struct clocksource *clock,
++		     u32 mult)
+ {
+ 	unsigned long flags;
+ 
+@@ -82,7 +83,7 @@ void update_vsyscall(struct timespec *wa
+ 	vsyscall_gtod_data.clock.vread = clock->vread;
+ 	vsyscall_gtod_data.clock.cycle_last = clock->cycle_last;
+ 	vsyscall_gtod_data.clock.mask = clock->mask;
+-	vsyscall_gtod_data.clock.mult = clock->mult;
++	vsyscall_gtod_data.clock.mult = mult;
+ 	vsyscall_gtod_data.clock.shift = clock->shift;
+ 	vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec;
+ 	vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec;
+--- a/include/linux/clocksource.h
++++ b/include/linux/clocksource.h
+@@ -282,10 +282,12 @@ extern struct clocksource * __init __wea
+ extern void clocksource_mark_unstable(struct clocksource *cs);
+ 
+ #ifdef CONFIG_GENERIC_TIME_VSYSCALL
+-extern void update_vsyscall(struct timespec *ts, struct clocksource *c);
++extern void
++update_vsyscall(struct timespec *ts, struct clocksource *c, u32 mult);
+ extern void update_vsyscall_tz(void);
+ #else
+-static inline void update_vsyscall(struct timespec *ts, struct clocksource *c)
++static inline void
++update_vsyscall(struct timespec *ts, struct clocksource *c, u32 mult)
+ {
+ }
+ 
+--- a/kernel/time/timekeeping.c
++++ b/kernel/time/timekeeping.c
+@@ -177,7 +177,7 @@ void timekeeping_leap_insert(int leapsec
+ {
+ 	xtime.tv_sec += leapsecond;
+ 	wall_to_monotonic.tv_sec -= leapsecond;
+-	update_vsyscall(&xtime, timekeeper.clock);
++	update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult);
+ }
+ 
+ #ifdef CONFIG_GENERIC_TIME
+@@ -337,7 +337,7 @@ int do_settimeofday(struct timespec *tv)
+ 	timekeeper.ntp_error = 0;
+ 	ntp_clear();
+ 
+-	update_vsyscall(&xtime, timekeeper.clock);
++	update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult);
+ 
+ 	write_sequnlock_irqrestore(&xtime_lock, flags);
+ 
+@@ -822,7 +822,7 @@ void update_wall_time(void)
+ 	update_xtime_cache(nsecs);
+ 
+ 	/* check to see if there is a new clocksource to use */
+-	update_vsyscall(&xtime, timekeeper.clock);
++	update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult);
+ }
+ 
+ /**
diff --git a/queue-2.6.32/x86-fix-out-of-order-of-gsi.patch b/queue-2.6.32/x86-fix-out-of-order-of-gsi.patch
new file mode 100644
index 00000000000..467a7cfbc28
--- /dev/null
+++ b/queue-2.6.32/x86-fix-out-of-order-of-gsi.patch
@@ -0,0 +1,176 @@
+From fad539956c9e69749a03f7817d22d1bab87657bf Mon Sep 17 00:00:00 2001
+From: Eric W. Biederman
+Date: Sun, 28 Feb 2010 01:06:34 -0800
+Subject: x86: Fix out of order of gsi
+
+From: Eric W. Biederman
+
+commit fad539956c9e69749a03f7817d22d1bab87657bf upstream.
+
+Iranna D Ankad reported that IBM x3950 systems have boot
+problems after this commit:
+
+ |
+ | commit b9c61b70075c87a8612624736faf4a2de5b1ed30
+ |
+ |    x86/pci: update pirq_enable_irq() to setup io apic routing
+ |
+
+The problem is that with the patch, the machine freezes when
+the console=ttyS0,... kernel serial parameter is passed.
+
+It seems to freeze at DVD initialization, and the whole problem
+seems to be DVD/pata related, but somehow exposed through the
+serial parameter.
+
+Such apic problems can expose really weird behavior:
+
+  ACPI: IOAPIC (id[0x10] address[0xfecff000] gsi_base[0])
+  IOAPIC[0]: apic_id 16, version 0, address 0xfecff000, GSI 0-2
+  ACPI: IOAPIC (id[0x0f] address[0xfec00000] gsi_base[3])
+  IOAPIC[1]: apic_id 15, version 0, address 0xfec00000, GSI 3-38
+  ACPI: IOAPIC (id[0x0e] address[0xfec01000] gsi_base[39])
+  IOAPIC[2]: apic_id 14, version 0, address 0xfec01000, GSI 39-74
+  ACPI: INT_SRC_OVR (bus 0 bus_irq 1 global_irq 4 dfl dfl)
+  ACPI: INT_SRC_OVR (bus 0 bus_irq 0 global_irq 5 dfl dfl)
+  ACPI: INT_SRC_OVR (bus 0 bus_irq 3 global_irq 6 dfl dfl)
+  ACPI: INT_SRC_OVR (bus 0 bus_irq 4 global_irq 7 dfl dfl)
+  ACPI: INT_SRC_OVR (bus 0 bus_irq 6 global_irq 9 dfl dfl)
+  ACPI: INT_SRC_OVR (bus 0 bus_irq 7 global_irq 10 dfl dfl)
+  ACPI: INT_SRC_OVR (bus 0 bus_irq 8 global_irq 11 low edge)
+  ACPI: INT_SRC_OVR (bus 0 bus_irq 9 global_irq 12 dfl dfl)
+  ACPI: INT_SRC_OVR (bus 0 bus_irq 12 global_irq 15 dfl dfl)
+  ACPI: INT_SRC_OVR (bus 0 bus_irq 13 global_irq 16 dfl dfl)
+  ACPI: INT_SRC_OVR (bus 0 bus_irq 14 global_irq 17 low edge)
+  ACPI: INT_SRC_OVR (bus 0 bus_irq 15 global_irq 18 dfl dfl)
+
+It turns out that the system has three io apic controllers, but the
+boot ioapic routing is in the second one, and its gsi_base is not
+0 - it is using a bunch of INT_SRC_OVR...
+
+So these recent changes:
+
+ 1. only set routing for the first io apic controller
+ 2. assume irq = gsi
+
+... will break that system.
+
+So try to remap those gsis: boot_ioapic_idx detection needs to be
+separated out of enable_IO_APIC() and called early.
+
+So introduce boot_ioapic_idx, and remap_ioapic_gsi()...
+
+ -v2: shift gsi with delta instead of gsi_base of boot_ioapic_idx
+
+ -v3: double check with find_isa_irq_apic(0, mp_INT) to get right
+      boot_ioapic_idx
+
+ -v4: nr_legacy_irqs
+
+ -v5: add print out for boot_ioapic_idx, and also make it applicable
+      to current and previous kernels
+
+ -v6: add bus_irq, in acpi_sci_ioapic_setup, so the sci can get the
+      right override mapping...
+
+ -v7: looks like pnpacpi gets irq instead of gsi, so need to revert
+      them back...
+
+ -v8: split into two patches
+
+ -v9: according to Eric, use fixed 16 for shifting instead of remap
+
+ -v10: still need to touch rsparser.c
+
+ -v11: just revert back to the way Eric suggests...
+      anyway the ioapic in first ioapic is blocked by second...
+
+ -v12: two patches, this one will add more loop but check apic_id and irq > 16
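+
+A sketch of the gsi lookup this implies (illustrative pseudo-code,
+not part of the patch; the override table is hypothetical, with
+values copied from the boot log above):
+
+	/* IOAPIC[1] starts at gsi_base 3 and the ISA irqs are remapped
+	 * by INT_SRC_OVR entries, e.g. bus_irq 0 -> global_irq 5 and
+	 * bus_irq 1 -> global_irq 4, so the naive assumption irq == gsi
+	 * programs the wrong pin. */
+	struct irq_override { int bus_irq; int global_irq; };
+	static const struct irq_override override[] = {
+		{ 0, 5 }, { 1, 4 },	/* ... remaining entries ... */
+	};
+
+	int isa_irq_to_gsi(int isa_irq)
+	{
+		unsigned int i;
+
+		for (i = 0; i < sizeof(override) / sizeof(override[0]); i++)
+			if (override[i].bus_irq == isa_irq)
+				return override[i].global_irq;
+		return isa_irq;	/* identity only when no override exists */
+	}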
+
+Reported-by: Iranna D Ankad
+Bisected-by: Iranna D Ankad
+Tested-by: Gary Hade
+Signed-off-by: Yinghai Lu
+Cc: Eric W. Biederman
+Cc: Thomas Renninger
+Cc: Eric W. Biederman
+Cc: Suresh Siddha
+Cc: len.brown@intel.com
+LKML-Reference: <4B8A321A.1000008@kernel.org>
+Signed-off-by: Ingo Molnar
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ arch/x86/kernel/apic/io_apic.c |   28 ++++++++++------------------
+ 1 file changed, 10 insertions(+), 18 deletions(-)
+
+--- a/arch/x86/kernel/apic/io_apic.c
++++ b/arch/x86/kernel/apic/io_apic.c
+@@ -1484,7 +1484,7 @@ static struct {
+ 
+ static void __init setup_IO_APIC_irqs(void)
+ {
+-	int apic_id = 0, pin, idx, irq;
++	int apic_id, pin, idx, irq;
+ 	int notcon = 0;
+ 	struct irq_desc *desc;
+ 	struct irq_cfg *cfg;
+@@ -1492,14 +1492,7 @@ static void __init setup_IO_APIC_irqs(vo
+ 
+ 	apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
+ 
+-#ifdef CONFIG_ACPI
+-	if (!acpi_disabled && acpi_ioapic) {
+-		apic_id = mp_find_ioapic(0);
+-		if (apic_id < 0)
+-			apic_id = 0;
+-	}
+-#endif
+-
++	for (apic_id = 0; apic_id < nr_ioapics; apic_id++)
+ 	for (pin = 0; pin < nr_ioapic_registers[apic_id]; pin++) {
+ 		idx = find_irq_entry(apic_id, pin, mp_INT);
+ 		if (idx == -1) {
+@@ -1521,6 +1514,9 @@ static void __init setup_IO_APIC_irqs(vo
+ 
+ 		irq = pin_2_irq(idx, apic_id, pin);
+ 
++		if ((apic_id > 0) && (irq > 16))
++			continue;
++
+ 		/*
+ 		 * Skip the timer IRQ if there's a quirk handler
+ 		 * installed and if it returns 1:
+@@ -4083,27 +4079,23 @@ int acpi_get_override_irq(int bus_irq, i
+ #ifdef CONFIG_SMP
+ void __init setup_ioapic_dest(void)
+ {
+-	int pin, ioapic = 0, irq, irq_entry;
++	int pin, ioapic, irq, irq_entry;
+ 	struct irq_desc *desc;
+ 	const struct cpumask *mask;
+ 
+ 	if (skip_ioapic_setup == 1)
+ 		return;
+ 
+-#ifdef CONFIG_ACPI
+-	if (!acpi_disabled && acpi_ioapic) {
+-		ioapic = mp_find_ioapic(0);
+-		if (ioapic < 0)
+-			ioapic = 0;
+-	}
+-#endif
+-
++	for (ioapic = 0; ioapic < nr_ioapics; ioapic++)
+ 	for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) {
+ 		irq_entry = find_irq_entry(ioapic, pin, mp_INT);
+ 		if (irq_entry == -1)
+ 			continue;
+ 		irq = pin_2_irq(irq_entry, ioapic, pin);
+ 
++		if ((ioapic > 0) && (irq > 16))
++			continue;
++
+ 		desc = irq_to_desc(irq);
+ 
+ 		/*