From: Greg Kroah-Hartman <gregkh@suse.de>
Subject: revert ext4 changes in 2.6.27.19 and 2.6.27.20 and 2.6.27.25
Patch-mainline: no

As we are already taking a different version of ext4, revert the
changes that were made to ext4 in 2.6.27.19, 2.6.27.20, and 2.6.27.25.

Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
--- b/Documentation/filesystems/ext4.txt
+++ a/Documentation/filesystems/ext4.txt
@@ -73,7 +73,7 @@
* extent format more robust in face of on-disk corruption due to magics,
* internal redunancy in tree
* improved file allocation (multi-block alloc)
+* fix 32000 subdirectory limit
-* lift 32000 subdirectory limit imposed by i_links_count[1]
* nsec timestamps for mtime, atime, ctime, create time
* inode version field on disk (NFSv4, Lustre)
* reduced e2fsck time via uninit_bg feature
@@ -88,9 +88,6 @@
* efficent new ordered mode in JBD2 and ext4(avoid using buffer head to force
the ordering)

-[1] Filesystems with a block size of 1k may see a limit imposed by the
-directory hash tree having a maximum depth of two.
-
2.2 Candidate features for future inclusion

* Online defrag (patches available but not well tested)
reverted:
--- b/fs/ext4/balloc.c
+++ a/fs/ext4/balloc.c
@@ -20,7 +20,6 @@
#include "ext4.h"
#include "ext4_jbd2.h"
#include "group.h"
-#include "mballoc.h"

/*
* balloc.c contains the blocks allocation and deallocation routines
@@ -319,41 +318,18 @@
block_group, bitmap_blk);
return NULL;
}
+ if (bh_uptodate_or_lock(bh))
-
- if (bitmap_uptodate(bh))
return bh;

- lock_buffer(bh);
- if (bitmap_uptodate(bh)) {
- unlock_buffer(bh);
- return bh;
- }
spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group));
if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
ext4_init_block_bitmap(sb, bh, block_group, desc);
- set_bitmap_uptodate(bh);
set_buffer_uptodate(bh);
unlock_buffer(bh);
spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
return bh;
}
spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
- if (buffer_uptodate(bh)) {
- /*
- * if not uninit if bh is uptodate,
- * bitmap is also uptodate
- */
- set_bitmap_uptodate(bh);
- unlock_buffer(bh);
- return bh;
- }
- /*
- * submit the buffer_head for read. We can
- * safely mark the bitmap as uptodate now.
- * We do it here so the bitmap uptodate bit
- * get set with buffer lock held.
- */
- set_bitmap_uptodate(bh);
if (bh_submit_read(bh) < 0) {
put_bh(bh);
ext4_error(sb, __func__,
@@ -861,136 +837,6 @@
}

/**
- * ext4_add_groupblocks() -- Add given blocks to an existing group
- * @handle: handle to this transaction
- * @sb: super block
- * @block: start physcial block to add to the block group
- * @count: number of blocks to free
- *
- * This marks the blocks as free in the bitmap. We ask the
- * mballoc to reload the buddy after this by setting group
- * EXT4_GROUP_INFO_NEED_INIT_BIT flag
- */
-void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
- ext4_fsblk_t block, unsigned long count)
-{
- struct buffer_head *bitmap_bh = NULL;
- struct buffer_head *gd_bh;
- ext4_group_t block_group;
- ext4_grpblk_t bit;
- unsigned long i;
- struct ext4_group_desc *desc;
- struct ext4_super_block *es;
- struct ext4_sb_info *sbi;
- int err = 0, ret;
- ext4_grpblk_t blocks_freed;
- struct ext4_group_info *grp;
-
- sbi = EXT4_SB(sb);
- es = sbi->s_es;
- ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1);
-
- ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
- grp = ext4_get_group_info(sb, block_group);
- /*
- * Check to see if we are freeing blocks across a group
- * boundary.
- */
- if (bit + count > EXT4_BLOCKS_PER_GROUP(sb))
- goto error_return;
-
- bitmap_bh = ext4_read_block_bitmap(sb, block_group);
- if (!bitmap_bh)
- goto error_return;
- desc = ext4_get_group_desc(sb, block_group, &gd_bh);
- if (!desc)
- goto error_return;
-
- if (in_range(ext4_block_bitmap(sb, desc), block, count) ||
- in_range(ext4_inode_bitmap(sb, desc), block, count) ||
- in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) ||
- in_range(block + count - 1, ext4_inode_table(sb, desc),
- sbi->s_itb_per_group)) {
- ext4_error(sb, __func__,
- "Adding blocks in system zones - "
- "Block = %llu, count = %lu",
- block, count);
- goto error_return;
- }
-
- /*
- * We are about to add blocks to the bitmap,
- * so we need undo access.
- */
- BUFFER_TRACE(bitmap_bh, "getting undo access");
- err = ext4_journal_get_undo_access(handle, bitmap_bh);
- if (err)
- goto error_return;
-
- /*
- * We are about to modify some metadata. Call the journal APIs
- * to unshare ->b_data if a currently-committing transaction is
- * using it
- */
- BUFFER_TRACE(gd_bh, "get_write_access");
- err = ext4_journal_get_write_access(handle, gd_bh);
- if (err)
- goto error_return;
- /*
- * make sure we don't allow a parallel init on other groups in the
- * same buddy cache
- */
- down_write(&grp->alloc_sem);
- for (i = 0, blocks_freed = 0; i < count; i++) {
- BUFFER_TRACE(bitmap_bh, "clear bit");
- if (!ext4_clear_bit_atomic(sb_bgl_lock(sbi, block_group),
- bit + i, bitmap_bh->b_data)) {
- ext4_error(sb, __func__,
- "bit already cleared for block %llu",
- (ext4_fsblk_t)(block + i));
- BUFFER_TRACE(bitmap_bh, "bit already cleared");
- } else {
- blocks_freed++;
- }
- }
- spin_lock(sb_bgl_lock(sbi, block_group));
- le16_add_cpu(&desc->bg_free_blocks_count, blocks_freed);
- desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc);
- spin_unlock(sb_bgl_lock(sbi, block_group));
- percpu_counter_add(&sbi->s_freeblocks_counter, blocks_freed);
-
- if (sbi->s_log_groups_per_flex) {
- ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
- spin_lock(sb_bgl_lock(sbi, flex_group));
- sbi->s_flex_groups[flex_group].free_blocks += blocks_freed;
- spin_unlock(sb_bgl_lock(sbi, flex_group));
- }
- /*
- * request to reload the buddy with the
- * new bitmap information
- */
- set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));
- ext4_mb_update_group_info(grp, blocks_freed);
- up_write(&grp->alloc_sem);
-
- /* We dirtied the bitmap block */
- BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
- err = ext4_journal_dirty_metadata(handle, bitmap_bh);
-
- /* And the group descriptor block */
- BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
- ret = ext4_journal_dirty_metadata(handle, gd_bh);
- if (!err)
- err = ret;
- sb->s_dirt = 1;
-
-error_return:
- brelse(bitmap_bh);
- ext4_std_error(sb, err);
- return;
-}
-
-/**
* ext4_free_blocks() -- Free given blocks and update quota
* @handle: handle for this transaction
* @inode: inode
reverted:
--- b/fs/ext4/ext4.h
+++ a/fs/ext4/ext4.h
@@ -19,7 +19,6 @@
#include <linux/types.h>
#include <linux/blkdev.h>
#include <linux/magic.h>
-#include <linux/jbd2.h>
#include "ext4_i.h"

/*
@@ -248,30 +247,6 @@
#define EXT4_FL_USER_VISIBLE 0x000BDFFF /* User visible flags */
#define EXT4_FL_USER_MODIFIABLE 0x000380FF /* User modifiable flags */

-/* Flags that should be inherited by new inodes from their parent. */
-#define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\
- EXT4_SYNC_FL | EXT4_IMMUTABLE_FL | EXT4_APPEND_FL |\
- EXT4_NODUMP_FL | EXT4_NOATIME_FL |\
- EXT4_NOCOMPR_FL | EXT4_JOURNAL_DATA_FL |\
- EXT4_NOTAIL_FL | EXT4_DIRSYNC_FL)
-
-/* Flags that are appropriate for regular files (all but dir-specific ones). */
-#define EXT4_REG_FLMASK (~(EXT4_DIRSYNC_FL | EXT4_TOPDIR_FL))
-
-/* Flags that are appropriate for non-directories/regular files. */
-#define EXT4_OTHER_FLMASK (EXT4_NODUMP_FL | EXT4_NOATIME_FL)
-
-/* Mask out flags that are inappropriate for the given type of inode. */
-static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags)
-{
- if (S_ISDIR(mode))
- return flags;
- else if (S_ISREG(mode))
- return flags & EXT4_REG_FLMASK;
- else
- return flags & EXT4_OTHER_FLMASK;
-}
-
/*
* Inode dynamic state flags
*/
@@ -279,7 +254,6 @@
#define EXT4_STATE_NEW 0x00000002 /* inode is newly created */
#define EXT4_STATE_XATTR 0x00000004 /* has in-inode xattrs */
#define EXT4_STATE_NO_EXPAND 0x00000008 /* No space for expansion */
-#define EXT4_STATE_DA_ALLOC_CLOSE 0x00000010 /* Alloc DA blks on close */

/* Used to pass group descriptor data when online resize is done */
struct ext4_new_group_input {
@@ -327,9 +301,7 @@
#define EXT4_IOC_GROUP_EXTEND _IOW('f', 7, unsigned long)
#define EXT4_IOC_GROUP_ADD _IOW('f', 8, struct ext4_new_group_input)
#define EXT4_IOC_MIGRATE _IO('f', 9)
- /* note ioctl 10 reserved for an early version of the FIEMAP ioctl */
/* note ioctl 11 reserved for filesystem-independent FIEMAP ioctl */
-#define EXT4_IOC_ALLOC_DA_BLKS _IO('f', 12)

/*
* ioctl commands in 32 bit emulation
@@ -887,7 +859,7 @@
{
unsigned len = le16_to_cpu(dlen);

+ if (len == EXT4_MAX_REC_LEN)
- if (len == EXT4_MAX_REC_LEN || len == 0)
return 1 << 16;
return len;
}
@@ -917,9 +889,6 @@
#define DX_HASH_LEGACY 0
#define DX_HASH_HALF_MD4 1
#define DX_HASH_TEA 2
-#define DX_HASH_LEGACY_UNSIGNED 3
-#define DX_HASH_HALF_MD4_UNSIGNED 4
-#define DX_HASH_TEA_UNSIGNED 5

#ifdef __KERNEL__

@@ -1019,11 +988,9 @@
ext4_fsblk_t nblocks);
extern void ext4_free_blocks (handle_t *handle, struct inode *inode,
ext4_fsblk_t block, unsigned long count, int metadata);
+extern void ext4_free_blocks_sb (handle_t *handle, struct super_block *sb,
+ ext4_fsblk_t block, unsigned long count,
-extern void ext4_free_blocks_sb(handle_t *handle, struct super_block *sb,
- ext4_fsblk_t block, unsigned long count,
unsigned long *pdquot_freed_blocks);
-extern void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
- ext4_fsblk_t block, unsigned long count);
extern ext4_fsblk_t ext4_count_free_blocks (struct super_block *);
extern void ext4_check_blocks_bitmap (struct super_block *);
extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
@@ -1071,13 +1038,12 @@
extern void exit_ext4_mballoc(void);
extern void ext4_mb_free_blocks(handle_t *, struct inode *,
unsigned long, unsigned long, int, unsigned long *);
+extern int ext4_mb_add_more_groupinfo(struct super_block *sb,
-extern int ext4_mb_add_groupinfo(struct super_block *sb,
ext4_group_t i, struct ext4_group_desc *desc);
extern void ext4_mb_update_group_info(struct ext4_group_info *grp,
ext4_grpblk_t add);
+
+
-extern int ext4_mb_get_buddy_cache_lock(struct super_block *, ext4_group_t);
-extern void ext4_mb_put_buddy_cache_lock(struct super_block *,
- ext4_group_t, int);
/* inode.c */
int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode,
struct buffer_head *bh, ext4_fsblk_t blocknr);
@@ -1105,14 +1071,13 @@
extern void ext4_truncate (struct inode *);
extern void ext4_set_inode_flags(struct inode *);
extern void ext4_get_inode_flags(struct ext4_inode_info *);
-extern int ext4_alloc_da_blocks(struct inode *inode);
extern void ext4_set_aops(struct inode *inode);
extern int ext4_writepage_trans_blocks(struct inode *);
extern int ext4_meta_trans_blocks(struct inode *, int nrblocks, int idxblocks);
extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
extern int ext4_block_truncate_page(handle_t *handle,
struct address_space *mapping, loff_t from);
+extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page);
-extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);

/* ioctl.c */
extern long ext4_ioctl(struct file *, unsigned int, unsigned long);
@@ -1202,11 +1167,8 @@

static inline loff_t ext4_isize(struct ext4_inode *raw_inode)
{
+ return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) |
+ le32_to_cpu(raw_inode->i_size_lo);
- if (S_ISREG(le16_to_cpu(raw_inode->i_mode)))
- return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) |
- le32_to_cpu(raw_inode->i_size_lo);
- else
- return (loff_t) le32_to_cpu(raw_inode->i_size_lo);
}

static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size)
@@ -1282,23 +1244,6 @@
sector_t block, unsigned long max_blocks,
struct buffer_head *bh, int create,
int extend_disksize, int flag);
-/*
- * Add new method to test wether block and inode bitmaps are properly
- * initialized. With uninit_bg reading the block from disk is not enough
- * to mark the bitmap uptodate. We need to also zero-out the bitmap
- */
-#define BH_BITMAP_UPTODATE BH_JBDPrivateStart
-
-static inline int bitmap_uptodate(struct buffer_head *bh)
-{
- return (buffer_uptodate(bh) &&
- test_bit(BH_BITMAP_UPTODATE, &(bh)->b_state));
-}
-static inline void set_bitmap_uptodate(struct buffer_head *bh)
-{
- set_bit(BH_BITMAP_UPTODATE, &(bh)->b_state);
-}
-
#endif /* __KERNEL__ */

#endif /* _EXT4_H */
reverted:
--- b/fs/ext4/ext4_sb.h
+++ a/fs/ext4/ext4_sb.h
@@ -56,7 +56,6 @@
u32 s_next_generation;
u32 s_hash_seed[4];
int s_def_hash_version;
- int s_hash_unsigned; /* 3 if hash should be signed, 0 if not */
struct percpu_counter s_freeblocks_counter;
struct percpu_counter s_freeinodes_counter;
struct percpu_counter s_dirs_counter;
@@ -103,8 +102,7 @@
struct list_head s_committed_transaction;
spinlock_t s_md_lock;
tid_t s_last_transaction;
+ unsigned short *s_mb_offsets, *s_mb_maxs;
- unsigned short *s_mb_offsets;
- unsigned int *s_mb_maxs;

/* tunables */
unsigned long s_stripe;
reverted:
--- b/fs/ext4/extents.c
+++ a/fs/ext4/extents.c
@@ -1118,8 +1118,7 @@
struct ext4_extent_idx *ix;
struct ext4_extent *ex;
ext4_fsblk_t block;
+ int depth, ee_len;
- int depth; /* Note, NOT eh_depth; depth from top of tree */
- int ee_len;

BUG_ON(path == NULL);
depth = path->p_depth;
@@ -1178,8 +1177,7 @@
if (bh == NULL)
return -EIO;
eh = ext_block_hdr(bh);
+ if (ext4_ext_check_header(inode, eh, depth)) {
- /* subtract from p_depth to get proper eh_depth */
- if (ext4_ext_check_header(inode, eh, path->p_depth - depth)) {
put_bh(bh);
return -EIO;
}
@@ -1633,13 +1631,11 @@
{
struct ext4_ext_cache *cex;
BUG_ON(len == 0);
- spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
cex = &EXT4_I(inode)->i_cached_extent;
cex->ec_type = type;
cex->ec_block = block;
cex->ec_len = len;
cex->ec_start = start;
- spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
}

/*
@@ -1696,17 +1692,12 @@
struct ext4_extent *ex)
{
struct ext4_ext_cache *cex;
- int ret = EXT4_EXT_CACHE_NO;

- /*
- * We borrow i_block_reservation_lock to protect i_cached_extent
- */
- spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
cex = &EXT4_I(inode)->i_cached_extent;

/* has cache valid data? */
if (cex->ec_type == EXT4_EXT_CACHE_NO)
+ return EXT4_EXT_CACHE_NO;
- goto errout;

BUG_ON(cex->ec_type != EXT4_EXT_CACHE_GAP &&
cex->ec_type != EXT4_EXT_CACHE_EXTENT);
@@ -1717,11 +1708,11 @@
ext_debug("%u cached by %u:%u:%llu\n",
block,
cex->ec_block, cex->ec_len, cex->ec_start);
+ return cex->ec_type;
- ret = cex->ec_type;
}
+
+ /* not in cache */
+ return EXT4_EXT_CACHE_NO;
-errout:
- spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
- return ret;
}

/*
@@ -2677,8 +2668,6 @@
if (allocated > max_blocks)
allocated = max_blocks;
set_buffer_unwritten(bh_result);
- bh_result->b_bdev = inode->i_sb->s_bdev;
- bh_result->b_blocknr = newblock;
goto out2;
}

reverted:
--- b/fs/ext4/file.c
+++ a/fs/ext4/file.c
@@ -33,14 +33,9 @@
*/
static int ext4_release_file (struct inode * inode, struct file * filp)
{
- if (EXT4_I(inode)->i_state & EXT4_STATE_DA_ALLOC_CLOSE) {
- ext4_alloc_da_blocks(inode);
- EXT4_I(inode)->i_state &= ~EXT4_STATE_DA_ALLOC_CLOSE;
- }
/* if we are the last writer on the inode, drop the block reservation */
if ((filp->f_mode & FMODE_WRITE) &&
+ (atomic_read(&inode->i_writecount) == 1))
- (atomic_read(&inode->i_writecount) == 1) &&
- !EXT4_I(inode)->i_reserved_data_blocks)
{
down_write(&EXT4_I(inode)->i_data_sem);
ext4_discard_reservation(inode);
reverted:
--- b/fs/ext4/hash.c
+++ a/fs/ext4/hash.c
@@ -35,71 +35,23 @@


/* The old legacy hash */
+static __u32 dx_hack_hash (const char *name, int len)
-static __u32 dx_hack_hash_unsigned(const char *name, int len)
{
+ __u32 hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
- __u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
- const unsigned char *ucp = (const unsigned char *) name;
-
- while (len--) {
- hash = hash1 + (hash0 ^ (((int) *ucp++) * 7152373));
-
- if (hash & 0x80000000)
- hash -= 0x7fffffff;
- hash1 = hash0;
- hash0 = hash;
- }
- return hash0 << 1;
-}
-
-static __u32 dx_hack_hash_signed(const char *name, int len)
-{
- __u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
- const signed char *scp = (const signed char *) name;
-
while (len--) {
+ __u32 hash = hash1 + (hash0 ^ (*name++ * 7152373));
- hash = hash1 + (hash0 ^ (((int) *scp++) * 7152373));

+ if (hash & 0x80000000) hash -= 0x7fffffff;
- if (hash & 0x80000000)
- hash -= 0x7fffffff;
hash1 = hash0;
hash0 = hash;
}
+ return (hash0 << 1);
- return hash0 << 1;
}

+static void str2hashbuf(const char *msg, int len, __u32 *buf, int num)
-static void str2hashbuf_signed(const char *msg, int len, __u32 *buf, int num)
{
__u32 pad, val;
int i;
- const signed char *scp = (const signed char *) msg;
-
- pad = (__u32)len | ((__u32)len << 8);
- pad |= pad << 16;
-
- val = pad;
- if (len > num*4)
- len = num * 4;
- for (i = 0; i < len; i++) {
- if ((i % 4) == 0)
- val = pad;
- val = ((int) scp[i]) + (val << 8);
- if ((i % 4) == 3) {
- *buf++ = val;
- val = pad;
- num--;
- }
- }
- if (--num >= 0)
- *buf++ = val;
- while (--num >= 0)
- *buf++ = pad;
-}
-
-static void str2hashbuf_unsigned(const char *msg, int len, __u32 *buf, int num)
-{
- __u32 pad, val;
- int i;
- const unsigned char *ucp = (const unsigned char *) msg;

pad = (__u32)len | ((__u32)len << 8);
pad |= pad << 16;
@@ -110,7 +62,7 @@
for (i=0; i < len; i++) {
if ((i % 4) == 0)
val = pad;
+ val = msg[i] + (val << 8);
- val = ((int) ucp[i]) + (val << 8);
if ((i % 4) == 3) {
*buf++ = val;
val = pad;
@@ -143,8 +95,6 @@
const char *p;
int i;
__u32 in[8], buf[4];
- void (*str2hashbuf)(const char *, int, __u32 *, int) =
- str2hashbuf_signed;

/* Initialize the default seed for the hash checksum functions */
buf[0] = 0x67452301;
@@ -163,18 +113,13 @@
}

switch (hinfo->hash_version) {
- case DX_HASH_LEGACY_UNSIGNED:
- hash = dx_hack_hash_unsigned(name, len);
- break;
case DX_HASH_LEGACY:
+ hash = dx_hack_hash(name, len);
- hash = dx_hack_hash_signed(name, len);
break;
- case DX_HASH_HALF_MD4_UNSIGNED:
- str2hashbuf = str2hashbuf_unsigned;
case DX_HASH_HALF_MD4:
p = name;
while (len > 0) {
+ str2hashbuf(p, len, in, 8);
- (*str2hashbuf)(p, len, in, 8);
half_md4_transform(buf, in);
len -= 32;
p += 32;
@@ -182,12 +127,10 @@
minor_hash = buf[2];
hash = buf[1];
break;
- case DX_HASH_TEA_UNSIGNED:
- str2hashbuf = str2hashbuf_unsigned;
case DX_HASH_TEA:
p = name;
while (len > 0) {
+ str2hashbuf(p, len, in, 4);
- (*str2hashbuf)(p, len, in, 4);
TEA_transform(buf, in);
len -= 16;
p += 16;
reverted:
--- b/fs/ext4/ialloc.c
+++ a/fs/ext4/ialloc.c
@@ -84,7 +84,7 @@
}

memset(bh->b_data, 0, (EXT4_INODES_PER_GROUP(sb) + 7) / 8);
+ mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), EXT4_BLOCKS_PER_GROUP(sb),
- mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8,
bh->b_data);

return EXT4_INODES_PER_GROUP(sb);
@@ -115,40 +115,18 @@
block_group, bitmap_blk);
return NULL;
}
+ if (bh_uptodate_or_lock(bh))
- if (bitmap_uptodate(bh))
return bh;

- lock_buffer(bh);
- if (bitmap_uptodate(bh)) {
- unlock_buffer(bh);
- return bh;
- }
spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group));
if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
ext4_init_inode_bitmap(sb, bh, block_group, desc);
- set_bitmap_uptodate(bh);
set_buffer_uptodate(bh);
unlock_buffer(bh);
spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
return bh;
}
spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
- if (buffer_uptodate(bh)) {
- /*
- * if not uninit if bh is uptodate,
- * bitmap is also uptodate
- */
- set_bitmap_uptodate(bh);
- unlock_buffer(bh);
- return bh;
- }
- /*
- * submit the buffer_head for read. We can
- * safely mark the bitmap as uptodate now.
- * We do it here so the bitmap uptodate bit
- * get set with buffer lock held.
- */
- set_bitmap_uptodate(bh);
if (bh_submit_read(bh) < 0) {
put_bh(bh);
ext4_error(sb, __func__,
@@ -188,7 +166,7 @@
struct ext4_group_desc * gdp;
struct ext4_super_block * es;
struct ext4_sb_info *sbi;
+ int fatal = 0, err;
- int fatal = 0, err, cleared;
ext4_group_t flex_group;

if (atomic_read(&inode->i_count) > 1) {
@@ -242,12 +220,10 @@
goto error_return;

/* Ok, now we can actually update the inode bitmaps.. */
+ if (!ext4_clear_bit_atomic(sb_bgl_lock(sbi, block_group),
+ bit, bitmap_bh->b_data))
+ ext4_error (sb, "ext4_free_inode",
+ "bit already cleared for inode %lu", ino);
- spin_lock(sb_bgl_lock(sbi, block_group));
- cleared = ext4_clear_bit(bit, bitmap_bh->b_data);
- spin_unlock(sb_bgl_lock(sbi, block_group));
- if (!cleared)
- ext4_error(sb, "ext4_free_inode",
- "bit already cleared for inode %lu", ino);
else {
gdp = ext4_get_group_desc (sb, block_group, &bh2);

@@ -591,77 +567,6 @@
}

/*
- * claim the inode from the inode bitmap. If the group
- * is uninit we need to take the groups's sb_bgl_lock
- * and clear the uninit flag. The inode bitmap update
- * and group desc uninit flag clear should be done
- * after holding sb_bgl_lock so that ext4_read_inode_bitmap
- * doesn't race with the ext4_claim_inode
- */
-static int ext4_claim_inode(struct super_block *sb,
- struct buffer_head *inode_bitmap_bh,
- unsigned long ino, ext4_group_t group, int mode)
-{
- int free = 0, retval = 0;
- struct ext4_sb_info *sbi = EXT4_SB(sb);
- struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL);
-
- spin_lock(sb_bgl_lock(sbi, group));
- if (ext4_set_bit(ino, inode_bitmap_bh->b_data)) {
- /* not a free inode */
- retval = 1;
- goto err_ret;
- }
- ino++;
- if ((group == 0 && ino < EXT4_FIRST_INO(sb)) ||
- ino > EXT4_INODES_PER_GROUP(sb)) {
- spin_unlock(sb_bgl_lock(sbi, group));
- ext4_error(sb, __func__,
- "reserved inode or inode > inodes count - "
- "block_group = %lu, inode=%lu", group,
- ino + group * EXT4_INODES_PER_GROUP(sb));
- return 1;
- }
- /* If we didn't allocate from within the initialized part of the inode
- * table then we need to initialize up to this inode. */
- if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) {
-
- if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
- gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT);
- /* When marking the block group with
- * ~EXT4_BG_INODE_UNINIT we don't want to depend
- * on the value of bg_itable_unused even though
- * mke2fs could have initialized the same for us.
- * Instead we calculated the value below
- */
-
- free = 0;
- } else {
- free = EXT4_INODES_PER_GROUP(sb) -
- le16_to_cpu(gdp->bg_itable_unused);
- }
-
- /*
- * Check the relative inode number against the last used
- * relative inode number in this group. if it is greater
- * we need to update the bg_itable_unused count
- *
- */
- if (ino > free)
- gdp->bg_itable_unused =
- cpu_to_le16(EXT4_INODES_PER_GROUP(sb) - ino);
- }
- le16_add_cpu(&gdp->bg_free_inodes_count, -1);
- if (S_ISDIR(mode)) {
- le16_add_cpu(&gdp->bg_used_dirs_count, 1);
- }
- gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
-err_ret:
- spin_unlock(sb_bgl_lock(sbi, group));
- return retval;
-}
-
-/*
* There are two policies for allocating an inode. If the new inode is
* a directory, then a forward search is made for a block group with both
* free space and a low directory-to-inode ratio; if that fails, then of
@@ -687,7 +592,6 @@
struct inode *ret;
ext4_group_t i;
int free = 0;
- static int once = 1;
ext4_group_t flex_group;

/* Cannot create files in a deleted directory */
@@ -705,15 +609,6 @@

if (sbi->s_log_groups_per_flex) {
ret2 = find_group_flex(sb, dir, &group);
- if (ret2 == -1) {
- ret2 = find_group_other(sb, dir, &group);
- if (ret2 == 0 && once) {
- once = 0;
- printk(KERN_NOTICE "ext4: find_group_flex "
- "failed, fallback succeeded dir %lu\n",
- dir->i_ino);
- }
- }
goto got_group;
}

@@ -754,12 +649,8 @@
if (err)
goto fail;

+ if (!ext4_set_bit_atomic(sb_bgl_lock(sbi, group),
+ ino, bitmap_bh->b_data)) {
- BUFFER_TRACE(bh2, "get_write_access");
- err = ext4_journal_get_write_access(handle, bh2);
- if (err)
- goto fail;
- if (!ext4_claim_inode(sb, bitmap_bh,
- ino, group, mode)) {
/* we won it */
BUFFER_TRACE(bitmap_bh,
"call ext4_journal_dirty_metadata");
@@ -767,13 +658,10 @@
bitmap_bh);
if (err)
goto fail;
- /* zero bit is inode number 1*/
- ino++;
goto got;
}
/* we lost it */
jbd2_journal_release_buffer(handle, bitmap_bh);
- jbd2_journal_release_buffer(handle, bh2);

if (++ino < EXT4_INODES_PER_GROUP(sb))
goto repeat_in_this_group;
@@ -793,6 +681,21 @@
goto out;

got:
+ ino++;
+ if ((group == 0 && ino < EXT4_FIRST_INO(sb)) ||
+ ino > EXT4_INODES_PER_GROUP(sb)) {
+ ext4_error(sb, __func__,
+ "reserved inode or inode > inodes count - "
+ "block_group = %lu, inode=%lu", group,
+ ino + group * EXT4_INODES_PER_GROUP(sb));
+ err = -EIO;
+ goto fail;
+ }
+
+ BUFFER_TRACE(bh2, "get_write_access");
+ err = ext4_journal_get_write_access(handle, bh2);
+ if (err) goto fail;
+
/* We may have to initialize the block bitmap if it isn't already */
if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM) &&
gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
@@ -827,10 +730,47 @@
if (err)
goto fail;
}
+
+ spin_lock(sb_bgl_lock(sbi, group));
+ /* If we didn't allocate from within the initialized part of the inode
+ * table then we need to initialize up to this inode. */
+ if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) {
+ if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
+ gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT);
+
+ /* When marking the block group with
+ * ~EXT4_BG_INODE_UNINIT we don't want to depend
+ * on the value of bg_itable_unused even though
+ * mke2fs could have initialized the same for us.
+ * Instead we calculated the value below
+ */
+
+ free = 0;
+ } else {
+ free = EXT4_INODES_PER_GROUP(sb) -
+ le16_to_cpu(gdp->bg_itable_unused);
+ }
+
+ /*
+ * Check the relative inode number against the last used
+ * relative inode number in this group. if it is greater
+ * we need to update the bg_itable_unused count
+ *
+ */
+ if (ino > free)
+ gdp->bg_itable_unused =
+ cpu_to_le16(EXT4_INODES_PER_GROUP(sb) - ino);
+ }
+
+ le16_add_cpu(&gdp->bg_free_inodes_count, -1);
+ if (S_ISDIR(mode)) {
+ le16_add_cpu(&gdp->bg_used_dirs_count, 1);
+ }
+ gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
+ spin_unlock(sb_bgl_lock(sbi, group));
+ BUFFER_TRACE(bh2, "call ext4_journal_dirty_metadata");
- BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata");
err = ext4_journal_dirty_metadata(handle, bh2);
+ if (err) goto fail;
- if (err)
- goto fail;

percpu_counter_dec(&sbi->s_freeinodes_counter);
if (S_ISDIR(mode))
@@ -866,12 +806,16 @@
ei->i_disksize = 0;

/*
+ * Don't inherit extent flag from directory. We set extent flag on
+ * newly created directory and file only if -o extent mount option is
+ * specified
- * Don't inherit extent flag from directory, amongst others. We set
- * extent flag on newly created directory and file only if -o extent
- * mount option is specified
*/
+ ei->i_flags = EXT4_I(dir)->i_flags & ~(EXT4_INDEX_FL|EXT4_EXTENTS_FL);
+ if (S_ISLNK(mode))
+ ei->i_flags &= ~(EXT4_IMMUTABLE_FL|EXT4_APPEND_FL);
+ /* dirsync only applies to directories */
+ if (!S_ISDIR(mode))
+ ei->i_flags &= ~EXT4_DIRSYNC_FL;
- ei->i_flags =
- ext4_mask_flags(mode, EXT4_I(dir)->i_flags & EXT4_FL_INHERITED);
ei->i_file_acl = 0;
ei->i_dtime = 0;
ei->i_block_alloc_info = NULL;
reverted:
--- b/fs/ext4/inode.c
+++ a/fs/ext4/inode.c
@@ -46,10 +46,8 @@
static inline int ext4_begin_ordered_truncate(struct inode *inode,
loff_t new_size)
{
+ return jbd2_journal_begin_ordered_truncate(&EXT4_I(inode)->jinode,
+ new_size);
- return jbd2_journal_begin_ordered_truncate(
- EXT4_SB(inode->i_sb)->s_journal,
- &EXT4_I(inode)->jinode,
- new_size);
}

static void ext4_invalidatepage(struct page *page, unsigned long offset);
@@ -353,9 +351,9 @@
final = ptrs;
} else {
ext4_warning(inode->i_sb, "ext4_block_to_path",
+ "block %lu > max",
- "block %lu > max in inode %lu",
i_block + direct_blocks +
+ indirect_blocks + double_blocks);
- indirect_blocks + double_blocks, inode->i_ino);
}
if (boundary)
*boundary = final - 1 - (i_block & (ptrs - 1));
@@ -1046,14 +1044,6 @@
EXT4_I(inode)->i_reserved_meta_blocks = mdb;
EXT4_I(inode)->i_allocated_meta_blocks = 0;
spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
-
- /*
- * If we have done all the pending block allocations and if
- * there aren't any writers on the inode, we can discard the
- * inode's preallocations.
- */
- if (!total && (atomic_read(&inode->i_writecount) == 0))
- ext4_discard_reservation(inode);
}

/*
@@ -1085,7 +1075,6 @@
int retval;

clear_buffer_mapped(bh);
- clear_buffer_unwritten(bh);

/*
* Try to see if we can get the block without requesting
@@ -1116,18 +1105,6 @@
return retval;

/*
- * When we call get_blocks without the create flag, the
- * BH_Unwritten flag could have gotten set if the blocks
- * requested were part of a uninitialized extent. We need to
- * clear this flag now that we are committed to convert all or
- * part of the uninitialized extent to be an initialized
- * extent. This is because we need to avoid the combination
- * of BH_Unwritten and BH_Mapped flags being simultaneously
- * set on the buffer_head.
- */
- clear_buffer_unwritten(bh);
-
- /*
* New blocks allocate and/or writing to uninitialized extent
* will possibly result in updating i_data, so we take
* the write lock of i_data_sem, and call get_blocks()
@@ -1393,10 +1370,6 @@
goto out;
}

- /* We cannot recurse into the filesystem as the transaction is already
- * started */
- flags |= AOP_FLAG_NOFS;
-
page = grab_cache_page_write_begin(mapping, index, flags);
if (!page) {
ext4_journal_stop(handle);
@@ -1406,7 +1379,7 @@
*pagep = page;

ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
+ ext4_get_block);
- ext4_get_block);

if (!ret && ext4_should_journal_data(inode)) {
ret = walk_page_buffers(handle, page_buffers(page),
@@ -1675,25 +1648,18 @@
*/
static int mpage_da_submit_io(struct mpage_da_data *mpd)
{
+ struct address_space *mapping = mpd->inode->i_mapping;
+ int ret = 0, err, nr_pages, i;
+ unsigned long index, end;
- long pages_skipped;
struct pagevec pvec;
- unsigned long index, end;
- int ret = 0, err, nr_pages, i;
- struct inode *inode = mpd->inode;
- struct address_space *mapping = inode->i_mapping;

BUG_ON(mpd->next_page <= mpd->first_page);
+ pagevec_init(&pvec, 0);
- /*
- * We need to start from the first_page to the next_page - 1
- * to make sure we also write the mapped dirty buffer_heads.
- * If we look at mpd->lbh.b_blocknr we would only be looking
- * at the currently mapped buffer_heads.
- */
index = mpd->first_page;
end = mpd->next_page - 1;

- pagevec_init(&pvec, 0);
while (index <= end) {
+ /* XXX: optimize tail */
nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
if (nr_pages == 0)
break;
@@ -1705,10 +1671,6 @@
break;
index++;

- BUG_ON(!PageLocked(page));
- BUG_ON(PageWriteback(page));
-
- pages_skipped = mpd->wbc->pages_skipped;
err = mapping->a_ops->writepage(page, mpd->wbc);
if (!err)
mpd->pages_written++;
@@ -2029,29 +1991,11 @@
bh = head;
do {
BUG_ON(buffer_locked(bh));
- /*
- * We need to try to allocate
- * unmapped blocks in the same page.
- * Otherwise we won't make progress
- * with the page in ext4_da_writepage
- */
if (buffer_dirty(bh) &&
(!buffer_mapped(bh) || buffer_delay(bh))) {
mpage_add_bh_to_extent(mpd, logical, bh);
if (mpd->io_done)
return MPAGE_DA_EXTENT_TAIL;
- } else if (buffer_dirty(bh) && (buffer_mapped(bh))) {
- /*
- * mapped dirty buffer. We need to update
- * the b_state because we look at
- * b_state in mpage_da_map_blocks. We don't
- * update b_size because if we find an
- * unmapped buffer_head later we need to
- * use the b_state flag of that buffer_head.
- */
- if (mpd->lbh.b_size == 0)
- mpd->lbh.b_state =
- bh->b_state & BH_FLAGS;
}
logical++;
} while ((bh = bh->b_this_page) != head);
@@ -2118,10 +2062,6 @@
struct buffer_head *bh_result, int create)
{
int ret = 0;
- sector_t invalid_block = ~((sector_t) 0xffff);
-
- if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es))
- invalid_block = ~0;

BUG_ON(create == 0);
BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize);
@@ -2143,18 +2083,11 @@
/* not enough space to reserve */
return ret;

+ map_bh(bh_result, inode->i_sb, 0);
- map_bh(bh_result, inode->i_sb, invalid_block);
set_buffer_new(bh_result);
set_buffer_delay(bh_result);
} else if (ret > 0) {
bh_result->b_size = (ret << inode->i_blkbits);
- /*
- * With sub-block writes into unwritten extents
- * we also need to mark the buffer as new so that
- * the unwritten parts of the buffer gets correctly zeroed.
- */
- if (buffer_unwritten(bh_result))
- set_buffer_new(bh_result);
ret = 0;
}

@@ -2365,20 +2298,6 @@
*/
if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
return 0;
-
- /*
- * If the filesystem has aborted, it is read-only, so return
- * right away instead of dumping stack traces later on that
- * will obscure the real source of the problem. We test
- * EXT4_MOUNT_ABORT instead of sb->s_flag's MS_RDONLY because
- * the latter could be true if the filesystem is mounted
- * read-only, and in that case, ext4_da_writepages should
- * *never* be called, so if that ever happens, we would want
- * the stack trace.
- */
- if (unlikely(sbi->s_mount_opt & EXT4_MOUNT_ABORT))
- return -EROFS;
-
/*
* Make sure nr_to_write is >= sbi->s_mb_stream_request
* This make sure small files blocks are allocated in
@@ -2417,7 +2336,7 @@
handle = ext4_journal_start(inode, needed_blocks);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
+ printk(KERN_EMERG "%s: jbd2_start: "
- printk(KERN_CRIT "%s: jbd2_start: "
"%ld pages, ino %lu; err %d\n", __func__,
wbc->nr_to_write, inode->i_ino, ret);
dump_stack();
@@ -2501,9 +2420,6 @@
ret = PTR_ERR(handle);
goto out;
}
- /* We cannot recurse into the filesystem as the transaction is already
- * started */
- flags |= AOP_FLAG_NOFS;

page = grab_cache_page_write_begin(mapping, index, flags);
if (!page) {
@@ -2617,48 +2533,6 @@
return;
}

-/*
- * Force all delayed allocation blocks to be allocated for a given inode.
- */
-int ext4_alloc_da_blocks(struct inode *inode)
-{
- if (!EXT4_I(inode)->i_reserved_data_blocks &&
- !EXT4_I(inode)->i_reserved_meta_blocks)
- return 0;
-
- /*
- * We do something simple for now. The filemap_flush() will
- * also start triggering a write of the data blocks, which is
- * not strictly speaking necessary (and for users of
- * laptop_mode, not even desirable). However, to do otherwise
- * would require replicating code paths in:
- *
- * ext4_da_writepages() ->
- * write_cache_pages() ---> (via passed in callback function)
- * __mpage_da_writepage() -->
- * mpage_add_bh_to_extent()
- * mpage_da_map_blocks()
- *
- * The problem is that write_cache_pages(), located in
- * mm/page-writeback.c, marks pages clean in preparation for
- * doing I/O, which is not desirable if we're not planning on
- * doing I/O at all.
- *
- * We could call write_cache_pages(), and then redirty all of
- * the pages by calling redirty_page_for_writeback() but that
- * would be ugly in the extreme. So instead we would need to
- * replicate parts of the code in the above functions,
- * simplifying them becuase we wouldn't actually intend to
- * write out the pages, but rather only collect contiguous
- * logical block extents, call the multi-block allocator, and
- * then update the buffer heads with the block allocations.
- *
- * For now, though, we'll cheat by calling filemap_flush(),
- * which will map the blocks, and start the I/O, but not
- * actually wait for the I/O to complete.
- */
- return filemap_flush(inode->i_mapping);
-}

/*
* bmap() is special. It gets used by applications such as lilo and by
@@ -3668,9 +3542,6 @@
if (!ext4_can_truncate(inode))
return;

- if (inode->i_size == 0)
- ei->i_state |= EXT4_STATE_DA_ALLOC_CLOSE;
-
if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
ext4_ext_truncate(inode);
return;
@@ -4088,9 +3959,11 @@
ei->i_flags = le32_to_cpu(raw_inode->i_flags);
inode->i_blocks = ext4_inode_blocks(raw_inode, ei);
ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo);
+ if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
+ cpu_to_le32(EXT4_OS_HURD)) {
- if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT))
ei->i_file_acl |=
((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
+ }
inode->i_size = ext4_isize(raw_inode);
ei->i_disksize = inode->i_size;
inode->i_generation = le32_to_cpu(raw_inode->i_generation);
@@ -4137,18 +4010,6 @@
(__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32;
}

- if (ei->i_file_acl &&
- ((ei->i_file_acl <
- (le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block) +
- EXT4_SB(sb)->s_gdb_count)) ||
- (ei->i_file_acl >= ext4_blocks_count(EXT4_SB(sb)->s_es)))) {
- ext4_error(sb, __func__,
- "bad extended attribute block %llu in inode #%lu",
- ei->i_file_acl, inode->i_ino);
- ret = -EIO;
- goto bad_inode;
- }
-
if (S_ISREG(inode->i_mode)) {
inode->i_op = &ext4_file_inode_operations;
inode->i_fop = &ext4_file_operations;
@@ -4163,8 +4024,7 @@
inode->i_op = &ext4_symlink_inode_operations;
ext4_set_aops(inode);
}
+ } else {
- } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) ||
- S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
inode->i_op = &ext4_special_inode_operations;
if (raw_inode->i_block[0])
init_special_inode(inode, inode->i_mode,
@@ -4172,13 +4032,6 @@
else
init_special_inode(inode, inode->i_mode,
new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
- } else {
- brelse(bh);
- ret = -EIO;
- ext4_error(inode->i_sb, __func__,
- "bogus i_mode (%o) for inode=%lu",
- inode->i_mode, inode->i_ino);
- goto bad_inode;
}
brelse (iloc.bh);
ext4_set_inode_flags(inode);
@@ -4956,9 +4809,8 @@
return !buffer_mapped(bh);
}

+int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page)
-int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
- struct page *page = vmf->page;
loff_t size;
unsigned long len;
int ret = -EINVAL;
@@ -5009,8 +4861,6 @@
goto out_unlock;
ret = 0;
out_unlock:
- if (ret)
- ret = VM_FAULT_SIGBUS;
up_read(&inode->i_alloc_sem);
return ret;
}
reverted:
--- b/fs/ext4/ioctl.c
+++ a/fs/ext4/ioctl.c
@@ -49,7 +49,8 @@
if (err)
return err;

+ if (!S_ISDIR(inode->i_mode))
+ flags &= ~EXT4_DIRSYNC_FL;
- flags = ext4_mask_flags(inode->i_mode, flags);

err = -EPERM;
mutex_lock(&inode->i_mutex);
@@ -287,20 +288,6 @@
return err;
}

- case EXT4_IOC_ALLOC_DA_BLKS:
- {
- int err;
- if (!is_owner_or_cap(inode))
- return -EACCES;
-
- err = mnt_want_write(filp->f_path.mnt);
- if (err)
- return err;
- err = ext4_alloc_da_blocks(inode);
- mnt_drop_write(filp->f_path.mnt);
- return err;
- }
-
default:
return -ENOTTY;
}
reverted:
--- b/fs/ext4/mballoc.c
+++ a/fs/ext4/mballoc.c
@@ -100,7 +100,7 @@
* inode as:
*
* { page }
+ * [ group 0 buddy][ group 0 bitmap] [group 1][ group 1]...
- * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]...
*
*
* one block each for bitmap and buddy information. So for each group we
@@ -330,18 +330,6 @@
* object
*
*/
-static struct kmem_cache *ext4_pspace_cachep;
-static struct kmem_cache *ext4_ac_cachep;
-static struct kmem_cache *ext4_free_ext_cachep;
-static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
- ext4_group_t group);
-static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
- ext4_group_t group);
-static int ext4_mb_init_per_dev_proc(struct super_block *sb);
-static int ext4_mb_destroy_per_dev_proc(struct super_block *sb);
-static void ext4_mb_free_committed_blocks(struct super_block *);
-static void ext4_mb_poll_new_transaction(struct super_block *sb,
- handle_t *handle);

static inline void *mb_correct_addr_and_bit(int *bit, void *addr)
{
@@ -730,7 +718,7 @@
* stored in the inode as
*
* { page }
+ * [ group 0 buddy][ group 0 bitmap] [group 1][ group 1]...
- * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]...
*
*
* one block each for bitmap and buddy information.
@@ -796,42 +784,20 @@
if (bh[i] == NULL)
goto out;

+ if (bh_uptodate_or_lock(bh[i]))
- if (bitmap_uptodate(bh[i]))
continue;

- lock_buffer(bh[i]);
- if (bitmap_uptodate(bh[i])) {
- unlock_buffer(bh[i]);
- continue;
- }
spin_lock(sb_bgl_lock(EXT4_SB(sb), first_group + i));
if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
ext4_init_block_bitmap(sb, bh[i],
first_group + i, desc);
- set_bitmap_uptodate(bh[i]);
set_buffer_uptodate(bh[i]);
unlock_buffer(bh[i]);
spin_unlock(sb_bgl_lock(EXT4_SB(sb), first_group + i));
continue;
}
spin_unlock(sb_bgl_lock(EXT4_SB(sb), first_group + i));
- if (buffer_uptodate(bh[i])) {
- /*
- * if not uninit if bh is uptodate,
- * bitmap is also uptodate
- */
- set_bitmap_uptodate(bh[i]);
- unlock_buffer(bh[i]);
- continue;
- }
get_bh(bh[i]);
- /*
- * submit the buffer_head for read. We can
- * safely mark the bitmap as uptodate now.
- * We do it here so the bitmap uptodate bit
- * get set with buffer lock held.
- */
- set_bitmap_uptodate(bh[i]);
bh[i]->b_end_io = end_buffer_read_sync;
submit_bh(READ, bh[i]);
mb_debug("read bitmap for group %lu\n", first_group + i);
@@ -848,8 +814,6 @@

err = 0;
first_block = page->index * blocks_per_page;
- /* init the page */
- memset(page_address(page), 0xff, PAGE_CACHE_SIZE);
for (i = 0; i < blocks_per_page; i++) {
int group;
struct ext4_group_info *grinfo;
@@ -876,6 +840,7 @@
BUG_ON(incore == NULL);
mb_debug("put buddy for group %u in page %lu/%x\n",
group, page->index, i * blocksize);
+ memset(data, 0xff, blocksize);
grinfo = ext4_get_group_info(sb, group);
grinfo->bb_fragments = 0;
memset(grinfo->bb_counters, 0,
@@ -883,9 +848,7 @@
/*
* incore got set to the group block bitmap below
*/
- ext4_lock_group(sb, group);
ext4_mb_generate_buddy(sb, data, incore, group);
- ext4_unlock_group(sb, group);
incore = NULL;
} else {
/* this is block of bitmap */
@@ -899,7 +862,6 @@

/* mark all preallocated blks used in in-core bitmap */
ext4_mb_generate_from_pa(sb, data, group);
- ext4_mb_generate_from_freelist(sb, data, group);
ext4_unlock_group(sb, group);

/* set incore so that the buddy information can be
@@ -924,20 +886,18 @@
ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
struct ext4_buddy *e4b)
{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct inode *inode = sbi->s_buddy_cache;
int blocks_per_page;
int block;
int pnum;
int poff;
struct page *page;
int ret;
- struct ext4_group_info *grp;
- struct ext4_sb_info *sbi = EXT4_SB(sb);
- struct inode *inode = sbi->s_buddy_cache;

mb_debug("load group %lu\n", group);

blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
- grp = ext4_get_group_info(sb, group);

e4b->bd_blkbits = sb->s_blocksize_bits;
e4b->bd_info = ext4_get_group_info(sb, group);
@@ -945,15 +905,6 @@
e4b->bd_group = group;
e4b->bd_buddy_page = NULL;
e4b->bd_bitmap_page = NULL;
- e4b->alloc_semp = &grp->alloc_sem;
-
- /* Take the read lock on the group alloc
- * sem. This would make sure a parallel
- * ext4_mb_init_group happening on other
- * groups mapped by the page is blocked
- * till we are done with allocation
- */
- down_read(e4b->alloc_semp);

/*
* the buddy cache inode stores the block bitmap
@@ -969,14 +920,6 @@
page = find_get_page(inode->i_mapping, pnum);
if (page == NULL || !PageUptodate(page)) {
if (page)
- /*
- * drop the page reference and try
- * to get the page with lock. If we
- * are not uptodate that implies
- * somebody just created the page but
- * is yet to initialize the same. So
- * wait for it to initialize.
- */
page_cache_release(page);
page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
if (page) {
@@ -1042,9 +985,6 @@
page_cache_release(e4b->bd_buddy_page);
e4b->bd_buddy = NULL;
e4b->bd_bitmap = NULL;
-
- /* Done with the buddy cache */
- up_read(e4b->alloc_semp);
return ret;
}

@@ -1054,9 +994,6 @@
page_cache_release(e4b->bd_bitmap_page);
if (e4b->bd_buddy_page)
page_cache_release(e4b->bd_buddy_page);
- /* Done with the buddy cache */
- if (e4b->alloc_semp)
- up_read(e4b->alloc_semp);
}


@@ -1094,10 +1031,7 @@
cur += 32;
continue;
}
+ mb_clear_bit_atomic(lock, cur, bm);
- if (lock)
- mb_clear_bit_atomic(lock, cur, bm);
- else
- mb_clear_bit(cur, bm);
cur++;
}
}
@@ -1115,10 +1049,7 @@
cur += 32;
continue;
}
+ mb_set_bit_atomic(lock, cur, bm);
- if (lock)
- mb_set_bit_atomic(lock, cur, bm);
- else
- mb_set_bit(cur, bm);
cur++;
}
}
@@ -1365,20 +1296,13 @@
ac->ac_tail = ret & 0xffff;
ac->ac_buddy = ret >> 16;

+ /* XXXXXXX: SUCH A HORRIBLE **CK */
+ /*FIXME!! Why ? */
- /*
- * take the page reference. We want the page to be pinned
- * so that we don't get a ext4_mb_init_cache_call for this
- * group until we update the bitmap. That would mean we
- * double allocate blocks. The reference is dropped
- * in ext4_mb_release_context
- */
ac->ac_bitmap_page = e4b->bd_bitmap_page;
get_page(ac->ac_bitmap_page);
ac->ac_buddy_page = e4b->bd_buddy_page;
get_page(ac->ac_buddy_page);
+
- /* on allocation we use ac to track the held semaphore */
- ac->alloc_semp = e4b->alloc_semp;
- e4b->alloc_semp = NULL;
/* store last allocated for subsequent stream allocation */
if ((ac->ac_flags & EXT4_MB_HINT_DATA)) {
spin_lock(&sbi->s_md_lock);
@@ -1402,8 +1326,6 @@
struct ext4_free_extent ex;
int max;

- if (ac->ac_status == AC_STATUS_FOUND)
- return;
/*
* We don't want to scan for a whole year
*/
@@ -1450,7 +1372,7 @@
struct ext4_free_extent *gex = &ac->ac_g_ex;

BUG_ON(ex->fe_len <= 0);
+ BUG_ON(ex->fe_len >= EXT4_BLOCKS_PER_GROUP(ac->ac_sb));
- BUG_ON(ex->fe_len > EXT4_BLOCKS_PER_GROUP(ac->ac_sb));
BUG_ON(ex->fe_start >= EXT4_BLOCKS_PER_GROUP(ac->ac_sb));
BUG_ON(ac->ac_status != AC_STATUS_CONTINUE);

@@ -1770,173 +1692,6 @@
return 0;
}

-/*
- * lock the group_info alloc_sem of all the groups
- * belonging to the same buddy cache page. This
- * make sure other parallel operation on the buddy
- * cache doesn't happen whild holding the buddy cache
- * lock
- */
-int ext4_mb_get_buddy_cache_lock(struct super_block *sb, ext4_group_t group)
-{
- int i;
- int block, pnum;
- int blocks_per_page;
- int groups_per_page;
- ext4_group_t first_group;
- struct ext4_group_info *grp;
-
- blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
- /*
- * the buddy cache inode stores the block bitmap
- * and buddy information in consecutive blocks.
- * So for each group we need two blocks.
- */
- block = group * 2;
- pnum = block / blocks_per_page;
- first_group = pnum * blocks_per_page / 2;
-
- groups_per_page = blocks_per_page >> 1;
- if (groups_per_page == 0)
- groups_per_page = 1;
- /* read all groups the page covers into the cache */
- for (i = 0; i < groups_per_page; i++) {
-
- if ((first_group + i) >= EXT4_SB(sb)->s_groups_count)
- break;
- grp = ext4_get_group_info(sb, first_group + i);
- /* take all groups write allocation
- * semaphore. This make sure there is
- * no block allocation going on in any
- * of that groups
- */
- down_write(&grp->alloc_sem);
- }
- return i;
-}
-
-void ext4_mb_put_buddy_cache_lock(struct super_block *sb,
- ext4_group_t group, int locked_group)
-{
- int i;
- int block, pnum;
- int blocks_per_page;
- ext4_group_t first_group;
- struct ext4_group_info *grp;
-
- blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
- /*
- * the buddy cache inode stores the block bitmap
- * and buddy information in consecutive blocks.
- * So for each group we need two blocks.
- */
- block = group * 2;
- pnum = block / blocks_per_page;
- first_group = pnum * blocks_per_page / 2;
- /* release locks on all the groups */
- for (i = 0; i < locked_group; i++) {
-
- grp = ext4_get_group_info(sb, first_group + i);
- /* take all groups write allocation
- * semaphore. This make sure there is
- * no block allocation going on in any
- * of that groups
- */
- up_write(&grp->alloc_sem);
- }
-
-}
-
-static int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
-{
-
- int ret;
- void *bitmap;
- int blocks_per_page;
- int block, pnum, poff;
- int num_grp_locked = 0;
- struct ext4_group_info *this_grp;
- struct ext4_sb_info *sbi = EXT4_SB(sb);
- struct inode *inode = sbi->s_buddy_cache;
- struct page *page = NULL, *bitmap_page = NULL;
-
- mb_debug("init group %lu\n", group);
- blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
- this_grp = ext4_get_group_info(sb, group);
- /*
- * This ensures we don't add group
- * to this buddy cache via resize
- */
- num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, group);
- if (!EXT4_MB_GRP_NEED_INIT(this_grp)) {
- /*
- * somebody initialized the group
- * return without doing anything
- */
- ret = 0;
- goto err;
- }
- /*
- * the buddy cache inode stores the block bitmap
- * and buddy information in consecutive blocks.
- * So for each group we need two blocks.
- */
- block = group * 2;
- pnum = block / blocks_per_page;
- poff = block % blocks_per_page;
- page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
- if (page) {
- BUG_ON(page->mapping != inode->i_mapping);
- ret = ext4_mb_init_cache(page, NULL);
- if (ret) {
- unlock_page(page);
- goto err;
- }
- unlock_page(page);
- }
- if (page == NULL || !PageUptodate(page)) {
- ret = -EIO;
- goto err;
- }
- mark_page_accessed(page);
- bitmap_page = page;
- bitmap = page_address(page) + (poff * sb->s_blocksize);
-
- /* init buddy cache */
- block++;
- pnum = block / blocks_per_page;
- poff = block % blocks_per_page;
- page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
- if (page == bitmap_page) {
- /*
- * If both the bitmap and buddy are in
- * the same page we don't need to force
- * init the buddy
- */
- unlock_page(page);
- } else if (page) {
- BUG_ON(page->mapping != inode->i_mapping);
- ret = ext4_mb_init_cache(page, bitmap);
- if (ret) {
- unlock_page(page);
- goto err;
- }
- unlock_page(page);
- }
- if (page == NULL || !PageUptodate(page)) {
- ret = -EIO;
- goto err;
- }
- mark_page_accessed(page);
-err:
- ext4_mb_put_buddy_cache_lock(sb, group, num_grp_locked);
- if (bitmap_page)
- page_cache_release(bitmap_page);
- if (page)
- page_cache_release(page);
- return ret;
-}
-
static noinline_for_stack int
ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
{
@@ -2020,7 +1775,7 @@
group = 0;

/* quick check to skip empty groups */
+ grp = ext4_get_group_info(ac->ac_sb, group);
- grp = ext4_get_group_info(sb, group);
if (grp->bb_free == 0)
continue;

@@ -2033,9 +1788,10 @@
* we need full data about the group
* to make a good selection
*/
+ err = ext4_mb_load_buddy(sb, group, &e4b);
- err = ext4_mb_init_group(sb, group);
if (err)
goto out;
+ ext4_mb_release_desc(&e4b);
}

/*
@@ -2543,8 +2299,6 @@
}

INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
- init_rwsem(&meta_group_info[i]->alloc_sem);
- meta_group_info[i]->bb_free_root.rb_node = NULL;;

#ifdef DOUBLE_CHECK
{
@@ -2571,6 +2325,54 @@
} /* ext4_mb_add_groupinfo */

/*
+ * Add a group to the existing groups.
+ * This function is used for online resize
+ */
+int ext4_mb_add_more_groupinfo(struct super_block *sb, ext4_group_t group,
+ struct ext4_group_desc *desc)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct inode *inode = sbi->s_buddy_cache;
+ int blocks_per_page;
+ int block;
+ int pnum;
+ struct page *page;
+ int err;
+
+ /* Add group based on group descriptor*/
+ err = ext4_mb_add_groupinfo(sb, group, desc);
+ if (err)
+ return err;
+
+ /*
+ * Cache pages containing dynamic mb_alloc datas (buddy and bitmap
+ * datas) are set not up to date so that they will be re-initilaized
+ * during the next call to ext4_mb_load_buddy
+ */
+
+ /* Set buddy page as not up to date */
+ blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
+ block = group * 2;
+ pnum = block / blocks_per_page;
+ page = find_get_page(inode->i_mapping, pnum);
+ if (page != NULL) {
+ ClearPageUptodate(page);
+ page_cache_release(page);
+ }
+
+ /* Set bitmap page as not up to date */
+ block++;
+ pnum = block / blocks_per_page;
+ page = find_get_page(inode->i_mapping, pnum);
+ if (page != NULL) {
+ ClearPageUptodate(page);
+ page_cache_release(page);
+ }
+
+ return 0;
+}
+
+/*
* Update an existing group.
* This function is used for online resize
*/
@@ -2693,12 +2495,10 @@
clear_opt(sbi->s_mount_opt, MBALLOC);
return -ENOMEM;
}
-
1872 - i = (sb->s_blocksize_bits + 2) * sizeof(unsigned int);
1873 sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL);
1874 if (sbi->s_mb_maxs == NULL) {
1875 clear_opt(sbi->s_mount_opt, MBALLOC);
1876 + kfree(sbi->s_mb_maxs);
1877 - kfree(sbi->s_mb_offsets);
1878 return -ENOMEM;
1879 }
1880
1881 @@ -2858,11 +2658,13 @@
1882 static noinline_for_stack void
1883 ext4_mb_free_committed_blocks(struct super_block *sb)
1884 {
1885 + struct ext4_sb_info *sbi = EXT4_SB(sb);
1886 + int err;
1887 + int i;
1888 + int count = 0;
1889 + int count2 = 0;
1890 + struct ext4_free_metadata *md;
1891 struct ext4_buddy e4b;
1892 - struct ext4_group_info *db;
1893 - struct ext4_sb_info *sbi = EXT4_SB(sb);
1894 - int err, count = 0, count2 = 0;
1895 - struct ext4_free_data *entry;
1896
1897 if (list_empty(&sbi->s_committed_transaction))
1898 return;
1899 @@ -2870,46 +2672,44 @@
1900 /* there is committed blocks to be freed yet */
1901 do {
1902 /* get next array of blocks */
1903 + md = NULL;
1904 - entry = NULL;
1905 spin_lock(&sbi->s_md_lock);
1906 if (!list_empty(&sbi->s_committed_transaction)) {
1907 + md = list_entry(sbi->s_committed_transaction.next,
1908 + struct ext4_free_metadata, list);
1909 + list_del(&md->list);
1910 - entry = list_entry(sbi->s_committed_transaction.next,
1911 - struct ext4_free_data, list);
1912 - list_del(&entry->list);
1913 }
1914 spin_unlock(&sbi->s_md_lock);
1915
1916 + if (md == NULL)
1917 - if (entry == NULL)
1918 break;
1919
1920 mb_debug("gonna free %u blocks in group %lu (0x%p):",
1921 + md->num, md->group, md);
1922 - entry->count, entry->group, entry);
1923
1924 + err = ext4_mb_load_buddy(sb, md->group, &e4b);
1925 - err = ext4_mb_load_buddy(sb, entry->group, &e4b);
1926 /* we expect to find existing buddy because it's pinned */
1927 BUG_ON(err != 0);
1928
1929 - db = e4b.bd_info;
1930 /* there are blocks to put in buddy to make them really free */
1931 + count += md->num;
1932 - count += entry->count;
1933 count2++;
1934 + ext4_lock_group(sb, md->group);
1935 + for (i = 0; i < md->num; i++) {
1936 + mb_debug(" %u", md->blocks[i]);
1937 + mb_free_blocks(NULL, &e4b, md->blocks[i], 1);
1938 - ext4_lock_group(sb, entry->group);
1939 - /* Take it out of per group rb tree */
1940 - rb_erase(&entry->node, &(db->bb_free_root));
1941 - mb_free_blocks(NULL, &e4b, entry->start_blk, entry->count);
1942 -
1943 - if (!db->bb_free_root.rb_node) {
1944 - /* No more items in the per group rb tree
1945 - * balance refcounts from ext4_mb_free_metadata()
1946 - */
1947 - page_cache_release(e4b.bd_buddy_page);
1948 - page_cache_release(e4b.bd_bitmap_page);
1949 }
1950 + mb_debug("\n");
1951 + ext4_unlock_group(sb, md->group);
1952 - ext4_unlock_group(sb, entry->group);
1953
1954 + /* balance refcounts from ext4_mb_free_metadata() */
1955 + page_cache_release(e4b.bd_buddy_page);
1956 + page_cache_release(e4b.bd_bitmap_page);
1957 +
1958 + kfree(md);
1959 - kmem_cache_free(ext4_free_ext_cachep, entry);
1960 ext4_mb_release_desc(&e4b);
1961 +
1962 + } while (md);
1963 - } while (1);
1964
1965 mb_debug("freed %u blocks in %u structures\n", count, count2);
1966 }
1967 @@ -3064,16 +2864,6 @@
1968 kmem_cache_destroy(ext4_pspace_cachep);
1969 return -ENOMEM;
1970 }
1971 -
1972 - ext4_free_ext_cachep =
1973 - kmem_cache_create("ext4_free_block_extents",
1974 - sizeof(struct ext4_free_data),
1975 - 0, SLAB_RECLAIM_ACCOUNT, NULL);
1976 - if (ext4_free_ext_cachep == NULL) {
1977 - kmem_cache_destroy(ext4_pspace_cachep);
1978 - kmem_cache_destroy(ext4_ac_cachep);
1979 - return -ENOMEM;
1980 - }
1981 #ifdef CONFIG_PROC_FS
1982 proc_root_ext4 = proc_mkdir("fs/ext4", NULL);
1983 if (proc_root_ext4 == NULL)
1984 @@ -3090,7 +2880,6 @@
1985 #ifdef CONFIG_PROC_FS
1986 remove_proc_entry("fs/ext4", NULL);
1987 #endif
1988 - kmem_cache_destroy(ext4_free_ext_cachep);
1989 }
1990
1991
1992 @@ -3152,8 +2941,8 @@
1993 in_range(block + len - 1, ext4_inode_table(sb, gdp),
1994 EXT4_SB(sb)->s_itb_per_group)) {
1995 ext4_error(sb, __func__,
1996 + "Allocating block in system zone - block = %llu",
1997 + block);
1998 - "Allocating block %llu in system zone of %lu group\n",
1999 - block, ac->ac_b_ex.fe_group);
2000 /* File system mounted not to panic on error
2001 * Fix the bitmap and repeat the block allocation
2002 * We leak some of the blocks here.
2003 @@ -3175,9 +2964,10 @@
2004 }
2005 }
2006 #endif
2007 + mb_set_bits(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group), bitmap_bh->b_data,
2008 + ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len);
2009 +
2010 spin_lock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group));
2011 - mb_set_bits(NULL, bitmap_bh->b_data,
2012 - ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len);
2013 if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
2014 gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
2015 gdp->bg_free_blocks_count =
2016 @@ -3400,7 +3190,7 @@
2017 }
2018 BUG_ON(start + size <= ac->ac_o_ex.fe_logical &&
2019 start > ac->ac_o_ex.fe_logical);
2020 + BUG_ON(size <= 0 || size >= EXT4_BLOCKS_PER_GROUP(ac->ac_sb));
2021 - BUG_ON(size <= 0 || size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb));
2022
2023 /* now prepare goal request */
2024
2025 @@ -3610,37 +3400,10 @@
2026 ac->ac_criteria = 20;
2027 return 1;
2028 }
2029 -
2030 return 0;
2031 }
2032
2033 /*
2034 - * the function goes through all block freed in the group
2035 - * but not yet committed and marks them used in in-core bitmap.
2036 - * buddy must be generated from this bitmap
2037 - * Need to be called with ext4 group lock (ext4_lock_group)
2038 - */
2039 -static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
2040 - ext4_group_t group)
2041 -{
2042 - struct rb_node *n;
2043 - struct ext4_group_info *grp;
2044 - struct ext4_free_data *entry;
2045 -
2046 - grp = ext4_get_group_info(sb, group);
2047 - n = rb_first(&(grp->bb_free_root));
2048 -
2049 - while (n) {
2050 - entry = rb_entry(n, struct ext4_free_data, node);
2051 - mb_set_bits(sb_bgl_lock(EXT4_SB(sb), group),
2052 - bitmap, entry->start_blk,
2053 - entry->count);
2054 - n = rb_next(n);
2055 - }
2056 - return;
2057 -}
2058 -
2059 -/*
2060 * the function goes through all preallocation in this group and marks them
2061 * used in in-core bitmap. buddy must be generated from this bitmap
2062 * Need to be called with ext4 group lock (ext4_lock_group)
2063 @@ -3698,7 +3461,6 @@
2064 struct super_block *sb, struct ext4_prealloc_space *pa)
2065 {
2066 unsigned long grp;
2067 - ext4_fsblk_t grp_blk;
2068
2069 if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0)
2070 return;
2071 @@ -3713,12 +3475,8 @@
2072 pa->pa_deleted = 1;
2073 spin_unlock(&pa->pa_lock);
2074
2075 + /* -1 is to protect from crossing allocation group */
2076 + ext4_get_group_no_and_offset(sb, pa->pa_pstart - 1, &grp, NULL);
2077 - grp_blk = pa->pa_pstart;
2078 - /* If linear, pa_pstart may be in the next group when pa is used up */
2079 - if (pa->pa_linear)
2080 - grp_blk--;
2081 -
2082 - ext4_get_group_no_and_offset(sb, grp_blk, &grp, NULL);
2083
2084 /*
2085 * possible race:
2086 @@ -3807,8 +3565,6 @@
2087 pa->pa_free = pa->pa_len;
2088 atomic_set(&pa->pa_count, 1);
2089 spin_lock_init(&pa->pa_lock);
2090 - INIT_LIST_HEAD(&pa->pa_inode_list);
2091 - INIT_LIST_HEAD(&pa->pa_group_list);
2092 pa->pa_deleted = 0;
2093 pa->pa_linear = 0;
2094
2095 @@ -3867,7 +3623,6 @@
2096 atomic_set(&pa->pa_count, 1);
2097 spin_lock_init(&pa->pa_lock);
2098 INIT_LIST_HEAD(&pa->pa_inode_list);
2099 - INIT_LIST_HEAD(&pa->pa_group_list);
2100 pa->pa_deleted = 0;
2101 pa->pa_linear = 1;
2102
2103 @@ -4411,7 +4166,6 @@
2104 ac->ac_pa = NULL;
2105 ac->ac_bitmap_page = NULL;
2106 ac->ac_buddy_page = NULL;
2107 - ac->alloc_semp = NULL;
2108 ac->ac_lg = NULL;
2109
2110 /* we have to define context: we'll we work with a file or
2111 @@ -4532,7 +4286,7 @@
2112 pa_inode_list) {
2113 spin_lock(&tmp_pa->pa_lock);
2114 if (tmp_pa->pa_deleted) {
2115 + spin_unlock(&pa->pa_lock);
2116 - spin_unlock(&tmp_pa->pa_lock);
2117 continue;
2118 }
2119 if (!added && pa->pa_free < tmp_pa->pa_free) {
2120 @@ -4577,23 +4331,18 @@
2121 pa->pa_free -= ac->ac_b_ex.fe_len;
2122 pa->pa_len -= ac->ac_b_ex.fe_len;
2123 spin_unlock(&pa->pa_lock);
2124 + /*
2125 + * We want to add the pa to the right bucket.
2126 + * Remove it from the list and while adding
2127 + * make sure the list to which we are adding
2128 + * doesn't grow big.
2129 + */
2130 + if (likely(pa->pa_free)) {
2131 + spin_lock(pa->pa_obj_lock);
2132 + list_del_rcu(&pa->pa_inode_list);
2133 + spin_unlock(pa->pa_obj_lock);
2134 + ext4_mb_add_n_trim(ac);
2135 + }
2136 - }
2137 - }
2138 - if (ac->alloc_semp)
2139 - up_read(ac->alloc_semp);
2140 - if (pa) {
2141 - /*
2142 - * We want to add the pa to the right bucket.
2143 - * Remove it from the list and while adding
2144 - * make sure the list to which we are adding
2145 - * doesn't grow big. We need to release
2146 - * alloc_semp before calling ext4_mb_add_n_trim()
2147 - */
2148 - if (pa->pa_linear && likely(pa->pa_free)) {
2149 - spin_lock(pa->pa_obj_lock);
2150 - list_del_rcu(&pa->pa_inode_list);
2151 - spin_unlock(pa->pa_obj_lock);
2152 - ext4_mb_add_n_trim(ac);
2153 }
2154 ext4_mb_put_pa(ac, ac->ac_sb, pa);
2155 }
2156 @@ -4700,14 +4449,10 @@
2157 ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len)
2158 ext4_mb_new_preallocation(ac);
2159 }
2160 +
2161 if (likely(ac->ac_status == AC_STATUS_FOUND)) {
2162 *errp = ext4_mb_mark_diskspace_used(ac, handle);
2163 if (*errp == -EAGAIN) {
2164 - /*
2165 - * drop the reference that we took
2166 - * in ext4_mb_use_best_found
2167 - */
2168 - ext4_mb_release_context(ac);
2169 ac->ac_b_ex.fe_group = 0;
2170 ac->ac_b_ex.fe_start = 0;
2171 ac->ac_b_ex.fe_len = 0;
2172 @@ -4772,97 +4517,65 @@
2173 ext4_mb_free_committed_blocks(sb);
2174 }
2175
2176 -/*
2177 - * We can merge two free data extents only if the physical blocks
2178 - * are contiguous, AND the extents were freed by the same transaction,
2179 - * AND the blocks are associated with the same group.
2180 - */
2181 -static int can_merge(struct ext4_free_data *entry1,
2182 - struct ext4_free_data *entry2)
2183 -{
2184 - if ((entry1->t_tid == entry2->t_tid) &&
2185 - (entry1->group == entry2->group) &&
2186 - ((entry1->start_blk + entry1->count) == entry2->start_blk))
2187 - return 1;
2188 - return 0;
2189 -}
2190 -
2191 static noinline_for_stack int
2192 ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
2193 + ext4_group_t group, ext4_grpblk_t block, int count)
2194 - struct ext4_free_data *new_entry)
2195 {
2196 - ext4_grpblk_t block;
2197 - struct ext4_free_data *entry;
2198 struct ext4_group_info *db = e4b->bd_info;
2199 struct super_block *sb = e4b->bd_sb;
2200 struct ext4_sb_info *sbi = EXT4_SB(sb);
2201 + struct ext4_free_metadata *md;
2202 + int i;
2203 - struct rb_node **n = &db->bb_free_root.rb_node, *node;
2204 - struct rb_node *parent = NULL, *new_node;
2205
2206 BUG_ON(e4b->bd_bitmap_page == NULL);
2207 BUG_ON(e4b->bd_buddy_page == NULL);
2208
2209 + ext4_lock_group(sb, group);
2210 + for (i = 0; i < count; i++) {
2211 + md = db->bb_md_cur;
2212 + if (md && db->bb_tid != handle->h_transaction->t_tid) {
2213 + db->bb_md_cur = NULL;
2214 + md = NULL;
2215 - new_node = &new_entry->node;
2216 - block = new_entry->start_blk;
2217 -
2218 - if (!*n) {
2219 - /* first free block exent. We need to
2220 - protect buddy cache from being freed,
2221 - * otherwise we'll refresh it from
2222 - * on-disk bitmap and lose not-yet-available
2223 - * blocks */
2224 - page_cache_get(e4b->bd_buddy_page);
2225 - page_cache_get(e4b->bd_bitmap_page);
2226 - }
2227 - while (*n) {
2228 - parent = *n;
2229 - entry = rb_entry(parent, struct ext4_free_data, node);
2230 - if (block < entry->start_blk)
2231 - n = &(*n)->rb_left;
2232 - else if (block >= (entry->start_blk + entry->count))
2233 - n = &(*n)->rb_right;
2234 - else {
2235 - ext4_error(sb, __func__,
2236 - "Double free of blocks %d (%d %d)\n",
2237 - block, entry->start_blk, entry->count);
2238 - return 0;
2239 }
2240 - }
2241
2242 + if (md == NULL) {
2243 + ext4_unlock_group(sb, group);
2244 + md = kmalloc(sizeof(*md), GFP_NOFS);
2245 + if (md == NULL)
2246 + return -ENOMEM;
2247 + md->num = 0;
2248 + md->group = group;
2249 +
2250 + ext4_lock_group(sb, group);
2251 + if (db->bb_md_cur == NULL) {
2252 + spin_lock(&sbi->s_md_lock);
2253 + list_add(&md->list, &sbi->s_active_transaction);
2254 + spin_unlock(&sbi->s_md_lock);
2255 + /* protect buddy cache from being freed,
2256 + * otherwise we'll refresh it from
2257 + * on-disk bitmap and lose not-yet-available
2258 + * blocks */
2259 + page_cache_get(e4b->bd_buddy_page);
2260 + page_cache_get(e4b->bd_bitmap_page);
2261 + db->bb_md_cur = md;
2262 + db->bb_tid = handle->h_transaction->t_tid;
2263 + mb_debug("new md 0x%p for group %lu\n",
2264 + md, md->group);
2265 + } else {
2266 + kfree(md);
2267 + md = db->bb_md_cur;
2268 + }
2269 - rb_link_node(new_node, parent, n);
2270 - rb_insert_color(new_node, &db->bb_free_root);
2271 -
2272 - /* Now try to see the extent can be merged to left and right */
2273 - node = rb_prev(new_node);
2274 - if (node) {
2275 - entry = rb_entry(node, struct ext4_free_data, node);
2276 - if (can_merge(entry, new_entry)) {
2277 - new_entry->start_blk = entry->start_blk;
2278 - new_entry->count += entry->count;
2279 - rb_erase(node, &(db->bb_free_root));
2280 - spin_lock(&sbi->s_md_lock);
2281 - list_del(&entry->list);
2282 - spin_unlock(&sbi->s_md_lock);
2283 - kmem_cache_free(ext4_free_ext_cachep, entry);
2284 }
2285 - }
2286
2287 + BUG_ON(md->num >= EXT4_BB_MAX_BLOCKS);
2288 + md->blocks[md->num] = block + i;
2289 + md->num++;
2290 + if (md->num == EXT4_BB_MAX_BLOCKS) {
2291 + /* no more space, put full container on a sb's list */
2292 + db->bb_md_cur = NULL;
2293 - node = rb_next(new_node);
2294 - if (node) {
2295 - entry = rb_entry(node, struct ext4_free_data, node);
2296 - if (can_merge(new_entry, entry)) {
2297 - new_entry->count += entry->count;
2298 - rb_erase(node, &(db->bb_free_root));
2299 - spin_lock(&sbi->s_md_lock);
2300 - list_del(&entry->list);
2301 - spin_unlock(&sbi->s_md_lock);
2302 - kmem_cache_free(ext4_free_ext_cachep, entry);
2303 }
2304 }
2305 + ext4_unlock_group(sb, group);
2306 - /* Add the extent to active_transaction list */
2307 - spin_lock(&sbi->s_md_lock);
2308 - list_add(&new_entry->list, &sbi->s_active_transaction);
2309 - spin_unlock(&sbi->s_md_lock);
2310 return 0;
2311 }
2312
2313 @@ -4962,6 +4675,11 @@
2314 err = ext4_journal_get_write_access(handle, gd_bh);
2315 if (err)
2316 goto error_return;
2317 +
2318 + err = ext4_mb_load_buddy(sb, block_group, &e4b);
2319 + if (err)
2320 + goto error_return;
2321 +
2322 #ifdef AGGRESSIVE_CHECK
2323 {
2324 int i;
2325 @@ -4969,6 +4687,13 @@
2326 BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data));
2327 }
2328 #endif
2329 + mb_clear_bits(sb_bgl_lock(sbi, block_group), bitmap_bh->b_data,
2330 + bit, count);
2331 +
2332 + /* We dirtied the bitmap block */
2333 + BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
2334 + err = ext4_journal_dirty_metadata(handle, bitmap_bh);
2335 +
2336 if (ac) {
2337 ac->ac_b_ex.fe_group = block_group;
2338 ac->ac_b_ex.fe_start = bit;
2339 @@ -4976,33 +4701,12 @@
2340 ext4_mb_store_history(ac);
2341 }
2342
2343 - err = ext4_mb_load_buddy(sb, block_group, &e4b);
2344 - if (err)
2345 - goto error_return;
2346 if (metadata) {
2347 + /* blocks being freed are metadata. these blocks shouldn't
2348 + * be used until this transaction is committed */
2349 + ext4_mb_free_metadata(handle, &e4b, block_group, bit, count);
2350 - struct ext4_free_data *new_entry;
2351 - /*
2352 - * blocks being freed are metadata. these blocks shouldn't
2353 - * be used until this transaction is committed
2354 - */
2355 - new_entry = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS);
2356 - new_entry->start_blk = bit;
2357 - new_entry->group = block_group;
2358 - new_entry->count = count;
2359 - new_entry->t_tid = handle->h_transaction->t_tid;
2360 - ext4_lock_group(sb, block_group);
2361 - mb_clear_bits(sb_bgl_lock(sbi, block_group), bitmap_bh->b_data,
2362 - bit, count);
2363 - ext4_mb_free_metadata(handle, &e4b, new_entry);
2364 - ext4_unlock_group(sb, block_group);
2365 } else {
2366 ext4_lock_group(sb, block_group);
2367 - /* need to update group_info->bb_free and bitmap
2368 - * with group lock held. generate_buddy look at
2369 - * them with group lock_held
2370 - */
2371 - mb_clear_bits(sb_bgl_lock(sbi, block_group), bitmap_bh->b_data,
2372 - bit, count);
2373 mb_free_blocks(inode, &e4b, bit, count);
2374 ext4_mb_return_to_preallocation(inode, &e4b, block, count);
2375 ext4_unlock_group(sb, block_group);
2376 @@ -5025,10 +4729,6 @@
2377
2378 *freed += count;
2379
2380 - /* We dirtied the bitmap block */
2381 - BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
2382 - err = ext4_journal_dirty_metadata(handle, bitmap_bh);
2383 -
2384 /* And the group descriptor block */
2385 BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
2386 ret = ext4_journal_dirty_metadata(handle, gd_bh);
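
The mballoc.c hunks above repeatedly rely on the buddy cache layout: the buddy cache inode stores two consecutive logical blocks per group, the block bitmap at block group * 2 and the buddy data at group * 2 + 1, so both sides of the revert locate a group's pages with the same pnum/poff arithmetic. A minimal userspace sketch of that arithmetic (illustrative only, not kernel code; a 4k page and 1k blocksize are assumed for PAGE_CACHE_SIZE / s_blocksize):

        #include <stdio.h>

        int main(void)
        {
                int blocks_per_page = 4096 / 1024;  /* PAGE_CACHE_SIZE / s_blocksize (assumed) */
                int group;

                for (group = 0; group < 4; group++) {
                        int block = group * 2;      /* block bitmap; buddy data is block + 1 */
                        printf("group %d: bitmap page %d off %d, buddy page %d off %d\n",
                               group,
                               block / blocks_per_page, block % blocks_per_page,
                               (block + 1) / blocks_per_page, (block + 1) % blocks_per_page);
                }
                return 0;
        }

With four blocks per page, groups 0 and 1 share page 0 and groups 2 and 3 share page 1, which is why the removed ext4_mb_init_group has to handle the bitmap and buddy landing in the same page.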
reverted:
--- b/fs/ext4/mballoc.h
+++ a/fs/ext4/mballoc.h
@@ -18,7 +18,6 @@
 #include <linux/pagemap.h>
 #include <linux/seq_file.h>
 #include <linux/version.h>
-#include <linux/mutex.h>
 #include "ext4_jbd2.h"
 #include "ext4.h"
 #include "group.h"
@@ -97,27 +96,25 @@
 */
 #define MB_DEFAULT_GROUP_PREALLOC 512

+static struct kmem_cache *ext4_pspace_cachep;
+static struct kmem_cache *ext4_ac_cachep;
-struct ext4_free_data {
- /* this links the free block information from group_info */
- struct rb_node node;

+#ifdef EXT4_BB_MAX_BLOCKS
+#undef EXT4_BB_MAX_BLOCKS
+#endif
+#define EXT4_BB_MAX_BLOCKS 30
- /* this links the free block information from ext4_sb_info */
- struct list_head list;

+struct ext4_free_metadata {
- /* group which free block extent belongs */
 ext4_group_t group;
+ unsigned short num;
+ ext4_grpblk_t blocks[EXT4_BB_MAX_BLOCKS];
+ struct list_head list;
-
- /* free block extent */
- ext4_grpblk_t start_blk;
- ext4_grpblk_t count;
-
- /* transaction which freed this extent */
- tid_t t_tid;
 };

 struct ext4_group_info {
 unsigned long bb_state;
+ unsigned long bb_tid;
+ struct ext4_free_metadata *bb_md_cur;
- struct rb_root bb_free_root;
 unsigned short bb_first_free;
 unsigned short bb_free;
 unsigned short bb_fragments;
@@ -125,7 +122,6 @@
 #ifdef DOUBLE_CHECK
 void *bb_bitmap;
 #endif
- struct rw_semaphore alloc_sem;
 unsigned short bb_counters[];
 };

@@ -213,11 +209,6 @@
 __u8 ac_op; /* operation, for history only */
 struct page *ac_bitmap_page;
 struct page *ac_buddy_page;
- /*
- * pointer to the held semaphore upon successful
- * block allocation
- */
- struct rw_semaphore *alloc_semp;
 struct ext4_prealloc_space *ac_pa;
 struct ext4_locality_group *ac_lg;
 };
@@ -251,7 +242,6 @@
 struct super_block *bd_sb;
 __u16 bd_blkbits;
 ext4_group_t bd_group;
- struct rw_semaphore *alloc_semp;
 };
 #define EXT4_MB_BITMAP(e4b) ((e4b)->bd_bitmap)
 #define EXT4_MB_BUDDY(e4b) ((e4b)->bd_buddy)
@@ -261,6 +251,8 @@
 {
 return;
 }
+#else
+static void ext4_mb_store_history(struct ext4_allocation_context *ac);
 #endif

 #define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
@@ -268,6 +260,19 @@
 static struct proc_dir_entry *proc_root_ext4;
 struct buffer_head *read_block_bitmap(struct super_block *, ext4_group_t);

+static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
+ ext4_group_t group);
+static void ext4_mb_poll_new_transaction(struct super_block *, handle_t *);
+static void ext4_mb_free_committed_blocks(struct super_block *);
+static void ext4_mb_return_to_preallocation(struct inode *inode,
+ struct ext4_buddy *e4b, sector_t block,
+ int count);
+static void ext4_mb_put_pa(struct ext4_allocation_context *,
+ struct super_block *, struct ext4_prealloc_space *pa);
+static int ext4_mb_init_per_dev_proc(struct super_block *sb);
+static int ext4_mb_destroy_per_dev_proc(struct super_block *sb);
+
+
 static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group)
 {
 struct ext4_group_info *grinfo = ext4_get_group_info(sb, group);
@@ -292,7 +297,7 @@
 &(grinfo->bb_state));
 }

+static ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb,
-static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb,
 struct ext4_free_extent *fex)
 {
 ext4_fsblk_t block;
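
The mballoc.h hunk above swaps the rb-tree based struct ext4_free_data back for the older fixed-size struct ext4_free_metadata: freed blocks are recorded one at a time, at most EXT4_BB_MAX_BLOCKS (30) per container, so freeing N blocks in one group during one transaction consumes ceil(N / 30) containers on s_active_transaction, where the removed scheme would hold a contiguous run in a single merged extent node (see can_merge in the mballoc.c hunks above). A small sketch of that capacity arithmetic (illustrative only):

        #include <stdio.h>

        #define EXT4_BB_MAX_BLOCKS 30   /* as in the restored mballoc.h above */

        int main(void)
        {
                int freed = 100;        /* example: blocks freed in one group */
                int containers = (freed + EXT4_BB_MAX_BLOCKS - 1) / EXT4_BB_MAX_BLOCKS;

                printf("%d freed blocks -> %d containers\n", freed, containers);
                return 0;
        }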
reverted:
--- b/fs/ext4/migrate.c
+++ a/fs/ext4/migrate.c
@@ -480,7 +480,7 @@
 + 1);
 if (IS_ERR(handle)) {
 retval = PTR_ERR(handle);
+ goto err_out;
- return retval;
 }
 tmp_inode = ext4_new_inode(handle,
 inode->i_sb->s_root->d_inode,
@@ -488,7 +488,8 @@
 if (IS_ERR(tmp_inode)) {
 retval = -ENOMEM;
 ext4_journal_stop(handle);
+ tmp_inode = NULL;
+ goto err_out;
- return retval;
 }
 i_size_write(tmp_inode, i_size_read(inode));
 /*
@@ -616,7 +617,8 @@

 ext4_journal_stop(handle);

+ if (tmp_inode)
+ iput(tmp_inode);
- iput(tmp_inode);

 return retval;
 }
reverted:
--- b/fs/ext4/namei.c
+++ a/fs/ext4/namei.c
@@ -371,8 +371,6 @@
 goto fail;
 }
 hinfo->hash_version = root->info.hash_version;
- if (hinfo->hash_version <= DX_HASH_TEA)
- hinfo->hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned;
 hinfo->seed = EXT4_SB(dir->i_sb)->s_hash_seed;
 if (dentry)
 ext4fs_dirhash(dentry->d_name.name, dentry->d_name.len, hinfo);
@@ -642,9 +640,6 @@
 dir = dir_file->f_path.dentry->d_inode;
 if (!(EXT4_I(dir)->i_flags & EXT4_INDEX_FL)) {
 hinfo.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version;
- if (hinfo.hash_version <= DX_HASH_TEA)
- hinfo.hash_version +=
- EXT4_SB(dir->i_sb)->s_hash_unsigned;
 hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed;
 count = htree_dirblock_to_tree(dir_file, dir, 0, &hinfo,
 start_hash, start_minor_hash);
@@ -1055,16 +1050,8 @@
 return ERR_PTR(-EIO);
 }
 inode = ext4_iget(dir->i_sb, ino);
+ if (IS_ERR(inode))
+ return ERR_CAST(inode);
- if (unlikely(IS_ERR(inode))) {
- if (PTR_ERR(inode) == -ESTALE) {
- ext4_error(dir->i_sb, __func__,
- "deleted inode referenced: %u",
- ino);
- return ERR_PTR(-EIO);
- } else {
- return ERR_CAST(inode);
- }
- }
 }
 return d_splice_alias(inode, dentry);
 }
@@ -1390,7 +1377,7 @@
 struct fake_dirent *fde;

 blocksize = dir->i_sb->s_blocksize;
+ dxtrace(printk("Creating index\n"));
- dxtrace(printk(KERN_DEBUG "Creating index: inode %lu\n", dir->i_ino));
 retval = ext4_journal_get_write_access(handle, bh);
 if (retval) {
 ext4_std_error(dir->i_sb, retval);
@@ -1399,20 +1386,6 @@
 }
 root = (struct dx_root *) bh->b_data;

- /* The 0th block becomes the root, move the dirents out */
- fde = &root->dotdot;
- de = (struct ext4_dir_entry_2 *)((char *)fde +
- ext4_rec_len_from_disk(fde->rec_len));
- if ((char *) de >= (((char *) root) + blocksize)) {
- ext4_error(dir->i_sb, __func__,
- "invalid rec_len for '..' in inode %lu",
- dir->i_ino);
- brelse(bh);
- return -EIO;
- }
- len = ((char *) root) + blocksize - (char *) de;
-
- /* Allocate new block for the 0th block's dirents */
 bh2 = ext4_append (handle, dir, &block, &retval);
 if (!(bh2)) {
 brelse(bh);
@@ -1421,6 +1394,11 @@
 EXT4_I(dir)->i_flags |= EXT4_INDEX_FL;
 data1 = bh2->b_data;

+ /* The 0th block becomes the root, move the dirents out */
+ fde = &root->dotdot;
+ de = (struct ext4_dir_entry_2 *)((char *)fde +
+ ext4_rec_len_from_disk(fde->rec_len));
+ len = ((char *) root) + blocksize - (char *) de;
 memcpy (data1, de, len);
 de = (struct ext4_dir_entry_2 *) data1;
 top = data1 + len;
@@ -1440,8 +1418,6 @@

 /* Initialize as for dx_probe */
 hinfo.hash_version = root->info.hash_version;
- if (hinfo.hash_version <= DX_HASH_TEA)
- hinfo.hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned;
 hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed;
 ext4fs_dirhash(name, namelen, &hinfo);
 frame = frames;
@@ -2314,7 +2290,7 @@
 struct inode * old_inode, * new_inode;
 struct buffer_head * old_bh, * new_bh, * dir_bh;
 struct ext4_dir_entry_2 * old_de, * new_de;
+ int retval;
- int retval, force_da_alloc = 0;

 old_bh = new_bh = dir_bh = NULL;

@@ -2452,7 +2428,6 @@
 ext4_mark_inode_dirty(handle, new_inode);
 if (!new_inode->i_nlink)
 ext4_orphan_add(handle, new_inode);
- force_da_alloc = 1;
 }
 retval = 0;

@@ -2461,8 +2436,6 @@
 brelse (old_bh);
 brelse (new_bh);
 ext4_journal_stop(handle);
- if (retval == 0 && force_da_alloc)
- ext4_alloc_da_blocks(old_inode);
 return retval;
 }

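The namei.c hunks above remove every use of s_hash_unsigned, which the super.c hunk further down also stops computing from EXT2_FLAGS_UNSIGNED_HASH / EXT2_FLAGS_SIGNED_HASH. The removed lines shifted the three classic hash versions onto unsigned variants whenever the superblock recorded an unsigned-char dirhash. A sketch of that mapping (illustrative only; the enum values are assumptions based on the ext4 headers of this era, with each unsigned variant defined 3 above its signed counterpart, matching s_hash_unsigned == 3):

        enum {
                DX_HASH_LEGACY            = 0,
                DX_HASH_HALF_MD4          = 1,
                DX_HASH_TEA               = 2,
                DX_HASH_LEGACY_UNSIGNED   = 3,  /* assumed values, see note above */
                DX_HASH_HALF_MD4_UNSIGNED = 4,
                DX_HASH_TEA_UNSIGNED      = 5,
        };

        static int effective_hash_version(int version, int s_hash_unsigned)
        {
                /* mirrors the removed lines: s_hash_unsigned is either 0 or 3 */
                if (version <= DX_HASH_TEA)
                        version += s_hash_unsigned;
                return version;
        }
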
reverted:
--- b/fs/ext4/resize.c
+++ a/fs/ext4/resize.c
@@ -284,9 +284,11 @@
 if ((err = extend_or_restart_transaction(handle, 2, bh)))
 goto exit_bh;

+ mark_bitmap_end(input->blocks_count, EXT4_BLOCKS_PER_GROUP(sb),
+ bh->b_data);
- mark_bitmap_end(input->blocks_count, sb->s_blocksize * 8, bh->b_data);
 ext4_journal_dirty_metadata(handle, bh);
 brelse(bh);
+
 /* Mark unused entries in inode bitmap used */
 ext4_debug("clear inode bitmap %#04llx (+%llu)\n",
 input->inode_bitmap, input->inode_bitmap - start);
@@ -295,7 +297,7 @@
 goto exit_journal;
 }

+ mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), EXT4_BLOCKS_PER_GROUP(sb),
- mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8,
 bh->b_data);
 ext4_journal_dirty_metadata(handle, bh);
 exit_bh:
@@ -745,7 +747,6 @@
 struct inode *inode = NULL;
 handle_t *handle;
 int gdb_off, gdb_num;
- int num_grp_locked = 0;
 int err, err2;

 gdb_num = input->group / EXT4_DESC_PER_BLOCK(sb);
@@ -786,7 +787,6 @@
 }
 }

-
 if ((err = verify_group_input(sb, input)))
 goto exit_put;

@@ -855,18 +855,15 @@
 * using the new disk blocks.
 */

- num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, input->group);
 /* Update group descriptor block for new group */
 gdp = (struct ext4_group_desc *)((char *)primary->b_data +
 gdb_off * EXT4_DESC_SIZE(sb));

- memset(gdp, 0, EXT4_DESC_SIZE(sb));
 ext4_block_bitmap_set(sb, gdp, input->block_bitmap); /* LV FIXME */
 ext4_inode_bitmap_set(sb, gdp, input->inode_bitmap); /* LV FIXME */
 ext4_inode_table_set(sb, gdp, input->inode_table); /* LV FIXME */
 gdp->bg_free_blocks_count = cpu_to_le16(input->free_blocks_count);
 gdp->bg_free_inodes_count = cpu_to_le16(EXT4_INODES_PER_GROUP(sb));
- gdp->bg_flags = cpu_to_le16(EXT4_BG_INODE_ZEROED);
 gdp->bg_checksum = ext4_group_desc_csum(sbi, input->group, gdp);

 /*
@@ -874,11 +871,9 @@
 * descriptor
 */
 if (test_opt(sb, MBALLOC)) {
+ err = ext4_mb_add_more_groupinfo(sb, input->group, gdp);
+ if (err)
- err = ext4_mb_add_groupinfo(sb, input->group, gdp);
- if (err) {
- ext4_mb_put_buddy_cache_lock(sb, input->group, num_grp_locked);
 goto exit_journal;
- }
 }
 /*
 * Make the new blocks and inodes valid next. We do this before
@@ -920,7 +915,6 @@

 /* Update the global fs size fields */
 sbi->s_groups_count++;
- ext4_mb_put_buddy_cache_lock(sb, input->group, num_grp_locked);

 ext4_journal_dirty_metadata(handle, primary);

@@ -982,7 +976,9 @@
 struct buffer_head * bh;
 handle_t *handle;
 int err;
+ unsigned long freed_blocks;
 ext4_group_t group;
+ struct ext4_group_info *grp;

 /* We don't need to worry about locking wrt other resizers just
 * yet: we're going to revalidate es->s_blocks_count after
@@ -1081,13 +1077,50 @@
 unlock_super(sb);
 ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count,
 o_blocks_count + add);
+ ext4_free_blocks_sb(handle, sb, o_blocks_count, add, &freed_blocks);
- /* We add the blocks to the bitmap and set the group need init bit */
- ext4_add_groupblocks(handle, sb, o_blocks_count, add);
 ext4_debug("freed blocks %llu through %llu\n", o_blocks_count,
 o_blocks_count + add);
 if ((err = ext4_journal_stop(handle)))
 goto exit_put;

+ /*
+ * Mark mballoc pages as not up to date so that they will be updated
+ * next time they are loaded by ext4_mb_load_buddy.
+ */
+ if (test_opt(sb, MBALLOC)) {
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct inode *inode = sbi->s_buddy_cache;
+ int blocks_per_page;
+ int block;
+ int pnum;
+ struct page *page;
+
+ /* Set buddy page as not up to date */
+ blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
+ block = group * 2;
+ pnum = block / blocks_per_page;
+ page = find_get_page(inode->i_mapping, pnum);
+ if (page != NULL) {
+ ClearPageUptodate(page);
+ page_cache_release(page);
+ }
+
+ /* Set bitmap page as not up to date */
+ block++;
+ pnum = block / blocks_per_page;
+ page = find_get_page(inode->i_mapping, pnum);
+ if (page != NULL) {
+ ClearPageUptodate(page);
+ page_cache_release(page);
+ }
+
+ /* Get the info on the last group */
+ grp = ext4_get_group_info(sb, group);
+
+ /* Update free blocks in group info */
+ ext4_mb_update_group_info(grp, add);
+ }
+
 if (test_opt(sb, DEBUG))
 printk(KERN_DEBUG "EXT4-fs: extended group to %llu blocks\n",
 ext4_blocks_count(es));
reverted:
--- b/fs/ext4/super.c
+++ a/fs/ext4/super.c
@@ -1493,6 +1493,7 @@
 ext4_group_t flex_group_count;
 ext4_group_t flex_group;
 int groups_per_flex = 0;
+ __u64 block_bitmap = 0;
 int i;

 if (!sbi->s_es->s_log_groups_per_flex) {
@@ -1515,6 +1516,9 @@
 goto failed;
 }

+ gdp = ext4_get_group_desc(sb, 1, &bh);
+ block_bitmap = ext4_block_bitmap(sb, gdp) - 1;
+
 for (i = 0; i < sbi->s_groups_count; i++) {
 gdp = ext4_get_group_desc(sb, i, &bh);

@@ -1916,8 +1920,8 @@
 struct inode *root;
 int ret = -EINVAL;
 int blocksize;
+ int db_count;
+ int i;
- unsigned int db_count;
- unsigned int i;
 int needs_recovery;
 __le32 features;
 __u64 blocks_count;
@@ -2168,18 +2172,6 @@
 for (i = 0; i < 4; i++)
 sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]);
 sbi->s_def_hash_version = es->s_def_hash_version;
- i = le32_to_cpu(es->s_flags);
- if (i & EXT2_FLAGS_UNSIGNED_HASH)
- sbi->s_hash_unsigned = 3;
- else if ((i & EXT2_FLAGS_SIGNED_HASH) == 0) {
-#ifdef __CHAR_UNSIGNED__
- es->s_flags |= cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH);
- sbi->s_hash_unsigned = 3;
-#else
- es->s_flags |= cpu_to_le32(EXT2_FLAGS_SIGNED_HASH);
-#endif
- sb->s_dirt = 1;
- }

 if (sbi->s_blocks_per_group > blocksize * 8) {
 printk(KERN_ERR
@@ -2207,30 +2199,20 @@
 if (EXT4_BLOCKS_PER_GROUP(sb) == 0)
 goto cantfind_ext4;

+ /* ensure blocks_count calculation below doesn't sign-extend */
+ if (ext4_blocks_count(es) + EXT4_BLOCKS_PER_GROUP(sb) <
+ le32_to_cpu(es->s_first_data_block) + 1) {
+ printk(KERN_WARNING "EXT4-fs: bad geometry: block count %llu, "
+ "first data block %u, blocks per group %lu\n",
+ ext4_blocks_count(es),
+ le32_to_cpu(es->s_first_data_block),
+ EXT4_BLOCKS_PER_GROUP(sb));
- /*
- * It makes no sense for the first data block to be beyond the end
- * of the filesystem.
- */
- if (le32_to_cpu(es->s_first_data_block) >= ext4_blocks_count(es)) {
- printk(KERN_WARNING "EXT4-fs: bad geometry: first data"
- "block %u is beyond end of filesystem (%llu)\n",
- le32_to_cpu(es->s_first_data_block),
- ext4_blocks_count(es));
 goto failed_mount;
 }
 blocks_count = (ext4_blocks_count(es) -
 le32_to_cpu(es->s_first_data_block) +
 EXT4_BLOCKS_PER_GROUP(sb) - 1);
 do_div(blocks_count, EXT4_BLOCKS_PER_GROUP(sb));
- if (blocks_count > ((uint64_t)1<<32) - EXT4_DESC_PER_BLOCK(sb)) {
- printk(KERN_WARNING "EXT4-fs: groups count too large: %u "
- "(block count %llu, first data block %u, "
- "blocks per group %lu)\n", sbi->s_groups_count,
- ext4_blocks_count(es),
- le32_to_cpu(es->s_first_data_block),
- EXT4_BLOCKS_PER_GROUP(sb));
- goto failed_mount;
- }
 sbi->s_groups_count = blocks_count;
 db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) /
 EXT4_DESC_PER_BLOCK(sb);
@@ -2950,14 +2932,14 @@

 static int ext4_sync_fs(struct super_block *sb, int wait)
 {
+ int ret = 0;
- tid_t target;

 sb->s_dirt = 0;
+ if (wait)
+ ret = ext4_force_commit(sb);
+ else
+ jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, NULL);
+ return ret;
- if (jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, &target)) {
- if (wait)
- jbd2_log_wait_commit(EXT4_SB(sb)->s_journal, target);
- }
- return 0;
 }

 /*
reverted:
--- b/fs/jbd2/commit.c
+++ a/fs/jbd2/commit.c
@@ -24,7 +24,6 @@
 #include <linux/crc32.h>
 #include <linux/writeback.h>
 #include <linux/backing-dev.h>
-#include <linux/bio.h>

 /*
 * Default IO end handler for temporary BJ_IO buffer_heads.
@@ -171,34 +170,12 @@
 * This function along with journal_submit_commit_record
 * allows to write the commit record asynchronously.
 */
+static int journal_wait_on_commit_record(struct buffer_head *bh)
-static int journal_wait_on_commit_record(journal_t *journal,
- struct buffer_head *bh)
 {
 int ret = 0;

-retry:
 clear_buffer_dirty(bh);
 wait_on_buffer(bh);
- if (buffer_eopnotsupp(bh) && (journal->j_flags & JBD2_BARRIER)) {
- printk(KERN_WARNING
- "JBD2: wait_on_commit_record: sync failed on %s - "
- "disabling barriers\n", journal->j_devname);
- spin_lock(&journal->j_state_lock);
- journal->j_flags &= ~JBD2_BARRIER;
- spin_unlock(&journal->j_state_lock);
-
- lock_buffer(bh);
- clear_buffer_dirty(bh);
- set_buffer_uptodate(bh);
- bh->b_end_io = journal_end_buffer_io_sync;
-
- ret = submit_bh(WRITE_SYNC, bh);
- if (ret) {
- unlock_buffer(bh);
- return ret;
- }
- goto retry;
- }

 if (unlikely(!buffer_uptodate(bh)))
 ret = -EIO;
@@ -818,7 +795,7 @@
 __jbd2_journal_abort_hard(journal);
 }
 if (!err && !is_journal_aborted(journal))
+ err = journal_wait_on_commit_record(cbh);
- err = journal_wait_on_commit_record(journal, cbh);

 if (err)
 jbd2_journal_abort(journal, err);
reverted:
--- b/fs/jbd2/journal.c
+++ a/fs/jbd2/journal.c
@@ -430,7 +430,7 @@
 }

 /*
+ * Called under j_state_lock. Returns true if a transaction was started.
- * Called under j_state_lock. Returns true if a transaction commit was started.
 */
 int __jbd2_log_start_commit(journal_t *journal, tid_t target)
 {
@@ -498,8 +498,7 @@

 /*
 * Start a commit of the current running transaction (if any). Returns true
+ * if a transaction was started, and fills its tid in at *ptid
- * if a transaction is going to be committed (or is currently already
- * committing), and fills its tid in at *ptid
 */
 int jbd2_journal_start_commit(journal_t *journal, tid_t *ptid)
 {
@@ -509,19 +508,15 @@
 if (journal->j_running_transaction) {
 tid_t tid = journal->j_running_transaction->t_tid;

+ ret = __jbd2_log_start_commit(journal, tid);
+ if (ret && ptid)
- __jbd2_log_start_commit(journal, tid);
- /* There's a running transaction and we've just made sure
- * it's commit has been scheduled. */
- if (ptid)
 *ptid = tid;
+ } else if (journal->j_committing_transaction && ptid) {
- ret = 1;
- } else if (journal->j_committing_transaction) {
 /*
 * If ext3_write_super() recently started a commit, then we
 * have to wait for completion of that transaction
 */
+ *ptid = journal->j_committing_transaction->t_tid;
- if (ptid)
- *ptid = journal->j_committing_transaction->t_tid;
 ret = 1;
 }
 spin_unlock(&journal->j_state_lock);
3013 --- b/fs/jbd2/revoke.c
3014 +++ a/fs/jbd2/revoke.c
3015 @@ -55,25 +55,6 @@
3016 * need do nothing.
3017 * RevokeValid set, Revoked set:
3018 * buffer has been revoked.
3019 - *
3020 - * Locking rules:
3021 - * We keep two hash tables of revoke records. One hashtable belongs to the
3022 - * running transaction (is pointed to by journal->j_revoke), the other one
3023 - * belongs to the committing transaction. Accesses to the second hash table
3024 - * happen only from the kjournald and no other thread touches this table. Also
3025 - * journal_switch_revoke_table() which switches which hashtable belongs to the
3026 - * running and which to the committing transaction is called only from
3027 - * kjournald. Therefore we need no locks when accessing the hashtable belonging
3028 - * to the committing transaction.
3029 - *
3030 - * All users operating on the hash table belonging to the running transaction
3031 - * have a handle to the transaction. Therefore they are safe from kjournald
3032 - * switching hash tables under them. For operations on the lists of entries in
3033 - * the hash table j_revoke_lock is used.
3034 - *
3035 - * Finally, also replay code uses the hash tables but at this moment noone else
3036 - * can touch them (filesystem isn't mounted yet) and hence no locking is
3037 - * needed.
3038 */
3039
3040 #ifndef __KERNEL__
3041 @@ -420,6 +401,8 @@
3042 * the second time we would still have a pending revoke to cancel. So,
3043 * do not trust the Revoked bit on buffers unless RevokeValid is also
3044 * set.
3045 + *
3046 + * The caller must have the journal locked.
3047 */
3048 int jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh)
3049 {
3050 @@ -497,7 +480,10 @@
3051 /*
3052 * Write revoke records to the journal for all entries in the current
3053 * revoke hash, deleting the entries as we go.
3054 + *
3055 + * Called with the journal lock held.
3056 */
3057 +
3058 void jbd2_journal_write_revoke_records(journal_t *journal,
3059 transaction_t *transaction)
3060 {
reverted:
--- b/fs/jbd2/transaction.c
+++ a/fs/jbd2/transaction.c
@@ -2049,46 +2049,26 @@
 }

 /*
+ * This function must be called when inode is journaled in ordered mode
+ * before truncation happens. It starts writeout of truncated part in
+ * case it is in the committing transaction so that we stand to ordered
+ * mode consistency guarantees.
- * File truncate and transaction commit interact with each other in a
- * non-trivial way. If a transaction writing data block A is
- * committing, we cannot discard the data by truncate until we have
- * written them. Otherwise if we crashed after the transaction with
- * write has committed but before the transaction with truncate has
- * committed, we could see stale data in block A. This function is a
- * helper to solve this problem. It starts writeout of the truncated
- * part in case it is in the committing transaction.
- *
- * Filesystem code must call this function when inode is journaled in
- * ordered mode before truncation happens and after the inode has been
- * placed on orphan list with the new inode size. The second condition
- * avoids the race that someone writes new data and we start
- * committing the transaction after this function has been called but
- * before a transaction for truncate is started (and furthermore it
- * allows us to optimize the case where the addition to orphan list
- * happens in the same transaction as write --- we don't have to write
- * any data in such case).
 */
+int jbd2_journal_begin_ordered_truncate(struct jbd2_inode *inode,
-int jbd2_journal_begin_ordered_truncate(journal_t *journal,
- struct jbd2_inode *jinode,
 loff_t new_size)
 {
+ journal_t *journal;
+ transaction_t *commit_trans;
- transaction_t *inode_trans, *commit_trans;
 int ret = 0;

+ if (!inode->i_transaction && !inode->i_next_transaction)
- /* This is a quick check to avoid locking if not necessary */
- if (!jinode->i_transaction)
 goto out;
+ journal = inode->i_transaction->t_journal;
- /* Locks are here just to force reading of recent values, it is
- * enough that the transaction was not committing before we started
- * a transaction adding the inode to orphan list */
 spin_lock(&journal->j_state_lock);
 commit_trans = journal->j_committing_transaction;
 spin_unlock(&journal->j_state_lock);
+ if (inode->i_transaction == commit_trans) {
+ ret = filemap_fdatawrite_range(inode->i_vfs_inode->i_mapping,
- spin_lock(&journal->j_list_lock);
- inode_trans = jinode->i_transaction;
- spin_unlock(&journal->j_list_lock);
- if (inode_trans == commit_trans) {
- ret = filemap_fdatawrite_range(jinode->i_vfs_inode->i_mapping,
 new_size, LLONG_MAX);
 if (ret)
 jbd2_journal_abort(journal, ret);
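
Both versions of the comment above describe the same hazard: if the transaction that wrote a data block is still committing when that block is truncated away, the truncated range must be written out first, or a crash after the write's commit but before the truncate's commit could expose stale data. The guard the restored function applies boils down to (condensed from the '+' lines above, illustrative only):

        if (!inode->i_transaction && !inode->i_next_transaction)
                return 0;       /* no journaled data in flight for this inode */
        journal = inode->i_transaction->t_journal;
        if (inode->i_transaction == journal->j_committing_transaction)
                ret = filemap_fdatawrite_range(inode->i_vfs_inode->i_mapping,
                                               new_size, LLONG_MAX);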
reverted:
--- b/include/linux/jbd2.h
+++ a/include/linux/jbd2.h
@@ -308,8 +308,7 @@
 int val = (expr); \
 if (!val) { \
 printk(KERN_ERR \
+ "EXT3-fs unexpected failure: %s;\n",# expr); \
- "JBD2 unexpected failure: %s: %s;\n", \
- __func__, #expr); \
 printk(KERN_ERR why "\n"); \
 } \
 val; \
@@ -330,7 +329,6 @@
 BH_State, /* Pins most journal_head state */
 BH_JournalHead, /* Pins bh->b_private and jh->b_bh */
 BH_Unshadow, /* Dummy bit, for BJ_Shadow wakeup filtering */
- BH_JBDPrivateStart, /* First bit available for private use by FS */
 };

 BUFFER_FNS(JBD, jbd)
@@ -1075,8 +1073,7 @@
 extern int jbd2_journal_bmap(journal_t *, unsigned long, unsigned long long *);
 extern int jbd2_journal_force_commit(journal_t *);
 extern int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *inode);
+extern int jbd2_journal_begin_ordered_truncate(struct jbd2_inode *inode, loff_t new_size);
-extern int jbd2_journal_begin_ordered_truncate(journal_t *journal,
- struct jbd2_inode *inode, loff_t new_size);
 extern void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode, struct inode *inode);
 extern void jbd2_journal_release_jbd_inode(journal_t *journal, struct jbd2_inode *jinode);
