From: Greg Kroah-Hartman <gregkh@suse.de>
Subject: revert ext4 changes in 2.6.27.19 and 2.6.27.20 and 2.6.27.25
Patch-mainline: no

As we are already taking a different version of ext4, revert the
changes that were made to ext4 in 2.6.27.19 and 2.6.27.20 and 2.6.27.25

Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>

--- b/Documentation/filesystems/ext4.txt
+++ a/Documentation/filesystems/ext4.txt
@@ -73,7 +73,7 @@
 * extent format more robust in face of on-disk corruption due to magics,
 * internal redunancy in tree
 * improved file allocation (multi-block alloc)
+* fix 32000 subdirectory limit
-* lift 32000 subdirectory limit imposed by i_links_count[1]
 * nsec timestamps for mtime, atime, ctime, create time
 * inode version field on disk (NFSv4, Lustre)
 * reduced e2fsck time via uninit_bg feature
@@ -88,9 +88,6 @@
 * efficent new ordered mode in JBD2 and ext4(avoid using buffer head to force
 the ordering)

-[1] Filesystems with a block size of 1k may see a limit imposed by the
-directory hash tree having a maximum depth of two.
-
 2.2 Candidate features for future inclusion

 * Online defrag (patches available but not well tested)
reverted:
--- b/fs/ext4/balloc.c
+++ a/fs/ext4/balloc.c
@@ -20,7 +20,6 @@
 #include "ext4.h"
 #include "ext4_jbd2.h"
 #include "group.h"
-#include "mballoc.h"

 /*
 * balloc.c contains the blocks allocation and deallocation routines
@@ -319,41 +318,18 @@
 block_group, bitmap_blk);
 return NULL;
 }
+ if (bh_uptodate_or_lock(bh))
-
- if (bitmap_uptodate(bh))
 return bh;

- lock_buffer(bh);
- if (bitmap_uptodate(bh)) {
- unlock_buffer(bh);
- return bh;
- }
 spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group));
 if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
 ext4_init_block_bitmap(sb, bh, block_group, desc);
- set_bitmap_uptodate(bh);
 set_buffer_uptodate(bh);
 unlock_buffer(bh);
 spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
 return bh;
 }
 spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
- if (buffer_uptodate(bh)) {
- /*
- * if not uninit if bh is uptodate,
- * bitmap is also uptodate
- */
- set_bitmap_uptodate(bh);
- unlock_buffer(bh);
- return bh;
- }
- /*
- * submit the buffer_head for read. We can
- * safely mark the bitmap as uptodate now.
- * We do it here so the bitmap uptodate bit
- * get set with buffer lock held.
- */
- set_bitmap_uptodate(bh);
 if (bh_submit_read(bh) < 0) {
 put_bh(bh);
 ext4_error(sb, __func__,
@@ -861,136 +837,6 @@
 }

 /**
- * ext4_add_groupblocks() -- Add given blocks to an existing group
- * @handle: handle to this transaction
- * @sb: super block
- * @block: start physcial block to add to the block group
- * @count: number of blocks to free
- *
- * This marks the blocks as free in the bitmap. We ask the
- * mballoc to reload the buddy after this by setting group
- * EXT4_GROUP_INFO_NEED_INIT_BIT flag
- */
-void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
- ext4_fsblk_t block, unsigned long count)
-{
- struct buffer_head *bitmap_bh = NULL;
- struct buffer_head *gd_bh;
- ext4_group_t block_group;
- ext4_grpblk_t bit;
- unsigned long i;
- struct ext4_group_desc *desc;
- struct ext4_super_block *es;
- struct ext4_sb_info *sbi;
- int err = 0, ret;
- ext4_grpblk_t blocks_freed;
- struct ext4_group_info *grp;
-
- sbi = EXT4_SB(sb);
- es = sbi->s_es;
- ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1);
-
- ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
- grp = ext4_get_group_info(sb, block_group);
- /*
- * Check to see if we are freeing blocks across a group
- * boundary.
- */
- if (bit + count > EXT4_BLOCKS_PER_GROUP(sb))
- goto error_return;
-
- bitmap_bh = ext4_read_block_bitmap(sb, block_group);
- if (!bitmap_bh)
- goto error_return;
- desc = ext4_get_group_desc(sb, block_group, &gd_bh);
- if (!desc)
- goto error_return;
-
- if (in_range(ext4_block_bitmap(sb, desc), block, count) ||
- in_range(ext4_inode_bitmap(sb, desc), block, count) ||
- in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) ||
- in_range(block + count - 1, ext4_inode_table(sb, desc),
- sbi->s_itb_per_group)) {
- ext4_error(sb, __func__,
- "Adding blocks in system zones - "
- "Block = %llu, count = %lu",
- block, count);
- goto error_return;
- }
-
- /*
- * We are about to add blocks to the bitmap,
- * so we need undo access.
- */
- BUFFER_TRACE(bitmap_bh, "getting undo access");
- err = ext4_journal_get_undo_access(handle, bitmap_bh);
- if (err)
- goto error_return;
-
- /*
- * We are about to modify some metadata. Call the journal APIs
- * to unshare ->b_data if a currently-committing transaction is
- * using it
- */
- BUFFER_TRACE(gd_bh, "get_write_access");
- err = ext4_journal_get_write_access(handle, gd_bh);
- if (err)
- goto error_return;
- /*
- * make sure we don't allow a parallel init on other groups in the
- * same buddy cache
- */
- down_write(&grp->alloc_sem);
- for (i = 0, blocks_freed = 0; i < count; i++) {
- BUFFER_TRACE(bitmap_bh, "clear bit");
- if (!ext4_clear_bit_atomic(sb_bgl_lock(sbi, block_group),
- bit + i, bitmap_bh->b_data)) {
- ext4_error(sb, __func__,
- "bit already cleared for block %llu",
- (ext4_fsblk_t)(block + i));
- BUFFER_TRACE(bitmap_bh, "bit already cleared");
- } else {
- blocks_freed++;
- }
- }
- spin_lock(sb_bgl_lock(sbi, block_group));
- le16_add_cpu(&desc->bg_free_blocks_count, blocks_freed);
- desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc);
- spin_unlock(sb_bgl_lock(sbi, block_group));
- percpu_counter_add(&sbi->s_freeblocks_counter, blocks_freed);
-
- if (sbi->s_log_groups_per_flex) {
- ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
- spin_lock(sb_bgl_lock(sbi, flex_group));
- sbi->s_flex_groups[flex_group].free_blocks += blocks_freed;
- spin_unlock(sb_bgl_lock(sbi, flex_group));
- }
- /*
- * request to reload the buddy with the
- * new bitmap information
- */
- set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));
- ext4_mb_update_group_info(grp, blocks_freed);
- up_write(&grp->alloc_sem);
-
- /* We dirtied the bitmap block */
- BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
- err = ext4_journal_dirty_metadata(handle, bitmap_bh);
-
- /* And the group descriptor block */
- BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
- ret = ext4_journal_dirty_metadata(handle, gd_bh);
- if (!err)
- err = ret;
- sb->s_dirt = 1;
-
-error_return:
- brelse(bitmap_bh);
- ext4_std_error(sb, err);
- return;
-}
-
-/**
 * ext4_free_blocks() -- Free given blocks and update quota
 * @handle: handle for this transaction
 * @inode: inode
reverted:
--- b/fs/ext4/ext4.h
+++ a/fs/ext4/ext4.h
@@ -19,7 +19,6 @@
 #include <linux/types.h>
 #include <linux/blkdev.h>
 #include <linux/magic.h>
-#include <linux/jbd2.h>
 #include "ext4_i.h"

 /*
@@ -248,30 +247,6 @@
 #define EXT4_FL_USER_VISIBLE 0x000BDFFF /* User visible flags */
 #define EXT4_FL_USER_MODIFIABLE 0x000380FF /* User modifiable flags */

-/* Flags that should be inherited by new inodes from their parent. */
-#define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\
- EXT4_SYNC_FL | EXT4_IMMUTABLE_FL | EXT4_APPEND_FL |\
- EXT4_NODUMP_FL | EXT4_NOATIME_FL |\
- EXT4_NOCOMPR_FL | EXT4_JOURNAL_DATA_FL |\
- EXT4_NOTAIL_FL | EXT4_DIRSYNC_FL)
-
-/* Flags that are appropriate for regular files (all but dir-specific ones). */
-#define EXT4_REG_FLMASK (~(EXT4_DIRSYNC_FL | EXT4_TOPDIR_FL))
-
-/* Flags that are appropriate for non-directories/regular files. */
-#define EXT4_OTHER_FLMASK (EXT4_NODUMP_FL | EXT4_NOATIME_FL)
-
-/* Mask out flags that are inappropriate for the given type of inode. */
-static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags)
-{
- if (S_ISDIR(mode))
- return flags;
- else if (S_ISREG(mode))
- return flags & EXT4_REG_FLMASK;
- else
- return flags & EXT4_OTHER_FLMASK;
-}
-
 /*
 * Inode dynamic state flags
 */
@@ -279,7 +254,6 @@
 #define EXT4_STATE_NEW 0x00000002 /* inode is newly created */
 #define EXT4_STATE_XATTR 0x00000004 /* has in-inode xattrs */
 #define EXT4_STATE_NO_EXPAND 0x00000008 /* No space for expansion */
-#define EXT4_STATE_DA_ALLOC_CLOSE 0x00000010 /* Alloc DA blks on close */

 /* Used to pass group descriptor data when online resize is done */
 struct ext4_new_group_input {
@@ -327,9 +301,7 @@
 #define EXT4_IOC_GROUP_EXTEND _IOW('f', 7, unsigned long)
 #define EXT4_IOC_GROUP_ADD _IOW('f', 8, struct ext4_new_group_input)
 #define EXT4_IOC_MIGRATE _IO('f', 9)
- /* note ioctl 10 reserved for an early version of the FIEMAP ioctl */
 /* note ioctl 11 reserved for filesystem-independent FIEMAP ioctl */
-#define EXT4_IOC_ALLOC_DA_BLKS _IO('f', 12)

 /*
 * ioctl commands in 32 bit emulation
@@ -887,7 +859,7 @@
 {
 unsigned len = le16_to_cpu(dlen);

+ if (len == EXT4_MAX_REC_LEN)
- if (len == EXT4_MAX_REC_LEN || len == 0)
 return 1 << 16;
 return len;
 }
@@ -917,9 +889,6 @@
 #define DX_HASH_LEGACY 0
 #define DX_HASH_HALF_MD4 1
 #define DX_HASH_TEA 2
-#define DX_HASH_LEGACY_UNSIGNED 3
-#define DX_HASH_HALF_MD4_UNSIGNED 4
-#define DX_HASH_TEA_UNSIGNED 5

 #ifdef __KERNEL__

@@ -1019,11 +988,9 @@
 ext4_fsblk_t nblocks);
 extern void ext4_free_blocks (handle_t *handle, struct inode *inode,
 ext4_fsblk_t block, unsigned long count, int metadata);
+extern void ext4_free_blocks_sb (handle_t *handle, struct super_block *sb,
+ ext4_fsblk_t block, unsigned long count,
-extern void ext4_free_blocks_sb(handle_t *handle, struct super_block *sb,
- ext4_fsblk_t block, unsigned long count,
 unsigned long *pdquot_freed_blocks);
-extern void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
- ext4_fsblk_t block, unsigned long count);
 extern ext4_fsblk_t ext4_count_free_blocks (struct super_block *);
 extern void ext4_check_blocks_bitmap (struct super_block *);
 extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
@@ -1071,13 +1038,12 @@
 extern void exit_ext4_mballoc(void);
 extern void ext4_mb_free_blocks(handle_t *, struct inode *,
 unsigned long, unsigned long, int, unsigned long *);
+extern int ext4_mb_add_more_groupinfo(struct super_block *sb,
-extern int ext4_mb_add_groupinfo(struct super_block *sb,
 ext4_group_t i, struct ext4_group_desc *desc);
 extern void ext4_mb_update_group_info(struct ext4_group_info *grp,
 ext4_grpblk_t add);
+
+
-extern int ext4_mb_get_buddy_cache_lock(struct super_block *, ext4_group_t);
-extern void ext4_mb_put_buddy_cache_lock(struct super_block *,
- ext4_group_t, int);
 /* inode.c */
 int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode,
 struct buffer_head *bh, ext4_fsblk_t blocknr);
@@ -1105,14 +1071,13 @@
 extern void ext4_truncate (struct inode *);
 extern void ext4_set_inode_flags(struct inode *);
 extern void ext4_get_inode_flags(struct ext4_inode_info *);
-extern int ext4_alloc_da_blocks(struct inode *inode);
 extern void ext4_set_aops(struct inode *inode);
 extern int ext4_writepage_trans_blocks(struct inode *);
 extern int ext4_meta_trans_blocks(struct inode *, int nrblocks, int idxblocks);
 extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
 extern int ext4_block_truncate_page(handle_t *handle,
 struct address_space *mapping, loff_t from);
+extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page);
-extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);

 /* ioctl.c */
 extern long ext4_ioctl(struct file *, unsigned int, unsigned long);
@@ -1202,11 +1167,8 @@

 static inline loff_t ext4_isize(struct ext4_inode *raw_inode)
 {
+ return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) |
+ le32_to_cpu(raw_inode->i_size_lo);
- if (S_ISREG(le16_to_cpu(raw_inode->i_mode)))
- return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) |
- le32_to_cpu(raw_inode->i_size_lo);
- else
- return (loff_t) le32_to_cpu(raw_inode->i_size_lo);
 }

 static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size)
@@ -1282,23 +1244,6 @@
 sector_t block, unsigned long max_blocks,
 struct buffer_head *bh, int create,
 int extend_disksize, int flag);
-/*
- * Add new method to test wether block and inode bitmaps are properly
- * initialized. With uninit_bg reading the block from disk is not enough
- * to mark the bitmap uptodate. We need to also zero-out the bitmap
- */
-#define BH_BITMAP_UPTODATE BH_JBDPrivateStart
-
-static inline int bitmap_uptodate(struct buffer_head *bh)
-{
- return (buffer_uptodate(bh) &&
- test_bit(BH_BITMAP_UPTODATE, &(bh)->b_state));
-}
-static inline void set_bitmap_uptodate(struct buffer_head *bh)
-{
- set_bit(BH_BITMAP_UPTODATE, &(bh)->b_state);
-}
-
 #endif /* __KERNEL__ */

 #endif /* _EXT4_H */
reverted:
--- b/fs/ext4/ext4_sb.h
+++ a/fs/ext4/ext4_sb.h
@@ -56,7 +56,6 @@
 u32 s_next_generation;
 u32 s_hash_seed[4];
 int s_def_hash_version;
- int s_hash_unsigned; /* 3 if hash should be signed, 0 if not */
 struct percpu_counter s_freeblocks_counter;
 struct percpu_counter s_freeinodes_counter;
 struct percpu_counter s_dirs_counter;
@@ -103,8 +102,7 @@
 struct list_head s_committed_transaction;
 spinlock_t s_md_lock;
 tid_t s_last_transaction;
+ unsigned short *s_mb_offsets, *s_mb_maxs;
- unsigned short *s_mb_offsets;
- unsigned int *s_mb_maxs;

 /* tunables */
 unsigned long s_stripe;
reverted:
--- b/fs/ext4/extents.c
+++ a/fs/ext4/extents.c
@@ -1118,8 +1118,7 @@
 struct ext4_extent_idx *ix;
 struct ext4_extent *ex;
 ext4_fsblk_t block;
+ int depth, ee_len;
- int depth; /* Note, NOT eh_depth; depth from top of tree */
- int ee_len;

 BUG_ON(path == NULL);
 depth = path->p_depth;
@@ -1178,8 +1177,7 @@
 if (bh == NULL)
 return -EIO;
 eh = ext_block_hdr(bh);
+ if (ext4_ext_check_header(inode, eh, depth)) {
- /* subtract from p_depth to get proper eh_depth */
- if (ext4_ext_check_header(inode, eh, path->p_depth - depth)) {
 put_bh(bh);
 return -EIO;
 }
@@ -1633,13 +1631,11 @@
 {
 struct ext4_ext_cache *cex;
 BUG_ON(len == 0);
- spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
 cex = &EXT4_I(inode)->i_cached_extent;
 cex->ec_type = type;
 cex->ec_block = block;
 cex->ec_len = len;
 cex->ec_start = start;
- spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
 }

 /*
@@ -1696,17 +1692,12 @@
 struct ext4_extent *ex)
 {
 struct ext4_ext_cache *cex;
- int ret = EXT4_EXT_CACHE_NO;

- /*
- * We borrow i_block_reservation_lock to protect i_cached_extent
- */
- spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
 cex = &EXT4_I(inode)->i_cached_extent;

 /* has cache valid data? */
 if (cex->ec_type == EXT4_EXT_CACHE_NO)
+ return EXT4_EXT_CACHE_NO;
- goto errout;

 BUG_ON(cex->ec_type != EXT4_EXT_CACHE_GAP &&
 cex->ec_type != EXT4_EXT_CACHE_EXTENT);
@@ -1717,11 +1708,11 @@
 ext_debug("%u cached by %u:%u:%llu\n",
 block,
 cex->ec_block, cex->ec_len, cex->ec_start);
+ return cex->ec_type;
- ret = cex->ec_type;
 }
+
+ /* not in cache */
+ return EXT4_EXT_CACHE_NO;
-errout:
- spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
- return ret;
 }

 /*
@@ -2677,8 +2668,6 @@
 if (allocated > max_blocks)
 allocated = max_blocks;
 set_buffer_unwritten(bh_result);
- bh_result->b_bdev = inode->i_sb->s_bdev;
- bh_result->b_blocknr = newblock;
 goto out2;
 }

reverted:
--- b/fs/ext4/file.c
+++ a/fs/ext4/file.c
@@ -33,14 +33,9 @@
 */
 static int ext4_release_file (struct inode * inode, struct file * filp)
 {
- if (EXT4_I(inode)->i_state & EXT4_STATE_DA_ALLOC_CLOSE) {
- ext4_alloc_da_blocks(inode);
- EXT4_I(inode)->i_state &= ~EXT4_STATE_DA_ALLOC_CLOSE;
- }
 /* if we are the last writer on the inode, drop the block reservation */
 if ((filp->f_mode & FMODE_WRITE) &&
+ (atomic_read(&inode->i_writecount) == 1))
- (atomic_read(&inode->i_writecount) == 1) &&
- !EXT4_I(inode)->i_reserved_data_blocks
 {
 down_write(&EXT4_I(inode)->i_data_sem);
 ext4_discard_reservation(inode);
reverted:
--- b/fs/ext4/hash.c
+++ a/fs/ext4/hash.c
@@ -35,71 +35,23 @@


 /* The old legacy hash */
+static __u32 dx_hack_hash (const char *name, int len)
-static __u32 dx_hack_hash_unsigned(const char *name, int len)
 {
+ __u32 hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
- __u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
- const unsigned char *ucp = (const unsigned char *) name;
-
- while (len--) {
- hash = hash1 + (hash0 ^ (((int) *ucp++) * 7152373));
-
- if (hash & 0x80000000)
- hash -= 0x7fffffff;
- hash1 = hash0;
- hash0 = hash;
- }
- return hash0 << 1;
-}
-
-static __u32 dx_hack_hash_signed(const char *name, int len)
-{
- __u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
- const signed char *scp = (const signed char *) name;
-
 while (len--) {
+ __u32 hash = hash1 + (hash0 ^ (*name++ * 7152373));
- hash = hash1 + (hash0 ^ (((int) *scp++) * 7152373));

+ if (hash & 0x80000000) hash -= 0x7fffffff;
- if (hash & 0x80000000)
- hash -= 0x7fffffff;
 hash1 = hash0;
 hash0 = hash;
 }
+ return (hash0 << 1);
- return hash0 << 1;
 }

+static void str2hashbuf(const char *msg, int len, __u32 *buf, int num)
-static void str2hashbuf_signed(const char *msg, int len, __u32 *buf, int num)
 {
 __u32 pad, val;
 int i;
- const signed char *scp = (const signed char *) msg;
-
- pad = (__u32)len | ((__u32)len << 8);
- pad |= pad << 16;
-
- val = pad;
- if (len > num*4)
- len = num * 4;
- for (i = 0; i < len; i++) {
- if ((i % 4) == 0)
- val = pad;
- val = ((int) scp[i]) + (val << 8);
- if ((i % 4) == 3) {
- *buf++ = val;
- val = pad;
- num--;
- }
- }
- if (--num >= 0)
- *buf++ = val;
- while (--num >= 0)
- *buf++ = pad;
-}
-
-static void str2hashbuf_unsigned(const char *msg, int len, __u32 *buf, int num)
-{
- __u32 pad, val;
- int i;
- const unsigned char *ucp = (const unsigned char *) msg;

 pad = (__u32)len | ((__u32)len << 8);
 pad |= pad << 16;
@@ -110,7 +62,7 @@
 for (i=0; i < len; i++) {
 if ((i % 4) == 0)
 val = pad;
+ val = msg[i] + (val << 8);
- val = ((int) ucp[i]) + (val << 8);
 if ((i % 4) == 3) {
 *buf++ = val;
 val = pad;
@@ -143,8 +95,6 @@
 const char *p;
 int i;
 __u32 in[8], buf[4];
- void (*str2hashbuf)(const char *, int, __u32 *, int) =
- str2hashbuf_signed;

 /* Initialize the default seed for the hash checksum functions */
 buf[0] = 0x67452301;
@@ -163,18 +113,13 @@
 }

 switch (hinfo->hash_version) {
- case DX_HASH_LEGACY_UNSIGNED:
- hash = dx_hack_hash_unsigned(name, len);
- break;
 case DX_HASH_LEGACY:
+ hash = dx_hack_hash(name, len);
- hash = dx_hack_hash_signed(name, len);
 break;
- case DX_HASH_HALF_MD4_UNSIGNED:
- str2hashbuf = str2hashbuf_unsigned;
 case DX_HASH_HALF_MD4:
 p = name;
 while (len > 0) {
+ str2hashbuf(p, len, in, 8);
- (*str2hashbuf)(p, len, in, 8);
 half_md4_transform(buf, in);
 len -= 32;
 p += 32;
@@ -182,12 +127,10 @@
 minor_hash = buf[2];
 hash = buf[1];
 break;
- case DX_HASH_TEA_UNSIGNED:
- str2hashbuf = str2hashbuf_unsigned;
 case DX_HASH_TEA:
 p = name;
 while (len > 0) {
+ str2hashbuf(p, len, in, 4);
- (*str2hashbuf)(p, len, in, 4);
 TEA_transform(buf, in);
 len -= 16;
 p += 16;
reverted:
--- b/fs/ext4/ialloc.c
+++ a/fs/ext4/ialloc.c
@@ -84,7 +84,7 @@
 }

 memset(bh->b_data, 0, (EXT4_INODES_PER_GROUP(sb) + 7) / 8);
+ mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), EXT4_BLOCKS_PER_GROUP(sb),
- mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8,
 bh->b_data);

 return EXT4_INODES_PER_GROUP(sb);
@@ -115,40 +115,18 @@
 block_group, bitmap_blk);
 return NULL;
 }
+ if (bh_uptodate_or_lock(bh))
- if (bitmap_uptodate(bh))
 return bh;

- lock_buffer(bh);
- if (bitmap_uptodate(bh)) {
- unlock_buffer(bh);
- return bh;
- }
 spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group));
 if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
 ext4_init_inode_bitmap(sb, bh, block_group, desc);
- set_bitmap_uptodate(bh);
 set_buffer_uptodate(bh);
 unlock_buffer(bh);
 spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
 return bh;
 }
 spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
- if (buffer_uptodate(bh)) {
- /*
- * if not uninit if bh is uptodate,
- * bitmap is also uptodate
- */
- set_bitmap_uptodate(bh);
- unlock_buffer(bh);
- return bh;
- }
- /*
- * submit the buffer_head for read. We can
- * safely mark the bitmap as uptodate now.
- * We do it here so the bitmap uptodate bit
- * get set with buffer lock held.
- */
- set_bitmap_uptodate(bh);
 if (bh_submit_read(bh) < 0) {
 put_bh(bh);
 ext4_error(sb, __func__,
@@ -188,7 +166,7 @@
 struct ext4_group_desc * gdp;
 struct ext4_super_block * es;
 struct ext4_sb_info *sbi;
+ int fatal = 0, err;
- int fatal = 0, err, cleared;
 ext4_group_t flex_group;

 if (atomic_read(&inode->i_count) > 1) {
@@ -242,12 +220,10 @@
 goto error_return;

 /* Ok, now we can actually update the inode bitmaps.. */
+ if (!ext4_clear_bit_atomic(sb_bgl_lock(sbi, block_group),
+ bit, bitmap_bh->b_data))
+ ext4_error (sb, "ext4_free_inode",
+ "bit already cleared for inode %lu", ino);
- spin_lock(sb_bgl_lock(sbi, block_group));
- cleared = ext4_clear_bit(bit, bitmap_bh->b_data);
- spin_unlock(sb_bgl_lock(sbi, block_group));
- if (!cleared)
- ext4_error(sb, "ext4_free_inode",
- "bit already cleared for inode %lu", ino);
 else {
 gdp = ext4_get_group_desc (sb, block_group, &bh2);

@@ -591,77 +567,6 @@
 }

 /*
- * claim the inode from the inode bitmap. If the group
- * is uninit we need to take the groups's sb_bgl_lock
- * and clear the uninit flag. The inode bitmap update
- * and group desc uninit flag clear should be done
- * after holding sb_bgl_lock so that ext4_read_inode_bitmap
- * doesn't race with the ext4_claim_inode
- */
-static int ext4_claim_inode(struct super_block *sb,
- struct buffer_head *inode_bitmap_bh,
- unsigned long ino, ext4_group_t group, int mode)
-{
- int free = 0, retval = 0;
- struct ext4_sb_info *sbi = EXT4_SB(sb);
- struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL);
-
- spin_lock(sb_bgl_lock(sbi, group));
- if (ext4_set_bit(ino, inode_bitmap_bh->b_data)) {
- /* not a free inode */
- retval = 1;
- goto err_ret;
- }
- ino++;
- if ((group == 0 && ino < EXT4_FIRST_INO(sb)) ||
- ino > EXT4_INODES_PER_GROUP(sb)) {
- spin_unlock(sb_bgl_lock(sbi, group));
- ext4_error(sb, __func__,
- "reserved inode or inode > inodes count - "
- "block_group = %lu, inode=%lu", group,
- ino + group * EXT4_INODES_PER_GROUP(sb));
- return 1;
- }
- /* If we didn't allocate from within the initialized part of the inode
- * table then we need to initialize up to this inode. */
- if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) {
-
- if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
- gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT);
- /* When marking the block group with
- * ~EXT4_BG_INODE_UNINIT we don't want to depend
- * on the value of bg_itable_unused even though
- * mke2fs could have initialized the same for us.
- * Instead we calculated the value below
- */
-
- free = 0;
- } else {
- free = EXT4_INODES_PER_GROUP(sb) -
- le16_to_cpu(gdp->bg_itable_unused);
- }
-
- /*
- * Check the relative inode number against the last used
- * relative inode number in this group. if it is greater
- * we need to update the bg_itable_unused count
- *
- */
- if (ino > free)
- gdp->bg_itable_unused =
- cpu_to_le16(EXT4_INODES_PER_GROUP(sb) - ino);
- }
- le16_add_cpu(&gdp->bg_free_inodes_count, -1);
- if (S_ISDIR(mode)) {
- le16_add_cpu(&gdp->bg_used_dirs_count, 1);
- }
- gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
-err_ret:
- spin_unlock(sb_bgl_lock(sbi, group));
- return retval;
-}
-
-/*
 * There are two policies for allocating an inode. If the new inode is
 * a directory, then a forward search is made for a block group with both
 * free space and a low directory-to-inode ratio; if that fails, then of
@@ -687,7 +592,6 @@
 struct inode *ret;
 ext4_group_t i;
 int free = 0;
- static int once = 1;
 ext4_group_t flex_group;

 /* Cannot create files in a deleted directory */
@@ -705,15 +609,6 @@

 if (sbi->s_log_groups_per_flex) {
 ret2 = find_group_flex(sb, dir, &group);
- if (ret2 == -1) {
- ret2 = find_group_other(sb, dir, &group);
- if (ret2 == 0 && once) {
- once = 0;
- printk(KERN_NOTICE "ext4: find_group_flex "
- "failed, fallback succeeded dir %lu\n",
- dir->i_ino);
- }
- }
 goto got_group;
 }

@@ -754,12 +649,8 @@
 if (err)
 goto fail;

+ if (!ext4_set_bit_atomic(sb_bgl_lock(sbi, group),
+ ino, bitmap_bh->b_data)) {
- BUFFER_TRACE(bh2, "get_write_access");
- err = ext4_journal_get_write_access(handle, bh2);
- if (err)
- goto fail;
- if (!ext4_claim_inode(sb, bitmap_bh,
- ino, group, mode)) {
 /* we won it */
 BUFFER_TRACE(bitmap_bh,
 "call ext4_journal_dirty_metadata");
@@ -767,13 +658,10 @@
 bitmap_bh);
 if (err)
 goto fail;
- /* zero bit is inode number 1*/
- ino++;
 goto got;
 }
 /* we lost it */
 jbd2_journal_release_buffer(handle, bitmap_bh);
- jbd2_journal_release_buffer(handle, bh2);

 if (++ino < EXT4_INODES_PER_GROUP(sb))
 goto repeat_in_this_group;
@@ -793,6 +681,21 @@
 goto out;

 got:
+ ino++;
+ if ((group == 0 && ino < EXT4_FIRST_INO(sb)) ||
+ ino > EXT4_INODES_PER_GROUP(sb)) {
+ ext4_error(sb, __func__,
+ "reserved inode or inode > inodes count - "
+ "block_group = %lu, inode=%lu", group,
+ ino + group * EXT4_INODES_PER_GROUP(sb));
+ err = -EIO;
+ goto fail;
+ }
+
+ BUFFER_TRACE(bh2, "get_write_access");
+ err = ext4_journal_get_write_access(handle, bh2);
+ if (err) goto fail;
+
 /* We may have to initialize the block bitmap if it isn't already */
 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM) &&
 gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
@@ -827,10 +730,47 @@
 if (err)
 goto fail;
 }
+
+ spin_lock(sb_bgl_lock(sbi, group));
+ /* If we didn't allocate from within the initialized part of the inode
+ * table then we need to initialize up to this inode. */
+ if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) {
+ if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
+ gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT);
+
+ /* When marking the block group with
+ * ~EXT4_BG_INODE_UNINIT we don't want to depend
+ * on the value of bg_itable_unused even though
+ * mke2fs could have initialized the same for us.
+ * Instead we calculated the value below
+ */
+
+ free = 0;
+ } else {
+ free = EXT4_INODES_PER_GROUP(sb) -
+ le16_to_cpu(gdp->bg_itable_unused);
+ }
+
+ /*
+ * Check the relative inode number against the last used
+ * relative inode number in this group. if it is greater
+ * we need to update the bg_itable_unused count
+ *
+ */
+ if (ino > free)
+ gdp->bg_itable_unused =
+ cpu_to_le16(EXT4_INODES_PER_GROUP(sb) - ino);
+ }
+
+ le16_add_cpu(&gdp->bg_free_inodes_count, -1);
+ if (S_ISDIR(mode)) {
+ le16_add_cpu(&gdp->bg_used_dirs_count, 1);
+ }
+ gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
+ spin_unlock(sb_bgl_lock(sbi, group));
+ BUFFER_TRACE(bh2, "call ext4_journal_dirty_metadata");
- BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata");
 err = ext4_journal_dirty_metadata(handle, bh2);
+ if (err) goto fail;
- if (err)
- goto fail;

 percpu_counter_dec(&sbi->s_freeinodes_counter);
 if (S_ISDIR(mode))
@@ -866,12 +806,16 @@
 ei->i_disksize = 0;

 /*
+ * Don't inherit extent flag from directory. We set extent flag on
+ * newly created directory and file only if -o extent mount option is
+ * specified
- * Don't inherit extent flag from directory, amongst others. We set
- * extent flag on newly created directory and file only if -o extent
- * mount option is specified
 */
+ ei->i_flags = EXT4_I(dir)->i_flags & ~(EXT4_INDEX_FL|EXT4_EXTENTS_FL);
+ if (S_ISLNK(mode))
+ ei->i_flags &= ~(EXT4_IMMUTABLE_FL|EXT4_APPEND_FL);
+ /* dirsync only applies to directories */
+ if (!S_ISDIR(mode))
+ ei->i_flags &= ~EXT4_DIRSYNC_FL;
- ei->i_flags =
- ext4_mask_flags(mode, EXT4_I(dir)->i_flags & EXT4_FL_INHERITED);
 ei->i_file_acl = 0;
 ei->i_dtime = 0;
 ei->i_block_alloc_info = NULL;
reverted:
--- b/fs/ext4/inode.c
+++ a/fs/ext4/inode.c
@@ -46,10 +46,8 @@
 static inline int ext4_begin_ordered_truncate(struct inode *inode,
 loff_t new_size)
 {
+ return jbd2_journal_begin_ordered_truncate(&EXT4_I(inode)->jinode,
+ new_size);
- return jbd2_journal_begin_ordered_truncate(
- EXT4_SB(inode->i_sb)->s_journal,
- &EXT4_I(inode)->jinode,
- new_size);
 }

 static void ext4_invalidatepage(struct page *page, unsigned long offset);
@@ -353,9 +351,9 @@
 final = ptrs;
 } else {
 ext4_warning(inode->i_sb, "ext4_block_to_path",
+ "block %lu > max",
- "block %lu > max in inode %lu",
 i_block + direct_blocks +
+ indirect_blocks + double_blocks);
- indirect_blocks + double_blocks, inode->i_ino);
 }
 if (boundary)
 *boundary = final - 1 - (i_block & (ptrs - 1));
@@ -1046,14 +1044,6 @@
 EXT4_I(inode)->i_reserved_meta_blocks = mdb;
 EXT4_I(inode)->i_allocated_meta_blocks = 0;
 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
-
- /*
- * If we have done all the pending block allocations and if
- * there aren't any writers on the inode, we can discard the
- * inode's preallocations.
- */
- if (!total && (atomic_read(&inode->i_writecount) == 0))
- ext4_discard_reservation(inode);
 }

 /*
@@ -1085,7 +1075,6 @@
 int retval;

 clear_buffer_mapped(bh);
- clear_buffer_unwritten(bh);

 /*
 * Try to see if we can get the block without requesting
@@ -1116,18 +1105,6 @@
 return retval;

 /*
- * When we call get_blocks without the create flag, the
- * BH_Unwritten flag could have gotten set if the blocks
- * requested were part of a uninitialized extent. We need to
- * clear this flag now that we are committed to convert all or
- * part of the uninitialized extent to be an initialized
- * extent. This is because we need to avoid the combination
- * of BH_Unwritten and BH_Mapped flags being simultaneously
- * set on the buffer_head.
- */
- clear_buffer_unwritten(bh);
-
- /*
 * New blocks allocate and/or writing to uninitialized extent
 * will possibly result in updating i_data, so we take
 * the write lock of i_data_sem, and call get_blocks()
@@ -1393,10 +1370,6 @@
 goto out;
 }

- /* We cannot recurse into the filesystem as the transaction is already
- * started */
- flags |= AOP_FLAG_NOFS;
-
 page = grab_cache_page_write_begin(mapping, index, flags);
 if (!page) {
 ext4_journal_stop(handle);
@@ -1406,7 +1379,7 @@
 *pagep = page;

 ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
+ ext4_get_block);
- ext4_get_block);

 if (!ret && ext4_should_journal_data(inode)) {
 ret = walk_page_buffers(handle, page_buffers(page),
@@ -1675,25 +1648,18 @@
 */
 static int mpage_da_submit_io(struct mpage_da_data *mpd)
 {
+ struct address_space *mapping = mpd->inode->i_mapping;
+ int ret = 0, err, nr_pages, i;
+ unsigned long index, end;
- long pages_skipped;
 struct pagevec pvec;
- unsigned long index, end;
- int ret = 0, err, nr_pages, i;
- struct inode *inode = mpd->inode;
- struct address_space *mapping = inode->i_mapping;

 BUG_ON(mpd->next_page <= mpd->first_page);
+ pagevec_init(&pvec, 0);
- /*
- * We need to start from the first_page to the next_page - 1
- * to make sure we also write the mapped dirty buffer_heads.
- * If we look at mpd->lbh.b_blocknr we would only be looking
- * at the currently mapped buffer_heads.
- */
 index = mpd->first_page;
 end = mpd->next_page - 1;

- pagevec_init(&pvec, 0);
 while (index <= end) {
+ /* XXX: optimize tail */
 nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
 if (nr_pages == 0)
 break;
@@ -1705,10 +1671,6 @@
 break;
 index++;

- BUG_ON(!PageLocked(page));
- BUG_ON(PageWriteback(page));
-
- pages_skipped = mpd->wbc->pages_skipped;
 err = mapping->a_ops->writepage(page, mpd->wbc);
 if (!err)
 mpd->pages_written++;
@@ -2029,29 +1991,11 @@
 bh = head;
 do {
 BUG_ON(buffer_locked(bh));
- /*
- * We need to try to allocate
- * unmapped blocks in the same page.
- * Otherwise we won't make progress
- * with the page in ext4_da_writepage
- */
 if (buffer_dirty(bh) &&
 (!buffer_mapped(bh) || buffer_delay(bh))) {
 mpage_add_bh_to_extent(mpd, logical, bh);
 if (mpd->io_done)
 return MPAGE_DA_EXTENT_TAIL;
- } else if (buffer_dirty(bh) && (buffer_mapped(bh))) {
- /*
- * mapped dirty buffer. We need to update
- * the b_state because we look at
- * b_state in mpage_da_map_blocks. We don't
- * update b_size because if we find an
- * unmapped buffer_head later we need to
- * use the b_state flag of that buffer_head.
- */
- if (mpd->lbh.b_size == 0)
- mpd->lbh.b_state =
- bh->b_state & BH_FLAGS;
 }
 logical++;
 } while ((bh = bh->b_this_page) != head);
@@ -2118,10 +2062,6 @@
 struct buffer_head *bh_result, int create)
 {
 int ret = 0;
- sector_t invalid_block = ~((sector_t) 0xffff);
-
- if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es))
- invalid_block = ~0;

 BUG_ON(create == 0);
 BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize);
@@ -2143,18 +2083,11 @@
 /* not enough space to reserve */
 return ret;

+ map_bh(bh_result, inode->i_sb, 0);
- map_bh(bh_result, inode->i_sb, invalid_block);
 set_buffer_new(bh_result);
 set_buffer_delay(bh_result);
 } else if (ret > 0) {
 bh_result->b_size = (ret << inode->i_blkbits);
- /*
- * With sub-block writes into unwritten extents
- * we also need to mark the buffer as new so that
- * the unwritten parts of the buffer gets correctly zeroed.
- */
- if (buffer_unwritten(bh_result))
- set_buffer_new(bh_result);
 ret = 0;
 }

@@ -2365,20 +2298,6 @@
 */
 if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
 return 0;
-
- /*
- * If the filesystem has aborted, it is read-only, so return
- * right away instead of dumping stack traces later on that
- * will obscure the real source of the problem. We test
- * EXT4_MOUNT_ABORT instead of sb->s_flag's MS_RDONLY because
- * the latter could be true if the filesystem is mounted
- * read-only, and in that case, ext4_da_writepages should
- * *never* be called, so if that ever happens, we would want
- * the stack trace.
- */
- if (unlikely(sbi->s_mount_opt & EXT4_MOUNT_ABORT))
- return -EROFS;
-
 /*
 * Make sure nr_to_write is >= sbi->s_mb_stream_request
 * This make sure small files blocks are allocated in
@@ -2417,7 +2336,7 @@
 handle = ext4_journal_start(inode, needed_blocks);
 if (IS_ERR(handle)) {
 ret = PTR_ERR(handle);
+ printk(KERN_EMERG "%s: jbd2_start: "
- printk(KERN_CRIT "%s: jbd2_start: "
 "%ld pages, ino %lu; err %d\n", __func__,
 wbc->nr_to_write, inode->i_ino, ret);
 dump_stack();
@@ -2501,9 +2420,6 @@
 ret = PTR_ERR(handle);
 goto out;
 }
- /* We cannot recurse into the filesystem as the transaction is already
- * started */
- flags |= AOP_FLAG_NOFS;

 page = grab_cache_page_write_begin(mapping, index, flags);
 if (!page) {
@@ -2617,48 +2533,6 @@
 return;
 }

-/*
- * Force all delayed allocation blocks to be allocated for a given inode.
- */
-int ext4_alloc_da_blocks(struct inode *inode)
-{
- if (!EXT4_I(inode)->i_reserved_data_blocks &&
- !EXT4_I(inode)->i_reserved_meta_blocks)
- return 0;
-
- /*
- * We do something simple for now. The filemap_flush() will
- * also start triggering a write of the data blocks, which is
- * not strictly speaking necessary (and for users of
- * laptop_mode, not even desirable). However, to do otherwise
- * would require replicating code paths in:
- *
- * ext4_da_writepages() ->
- * write_cache_pages() ---> (via passed in callback function)
- * __mpage_da_writepage() -->
- * mpage_add_bh_to_extent()
- * mpage_da_map_blocks()
- *
- * The problem is that write_cache_pages(), located in
- * mm/page-writeback.c, marks pages clean in preparation for
- * doing I/O, which is not desirable if we're not planning on
- * doing I/O at all.
- *
- * We could call write_cache_pages(), and then redirty all of
- * the pages by calling redirty_page_for_writeback() but that
- * would be ugly in the extreme. So instead we would need to
- * replicate parts of the code in the above functions,
- * simplifying them becuase we wouldn't actually intend to
- * write out the pages, but rather only collect contiguous
- * logical block extents, call the multi-block allocator, and
- * then update the buffer heads with the block allocations.
- *
- * For now, though, we'll cheat by calling filemap_flush(),
- * which will map the blocks, and start the I/O, but not
- * actually wait for the I/O to complete.
- */
- return filemap_flush(inode->i_mapping);
-}

 /*
 * bmap() is special. It gets used by applications such as lilo and by
@@ -3668,9 +3542,6 @@
 if (!ext4_can_truncate(inode))
 return;

- if (inode->i_size == 0)
- ei->i_state |= EXT4_STATE_DA_ALLOC_CLOSE;
-
 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
 ext4_ext_truncate(inode);
 return;
@@ -4088,9 +3959,11 @@
 ei->i_flags = le32_to_cpu(raw_inode->i_flags);
 inode->i_blocks = ext4_inode_blocks(raw_inode, ei);
 ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo);
+ if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
+ cpu_to_le32(EXT4_OS_HURD)) {
- if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT))
 ei->i_file_acl |=
 ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
+ }
 inode->i_size = ext4_isize(raw_inode);
 ei->i_disksize = inode->i_size;
 inode->i_generation = le32_to_cpu(raw_inode->i_generation);
@@ -4137,18 +4010,6 @@
 (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32;
 }

- if (ei->i_file_acl &&
- ((ei->i_file_acl <
- (le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block) +
- EXT4_SB(sb)->s_gdb_count)) ||
- (ei->i_file_acl >= ext4_blocks_count(EXT4_SB(sb)->s_es)))) {
- ext4_error(sb, __func__,
- "bad extended attribute block %llu in inode #%lu",
- ei->i_file_acl, inode->i_ino);
- ret = -EIO;
- goto bad_inode;
- }
-
 if (S_ISREG(inode->i_mode)) {
 inode->i_op = &ext4_file_inode_operations;
 inode->i_fop = &ext4_file_operations;
@@ -4163,8 +4024,7 @@
 inode->i_op = &ext4_symlink_inode_operations;
 ext4_set_aops(inode);
 }
+ } else {
- } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) ||
- S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
 inode->i_op = &ext4_special_inode_operations;
 if (raw_inode->i_block[0])
 init_special_inode(inode, inode->i_mode,
@@ -4172,13 +4032,6 @@
 else
 init_special_inode(inode, inode->i_mode,
 new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
- } else {
- brelse(bh);
- ret = -EIO;
- ext4_error(inode->i_sb, __func__,
- "bogus i_mode (%o) for inode=%lu",
- inode->i_mode, inode->i_ino);
- goto bad_inode;
 }
 brelse (iloc.bh);
 ext4_set_inode_flags(inode);
@@ -4956,9 +4809,8 @@
 return !buffer_mapped(bh);
 }

+int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page)
-int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
- struct page *page = vmf->page;
 loff_t size;
 unsigned long len;
 int ret = -EINVAL;
@@ -5009,8 +4861,6 @@
 goto out_unlock;
 ret = 0;
 out_unlock:
- if (ret)
- ret = VM_FAULT_SIGBUS;
 up_read(&inode->i_alloc_sem);
 return ret;
 }
reverted:
--- b/fs/ext4/ioctl.c
+++ a/fs/ext4/ioctl.c
@@ -49,7 +49,8 @@
 if (err)
 return err;

+ if (!S_ISDIR(inode->i_mode))
+ flags &= ~EXT4_DIRSYNC_FL;
- flags = ext4_mask_flags(inode->i_mode, flags);

 err = -EPERM;
 mutex_lock(&inode->i_mutex);
@@ -287,20 +288,6 @@
 return err;
 }

- case EXT4_IOC_ALLOC_DA_BLKS:
- {
- int err;
- if (!is_owner_or_cap(inode))
- return -EACCES;
-
- err = mnt_want_write(filp->f_path.mnt);
- if (err)
- return err;
- err = ext4_alloc_da_blocks(inode);
- mnt_drop_write(filp->f_path.mnt);
- return err;
- }
-
 default:
 return -ENOTTY;
 }
1349reverted:
1350--- b/fs/ext4/mballoc.c
1351+++ a/fs/ext4/mballoc.c
1352@@ -100,7 +100,7 @@
1353 * inode as:
1354 *
1355 * { page }
1356+ * [ group 0 buddy][ group 0 bitmap] [group 1][ group 1]...
1357- * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]...
1358 *
1359 *
1360 * one block each for bitmap and buddy information. So for each group we
1361@@ -330,18 +330,6 @@
1362 * object
1363 *
1364 */
1365-static struct kmem_cache *ext4_pspace_cachep;
1366-static struct kmem_cache *ext4_ac_cachep;
1367-static struct kmem_cache *ext4_free_ext_cachep;
1368-static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
1369- ext4_group_t group);
1370-static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
1371- ext4_group_t group);
1372-static int ext4_mb_init_per_dev_proc(struct super_block *sb);
1373-static int ext4_mb_destroy_per_dev_proc(struct super_block *sb);
1374-static void ext4_mb_free_committed_blocks(struct super_block *);
1375-static void ext4_mb_poll_new_transaction(struct super_block *sb,
1376- handle_t *handle);
1377
1378 static inline void *mb_correct_addr_and_bit(int *bit, void *addr)
1379 {
1380@@ -730,7 +718,7 @@
1381 * stored in the inode as
1382 *
1383 * { page }
1384+ * [ group 0 buddy][ group 0 bitmap] [group 1][ group 1]...
1385- * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]...
1386 *
1387 *
1388 * one block each for bitmap and buddy information.
1389@@ -796,42 +784,20 @@
1390 if (bh[i] == NULL)
1391 goto out;
1392
1393+ if (bh_uptodate_or_lock(bh[i]))
1394- if (bitmap_uptodate(bh[i]))
1395 continue;
1396
1397- lock_buffer(bh[i]);
1398- if (bitmap_uptodate(bh[i])) {
1399- unlock_buffer(bh[i]);
1400- continue;
1401- }
1402 spin_lock(sb_bgl_lock(EXT4_SB(sb), first_group + i));
1403 if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
1404 ext4_init_block_bitmap(sb, bh[i],
1405 first_group + i, desc);
1406- set_bitmap_uptodate(bh[i]);
1407 set_buffer_uptodate(bh[i]);
1408 unlock_buffer(bh[i]);
1409 spin_unlock(sb_bgl_lock(EXT4_SB(sb), first_group + i));
1410 continue;
1411 }
1412 spin_unlock(sb_bgl_lock(EXT4_SB(sb), first_group + i));
1413- if (buffer_uptodate(bh[i])) {
1414- /*
1415- * if not uninit if bh is uptodate,
1416- * bitmap is also uptodate
1417- */
1418- set_bitmap_uptodate(bh[i]);
1419- unlock_buffer(bh[i]);
1420- continue;
1421- }
1422 get_bh(bh[i]);
1423- /*
1424- * submit the buffer_head for read. We can
1425- * safely mark the bitmap as uptodate now.
1426- * We do it here so the bitmap uptodate bit
1427- * get set with buffer lock held.
1428- */
1429- set_bitmap_uptodate(bh[i]);
1430 bh[i]->b_end_io = end_buffer_read_sync;
1431 submit_bh(READ, bh[i]);
1432 mb_debug("read bitmap for group %lu\n", first_group + i);
1433@@ -848,8 +814,6 @@
1434
1435 err = 0;
1436 first_block = page->index * blocks_per_page;
1437- /* init the page */
1438- memset(page_address(page), 0xff, PAGE_CACHE_SIZE);
1439 for (i = 0; i < blocks_per_page; i++) {
1440 int group;
1441 struct ext4_group_info *grinfo;
1442@@ -876,6 +840,7 @@
1443 BUG_ON(incore == NULL);
1444 mb_debug("put buddy for group %u in page %lu/%x\n",
1445 group, page->index, i * blocksize);
1446+ memset(data, 0xff, blocksize);
1447 grinfo = ext4_get_group_info(sb, group);
1448 grinfo->bb_fragments = 0;
1449 memset(grinfo->bb_counters, 0,
1450@@ -883,9 +848,7 @@
1451 /*
1452 * incore got set to the group block bitmap below
1453 */
1454- ext4_lock_group(sb, group);
1455 ext4_mb_generate_buddy(sb, data, incore, group);
1456- ext4_unlock_group(sb, group);
1457 incore = NULL;
1458 } else {
1459 /* this is block of bitmap */
1460@@ -899,7 +862,6 @@
1461
1462 /* mark all preallocated blks used in in-core bitmap */
1463 ext4_mb_generate_from_pa(sb, data, group);
1464- ext4_mb_generate_from_freelist(sb, data, group);
1465 ext4_unlock_group(sb, group);
1466
1467 /* set incore so that the buddy information can be
1468@@ -924,20 +886,18 @@
1469 ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
1470 struct ext4_buddy *e4b)
1471 {
1472+ struct ext4_sb_info *sbi = EXT4_SB(sb);
1473+ struct inode *inode = sbi->s_buddy_cache;
1474 int blocks_per_page;
1475 int block;
1476 int pnum;
1477 int poff;
1478 struct page *page;
1479 int ret;
1480- struct ext4_group_info *grp;
1481- struct ext4_sb_info *sbi = EXT4_SB(sb);
1482- struct inode *inode = sbi->s_buddy_cache;
1483
1484 mb_debug("load group %lu\n", group);
1485
1486 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
1487- grp = ext4_get_group_info(sb, group);
1488
1489 e4b->bd_blkbits = sb->s_blocksize_bits;
1490 e4b->bd_info = ext4_get_group_info(sb, group);
1491@@ -945,15 +905,6 @@
1492 e4b->bd_group = group;
1493 e4b->bd_buddy_page = NULL;
1494 e4b->bd_bitmap_page = NULL;
1495- e4b->alloc_semp = &grp->alloc_sem;
1496-
1497- /* Take the read lock on the group alloc
1498- * sem. This would make sure a parallel
1499- * ext4_mb_init_group happening on other
1500- * groups mapped by the page is blocked
1501- * till we are done with allocation
1502- */
1503- down_read(e4b->alloc_semp);
1504
1505 /*
1506 * the buddy cache inode stores the block bitmap
1507@@ -969,14 +920,6 @@
1508 page = find_get_page(inode->i_mapping, pnum);
1509 if (page == NULL || !PageUptodate(page)) {
1510 if (page)
1511- /*
1512- * drop the page reference and try
1513- * to get the page with lock. If we
1514- * are not uptodate that implies
1515- * somebody just created the page but
1516- * is yet to initialize the same. So
1517- * wait for it to initialize.
1518- */
1519 page_cache_release(page);
1520 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
1521 if (page) {
1522@@ -1042,9 +985,6 @@
1523 page_cache_release(e4b->bd_buddy_page);
1524 e4b->bd_buddy = NULL;
1525 e4b->bd_bitmap = NULL;
1526-
1527- /* Done with the buddy cache */
1528- up_read(e4b->alloc_semp);
1529 return ret;
1530 }
1531
1532@@ -1054,9 +994,6 @@
1533 page_cache_release(e4b->bd_bitmap_page);
1534 if (e4b->bd_buddy_page)
1535 page_cache_release(e4b->bd_buddy_page);
1536- /* Done with the buddy cache */
1537- if (e4b->alloc_semp)
1538- up_read(e4b->alloc_semp);
1539 }
1540
1541
1542@@ -1094,10 +1031,7 @@
1543 cur += 32;
1544 continue;
1545 }
1546+ mb_clear_bit_atomic(lock, cur, bm);
1547- if (lock)
1548- mb_clear_bit_atomic(lock, cur, bm);
1549- else
1550- mb_clear_bit(cur, bm);
1551 cur++;
1552 }
1553 }
1554@@ -1115,10 +1049,7 @@
1555 cur += 32;
1556 continue;
1557 }
1558+ mb_set_bit_atomic(lock, cur, bm);
1559- if (lock)
1560- mb_set_bit_atomic(lock, cur, bm);
1561- else
1562- mb_set_bit(cur, bm);
1563 cur++;
1564 }
1565 }
1566@@ -1365,20 +1296,13 @@
1567 ac->ac_tail = ret & 0xffff;
1568 ac->ac_buddy = ret >> 16;
1569
1570+ /* XXXXXXX: SUCH A HORRIBLE **CK */
1571+ /*FIXME!! Why ? */
1572- /*
1573- * take the page reference. We want the page to be pinned
1574- * so that we don't get a ext4_mb_init_cache_call for this
1575- * group until we update the bitmap. That would mean we
1576- * double allocate blocks. The reference is dropped
1577- * in ext4_mb_release_context
1578- */
1579 ac->ac_bitmap_page = e4b->bd_bitmap_page;
1580 get_page(ac->ac_bitmap_page);
1581 ac->ac_buddy_page = e4b->bd_buddy_page;
1582 get_page(ac->ac_buddy_page);
1583+
1584- /* on allocation we use ac to track the held semaphore */
1585- ac->alloc_semp = e4b->alloc_semp;
1586- e4b->alloc_semp = NULL;
1587 /* store last allocated for subsequent stream allocation */
1588 if ((ac->ac_flags & EXT4_MB_HINT_DATA)) {
1589 spin_lock(&sbi->s_md_lock);
1590@@ -1402,8 +1326,6 @@
1591 struct ext4_free_extent ex;
1592 int max;
1593
1594- if (ac->ac_status == AC_STATUS_FOUND)
1595- return;
1596 /*
1597 * We don't want to scan for a whole year
1598 */
1599@@ -1450,7 +1372,7 @@
1600 struct ext4_free_extent *gex = &ac->ac_g_ex;
1601
1602 BUG_ON(ex->fe_len <= 0);
1603+ BUG_ON(ex->fe_len >= EXT4_BLOCKS_PER_GROUP(ac->ac_sb));
1604- BUG_ON(ex->fe_len > EXT4_BLOCKS_PER_GROUP(ac->ac_sb));
1605 BUG_ON(ex->fe_start >= EXT4_BLOCKS_PER_GROUP(ac->ac_sb));
1606 BUG_ON(ac->ac_status != AC_STATUS_CONTINUE);
1607
1608@@ -1770,173 +1692,6 @@
1609 return 0;
1610 }
1611
1612-/*
1613- * lock the group_info alloc_sem of all the groups
1614- * belonging to the same buddy cache page. This
1615- * make sure other parallel operation on the buddy
1616- * cache doesn't happen whild holding the buddy cache
1617- * lock
1618- */
1619-int ext4_mb_get_buddy_cache_lock(struct super_block *sb, ext4_group_t group)
1620-{
1621- int i;
1622- int block, pnum;
1623- int blocks_per_page;
1624- int groups_per_page;
1625- ext4_group_t first_group;
1626- struct ext4_group_info *grp;
1627-
1628- blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
1629- /*
1630- * the buddy cache inode stores the block bitmap
1631- * and buddy information in consecutive blocks.
1632- * So for each group we need two blocks.
1633- */
1634- block = group * 2;
1635- pnum = block / blocks_per_page;
1636- first_group = pnum * blocks_per_page / 2;
1637-
1638- groups_per_page = blocks_per_page >> 1;
1639- if (groups_per_page == 0)
1640- groups_per_page = 1;
1641- /* read all groups the page covers into the cache */
1642- for (i = 0; i < groups_per_page; i++) {
1643-
1644- if ((first_group + i) >= EXT4_SB(sb)->s_groups_count)
1645- break;
1646- grp = ext4_get_group_info(sb, first_group + i);
1647- /* take all groups write allocation
1648- * semaphore. This make sure there is
1649- * no block allocation going on in any
1650- * of that groups
1651- */
1652- down_write(&grp->alloc_sem);
1653- }
1654- return i;
1655-}
1656-
1657-void ext4_mb_put_buddy_cache_lock(struct super_block *sb,
1658- ext4_group_t group, int locked_group)
1659-{
1660- int i;
1661- int block, pnum;
1662- int blocks_per_page;
1663- ext4_group_t first_group;
1664- struct ext4_group_info *grp;
1665-
1666- blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
1667- /*
1668- * the buddy cache inode stores the block bitmap
1669- * and buddy information in consecutive blocks.
1670- * So for each group we need two blocks.
1671- */
1672- block = group * 2;
1673- pnum = block / blocks_per_page;
1674- first_group = pnum * blocks_per_page / 2;
1675- /* release locks on all the groups */
1676- for (i = 0; i < locked_group; i++) {
1677-
1678- grp = ext4_get_group_info(sb, first_group + i);
1679- /* take all groups write allocation
1680- * semaphore. This make sure there is
1681- * no block allocation going on in any
1682- * of that groups
1683- */
1684- up_write(&grp->alloc_sem);
1685- }
1686-
1687-}
1688-
1689-static int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
1690-{
1691-
1692- int ret;
1693- void *bitmap;
1694- int blocks_per_page;
1695- int block, pnum, poff;
1696- int num_grp_locked = 0;
1697- struct ext4_group_info *this_grp;
1698- struct ext4_sb_info *sbi = EXT4_SB(sb);
1699- struct inode *inode = sbi->s_buddy_cache;
1700- struct page *page = NULL, *bitmap_page = NULL;
1701-
1702- mb_debug("init group %lu\n", group);
1703- blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
1704- this_grp = ext4_get_group_info(sb, group);
1705- /*
1706- * This ensures we don't add group
1707- * to this buddy cache via resize
1708- */
1709- num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, group);
1710- if (!EXT4_MB_GRP_NEED_INIT(this_grp)) {
1711- /*
1712- * somebody initialized the group
1713- * return without doing anything
1714- */
1715- ret = 0;
1716- goto err;
1717- }
1718- /*
1719- * the buddy cache inode stores the block bitmap
1720- * and buddy information in consecutive blocks.
1721- * So for each group we need two blocks.
1722- */
1723- block = group * 2;
1724- pnum = block / blocks_per_page;
1725- poff = block % blocks_per_page;
1726- page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
1727- if (page) {
1728- BUG_ON(page->mapping != inode->i_mapping);
1729- ret = ext4_mb_init_cache(page, NULL);
1730- if (ret) {
1731- unlock_page(page);
1732- goto err;
1733- }
1734- unlock_page(page);
1735- }
1736- if (page == NULL || !PageUptodate(page)) {
1737- ret = -EIO;
1738- goto err;
1739- }
1740- mark_page_accessed(page);
1741- bitmap_page = page;
1742- bitmap = page_address(page) + (poff * sb->s_blocksize);
1743-
1744- /* init buddy cache */
1745- block++;
1746- pnum = block / blocks_per_page;
1747- poff = block % blocks_per_page;
1748- page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
1749- if (page == bitmap_page) {
1750- /*
1751- * If both the bitmap and buddy are in
1752- * the same page we don't need to force
1753- * init the buddy
1754- */
1755- unlock_page(page);
1756- } else if (page) {
1757- BUG_ON(page->mapping != inode->i_mapping);
1758- ret = ext4_mb_init_cache(page, bitmap);
1759- if (ret) {
1760- unlock_page(page);
1761- goto err;
1762- }
1763- unlock_page(page);
1764- }
1765- if (page == NULL || !PageUptodate(page)) {
1766- ret = -EIO;
1767- goto err;
1768- }
1769- mark_page_accessed(page);
1770-err:
1771- ext4_mb_put_buddy_cache_lock(sb, group, num_grp_locked);
1772- if (bitmap_page)
1773- page_cache_release(bitmap_page);
1774- if (page)
1775- page_cache_release(page);
1776- return ret;
1777-}
1778-
1779 static noinline_for_stack int
1780 ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
1781 {
1782@@ -2020,7 +1775,7 @@
1783 group = 0;
1784
1785 /* quick check to skip empty groups */
1786+ grp = ext4_get_group_info(ac->ac_sb, group);
1787- grp = ext4_get_group_info(sb, group);
1788 if (grp->bb_free == 0)
1789 continue;
1790
1791@@ -2033,9 +1788,10 @@
1792 * we need full data about the group
1793 * to make a good selection
1794 */
1795+ err = ext4_mb_load_buddy(sb, group, &e4b);
1796- err = ext4_mb_init_group(sb, group);
1797 if (err)
1798 goto out;
1799+ ext4_mb_release_desc(&e4b);
1800 }
1801
1802 /*
1803@@ -2543,8 +2299,6 @@
1804 }
1805
1806 INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
1807- init_rwsem(&meta_group_info[i]->alloc_sem);
1808-		meta_group_info[i]->bb_free_root.rb_node = NULL;
1809
1810 #ifdef DOUBLE_CHECK
1811 {
1812@@ -2571,6 +2325,54 @@
1813 } /* ext4_mb_add_groupinfo */
1814
1815 /*
1816+ * Add a group to the existing groups.
1817+ * This function is used for online resize
1818+ */
1819+int ext4_mb_add_more_groupinfo(struct super_block *sb, ext4_group_t group,
1820+ struct ext4_group_desc *desc)
1821+{
1822+ struct ext4_sb_info *sbi = EXT4_SB(sb);
1823+ struct inode *inode = sbi->s_buddy_cache;
1824+ int blocks_per_page;
1825+ int block;
1826+ int pnum;
1827+ struct page *page;
1828+ int err;
1829+
1830+	/* Add group based on group descriptor */
1831+ err = ext4_mb_add_groupinfo(sb, group, desc);
1832+ if (err)
1833+ return err;
1834+
1835+ /*
1836+	 * Cache pages containing dynamic mb_alloc data (buddy and bitmap
1837+	 * data) are marked not up to date so that they will be re-initialized
1838+	 * during the next call to ext4_mb_load_buddy
1839+ */
1840+
1841+ /* Set buddy page as not up to date */
1842+ blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
1843+ block = group * 2;
1844+ pnum = block / blocks_per_page;
1845+ page = find_get_page(inode->i_mapping, pnum);
1846+ if (page != NULL) {
1847+ ClearPageUptodate(page);
1848+ page_cache_release(page);
1849+ }
1850+
1851+ /* Set bitmap page as not up to date */
1852+ block++;
1853+ pnum = block / blocks_per_page;
1854+ page = find_get_page(inode->i_mapping, pnum);
1855+ if (page != NULL) {
1856+ ClearPageUptodate(page);
1857+ page_cache_release(page);
1858+ }
1859+
1860+ return 0;
1861+}
1862+
1863+/*
1864 * Update an existing group.
1865 * This function is used for online resize
1866 */
1867@@ -2693,12 +2495,10 @@
1868 clear_opt(sbi->s_mount_opt, MBALLOC);
1869 return -ENOMEM;
1870 }
1871-
1872- i = (sb->s_blocksize_bits + 2) * sizeof(unsigned int);
1873 sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL);
1874 if (sbi->s_mb_maxs == NULL) {
1875 clear_opt(sbi->s_mount_opt, MBALLOC);
1876+ kfree(sbi->s_mb_maxs);
1877- kfree(sbi->s_mb_offsets);
1878 return -ENOMEM;
1879 }
1880
1881@@ -2858,11 +2658,13 @@
1882 static noinline_for_stack void
1883 ext4_mb_free_committed_blocks(struct super_block *sb)
1884 {
1885+ struct ext4_sb_info *sbi = EXT4_SB(sb);
1886+ int err;
1887+ int i;
1888+ int count = 0;
1889+ int count2 = 0;
1890+ struct ext4_free_metadata *md;
1891 struct ext4_buddy e4b;
1892- struct ext4_group_info *db;
1893- struct ext4_sb_info *sbi = EXT4_SB(sb);
1894- int err, count = 0, count2 = 0;
1895- struct ext4_free_data *entry;
1896
1897 if (list_empty(&sbi->s_committed_transaction))
1898 return;
1899@@ -2870,46 +2672,44 @@
1900 	/* there are still committed blocks to be freed */
1901 do {
1902 /* get next array of blocks */
1903+ md = NULL;
1904- entry = NULL;
1905 spin_lock(&sbi->s_md_lock);
1906 if (!list_empty(&sbi->s_committed_transaction)) {
1907+ md = list_entry(sbi->s_committed_transaction.next,
1908+ struct ext4_free_metadata, list);
1909+ list_del(&md->list);
1910- entry = list_entry(sbi->s_committed_transaction.next,
1911- struct ext4_free_data, list);
1912- list_del(&entry->list);
1913 }
1914 spin_unlock(&sbi->s_md_lock);
1915
1916+ if (md == NULL)
1917- if (entry == NULL)
1918 break;
1919
1920 mb_debug("gonna free %u blocks in group %lu (0x%p):",
1921+ md->num, md->group, md);
1922- entry->count, entry->group, entry);
1923
1924+ err = ext4_mb_load_buddy(sb, md->group, &e4b);
1925- err = ext4_mb_load_buddy(sb, entry->group, &e4b);
1926 /* we expect to find existing buddy because it's pinned */
1927 BUG_ON(err != 0);
1928
1929- db = e4b.bd_info;
1930 /* there are blocks to put in buddy to make them really free */
1931+ count += md->num;
1932- count += entry->count;
1933 count2++;
1934+ ext4_lock_group(sb, md->group);
1935+ for (i = 0; i < md->num; i++) {
1936+ mb_debug(" %u", md->blocks[i]);
1937+ mb_free_blocks(NULL, &e4b, md->blocks[i], 1);
1938- ext4_lock_group(sb, entry->group);
1939- /* Take it out of per group rb tree */
1940- rb_erase(&entry->node, &(db->bb_free_root));
1941- mb_free_blocks(NULL, &e4b, entry->start_blk, entry->count);
1942-
1943- if (!db->bb_free_root.rb_node) {
1944- /* No more items in the per group rb tree
1945- * balance refcounts from ext4_mb_free_metadata()
1946- */
1947- page_cache_release(e4b.bd_buddy_page);
1948- page_cache_release(e4b.bd_bitmap_page);
1949 }
1950+ mb_debug("\n");
1951+ ext4_unlock_group(sb, md->group);
1952- ext4_unlock_group(sb, entry->group);
1953
1954+ /* balance refcounts from ext4_mb_free_metadata() */
1955+ page_cache_release(e4b.bd_buddy_page);
1956+ page_cache_release(e4b.bd_bitmap_page);
1957+
1958+ kfree(md);
1959- kmem_cache_free(ext4_free_ext_cachep, entry);
1960 ext4_mb_release_desc(&e4b);
1961+
1962+ } while (md);
1963- } while (1);
1964
1965 mb_debug("freed %u blocks in %u structures\n", count, count2);
1966 }
1967@@ -3064,16 +2864,6 @@
1968 kmem_cache_destroy(ext4_pspace_cachep);
1969 return -ENOMEM;
1970 }
1971-
1972- ext4_free_ext_cachep =
1973- kmem_cache_create("ext4_free_block_extents",
1974- sizeof(struct ext4_free_data),
1975- 0, SLAB_RECLAIM_ACCOUNT, NULL);
1976- if (ext4_free_ext_cachep == NULL) {
1977- kmem_cache_destroy(ext4_pspace_cachep);
1978- kmem_cache_destroy(ext4_ac_cachep);
1979- return -ENOMEM;
1980- }
1981 #ifdef CONFIG_PROC_FS
1982 proc_root_ext4 = proc_mkdir("fs/ext4", NULL);
1983 if (proc_root_ext4 == NULL)
1984@@ -3090,7 +2880,6 @@
1985 #ifdef CONFIG_PROC_FS
1986 remove_proc_entry("fs/ext4", NULL);
1987 #endif
1988- kmem_cache_destroy(ext4_free_ext_cachep);
1989 }
1990
1991
1992@@ -3152,8 +2941,8 @@
1993 in_range(block + len - 1, ext4_inode_table(sb, gdp),
1994 EXT4_SB(sb)->s_itb_per_group)) {
1995 ext4_error(sb, __func__,
1996+ "Allocating block in system zone - block = %llu",
1997+ block);
1998- "Allocating block %llu in system zone of %lu group\n",
1999- block, ac->ac_b_ex.fe_group);
2000 		/* The file system is mounted not to panic on error:
2001 		 * fix the bitmap and repeat the block allocation.
2002 		 * We leak some of the blocks here.
2003@@ -3175,9 +2964,10 @@
2004 }
2005 }
2006 #endif
2007+ mb_set_bits(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group), bitmap_bh->b_data,
2008+ ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len);
2009+
2010 spin_lock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group));
2011- mb_set_bits(NULL, bitmap_bh->b_data,
2012- ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len);
2013 if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
2014 gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
2015 gdp->bg_free_blocks_count =
2016@@ -3400,7 +3190,7 @@
2017 }
2018 BUG_ON(start + size <= ac->ac_o_ex.fe_logical &&
2019 start > ac->ac_o_ex.fe_logical);
2020+ BUG_ON(size <= 0 || size >= EXT4_BLOCKS_PER_GROUP(ac->ac_sb));
2021- BUG_ON(size <= 0 || size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb));
2022
2023 /* now prepare goal request */
2024
2025@@ -3610,37 +3400,10 @@
2026 ac->ac_criteria = 20;
2027 return 1;
2028 }
2029-
2030 return 0;
2031 }
2032
2033 /*
2034- * the function goes through all blocks freed in the group
2035- * but not yet committed and marks them as used in the in-core bitmap.
2036- * the buddy must be generated from this bitmap
2037- * Needs to be called with the ext4 group lock (ext4_lock_group)
2038- */
2039-static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
2040- ext4_group_t group)
2041-{
2042- struct rb_node *n;
2043- struct ext4_group_info *grp;
2044- struct ext4_free_data *entry;
2045-
2046- grp = ext4_get_group_info(sb, group);
2047- n = rb_first(&(grp->bb_free_root));
2048-
2049- while (n) {
2050- entry = rb_entry(n, struct ext4_free_data, node);
2051- mb_set_bits(sb_bgl_lock(EXT4_SB(sb), group),
2052- bitmap, entry->start_blk,
2053- entry->count);
2054- n = rb_next(n);
2055- }
2056- return;
2057-}
2058-
2059-/*
2060  * the function goes through all preallocations in this group and marks them
2061  * as used in the in-core bitmap. the buddy must be generated from this bitmap
2062  * Needs to be called with the ext4 group lock (ext4_lock_group)
2063@@ -3698,7 +3461,6 @@
2064 struct super_block *sb, struct ext4_prealloc_space *pa)
2065 {
2066 unsigned long grp;
2067- ext4_fsblk_t grp_blk;
2068
2069 if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0)
2070 return;
2071@@ -3713,12 +3475,8 @@
2072 pa->pa_deleted = 1;
2073 spin_unlock(&pa->pa_lock);
2074
2075+ /* -1 is to protect from crossing allocation group */
2076+ ext4_get_group_no_and_offset(sb, pa->pa_pstart - 1, &grp, NULL);
2077- grp_blk = pa->pa_pstart;
2078- /* If linear, pa_pstart may be in the next group when pa is used up */
2079- if (pa->pa_linear)
2080- grp_blk--;
2081-
2082- ext4_get_group_no_and_offset(sb, grp_blk, &grp, NULL);
2083
2084 /*
2085 * possible race:
2086@@ -3807,8 +3565,6 @@
2087 pa->pa_free = pa->pa_len;
2088 atomic_set(&pa->pa_count, 1);
2089 spin_lock_init(&pa->pa_lock);
2090- INIT_LIST_HEAD(&pa->pa_inode_list);
2091- INIT_LIST_HEAD(&pa->pa_group_list);
2092 pa->pa_deleted = 0;
2093 pa->pa_linear = 0;
2094
2095@@ -3867,7 +3623,6 @@
2096 atomic_set(&pa->pa_count, 1);
2097 spin_lock_init(&pa->pa_lock);
2098 INIT_LIST_HEAD(&pa->pa_inode_list);
2099- INIT_LIST_HEAD(&pa->pa_group_list);
2100 pa->pa_deleted = 0;
2101 pa->pa_linear = 1;
2102
2103@@ -4411,7 +4166,6 @@
2104 ac->ac_pa = NULL;
2105 ac->ac_bitmap_page = NULL;
2106 ac->ac_buddy_page = NULL;
2107- ac->alloc_semp = NULL;
2108 ac->ac_lg = NULL;
2109
2110 	/* we have to define context: we'll work with a file or
2111@@ -4532,7 +4286,7 @@
2112 pa_inode_list) {
2113 spin_lock(&tmp_pa->pa_lock);
2114 if (tmp_pa->pa_deleted) {
2115+ spin_unlock(&pa->pa_lock);
2116- spin_unlock(&tmp_pa->pa_lock);
2117 continue;
2118 }
2119 if (!added && pa->pa_free < tmp_pa->pa_free) {
2120@@ -4577,23 +4331,18 @@
2121 pa->pa_free -= ac->ac_b_ex.fe_len;
2122 pa->pa_len -= ac->ac_b_ex.fe_len;
2123 spin_unlock(&pa->pa_lock);
2124+ /*
2125+ * We want to add the pa to the right bucket.
2126+ * Remove it from the list and while adding
2127+ * make sure the list to which we are adding
2128+ * doesn't grow big.
2129+ */
2130+ if (likely(pa->pa_free)) {
2131+ spin_lock(pa->pa_obj_lock);
2132+ list_del_rcu(&pa->pa_inode_list);
2133+ spin_unlock(pa->pa_obj_lock);
2134+ ext4_mb_add_n_trim(ac);
2135+ }
2136- }
2137- }
2138- if (ac->alloc_semp)
2139- up_read(ac->alloc_semp);
2140- if (pa) {
2141- /*
2142- * We want to add the pa to the right bucket.
2143- * Remove it from the list and while adding
2144- * make sure the list to which we are adding
2145- * doesn't grow big. We need to release
2146- * alloc_semp before calling ext4_mb_add_n_trim()
2147- */
2148- if (pa->pa_linear && likely(pa->pa_free)) {
2149- spin_lock(pa->pa_obj_lock);
2150- list_del_rcu(&pa->pa_inode_list);
2151- spin_unlock(pa->pa_obj_lock);
2152- ext4_mb_add_n_trim(ac);
2153 }
2154 ext4_mb_put_pa(ac, ac->ac_sb, pa);
2155 }
2156@@ -4700,14 +4449,10 @@
2157 ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len)
2158 ext4_mb_new_preallocation(ac);
2159 }
2160+
2161 if (likely(ac->ac_status == AC_STATUS_FOUND)) {
2162 *errp = ext4_mb_mark_diskspace_used(ac, handle);
2163 if (*errp == -EAGAIN) {
2164- /*
2165- * drop the reference that we took
2166- * in ext4_mb_use_best_found
2167- */
2168- ext4_mb_release_context(ac);
2169 ac->ac_b_ex.fe_group = 0;
2170 ac->ac_b_ex.fe_start = 0;
2171 ac->ac_b_ex.fe_len = 0;
2172@@ -4772,97 +4517,65 @@
2173 ext4_mb_free_committed_blocks(sb);
2174 }
2175
2176-/*
2177- * We can merge two free data extents only if the physical blocks
2178- * are contiguous, AND the extents were freed by the same transaction,
2179- * AND the blocks are associated with the same group.
2180- */
2181-static int can_merge(struct ext4_free_data *entry1,
2182- struct ext4_free_data *entry2)
2183-{
2184- if ((entry1->t_tid == entry2->t_tid) &&
2185- (entry1->group == entry2->group) &&
2186- ((entry1->start_blk + entry1->count) == entry2->start_blk))
2187- return 1;
2188- return 0;
2189-}
2190-
2191 static noinline_for_stack int
2192 ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
2193+ ext4_group_t group, ext4_grpblk_t block, int count)
2194- struct ext4_free_data *new_entry)
2195 {
2196- ext4_grpblk_t block;
2197- struct ext4_free_data *entry;
2198 struct ext4_group_info *db = e4b->bd_info;
2199 struct super_block *sb = e4b->bd_sb;
2200 struct ext4_sb_info *sbi = EXT4_SB(sb);
2201+ struct ext4_free_metadata *md;
2202+ int i;
2203- struct rb_node **n = &db->bb_free_root.rb_node, *node;
2204- struct rb_node *parent = NULL, *new_node;
2205
2206 BUG_ON(e4b->bd_bitmap_page == NULL);
2207 BUG_ON(e4b->bd_buddy_page == NULL);
2208
2209+ ext4_lock_group(sb, group);
2210+ for (i = 0; i < count; i++) {
2211+ md = db->bb_md_cur;
2212+ if (md && db->bb_tid != handle->h_transaction->t_tid) {
2213+ db->bb_md_cur = NULL;
2214+ md = NULL;
2215- new_node = &new_entry->node;
2216- block = new_entry->start_blk;
2217-
2218- if (!*n) {
2219-		/* first free block extent. We need to
2220-		 * protect the buddy cache from being freed,
2221-		 * otherwise we'll refresh it from
2222-		 * the on-disk bitmap and lose not-yet-available
2223-		 * blocks */
2224- page_cache_get(e4b->bd_buddy_page);
2225- page_cache_get(e4b->bd_bitmap_page);
2226- }
2227- while (*n) {
2228- parent = *n;
2229- entry = rb_entry(parent, struct ext4_free_data, node);
2230- if (block < entry->start_blk)
2231- n = &(*n)->rb_left;
2232- else if (block >= (entry->start_blk + entry->count))
2233- n = &(*n)->rb_right;
2234- else {
2235- ext4_error(sb, __func__,
2236- "Double free of blocks %d (%d %d)\n",
2237- block, entry->start_blk, entry->count);
2238- return 0;
2239 }
2240- }
2241
2242+ if (md == NULL) {
2243+ ext4_unlock_group(sb, group);
2244+ md = kmalloc(sizeof(*md), GFP_NOFS);
2245+ if (md == NULL)
2246+ return -ENOMEM;
2247+ md->num = 0;
2248+ md->group = group;
2249+
2250+ ext4_lock_group(sb, group);
2251+ if (db->bb_md_cur == NULL) {
2252+ spin_lock(&sbi->s_md_lock);
2253+ list_add(&md->list, &sbi->s_active_transaction);
2254+ spin_unlock(&sbi->s_md_lock);
2255+ /* protect buddy cache from being freed,
2256+ * otherwise we'll refresh it from
2257+ * on-disk bitmap and lose not-yet-available
2258+ * blocks */
2259+ page_cache_get(e4b->bd_buddy_page);
2260+ page_cache_get(e4b->bd_bitmap_page);
2261+ db->bb_md_cur = md;
2262+ db->bb_tid = handle->h_transaction->t_tid;
2263+ mb_debug("new md 0x%p for group %lu\n",
2264+ md, md->group);
2265+ } else {
2266+ kfree(md);
2267+ md = db->bb_md_cur;
2268+ }
2269- rb_link_node(new_node, parent, n);
2270- rb_insert_color(new_node, &db->bb_free_root);
2271-
2272- /* Now try to see the extent can be merged to left and right */
2273- node = rb_prev(new_node);
2274- if (node) {
2275- entry = rb_entry(node, struct ext4_free_data, node);
2276- if (can_merge(entry, new_entry)) {
2277- new_entry->start_blk = entry->start_blk;
2278- new_entry->count += entry->count;
2279- rb_erase(node, &(db->bb_free_root));
2280- spin_lock(&sbi->s_md_lock);
2281- list_del(&entry->list);
2282- spin_unlock(&sbi->s_md_lock);
2283- kmem_cache_free(ext4_free_ext_cachep, entry);
2284 }
2285- }
2286
2287+ BUG_ON(md->num >= EXT4_BB_MAX_BLOCKS);
2288+ md->blocks[md->num] = block + i;
2289+ md->num++;
2290+ if (md->num == EXT4_BB_MAX_BLOCKS) {
2291+ /* no more space, put full container on a sb's list */
2292+ db->bb_md_cur = NULL;
2293- node = rb_next(new_node);
2294- if (node) {
2295- entry = rb_entry(node, struct ext4_free_data, node);
2296- if (can_merge(new_entry, entry)) {
2297- new_entry->count += entry->count;
2298- rb_erase(node, &(db->bb_free_root));
2299- spin_lock(&sbi->s_md_lock);
2300- list_del(&entry->list);
2301- spin_unlock(&sbi->s_md_lock);
2302- kmem_cache_free(ext4_free_ext_cachep, entry);
2303 }
2304 }
2305+ ext4_unlock_group(sb, group);
2306- /* Add the extent to active_transaction list */
2307- spin_lock(&sbi->s_md_lock);
2308- list_add(&new_entry->list, &sbi->s_active_transaction);
2309- spin_unlock(&sbi->s_md_lock);
2310 return 0;
2311 }
2312
2313@@ -4962,6 +4675,11 @@
2314 err = ext4_journal_get_write_access(handle, gd_bh);
2315 if (err)
2316 goto error_return;
2317+
2318+ err = ext4_mb_load_buddy(sb, block_group, &e4b);
2319+ if (err)
2320+ goto error_return;
2321+
2322 #ifdef AGGRESSIVE_CHECK
2323 {
2324 int i;
2325@@ -4969,6 +4687,13 @@
2326 BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data));
2327 }
2328 #endif
2329+ mb_clear_bits(sb_bgl_lock(sbi, block_group), bitmap_bh->b_data,
2330+ bit, count);
2331+
2332+ /* We dirtied the bitmap block */
2333+ BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
2334+ err = ext4_journal_dirty_metadata(handle, bitmap_bh);
2335+
2336 if (ac) {
2337 ac->ac_b_ex.fe_group = block_group;
2338 ac->ac_b_ex.fe_start = bit;
2339@@ -4976,33 +4701,12 @@
2340 ext4_mb_store_history(ac);
2341 }
2342
2343- err = ext4_mb_load_buddy(sb, block_group, &e4b);
2344- if (err)
2345- goto error_return;
2346 if (metadata) {
2347+ /* blocks being freed are metadata. these blocks shouldn't
2348+ * be used until this transaction is committed */
2349+ ext4_mb_free_metadata(handle, &e4b, block_group, bit, count);
2350- struct ext4_free_data *new_entry;
2351- /*
2352- * blocks being freed are metadata. these blocks shouldn't
2353- * be used until this transaction is committed
2354- */
2355- new_entry = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS);
2356- new_entry->start_blk = bit;
2357- new_entry->group = block_group;
2358- new_entry->count = count;
2359- new_entry->t_tid = handle->h_transaction->t_tid;
2360- ext4_lock_group(sb, block_group);
2361- mb_clear_bits(sb_bgl_lock(sbi, block_group), bitmap_bh->b_data,
2362- bit, count);
2363- ext4_mb_free_metadata(handle, &e4b, new_entry);
2364- ext4_unlock_group(sb, block_group);
2365 } else {
2366 ext4_lock_group(sb, block_group);
2367- /* need to update group_info->bb_free and bitmap
2368- * with group lock held. generate_buddy look at
2369- * them with group lock_held
2370- */
2371- mb_clear_bits(sb_bgl_lock(sbi, block_group), bitmap_bh->b_data,
2372- bit, count);
2373 mb_free_blocks(inode, &e4b, bit, count);
2374 ext4_mb_return_to_preallocation(inode, &e4b, block, count);
2375 ext4_unlock_group(sb, block_group);
2376@@ -5025,10 +4729,6 @@
2377
2378 *freed += count;
2379
2380- /* We dirtied the bitmap block */
2381- BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
2382- err = ext4_journal_dirty_metadata(handle, bitmap_bh);
2383-
2384 /* And the group descriptor block */
2385 BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
2386 ret = ext4_journal_dirty_metadata(handle, gd_bh);
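
The can_merge() helper removed from this file coalesces pending free extents only under three conditions: freed by the same transaction, in the same group, and physically contiguous. A minimal stand-alone model of that rule, with simplified stand-in types rather than the kernel structs:

#include <stdio.h>

struct free_extent {
	unsigned int tid;	/* transaction that freed the blocks */
	unsigned int group;	/* block group */
	unsigned int start_blk;	/* first block within the group */
	unsigned int count;	/* number of blocks */
};

static int can_merge(const struct free_extent *a, const struct free_extent *b)
{
	return a->tid == b->tid &&
	       a->group == b->group &&
	       a->start_blk + a->count == b->start_blk;
}

int main(void)
{
	struct free_extent a = { 7, 3, 100, 16 };
	struct free_extent b = { 7, 3, 116, 8 };

	printf("mergeable: %d\n", can_merge(&a, &b));	/* prints 1 */
	b.tid = 8;					/* different transaction */
	printf("mergeable: %d\n", can_merge(&a, &b));	/* prints 0 */
	return 0;
}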
2387reverted:
2388--- b/fs/ext4/mballoc.h
2389+++ a/fs/ext4/mballoc.h
2390@@ -18,7 +18,6 @@
2391 #include <linux/pagemap.h>
2392 #include <linux/seq_file.h>
2393 #include <linux/version.h>
2394-#include <linux/mutex.h>
2395 #include "ext4_jbd2.h"
2396 #include "ext4.h"
2397 #include "group.h"
2398@@ -97,27 +96,25 @@
2399 */
2400 #define MB_DEFAULT_GROUP_PREALLOC 512
2401
2402+static struct kmem_cache *ext4_pspace_cachep;
2403+static struct kmem_cache *ext4_ac_cachep;
2404-struct ext4_free_data {
2405- /* this links the free block information from group_info */
2406- struct rb_node node;
2407
2408+#ifdef EXT4_BB_MAX_BLOCKS
2409+#undef EXT4_BB_MAX_BLOCKS
2410+#endif
2411+#define EXT4_BB_MAX_BLOCKS 30
2412- /* this links the free block information from ext4_sb_info */
2413- struct list_head list;
2414
2415+struct ext4_free_metadata {
2416- /* group which free block extent belongs */
2417 ext4_group_t group;
2418+ unsigned short num;
2419+ ext4_grpblk_t blocks[EXT4_BB_MAX_BLOCKS];
2420+ struct list_head list;
2421-
2422- /* free block extent */
2423- ext4_grpblk_t start_blk;
2424- ext4_grpblk_t count;
2425-
2426- /* transaction which freed this extent */
2427- tid_t t_tid;
2428 };
2429
2430 struct ext4_group_info {
2431 unsigned long bb_state;
2432+ unsigned long bb_tid;
2433+ struct ext4_free_metadata *bb_md_cur;
2434- struct rb_root bb_free_root;
2435 unsigned short bb_first_free;
2436 unsigned short bb_free;
2437 unsigned short bb_fragments;
2438@@ -125,7 +122,6 @@
2439 #ifdef DOUBLE_CHECK
2440 void *bb_bitmap;
2441 #endif
2442- struct rw_semaphore alloc_sem;
2443 unsigned short bb_counters[];
2444 };
2445
2446@@ -213,11 +209,6 @@
2447 __u8 ac_op; /* operation, for history only */
2448 struct page *ac_bitmap_page;
2449 struct page *ac_buddy_page;
2450- /*
2451- * pointer to the held semaphore upon successful
2452- * block allocation
2453- */
2454- struct rw_semaphore *alloc_semp;
2455 struct ext4_prealloc_space *ac_pa;
2456 struct ext4_locality_group *ac_lg;
2457 };
2458@@ -251,7 +242,6 @@
2459 struct super_block *bd_sb;
2460 __u16 bd_blkbits;
2461 ext4_group_t bd_group;
2462- struct rw_semaphore *alloc_semp;
2463 };
2464 #define EXT4_MB_BITMAP(e4b) ((e4b)->bd_bitmap)
2465 #define EXT4_MB_BUDDY(e4b) ((e4b)->bd_buddy)
2466@@ -261,6 +251,8 @@
2467 {
2468 return;
2469 }
2470+#else
2471+static void ext4_mb_store_history(struct ext4_allocation_context *ac);
2472 #endif
2473
2474 #define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
2475@@ -268,6 +260,19 @@
2476 static struct proc_dir_entry *proc_root_ext4;
2477 struct buffer_head *read_block_bitmap(struct super_block *, ext4_group_t);
2478
2479+static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
2480+ ext4_group_t group);
2481+static void ext4_mb_poll_new_transaction(struct super_block *, handle_t *);
2482+static void ext4_mb_free_committed_blocks(struct super_block *);
2483+static void ext4_mb_return_to_preallocation(struct inode *inode,
2484+ struct ext4_buddy *e4b, sector_t block,
2485+ int count);
2486+static void ext4_mb_put_pa(struct ext4_allocation_context *,
2487+ struct super_block *, struct ext4_prealloc_space *pa);
2488+static int ext4_mb_init_per_dev_proc(struct super_block *sb);
2489+static int ext4_mb_destroy_per_dev_proc(struct super_block *sb);
2490+
2491+
2492 static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group)
2493 {
2494 struct ext4_group_info *grinfo = ext4_get_group_info(sb, group);
2495@@ -292,7 +297,7 @@
2496 &(grinfo->bb_state));
2497 }
2498
2499+static ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb,
2500-static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb,
2501 struct ext4_free_extent *fex)
2502 {
2503 ext4_fsblk_t block;
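
The struct ext4_free_metadata restored above batches freed block numbers in a fixed 30-entry array per group instead of the rb-tree of extents. A simplified userspace model of that container — not the kernel structure; the queueing of a full container is reduced to a printf:

#include <stdio.h>
#include <string.h>

#define BB_MAX_BLOCKS 30

struct free_metadata {
	unsigned int group;
	unsigned short num;
	unsigned int blocks[BB_MAX_BLOCKS];
};

/* Returns 1 when the container filled up and should be queued for the
 * commit-time free pass. */
static int record_freed_block(struct free_metadata *md, unsigned int block)
{
	md->blocks[md->num++] = block;
	return md->num == BB_MAX_BLOCKS;
}

int main(void)
{
	struct free_metadata md;
	unsigned int b;

	memset(&md, 0, sizeof(md));
	md.group = 5;
	for (b = 0; b < 35; b++) {
		if (record_freed_block(&md, b)) {
			printf("container full at block %u, queueing\n", b);
			memset(&md, 0, sizeof(md));
			md.group = 5;
		}
	}
	printf("%d blocks left in the open container\n", md.num);
	return 0;
}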
2504reverted:
2505--- b/fs/ext4/migrate.c
2506+++ a/fs/ext4/migrate.c
2507@@ -480,7 +480,7 @@
2508 + 1);
2509 if (IS_ERR(handle)) {
2510 retval = PTR_ERR(handle);
2511+ goto err_out;
2512- return retval;
2513 }
2514 tmp_inode = ext4_new_inode(handle,
2515 inode->i_sb->s_root->d_inode,
2516@@ -488,7 +488,8 @@
2517 if (IS_ERR(tmp_inode)) {
2518 retval = -ENOMEM;
2519 ext4_journal_stop(handle);
2520+ tmp_inode = NULL;
2521+ goto err_out;
2522- return retval;
2523 }
2524 i_size_write(tmp_inode, i_size_read(inode));
2525 /*
2526@@ -616,7 +617,8 @@
2527
2528 ext4_journal_stop(handle);
2529
2530+ if (tmp_inode)
2531+ iput(tmp_inode);
2532- iput(tmp_inode);
2533
2534 return retval;
2535 }
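
The migrate.c hunks drop a centralized error path; for reference, the pattern being reverted funnels every failure through one label and releases the temporary inode only if it was actually obtained. A toy sketch with stub helpers standing in for ext4_new_inode() and iput():

#include <stdio.h>
#include <stdlib.h>
#include <errno.h>

struct inode { int users; };

static struct inode *new_tmp_inode(int fail)
{
	return fail ? NULL : calloc(1, sizeof(struct inode));
}

static void iput(struct inode *inode)
{
	free(inode);
}

static int migrate(int fail_inode)
{
	struct inode *tmp_inode = NULL;
	int retval = 0;

	tmp_inode = new_tmp_inode(fail_inode);
	if (tmp_inode == NULL) {
		retval = -ENOMEM;
		goto err_out;
	}
	/* ... migration work would happen here ... */
err_out:
	if (tmp_inode)		/* guard: may be NULL on early failure */
		iput(tmp_inode);
	return retval;
}

int main(void)
{
	printf("ok path: %d\n", migrate(0));
	printf("failure path: %d\n", migrate(1));
	return 0;
}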
2536reverted:
2537--- b/fs/ext4/namei.c
2538+++ a/fs/ext4/namei.c
2539@@ -371,8 +371,6 @@
2540 goto fail;
2541 }
2542 hinfo->hash_version = root->info.hash_version;
2543- if (hinfo->hash_version <= DX_HASH_TEA)
2544- hinfo->hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned;
2545 hinfo->seed = EXT4_SB(dir->i_sb)->s_hash_seed;
2546 if (dentry)
2547 ext4fs_dirhash(dentry->d_name.name, dentry->d_name.len, hinfo);
2548@@ -642,9 +640,6 @@
2549 dir = dir_file->f_path.dentry->d_inode;
2550 if (!(EXT4_I(dir)->i_flags & EXT4_INDEX_FL)) {
2551 hinfo.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version;
2552- if (hinfo.hash_version <= DX_HASH_TEA)
2553- hinfo.hash_version +=
2554- EXT4_SB(dir->i_sb)->s_hash_unsigned;
2555 hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed;
2556 count = htree_dirblock_to_tree(dir_file, dir, 0, &hinfo,
2557 start_hash, start_minor_hash);
2558@@ -1055,16 +1050,8 @@
2559 return ERR_PTR(-EIO);
2560 }
2561 inode = ext4_iget(dir->i_sb, ino);
2562+ if (IS_ERR(inode))
2563+ return ERR_CAST(inode);
2564- if (unlikely(IS_ERR(inode))) {
2565- if (PTR_ERR(inode) == -ESTALE) {
2566- ext4_error(dir->i_sb, __func__,
2567- "deleted inode referenced: %u",
2568- ino);
2569- return ERR_PTR(-EIO);
2570- } else {
2571- return ERR_CAST(inode);
2572- }
2573- }
2574 }
2575 return d_splice_alias(inode, dentry);
2576 }
2577@@ -1390,7 +1377,7 @@
2578 struct fake_dirent *fde;
2579
2580 blocksize = dir->i_sb->s_blocksize;
2581+ dxtrace(printk("Creating index\n"));
2582- dxtrace(printk(KERN_DEBUG "Creating index: inode %lu\n", dir->i_ino));
2583 retval = ext4_journal_get_write_access(handle, bh);
2584 if (retval) {
2585 ext4_std_error(dir->i_sb, retval);
2586@@ -1399,20 +1386,6 @@
2587 }
2588 root = (struct dx_root *) bh->b_data;
2589
2590- /* The 0th block becomes the root, move the dirents out */
2591- fde = &root->dotdot;
2592- de = (struct ext4_dir_entry_2 *)((char *)fde +
2593- ext4_rec_len_from_disk(fde->rec_len));
2594- if ((char *) de >= (((char *) root) + blocksize)) {
2595- ext4_error(dir->i_sb, __func__,
2596- "invalid rec_len for '..' in inode %lu",
2597- dir->i_ino);
2598- brelse(bh);
2599- return -EIO;
2600- }
2601- len = ((char *) root) + blocksize - (char *) de;
2602-
2603- /* Allocate new block for the 0th block's dirents */
2604 bh2 = ext4_append (handle, dir, &block, &retval);
2605 if (!(bh2)) {
2606 brelse(bh);
2607@@ -1421,6 +1394,11 @@
2608 EXT4_I(dir)->i_flags |= EXT4_INDEX_FL;
2609 data1 = bh2->b_data;
2610
2611+ /* The 0th block becomes the root, move the dirents out */
2612+ fde = &root->dotdot;
2613+ de = (struct ext4_dir_entry_2 *)((char *)fde +
2614+ ext4_rec_len_from_disk(fde->rec_len));
2615+ len = ((char *) root) + blocksize - (char *) de;
2616 memcpy (data1, de, len);
2617 de = (struct ext4_dir_entry_2 *) data1;
2618 top = data1 + len;
2619@@ -1440,8 +1418,6 @@
2620
2621 /* Initialize as for dx_probe */
2622 hinfo.hash_version = root->info.hash_version;
2623- if (hinfo.hash_version <= DX_HASH_TEA)
2624- hinfo.hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned;
2625 hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed;
2626 ext4fs_dirhash(name, namelen, &hinfo);
2627 frame = frames;
2628@@ -2314,7 +2290,7 @@
2629 struct inode * old_inode, * new_inode;
2630 struct buffer_head * old_bh, * new_bh, * dir_bh;
2631 struct ext4_dir_entry_2 * old_de, * new_de;
2632+ int retval;
2633- int retval, force_da_alloc = 0;
2634
2635 old_bh = new_bh = dir_bh = NULL;
2636
2637@@ -2452,7 +2428,6 @@
2638 ext4_mark_inode_dirty(handle, new_inode);
2639 if (!new_inode->i_nlink)
2640 ext4_orphan_add(handle, new_inode);
2641- force_da_alloc = 1;
2642 }
2643 retval = 0;
2644
2645@@ -2461,8 +2436,6 @@
2646 brelse (old_bh);
2647 brelse (new_bh);
2648 ext4_journal_stop(handle);
2649- if (retval == 0 && force_da_alloc)
2650- ext4_alloc_da_blocks(old_inode);
2651 return retval;
2652 }
2653
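
The s_hash_unsigned logic removed above exists because the legacy dirhash folded name bytes through plain char, whose signedness varies by architecture, so the same name could hash differently on different machines. The toy fold below is not the real dx hash; it only demonstrates where the sign of char changes the result for bytes >= 0x80:

#include <stdio.h>

static unsigned int fold(const char *name, int len, int treat_unsigned)
{
	unsigned int hash = 0x12a3fe2d;
	int i;

	for (i = 0; i < len; i++) {
		int c = treat_unsigned ? (unsigned char)name[i]
				       : (signed char)name[i];
		hash = hash * 31 + c;
	}
	return hash;
}

int main(void)
{
	/* A name containing a byte >= 0x80 exposes the difference. */
	const char name[] = { 'f', (char)0xe9, 'e', 0 };

	printf("signed fold:   %#x\n", fold(name, 3, 0));
	printf("unsigned fold: %#x\n", fold(name, 3, 1));
	return 0;
}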
2654reverted:
2655--- b/fs/ext4/resize.c
2656+++ a/fs/ext4/resize.c
2657@@ -284,9 +284,11 @@
2658 if ((err = extend_or_restart_transaction(handle, 2, bh)))
2659 goto exit_bh;
2660
2661+ mark_bitmap_end(input->blocks_count, EXT4_BLOCKS_PER_GROUP(sb),
2662+ bh->b_data);
2663- mark_bitmap_end(input->blocks_count, sb->s_blocksize * 8, bh->b_data);
2664 ext4_journal_dirty_metadata(handle, bh);
2665 brelse(bh);
2666+
2667 /* Mark unused entries in inode bitmap used */
2668 ext4_debug("clear inode bitmap %#04llx (+%llu)\n",
2669 input->inode_bitmap, input->inode_bitmap - start);
2670@@ -295,7 +297,7 @@
2671 goto exit_journal;
2672 }
2673
2674+ mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), EXT4_BLOCKS_PER_GROUP(sb),
2675- mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8,
2676 bh->b_data);
2677 ext4_journal_dirty_metadata(handle, bh);
2678 exit_bh:
2679@@ -745,7 +747,6 @@
2680 struct inode *inode = NULL;
2681 handle_t *handle;
2682 int gdb_off, gdb_num;
2683- int num_grp_locked = 0;
2684 int err, err2;
2685
2686 gdb_num = input->group / EXT4_DESC_PER_BLOCK(sb);
2687@@ -786,7 +787,6 @@
2688 }
2689 }
2690
2691-
2692 if ((err = verify_group_input(sb, input)))
2693 goto exit_put;
2694
2695@@ -855,18 +855,15 @@
2696 * using the new disk blocks.
2697 */
2698
2699- num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, input->group);
2700 /* Update group descriptor block for new group */
2701 gdp = (struct ext4_group_desc *)((char *)primary->b_data +
2702 gdb_off * EXT4_DESC_SIZE(sb));
2703
2704- memset(gdp, 0, EXT4_DESC_SIZE(sb));
2705 ext4_block_bitmap_set(sb, gdp, input->block_bitmap); /* LV FIXME */
2706 ext4_inode_bitmap_set(sb, gdp, input->inode_bitmap); /* LV FIXME */
2707 ext4_inode_table_set(sb, gdp, input->inode_table); /* LV FIXME */
2708 gdp->bg_free_blocks_count = cpu_to_le16(input->free_blocks_count);
2709 gdp->bg_free_inodes_count = cpu_to_le16(EXT4_INODES_PER_GROUP(sb));
2710- gdp->bg_flags = cpu_to_le16(EXT4_BG_INODE_ZEROED);
2711 gdp->bg_checksum = ext4_group_desc_csum(sbi, input->group, gdp);
2712
2713 /*
2714@@ -874,11 +871,9 @@
2715 * descriptor
2716 */
2717 if (test_opt(sb, MBALLOC)) {
2718+ err = ext4_mb_add_more_groupinfo(sb, input->group, gdp);
2719+ if (err)
2720- err = ext4_mb_add_groupinfo(sb, input->group, gdp);
2721- if (err) {
2722- ext4_mb_put_buddy_cache_lock(sb, input->group, num_grp_locked);
2723 goto exit_journal;
2724- }
2725 }
2726 /*
2727 * Make the new blocks and inodes valid next. We do this before
2728@@ -920,7 +915,6 @@
2729
2730 /* Update the global fs size fields */
2731 sbi->s_groups_count++;
2732- ext4_mb_put_buddy_cache_lock(sb, input->group, num_grp_locked);
2733
2734 ext4_journal_dirty_metadata(handle, primary);
2735
2736@@ -982,7 +976,9 @@
2737 struct buffer_head * bh;
2738 handle_t *handle;
2739 int err;
2740+ unsigned long freed_blocks;
2741 ext4_group_t group;
2742+ struct ext4_group_info *grp;
2743
2744 /* We don't need to worry about locking wrt other resizers just
2745 * yet: we're going to revalidate es->s_blocks_count after
2746@@ -1081,13 +1077,50 @@
2747 unlock_super(sb);
2748 ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count,
2749 o_blocks_count + add);
2750+ ext4_free_blocks_sb(handle, sb, o_blocks_count, add, &freed_blocks);
2751- /* We add the blocks to the bitmap and set the group need init bit */
2752- ext4_add_groupblocks(handle, sb, o_blocks_count, add);
2753 ext4_debug("freed blocks %llu through %llu\n", o_blocks_count,
2754 o_blocks_count + add);
2755 if ((err = ext4_journal_stop(handle)))
2756 goto exit_put;
2757
2758+ /*
2759+ * Mark mballoc pages as not up to date so that they will be updated
2760+ * next time they are loaded by ext4_mb_load_buddy.
2761+ */
2762+ if (test_opt(sb, MBALLOC)) {
2763+ struct ext4_sb_info *sbi = EXT4_SB(sb);
2764+ struct inode *inode = sbi->s_buddy_cache;
2765+ int blocks_per_page;
2766+ int block;
2767+ int pnum;
2768+ struct page *page;
2769+
2770+ /* Set buddy page as not up to date */
2771+ blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
2772+ block = group * 2;
2773+ pnum = block / blocks_per_page;
2774+ page = find_get_page(inode->i_mapping, pnum);
2775+ if (page != NULL) {
2776+ ClearPageUptodate(page);
2777+ page_cache_release(page);
2778+ }
2779+
2780+ /* Set bitmap page as not up to date */
2781+ block++;
2782+ pnum = block / blocks_per_page;
2783+ page = find_get_page(inode->i_mapping, pnum);
2784+ if (page != NULL) {
2785+ ClearPageUptodate(page);
2786+ page_cache_release(page);
2787+ }
2788+
2789+ /* Get the info on the last group */
2790+ grp = ext4_get_group_info(sb, group);
2791+
2792+ /* Update free blocks in group info */
2793+ ext4_mb_update_group_info(grp, add);
2794+ }
2795+
2796 if (test_opt(sb, DEBUG))
2797 printk(KERN_DEBUG "EXT4-fs: extended group to %llu blocks\n",
2798 ext4_blocks_count(es));
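
mark_bitmap_end(), whose arguments change in the hunks above, pads the tail of the last group's bitmap with 1s so the allocator never hands out blocks past the end of the device. A simplified userspace version operating on a plain byte array rather than a buffer_head:

#include <stdio.h>

static void mark_bitmap_end(int start_bit, int end_bit, unsigned char *bitmap)
{
	int i;

	/* set every bit in [start_bit, end_bit) */
	for (i = start_bit; i < end_bit; i++)
		bitmap[i >> 3] |= 1 << (i & 7);
}

int main(void)
{
	unsigned char bitmap[4] = { 0 };	/* 32 bits */

	/* 20 real blocks in a 32-bit bitmap: pad bits 20..31 with 1s. */
	mark_bitmap_end(20, 32, bitmap);
	printf("%02x %02x %02x %02x\n",
	       bitmap[0], bitmap[1], bitmap[2], bitmap[3]);
	return 0;
}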
2799reverted:
2800--- b/fs/ext4/super.c
2801+++ a/fs/ext4/super.c
2802@@ -1493,6 +1493,7 @@
2803 ext4_group_t flex_group_count;
2804 ext4_group_t flex_group;
2805 int groups_per_flex = 0;
2806+ __u64 block_bitmap = 0;
2807 int i;
2808
2809 if (!sbi->s_es->s_log_groups_per_flex) {
2810@@ -1515,6 +1516,9 @@
2811 goto failed;
2812 }
2813
2814+ gdp = ext4_get_group_desc(sb, 1, &bh);
2815+ block_bitmap = ext4_block_bitmap(sb, gdp) - 1;
2816+
2817 for (i = 0; i < sbi->s_groups_count; i++) {
2818 gdp = ext4_get_group_desc(sb, i, &bh);
2819
2820@@ -1916,8 +1920,8 @@
2821 struct inode *root;
2822 int ret = -EINVAL;
2823 int blocksize;
2824+ int db_count;
2825+ int i;
2826- unsigned int db_count;
2827- unsigned int i;
2828 int needs_recovery;
2829 __le32 features;
2830 __u64 blocks_count;
2831@@ -2168,18 +2172,6 @@
2832 for (i = 0; i < 4; i++)
2833 sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]);
2834 sbi->s_def_hash_version = es->s_def_hash_version;
2835- i = le32_to_cpu(es->s_flags);
2836- if (i & EXT2_FLAGS_UNSIGNED_HASH)
2837- sbi->s_hash_unsigned = 3;
2838- else if ((i & EXT2_FLAGS_SIGNED_HASH) == 0) {
2839-#ifdef __CHAR_UNSIGNED__
2840- es->s_flags |= cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH);
2841- sbi->s_hash_unsigned = 3;
2842-#else
2843- es->s_flags |= cpu_to_le32(EXT2_FLAGS_SIGNED_HASH);
2844-#endif
2845- sb->s_dirt = 1;
2846- }
2847
2848 if (sbi->s_blocks_per_group > blocksize * 8) {
2849 printk(KERN_ERR
2850@@ -2207,30 +2199,20 @@
2851 if (EXT4_BLOCKS_PER_GROUP(sb) == 0)
2852 goto cantfind_ext4;
2853
2854+ /* ensure blocks_count calculation below doesn't sign-extend */
2855+ if (ext4_blocks_count(es) + EXT4_BLOCKS_PER_GROUP(sb) <
2856+ le32_to_cpu(es->s_first_data_block) + 1) {
2857+ printk(KERN_WARNING "EXT4-fs: bad geometry: block count %llu, "
2858+ "first data block %u, blocks per group %lu\n",
2859+ ext4_blocks_count(es),
2860+ le32_to_cpu(es->s_first_data_block),
2861+ EXT4_BLOCKS_PER_GROUP(sb));
2862- /*
2863- * It makes no sense for the first data block to be beyond the end
2864- * of the filesystem.
2865- */
2866- if (le32_to_cpu(es->s_first_data_block) >= ext4_blocks_count(es)) {
2867- printk(KERN_WARNING "EXT4-fs: bad geometry: first data"
2868- "block %u is beyond end of filesystem (%llu)\n",
2869- le32_to_cpu(es->s_first_data_block),
2870- ext4_blocks_count(es));
2871 goto failed_mount;
2872 }
2873 blocks_count = (ext4_blocks_count(es) -
2874 le32_to_cpu(es->s_first_data_block) +
2875 EXT4_BLOCKS_PER_GROUP(sb) - 1);
2876 do_div(blocks_count, EXT4_BLOCKS_PER_GROUP(sb));
2877- if (blocks_count > ((uint64_t)1<<32) - EXT4_DESC_PER_BLOCK(sb)) {
2878- printk(KERN_WARNING "EXT4-fs: groups count too large: %u "
2879- "(block count %llu, first data block %u, "
2880- "blocks per group %lu)\n", sbi->s_groups_count,
2881- ext4_blocks_count(es),
2882- le32_to_cpu(es->s_first_data_block),
2883- EXT4_BLOCKS_PER_GROUP(sb));
2884- goto failed_mount;
2885- }
2886 sbi->s_groups_count = blocks_count;
2887 db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) /
2888 EXT4_DESC_PER_BLOCK(sb);
2889@@ -2950,14 +2932,14 @@
2890
2891 static int ext4_sync_fs(struct super_block *sb, int wait)
2892 {
2893+ int ret = 0;
2894- tid_t target;
2895
2896 sb->s_dirt = 0;
2897+ if (wait)
2898+ ret = ext4_force_commit(sb);
2899+ else
2900+ jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, NULL);
2901+ return ret;
2902- if (jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, &target)) {
2903- if (wait)
2904- jbd2_log_wait_commit(EXT4_SB(sb)->s_journal, target);
2905- }
2906- return 0;
2907 }
2908
2909 /*
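
The geometry check swapped out above computes the group count in 64-bit arithmetic and rejects filesystems whose group count would overflow the 32-bit counters, less the slack consumed by group descriptors. A sketch of that calculation with illustrative values, not ones taken from a real superblock:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t blocks_count = 1ULL << 40;	/* total fs blocks */
	uint32_t first_data_block = 1;
	uint64_t blocks_per_group = 8192;
	uint64_t desc_per_block = 128;
	uint64_t groups;

	groups = (blocks_count - first_data_block + blocks_per_group - 1)
			/ blocks_per_group;
	if (groups > ((uint64_t)1 << 32) - desc_per_block) {
		printf("groups count too large: %llu\n",
		       (unsigned long long)groups);
		return 1;
	}
	printf("groups: %llu\n", (unsigned long long)groups);
	return 0;
}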
2910reverted:
2911--- b/fs/jbd2/commit.c
2912+++ a/fs/jbd2/commit.c
2913@@ -24,7 +24,6 @@
2914 #include <linux/crc32.h>
2915 #include <linux/writeback.h>
2916 #include <linux/backing-dev.h>
2917-#include <linux/bio.h>
2918
2919 /*
2920 * Default IO end handler for temporary BJ_IO buffer_heads.
2921@@ -171,34 +170,12 @@
2922 * This function along with journal_submit_commit_record
2923 * allows to write the commit record asynchronously.
2924 */
2925+static int journal_wait_on_commit_record(struct buffer_head *bh)
2926-static int journal_wait_on_commit_record(journal_t *journal,
2927- struct buffer_head *bh)
2928 {
2929 int ret = 0;
2930
2931-retry:
2932 clear_buffer_dirty(bh);
2933 wait_on_buffer(bh);
2934- if (buffer_eopnotsupp(bh) && (journal->j_flags & JBD2_BARRIER)) {
2935- printk(KERN_WARNING
2936- "JBD2: wait_on_commit_record: sync failed on %s - "
2937- "disabling barriers\n", journal->j_devname);
2938- spin_lock(&journal->j_state_lock);
2939- journal->j_flags &= ~JBD2_BARRIER;
2940- spin_unlock(&journal->j_state_lock);
2941-
2942- lock_buffer(bh);
2943- clear_buffer_dirty(bh);
2944- set_buffer_uptodate(bh);
2945- bh->b_end_io = journal_end_buffer_io_sync;
2946-
2947- ret = submit_bh(WRITE_SYNC, bh);
2948- if (ret) {
2949- unlock_buffer(bh);
2950- return ret;
2951- }
2952- goto retry;
2953- }
2954
2955 if (unlikely(!buffer_uptodate(bh)))
2956 ret = -EIO;
2957@@ -818,7 +795,7 @@
2958 __jbd2_journal_abort_hard(journal);
2959 }
2960 if (!err && !is_journal_aborted(journal))
2961+ err = journal_wait_on_commit_record(cbh);
2962- err = journal_wait_on_commit_record(journal, cbh);
2963
2964 if (err)
2965 jbd2_journal_abort(journal, err);
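
The code removed from journal_wait_on_commit_record() implements a barrier fallback: if the device rejects a barrier write with EOPNOTSUPP, the journal clears its barrier flag and resubmits the same buffer as an ordinary write. A stand-alone model of that retry loop, with a stub in place of submit_bh():

#include <stdio.h>
#include <errno.h>

static int device_supports_barriers = 0;

static int submit_write(int barrier)
{
	if (barrier && !device_supports_barriers)
		return -EOPNOTSUPP;
	return 0;		/* write completed */
}

int main(void)
{
	int use_barrier = 1;
	int ret;

retry:
	ret = submit_write(use_barrier);
	if (ret == -EOPNOTSUPP && use_barrier) {
		fprintf(stderr, "barriers unsupported, disabling\n");
		use_barrier = 0;	/* like clearing JBD2_BARRIER */
		goto retry;
	}
	printf("commit record written, ret=%d\n", ret);
	return 0;
}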
2966reverted:
2967--- b/fs/jbd2/journal.c
2968+++ a/fs/jbd2/journal.c
2969@@ -430,7 +430,7 @@
2970 }
2971
2972 /*
2973+ * Called under j_state_lock. Returns true if a transaction was started.
2974- * Called under j_state_lock. Returns true if a transaction commit was started.
2975 */
2976 int __jbd2_log_start_commit(journal_t *journal, tid_t target)
2977 {
2978@@ -498,8 +498,7 @@
2979
2980 /*
2981 * Start a commit of the current running transaction (if any). Returns true
2982+ * if a transaction was started, and fills its tid in at *ptid
2983- * if a transaction is going to be committed (or is currently already
2984- * committing), and fills its tid in at *ptid
2985 */
2986 int jbd2_journal_start_commit(journal_t *journal, tid_t *ptid)
2987 {
2988@@ -509,19 +508,15 @@
2989 if (journal->j_running_transaction) {
2990 tid_t tid = journal->j_running_transaction->t_tid;
2991
2992+ ret = __jbd2_log_start_commit(journal, tid);
2993+ if (ret && ptid)
2994- __jbd2_log_start_commit(journal, tid);
2995- /* There's a running transaction and we've just made sure
2996- * it's commit has been scheduled. */
2997- if (ptid)
2998 *ptid = tid;
2999+ } else if (journal->j_committing_transaction && ptid) {
3000- ret = 1;
3001- } else if (journal->j_committing_transaction) {
3002 /*
3003 * If ext3_write_super() recently started a commit, then we
3004 * have to wait for completion of that transaction
3005 */
3006+ *ptid = journal->j_committing_transaction->t_tid;
3007- if (ptid)
3008- *ptid = journal->j_committing_transaction->t_tid;
3009 ret = 1;
3010 }
3011 spin_unlock(&journal->j_state_lock);
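
The jbd2_journal_start_commit() fix being reverted changes the return convention: true whenever a commit was scheduled or is already in flight, with the tid filled in for the caller to wait on. A toy model of that caller-visible behaviour — this is not jbd2 itself, just the two-branch logic:

#include <stdio.h>

struct journal {
	int running_tid;	/* 0 means no running transaction */
	int committing_tid;	/* 0 means nothing committing */
};

/* Returns 1 and fills *ptid when a commit was started or is in flight. */
static int start_commit(struct journal *j, int *ptid)
{
	if (j->running_tid) {
		*ptid = j->running_tid;
		j->committing_tid = j->running_tid;
		j->running_tid = 0;
		return 1;
	}
	if (j->committing_tid) {
		*ptid = j->committing_tid;
		return 1;
	}
	return 0;
}

int main(void)
{
	struct journal j = { .running_tid = 42 };
	int tid;

	if (start_commit(&j, &tid))
		printf("waiting for commit of tid %d\n", tid);
	else
		printf("nothing to commit\n");
	return 0;
}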
3012reverted:
3013--- b/fs/jbd2/revoke.c
3014+++ a/fs/jbd2/revoke.c
3015@@ -55,25 +55,6 @@
3016 * need do nothing.
3017 * RevokeValid set, Revoked set:
3018 * buffer has been revoked.
3019- *
3020- * Locking rules:
3021- * We keep two hash tables of revoke records. One hashtable belongs to the
3022- * running transaction (is pointed to by journal->j_revoke), the other one
3023- * belongs to the committing transaction. Accesses to the second hash table
3024- * happen only from the kjournald and no other thread touches this table. Also
3025- * journal_switch_revoke_table() which switches which hashtable belongs to the
3026- * running and which to the committing transaction is called only from
3027- * kjournald. Therefore we need no locks when accessing the hashtable belonging
3028- * to the committing transaction.
3029- *
3030- * All users operating on the hash table belonging to the running transaction
3031- * have a handle to the transaction. Therefore they are safe from kjournald
3032- * switching hash tables under them. For operations on the lists of entries in
3033- * the hash table j_revoke_lock is used.
3034- *
3035- * Finally, also replay code uses the hash tables but at this moment noone else
3036- * can touch them (filesystem isn't mounted yet) and hence no locking is
3037- * needed.
3038 */
3039
3040 #ifndef __KERNEL__
3041@@ -420,6 +401,8 @@
3042 * the second time we would still have a pending revoke to cancel. So,
3043 * do not trust the Revoked bit on buffers unless RevokeValid is also
3044 * set.
3045+ *
3046+ * The caller must have the journal locked.
3047 */
3048 int jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh)
3049 {
3050@@ -497,7 +480,10 @@
3051 /*
3052 * Write revoke records to the journal for all entries in the current
3053 * revoke hash, deleting the entries as we go.
3054+ *
3055+ * Called with the journal lock held.
3056 */
3057+
3058 void jbd2_journal_write_revoke_records(journal_t *journal,
3059 transaction_t *transaction)
3060 {
3061reverted:
3062--- b/fs/jbd2/transaction.c
3063+++ a/fs/jbd2/transaction.c
3064@@ -2049,46 +2049,26 @@
3065 }
3066
3067 /*
3068+ * This function must be called when an inode is journaled in ordered mode
3069+ * before truncation happens. It starts writeout of the truncated part in
3070+ * case it is in the committing transaction, so that we uphold the
3071+ * ordered-mode consistency guarantees.
3072- * File truncate and transaction commit interact with each other in a
3073- * non-trivial way. If a transaction writing data block A is
3074- * committing, we cannot discard the data by truncate until we have
3075- * written them. Otherwise if we crashed after the transaction with
3076- * write has committed but before the transaction with truncate has
3077- * committed, we could see stale data in block A. This function is a
3078- * helper to solve this problem. It starts writeout of the truncated
3079- * part in case it is in the committing transaction.
3080- *
3081- * Filesystem code must call this function when inode is journaled in
3082- * ordered mode before truncation happens and after the inode has been
3083- * placed on orphan list with the new inode size. The second condition
3084- * avoids the race that someone writes new data and we start
3085- * committing the transaction after this function has been called but
3086- * before a transaction for truncate is started (and furthermore it
3087- * allows us to optimize the case where the addition to orphan list
3088- * happens in the same transaction as write --- we don't have to write
3089- * any data in such case).
3090 */
3091+int jbd2_journal_begin_ordered_truncate(struct jbd2_inode *inode,
3092-int jbd2_journal_begin_ordered_truncate(journal_t *journal,
3093- struct jbd2_inode *jinode,
3094 loff_t new_size)
3095 {
3096+ journal_t *journal;
3097+ transaction_t *commit_trans;
3098- transaction_t *inode_trans, *commit_trans;
3099 int ret = 0;
3100
3101+ if (!inode->i_transaction && !inode->i_next_transaction)
3102- /* This is a quick check to avoid locking if not necessary */
3103- if (!jinode->i_transaction)
3104 goto out;
3105+ journal = inode->i_transaction->t_journal;
3106- /* Locks are here just to force reading of recent values, it is
3107- * enough that the transaction was not committing before we started
3108- * a transaction adding the inode to orphan list */
3109 spin_lock(&journal->j_state_lock);
3110 commit_trans = journal->j_committing_transaction;
3111 spin_unlock(&journal->j_state_lock);
3112+ if (inode->i_transaction == commit_trans) {
3113+ ret = filemap_fdatawrite_range(inode->i_vfs_inode->i_mapping,
3114- spin_lock(&journal->j_list_lock);
3115- inode_trans = jinode->i_transaction;
3116- spin_unlock(&journal->j_list_lock);
3117- if (inode_trans == commit_trans) {
3118- ret = filemap_fdatawrite_range(jinode->i_vfs_inode->i_mapping,
3119 new_size, LLONG_MAX);
3120 if (ret)
3121 jbd2_journal_abort(journal, ret);
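
The rewritten comment and code above implement the ordered-truncate rule: if the inode's journaled data belongs to the transaction currently committing, the about-to-be-truncated range must hit disk before truncate proceeds, or a crash could expose stale blocks. A sketch with stubs standing in for jbd2 and the page cache:

#include <stdio.h>

struct transaction { int tid; };

static int writeout_range(long long from)
{
	printf("writing dirty pages from offset %lld to EOF\n", from);
	return 0;
}

static int begin_ordered_truncate(struct transaction *inode_trans,
				  struct transaction *committing,
				  long long new_size)
{
	if (inode_trans == NULL)
		return 0;	/* no journaled data, nothing to do */
	if (inode_trans == committing)
		return writeout_range(new_size);
	return 0;
}

int main(void)
{
	struct transaction t = { 17 };

	begin_ordered_truncate(&t, &t, 4096);	/* forces writeout */
	begin_ordered_truncate(&t, NULL, 4096);	/* no-op */
	return 0;
}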
3122reverted:
3123--- b/include/linux/jbd2.h
3124+++ a/include/linux/jbd2.h
3125@@ -308,8 +308,7 @@
3126 int val = (expr); \
3127 if (!val) { \
3128 printk(KERN_ERR \
3129+ "EXT3-fs unexpected failure: %s;\n",# expr); \
3130- "JBD2 unexpected failure: %s: %s;\n", \
3131- __func__, #expr); \
3132 printk(KERN_ERR why "\n"); \
3133 } \
3134 val; \
3135@@ -330,7 +329,6 @@
3136 BH_State, /* Pins most journal_head state */
3137 BH_JournalHead, /* Pins bh->b_private and jh->b_bh */
3138 BH_Unshadow, /* Dummy bit, for BJ_Shadow wakeup filtering */
3139- BH_JBDPrivateStart, /* First bit available for private use by FS */
3140 };
3141
3142 BUFFER_FNS(JBD, jbd)
3143@@ -1075,8 +1073,7 @@
3144 extern int jbd2_journal_bmap(journal_t *, unsigned long, unsigned long long *);
3145 extern int jbd2_journal_force_commit(journal_t *);
3146 extern int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *inode);
3147+extern int jbd2_journal_begin_ordered_truncate(struct jbd2_inode *inode, loff_t new_size);
3148-extern int jbd2_journal_begin_ordered_truncate(journal_t *journal,
3149- struct jbd2_inode *inode, loff_t new_size);
3150 extern void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode, struct inode *inode);
3151 extern void jbd2_journal_release_jbd_inode(journal_t *journal, struct jbd2_inode *jinode);
3152