1 From: Greg Kroah-Hartman <gregkh@suse.de>
2 Subject: Linux 2.6.27.19
3
4 Upstream 2.6.27.19 release from kernel.org
5
6 Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
7
8 diff --git a/Makefile b/Makefile
9 index 9273a73..dbe8543 100644
10 --- a/Makefile
11 +++ b/Makefile
12 @@ -1,7 +1,7 @@
13 VERSION = 2
14 PATCHLEVEL = 6
15 SUBLEVEL = 27
16 -EXTRAVERSION = .18
17 +EXTRAVERSION = .19
18 NAME = Trembling Tortoise
19
20 # *DOCUMENTATION*
21 diff --git a/arch/powerpc/kernel/align.c b/arch/powerpc/kernel/align.c
22 index 5af4e9b..ada0692 100644
23 --- a/arch/powerpc/kernel/align.c
24 +++ b/arch/powerpc/kernel/align.c
25 @@ -646,11 +646,16 @@ static int emulate_vsx(unsigned char __user *addr, unsigned int reg,
26 unsigned int areg, struct pt_regs *regs,
27 unsigned int flags, unsigned int length)
28 {
29 - char *ptr = (char *) &current->thread.TS_FPR(reg);
30 + char *ptr;
31 int ret = 0;
32
33 flush_vsx_to_thread(current);
34
35 + if (reg < 32)
36 + ptr = (char *) &current->thread.TS_FPR(reg);
37 + else
38 + ptr = (char *) &current->thread.vr[reg - 32];
39 +
40 if (flags & ST)
41 ret = __copy_to_user(addr, ptr, length);
42 else {
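The hunk above stops assuming every VSX register lives in the FPR save area: registers 0-31 map to thread.TS_FPR, while 32-63 map to thread.vr. A minimal userspace sketch of the same dispatch; the fpr/vr arrays and vsx_reg_ptr() below are hypothetical stand-ins for the kernel's per-thread save areas, not kernel API:

#include <stdio.h>
#include <string.h>

/* Hypothetical stand-ins for the two per-thread register save areas. */
static double fpr[32];            /* VSX regs 0..31 overlay the FP regs  */
static unsigned char vr[32][16];  /* VSX regs 32..63 overlay the VMX regs */

/* Select the save area the way the patched emulate_vsx() does. */
static char *vsx_reg_ptr(unsigned int reg)
{
    if (reg < 32)
        return (char *) &fpr[reg];
    return (char *) &vr[reg - 32];
}

int main(void)
{
    memset(vsx_reg_ptr(40), 0xab, 16);  /* lands in vr[8], not past fpr[] */
    printf("vr[8][0] = 0x%02x\n", (unsigned) vr[8][0]);
    return 0;
}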
43 diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
44 index 5b719a0..7c3b8dc 100644
45 --- a/arch/x86/mm/pageattr.c
46 +++ b/arch/x86/mm/pageattr.c
47 @@ -619,6 +619,13 @@ static int __change_page_attr(struct cpa_data *cpa, int primary)
48 unsigned int level;
49 pte_t *kpte, old_pte;
50
51 + /*
52 + * If we're called with lazy mmu updates enabled, the
53 + * in-memory pte state may be stale. Flush pending updates to
54 + * bring them up to date.
55 + */
56 + arch_flush_lazy_mmu_mode();
57 +
58 repeat:
59 kpte = lookup_address(address, &level);
60 if (!kpte)
61 @@ -836,6 +843,13 @@ static int change_page_attr_set_clr(unsigned long addr, int numpages,
62 else
63 cpa_flush_all(cache);
64
65 + /*
66 + * If we've been called with lazy mmu updates enabled, then
67 + * make sure that everything gets flushed out before we
68 + * return.
69 + */
70 + arch_flush_lazy_mmu_mode();
71 +
72 out:
73 cpa_fill_pool(NULL);
74
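Both hunks above call arch_flush_lazy_mmu_mode() so that queued (lazy) PTE updates are written out before the in-memory page tables are read, and again before returning. A toy model of that batch-then-flush-before-read pattern, assuming nothing about the real paravirt interface beyond what the comments state:

#include <stdio.h>

/* Toy lazy-update queue: writes are batched, reads must flush first. */
static int table[4];
static struct { int idx, val; } pending[8];
static int npending;

static void lazy_set(int idx, int val)
{
    pending[npending].idx = idx;
    pending[npending].val = val;
    npending++;
}

static void flush_lazy(void)   /* plays the role of arch_flush_lazy_mmu_mode() */
{
    for (int i = 0; i < npending; i++)
        table[pending[i].idx] = pending[i].val;
    npending = 0;
}

static int lookup(int idx)
{
    flush_lazy();              /* without this, the read below can be stale */
    return table[idx];
}

int main(void)
{
    lazy_set(2, 42);
    printf("table[2] = %d\n", lookup(2));
    return 0;
}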
75 diff --git a/drivers/ata/pata_via.c b/drivers/ata/pata_via.c
76 index c5be6a1..b6f55e8 100644
77 --- a/drivers/ata/pata_via.c
78 +++ b/drivers/ata/pata_via.c
79 @@ -111,7 +111,8 @@ static const struct via_isa_bridge {
80 { "vt8237s", PCI_DEVICE_ID_VIA_8237S, 0x00, 0x2f, VIA_UDMA_133 | VIA_BAD_AST },
81 { "vt8251", PCI_DEVICE_ID_VIA_8251, 0x00, 0x2f, VIA_UDMA_133 | VIA_BAD_AST },
82 { "cx700", PCI_DEVICE_ID_VIA_CX700, 0x00, 0x2f, VIA_UDMA_133 | VIA_BAD_AST | VIA_SATA_PATA },
83 - { "vt6410", PCI_DEVICE_ID_VIA_6410, 0x00, 0x2f, VIA_UDMA_133 | VIA_BAD_AST | VIA_NO_ENABLES},
84 + { "vt6410", PCI_DEVICE_ID_VIA_6410, 0x00, 0x2f, VIA_UDMA_133 | VIA_BAD_AST | VIA_NO_ENABLES },
85 + { "vt6415", PCI_DEVICE_ID_VIA_6415, 0x00, 0x2f, VIA_UDMA_133 | VIA_BAD_AST | VIA_NO_ENABLES },
86 { "vt8237a", PCI_DEVICE_ID_VIA_8237A, 0x00, 0x2f, VIA_UDMA_133 | VIA_BAD_AST },
87 { "vt8237", PCI_DEVICE_ID_VIA_8237, 0x00, 0x2f, VIA_UDMA_133 | VIA_BAD_AST },
88 { "vt8235", PCI_DEVICE_ID_VIA_8235, 0x00, 0x2f, VIA_UDMA_133 | VIA_BAD_AST },
89 @@ -594,6 +595,7 @@ static int via_reinit_one(struct pci_dev *pdev)
90 #endif
91
92 static const struct pci_device_id via[] = {
93 + { PCI_VDEVICE(VIA, 0x0415), },
94 { PCI_VDEVICE(VIA, 0x0571), },
95 { PCI_VDEVICE(VIA, 0x0581), },
96 { PCI_VDEVICE(VIA, 0x1571), },
97 diff --git a/drivers/ata/sata_nv.c b/drivers/ata/sata_nv.c
98 index 89e3b7f..8b6f9c0 100644
99 --- a/drivers/ata/sata_nv.c
100 +++ b/drivers/ata/sata_nv.c
101 @@ -421,19 +421,21 @@ static struct ata_port_operations nv_generic_ops = {
102 .hardreset = ATA_OP_NULL,
103 };
104
105 -/* OSDL bz3352 reports that nf2/3 controllers can't determine device
106 - * signature reliably. Also, the following thread reports detection
107 - * failure on cold boot with the standard debouncing timing.
108 +/* nf2 is ripe with hardreset related problems.
109 + *
110 + * kernel bz#3352 reports nf2/3 controllers can't determine device
111 + * signature reliably. The following thread reports detection failure
112 + * on cold boot with the standard debouncing timing.
113 *
114 * http://thread.gmane.org/gmane.linux.ide/34098
115 *
116 - * Debounce with hotplug timing and request follow-up SRST.
117 + * And bz#12176 reports that hardreset simply doesn't work on nf2.
118 + * Give up on it and just don't do hardreset.
119 */
120 static struct ata_port_operations nv_nf2_ops = {
121 - .inherits = &nv_common_ops,
122 + .inherits = &nv_generic_ops,
123 .freeze = nv_nf2_freeze,
124 .thaw = nv_nf2_thaw,
125 - .hardreset = nv_noclassify_hardreset,
126 };
127
128 /* For initial probing after boot and hot plugging, hardreset mostly
129 diff --git a/drivers/bluetooth/btsdio.c b/drivers/bluetooth/btsdio.c
130 index 58630cc..f2ada0c 100644
131 --- a/drivers/bluetooth/btsdio.c
132 +++ b/drivers/bluetooth/btsdio.c
133 @@ -91,6 +91,7 @@ static int btsdio_tx_packet(struct btsdio_data *data, struct sk_buff *skb)
134
135 err = sdio_writesb(data->func, REG_TDAT, skb->data, skb->len);
136 if (err < 0) {
137 + skb_pull(skb, 4);
138 sdio_writeb(data->func, 0x01, REG_PC_WRT, NULL);
139 return err;
140 }
141 @@ -152,7 +153,7 @@ static int btsdio_rx_packet(struct btsdio_data *data)
142
143 err = sdio_readsb(data->func, skb->data, REG_RDAT, len - 4);
144 if (err < 0) {
145 - kfree(skb);
146 + kfree_skb(skb);
147 return err;
148 }
149
150 diff --git a/drivers/net/3c505.c b/drivers/net/3c505.c
151 index fdfb2b2..ae8e36c 100644
152 --- a/drivers/net/3c505.c
153 +++ b/drivers/net/3c505.c
154 @@ -493,21 +493,27 @@ static bool receive_pcb(struct net_device *dev, pcb_struct * pcb)
155 }
156 /* read the data */
157 spin_lock_irqsave(&adapter->lock, flags);
158 - i = 0;
159 - do {
160 - j = 0;
161 - while (((stat = get_status(dev->base_addr)) & ACRF) == 0 && j++ < 20000);
162 - pcb->data.raw[i++] = inb_command(dev->base_addr);
163 - if (i > MAX_PCB_DATA)
164 - INVALID_PCB_MSG(i);
165 - } while ((stat & ASF_PCB_MASK) != ASF_PCB_END && j < 20000);
166 + for (i = 0; i < MAX_PCB_DATA; i++) {
167 + for (j = 0; j < 20000; j++) {
168 + stat = get_status(dev->base_addr);
169 + if (stat & ACRF)
170 + break;
171 + }
172 + pcb->data.raw[i] = inb_command(dev->base_addr);
173 + if ((stat & ASF_PCB_MASK) == ASF_PCB_END || j >= 20000)
174 + break;
175 + }
176 spin_unlock_irqrestore(&adapter->lock, flags);
177 + if (i >= MAX_PCB_DATA) {
178 + INVALID_PCB_MSG(i);
179 + return false;
180 + }
181 if (j >= 20000) {
182 TIMEOUT_MSG(__LINE__);
183 return false;
184 }
185 - /* woops, the last "data" byte was really the length! */
186 - total_length = pcb->data.raw[--i];
187 + /* the last "data" byte was really the length! */
188 + total_length = pcb->data.raw[i];
189
190 /* safety check total length vs data length */
191 if (total_length != (pcb->length + 2)) {
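The rewrite above replaces a do/while that could advance i past MAX_PCB_DATA before checking it with a for loop that can never index past the array, and it checks the overflow condition before the timeout so each failure path returns false. The same control structure in a standalone sketch, with get_status()/inb_command() stubbed out (the real driver reads I/O ports):

#include <stdbool.h>
#include <stdio.h>

#define MAX_PCB_DATA 62
#define ACRF         0x01
#define ASF_PCB_MASK 0x06
#define ASF_PCB_END  0x04

/* Stubbed hardware accessors standing in for port I/O. */
static int get_status(void)  { return ACRF | ASF_PCB_END; }
static int inb_command(void) { return 0x2a; }

static bool read_pcb(unsigned char *raw)
{
    int i, j, stat = 0;

    for (i = 0; i < MAX_PCB_DATA; i++) {
        for (j = 0; j < 20000; j++) {          /* bounded busy-wait */
            stat = get_status();
            if (stat & ACRF)
                break;
        }
        raw[i] = inb_command();
        if ((stat & ASF_PCB_MASK) == ASF_PCB_END || j >= 20000)
            break;
    }
    if (i >= MAX_PCB_DATA)                      /* never run off the array */
        return false;
    if (j >= 20000)                             /* timed out */
        return false;
    printf("last byte (length) = %u\n", (unsigned) raw[i]);
    return true;
}

int main(void)
{
    unsigned char raw[MAX_PCB_DATA];
    return read_pcb(raw) ? 0 : 1;
}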
192 diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c
193 index c3edcdc..2d90a3c 100644
194 --- a/drivers/pci/intel-iommu.c
195 +++ b/drivers/pci/intel-iommu.c
196 @@ -72,6 +72,8 @@ static struct deferred_flush_tables *deferred_flush;
197 /* bitmap for indexing intel_iommus */
198 static int g_num_of_iommus;
199
200 +static int rwbf_quirk = 0;
201 +
202 static DEFINE_SPINLOCK(async_umap_flush_lock);
203 static LIST_HEAD(unmaps_to_do);
204
205 @@ -527,7 +529,7 @@ static void iommu_flush_write_buffer(struct intel_iommu *iommu)
206 u32 val;
207 unsigned long flag;
208
209 - if (!cap_rwbf(iommu->cap))
210 + if (!rwbf_quirk && !cap_rwbf(iommu->cap))
211 return;
212 val = iommu->gcmd | DMA_GCMD_WBF;
213
214 @@ -2453,3 +2455,12 @@ int __init intel_iommu_init(void)
215 return 0;
216 }
217
218 +static void __devinit quirk_iommu_rwbf(struct pci_dev *dev)
219 +{
220 + /* Mobile 4 Series Chipset neglects to set RWBF capability,
221 + but needs it */
222 + printk(KERN_INFO "DMAR: Forcing write-buffer flush capability\n");
223 + rwbf_quirk = 1;
224 +}
225 +
226 +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
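The quirk above covers hardware that needs write-buffer flushing but fails to advertise it: the flush now proceeds when either the capability bit or the quirk flag is set. A condensed sketch of the pattern, with a hypothetical CAP_RWBF bit standing in for the real capability register layout:

#include <stdio.h>

#define CAP_RWBF 0x10u     /* hypothetical capability bit for the demo */

static int rwbf_quirk;     /* set by a device-specific fixup           */

static void flush_write_buffer(unsigned int cap)
{
    if (!rwbf_quirk && !(cap & CAP_RWBF))
        return;            /* hardware claims no flush is needed */
    puts("flushing write buffer");
}

int main(void)
{
    unsigned int cap = 0;     /* chipset neglects to set RWBF...     */
    flush_write_buffer(cap);  /* ...so nothing happens               */
    rwbf_quirk = 1;           /* quirk matched the broken chipset ID */
    flush_write_buffer(cap);  /* now the flush is forced             */
    return 0;
}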
227 diff --git a/drivers/scsi/libiscsi.c b/drivers/scsi/libiscsi.c
228 index 299e075..55ac5c3 100644
229 --- a/drivers/scsi/libiscsi.c
230 +++ b/drivers/scsi/libiscsi.c
231 @@ -1844,6 +1844,7 @@ void iscsi_pool_free(struct iscsi_pool *q)
232 kfree(q->pool[i]);
233 if (q->pool)
234 kfree(q->pool);
235 + kfree(q->queue);
236 }
237 EXPORT_SYMBOL_GPL(iscsi_pool_free);
238
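The one-line fix above frees q->queue, the queue buffer allocated when the pool was created, which iscsi_pool_free() had been leaking. A standalone sketch of the symmetric cleanup, using a simplified pool struct rather than the real struct iscsi_pool:

#include <stdlib.h>

/* Simplified pool: an item array plus a separately allocated queue buffer. */
struct pool {
    void **items;
    void  *queue;
    int    max;
};

static void pool_free(struct pool *q)
{
    for (int i = 0; i < q->max; i++)
        free(q->items[i]);
    free(q->items);
    free(q->queue);   /* the counterpart of the line the patch adds */
}

int main(void)
{
    struct pool q = { .max = 4 };

    q.items = calloc(q.max, sizeof(void *));
    for (int i = 0; i < q.max; i++)
        q.items[i] = malloc(32);
    q.queue = malloc(q.max * sizeof(void *));
    pool_free(&q);
    return 0;
}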
239 diff --git a/fs/ext2/super.c b/fs/ext2/super.c
240 index fd88c7b..2ebc0c4 100644
241 --- a/fs/ext2/super.c
242 +++ b/fs/ext2/super.c
243 @@ -1177,9 +1177,12 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
244 es = sbi->s_es;
245 if (((sbi->s_mount_opt & EXT2_MOUNT_XIP) !=
246 (old_mount_opt & EXT2_MOUNT_XIP)) &&
247 - invalidate_inodes(sb))
248 - ext2_warning(sb, __func__, "busy inodes while remounting "\
249 - "xip remain in cache (no functional problem)");
250 + invalidate_inodes(sb)) {
251 + ext2_warning(sb, __func__, "refusing change of xip flag "
252 + "with busy inodes while remounting");
253 + sbi->s_mount_opt &= ~EXT2_MOUNT_XIP;
254 + sbi->s_mount_opt |= old_mount_opt & EXT2_MOUNT_XIP;
255 + }
256 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
257 return 0;
258 if (*flags & MS_RDONLY) {
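Rather than warning and leaving the new xip state active alongside stale cached inodes, the remount now refuses the change by restoring the old EXT2_MOUNT_XIP bit. The restore is the usual clear-then-copy single-flag idiom, shown in isolation (the flag value here is made up for the demo):

#include <stdio.h>

#define MOUNT_XIP 0x4000u  /* hypothetical flag value */

int main(void)
{
    unsigned int old_opts = 0;                /* xip was off            */
    unsigned int opts = MOUNT_XIP | 0x1u;     /* remount switched it on */

    /* busy inodes: refuse the change, leaving every other bit alone */
    opts &= ~MOUNT_XIP;
    opts |= old_opts & MOUNT_XIP;

    printf("opts = %#x, xip %s\n", opts, (opts & MOUNT_XIP) ? "on" : "off");
    return 0;
}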
259 diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
260 index e9fa960..8b7c776 100644
261 --- a/fs/ext4/balloc.c
262 +++ b/fs/ext4/balloc.c
263 @@ -20,6 +20,7 @@
264 #include "ext4.h"
265 #include "ext4_jbd2.h"
266 #include "group.h"
267 +#include "mballoc.h"
268
269 /*
270 * balloc.c contains the blocks allocation and deallocation routines
271 @@ -318,18 +319,41 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
272 block_group, bitmap_blk);
273 return NULL;
274 }
275 - if (bh_uptodate_or_lock(bh))
276 +
277 + if (bitmap_uptodate(bh))
278 return bh;
279
280 + lock_buffer(bh);
281 + if (bitmap_uptodate(bh)) {
282 + unlock_buffer(bh);
283 + return bh;
284 + }
285 spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group));
286 if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
287 ext4_init_block_bitmap(sb, bh, block_group, desc);
288 + set_bitmap_uptodate(bh);
289 set_buffer_uptodate(bh);
290 unlock_buffer(bh);
291 spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
292 return bh;
293 }
294 spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
295 + if (buffer_uptodate(bh)) {
296 + /*
297 + * if the group is not uninit and bh is
298 + * uptodate, the bitmap is also uptodate
299 + */
300 + set_bitmap_uptodate(bh);
301 + unlock_buffer(bh);
302 + return bh;
303 + }
304 + /*
305 + * submit the buffer_head for read. We can
306 + * safely mark the bitmap as uptodate now.
307 + * We do it here so the bitmap uptodate bit
308 + * gets set with the buffer lock held.
309 + */
310 + set_bitmap_uptodate(bh);
311 if (bh_submit_read(bh) < 0) {
312 put_bh(bh);
313 ext4_error(sb, __func__,
314 @@ -837,6 +861,136 @@ error_return:
315 }
316
317 /**
318 + * ext4_add_groupblocks() -- Add given blocks to an existing group
319 + * @handle: handle to this transaction
320 + * @sb: super block
321 + * @block: start physical block to add to the block group
322 + * @count: number of blocks to add
323 + *
324 + * This marks the blocks as free in the bitmap. We ask
325 + * mballoc to reload the buddy after this by setting the group's
326 + * EXT4_GROUP_INFO_NEED_INIT_BIT flag
327 + */
328 +void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
329 + ext4_fsblk_t block, unsigned long count)
330 +{
331 + struct buffer_head *bitmap_bh = NULL;
332 + struct buffer_head *gd_bh;
333 + ext4_group_t block_group;
334 + ext4_grpblk_t bit;
335 + unsigned long i;
336 + struct ext4_group_desc *desc;
337 + struct ext4_super_block *es;
338 + struct ext4_sb_info *sbi;
339 + int err = 0, ret;
340 + ext4_grpblk_t blocks_freed;
341 + struct ext4_group_info *grp;
342 +
343 + sbi = EXT4_SB(sb);
344 + es = sbi->s_es;
345 + ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1);
346 +
347 + ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
348 + grp = ext4_get_group_info(sb, block_group);
349 + /*
350 + * Check to see if we are freeing blocks across a group
351 + * boundary.
352 + */
353 + if (bit + count > EXT4_BLOCKS_PER_GROUP(sb))
354 + goto error_return;
355 +
356 + bitmap_bh = ext4_read_block_bitmap(sb, block_group);
357 + if (!bitmap_bh)
358 + goto error_return;
359 + desc = ext4_get_group_desc(sb, block_group, &gd_bh);
360 + if (!desc)
361 + goto error_return;
362 +
363 + if (in_range(ext4_block_bitmap(sb, desc), block, count) ||
364 + in_range(ext4_inode_bitmap(sb, desc), block, count) ||
365 + in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) ||
366 + in_range(block + count - 1, ext4_inode_table(sb, desc),
367 + sbi->s_itb_per_group)) {
368 + ext4_error(sb, __func__,
369 + "Adding blocks in system zones - "
370 + "Block = %llu, count = %lu",
371 + block, count);
372 + goto error_return;
373 + }
374 +
375 + /*
376 + * We are about to add blocks to the bitmap,
377 + * so we need undo access.
378 + */
379 + BUFFER_TRACE(bitmap_bh, "getting undo access");
380 + err = ext4_journal_get_undo_access(handle, bitmap_bh);
381 + if (err)
382 + goto error_return;
383 +
384 + /*
385 + * We are about to modify some metadata. Call the journal APIs
386 + * to unshare ->b_data if a currently-committing transaction is
387 + * using it
388 + */
389 + BUFFER_TRACE(gd_bh, "get_write_access");
390 + err = ext4_journal_get_write_access(handle, gd_bh);
391 + if (err)
392 + goto error_return;
393 + /*
394 + * make sure we don't allow a parallel init on other groups in the
395 + * same buddy cache
396 + */
397 + down_write(&grp->alloc_sem);
398 + for (i = 0, blocks_freed = 0; i < count; i++) {
399 + BUFFER_TRACE(bitmap_bh, "clear bit");
400 + if (!ext4_clear_bit_atomic(sb_bgl_lock(sbi, block_group),
401 + bit + i, bitmap_bh->b_data)) {
402 + ext4_error(sb, __func__,
403 + "bit already cleared for block %llu",
404 + (ext4_fsblk_t)(block + i));
405 + BUFFER_TRACE(bitmap_bh, "bit already cleared");
406 + } else {
407 + blocks_freed++;
408 + }
409 + }
410 + spin_lock(sb_bgl_lock(sbi, block_group));
411 + le16_add_cpu(&desc->bg_free_blocks_count, blocks_freed);
412 + desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc);
413 + spin_unlock(sb_bgl_lock(sbi, block_group));
414 + percpu_counter_add(&sbi->s_freeblocks_counter, blocks_freed);
415 +
416 + if (sbi->s_log_groups_per_flex) {
417 + ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
418 + spin_lock(sb_bgl_lock(sbi, flex_group));
419 + sbi->s_flex_groups[flex_group].free_blocks += blocks_freed;
420 + spin_unlock(sb_bgl_lock(sbi, flex_group));
421 + }
422 + /*
423 + * request to reload the buddy with the
424 + * new bitmap information
425 + */
426 + set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));
427 + ext4_mb_update_group_info(grp, blocks_freed);
428 + up_write(&grp->alloc_sem);
429 +
430 + /* We dirtied the bitmap block */
431 + BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
432 + err = ext4_journal_dirty_metadata(handle, bitmap_bh);
433 +
434 + /* And the group descriptor block */
435 + BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
436 + ret = ext4_journal_dirty_metadata(handle, gd_bh);
437 + if (!err)
438 + err = ret;
439 + sb->s_dirt = 1;
440 +
441 +error_return:
442 + brelse(bitmap_bh);
443 + ext4_std_error(sb, err);
444 + return;
445 +}
446 +
447 +/**
448 * ext4_free_blocks() -- Free given blocks and update quota
449 * @handle: handle for this transaction
450 * @inode: inode
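The heart of ext4_add_groupblocks() above is the loop that clears one bitmap bit per block and counts a block as freed only if the bit was actually set, so a double-free is reported instead of inflating bg_free_blocks_count. A userspace sketch of that clear-and-count loop over a plain byte array; test_and_clear_bit() here is a local helper, not the kernel's:

#include <stdio.h>
#include <string.h>

/* Local helper: clear bit nr in map, return its previous value. */
static int test_and_clear_bit(unsigned int nr, unsigned char *map)
{
    unsigned char mask = 1u << (nr & 7);
    int old = map[nr >> 3] & mask;

    map[nr >> 3] &= ~mask;
    return old != 0;
}

int main(void)
{
    unsigned char bitmap[8];
    unsigned int bit = 10, count = 5, blocks_freed = 0;

    memset(bitmap, 0xff, sizeof(bitmap));  /* every block in use    */
    test_and_clear_bit(12, bitmap);        /* block 12 already free */

    for (unsigned int i = 0; i < count; i++) {
        if (!test_and_clear_bit(bit + i, bitmap))
            printf("bit already cleared for block %u\n", bit + i);
        else
            blocks_freed++;
    }
    printf("blocks_freed = %u of %u\n", blocks_freed, count);  /* 4 of 5 */
    return 0;
}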
451 diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
452 index 4829dac..85f58af 100644
453 --- a/fs/ext4/ext4.h
454 +++ b/fs/ext4/ext4.h
455 @@ -19,6 +19,7 @@
456 #include <linux/types.h>
457 #include <linux/blkdev.h>
458 #include <linux/magic.h>
459 +#include <linux/jbd2.h>
460 #include "ext4_i.h"
461
462 /*
463 @@ -889,6 +890,9 @@ static inline __le16 ext4_rec_len_to_disk(unsigned len)
464 #define DX_HASH_LEGACY 0
465 #define DX_HASH_HALF_MD4 1
466 #define DX_HASH_TEA 2
467 +#define DX_HASH_LEGACY_UNSIGNED 3
468 +#define DX_HASH_HALF_MD4_UNSIGNED 4
469 +#define DX_HASH_TEA_UNSIGNED 5
470
471 #ifdef __KERNEL__
472
473 @@ -988,9 +992,11 @@ extern ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi,
474 ext4_fsblk_t nblocks);
475 extern void ext4_free_blocks (handle_t *handle, struct inode *inode,
476 ext4_fsblk_t block, unsigned long count, int metadata);
477 -extern void ext4_free_blocks_sb (handle_t *handle, struct super_block *sb,
478 - ext4_fsblk_t block, unsigned long count,
479 +extern void ext4_free_blocks_sb(handle_t *handle, struct super_block *sb,
480 + ext4_fsblk_t block, unsigned long count,
481 unsigned long *pdquot_freed_blocks);
482 +extern void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
483 + ext4_fsblk_t block, unsigned long count);
484 extern ext4_fsblk_t ext4_count_free_blocks (struct super_block *);
485 extern void ext4_check_blocks_bitmap (struct super_block *);
486 extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
487 @@ -1038,12 +1044,13 @@ extern int __init init_ext4_mballoc(void);
488 extern void exit_ext4_mballoc(void);
489 extern void ext4_mb_free_blocks(handle_t *, struct inode *,
490 unsigned long, unsigned long, int, unsigned long *);
491 -extern int ext4_mb_add_more_groupinfo(struct super_block *sb,
492 +extern int ext4_mb_add_groupinfo(struct super_block *sb,
493 ext4_group_t i, struct ext4_group_desc *desc);
494 extern void ext4_mb_update_group_info(struct ext4_group_info *grp,
495 ext4_grpblk_t add);
496 -
497 -
498 +extern int ext4_mb_get_buddy_cache_lock(struct super_block *, ext4_group_t);
499 +extern void ext4_mb_put_buddy_cache_lock(struct super_block *,
500 + ext4_group_t, int);
501 /* inode.c */
502 int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode,
503 struct buffer_head *bh, ext4_fsblk_t blocknr);
504 @@ -1167,8 +1174,11 @@ static inline void ext4_r_blocks_count_set(struct ext4_super_block *es,
505
506 static inline loff_t ext4_isize(struct ext4_inode *raw_inode)
507 {
508 - return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) |
509 - le32_to_cpu(raw_inode->i_size_lo);
510 + if (S_ISREG(le16_to_cpu(raw_inode->i_mode)))
511 + return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) |
512 + le32_to_cpu(raw_inode->i_size_lo);
513 + else
514 + return (loff_t) le32_to_cpu(raw_inode->i_size_lo);
515 }
516
517 static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size)
518 @@ -1244,6 +1254,23 @@ extern int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode,
519 sector_t block, unsigned long max_blocks,
520 struct buffer_head *bh, int create,
521 int extend_disksize, int flag);
522 +/*
523 + * Add new method to test whether block and inode bitmaps are properly
524 + * initialized. With uninit_bg, reading the block from disk is not enough
525 + * to mark the bitmap uptodate. We need to also zero-out the bitmap
526 + */
527 +#define BH_BITMAP_UPTODATE BH_JBDPrivateStart
528 +
529 +static inline int bitmap_uptodate(struct buffer_head *bh)
530 +{
531 + return (buffer_uptodate(bh) &&
532 + test_bit(BH_BITMAP_UPTODATE, &(bh)->b_state));
533 +}
534 +static inline void set_bitmap_uptodate(struct buffer_head *bh)
535 +{
536 + set_bit(BH_BITMAP_UPTODATE, &(bh)->b_state);
537 +}
538 +
539 #endif /* __KERNEL__ */
540
541 #endif /* _EXT4_H */
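BH_BITMAP_UPTODATE layers a filesystem-private flag on top of the generic buffer state: a bitmap buffer is trustworthy only when buffer_uptodate() and the private bit are both set, because with uninit_bg a freshly read block may still need its bitmap initialized. A self-contained model of the two-flag test (a toy struct, not the kernel's buffer_head):

#include <stdbool.h>
#include <stdio.h>

enum { BH_UPTODATE, BH_BITMAP_UPTODATE };  /* bit numbers in b_state */

struct buffer_head { unsigned long b_state; };

static bool buffer_uptodate(const struct buffer_head *bh)
{
    return bh->b_state & (1UL << BH_UPTODATE);
}

static bool bitmap_uptodate(const struct buffer_head *bh)
{
    /* valid only when the generic and the bitmap-specific bit are set */
    return buffer_uptodate(bh) &&
           (bh->b_state & (1UL << BH_BITMAP_UPTODATE));
}

int main(void)
{
    struct buffer_head bh = { 1UL << BH_UPTODATE };   /* just read from disk */
    printf("%d\n", bitmap_uptodate(&bh));             /* 0: not initialized  */
    bh.b_state |= 1UL << BH_BITMAP_UPTODATE;          /* after zeroing/init  */
    printf("%d\n", bitmap_uptodate(&bh));             /* 1 */
    return 0;
}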
542 diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h
543 index 6300226..f20df8a 100644
544 --- a/fs/ext4/ext4_sb.h
545 +++ b/fs/ext4/ext4_sb.h
546 @@ -56,6 +56,7 @@ struct ext4_sb_info {
547 u32 s_next_generation;
548 u32 s_hash_seed[4];
549 int s_def_hash_version;
550 + int s_hash_unsigned; /* 3 if hash should be unsigned, 0 if not */
551 struct percpu_counter s_freeblocks_counter;
552 struct percpu_counter s_freeinodes_counter;
553 struct percpu_counter s_dirs_counter;
554 @@ -102,7 +103,8 @@ struct ext4_sb_info {
555 struct list_head s_committed_transaction;
556 spinlock_t s_md_lock;
557 tid_t s_last_transaction;
558 - unsigned short *s_mb_offsets, *s_mb_maxs;
559 + unsigned short *s_mb_offsets;
560 + unsigned int *s_mb_maxs;
561
562 /* tunables */
563 unsigned long s_stripe;
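s_mb_maxs[] becomes unsigned int because its first slot holds the number of bits in one block; assuming, as in ext4_mb_init(), that this is s_blocksize << 3, a 64KB block gives 524288, which an unsigned short silently truncates:

#include <stdio.h>

int main(void)
{
    unsigned int bits = 65536u << 3;            /* bits per 64KB block */
    unsigned short old = (unsigned short) bits; /* old s_mb_maxs type  */

    printf("%u -> %u\n", bits, (unsigned) old); /* 524288 -> 0 */
    return 0;
}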
564 diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c
565 index 1d6329d..bd7d14d 100644
566 --- a/fs/ext4/hash.c
567 +++ b/fs/ext4/hash.c
568 @@ -35,23 +35,71 @@ static void TEA_transform(__u32 buf[4], __u32 const in[])
569
570
571 /* The old legacy hash */
572 -static __u32 dx_hack_hash (const char *name, int len)
573 +static __u32 dx_hack_hash_unsigned(const char *name, int len)
574 {
575 - __u32 hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
576 + __u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
577 + const unsigned char *ucp = (const unsigned char *) name;
578 +
579 + while (len--) {
580 + hash = hash1 + (hash0 ^ (((int) *ucp++) * 7152373));
581 +
582 + if (hash & 0x80000000)
583 + hash -= 0x7fffffff;
584 + hash1 = hash0;
585 + hash0 = hash;
586 + }
587 + return hash0 << 1;
588 +}
589 +
590 +static __u32 dx_hack_hash_signed(const char *name, int len)
591 +{
592 + __u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
593 + const signed char *scp = (const signed char *) name;
594 +
595 while (len--) {
596 - __u32 hash = hash1 + (hash0 ^ (*name++ * 7152373));
597 + hash = hash1 + (hash0 ^ (((int) *scp++) * 7152373));
598
599 - if (hash & 0x80000000) hash -= 0x7fffffff;
600 + if (hash & 0x80000000)
601 + hash -= 0x7fffffff;
602 hash1 = hash0;
603 hash0 = hash;
604 }
605 - return (hash0 << 1);
606 + return hash0 << 1;
607 }
608
609 -static void str2hashbuf(const char *msg, int len, __u32 *buf, int num)
610 +static void str2hashbuf_signed(const char *msg, int len, __u32 *buf, int num)
611 {
612 __u32 pad, val;
613 int i;
614 + const signed char *scp = (const signed char *) msg;
615 +
616 + pad = (__u32)len | ((__u32)len << 8);
617 + pad |= pad << 16;
618 +
619 + val = pad;
620 + if (len > num*4)
621 + len = num * 4;
622 + for (i = 0; i < len; i++) {
623 + if ((i % 4) == 0)
624 + val = pad;
625 + val = ((int) scp[i]) + (val << 8);
626 + if ((i % 4) == 3) {
627 + *buf++ = val;
628 + val = pad;
629 + num--;
630 + }
631 + }
632 + if (--num >= 0)
633 + *buf++ = val;
634 + while (--num >= 0)
635 + *buf++ = pad;
636 +}
637 +
638 +static void str2hashbuf_unsigned(const char *msg, int len, __u32 *buf, int num)
639 +{
640 + __u32 pad, val;
641 + int i;
642 + const unsigned char *ucp = (const unsigned char *) msg;
643
644 pad = (__u32)len | ((__u32)len << 8);
645 pad |= pad << 16;
646 @@ -62,7 +110,7 @@ static void str2hashbuf(const char *msg, int len, __u32 *buf, int num)
647 for (i=0; i < len; i++) {
648 if ((i % 4) == 0)
649 val = pad;
650 - val = msg[i] + (val << 8);
651 + val = ((int) ucp[i]) + (val << 8);
652 if ((i % 4) == 3) {
653 *buf++ = val;
654 val = pad;
655 @@ -95,6 +143,8 @@ int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
656 const char *p;
657 int i;
658 __u32 in[8], buf[4];
659 + void (*str2hashbuf)(const char *, int, __u32 *, int) =
660 + str2hashbuf_signed;
661
662 /* Initialize the default seed for the hash checksum functions */
663 buf[0] = 0x67452301;
664 @@ -113,13 +163,18 @@ int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
665 }
666
667 switch (hinfo->hash_version) {
668 + case DX_HASH_LEGACY_UNSIGNED:
669 + hash = dx_hack_hash_unsigned(name, len);
670 + break;
671 case DX_HASH_LEGACY:
672 - hash = dx_hack_hash(name, len);
673 + hash = dx_hack_hash_signed(name, len);
674 break;
675 + case DX_HASH_HALF_MD4_UNSIGNED:
676 + str2hashbuf = str2hashbuf_unsigned;
677 case DX_HASH_HALF_MD4:
678 p = name;
679 while (len > 0) {
680 - str2hashbuf(p, len, in, 8);
681 + (*str2hashbuf)(p, len, in, 8);
682 half_md4_transform(buf, in);
683 len -= 32;
684 p += 32;
685 @@ -127,10 +182,12 @@ int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
686 minor_hash = buf[2];
687 hash = buf[1];
688 break;
689 + case DX_HASH_TEA_UNSIGNED:
690 + str2hashbuf = str2hashbuf_unsigned;
691 case DX_HASH_TEA:
692 p = name;
693 while (len > 0) {
694 - str2hashbuf(p, len, in, 4);
695 + (*str2hashbuf)(p, len, in, 4);
696 TEA_transform(buf, in);
697 len -= 16;
698 p += 16;
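The signed/unsigned split above exists because the old code hashed through plain char, whose signedness differs between architectures (signed on x86, unsigned on PowerPC and ARM), so the same name could land in different hash buckets depending on where the directory was created. Both legacy variants from the hunk, lifted into a standalone program; any name containing bytes >= 0x80 makes them diverge:

#include <stdio.h>

typedef unsigned int u32;

static u32 dx_hack_hash_signed(const char *name, int len)
{
    u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
    const signed char *scp = (const signed char *) name;

    while (len--) {
        hash = hash1 + (hash0 ^ (((int) *scp++) * 7152373));
        if (hash & 0x80000000)
            hash -= 0x7fffffff;
        hash1 = hash0;
        hash0 = hash;
    }
    return hash0 << 1;
}

static u32 dx_hack_hash_unsigned(const char *name, int len)
{
    u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
    const unsigned char *ucp = (const unsigned char *) name;

    while (len--) {
        hash = hash1 + (hash0 ^ (((int) *ucp++) * 7152373));
        if (hash & 0x80000000)
            hash -= 0x7fffffff;
        hash1 = hash0;
        hash0 = hash;
    }
    return hash0 << 1;
}

int main(void)
{
    const char name[] = "caf\xc3\xa9";   /* bytes >= 0x80 trigger the split */

    printf("signed   %08x\n", dx_hack_hash_signed(name, 5));
    printf("unsigned %08x\n", dx_hack_hash_unsigned(name, 5));
    return 0;
}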
699 diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
700 index 9805924..b994854 100644
701 --- a/fs/ext4/ialloc.c
702 +++ b/fs/ext4/ialloc.c
703 @@ -84,7 +84,7 @@ unsigned ext4_init_inode_bitmap(struct super_block *sb, struct buffer_head *bh,
704 }
705
706 memset(bh->b_data, 0, (EXT4_INODES_PER_GROUP(sb) + 7) / 8);
707 - mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), EXT4_BLOCKS_PER_GROUP(sb),
708 + mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8,
709 bh->b_data);
710
711 return EXT4_INODES_PER_GROUP(sb);
712 @@ -115,18 +115,40 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
713 block_group, bitmap_blk);
714 return NULL;
715 }
716 - if (bh_uptodate_or_lock(bh))
717 + if (bitmap_uptodate(bh))
718 return bh;
719
720 + lock_buffer(bh);
721 + if (bitmap_uptodate(bh)) {
722 + unlock_buffer(bh);
723 + return bh;
724 + }
725 spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group));
726 if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
727 ext4_init_inode_bitmap(sb, bh, block_group, desc);
728 + set_bitmap_uptodate(bh);
729 set_buffer_uptodate(bh);
730 unlock_buffer(bh);
731 spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
732 return bh;
733 }
734 spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
735 + if (buffer_uptodate(bh)) {
736 + /*
737 + * if the group is not uninit and bh is
738 + * uptodate, the bitmap is also uptodate
739 + */
740 + set_bitmap_uptodate(bh);
741 + unlock_buffer(bh);
742 + return bh;
743 + }
744 + /*
745 + * submit the buffer_head for read. We can
746 + * safely mark the bitmap as uptodate now.
747 + * We do it here so the bitmap uptodate bit
748 + * gets set with the buffer lock held.
749 + */
750 + set_bitmap_uptodate(bh);
751 if (bh_submit_read(bh) < 0) {
752 put_bh(bh);
753 ext4_error(sb, __func__,
754 @@ -567,6 +589,77 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
755 }
756
757 /*
758 + * claim the inode from the inode bitmap. If the group
759 + * is uninit we need to take the group's sb_bgl_lock
760 + * and clear the uninit flag. The inode bitmap update
761 + * and group desc uninit flag clear should be done
762 + * while holding sb_bgl_lock so that ext4_read_inode_bitmap
763 + * doesn't race with ext4_claim_inode
764 + */
765 +static int ext4_claim_inode(struct super_block *sb,
766 + struct buffer_head *inode_bitmap_bh,
767 + unsigned long ino, ext4_group_t group, int mode)
768 +{
769 + int free = 0, retval = 0;
770 + struct ext4_sb_info *sbi = EXT4_SB(sb);
771 + struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL);
772 +
773 + spin_lock(sb_bgl_lock(sbi, group));
774 + if (ext4_set_bit(ino, inode_bitmap_bh->b_data)) {
775 + /* not a free inode */
776 + retval = 1;
777 + goto err_ret;
778 + }
779 + ino++;
780 + if ((group == 0 && ino < EXT4_FIRST_INO(sb)) ||
781 + ino > EXT4_INODES_PER_GROUP(sb)) {
782 + spin_unlock(sb_bgl_lock(sbi, group));
783 + ext4_error(sb, __func__,
784 + "reserved inode or inode > inodes count - "
785 + "block_group = %lu, inode=%lu", group,
786 + ino + group * EXT4_INODES_PER_GROUP(sb));
787 + return 1;
788 + }
789 + /* If we didn't allocate from within the initialized part of the inode
790 + * table then we need to initialize up to this inode. */
791 + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) {
792 +
793 + if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
794 + gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT);
795 + /* When marking the block group with
796 + * ~EXT4_BG_INODE_UNINIT we don't want to depend
797 + * on the value of bg_itable_unused even though
798 + * mke2fs could have initialized it for us.
799 + * Instead we calculate the value below
800 + */
801 +
802 + free = 0;
803 + } else {
804 + free = EXT4_INODES_PER_GROUP(sb) -
805 + le16_to_cpu(gdp->bg_itable_unused);
806 + }
807 +
808 + /*
809 + * Check the relative inode number against the last used
810 + * relative inode number in this group. If it is greater,
811 + * we need to update the bg_itable_unused count
812 + *
813 + */
814 + if (ino > free)
815 + gdp->bg_itable_unused =
816 + cpu_to_le16(EXT4_INODES_PER_GROUP(sb) - ino);
817 + }
818 + le16_add_cpu(&gdp->bg_free_inodes_count, -1);
819 + if (S_ISDIR(mode)) {
820 + le16_add_cpu(&gdp->bg_used_dirs_count, 1);
821 + }
822 + gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
823 +err_ret:
824 + spin_unlock(sb_bgl_lock(sbi, group));
825 + return retval;
826 +}
827 +
828 +/*
829 * There are two policies for allocating an inode. If the new inode is
830 * a directory, then a forward search is made for a block group with both
831 * free space and a low directory-to-inode ratio; if that fails, then of
832 @@ -649,8 +742,12 @@ repeat_in_this_group:
833 if (err)
834 goto fail;
835
836 - if (!ext4_set_bit_atomic(sb_bgl_lock(sbi, group),
837 - ino, bitmap_bh->b_data)) {
838 + BUFFER_TRACE(bh2, "get_write_access");
839 + err = ext4_journal_get_write_access(handle, bh2);
840 + if (err)
841 + goto fail;
842 + if (!ext4_claim_inode(sb, bitmap_bh,
843 + ino, group, mode)) {
844 /* we won it */
845 BUFFER_TRACE(bitmap_bh,
846 "call ext4_journal_dirty_metadata");
847 @@ -658,10 +755,13 @@ repeat_in_this_group:
848 bitmap_bh);
849 if (err)
850 goto fail;
851 + /* zero bit is inode number 1 */
852 + ino++;
853 goto got;
854 }
855 /* we lost it */
856 jbd2_journal_release_buffer(handle, bitmap_bh);
857 + jbd2_journal_release_buffer(handle, bh2);
858
859 if (++ino < EXT4_INODES_PER_GROUP(sb))
860 goto repeat_in_this_group;
861 @@ -681,21 +781,6 @@ repeat_in_this_group:
862 goto out;
863
864 got:
865 - ino++;
866 - if ((group == 0 && ino < EXT4_FIRST_INO(sb)) ||
867 - ino > EXT4_INODES_PER_GROUP(sb)) {
868 - ext4_error(sb, __func__,
869 - "reserved inode or inode > inodes count - "
870 - "block_group = %lu, inode=%lu", group,
871 - ino + group * EXT4_INODES_PER_GROUP(sb));
872 - err = -EIO;
873 - goto fail;
874 - }
875 -
876 - BUFFER_TRACE(bh2, "get_write_access");
877 - err = ext4_journal_get_write_access(handle, bh2);
878 - if (err) goto fail;
879 -
880 /* We may have to initialize the block bitmap if it isn't already */
881 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM) &&
882 gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
883 @@ -730,47 +815,10 @@ got:
884 if (err)
885 goto fail;
886 }
887 -
888 - spin_lock(sb_bgl_lock(sbi, group));
889 - /* If we didn't allocate from within the initialized part of the inode
890 - * table then we need to initialize up to this inode. */
891 - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) {
892 - if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
893 - gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT);
894 -
895 - /* When marking the block group with
896 - * ~EXT4_BG_INODE_UNINIT we don't want to depend
897 - * on the value of bg_itable_unused even though
898 - * mke2fs could have initialized the same for us.
899 - * Instead we calculated the value below
900 - */
901 -
902 - free = 0;
903 - } else {
904 - free = EXT4_INODES_PER_GROUP(sb) -
905 - le16_to_cpu(gdp->bg_itable_unused);
906 - }
907 -
908 - /*
909 - * Check the relative inode number against the last used
910 - * relative inode number in this group. if it is greater
911 - * we need to update the bg_itable_unused count
912 - *
913 - */
914 - if (ino > free)
915 - gdp->bg_itable_unused =
916 - cpu_to_le16(EXT4_INODES_PER_GROUP(sb) - ino);
917 - }
918 -
919 - le16_add_cpu(&gdp->bg_free_inodes_count, -1);
920 - if (S_ISDIR(mode)) {
921 - le16_add_cpu(&gdp->bg_used_dirs_count, 1);
922 - }
923 - gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
924 - spin_unlock(sb_bgl_lock(sbi, group));
925 - BUFFER_TRACE(bh2, "call ext4_journal_dirty_metadata");
926 + BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata");
927 err = ext4_journal_dirty_metadata(handle, bh2);
928 - if (err) goto fail;
929 + if (err)
930 + goto fail;
931
932 percpu_counter_dec(&sbi->s_freeinodes_counter);
933 if (S_ISDIR(mode))
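ext4_claim_inode() sets the bitmap bit and updates the group descriptor counts inside one sb_bgl_lock critical section, instead of taking the lock twice as the old got: path did, so ext4_read_inode_bitmap() can never observe the bit set while bg_free_inodes_count is still stale. A toy model of claim-under-lock; the lock itself is elided to comments so the sketch stays runnable:

#include <stdio.h>

/* Toy group state: bitmap word plus descriptor count, guarded together. */
struct group {
    unsigned long bitmap;   /* one bit per inode           */
    int free_inodes;        /* descriptor count, same lock */
};

/* Returns 0 if we won the bit, 1 if it was already taken. */
static int claim_inode(struct group *g, unsigned int ino)
{
    /* spin_lock(sb_bgl_lock(...)) would go here */
    if (g->bitmap & (1UL << ino))
        return 1;                /* not a free inode */
    g->bitmap |= 1UL << ino;
    g->free_inodes--;            /* updated under the same lock */
    /* spin_unlock(...) */
    return 0;
}

int main(void)
{
    struct group g = { 0, 32 };

    printf("first claim:  %d\n", claim_inode(&g, 11));  /* 0: won it  */
    printf("second claim: %d\n", claim_inode(&g, 11));  /* 1: lost it */
    printf("free inodes:  %d\n", g.free_inodes);        /* 31 */
    return 0;
}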
934 diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
935 index d77f674..6e7f085 100644
936 --- a/fs/ext4/inode.c
937 +++ b/fs/ext4/inode.c
938 @@ -351,9 +351,9 @@ static int ext4_block_to_path(struct inode *inode,
939 final = ptrs;
940 } else {
941 ext4_warning(inode->i_sb, "ext4_block_to_path",
942 - "block %lu > max",
943 + "block %lu > max in inode %lu",
944 i_block + direct_blocks +
945 - indirect_blocks + double_blocks);
946 + indirect_blocks + double_blocks, inode->i_ino);
947 }
948 if (boundary)
949 *boundary = final - 1 - (i_block & (ptrs - 1));
950 @@ -1648,18 +1648,25 @@ struct mpage_da_data {
951 */
952 static int mpage_da_submit_io(struct mpage_da_data *mpd)
953 {
954 - struct address_space *mapping = mpd->inode->i_mapping;
955 - int ret = 0, err, nr_pages, i;
956 - unsigned long index, end;
957 + long pages_skipped;
958 struct pagevec pvec;
959 + unsigned long index, end;
960 + int ret = 0, err, nr_pages, i;
961 + struct inode *inode = mpd->inode;
962 + struct address_space *mapping = inode->i_mapping;
963
964 BUG_ON(mpd->next_page <= mpd->first_page);
965 - pagevec_init(&pvec, 0);
966 + /*
967 + * We need to start from the first_page to the next_page - 1
968 + * to make sure we also write the mapped dirty buffer_heads.
969 + * If we look at mpd->lbh.b_blocknr we would only be looking
970 + * at the currently mapped buffer_heads.
971 + */
972 index = mpd->first_page;
973 end = mpd->next_page - 1;
974
975 + pagevec_init(&pvec, 0);
976 while (index <= end) {
977 - /* XXX: optimize tail */
978 nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
979 if (nr_pages == 0)
980 break;
981 @@ -1671,6 +1678,10 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
982 break;
983 index++;
984
985 + BUG_ON(!PageLocked(page));
986 + BUG_ON(PageWriteback(page));
987 +
988 + pages_skipped = mpd->wbc->pages_skipped;
989 err = mapping->a_ops->writepage(page, mpd->wbc);
990 if (!err)
991 mpd->pages_written++;
992 @@ -1991,11 +2002,29 @@ static int __mpage_da_writepage(struct page *page,
993 bh = head;
994 do {
995 BUG_ON(buffer_locked(bh));
996 + /*
997 + * We need to try to allocate
998 + * unmapped blocks in the same page.
999 + * Otherwise we won't make progress
1000 + * with the page in ext4_da_writepage
1001 + */
1002 if (buffer_dirty(bh) &&
1003 (!buffer_mapped(bh) || buffer_delay(bh))) {
1004 mpage_add_bh_to_extent(mpd, logical, bh);
1005 if (mpd->io_done)
1006 return MPAGE_DA_EXTENT_TAIL;
1007 + } else if (buffer_dirty(bh) && (buffer_mapped(bh))) {
1008 + /*
1009 + * mapped dirty buffer. We need to update
1010 + * the b_state because we look at
1011 + * b_state in mpage_da_map_blocks. We don't
1012 + * update b_size because if we find an
1013 + * unmapped buffer_head later we need to
1014 + * use the b_state flag of that buffer_head.
1015 + */
1016 + if (mpd->lbh.b_size == 0)
1017 + mpd->lbh.b_state =
1018 + bh->b_state & BH_FLAGS;
1019 }
1020 logical++;
1021 } while ((bh = bh->b_this_page) != head);
1022 @@ -2298,6 +2327,20 @@ static int ext4_da_writepages(struct address_space *mapping,
1023 */
1024 if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
1025 return 0;
1026 +
1027 + /*
1028 + * If the filesystem has aborted, it is read-only, so return
1029 + * right away instead of dumping stack traces later on that
1030 + * will obscure the real source of the problem. We test
1031 + * EXT4_MOUNT_ABORT instead of sb->s_flag's MS_RDONLY because
1032 + * the latter could be true if the filesystem is mounted
1033 + * read-only, and in that case, ext4_da_writepages should
1034 + * *never* be called, so if that ever happens, we would want
1035 + * the stack trace.
1036 + */
1037 + if (unlikely(sbi->s_mount_opt & EXT4_MOUNT_ABORT))
1038 + return -EROFS;
1039 +
1040 /*
1041 * Make sure nr_to_write is >= sbi->s_mb_stream_request
1042 * This make sure small files blocks are allocated in
1043 @@ -2336,7 +2379,7 @@ restart_loop:
1044 handle = ext4_journal_start(inode, needed_blocks);
1045 if (IS_ERR(handle)) {
1046 ret = PTR_ERR(handle);
1047 - printk(KERN_EMERG "%s: jbd2_start: "
1048 + printk(KERN_CRIT "%s: jbd2_start: "
1049 "%ld pages, ino %lu; err %d\n", __func__,
1050 wbc->nr_to_write, inode->i_ino, ret);
1051 dump_stack();
1052 diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
1053 index ba86b56..dbf6c0e 100644
1054 --- a/fs/ext4/mballoc.c
1055 +++ b/fs/ext4/mballoc.c
1056 @@ -100,7 +100,7 @@
1057 * inode as:
1058 *
1059 * { page }
1060 - * [ group 0 buddy][ group 0 bitmap] [group 1][ group 1]...
1061 + * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]...
1062 *
1063 *
1064 * one block each for bitmap and buddy information. So for each group we
1065 @@ -330,6 +330,18 @@
1066 * object
1067 *
1068 */
1069 +static struct kmem_cache *ext4_pspace_cachep;
1070 +static struct kmem_cache *ext4_ac_cachep;
1071 +static struct kmem_cache *ext4_free_ext_cachep;
1072 +static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
1073 + ext4_group_t group);
1074 +static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
1075 + ext4_group_t group);
1076 +static int ext4_mb_init_per_dev_proc(struct super_block *sb);
1077 +static int ext4_mb_destroy_per_dev_proc(struct super_block *sb);
1078 +static void ext4_mb_free_committed_blocks(struct super_block *);
1079 +static void ext4_mb_poll_new_transaction(struct super_block *sb,
1080 + handle_t *handle);
1081
1082 static inline void *mb_correct_addr_and_bit(int *bit, void *addr)
1083 {
1084 @@ -718,7 +730,7 @@ static void ext4_mb_generate_buddy(struct super_block *sb,
1085 * stored in the inode as
1086 *
1087 * { page }
1088 - * [ group 0 buddy][ group 0 bitmap] [group 1][ group 1]...
1089 + * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]...
1090 *
1091 *
1092 * one block each for bitmap and buddy information.
1093 @@ -784,20 +796,42 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
1094 if (bh[i] == NULL)
1095 goto out;
1096
1097 - if (bh_uptodate_or_lock(bh[i]))
1098 + if (bitmap_uptodate(bh[i]))
1099 continue;
1100
1101 + lock_buffer(bh[i]);
1102 + if (bitmap_uptodate(bh[i])) {
1103 + unlock_buffer(bh[i]);
1104 + continue;
1105 + }
1106 spin_lock(sb_bgl_lock(EXT4_SB(sb), first_group + i));
1107 if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
1108 ext4_init_block_bitmap(sb, bh[i],
1109 first_group + i, desc);
1110 + set_bitmap_uptodate(bh[i]);
1111 set_buffer_uptodate(bh[i]);
1112 unlock_buffer(bh[i]);
1113 spin_unlock(sb_bgl_lock(EXT4_SB(sb), first_group + i));
1114 continue;
1115 }
1116 spin_unlock(sb_bgl_lock(EXT4_SB(sb), first_group + i));
1117 + if (buffer_uptodate(bh[i])) {
1118 + /*
1119 + * if the group is not uninit and bh is
1120 + * uptodate, the bitmap is also uptodate
1121 + */
1122 + set_bitmap_uptodate(bh[i]);
1123 + unlock_buffer(bh[i]);
1124 + continue;
1125 + }
1126 get_bh(bh[i]);
1127 + /*
1128 + * submit the buffer_head for read. We can
1129 + * safely mark the bitmap as uptodate now.
1130 + * We do it here so the bitmap uptodate bit
1131 + * gets set with the buffer lock held.
1132 + */
1133 + set_bitmap_uptodate(bh[i]);
1134 bh[i]->b_end_io = end_buffer_read_sync;
1135 submit_bh(READ, bh[i]);
1136 mb_debug("read bitmap for group %lu\n", first_group + i);
1137 @@ -814,6 +848,8 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
1138
1139 err = 0;
1140 first_block = page->index * blocks_per_page;
1141 + /* init the page */
1142 + memset(page_address(page), 0xff, PAGE_CACHE_SIZE);
1143 for (i = 0; i < blocks_per_page; i++) {
1144 int group;
1145 struct ext4_group_info *grinfo;
1146 @@ -840,7 +876,6 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
1147 BUG_ON(incore == NULL);
1148 mb_debug("put buddy for group %u in page %lu/%x\n",
1149 group, page->index, i * blocksize);
1150 - memset(data, 0xff, blocksize);
1151 grinfo = ext4_get_group_info(sb, group);
1152 grinfo->bb_fragments = 0;
1153 memset(grinfo->bb_counters, 0,
1154 @@ -848,7 +883,9 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
1155 /*
1156 * incore got set to the group block bitmap below
1157 */
1158 + ext4_lock_group(sb, group);
1159 ext4_mb_generate_buddy(sb, data, incore, group);
1160 + ext4_unlock_group(sb, group);
1161 incore = NULL;
1162 } else {
1163 /* this is block of bitmap */
1164 @@ -862,6 +899,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
1165
1166 /* mark all preallocated blks used in in-core bitmap */
1167 ext4_mb_generate_from_pa(sb, data, group);
1168 + ext4_mb_generate_from_freelist(sb, data, group);
1169 ext4_unlock_group(sb, group);
1170
1171 /* set incore so that the buddy information can be
1172 @@ -886,18 +924,20 @@ static noinline_for_stack int
1173 ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
1174 struct ext4_buddy *e4b)
1175 {
1176 - struct ext4_sb_info *sbi = EXT4_SB(sb);
1177 - struct inode *inode = sbi->s_buddy_cache;
1178 int blocks_per_page;
1179 int block;
1180 int pnum;
1181 int poff;
1182 struct page *page;
1183 int ret;
1184 + struct ext4_group_info *grp;
1185 + struct ext4_sb_info *sbi = EXT4_SB(sb);
1186 + struct inode *inode = sbi->s_buddy_cache;
1187
1188 mb_debug("load group %lu\n", group);
1189
1190 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
1191 + grp = ext4_get_group_info(sb, group);
1192
1193 e4b->bd_blkbits = sb->s_blocksize_bits;
1194 e4b->bd_info = ext4_get_group_info(sb, group);
1195 @@ -905,6 +945,15 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
1196 e4b->bd_group = group;
1197 e4b->bd_buddy_page = NULL;
1198 e4b->bd_bitmap_page = NULL;
1199 + e4b->alloc_semp = &grp->alloc_sem;
1200 +
1201 + /* Take the read lock on the group alloc
1202 + * sem. This makes sure a parallel
1203 + * ext4_mb_init_group happening on other
1204 + * groups mapped by the page is blocked
1205 + * till we are done with allocation
1206 + */
1207 + down_read(e4b->alloc_semp);
1208
1209 /*
1210 * the buddy cache inode stores the block bitmap
1211 @@ -920,6 +969,14 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
1212 page = find_get_page(inode->i_mapping, pnum);
1213 if (page == NULL || !PageUptodate(page)) {
1214 if (page)
1215 + /*
1216 + * drop the page reference and try
1217 + * to get the page with lock. If we
1218 + * are not uptodate that implies
1219 + * somebody just created the page but
1220 + * is yet to initialize the same. So
1221 + * wait for it to initialize.
1222 + */
1223 page_cache_release(page);
1224 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
1225 if (page) {
1226 @@ -985,6 +1042,9 @@ err:
1227 page_cache_release(e4b->bd_buddy_page);
1228 e4b->bd_buddy = NULL;
1229 e4b->bd_bitmap = NULL;
1230 +
1231 + /* Done with the buddy cache */
1232 + up_read(e4b->alloc_semp);
1233 return ret;
1234 }
1235
1236 @@ -994,6 +1054,9 @@ static void ext4_mb_release_desc(struct ext4_buddy *e4b)
1237 page_cache_release(e4b->bd_bitmap_page);
1238 if (e4b->bd_buddy_page)
1239 page_cache_release(e4b->bd_buddy_page);
1240 + /* Done with the buddy cache */
1241 + if (e4b->alloc_semp)
1242 + up_read(e4b->alloc_semp);
1243 }
1244
1245
1246 @@ -1031,7 +1094,10 @@ static void mb_clear_bits(spinlock_t *lock, void *bm, int cur, int len)
1247 cur += 32;
1248 continue;
1249 }
1250 - mb_clear_bit_atomic(lock, cur, bm);
1251 + if (lock)
1252 + mb_clear_bit_atomic(lock, cur, bm);
1253 + else
1254 + mb_clear_bit(cur, bm);
1255 cur++;
1256 }
1257 }
1258 @@ -1049,7 +1115,10 @@ static void mb_set_bits(spinlock_t *lock, void *bm, int cur, int len)
1259 cur += 32;
1260 continue;
1261 }
1262 - mb_set_bit_atomic(lock, cur, bm);
1263 + if (lock)
1264 + mb_set_bit_atomic(lock, cur, bm);
1265 + else
1266 + mb_set_bit(cur, bm);
1267 cur++;
1268 }
1269 }
1270 @@ -1296,13 +1365,20 @@ static void ext4_mb_use_best_found(struct ext4_allocation_context *ac,
1271 ac->ac_tail = ret & 0xffff;
1272 ac->ac_buddy = ret >> 16;
1273
1274 - /* XXXXXXX: SUCH A HORRIBLE **CK */
1275 - /*FIXME!! Why ? */
1276 + /*
1277 + * take the page reference. We want the page to be pinned
1278 + * so that we don't get an ext4_mb_init_cache call for this
1279 + * group until we update the bitmap. Otherwise we could
1280 + * double allocate blocks. The reference is dropped
1281 + * in ext4_mb_release_context
1282 + */
1283 ac->ac_bitmap_page = e4b->bd_bitmap_page;
1284 get_page(ac->ac_bitmap_page);
1285 ac->ac_buddy_page = e4b->bd_buddy_page;
1286 get_page(ac->ac_buddy_page);
1287 -
1288 + /* on allocation we use ac to track the held semaphore */
1289 + ac->alloc_semp = e4b->alloc_semp;
1290 + e4b->alloc_semp = NULL;
1291 /* store last allocated for subsequent stream allocation */
1292 if ((ac->ac_flags & EXT4_MB_HINT_DATA)) {
1293 spin_lock(&sbi->s_md_lock);
1294 @@ -1326,6 +1402,8 @@ static void ext4_mb_check_limits(struct ext4_allocation_context *ac,
1295 struct ext4_free_extent ex;
1296 int max;
1297
1298 + if (ac->ac_status == AC_STATUS_FOUND)
1299 + return;
1300 /*
1301 * We don't want to scan for a whole year
1302 */
1303 @@ -1692,6 +1770,173 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
1304 return 0;
1305 }
1306
1307 +/*
1308 + * lock the group_info alloc_sem of all the groups
1309 + * belonging to the same buddy cache page. This
1310 + * makes sure other parallel operations on the buddy
1311 + * cache don't happen while holding the buddy cache
1312 + * lock
1313 + */
1314 +int ext4_mb_get_buddy_cache_lock(struct super_block *sb, ext4_group_t group)
1315 +{
1316 + int i;
1317 + int block, pnum;
1318 + int blocks_per_page;
1319 + int groups_per_page;
1320 + ext4_group_t first_group;
1321 + struct ext4_group_info *grp;
1322 +
1323 + blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
1324 + /*
1325 + * the buddy cache inode stores the block bitmap
1326 + * and buddy information in consecutive blocks.
1327 + * So for each group we need two blocks.
1328 + */
1329 + block = group * 2;
1330 + pnum = block / blocks_per_page;
1331 + first_group = pnum * blocks_per_page / 2;
1332 +
1333 + groups_per_page = blocks_per_page >> 1;
1334 + if (groups_per_page == 0)
1335 + groups_per_page = 1;
1336 + /* read all groups the page covers into the cache */
1337 + for (i = 0; i < groups_per_page; i++) {
1338 +
1339 + if ((first_group + i) >= EXT4_SB(sb)->s_groups_count)
1340 + break;
1341 + grp = ext4_get_group_info(sb, first_group + i);
1342 + /* take each group's write allocation
1343 + * semaphore. This makes sure there is
1344 + * no block allocation going on in any
1345 + * of those groups
1346 + */
1347 + down_write(&grp->alloc_sem);
1348 + }
1349 + return i;
1350 +}
1351 +
1352 +void ext4_mb_put_buddy_cache_lock(struct super_block *sb,
1353 + ext4_group_t group, int locked_group)
1354 +{
1355 + int i;
1356 + int block, pnum;
1357 + int blocks_per_page;
1358 + ext4_group_t first_group;
1359 + struct ext4_group_info *grp;
1360 +
1361 + blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
1362 + /*
1363 + * the buddy cache inode stores the block bitmap
1364 + * and buddy information in consecutive blocks.
1365 + * So for each group we need two blocks.
1366 + */
1367 + block = group * 2;
1368 + pnum = block / blocks_per_page;
1369 + first_group = pnum * blocks_per_page / 2;
1370 + /* release locks on all the groups */
1371 + for (i = 0; i < locked_group; i++) {
1372 +
1373 + grp = ext4_get_group_info(sb, first_group + i);
1374 + /* release each group's write allocation
1375 + * semaphore taken in
1376 + * ext4_mb_get_buddy_cache_lock, so block
1377 + * allocation in those groups can resume
1378 + */
1379 + up_write(&grp->alloc_sem);
1380 + }
1381 +
1382 +}
1383 +
1384 +static int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
1385 +{
1386 +
1387 + int ret;
1388 + void *bitmap;
1389 + int blocks_per_page;
1390 + int block, pnum, poff;
1391 + int num_grp_locked = 0;
1392 + struct ext4_group_info *this_grp;
1393 + struct ext4_sb_info *sbi = EXT4_SB(sb);
1394 + struct inode *inode = sbi->s_buddy_cache;
1395 + struct page *page = NULL, *bitmap_page = NULL;
1396 +
1397 + mb_debug("init group %lu\n", group);
1398 + blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
1399 + this_grp = ext4_get_group_info(sb, group);
1400 + /*
1401 + * This ensures we don't add group
1402 + * to this buddy cache via resize
1403 + */
1404 + num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, group);
1405 + if (!EXT4_MB_GRP_NEED_INIT(this_grp)) {
1406 + /*
1407 + * somebody initialized the group
1408 + * return without doing anything
1409 + */
1410 + ret = 0;
1411 + goto err;
1412 + }
1413 + /*
1414 + * the buddy cache inode stores the block bitmap
1415 + * and buddy information in consecutive blocks.
1416 + * So for each group we need two blocks.
1417 + */
1418 + block = group * 2;
1419 + pnum = block / blocks_per_page;
1420 + poff = block % blocks_per_page;
1421 + page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
1422 + if (page) {
1423 + BUG_ON(page->mapping != inode->i_mapping);
1424 + ret = ext4_mb_init_cache(page, NULL);
1425 + if (ret) {
1426 + unlock_page(page);
1427 + goto err;
1428 + }
1429 + unlock_page(page);
1430 + }
1431 + if (page == NULL || !PageUptodate(page)) {
1432 + ret = -EIO;
1433 + goto err;
1434 + }
1435 + mark_page_accessed(page);
1436 + bitmap_page = page;
1437 + bitmap = page_address(page) + (poff * sb->s_blocksize);
1438 +
1439 + /* init buddy cache */
1440 + block++;
1441 + pnum = block / blocks_per_page;
1442 + poff = block % blocks_per_page;
1443 + page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
1444 + if (page == bitmap_page) {
1445 + /*
1446 + * If both the bitmap and buddy are in
1447 + * the same page we don't need to force
1448 + * init the buddy
1449 + */
1450 + unlock_page(page);
1451 + } else if (page) {
1452 + BUG_ON(page->mapping != inode->i_mapping);
1453 + ret = ext4_mb_init_cache(page, bitmap);
1454 + if (ret) {
1455 + unlock_page(page);
1456 + goto err;
1457 + }
1458 + unlock_page(page);
1459 + }
1460 + if (page == NULL || !PageUptodate(page)) {
1461 + ret = -EIO;
1462 + goto err;
1463 + }
1464 + mark_page_accessed(page);
1465 +err:
1466 + ext4_mb_put_buddy_cache_lock(sb, group, num_grp_locked);
1467 + if (bitmap_page)
1468 + page_cache_release(bitmap_page);
1469 + if (page)
1470 + page_cache_release(page);
1471 + return ret;
1472 +}
1473 +
1474 static noinline_for_stack int
1475 ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
1476 {
1477 @@ -1775,7 +2020,7 @@ repeat:
1478 group = 0;
1479
1480 /* quick check to skip empty groups */
1481 - grp = ext4_get_group_info(ac->ac_sb, group);
1482 + grp = ext4_get_group_info(sb, group);
1483 if (grp->bb_free == 0)
1484 continue;
1485
1486 @@ -1788,10 +2033,9 @@ repeat:
1487 * we need full data about the group
1488 * to make a good selection
1489 */
1490 - err = ext4_mb_load_buddy(sb, group, &e4b);
1491 + err = ext4_mb_init_group(sb, group);
1492 if (err)
1493 goto out;
1494 - ext4_mb_release_desc(&e4b);
1495 }
1496
1497 /*
1498 @@ -2299,6 +2543,8 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
1499 }
1500
1501 INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
1502 + init_rwsem(&meta_group_info[i]->alloc_sem);
1503 + meta_group_info[i]->bb_free_root.rb_node = NULL;
1504
1505 #ifdef DOUBLE_CHECK
1506 {
1507 @@ -2325,54 +2571,6 @@ exit_meta_group_info:
1508 } /* ext4_mb_add_groupinfo */
1509
1510 /*
1511 - * Add a group to the existing groups.
1512 - * This function is used for online resize
1513 - */
1514 -int ext4_mb_add_more_groupinfo(struct super_block *sb, ext4_group_t group,
1515 - struct ext4_group_desc *desc)
1516 -{
1517 - struct ext4_sb_info *sbi = EXT4_SB(sb);
1518 - struct inode *inode = sbi->s_buddy_cache;
1519 - int blocks_per_page;
1520 - int block;
1521 - int pnum;
1522 - struct page *page;
1523 - int err;
1524 -
1525 - /* Add group based on group descriptor*/
1526 - err = ext4_mb_add_groupinfo(sb, group, desc);
1527 - if (err)
1528 - return err;
1529 -
1530 - /*
1531 - * Cache pages containing dynamic mb_alloc datas (buddy and bitmap
1532 - * datas) are set not up to date so that they will be re-initilaized
1533 - * during the next call to ext4_mb_load_buddy
1534 - */
1535 -
1536 - /* Set buddy page as not up to date */
1537 - blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
1538 - block = group * 2;
1539 - pnum = block / blocks_per_page;
1540 - page = find_get_page(inode->i_mapping, pnum);
1541 - if (page != NULL) {
1542 - ClearPageUptodate(page);
1543 - page_cache_release(page);
1544 - }
1545 -
1546 - /* Set bitmap page as not up to date */
1547 - block++;
1548 - pnum = block / blocks_per_page;
1549 - page = find_get_page(inode->i_mapping, pnum);
1550 - if (page != NULL) {
1551 - ClearPageUptodate(page);
1552 - page_cache_release(page);
1553 - }
1554 -
1555 - return 0;
1556 -}
1557 -
1558 -/*
1559 * Update an existing group.
1560 * This function is used for online resize
1561 */
1562 @@ -2495,6 +2693,8 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
1563 clear_opt(sbi->s_mount_opt, MBALLOC);
1564 return -ENOMEM;
1565 }
1566 +
1567 + i = (sb->s_blocksize_bits + 2) * sizeof(unsigned int);
1568 sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL);
1569 if (sbi->s_mb_maxs == NULL) {
1570 clear_opt(sbi->s_mount_opt, MBALLOC);
1571 @@ -2658,13 +2858,11 @@ int ext4_mb_release(struct super_block *sb)
1572 static noinline_for_stack void
1573 ext4_mb_free_committed_blocks(struct super_block *sb)
1574 {
1575 - struct ext4_sb_info *sbi = EXT4_SB(sb);
1576 - int err;
1577 - int i;
1578 - int count = 0;
1579 - int count2 = 0;
1580 - struct ext4_free_metadata *md;
1581 struct ext4_buddy e4b;
1582 + struct ext4_group_info *db;
1583 + struct ext4_sb_info *sbi = EXT4_SB(sb);
1584 + int err, count = 0, count2 = 0;
1585 + struct ext4_free_data *entry;
1586
1587 if (list_empty(&sbi->s_committed_transaction))
1588 return;
1589 @@ -2672,44 +2870,46 @@ ext4_mb_free_committed_blocks(struct super_block *sb)
1590 /* there is committed blocks to be freed yet */
1591 do {
1592 /* get next array of blocks */
1593 - md = NULL;
1594 + entry = NULL;
1595 spin_lock(&sbi->s_md_lock);
1596 if (!list_empty(&sbi->s_committed_transaction)) {
1597 - md = list_entry(sbi->s_committed_transaction.next,
1598 - struct ext4_free_metadata, list);
1599 - list_del(&md->list);
1600 + entry = list_entry(sbi->s_committed_transaction.next,
1601 + struct ext4_free_data, list);
1602 + list_del(&entry->list);
1603 }
1604 spin_unlock(&sbi->s_md_lock);
1605
1606 - if (md == NULL)
1607 + if (entry == NULL)
1608 break;
1609
1610 mb_debug("gonna free %u blocks in group %lu (0x%p):",
1611 - md->num, md->group, md);
1612 + entry->count, entry->group, entry);
1613
1614 - err = ext4_mb_load_buddy(sb, md->group, &e4b);
1615 + err = ext4_mb_load_buddy(sb, entry->group, &e4b);
1616 /* we expect to find existing buddy because it's pinned */
1617 BUG_ON(err != 0);
1618
1619 + db = e4b.bd_info;
1620 /* there are blocks to put in buddy to make them really free */
1621 - count += md->num;
1622 + count += entry->count;
1623 count2++;
1624 - ext4_lock_group(sb, md->group);
1625 - for (i = 0; i < md->num; i++) {
1626 - mb_debug(" %u", md->blocks[i]);
1627 - mb_free_blocks(NULL, &e4b, md->blocks[i], 1);
1628 + ext4_lock_group(sb, entry->group);
1629 + /* Take it out of per group rb tree */
1630 + rb_erase(&entry->node, &(db->bb_free_root));
1631 + mb_free_blocks(NULL, &e4b, entry->start_blk, entry->count);
1632 +
1633 + if (!db->bb_free_root.rb_node) {
1634 + /* No more items in the per group rb tree
1635 + * balance refcounts from ext4_mb_free_metadata()
1636 + */
1637 + page_cache_release(e4b.bd_buddy_page);
1638 + page_cache_release(e4b.bd_bitmap_page);
1639 }
1640 - mb_debug("\n");
1641 - ext4_unlock_group(sb, md->group);
1642 -
1643 - /* balance refcounts from ext4_mb_free_metadata() */
1644 - page_cache_release(e4b.bd_buddy_page);
1645 - page_cache_release(e4b.bd_bitmap_page);
1646 + ext4_unlock_group(sb, entry->group);
1647
1648 - kfree(md);
1649 + kmem_cache_free(ext4_free_ext_cachep, entry);
1650 ext4_mb_release_desc(&e4b);
1651 -
1652 - } while (md);
1653 + } while (1);
1654
1655 mb_debug("freed %u blocks in %u structures\n", count, count2);
1656 }
1657 @@ -2864,6 +3064,16 @@ int __init init_ext4_mballoc(void)
1658 kmem_cache_destroy(ext4_pspace_cachep);
1659 return -ENOMEM;
1660 }
1661 +
1662 + ext4_free_ext_cachep =
1663 + kmem_cache_create("ext4_free_block_extents",
1664 + sizeof(struct ext4_free_data),
1665 + 0, SLAB_RECLAIM_ACCOUNT, NULL);
1666 + if (ext4_free_ext_cachep == NULL) {
1667 + kmem_cache_destroy(ext4_pspace_cachep);
1668 + kmem_cache_destroy(ext4_ac_cachep);
1669 + return -ENOMEM;
1670 + }
1671 #ifdef CONFIG_PROC_FS
1672 proc_root_ext4 = proc_mkdir("fs/ext4", NULL);
1673 if (proc_root_ext4 == NULL)
1674 @@ -2880,6 +3090,7 @@ void exit_ext4_mballoc(void)
1675 #ifdef CONFIG_PROC_FS
1676 remove_proc_entry("fs/ext4", NULL);
1677 #endif
1678 + kmem_cache_destroy(ext4_free_ext_cachep);
1679 }
1680
1681
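
The error path above destroys, in reverse order, every cache created before the one that failed. The same unwind shape in a standalone sketch (malloc stand-ins, not kmem caches):

#include <stdlib.h>

static void *pspace, *ac, *free_ext;   /* stand-ins for the three caches */

static int init_caches(void)
{
    pspace = malloc(64);
    if (!pspace)
        return -1;
    ac = malloc(64);
    if (!ac)
        goto fail_pspace;
    free_ext = malloc(64);
    if (!free_ext)
        goto fail_ac;
    return 0;

fail_ac:                                /* unwind in reverse order */
    free(ac);
fail_pspace:
    free(pspace);
    return -1;
}

int main(void)
{
    return init_caches() ? 1 : 0;
}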
1682 @@ -2941,8 +3152,8 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
1683 in_range(block + len - 1, ext4_inode_table(sb, gdp),
1684 EXT4_SB(sb)->s_itb_per_group)) {
1685 ext4_error(sb, __func__,
1686 - "Allocating block in system zone - block = %llu",
1687 - block);
1688 + "Allocating block %llu in system zone of %lu group\n",
1689 + block, ac->ac_b_ex.fe_group);
1690 /* File system mounted not to panic on error
1691 * Fix the bitmap and repeat the block allocation
1692 * We leak some of the blocks here.
1693 @@ -2964,10 +3175,9 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
1694 }
1695 }
1696 #endif
1697 - mb_set_bits(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group), bitmap_bh->b_data,
1698 - ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len);
1699 -
1700 spin_lock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group));
1701 + mb_set_bits(NULL, bitmap_bh->b_data,
1702 + ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len);
1703 if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
1704 gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
1705 gdp->bg_free_blocks_count =
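
Note the NULL first argument: in mballoc, handing mb_set_bits() a NULL lock means the caller already holds the group lock, so the helper skips its own locking, which is why the call could move inside sb_bgl_lock(). The convention in miniature (portable sketch, illustrative names):

#include <pthread.h>
#include <stddef.h>

static pthread_mutex_t group_lock = PTHREAD_MUTEX_INITIALIZER;

/* lock == NULL means "caller already holds the group lock" */
static void set_bits(pthread_mutex_t *lock, unsigned char *map,
                     int first, int n)
{
    if (lock)
        pthread_mutex_lock(lock);
    while (n--) {
        map[first >> 3] |= 1u << (first & 7);
        first++;
    }
    if (lock)
        pthread_mutex_unlock(lock);
}

int main(void)
{
    unsigned char bitmap[8] = { 0 };

    pthread_mutex_lock(&group_lock);      /* as the hunk now does */
    set_bits(NULL, bitmap, 3, 5);         /* no double-locking inside */
    pthread_mutex_unlock(&group_lock);
    return 0;
}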
1706 @@ -3400,10 +3610,37 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
1707 ac->ac_criteria = 20;
1708 return 1;
1709 }
1710 +
1711 return 0;
1712 }
1713
1714 /*
1715 + * the function goes through all blocks freed in the group
1716 + * but not yet committed and marks them used in the in-core bitmap.
1717 + * the buddy must be generated from this bitmap
1718 + * Must be called with the ext4 group lock held (ext4_lock_group)
1719 + */
1720 +static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
1721 + ext4_group_t group)
1722 +{
1723 + struct rb_node *n;
1724 + struct ext4_group_info *grp;
1725 + struct ext4_free_data *entry;
1726 +
1727 + grp = ext4_get_group_info(sb, group);
1728 + n = rb_first(&(grp->bb_free_root));
1729 +
1730 + while (n) {
1731 + entry = rb_entry(n, struct ext4_free_data, node);
1732 + mb_set_bits(sb_bgl_lock(EXT4_SB(sb), group),
1733 + bitmap, entry->start_blk,
1734 + entry->count);
1735 + n = rb_next(n);
1736 + }
1737 + return;
1738 +}
1739 +
1740 +/*
1741 * the function goes through all preallocation in this group and marks them
1742 * used in in-core bitmap. buddy must be generated from this bitmap
1743 * Need to be called with ext4 group lock (ext4_lock_group)
1744 @@ -4166,6 +4403,7 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac,
1745 ac->ac_pa = NULL;
1746 ac->ac_bitmap_page = NULL;
1747 ac->ac_buddy_page = NULL;
1748 + ac->alloc_semp = NULL;
1749 ac->ac_lg = NULL;
1750
1751 /* we have to define context: we'll we work with a file or
1752 @@ -4346,6 +4584,8 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac)
1753 }
1754 ext4_mb_put_pa(ac, ac->ac_sb, pa);
1755 }
1756 + if (ac->alloc_semp)
1757 + up_read(ac->alloc_semp);
1758 if (ac->ac_bitmap_page)
1759 page_cache_release(ac->ac_bitmap_page);
1760 if (ac->ac_buddy_page)
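
ac->alloc_semp records a read-held rw-semaphore (the per-group alloc_sem added in the mballoc.h hunk below) so that context teardown can drop it exactly once. A portable analogue of "remember the lock you hold" using a pthread rwlock (hypothetical names):

#include <pthread.h>
#include <stddef.h>

struct alloc_ctx { pthread_rwlock_t *alloc_semp; };

static pthread_rwlock_t group_sem = PTHREAD_RWLOCK_INITIALIZER;

static void use_best_found(struct alloc_ctx *ac)
{
    pthread_rwlock_rdlock(&group_sem);  /* exclude writers on this group */
    ac->alloc_semp = &group_sem;        /* remember what we hold */
}

static void release_context(struct alloc_ctx *ac)
{
    if (ac->alloc_semp) {               /* mirrors `if (ac->alloc_semp) up_read(...)` */
        pthread_rwlock_unlock(ac->alloc_semp);
        ac->alloc_semp = NULL;
    }
}

int main(void)
{
    struct alloc_ctx ac = { NULL };

    use_best_found(&ac);
    release_context(&ac);
    return 0;
}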
1761 @@ -4449,10 +4689,14 @@ repeat:
1762 ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len)
1763 ext4_mb_new_preallocation(ac);
1764 }
1765 -
1766 if (likely(ac->ac_status == AC_STATUS_FOUND)) {
1767 *errp = ext4_mb_mark_diskspace_used(ac, handle);
1768 if (*errp == -EAGAIN) {
1769 + /*
1770 + * drop the reference that we took
1771 + * in ext4_mb_use_best_found
1772 + */
1773 + ext4_mb_release_context(ac);
1774 ac->ac_b_ex.fe_group = 0;
1775 ac->ac_b_ex.fe_start = 0;
1776 ac->ac_b_ex.fe_len = 0;
1777 @@ -4517,65 +4761,97 @@ static void ext4_mb_poll_new_transaction(struct super_block *sb,
1778 ext4_mb_free_committed_blocks(sb);
1779 }
1780
1781 +/*
1782 + * We can merge two free data extents only if the physical blocks
1783 + * are contiguous, AND the extents were freed by the same transaction,
1784 + * AND the blocks are associated with the same group.
1785 + */
1786 +static int can_merge(struct ext4_free_data *entry1,
1787 + struct ext4_free_data *entry2)
1788 +{
1789 + if ((entry1->t_tid == entry2->t_tid) &&
1790 + (entry1->group == entry2->group) &&
1791 + ((entry1->start_blk + entry1->count) == entry2->start_blk))
1792 + return 1;
1793 + return 0;
1794 +}
1795 +
1796 static noinline_for_stack int
1797 ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
1798 - ext4_group_t group, ext4_grpblk_t block, int count)
1799 + struct ext4_free_data *new_entry)
1800 {
1801 + ext4_grpblk_t block;
1802 + struct ext4_free_data *entry;
1803 struct ext4_group_info *db = e4b->bd_info;
1804 struct super_block *sb = e4b->bd_sb;
1805 struct ext4_sb_info *sbi = EXT4_SB(sb);
1806 - struct ext4_free_metadata *md;
1807 - int i;
1808 + struct rb_node **n = &db->bb_free_root.rb_node, *node;
1809 + struct rb_node *parent = NULL, *new_node;
1810
1811 BUG_ON(e4b->bd_bitmap_page == NULL);
1812 BUG_ON(e4b->bd_buddy_page == NULL);
1813
1814 - ext4_lock_group(sb, group);
1815 - for (i = 0; i < count; i++) {
1816 - md = db->bb_md_cur;
1817 - if (md && db->bb_tid != handle->h_transaction->t_tid) {
1818 - db->bb_md_cur = NULL;
1819 - md = NULL;
1820 + new_node = &new_entry->node;
1821 + block = new_entry->start_blk;
1822 +
1823 + if (!*n) {
1824 + /* first free block extent. We need to
1825 + * protect buddy cache from being freed,
1826 + * otherwise we'll refresh it from
1827 + * on-disk bitmap and lose not-yet-available
1828 + * blocks */
1829 + page_cache_get(e4b->bd_buddy_page);
1830 + page_cache_get(e4b->bd_bitmap_page);
1831 + }
1832 + while (*n) {
1833 + parent = *n;
1834 + entry = rb_entry(parent, struct ext4_free_data, node);
1835 + if (block < entry->start_blk)
1836 + n = &(*n)->rb_left;
1837 + else if (block >= (entry->start_blk + entry->count))
1838 + n = &(*n)->rb_right;
1839 + else {
1840 + ext4_error(sb, __func__,
1841 + "Double free of blocks %d (%d %d)\n",
1842 + block, entry->start_blk, entry->count);
1843 + return 0;
1844 }
1845 + }
1846
1847 - if (md == NULL) {
1848 - ext4_unlock_group(sb, group);
1849 - md = kmalloc(sizeof(*md), GFP_NOFS);
1850 - if (md == NULL)
1851 - return -ENOMEM;
1852 - md->num = 0;
1853 - md->group = group;
1854 -
1855 - ext4_lock_group(sb, group);
1856 - if (db->bb_md_cur == NULL) {
1857 - spin_lock(&sbi->s_md_lock);
1858 - list_add(&md->list, &sbi->s_active_transaction);
1859 - spin_unlock(&sbi->s_md_lock);
1860 - /* protect buddy cache from being freed,
1861 - * otherwise we'll refresh it from
1862 - * on-disk bitmap and lose not-yet-available
1863 - * blocks */
1864 - page_cache_get(e4b->bd_buddy_page);
1865 - page_cache_get(e4b->bd_bitmap_page);
1866 - db->bb_md_cur = md;
1867 - db->bb_tid = handle->h_transaction->t_tid;
1868 - mb_debug("new md 0x%p for group %lu\n",
1869 - md, md->group);
1870 - } else {
1871 - kfree(md);
1872 - md = db->bb_md_cur;
1873 - }
1874 + rb_link_node(new_node, parent, n);
1875 + rb_insert_color(new_node, &db->bb_free_root);
1876 +
1877 + /* Now see whether the extent can be merged with its left and right neighbors */
1878 + node = rb_prev(new_node);
1879 + if (node) {
1880 + entry = rb_entry(node, struct ext4_free_data, node);
1881 + if (can_merge(entry, new_entry)) {
1882 + new_entry->start_blk = entry->start_blk;
1883 + new_entry->count += entry->count;
1884 + rb_erase(node, &(db->bb_free_root));
1885 + spin_lock(&sbi->s_md_lock);
1886 + list_del(&entry->list);
1887 + spin_unlock(&sbi->s_md_lock);
1888 + kmem_cache_free(ext4_free_ext_cachep, entry);
1889 }
1890 + }
1891
1892 - BUG_ON(md->num >= EXT4_BB_MAX_BLOCKS);
1893 - md->blocks[md->num] = block + i;
1894 - md->num++;
1895 - if (md->num == EXT4_BB_MAX_BLOCKS) {
1896 - /* no more space, put full container on a sb's list */
1897 - db->bb_md_cur = NULL;
1898 + node = rb_next(new_node);
1899 + if (node) {
1900 + entry = rb_entry(node, struct ext4_free_data, node);
1901 + if (can_merge(new_entry, entry)) {
1902 + new_entry->count += entry->count;
1903 + rb_erase(node, &(db->bb_free_root));
1904 + spin_lock(&sbi->s_md_lock);
1905 + list_del(&entry->list);
1906 + spin_unlock(&sbi->s_md_lock);
1907 + kmem_cache_free(ext4_free_ext_cachep, entry);
1908 }
1909 }
1910 - ext4_unlock_group(sb, group);
1911 + /* Add the extent to the active_transaction list */
1912 + spin_lock(&sbi->s_md_lock);
1913 + list_add(&new_entry->list, &sbi->s_active_transaction);
1914 + spin_unlock(&sbi->s_md_lock);
1915 return 0;
1916 }
1917
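
The insertion above is the canonical linux/rbtree.h idiom: walk down from the root remembering the link pointer to patch, then splice the node in as a leaf and let the core rebalance. A generic in-kernel-style sketch (hypothetical struct item; compiles against kernel headers only, not standalone):

#include <linux/rbtree.h>

struct item {                           /* hypothetical payload */
    struct rb_node node;
    unsigned long key;
};

static int item_insert(struct rb_root *root, struct item *new)
{
    struct rb_node **p = &root->rb_node, *parent = NULL;

    while (*p) {                        /* walk down, remembering the link */
        struct item *cur = rb_entry(*p, struct item, node);

        parent = *p;
        if (new->key < cur->key)
            p = &(*p)->rb_left;
        else if (new->key > cur->key)
            p = &(*p)->rb_right;
        else
            return -1;                  /* duplicate, like the double-free case */
    }
    rb_link_node(&new->node, parent, p);    /* splice in as a leaf */
    rb_insert_color(&new->node, root);      /* rebalance and recolor */
    return 0;
}

The hunk's extra step is probing rb_prev()/rb_next() after insertion and coalescing neighbors that can_merge() accepts, so the tree keeps one node per contiguous pending extent.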
1918 @@ -4675,11 +4951,6 @@ do_more:
1919 err = ext4_journal_get_write_access(handle, gd_bh);
1920 if (err)
1921 goto error_return;
1922 -
1923 - err = ext4_mb_load_buddy(sb, block_group, &e4b);
1924 - if (err)
1925 - goto error_return;
1926 -
1927 #ifdef AGGRESSIVE_CHECK
1928 {
1929 int i;
1930 @@ -4687,13 +4958,6 @@ do_more:
1931 BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data));
1932 }
1933 #endif
1934 - mb_clear_bits(sb_bgl_lock(sbi, block_group), bitmap_bh->b_data,
1935 - bit, count);
1936 -
1937 - /* We dirtied the bitmap block */
1938 - BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
1939 - err = ext4_journal_dirty_metadata(handle, bitmap_bh);
1940 -
1941 if (ac) {
1942 ac->ac_b_ex.fe_group = block_group;
1943 ac->ac_b_ex.fe_start = bit;
1944 @@ -4701,12 +4965,33 @@ do_more:
1945 ext4_mb_store_history(ac);
1946 }
1947
1948 + err = ext4_mb_load_buddy(sb, block_group, &e4b);
1949 + if (err)
1950 + goto error_return;
1951 if (metadata) {
1952 - /* blocks being freed are metadata. these blocks shouldn't
1953 - * be used until this transaction is committed */
1954 - ext4_mb_free_metadata(handle, &e4b, block_group, bit, count);
1955 + struct ext4_free_data *new_entry;
1956 + /*
1957 + * blocks being freed are metadata. these blocks shouldn't
1958 + * be used until this transaction is committed
1959 + */
1960 + new_entry = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS);
1961 + new_entry->start_blk = bit;
1962 + new_entry->group = block_group;
1963 + new_entry->count = count;
1964 + new_entry->t_tid = handle->h_transaction->t_tid;
1965 + ext4_lock_group(sb, block_group);
1966 + mb_clear_bits(sb_bgl_lock(sbi, block_group), bitmap_bh->b_data,
1967 + bit, count);
1968 + ext4_mb_free_metadata(handle, &e4b, new_entry);
1969 + ext4_unlock_group(sb, block_group);
1970 } else {
1971 ext4_lock_group(sb, block_group);
1972 + /* need to update group_info->bb_free and bitmap
1973 + * with the group lock held. generate_buddy looks at
1974 + * them with the group lock held
1975 + */
1976 + mb_clear_bits(sb_bgl_lock(sbi, block_group), bitmap_bh->b_data,
1977 + bit, count);
1978 mb_free_blocks(inode, &e4b, bit, count);
1979 ext4_mb_return_to_preallocation(inode, &e4b, block, count);
1980 ext4_unlock_group(sb, block_group);
1981 @@ -4729,6 +5014,10 @@ do_more:
1982
1983 *freed += count;
1984
1985 + /* We dirtied the bitmap block */
1986 + BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
1987 + err = ext4_journal_dirty_metadata(handle, bitmap_bh);
1988 +
1989 /* And the group descriptor block */
1990 BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
1991 ret = ext4_journal_dirty_metadata(handle, gd_bh);
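
Taking the reordered hunks of ext4_free_blocks() together, the intended shape is: lock the group, update bitmap and buddy consistently, defer metadata extents to the rb-tree, unlock, then dirty the journal buffers. A stub-level sketch of that ordering (stand-in helpers, not the kernel API):

#include <stdio.h>

/* stubs standing in for the kernel primitives */
static void lock_group(int g)   { (void)g; }
static void unlock_group(int g) { (void)g; }
static void clear_bits(int bit, int n)     { printf("clear %d+%d\n", bit, n); }
static void record_pending(int bit, int n) { printf("defer %d+%d\n", bit, n); }
static void free_buddy(int bit, int n)     { printf("free  %d+%d\n", bit, n); }

static void free_blocks(int group, int bit, int n, int metadata)
{
    lock_group(group);
    clear_bits(bit, n);             /* bitmap and buddy change together */
    if (metadata)
        record_pending(bit, n);     /* reusable only after commit */
    else
        free_buddy(bit, n);         /* data blocks reusable at once */
    unlock_group(group);
    /* journal buffers are dirtied only after the group lock drops */
}

int main(void)
{
    free_blocks(7, 100, 8, 1);
    free_blocks(7, 200, 4, 0);
    return 0;
}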
1992 diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
1993 index c7c9906..0a28dd3 100644
1994 --- a/fs/ext4/mballoc.h
1995 +++ b/fs/ext4/mballoc.h
1996 @@ -18,6 +18,7 @@
1997 #include <linux/pagemap.h>
1998 #include <linux/seq_file.h>
1999 #include <linux/version.h>
2000 +#include <linux/mutex.h>
2001 #include "ext4_jbd2.h"
2002 #include "ext4.h"
2003 #include "group.h"
2004 @@ -96,25 +97,27 @@
2005 */
2006 #define MB_DEFAULT_GROUP_PREALLOC 512
2007
2008 -static struct kmem_cache *ext4_pspace_cachep;
2009 -static struct kmem_cache *ext4_ac_cachep;
2010 +struct ext4_free_data {
2011 + /* this links the free block information from group_info */
2012 + struct rb_node node;
2013
2014 -#ifdef EXT4_BB_MAX_BLOCKS
2015 -#undef EXT4_BB_MAX_BLOCKS
2016 -#endif
2017 -#define EXT4_BB_MAX_BLOCKS 30
2018 + /* this links the free block information from ext4_sb_info */
2019 + struct list_head list;
2020
2021 -struct ext4_free_metadata {
2022 + /* group to which the free block extent belongs */
2023 ext4_group_t group;
2024 - unsigned short num;
2025 - ext4_grpblk_t blocks[EXT4_BB_MAX_BLOCKS];
2026 - struct list_head list;
2027 +
2028 + /* free block extent */
2029 + ext4_grpblk_t start_blk;
2030 + ext4_grpblk_t count;
2031 +
2032 + /* transaction which freed this extent */
2033 + tid_t t_tid;
2034 };
2035
2036 struct ext4_group_info {
2037 unsigned long bb_state;
2038 - unsigned long bb_tid;
2039 - struct ext4_free_metadata *bb_md_cur;
2040 + struct rb_root bb_free_root;
2041 unsigned short bb_first_free;
2042 unsigned short bb_free;
2043 unsigned short bb_fragments;
2044 @@ -122,6 +125,7 @@ struct ext4_group_info {
2045 #ifdef DOUBLE_CHECK
2046 void *bb_bitmap;
2047 #endif
2048 + struct rw_semaphore alloc_sem;
2049 unsigned short bb_counters[];
2050 };
2051
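
Each ext4_free_data is intrusively linked into two containers at once: a per-group rb-tree ordered by start block (for lookup and merging) and the per-superblock transaction list (for commit-time draining). The pattern in miniature (kernel-style fragment, illustrative type):

#include <linux/rbtree.h>
#include <linux/list.h>

struct twice_indexed {
    struct rb_node   by_key;    /* per-group ordering, for lookup/merge */
    struct list_head by_age;    /* per-sb FIFO, for commit-time draining */
    unsigned long    key;
};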
2052 @@ -209,6 +213,11 @@ struct ext4_allocation_context {
2053 __u8 ac_op; /* operation, for history only */
2054 struct page *ac_bitmap_page;
2055 struct page *ac_buddy_page;
2056 + /*
2057 + * pointer to the held semaphore upon successful
2058 + * block allocation
2059 + */
2060 + struct rw_semaphore *alloc_semp;
2061 struct ext4_prealloc_space *ac_pa;
2062 struct ext4_locality_group *ac_lg;
2063 };
2064 @@ -242,6 +251,7 @@ struct ext4_buddy {
2065 struct super_block *bd_sb;
2066 __u16 bd_blkbits;
2067 ext4_group_t bd_group;
2068 + struct rw_semaphore *alloc_semp;
2069 };
2070 #define EXT4_MB_BITMAP(e4b) ((e4b)->bd_bitmap)
2071 #define EXT4_MB_BUDDY(e4b) ((e4b)->bd_buddy)
2072 @@ -251,8 +261,6 @@ static inline void ext4_mb_store_history(struct ext4_allocation_context *ac)
2073 {
2074 return;
2075 }
2076 -#else
2077 -static void ext4_mb_store_history(struct ext4_allocation_context *ac);
2078 #endif
2079
2080 #define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
2081 @@ -260,19 +268,6 @@ static void ext4_mb_store_history(struct ext4_allocation_context *ac);
2082 static struct proc_dir_entry *proc_root_ext4;
2083 struct buffer_head *read_block_bitmap(struct super_block *, ext4_group_t);
2084
2085 -static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
2086 - ext4_group_t group);
2087 -static void ext4_mb_poll_new_transaction(struct super_block *, handle_t *);
2088 -static void ext4_mb_free_committed_blocks(struct super_block *);
2089 -static void ext4_mb_return_to_preallocation(struct inode *inode,
2090 - struct ext4_buddy *e4b, sector_t block,
2091 - int count);
2092 -static void ext4_mb_put_pa(struct ext4_allocation_context *,
2093 - struct super_block *, struct ext4_prealloc_space *pa);
2094 -static int ext4_mb_init_per_dev_proc(struct super_block *sb);
2095 -static int ext4_mb_destroy_per_dev_proc(struct super_block *sb);
2096 -
2097 -
2098 static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group)
2099 {
2100 struct ext4_group_info *grinfo = ext4_get_group_info(sb, group);
2101 @@ -297,7 +292,7 @@ static inline int ext4_is_group_locked(struct super_block *sb,
2102 &(grinfo->bb_state));
2103 }
2104
2105 -static ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb,
2106 +static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb,
2107 struct ext4_free_extent *fex)
2108 {
2109 ext4_fsblk_t block;
2110 diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
2111 index d626533..4f3628f 100644
2112 --- a/fs/ext4/namei.c
2113 +++ b/fs/ext4/namei.c
2114 @@ -371,6 +371,8 @@ dx_probe(struct dentry *dentry, struct inode *dir,
2115 goto fail;
2116 }
2117 hinfo->hash_version = root->info.hash_version;
2118 + if (hinfo->hash_version <= DX_HASH_TEA)
2119 + hinfo->hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned;
2120 hinfo->seed = EXT4_SB(dir->i_sb)->s_hash_seed;
2121 if (dentry)
2122 ext4fs_dirhash(dentry->d_name.name, dentry->d_name.len, hinfo);
2123 @@ -640,6 +642,9 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
2124 dir = dir_file->f_path.dentry->d_inode;
2125 if (!(EXT4_I(dir)->i_flags & EXT4_INDEX_FL)) {
2126 hinfo.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version;
2127 + if (hinfo.hash_version <= DX_HASH_TEA)
2128 + hinfo.hash_version +=
2129 + EXT4_SB(dir->i_sb)->s_hash_unsigned;
2130 hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed;
2131 count = htree_dirblock_to_tree(dir_file, dir, 0, &hinfo,
2132 start_hash, start_minor_hash);
2133 @@ -1377,7 +1382,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
2134 struct fake_dirent *fde;
2135
2136 blocksize = dir->i_sb->s_blocksize;
2137 - dxtrace(printk("Creating index\n"));
2138 + dxtrace(printk(KERN_DEBUG "Creating index: inode %lu\n", dir->i_ino));
2139 retval = ext4_journal_get_write_access(handle, bh);
2140 if (retval) {
2141 ext4_std_error(dir->i_sb, retval);
2142 @@ -1386,6 +1391,20 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
2143 }
2144 root = (struct dx_root *) bh->b_data;
2145
2146 + /* The 0th block becomes the root, move the dirents out */
2147 + fde = &root->dotdot;
2148 + de = (struct ext4_dir_entry_2 *)((char *)fde +
2149 + ext4_rec_len_from_disk(fde->rec_len));
2150 + if ((char *) de >= (((char *) root) + blocksize)) {
2151 + ext4_error(dir->i_sb, __func__,
2152 + "invalid rec_len for '..' in inode %lu",
2153 + dir->i_ino);
2154 + brelse(bh);
2155 + return -EIO;
2156 + }
2157 + len = ((char *) root) + blocksize - (char *) de;
2158 +
2159 + /* Allocate new block for the 0th block's dirents */
2160 bh2 = ext4_append (handle, dir, &block, &retval);
2161 if (!(bh2)) {
2162 brelse(bh);
2163 @@ -1394,11 +1413,6 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
2164 EXT4_I(dir)->i_flags |= EXT4_INDEX_FL;
2165 data1 = bh2->b_data;
2166
2167 - /* The 0th block becomes the root, move the dirents out */
2168 - fde = &root->dotdot;
2169 - de = (struct ext4_dir_entry_2 *)((char *)fde +
2170 - ext4_rec_len_from_disk(fde->rec_len));
2171 - len = ((char *) root) + blocksize - (char *) de;
2172 memcpy (data1, de, len);
2173 de = (struct ext4_dir_entry_2 *) data1;
2174 top = data1 + len;
2175 @@ -1418,6 +1432,8 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
2176
2177 /* Initialize as for dx_probe */
2178 hinfo.hash_version = root->info.hash_version;
2179 + if (hinfo.hash_version <= DX_HASH_TEA)
2180 + hinfo.hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned;
2181 hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed;
2182 ext4fs_dirhash(name, namelen, &hinfo);
2183 frame = frames;
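
Background for the repeated hash_version adjustment: the legacy dirhash read filename bytes through plain char, so machines with signed and unsigned char produced different hashes for the same directory; s_hash_unsigned (0 or 3) shifts LEGACY/HALF_MD4/TEA onto their unsigned variants, which sit three values higher. A toy demonstration of the signedness effect (NOT ext4's real hash, only the byte-signedness issue):

#include <stdio.h>

static unsigned int toy_hash(const char *name, int len, int force_unsigned)
{
    unsigned int h = 0x12a3fe2d;

    while (len--) {
        int c = force_unsigned ? (unsigned char)*name : *name;
        h = h * 31 + (unsigned int)c;
        name++;
    }
    return h;
}

int main(void)
{
    const char name[] = "caf\xc3\xa9";  /* bytes >= 0x80 are the problem */

    printf("native char: %08x\n", toy_hash(name, 5, 0));
    printf("unsigned:    %08x\n", toy_hash(name, 5, 1));
    /* differ on signed-char CPUs (x86), identical on unsigned-char
     * CPUs (ARM, PowerPC) - the incompatibility the flag records */
    return 0;
}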
2184 diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
2185 index 3922a8b..0070431 100644
2186 --- a/fs/ext4/resize.c
2187 +++ b/fs/ext4/resize.c
2188 @@ -284,11 +284,9 @@ static int setup_new_group_blocks(struct super_block *sb,
2189 if ((err = extend_or_restart_transaction(handle, 2, bh)))
2190 goto exit_bh;
2191
2192 - mark_bitmap_end(input->blocks_count, EXT4_BLOCKS_PER_GROUP(sb),
2193 - bh->b_data);
2194 + mark_bitmap_end(input->blocks_count, sb->s_blocksize * 8, bh->b_data);
2195 ext4_journal_dirty_metadata(handle, bh);
2196 brelse(bh);
2197 -
2198 /* Mark unused entries in inode bitmap used */
2199 ext4_debug("clear inode bitmap %#04llx (+%llu)\n",
2200 input->inode_bitmap, input->inode_bitmap - start);
2201 @@ -297,7 +295,7 @@ static int setup_new_group_blocks(struct super_block *sb,
2202 goto exit_journal;
2203 }
2204
2205 - mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), EXT4_BLOCKS_PER_GROUP(sb),
2206 + mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8,
2207 bh->b_data);
2208 ext4_journal_dirty_metadata(handle, bh);
2209 exit_bh:
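
mark_bitmap_end() pads every bit past the last valid entry with ones so the tail of the bitmap block can never be handed out; the fix widens the padding bound from EXT4_BLOCKS_PER_GROUP(sb) to the full block, sb->s_blocksize * 8 bits. The idea in miniature (sketch, not the kernel helper):

#include <stdint.h>
#include <stdio.h>

/* set every bit in [start, end) so the tail can never be allocated */
static void mark_tail(int start, int end, uint8_t *bitmap)
{
    int i;

    for (i = start; i < end; i++)
        bitmap[i >> 3] |= (uint8_t)(1u << (i & 7));
}

int main(void)
{
    uint8_t map[4] = { 0 };             /* a 32-bit "bitmap block" */

    mark_tail(20, 32, map);             /* only 20 valid entries */
    printf("%02x %02x %02x %02x\n", map[0], map[1], map[2], map[3]);
    return 0;
}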
2210 @@ -747,6 +745,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
2211 struct inode *inode = NULL;
2212 handle_t *handle;
2213 int gdb_off, gdb_num;
2214 + int num_grp_locked = 0;
2215 int err, err2;
2216
2217 gdb_num = input->group / EXT4_DESC_PER_BLOCK(sb);
2218 @@ -787,6 +786,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
2219 }
2220 }
2221
2222 +
2223 if ((err = verify_group_input(sb, input)))
2224 goto exit_put;
2225
2226 @@ -855,15 +855,18 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
2227 * using the new disk blocks.
2228 */
2229
2230 + num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, input->group);
2231 /* Update group descriptor block for new group */
2232 gdp = (struct ext4_group_desc *)((char *)primary->b_data +
2233 gdb_off * EXT4_DESC_SIZE(sb));
2234
2235 + memset(gdp, 0, EXT4_DESC_SIZE(sb));
2236 ext4_block_bitmap_set(sb, gdp, input->block_bitmap); /* LV FIXME */
2237 ext4_inode_bitmap_set(sb, gdp, input->inode_bitmap); /* LV FIXME */
2238 ext4_inode_table_set(sb, gdp, input->inode_table); /* LV FIXME */
2239 gdp->bg_free_blocks_count = cpu_to_le16(input->free_blocks_count);
2240 gdp->bg_free_inodes_count = cpu_to_le16(EXT4_INODES_PER_GROUP(sb));
2241 + gdp->bg_flags = cpu_to_le16(EXT4_BG_INODE_ZEROED);
2242 gdp->bg_checksum = ext4_group_desc_csum(sbi, input->group, gdp);
2243
2244 /*
2245 @@ -871,9 +874,11 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
2246 * descriptor
2247 */
2248 if (test_opt(sb, MBALLOC)) {
2249 - err = ext4_mb_add_more_groupinfo(sb, input->group, gdp);
2250 - if (err)
2251 + err = ext4_mb_add_groupinfo(sb, input->group, gdp);
2252 + if (err) {
2253 + ext4_mb_put_buddy_cache_lock(sb, input->group, num_grp_locked);
2254 goto exit_journal;
2255 + }
2256 }
2257 /*
2258 * Make the new blocks and inodes valid next. We do this before
2259 @@ -915,6 +920,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
2260
2261 /* Update the global fs size fields */
2262 sbi->s_groups_count++;
2263 + ext4_mb_put_buddy_cache_lock(sb, input->group, num_grp_locked);
2264
2265 ext4_journal_dirty_metadata(handle, primary);
2266
2267 @@ -976,9 +982,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
2268 struct buffer_head * bh;
2269 handle_t *handle;
2270 int err;
2271 - unsigned long freed_blocks;
2272 ext4_group_t group;
2273 - struct ext4_group_info *grp;
2274
2275 /* We don't need to worry about locking wrt other resizers just
2276 * yet: we're going to revalidate es->s_blocks_count after
2277 @@ -1077,50 +1081,13 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
2278 unlock_super(sb);
2279 ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count,
2280 o_blocks_count + add);
2281 - ext4_free_blocks_sb(handle, sb, o_blocks_count, add, &freed_blocks);
2282 + /* Add the blocks to the bitmap and mark the group as needing init */
2283 + ext4_add_groupblocks(handle, sb, o_blocks_count, add);
2284 ext4_debug("freed blocks %llu through %llu\n", o_blocks_count,
2285 o_blocks_count + add);
2286 if ((err = ext4_journal_stop(handle)))
2287 goto exit_put;
2288
2289 - /*
2290 - * Mark mballoc pages as not up to date so that they will be updated
2291 - * next time they are loaded by ext4_mb_load_buddy.
2292 - */
2293 - if (test_opt(sb, MBALLOC)) {
2294 - struct ext4_sb_info *sbi = EXT4_SB(sb);
2295 - struct inode *inode = sbi->s_buddy_cache;
2296 - int blocks_per_page;
2297 - int block;
2298 - int pnum;
2299 - struct page *page;
2300 -
2301 - /* Set buddy page as not up to date */
2302 - blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
2303 - block = group * 2;
2304 - pnum = block / blocks_per_page;
2305 - page = find_get_page(inode->i_mapping, pnum);
2306 - if (page != NULL) {
2307 - ClearPageUptodate(page);
2308 - page_cache_release(page);
2309 - }
2310 -
2311 - /* Set bitmap page as not up to date */
2312 - block++;
2313 - pnum = block / blocks_per_page;
2314 - page = find_get_page(inode->i_mapping, pnum);
2315 - if (page != NULL) {
2316 - ClearPageUptodate(page);
2317 - page_cache_release(page);
2318 - }
2319 -
2320 - /* Get the info on the last group */
2321 - grp = ext4_get_group_info(sb, group);
2322 -
2323 - /* Update free blocks in group info */
2324 - ext4_mb_update_group_info(grp, add);
2325 - }
2326 -
2327 if (test_opt(sb, DEBUG))
2328 printk(KERN_DEBUG "EXT4-fs: extended group to %llu blocks\n",
2329 ext4_blocks_count(es));
2330 diff --git a/fs/ext4/super.c b/fs/ext4/super.c
2331 index 7726e8e..5e4491d 100644
2332 --- a/fs/ext4/super.c
2333 +++ b/fs/ext4/super.c
2334 @@ -1493,7 +1493,6 @@ static int ext4_fill_flex_info(struct super_block *sb)
2335 ext4_group_t flex_group_count;
2336 ext4_group_t flex_group;
2337 int groups_per_flex = 0;
2338 - __u64 block_bitmap = 0;
2339 int i;
2340
2341 if (!sbi->s_es->s_log_groups_per_flex) {
2342 @@ -1516,9 +1515,6 @@ static int ext4_fill_flex_info(struct super_block *sb)
2343 goto failed;
2344 }
2345
2346 - gdp = ext4_get_group_desc(sb, 1, &bh);
2347 - block_bitmap = ext4_block_bitmap(sb, gdp) - 1;
2348 -
2349 for (i = 0; i < sbi->s_groups_count; i++) {
2350 gdp = ext4_get_group_desc(sb, i, &bh);
2351
2352 @@ -1920,8 +1916,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2353 struct inode *root;
2354 int ret = -EINVAL;
2355 int blocksize;
2356 - int db_count;
2357 - int i;
2358 + unsigned int db_count;
2359 + unsigned int i;
2360 int needs_recovery;
2361 __le32 features;
2362 __u64 blocks_count;
2363 @@ -2172,6 +2168,18 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2364 for (i = 0; i < 4; i++)
2365 sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]);
2366 sbi->s_def_hash_version = es->s_def_hash_version;
2367 + i = le32_to_cpu(es->s_flags);
2368 + if (i & EXT2_FLAGS_UNSIGNED_HASH)
2369 + sbi->s_hash_unsigned = 3;
2370 + else if ((i & EXT2_FLAGS_SIGNED_HASH) == 0) {
2371 +#ifdef __CHAR_UNSIGNED__
2372 + es->s_flags |= cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH);
2373 + sbi->s_hash_unsigned = 3;
2374 +#else
2375 + es->s_flags |= cpu_to_le32(EXT2_FLAGS_SIGNED_HASH);
2376 +#endif
2377 + sb->s_dirt = 1;
2378 + }
2379
2380 if (sbi->s_blocks_per_group > blocksize * 8) {
2381 printk(KERN_ERR
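
Condensing the flag logic above: an explicit unsigned flag wins; an explicit signed flag keeps the offset at 0; and an old superblock with neither flag gets pinned to whatever this CPU's plain char does, with the choice written back so other machines agree. A standalone restatement (real on-disk flag values, illustrative function name):

#include <stdio.h>

#define EXT2_FLAGS_SIGNED_HASH   0x0001
#define EXT2_FLAGS_UNSIGNED_HASH 0x0002

/* returns the s_hash_unsigned offset: 0 (signed) or 3 (unsigned) */
static int hash_offset(unsigned int flags, int char_is_unsigned)
{
    if (flags & EXT2_FLAGS_UNSIGNED_HASH)
        return 3;
    if (!(flags & EXT2_FLAGS_SIGNED_HASH))
        /* legacy fs, no flag yet: pin to this CPU's plain char and
         * (in the real code) write the chosen flag back to disk */
        return char_is_unsigned ? 3 : 0;
    return 0;
}

int main(void)
{
    printf("offset on signed-char CPU, no flags: %d\n", hash_offset(0, 0));
    printf("offset with UNSIGNED flag set:       %d\n",
           hash_offset(EXT2_FLAGS_UNSIGNED_HASH, 0));
    return 0;
}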
2382 @@ -2199,20 +2207,30 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2383 if (EXT4_BLOCKS_PER_GROUP(sb) == 0)
2384 goto cantfind_ext4;
2385
2386 - /* ensure blocks_count calculation below doesn't sign-extend */
2387 - if (ext4_blocks_count(es) + EXT4_BLOCKS_PER_GROUP(sb) <
2388 - le32_to_cpu(es->s_first_data_block) + 1) {
2389 - printk(KERN_WARNING "EXT4-fs: bad geometry: block count %llu, "
2390 - "first data block %u, blocks per group %lu\n",
2391 - ext4_blocks_count(es),
2392 - le32_to_cpu(es->s_first_data_block),
2393 - EXT4_BLOCKS_PER_GROUP(sb));
2394 + /*
2395 + * It makes no sense for the first data block to be beyond the end
2396 + * of the filesystem.
2397 + */
2398 + if (le32_to_cpu(es->s_first_data_block) >= ext4_blocks_count(es)) {
2399 + printk(KERN_WARNING "EXT4-fs: bad geometry: first data "
2400 + "block %u is beyond end of filesystem (%llu)\n",
2401 + le32_to_cpu(es->s_first_data_block),
2402 + ext4_blocks_count(es));
2403 goto failed_mount;
2404 }
2405 blocks_count = (ext4_blocks_count(es) -
2406 le32_to_cpu(es->s_first_data_block) +
2407 EXT4_BLOCKS_PER_GROUP(sb) - 1);
2408 do_div(blocks_count, EXT4_BLOCKS_PER_GROUP(sb));
2409 + if (blocks_count > ((uint64_t)1<<32) - EXT4_DESC_PER_BLOCK(sb)) {
2410 + printk(KERN_WARNING "EXT4-fs: groups count too large: %u "
2411 + "(block count %llu, first data block %u, "
2412 + "blocks per group %lu)\n", sbi->s_groups_count,
2413 + ext4_blocks_count(es),
2414 + le32_to_cpu(es->s_first_data_block),
2415 + EXT4_BLOCKS_PER_GROUP(sb));
2416 + goto failed_mount;
2417 + }
2418 sbi->s_groups_count = blocks_count;
2419 db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) /
2420 EXT4_DESC_PER_BLOCK(sb);
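
A worked example of the geometry math, under the common 4 KiB-blocksize assumptions (32768 blocks per group, first data block 0):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint64_t blocks = 1ULL << 32;       /* 2^32 blocks = 16 TiB at 4 KiB */
    uint64_t first = 0;                 /* first data block */
    uint32_t per_group = 32768;
    uint64_t groups = (blocks - first + per_group - 1) / per_group;

    printf("groups = %llu\n", (unsigned long long)groups);  /* 131072 */
    /* sbi->s_groups_count is 32-bit; the new check rejects anything
     * within EXT4_DESC_PER_BLOCK(sb) of 2^32, where the descriptor
     * block arithmetic below it would wrap */
    return 0;
}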
2421 diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
2422 index 6caf22d..b1f0756 100644
2423 --- a/fs/jbd2/commit.c
2424 +++ b/fs/jbd2/commit.c
2425 @@ -24,6 +24,7 @@
2426 #include <linux/crc32.h>
2427 #include <linux/writeback.h>
2428 #include <linux/backing-dev.h>
2429 +#include <linux/bio.h>
2430
2431 /*
2432 * Default IO end handler for temporary BJ_IO buffer_heads.
2433 @@ -170,12 +171,34 @@ static int journal_submit_commit_record(journal_t *journal,
2434 * This function along with journal_submit_commit_record
2435 * allows to write the commit record asynchronously.
2436 */
2437 -static int journal_wait_on_commit_record(struct buffer_head *bh)
2438 +static int journal_wait_on_commit_record(journal_t *journal,
2439 + struct buffer_head *bh)
2440 {
2441 int ret = 0;
2442
2443 +retry:
2444 clear_buffer_dirty(bh);
2445 wait_on_buffer(bh);
2446 + if (buffer_eopnotsupp(bh) && (journal->j_flags & JBD2_BARRIER)) {
2447 + printk(KERN_WARNING
2448 + "JBD2: wait_on_commit_record: sync failed on %s - "
2449 + "disabling barriers\n", journal->j_devname);
2450 + spin_lock(&journal->j_state_lock);
2451 + journal->j_flags &= ~JBD2_BARRIER;
2452 + spin_unlock(&journal->j_state_lock);
2453 +
2454 + lock_buffer(bh);
2455 + clear_buffer_dirty(bh);
2456 + set_buffer_uptodate(bh);
2457 + bh->b_end_io = journal_end_buffer_io_sync;
2458 +
2459 + ret = submit_bh(WRITE_SYNC, bh);
2460 + if (ret) {
2461 + unlock_buffer(bh);
2462 + return ret;
2463 + }
2464 + goto retry;
2465 + }
2466
2467 if (unlikely(!buffer_uptodate(bh)))
2468 ret = -EIO;
2469 @@ -795,7 +818,7 @@ wait_for_iobuf:
2470 __jbd2_journal_abort_hard(journal);
2471 }
2472 if (!err && !is_journal_aborted(journal))
2473 - err = journal_wait_on_commit_record(cbh);
2474 + err = journal_wait_on_commit_record(journal, cbh);
2475
2476 if (err)
2477 jbd2_journal_abort(journal, err);
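
The retry added to journal_wait_on_commit_record() is the usual "barrier unsupported" fallback: the first failure clears the journal's barrier flag and the commit block is resubmitted as an ordinary write. Schematic standalone version (stub device that rejects barriers; not the jbd2 API):

#include <errno.h>
#include <stdio.h>

struct dev { int use_barriers; };

/* stand-in for submit_bh()+wait; pretend the device rejects barriers */
static int submit_and_wait(struct dev *d, int barrier)
{
    (void)d;
    return barrier ? -EOPNOTSUPP : 0;
}

static int write_commit_block(struct dev *d)
{
    int err = submit_and_wait(d, d->use_barriers);

    if (err == -EOPNOTSUPP && d->use_barriers) {
        fprintf(stderr, "sync failed - disabling barriers\n");
        d->use_barriers = 0;            /* remembered for later commits */
        err = submit_and_wait(d, 0);    /* resubmit as a plain write */
    }
    return err;
}

int main(void)
{
    struct dev d = { 1 };
    return write_commit_block(&d);
}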
2478 diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
2479 index 66c3499..0e1bd70 100644
2480 --- a/include/linux/jbd2.h
2481 +++ b/include/linux/jbd2.h
2482 @@ -308,7 +308,8 @@ void buffer_assertion_failure(struct buffer_head *bh);
2483 int val = (expr); \
2484 if (!val) { \
2485 printk(KERN_ERR \
2486 - "EXT3-fs unexpected failure: %s;\n",# expr); \
2487 + "JBD2 unexpected failure: %s: %s;\n", \
2488 + __func__, #expr); \
2489 printk(KERN_ERR why "\n"); \
2490 } \
2491 val; \
2492 @@ -329,6 +330,7 @@ enum jbd_state_bits {
2493 BH_State, /* Pins most journal_head state */
2494 BH_JournalHead, /* Pins bh->b_private and jh->b_bh */
2495 BH_Unshadow, /* Dummy bit, for BJ_Shadow wakeup filtering */
2496 + BH_JBDPrivateStart, /* First bit available for private use by FS */
2497 };
2498
2499 BUFFER_FNS(JBD, jbd)
2500 diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
2501 index 794e546..e7e7c7d 100644
2502 --- a/include/linux/pci_ids.h
2503 +++ b/include/linux/pci_ids.h
2504 @@ -1301,6 +1301,7 @@
2505 #define PCI_DEVICE_ID_VIA_VT3351 0x0351
2506 #define PCI_DEVICE_ID_VIA_VT3364 0x0364
2507 #define PCI_DEVICE_ID_VIA_8371_0 0x0391
2508 +#define PCI_DEVICE_ID_VIA_6415 0x0415
2509 #define PCI_DEVICE_ID_VIA_8501_0 0x0501
2510 #define PCI_DEVICE_ID_VIA_82C561 0x0561
2511 #define PCI_DEVICE_ID_VIA_82C586_1 0x0571
2512 diff --git a/include/linux/pid.h b/include/linux/pid.h
2513 index d7e98ff..93997c9 100644
2514 --- a/include/linux/pid.h
2515 +++ b/include/linux/pid.h
2516 @@ -123,6 +123,24 @@ extern struct pid *alloc_pid(struct pid_namespace *ns);
2517 extern void free_pid(struct pid *pid);
2518
2519 /*
2520 + * ns_of_pid() returns the pid namespace in which the specified pid was
2521 + * allocated.
2522 + *
2523 + * NOTE:
2524 + * ns_of_pid() is expected to be called for a process (task) that has
2525 + * an attached 'struct pid' (see attach_pid(), detach_pid()), i.e. @pid
2526 + * is expected to be non-NULL. If @pid is NULL, the caller should handle
2527 + * the resulting NULL pid-ns.
2528 + */
2529 +static inline struct pid_namespace *ns_of_pid(struct pid *pid)
2530 +{
2531 + struct pid_namespace *ns = NULL;
2532 + if (pid)
2533 + ns = pid->numbers[pid->level].ns;
2534 + return ns;
2535 +}
2536 +
2537 +/*
2538 * the helpers to get the pid's id seen from different namespaces
2539 *
2540 * pid_nr() : global id, i.e. the id seen from the init namespace;
2541 diff --git a/ipc/mqueue.c b/ipc/mqueue.c
2542 index a58bfad..ca502aa 100644
2543 --- a/ipc/mqueue.c
2544 +++ b/ipc/mqueue.c
2545 @@ -498,7 +498,8 @@ static void __do_notify(struct mqueue_inode_info *info)
2546 sig_i.si_errno = 0;
2547 sig_i.si_code = SI_MESGQ;
2548 sig_i.si_value = info->notify.sigev_value;
2549 - sig_i.si_pid = task_tgid_vnr(current);
2550 + sig_i.si_pid = task_tgid_nr_ns(current,
2551 + ns_of_pid(info->notify_owner));
2552 sig_i.si_uid = current->uid;
2553
2554 kill_pid_info(info->notify.sigev_signo,
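
ns_of_pid() simply indexes the pid's numbers[] array at the pid's own level; __do_notify() then reports the sender's tgid as numbered in the notification owner's namespace rather than the sender's. A user-space mirror of the data layout with toy values (the real numbers[] is a flexible array sized by namespace depth):

#include <stdio.h>
#include <stddef.h>

struct pid_namespace { int level; };
struct upid { int nr; struct pid_namespace *ns; };

struct pid {
    int level;                  /* deepest namespace this pid lives in */
    struct upid numbers[2];     /* toy: one (id, ns) pair per level */
};

/* mirror of the new helper: namespace the pid was allocated in */
static struct pid_namespace *ns_of_pid(struct pid *pid)
{
    return pid ? pid->numbers[pid->level].ns : NULL;
}

int main(void)
{
    struct pid_namespace init_ns = { 0 }, child_ns = { 1 };
    struct pid p = { 1, { { 1042, &init_ns }, { 7, &child_ns } } };

    /* one task, two ids: 1042 in the init ns, 7 inside the child ns */
    printf("allocated in level-%d ns; id there: %d\n",
           ns_of_pid(&p)->level, p.numbers[p.level].nr);
    return 0;
}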